Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 6 Oct 2016 16:52:23 +0000 (09:52 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 6 Oct 2016 16:52:23 +0000 (09:52 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Oct 2016 16:52:23 +0000 (09:52 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 6 Oct 2016 16:52:23 +0000 (09:52 -0700)
diff --git a/Documentation/sysctl/README b/Documentation/sysctl/README

index 8c3306e..91f54ff 100644 (file)
--- a/Documentation/sysctl/README
+++ b/Documentation/sysctl/README
@@ -69,6 +69,7 @@ proc/         <empty>
  sunrpc/                SUN Remote Procedure Call (NFS)
  vm/            memory management tuning
                 buffer and cache management
+user/          Per user per user namespace limits
  
  These are the subdirs I have on my system. There might be more
  or other subdirs in another setup. If you see another dir, I'd
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt

index 302b5ed..35e17f7 100644 (file)
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -265,6 +265,13 @@ aio-nr can grow to.
  
  ==============================================================
  
+mount-max:
+
+This denotes the maximum number of mounts that may exist
+in a mount namespace.
+
+==============================================================
+
  
  2. /proc/sys/fs/binfmt_misc
  ----------------------------------------------------------
diff --git a/Documentation/sysctl/user.txt b/Documentation/sysctl/user.txt

new file mode 100644 (file)

index 0000000..1291c49
--- /dev/null
+++ b/Documentation/sysctl/user.txt
@@ -0,0 +1,66 @@
+Documentation for /proc/sys/user/*     kernel version 4.9.0
+       (c) 2016                Eric Biederman <ebiederm@xmission.com>
+
+==============================================================
+
+This file contains the documentation for the sysctl files in
+/proc/sys/user.
+
+The files in this directory can be used to override the default
+limits on the number of namespaces and other objects that have
+per user per user namespace limits.
+
+The primary purpose of these limits is to stop programs that
+malfunction and attempt to create a ridiculous number of objects,
+before the malfunction becomes a system wide problem.  It is the
+intention that the defaults of these limits are set high enough that
+no program in normal operation should run into these limits.
+
+The creation of per user per user namespace objects is charged to
+the user in the user namespace who created the object and
+verified to be below the per user limit in that user namespace.
+
+The creation of objects is also charged to all of the users
+who created the user namespaces in which the object is created
+(user namespaces can be nested) and verified to be below the per user
+limits in the user namespaces of those users.
+
+This recursive counting of created objects ensures that creating a
+user namespace does not allow a user to escape their current limits.
+
+Currently, these files are in /proc/sys/user:
+
+- max_cgroup_namespaces
+
+  The maximum number of cgroup namespaces that any user in the current
+  user namespace may create.
+
+- max_ipc_namespaces
+
+  The maximum number of ipc namespaces that any user in the current
+  user namespace may create.
+
+- max_mnt_namespaces
+
+  The maximum number of mount namespaces that any user in the current
+  user namespace may create.
+
+- max_net_namespaces
+
+  The maximum number of network namespaces that any user in the
+  current user namespace may create.
+
+- max_pid_namespaces
+
+  The maximum number of pid namespaces that any user in the current
+  user namespace may create.
+
+- max_user_namespaces
+
+  The maximum number of user namespaces that any user in the current
+  user namespace may create.
+
+- max_uts_namespaces
+
+  The maximum number of uts namespaces that any user in the current
+  user namespace may create.
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c

index 431fd7e..e44271d 100644 (file)
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -431,8 +431,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
                 memcpy(&wq->name, &qstr, sizeof(struct qstr));
                 wq->dev = autofs4_get_dev(sbi);
                 wq->ino = autofs4_get_ino(sbi);
-               wq->uid = current_uid();
-               wq->gid = current_gid();
+               wq->uid = current_real_cred()->uid;
+               wq->gid = current_real_cred()->gid;
                 wq->pid = pid;
                 wq->tgid = tgid;
                 wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/mount.h b/fs/mount.h

index 14db05d..d2e25d7 100644 (file)
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,9 +10,12 @@ struct mnt_namespace {
         struct mount *  root;
         struct list_head        list;
         struct user_namespace   *user_ns;
+       struct ucounts          *ucounts;
         u64                     seq;    /* Sequence number to prevent loops */
         wait_queue_head_t poll;
         u64 event;
+       unsigned int            mounts; /* # of mounts in the namespace */
+       unsigned int            pending_mounts;
  };
  
  struct mnt_pcp {
diff --git a/fs/namespace.c b/fs/namespace.c

index 7bb2cda..db1b5a3 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,9 @@
  #include "pnode.h"
  #include "internal.h"
  
+/* Maximum number of mounts in a mount namespace */
+unsigned int sysctl_mount_max __read_mostly = 100000;
+
  static unsigned int m_hash_mask __read_mostly;
  static unsigned int m_hash_shift __read_mostly;
  static unsigned int mp_hash_mask __read_mostly;
@@ -899,6 +902,9 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
  
         list_splice(&head, n->list.prev);
  
+       n->mounts += n->pending_mounts;
+       n->pending_mounts = 0;
+
         attach_shadowed(mnt, parent, shadows);
         touch_mnt_namespace(n);
  }
@@ -1419,11 +1425,16 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                 propagate_umount(&tmp_list);
  
         while (!list_empty(&tmp_list)) {
+               struct mnt_namespace *ns;
                 bool disconnect;
                 p = list_first_entry(&tmp_list, struct mount, mnt_list);
                 list_del_init(&p->mnt_expire);
                 list_del_init(&p->mnt_list);
-               __touch_mnt_namespace(p->mnt_ns);
+               ns = p->mnt_ns;
+               if (ns) {
+                       ns->mounts--;
+                       __touch_mnt_namespace(ns);
+               }
                 p->mnt_ns = NULL;
                 if (how & UMOUNT_SYNC)
                         p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
@@ -1840,6 +1851,28 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
         return 0;
  }
  
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
+{
+       unsigned int max = READ_ONCE(sysctl_mount_max);
+       unsigned int mounts = 0, old, pending, sum;
+       struct mount *p;
+
+       for (p = mnt; p; p = next_mnt(p, mnt))
+               mounts++;
+
+       old = ns->mounts;
+       pending = ns->pending_mounts;
+       sum = old + pending;
+       if ((old > sum) ||
+           (pending > sum) ||
+           (max < sum) ||
+           (mounts > (max - sum)))
+               return -ENOSPC;
+
+       ns->pending_mounts = pending + mounts;
+       return 0;
+}
+
  /*
   *  @source_mnt : mount tree to be attached
   *  @nd         : place the mount tree @source_mnt is attached
@@ -1909,10 +1942,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
                         struct path *parent_path)
  {
         HLIST_HEAD(tree_list);
+       struct mnt_namespace *ns = dest_mnt->mnt_ns;
         struct mount *child, *p;
         struct hlist_node *n;
         int err;
  
+       /* Is there space to add these mounts to the mount namespace? */
+       if (!parent_path) {
+               err = count_mounts(ns, source_mnt);
+               if (err)
+                       goto out;
+       }
+
         if (IS_MNT_SHARED(dest_mnt)) {
                 err = invent_group_ids(source_mnt, true);
                 if (err)
@@ -1949,11 +1990,13 @@ static int attach_recursive_mnt(struct mount *source_mnt,
   out_cleanup_ids:
         while (!hlist_empty(&tree_list)) {
                 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+               child->mnt_parent->mnt_ns->pending_mounts = 0;
                 umount_tree(child, UMOUNT_SYNC);
         }
         unlock_mount_hash();
         cleanup_group_ids(source_mnt, NULL);
   out:
+       ns->pending_mounts = 0;
         return err;
  }
  
@@ -2719,9 +2762,20 @@ dput_out:
         return retval;
  }
  
+static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
+}
+
+static void dec_mnt_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
+}
+
  static void free_mnt_ns(struct mnt_namespace *ns)
  {
         ns_free_inum(&ns->ns);
+       dec_mnt_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         kfree(ns);
  }
@@ -2738,14 +2792,22 @@ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
  static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
  {
         struct mnt_namespace *new_ns;
+       struct ucounts *ucounts;
         int ret;
  
+       ucounts = inc_mnt_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
         new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
-       if (!new_ns)
+       if (!new_ns) {
+               dec_mnt_namespaces(ucounts);
                 return ERR_PTR(-ENOMEM);
+       }
         ret = ns_alloc_inum(&new_ns->ns);
         if (ret) {
                 kfree(new_ns);
+               dec_mnt_namespaces(ucounts);
                 return ERR_PTR(ret);
         }
         new_ns->ns.ops = &mntns_operations;
@@ -2756,6 +2818,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
         init_waitqueue_head(&new_ns->poll);
         new_ns->event = 0;
         new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->ucounts = ucounts;
+       new_ns->mounts = 0;
+       new_ns->pending_mounts = 0;
         return new_ns;
  }
  
@@ -2805,6 +2870,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
         q = new;
         while (p) {
                 q->mnt_ns = new_ns;
+               new_ns->mounts++;
                 if (new_fs) {
                         if (&p->mnt == new_fs->root.mnt) {
                                 new_fs->root.mnt = mntget(&q->mnt);
@@ -2843,6 +2909,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *m)
                 struct mount *mnt = real_mount(m);
                 mnt->mnt_ns = new_ns;
                 new_ns->root = mnt;
+               new_ns->mounts++;
                 list_add(&mnt->mnt_list, &new_ns->list);
         } else {
                 mntput(m);
@@ -3348,10 +3415,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
         return 0;
  }
  
+static struct user_namespace *mntns_owner(struct ns_common *ns)
+{
+       return to_mnt_ns(ns)->user_ns;
+}
+
  const struct proc_ns_operations mntns_operations = {
         .name           = "mnt",
         .type           = CLONE_NEWNS,
         .get            = mntns_get,
         .put            = mntns_put,
         .install        = mntns_install,
+       .owner          = mntns_owner,
  };
diff --git a/fs/nsfs.c b/fs/nsfs.c

index 8f20d60..30bb100 100644 (file)
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -5,11 +5,16 @@
  #include <linux/magic.h>
  #include <linux/ktime.h>
  #include <linux/seq_file.h>
+#include <linux/user_namespace.h>
+#include <linux/nsfs.h>
  
  static struct vfsmount *nsfs_mnt;
  
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+                       unsigned long arg);
  static const struct file_operations ns_file_operations = {
         .llseek         = no_llseek,
+       .unlocked_ioctl = ns_ioctl,
  };
  
  static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode)
         ns->ops->put(ns);
  }
  
-void *ns_get_path(struct path *path, struct task_struct *task,
-                       const struct proc_ns_operations *ns_ops)
+static void *__ns_get_path(struct path *path, struct ns_common *ns)
  {
-       struct vfsmount *mnt = mntget(nsfs_mnt);
+       struct vfsmount *mnt = nsfs_mnt;
         struct qstr qname = { .name = "", };
         struct dentry *dentry;
         struct inode *inode;
-       struct ns_common *ns;
         unsigned long d;
  
-again:
-       ns = ns_ops->get(task);
-       if (!ns) {
-               mntput(mnt);
-               return ERR_PTR(-ENOENT);
-       }
         rcu_read_lock();
         d = atomic_long_read(&ns->stashed);
         if (!d)
@@ -68,17 +65,16 @@ again:
         if (!lockref_get_not_dead(&dentry->d_lockref))
                 goto slow;
         rcu_read_unlock();
-       ns_ops->put(ns);
+       ns->ops->put(ns);
  got_it:
-       path->mnt = mnt;
+       path->mnt = mntget(mnt);
         path->dentry = dentry;
         return NULL;
  slow:
         rcu_read_unlock();
         inode = new_inode_pseudo(mnt->mnt_sb);
         if (!inode) {
-               ns_ops->put(ns);
-               mntput(mnt);
+               ns->ops->put(ns);
                 return ERR_PTR(-ENOMEM);
         }
         inode->i_ino = ns->inum;
@@ -91,21 +87,96 @@ slow:
         dentry = d_alloc_pseudo(mnt->mnt_sb, &qname);
         if (!dentry) {
                 iput(inode);
-               mntput(mnt);
                 return ERR_PTR(-ENOMEM);
         }
         d_instantiate(dentry, inode);
-       dentry->d_fsdata = (void *)ns_ops;
+       dentry->d_fsdata = (void *)ns->ops;
         d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
         if (d) {
                 d_delete(dentry);       /* make sure ->d_prune() does nothing */
                 dput(dentry);
                 cpu_relax();
-               goto again;
+               return ERR_PTR(-EAGAIN);
         }
         goto got_it;
  }
  
+void *ns_get_path(struct path *path, struct task_struct *task,
+                       const struct proc_ns_operations *ns_ops)
+{
+       struct ns_common *ns;
+       void *ret;
+
+again:
+       ns = ns_ops->get(task);
+       if (!ns)
+               return ERR_PTR(-ENOENT);
+
+       ret = __ns_get_path(path, ns);
+       if (IS_ERR(ret) && PTR_ERR(ret) == -EAGAIN)
+               goto again;
+       return ret;
+}
+
+static int open_related_ns(struct ns_common *ns,
+                  struct ns_common *(*get_ns)(struct ns_common *ns))
+{
+       struct path path = {};
+       struct file *f;
+       void *err;
+       int fd;
+
+       fd = get_unused_fd_flags(O_CLOEXEC);
+       if (fd < 0)
+               return fd;
+
+       while (1) {
+               struct ns_common *relative;
+
+               relative = get_ns(ns);
+               if (IS_ERR(relative)) {
+                       put_unused_fd(fd);
+                       return PTR_ERR(relative);
+               }
+
+               err = __ns_get_path(&path, relative);
+               if (IS_ERR(err) && PTR_ERR(err) == -EAGAIN)
+                       continue;
+               break;
+       }
+       if (IS_ERR(err)) {
+               put_unused_fd(fd);
+               return PTR_ERR(err);
+       }
+
+       f = dentry_open(&path, O_RDONLY, current_cred());
+       path_put(&path);
+       if (IS_ERR(f)) {
+               put_unused_fd(fd);
+               fd = PTR_ERR(f);
+       } else
+               fd_install(fd, f);
+
+       return fd;
+}
+
+static long ns_ioctl(struct file *filp, unsigned int ioctl,
+                       unsigned long arg)
+{
+       struct ns_common *ns = get_proc_ns(file_inode(filp));
+
+       switch (ioctl) {
+       case NS_GET_USERNS:
+               return open_related_ns(ns, ns_get_owner);
+       case NS_GET_PARENT:
+               if (!ns->ops->get_parent)
+                       return -EINVAL;
+               return open_related_ns(ns, ns->ops->get_parent);
+       default:
+               return -ENOTTY;
+       }
+}
+
  int ns_get_name(char *buf, size_t size, struct task_struct *task,
                         const struct proc_ns_operations *ns_ops)
  {
diff --git a/fs/pnode.c b/fs/pnode.c

index 9989970..234a9ac 100644 (file)
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -259,7 +259,7 @@ static int propagate_one(struct mount *m)
                 read_sequnlock_excl(&mount_lock);
         }
         hlist_add_head(&child->mnt_hash, list);
-       return 0;
+       return count_mounts(m->mnt_ns, child);
  }
  
  /*
diff --git a/fs/pnode.h b/fs/pnode.h

index 0fcdbe7..550f5a8 100644 (file)
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -52,4 +52,5 @@ void mnt_set_mountpoint(struct mount *, struct mountpoint *,
  struct mount *copy_tree(struct mount *, struct dentry *, int);
  bool is_path_reachable(struct mount *, struct dentry *,
                          const struct path *root);
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt);
  #endif /* _LINUX_PNODE_H */
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c

index 2ed3d71..71025b9 100644 (file)
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -72,7 +72,7 @@ static DEFINE_SPINLOCK(sysctl_lock);
  
  static void drop_sysctl_table(struct ctl_table_header *header);
  static int sysctl_follow_link(struct ctl_table_header **phead,
-       struct ctl_table **pentry, struct nsproxy *namespaces);
+       struct ctl_table **pentry);
  static int insert_links(struct ctl_table_header *head);
  static void put_links(struct ctl_table_header *header);
  
@@ -319,11 +319,11 @@ static void sysctl_head_finish(struct ctl_table_header *head)
  }
  
  static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
+lookup_header_set(struct ctl_table_root *root)
  {
         struct ctl_table_set *set = &root->default_set;
         if (root->lookup)
-               set = root->lookup(root, namespaces);
+               set = root->lookup(root);
         return set;
  }
  
@@ -496,7 +496,7 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
                 goto out;
  
         if (S_ISLNK(p->mode)) {
-               ret = sysctl_follow_link(&h, &p, current->nsproxy);
+               ret = sysctl_follow_link(&h, &p);
                 err = ERR_PTR(ret);
                 if (ret)
                         goto out;
@@ -664,7 +664,7 @@ static bool proc_sys_link_fill_cache(struct file *file,
  
         if (S_ISLNK(table->mode)) {
                 /* It is not an error if we can not follow the link ignore it */
-               int err = sysctl_follow_link(&head, &table, current->nsproxy);
+               int err = sysctl_follow_link(&head, &table);
                 if (err)
                         goto out;
         }
@@ -981,7 +981,7 @@ static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
  }
  
  static int sysctl_follow_link(struct ctl_table_header **phead,
-       struct ctl_table **pentry, struct nsproxy *namespaces)
+       struct ctl_table **pentry)
  {
         struct ctl_table_header *head;
         struct ctl_table_root *root;
@@ -993,7 +993,7 @@ static int sysctl_follow_link(struct ctl_table_header **phead,
         ret = 0;
         spin_lock(&sysctl_lock);
         root = (*pentry)->data;
-       set = lookup_header_set(root, namespaces);
+       set = lookup_header_set(root);
         dir = xlate_dir(set, (*phead)->parent);
         if (IS_ERR(dir))
                 ret = PTR_ERR(dir);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index a4414a1..440a721 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -644,6 +644,7 @@ struct cgroup_namespace {
         atomic_t                count;
         struct ns_common        ns;
         struct user_namespace   *user_ns;
+       struct ucounts          *ucounts;
         struct css_set          *root_cset;
  };
  
diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h

index d10e54f..848e579 100644 (file)
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -58,6 +58,7 @@ struct ipc_namespace {
  
         /* user_ns which owns the ipc ns */
         struct user_namespace *user_ns;
+       struct ucounts *ucounts;
  
         struct ns_common ns;
  };
diff --git a/include/linux/mount.h b/include/linux/mount.h

index 54a594d..1172cce 100644 (file)
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -96,4 +96,6 @@ extern void mark_mounts_for_expiry(struct list_head *mounts);
  
  extern dev_t name_to_dev_t(const char *name);
  
+extern unsigned int sysctl_mount_max;
+
  #endif /* _LINUX_MOUNT_H */
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h

index 918b117..34cce96 100644 (file)
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -40,6 +40,7 @@ struct pid_namespace {
         struct fs_pin *bacct;
  #endif
         struct user_namespace *user_ns;
+       struct ucounts *ucounts;
         struct work_struct proc_work;
         kgid_t pid_gid;
         int hide_pid;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h

index de0e771..12cb8bd 100644 (file)
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -18,6 +18,8 @@ struct proc_ns_operations {
         struct ns_common *(*get)(struct task_struct *task);
         void (*put)(struct ns_common *ns);
         int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
+       struct user_namespace *(*owner)(struct ns_common *ns);
+       struct ns_common *(*get_parent)(struct ns_common *ns);
  };
  
  extern const struct proc_ns_operations netns_operations;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h

index ecc3e07..adf4e51 100644 (file)
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -158,8 +158,7 @@ struct ctl_table_set {
  
  struct ctl_table_root {
         struct ctl_table_set default_set;
-       struct ctl_table_set *(*lookup)(struct ctl_table_root *root,
-                                          struct nsproxy *namespaces);
+       struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
         void (*set_ownership)(struct ctl_table_header *head,
                               struct ctl_table *table,
                               kuid_t *uid, kgid_t *gid);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h

index 9217169..eb209d4 100644 (file)
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -22,6 +22,19 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
  
  #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
  
+struct ucounts;
+
+enum ucount_type {
+       UCOUNT_USER_NAMESPACES,
+       UCOUNT_PID_NAMESPACES,
+       UCOUNT_UTS_NAMESPACES,
+       UCOUNT_IPC_NAMESPACES,
+       UCOUNT_NET_NAMESPACES,
+       UCOUNT_MNT_NAMESPACES,
+       UCOUNT_CGROUP_NAMESPACES,
+       UCOUNT_COUNTS,
+};
+
  struct user_namespace {
         struct uid_gid_map      uid_map;
         struct uid_gid_map      gid_map;
@@ -39,10 +52,30 @@ struct user_namespace {
         struct key              *persistent_keyring_register;
         struct rw_semaphore     persistent_keyring_register_sem;
  #endif
+       struct work_struct      work;
+#ifdef CONFIG_SYSCTL
+       struct ctl_table_set    set;
+       struct ctl_table_header *sysctls;
+#endif
+       struct ucounts          *ucounts;
+       int ucount_max[UCOUNT_COUNTS];
+};
+
+struct ucounts {
+       struct hlist_node node;
+       struct user_namespace *ns;
+       kuid_t uid;
+       atomic_t count;
+       atomic_t ucount[UCOUNT_COUNTS];
  };
  
  extern struct user_namespace init_user_ns;
  
+bool setup_userns_sysctls(struct user_namespace *ns);
+void retire_userns_sysctls(struct user_namespace *ns);
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+
  #ifdef CONFIG_USER_NS
  
  static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -54,12 +87,12 @@ static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
  
  extern int create_user_ns(struct cred *new);
  extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
-extern void free_user_ns(struct user_namespace *ns);
+extern void __put_user_ns(struct user_namespace *ns);
  
  static inline void put_user_ns(struct user_namespace *ns)
  {
         if (ns && atomic_dec_and_test(&ns->count))
-               free_user_ns(ns);
+               __put_user_ns(ns);
  }
  
  struct seq_operations;
@@ -73,6 +106,8 @@ extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t,
  extern int proc_setgroups_show(struct seq_file *m, void *v);
  extern bool userns_may_setgroups(const struct user_namespace *ns);
  extern bool current_in_userns(const struct user_namespace *target_ns);
+
+struct ns_common *ns_get_owner(struct ns_common *ns);
  #else
  
  static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -106,6 +141,11 @@ static inline bool current_in_userns(const struct user_namespace *target_ns)
  {
         return true;
  }
+
+static inline struct ns_common *ns_get_owner(struct ns_common *ns)
+{
+       return ERR_PTR(-EPERM);
+}
  #endif
  
  #endif /* _LINUX_USER_H */
diff --git a/include/linux/utsname.h b/include/linux/utsname.h

index 5093f58..60f0bb8 100644 (file)
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -24,6 +24,7 @@ struct uts_namespace {
         struct kref kref;
         struct new_utsname name;
         struct user_namespace *user_ns;
+       struct ucounts *ucounts;
         struct ns_common ns;
  };
  extern struct uts_namespace init_uts_ns;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h

index 0933c74..fc4f757 100644 (file)
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -60,6 +60,7 @@ struct net {
         struct list_head        exit_list;      /* Use only net_mutex */
  
         struct user_namespace   *user_ns;       /* Owning user namespace */
+       struct ucounts          *ucounts;
         spinlock_t              nsid_lock;
         struct idr              netns_ids;
  
diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h

new file mode 100644 (file)

index 0000000..3af6172
--- /dev/null
+++ b/include/uapi/linux/nsfs.h
@@ -0,0 +1,13 @@
+#ifndef __LINUX_NSFS_H
+#define __LINUX_NSFS_H
+
+#include <linux/ioctl.h>
+
+#define NSIO   0xb7
+
+/* Returns a file descriptor that refers to an owning user namespace */
+#define NS_GET_USERNS  _IO(NSIO, 0x1)
+/* Returns a file descriptor that refers to a parent namespace */
+#define NS_GET_PARENT  _IO(NSIO, 0x2)
+
+#endif /* __LINUX_NSFS_H */
diff --git a/ipc/namespace.c b/ipc/namespace.c

index d87e6ba..0abdea4 100644 (file)
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -16,39 +16,61 @@
  
  #include "util.h"
  
+static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES);
+}
+
+static void dec_ipc_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_IPC_NAMESPACES);
+}
+
  static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
                                            struct ipc_namespace *old_ns)
  {
         struct ipc_namespace *ns;
+       struct ucounts *ucounts;
         int err;
  
+       err = -ENOSPC;
+       ucounts = inc_ipc_namespaces(user_ns);
+       if (!ucounts)
+               goto fail;
+
+       err = -ENOMEM;
         ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL);
         if (ns == NULL)
-               return ERR_PTR(-ENOMEM);
+               goto fail_dec;
  
         err = ns_alloc_inum(&ns->ns);
-       if (err) {
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_free;
         ns->ns.ops = &ipcns_operations;
  
         atomic_set(&ns->count, 1);
         ns->user_ns = get_user_ns(user_ns);
+       ns->ucounts = ucounts;
  
         err = mq_init_ns(ns);
-       if (err) {
-               put_user_ns(ns->user_ns);
-               ns_free_inum(&ns->ns);
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_put;
  
         sem_init_ns(ns);
         msg_init_ns(ns);
         shm_init_ns(ns);
  
         return ns;
+
+fail_put:
+       put_user_ns(ns->user_ns);
+       ns_free_inum(&ns->ns);
+fail_free:
+       kfree(ns);
+fail_dec:
+       dec_ipc_namespaces(ucounts);
+fail:
+       return ERR_PTR(err);
  }
  
  struct ipc_namespace *copy_ipcs(unsigned long flags,
@@ -96,6 +118,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
         msg_exit_ns(ns);
         shm_exit_ns(ns);
  
+       dec_ipc_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         ns_free_inum(&ns->ns);
         kfree(ns);
@@ -165,10 +188,16 @@ static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
         return 0;
  }
  
+static struct user_namespace *ipcns_owner(struct ns_common *ns)
+{
+       return to_ipc_ns(ns)->user_ns;
+}
+
  const struct proc_ns_operations ipcns_operations = {
         .name           = "ipc",
         .type           = CLONE_NEWIPC,
         .get            = ipcns_get,
         .put            = ipcns_put,
         .install        = ipcns_install,
+       .owner          = ipcns_owner,
  };
diff --git a/kernel/Makefile b/kernel/Makefile

index e2ec54e..eb26e12 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o \
             extable.o params.o \
             kthread.o sys_ni.o nsproxy.o \
             notifier.o ksysfs.o cred.o reboot.o \
-           async.o range.o smpboot.o
+           async.o range.o smpboot.o ucount.o
  
  obj-$(CONFIG_MULTIUSER) += groups.o
  
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index 9ba2831..4406615 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -6328,6 +6328,16 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
  
  /* cgroup namespaces */
  
+static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
+{
+       return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
+}
+
+static void dec_cgroup_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
+}
+
  static struct cgroup_namespace *alloc_cgroup_ns(void)
  {
         struct cgroup_namespace *new_ns;
@@ -6349,6 +6359,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
  void free_cgroup_ns(struct cgroup_namespace *ns)
  {
         put_css_set(ns->root_cset);
+       dec_cgroup_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         ns_free_inum(&ns->ns);
         kfree(ns);
@@ -6360,6 +6371,7 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                         struct cgroup_namespace *old_ns)
  {
         struct cgroup_namespace *new_ns;
+       struct ucounts *ucounts;
         struct css_set *cset;
  
         BUG_ON(!old_ns);
@@ -6373,6 +6385,10 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
         if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                 return ERR_PTR(-EPERM);
  
+       ucounts = inc_cgroup_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
         /* It is not safe to take cgroup_mutex here */
         spin_lock_irq(&css_set_lock);
         cset = task_css_set(current);
@@ -6382,10 +6398,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
         new_ns = alloc_cgroup_ns();
         if (IS_ERR(new_ns)) {
                 put_css_set(cset);
+               dec_cgroup_namespaces(ucounts);
                 return new_ns;
         }
  
         new_ns->user_ns = get_user_ns(user_ns);
+       new_ns->ucounts = ucounts;
         new_ns->root_cset = cset;
  
         return new_ns;
@@ -6436,12 +6454,18 @@ static void cgroupns_put(struct ns_common *ns)
         put_cgroup_ns(to_cg_ns(ns));
  }
  
+/* ->owner() hook: the user namespace recorded in this cgroup namespace. */
+static struct user_namespace *cgroupns_owner(struct ns_common *ns)
+{
+       struct cgroup_namespace *cg_ns = to_cg_ns(ns);
+
+       return cg_ns->user_ns;
+}
+
  const struct proc_ns_operations cgroupns_operations = {
         .name           = "cgroup",
         .type           = CLONE_NEWCGROUP,
         .get            = cgroupns_get,
         .put            = cgroupns_put,
         .install        = cgroupns_install,
+       /* returns the user_ns stored in the cgroup namespace */
+       .owner          = cgroupns_owner,
  };
  
  static __init int cgroup_namespaces_init(void)
diff --git a/kernel/fork.c b/kernel/fork.c

index c060c7e..9a05bd9 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -418,6 +418,7 @@ int arch_task_struct_size __read_mostly;
  
  void __init fork_init(void)
  {
+       int i;
  #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
  #ifndef ARCH_MIN_TASKALIGN
  #define ARCH_MIN_TASKALIGN     L1_CACHE_BYTES
@@ -437,6 +438,10 @@ void __init fork_init(void)
         init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
         init_task.signal->rlim[RLIMIT_SIGPENDING] =
                 init_task.signal->rlim[RLIMIT_NPROC];
+
+       for (i = 0; i < UCOUNT_COUNTS; i++) {
+               init_user_ns.ucount_max[i] = max_threads/2;
+       }
  }
  
  int __weak arch_dup_task_struct(struct task_struct *dst,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c

index a65ba13..df9e8e9 100644 (file)
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -79,23 +79,36 @@ static void proc_cleanup_work(struct work_struct *work)
  /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
  #define MAX_PID_NS_LEVEL 32
  
+/*
+ * Charge one pid namespace to the caller's effective uid in @ns.
+ * Passes back whatever inc_ucount() returns.
+ */
+static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
+{
+       kuid_t euid = current_euid();
+
+       return inc_ucount(ns, euid, UCOUNT_PID_NAMESPACES);
+}
+
+/*
+ * Release the pid-namespace charge held through @ucounts by forwarding
+ * to dec_ucount() with UCOUNT_PID_NAMESPACES.
+ */
+static void dec_pid_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
+}
+
  static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
         struct pid_namespace *parent_pid_ns)
  {
         struct pid_namespace *ns;
         unsigned int level = parent_pid_ns->level + 1;
+       struct ucounts *ucounts;
         int i;
         int err;
  
-       if (level > MAX_PID_NS_LEVEL) {
-               err = -EINVAL;
+       err = -ENOSPC;
+       if (level > MAX_PID_NS_LEVEL)
+               goto out;
+       ucounts = inc_pid_namespaces(user_ns);
+       if (!ucounts)
                 goto out;
-       }
  
         err = -ENOMEM;
         ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
         if (ns == NULL)
-               goto out;
+               goto out_dec;
  
         ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
         if (!ns->pidmap[0].page)
@@ -114,6 +127,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
         ns->level = level;
         ns->parent = get_pid_ns(parent_pid_ns);
         ns->user_ns = get_user_ns(user_ns);
+       ns->ucounts = ucounts;
         ns->nr_hashed = PIDNS_HASH_ADDING;
         INIT_WORK(&ns->proc_work, proc_cleanup_work);
  
@@ -129,6 +143,8 @@ out_free_map:
         kfree(ns->pidmap[0].page);
  out_free:
         kmem_cache_free(pid_ns_cachep, ns);
+out_dec:
+       dec_pid_namespaces(ucounts);
  out:
         return ERR_PTR(err);
  }
@@ -146,6 +162,7 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
         ns_free_inum(&ns->ns);
         for (i = 0; i < PIDMAP_ENTRIES; i++)
                 kfree(ns->pidmap[i].page);
+       dec_pid_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         call_rcu(&ns->rcu, delayed_free_pidns);
  }
@@ -388,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
         return 0;
  }
  
+/*
+ * Return the parent pid namespace of @ns with a reference taken via
+ * get_pid_ns().  The parent is only handed out when the caller's active
+ * pid namespace is found on the ->parent chain starting at that parent;
+ * otherwise ERR_PTR(-EPERM) is returned.
+ */
+static struct ns_common *pidns_get_parent(struct ns_common *ns)
+{
+       struct pid_namespace *mine = task_active_pid_ns(current);
+       struct pid_namespace *parent = to_pid_ns(ns)->parent;
+       struct pid_namespace *walk;
+
+       /* Climb from the parent towards the root looking for our namespace */
+       for (walk = parent; walk; walk = walk->parent) {
+               if (walk == mine)
+                       return &get_pid_ns(parent)->ns;
+       }
+
+       return ERR_PTR(-EPERM);
+}
+
+/* ->owner() hook: the user namespace recorded in this pid namespace. */
+static struct user_namespace *pidns_owner(struct ns_common *ns)
+{
+       struct pid_namespace *pid_ns = to_pid_ns(ns);
+
+       return pid_ns->user_ns;
+}
+
  const struct proc_ns_operations pidns_operations = {
         .name           = "pid",
         .type           = CLONE_NEWPID,
         .get            = pidns_get,
         .put            = pidns_put,
         .install        = pidns_install,
+       /* returns the user_ns stored in the pid namespace */
+       .owner          = pidns_owner,
+       /* returns the parent pid namespace, or -EPERM if not visible */
+       .get_parent     = pidns_get_parent,
  };
  
  static __init int pid_namespaces_init(void)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index a13bbda..a43775c 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -65,6 +65,7 @@
  #include <linux/sched/sysctl.h>
  #include <linux/kexec.h>
  #include <linux/bpf.h>
+#include <linux/mount.h>
  
  #include <asm/uaccess.h>
  #include <asm/processor.h>
@@ -1838,6 +1839,14 @@ static struct ctl_table fs_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_doulongvec_minmax,
         },
+       {
+               .procname       = "mount-max",
+               .data           = &sysctl_mount_max,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &one,
+       },
         { }
  };
  
diff --git a/kernel/ucount.c b/kernel/ucount.c

new file mode 100644 (file)

index 0000000..9d20d5d
--- /dev/null
+++ b/kernel/ucount.c
@@ -0,0 +1,235 @@
+/*
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License as
+ *  published by the Free Software Foundation, version 2 of the
+ *  License.
+ */
+
+#include <linux/stat.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+#include <linux/user_namespace.h>
+
+/* The ucounts hash table has 2^UCOUNTS_HASHTABLE_BITS buckets. */
+#define UCOUNTS_HASHTABLE_BITS 10
+static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
+/* Taken around every lookup, insertion and removal on the hash table. */
+static DEFINE_SPINLOCK(ucounts_lock);
+
+/*
+ * Bucket index for a (user namespace, uid) pair: the numeric uid and the
+ * namespace pointer value are summed and hashed down to
+ * UCOUNTS_HASHTABLE_BITS bits.
+ */
+#define ucounts_hashfn(ns, uid)                                                \
+       hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
+                 UCOUNTS_HASHTABLE_BITS)
+/* Pointer to the hash bucket head for a (user namespace, uid) pair. */
+#define ucounts_hashentry(ns, uid)     \
+       (ucounts_hashtable + ucounts_hashfn(ns, uid))
+
+
+#ifdef CONFIG_SYSCTL
+/* ->lookup() hook: the sysctl set embedded in the caller's user namespace. */
+static struct ctl_table_set *
+set_lookup(struct ctl_table_root *root)
+{
+       struct user_namespace *user_ns = current_user_ns();
+
+       return &user_ns->set;
+}
+
+/* A set is visible only when it belongs to the caller's user namespace. */
+static int set_is_seen(struct ctl_table_set *set)
+{
+       struct ctl_table_set *mine = &current_user_ns()->set;
+
+       return mine == set;
+}
+
+/*
+ * ->permissions() hook for the per-user-namespace sysctl set.
+ *
+ * Callers holding CAP_SYS_RESOURCE in the user namespace that owns the
+ * set get the table's owner permission bits; everyone else gets at most
+ * the "other" read bit.  The chosen bits are replicated into the user,
+ * group and other positions of the returned mode.
+ */
+static int set_permissions(struct ctl_table_header *head,
+                                 struct ctl_table *table)
+{
+       struct user_namespace *user_ns;
+       int bits;
+
+       user_ns = container_of(head->set, struct user_namespace, set);
+
+       if (ns_capable(user_ns, CAP_SYS_RESOURCE)) {
+               /* Privileged in the owning namespace: use the owner bits */
+               bits = (table->mode & S_IRWXU) >> 6;
+       } else {
+               /* Everybody else: read-only at most */
+               bits = table->mode & S_IROTH;
+       }
+
+       return (bits << 6) | (bits << 3) | bits;
+}
+
+/*
+ * Sysctl root passed to setup_sysctl_set() for every user namespace:
+ * lookups resolve to the caller's own set and permissions are decided
+ * by set_permissions().
+ */
+static struct ctl_table_root set_root = {
+       .lookup = set_lookup,
+       .permissions = set_permissions,
+};
+
+/* Bounds handed to proc_dointvec_minmax through extra1/extra2. */
+static int zero = 0;
+static int int_max = INT_MAX;
+/*
+ * Template for one limit file: an int-sized, mode 0644 entry accepting
+ * values in [0, INT_MAX].  .data is left unset here and filled in per
+ * namespace by setup_userns_sysctls().
+ */
+#define UCOUNT_ENTRY(name)                             \
+       {                                               \
+               .procname       = name,                 \
+               .maxlen         = sizeof(int),          \
+               .mode           = 0644,                 \
+               .proc_handler   = proc_dointvec_minmax, \
+               .extra1         = &zero,                \
+               .extra2         = &int_max,             \
+       }
+/*
+ * Template table duplicated for each user namespace.  Entry i is bound
+ * to ns->ucount_max[i] by setup_userns_sysctls(), so the order here has
+ * to match the ucount indices (presumably enum ucount_type -- confirm
+ * against the header when adding an entry).
+ */
+static struct ctl_table user_table[] = {
+       UCOUNT_ENTRY("max_user_namespaces"),
+       UCOUNT_ENTRY("max_pid_namespaces"),
+       UCOUNT_ENTRY("max_uts_namespaces"),
+       UCOUNT_ENTRY("max_ipc_namespaces"),
+       UCOUNT_ENTRY("max_net_namespaces"),
+       UCOUNT_ENTRY("max_mnt_namespaces"),
+       UCOUNT_ENTRY("max_cgroup_namespaces"),
+       { }
+};
+#endif /* CONFIG_SYSCTL */
+
+/*
+ * Create the "user" sysctl directory for @ns.
+ *
+ * Initialises the namespace's sysctl set, duplicates user_table, points
+ * each entry at the matching ns->ucount_max[] slot and registers the
+ * copy.  Returns true on success (always, when CONFIG_SYSCTL is off) and
+ * false if the table could not be duplicated or registered; in that case
+ * the copy is freed and the set retired again.
+ */
+bool setup_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+       struct ctl_table *tbl;
+       setup_sysctl_set(&ns->set, &set_root, set_is_seen);
+       tbl = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
+       if (tbl) {
+               int i;
+               /* Bind every limit file to this namespace's own maximum */
+               for (i = 0; i < UCOUNT_COUNTS; i++) {
+                       tbl[i].data = &ns->ucount_max[i];
+               }
+               ns->sysctls = __register_sysctl_table(&ns->set, "user", tbl);
+       }
+       /* Covers both a failed kmemdup() (tbl == NULL) and a failed registration */
+       if (!ns->sysctls) {
+               kfree(tbl);
+               retire_sysctl_set(&ns->set);
+               return false;
+       }
+#endif
+       return true;
+}
+
+/*
+ * Tear down what setup_userns_sysctls() created for @ns: unregister the
+ * table, retire the sysctl set and free the duplicated table.  A no-op
+ * without CONFIG_SYSCTL.
+ */
+void retire_userns_sysctls(struct user_namespace *ns)
+{
+#ifdef CONFIG_SYSCTL
+       struct ctl_table *tbl;
+
+       /*
+        * Fetch the table pointer before unregistering; presumably the
+        * header must not be dereferenced afterwards -- confirm.
+        */
+       tbl = ns->sysctls->ctl_table_arg;
+       unregister_sysctl_table(ns->sysctls);
+       retire_sysctl_set(&ns->set);
+       kfree(tbl);
+#endif
+}
+
+/*
+ * Scan one hash bucket for the entry matching both @ns and @uid.
+ * Returns the entry or NULL.  get_ucounts() calls this with
+ * ucounts_lock held.
+ */
+static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struct hlist_head *hashent)
+{
+       struct ucounts *pos;
+
+       hlist_for_each_entry(pos, hashent, node) {
+               if (!uid_eq(pos->uid, uid))
+                       continue;
+               if (pos->ns == ns)
+                       return pos;
+       }
+
+       return NULL;
+}
+
+/*
+ * Look up the ucounts entry for (@ns, @uid), allocating and hashing a
+ * new one if none exists, and take a reference on it.
+ *
+ * Returns the entry, or NULL if the allocation fails or the reference
+ * count is already at INT_MAX.
+ */
+static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+{
+       struct hlist_head *hashent = ucounts_hashentry(ns, uid);
+       struct ucounts *ucounts, *new;
+
+       spin_lock(&ucounts_lock);
+       ucounts = find_ucounts(ns, uid, hashent);
+       if (!ucounts) {
+               /* Drop the spinlock around the GFP_KERNEL allocation */
+               spin_unlock(&ucounts_lock);
+
+               new = kzalloc(sizeof(*new), GFP_KERNEL);
+               if (!new)
+                       return NULL;
+
+               new->ns = ns;
+               new->uid = uid;
+               atomic_set(&new->count, 0);
+
+               spin_lock(&ucounts_lock);
+               /* Look again: the entry may have been added while unlocked */
+               ucounts = find_ucounts(ns, uid, hashent);
+               if (ucounts) {
+                       kfree(new);
+               } else {
+                       hlist_add_head(&new->node, hashent);
+                       ucounts = new;
+               }
+       }
+       /* Take the reference under the lock; refuse once count hits INT_MAX */
+       if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+               ucounts = NULL;
+       spin_unlock(&ucounts_lock);
+       return ucounts;
+}
+
+/*
+ * Drop a reference taken by get_ucounts() and free the entry once the
+ * last reference is gone.
+ *
+ * The final decrement has to happen with ucounts_lock held.
+ * get_ucounts() looks entries up and bumps ->count under that lock, so
+ * if the count reached zero outside it, a concurrent get_ucounts() could
+ * still find the entry and take a reference to memory that is about to
+ * be freed.  atomic_dec_and_lock() returns true with the lock held only
+ * when the count dropped to zero.
+ */
+static void put_ucounts(struct ucounts *ucounts)
+{
+       if (atomic_dec_and_lock(&ucounts->count, &ucounts_lock)) {
+               hlist_del_init(&ucounts->node);
+               spin_unlock(&ucounts_lock);
+
+               kfree(ucounts);
+       }
+}
+
+/*
+ * Atomically increment @v, but only while its value is below @u.
+ * Returns true if the increment happened, false if the value was
+ * already >= @u.
+ */
+static inline bool atomic_inc_below(atomic_t *v, int u)
+{
+       int c, old;
+       c = atomic_read(v);
+       for (;;) {
+               if (unlikely(c >= u))
+                       return false;
+               old = atomic_cmpxchg(v, c, c+1);
+               if (likely(old == c))
+                       return true;
+               /* Lost a race: retry with the value cmpxchg actually saw */
+               c = old;
+       }
+}
+
+/*
+ * Charge one object of @type to @uid in @ns and to every ucounts entry
+ * reached by following ns->ucounts upwards from there.
+ *
+ * Each level is checked against the ucount_max[@type] of its own
+ * namespace.  If a level is already at its limit, the increments made so
+ * far are rolled back, the reference from get_ucounts() is dropped and
+ * NULL is returned.  NULL is also returned when get_ucounts() fails.
+ * On success the returned entry is the one to hand to dec_ucount().
+ */
+struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
+                          enum ucount_type type)
+{
+       struct ucounts *ucounts, *iter, *bad;
+       struct user_namespace *tns;
+       ucounts = get_ucounts(ns, uid);
+       for (iter = ucounts; iter; iter = tns->ucounts) {
+               int max;
+               tns = iter->ns;
+               /* The limit can be changed concurrently through sysctl */
+               max = READ_ONCE(tns->ucount_max[type]);
+               if (!atomic_inc_below(&iter->ucount[type], max))
+                       goto fail;
+       }
+       return ucounts;
+fail:
+       /* Undo only the levels below the one that refused the charge */
+       bad = iter;
+       for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
+               atomic_dec(&iter->ucount[type]);
+
+       put_ucounts(ucounts);
+       return NULL;
+}
+
+/*
+ * Reverse a successful inc_ucount(): decrement the @type counter on
+ * @ucounts and on every entry reached through ns->ucounts, warning once
+ * if a counter was not positive, then drop the reference on @ucounts.
+ */
+void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
+{
+       struct ucounts *level = ucounts;
+
+       while (level) {
+               int left = atomic_dec_if_positive(&level->ucount[type]);
+
+               WARN_ON_ONCE(left < 0);
+               level = level->ns->ucounts;
+       }
+
+       put_ucounts(ucounts);
+}
+
+/*
+ * Boot-time setup (subsys_initcall): register an empty "user" directory
+ * in the default sysctl set, then create the limit files for
+ * init_user_ns.  Either step failing is treated as fatal (BUG_ON).
+ * Always returns 0.
+ */
+static __init int user_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+       static struct ctl_table_header *user_header;
+       static struct ctl_table empty[1];
+       /*
+        * It is necessary to register the user directory in the
+        * default set so that registrations in the child sets work
+        * properly.
+        */
+       user_header = register_sysctl("user", empty);
+       BUG_ON(!user_header);
+       BUG_ON(!setup_userns_sysctls(&init_user_ns));
+#endif
+       return 0;
+}
+subsys_initcall(user_namespace_sysctl_init);
+
+
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c

index 68f5942..86b7854 100644 (file)
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -29,6 +29,17 @@ static DEFINE_MUTEX(userns_state_mutex);
  static bool new_idmap_permitted(const struct file *file,
                                 struct user_namespace *ns, int cap_setid,
                                 struct uid_gid_map *map);
+static void free_user_ns(struct work_struct *work);
+
+/*
+ * Charge one user namespace to @uid in @ns.  Unlike the other
+ * namespace wrappers in this patch, the uid is passed in by the caller
+ * instead of being read from current_euid().  Passes back whatever
+ * inc_ucount() returns.
+ */
+static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
+{
+       return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
+}
+
+/*
+ * Release the user-namespace charge held through @ucounts by forwarding
+ * to dec_ucount() with UCOUNT_USER_NAMESPACES.
+ */
+static void dec_user_namespaces(struct ucounts *ucounts)
+{
+       /* void function: call dec_ucount() without "return <expr>;" */
+       dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
+}
  
  static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
  {
@@ -62,10 +73,16 @@ int create_user_ns(struct cred *new)
         struct user_namespace *ns, *parent_ns = new->user_ns;
         kuid_t owner = new->euid;
         kgid_t group = new->egid;
-       int ret;
+       struct ucounts *ucounts;
+       int ret, i;
  
+       ret = -ENOSPC;
         if (parent_ns->level > 32)
-               return -EUSERS;
+               goto fail;
+
+       ucounts = inc_user_namespaces(parent_ns, owner);
+       if (!ucounts)
+               goto fail;
  
         /*
          * Verify that we can not violate the policy of which files
@@ -73,26 +90,27 @@ int create_user_ns(struct cred *new)
          * by verifing that the root directory is at the root of the
          * mount namespace which allows all files to be accessed.
          */
+       ret = -EPERM;
         if (current_chrooted())
-               return -EPERM;
+               goto fail_dec;
  
         /* The creator needs a mapping in the parent user namespace
          * or else we won't be able to reasonably tell userspace who
          * created a user_namespace.
          */
+       ret = -EPERM;
         if (!kuid_has_mapping(parent_ns, owner) ||
             !kgid_has_mapping(parent_ns, group))
-               return -EPERM;
+               goto fail_dec;
  
+       ret = -ENOMEM;
         ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
         if (!ns)
-               return -ENOMEM;
+               goto fail_dec;
  
         ret = ns_alloc_inum(&ns->ns);
-       if (ret) {
-               kmem_cache_free(user_ns_cachep, ns);
-               return ret;
-       }
+       if (ret)
+               goto fail_free;
         ns->ns.ops = &userns_operations;
  
         atomic_set(&ns->count, 1);
@@ -101,18 +119,37 @@ int create_user_ns(struct cred *new)
         ns->level = parent_ns->level + 1;
         ns->owner = owner;
         ns->group = group;
+       INIT_WORK(&ns->work, free_user_ns);
+       for (i = 0; i < UCOUNT_COUNTS; i++) {
+               ns->ucount_max[i] = INT_MAX;
+       }
+       ns->ucounts = ucounts;
  
         /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
         mutex_lock(&userns_state_mutex);
         ns->flags = parent_ns->flags;
         mutex_unlock(&userns_state_mutex);
  
-       set_cred_user_ns(new, ns);
-
  #ifdef CONFIG_PERSISTENT_KEYRINGS
         init_rwsem(&ns->persistent_keyring_register_sem);
  #endif
+       ret = -ENOMEM;
+       if (!setup_userns_sysctls(ns))
+               goto fail_keyring;
+
+       set_cred_user_ns(new, ns);
         return 0;
+fail_keyring:
+#ifdef CONFIG_PERSISTENT_KEYRINGS
+       key_put(ns->persistent_keyring_register);
+#endif
+       ns_free_inum(&ns->ns);
+fail_free:
+       kmem_cache_free(user_ns_cachep, ns);
+fail_dec:
+       dec_user_namespaces(ucounts);
+fail:
+       return ret;
  }
  
  int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
@@ -135,21 +172,30 @@ int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
         return err;
  }
  
-void free_user_ns(struct user_namespace *ns)
+static void free_user_ns(struct work_struct *work)
  {
-       struct user_namespace *parent;
+       struct user_namespace *parent, *ns =
+               container_of(work, struct user_namespace, work);
  
         do {
+               struct ucounts *ucounts = ns->ucounts;
                 parent = ns->parent;
+               retire_userns_sysctls(ns);
  #ifdef CONFIG_PERSISTENT_KEYRINGS
                 key_put(ns->persistent_keyring_register);
  #endif
                 ns_free_inum(&ns->ns);
                 kmem_cache_free(user_ns_cachep, ns);
+               dec_user_namespaces(ucounts);
                 ns = parent;
         } while (atomic_dec_and_test(&parent->count));
  }
-EXPORT_SYMBOL(free_user_ns);
+
+/*
+ * Queue @ns->work with schedule_work() instead of freeing the namespace
+ * directly.  create_user_ns() initialises that work item to run
+ * free_user_ns().
+ */
+void __put_user_ns(struct user_namespace *ns)
+{
+       schedule_work(&ns->work);
+}
+EXPORT_SYMBOL(__put_user_ns);
  
  static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
  {
@@ -1004,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
         return commit_creds(cred);
  }
  
+/*
+ * Return the user namespace reported by @ns's ->owner() operation, with
+ * a reference taken via get_user_ns().  The owner is only handed out
+ * when the caller's user namespace is found on the ->parent chain
+ * starting at that owner; otherwise ERR_PTR(-EPERM) is returned.
+ */
+struct ns_common *ns_get_owner(struct ns_common *ns)
+{
+       struct user_namespace *mine = current_user_ns();
+       struct user_namespace *owner = ns->ops->owner(ns);
+       struct user_namespace *walk;
+
+       /* Follow ->parent from the owner until we meet our own namespace */
+       for (walk = owner; walk; walk = walk->parent) {
+               if (walk == mine)
+                       return &get_user_ns(owner)->ns;
+       }
+
+       return ERR_PTR(-EPERM);
+}
+
+/* ->owner() hook: for a user namespace this is its parent. */
+static struct user_namespace *userns_owner(struct ns_common *ns)
+{
+       struct user_namespace *user_ns = to_user_ns(ns);
+
+       return user_ns->parent;
+}
+
  const struct proc_ns_operations userns_operations = {
         .name           = "user",
         .type           = CLONE_NEWUSER,
         .get            = userns_get,
         .put            = userns_put,
         .install        = userns_install,
+       /* userns_owner() returns ->parent, so both hooks resolve to the parent */
+       .owner          = userns_owner,
+       .get_parent     = ns_get_owner,
  };
  
  static __init int user_namespaces_init(void)
diff --git a/kernel/utsname.c b/kernel/utsname.c

index 831ea71..6976cd4 100644 (file)
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -17,6 +17,16 @@
  #include <linux/user_namespace.h>
  #include <linux/proc_ns.h>
  
+/*
+ * Charge one UTS namespace to the caller's effective uid in @ns.
+ * Passes back whatever inc_ucount() returns.
+ */
+static struct ucounts *inc_uts_namespaces(struct user_namespace *ns)
+{
+       kuid_t euid = current_euid();
+
+       return inc_ucount(ns, euid, UCOUNT_UTS_NAMESPACES);
+}
+
+/*
+ * Release the UTS-namespace charge held through @ucounts by forwarding
+ * to dec_ucount() with UCOUNT_UTS_NAMESPACES.
+ */
+static void dec_uts_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES);
+}
+
  static struct uts_namespace *create_uts_ns(void)
  {
         struct uts_namespace *uts_ns;
@@ -36,18 +46,24 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
                                           struct uts_namespace *old_ns)
  {
         struct uts_namespace *ns;
+       struct ucounts *ucounts;
         int err;
  
+       err = -ENOSPC;
+       ucounts = inc_uts_namespaces(user_ns);
+       if (!ucounts)
+               goto fail;
+
+       err = -ENOMEM;
         ns = create_uts_ns();
         if (!ns)
-               return ERR_PTR(-ENOMEM);
+               goto fail_dec;
  
         err = ns_alloc_inum(&ns->ns);
-       if (err) {
-               kfree(ns);
-               return ERR_PTR(err);
-       }
+       if (err)
+               goto fail_free;
  
+       ns->ucounts = ucounts;
         ns->ns.ops = &utsns_operations;
  
         down_read(&uts_sem);
@@ -55,6 +71,13 @@ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns,
         ns->user_ns = get_user_ns(user_ns);
         up_read(&uts_sem);
         return ns;
+
+fail_free:
+       kfree(ns);
+fail_dec:
+       dec_uts_namespaces(ucounts);
+fail:
+       return ERR_PTR(err);
  }
  
  /*
@@ -85,6 +108,7 @@ void free_uts_ns(struct kref *kref)
         struct uts_namespace *ns;
  
         ns = container_of(kref, struct uts_namespace, kref);
+       dec_uts_namespaces(ns->ucounts);
         put_user_ns(ns->user_ns);
         ns_free_inum(&ns->ns);
         kfree(ns);
@@ -130,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
         return 0;
  }
  
+/* ->owner() hook: the user namespace recorded in this UTS namespace. */
+static struct user_namespace *utsns_owner(struct ns_common *ns)
+{
+       struct uts_namespace *uts_ns = to_uts_ns(ns);
+
+       return uts_ns->user_ns;
+}
+
  const struct proc_ns_operations utsns_operations = {
         .name           = "uts",
         .type           = CLONE_NEWUTS,
         .get            = utsns_get,
         .put            = utsns_put,
         .install        = utsns_install,
+       /* returns the user_ns stored in the UTS namespace */
+       .owner          = utsns_owner,
  };
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c

index 42bdda0..989434f 100644 (file)
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -309,6 +309,16 @@ out_undo:
  
  
  #ifdef CONFIG_NET_NS
+/*
+ * Charge one network namespace to the caller's effective uid in @ns.
+ * Passes back whatever inc_ucount() returns.
+ */
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+       kuid_t euid = current_euid();
+
+       return inc_ucount(ns, euid, UCOUNT_NET_NAMESPACES);
+}
+
+/*
+ * Release the network-namespace charge held through @ucounts by
+ * forwarding to dec_ucount() with UCOUNT_NET_NAMESPACES.
+ */
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+       dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
  static struct kmem_cache *net_cachep;
  static struct workqueue_struct *netns_wq;
  
@@ -350,19 +360,27 @@ void net_drop_ns(void *p)
  struct net *copy_net_ns(unsigned long flags,
                         struct user_namespace *user_ns, struct net *old_net)
  {
+       struct ucounts *ucounts;
         struct net *net;
         int rv;
  
         if (!(flags & CLONE_NEWNET))
                 return get_net(old_net);
  
+       ucounts = inc_net_namespaces(user_ns);
+       if (!ucounts)
+               return ERR_PTR(-ENOSPC);
+
         net = net_alloc();
-       if (!net)
+       if (!net) {
+               dec_net_namespaces(ucounts);
                 return ERR_PTR(-ENOMEM);
+       }
  
         get_user_ns(user_ns);
  
         mutex_lock(&net_mutex);
+       net->ucounts = ucounts;
         rv = setup_net(net, user_ns);
         if (rv == 0) {
                 rtnl_lock();
@@ -371,6 +389,7 @@ struct net *copy_net_ns(unsigned long flags,
         }
         mutex_unlock(&net_mutex);
         if (rv < 0) {
+               dec_net_namespaces(ucounts);
                 put_user_ns(user_ns);
                 net_drop_ns(net);
                 return ERR_PTR(rv);
@@ -443,6 +462,7 @@ static void cleanup_net(struct work_struct *work)
         /* Finally it is safe to free my network namespace structure */
         list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
                 list_del_init(&net->exit_list);
+               dec_net_namespaces(net->ucounts);
                 put_user_ns(net->user_ns);
                 net_drop_ns(net);
         }
@@ -1004,11 +1024,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
         return 0;
  }
  
+/* ->owner() hook: the user namespace recorded in this network namespace. */
+static struct user_namespace *netns_owner(struct ns_common *ns)
+{
+       struct net *net = to_net_ns(ns);
+
+       return net->user_ns;
+}
+
  const struct proc_ns_operations netns_operations = {
         .name           = "net",
         .type           = CLONE_NEWNET,
         .get            = netns_get,
         .put            = netns_put,
         .install        = netns_install,
+       /* returns the user_ns stored in the network namespace */
+       .owner          = netns_owner,
  };
  #endif
diff --git a/net/sysctl_net.c b/net/sysctl_net.c

index e0c71bd..9199813 100644 (file)
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -27,9 +27,9 @@
  #endif
  
  static struct ctl_table_set *
-net_ctl_header_lookup(struct ctl_table_root *root, struct nsproxy *namespaces)
+net_ctl_header_lookup(struct ctl_table_root *root)
  {
-       return &namespaces->net_ns->sysctls;
+       return &current->nsproxy->net_ns->sysctls;
  }
  
  static int is_seen(struct ctl_table_set *set)
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile

index ff9e5f2..f770dba 100644 (file)
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -15,6 +15,7 @@ TARGETS += memory-hotplug
  TARGETS += mount
  TARGETS += mqueue
  TARGETS += net
+TARGETS += nsfs
  TARGETS += powerpc
  TARGETS += pstore
  TARGETS += ptrace
diff --git a/tools/testing/selftests/nsfs/Makefile b/tools/testing/selftests/nsfs/Makefile

new file mode 100644 (file)

index 0000000..2306054
--- /dev/null
+++ b/tools/testing/selftests/nsfs/Makefile
@@ -0,0 +1,12 @@
+# Selftests for the nsfs ioctls: "owner" exercises NS_GET_USERNS and
+# "pidns" exercises NS_GET_PARENT.
+TEST_PROGS := owner pidns
+
+# Warnings are errors so the tests stay clean.
+CFLAGS := -Wall -Werror
+
+all: owner pidns
+owner: owner.c
+pidns: pidns.c
+
+clean:
+       $(RM) owner pidns
+
+include ../lib.mk
diff --git a/tools/testing/selftests/nsfs/owner.c b/tools/testing/selftests/nsfs/owner.c

new file mode 100644 (file)

index 0000000..437205f
--- /dev/null
+++ b/tools/testing/selftests/nsfs/owner.c
@@ -0,0 +1,91 @@
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+/* Local copies of the nsfs ioctl numbers used by this test. */
+#define NSIO    0xb7
+#define NS_GET_USERNS   _IO(NSIO, 0x1)
+
+/*
+ * Print "<function>:<line>:<message>: <errno text>" to stderr (the %m
+ * conversion expands to the error string for the current errno) and
+ * evaluate to 1, so callers can write "return pr_err(...)".
+ */
+#define pr_err(fmt, ...) \
+               ({ \
+                       fprintf(stderr, "%s:%d:" fmt ": %m\n", \
+                               __func__, __LINE__, ##__VA_ARGS__); \
+                       1; \
+               })
+
+/*
+ * Selftest for the NS_GET_USERNS ioctl.
+ *
+ * A child unshares a UTS and a user namespace.  The parent then checks
+ * that NS_GET_USERNS on the child's UTS namespace yields the child's
+ * user namespace, that asking for the owner of the initial user
+ * namespace fails with EPERM, and that after the parent moves into a
+ * new user namespace of its own the same requests fail with EPERM.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+int main(int argc, char *argvp[])
+{
+       int pfd[2], ns, uns, init_uns;
+       struct stat st1, st2;
+       char path[128];
+       pid_t pid;
+       char c;
+
+       if (pipe(pfd))
+               return 1;
+
+       pid = fork();
+       if (pid < 0)
+               return pr_err("fork");
+       if (pid == 0) {
+               /* Make sure the child does not outlive the test */
+               prctl(PR_SET_PDEATHSIG, SIGKILL);
+               if (unshare(CLONE_NEWUTS | CLONE_NEWUSER))
+                       return pr_err("unshare");
+               /* Closing the pipe tells the parent the namespaces exist */
+               close(pfd[0]);
+               close(pfd[1]);
+               while (1)
+                       sleep(1);
+               return 0;
+       }
+       close(pfd[1]);
+       /* Nothing is ever written: wait for EOF, i.e. read() returning 0 */
+       if (read(pfd[0], &c, 1) != 0)
+               return pr_err("Unable to read from pipe");
+       close(pfd[0]);
+
+       snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid);
+       ns = open(path, O_RDONLY);
+       if (ns < 0)
+               return pr_err("Unable to open %s", path);
+
+       uns = ioctl(ns, NS_GET_USERNS);
+       if (uns < 0)
+               return pr_err("Unable to get an owning user namespace");
+
+       if (fstat(uns, &st1))
+               return pr_err("fstat");
+
+       snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
+       if (stat(path, &st2))
+               return pr_err("stat");
+
+       /* The owner of the child's UTS ns must be the child's user ns */
+       if (st1.st_ino != st2.st_ino)
+               return pr_err("NS_GET_USERNS returned a wrong namespace");
+
+       init_uns = ioctl(uns, NS_GET_USERNS);
+       /* Check the descriptor just obtained, not the earlier one */
+       if (init_uns < 0)
+               return pr_err("Unable to get an owning user namespace");
+
+       if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
+               return pr_err("Don't get EPERM");
+
+       if (unshare(CLONE_NEWUSER))
+               return pr_err("unshare");
+
+       /* From inside a new user namespace neither owner may be handed out */
+       if (ioctl(ns, NS_GET_USERNS) >= 0 || errno != EPERM)
+               return pr_err("Don't get EPERM");
+       if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
+               return pr_err("Don't get EPERM");
+
+       kill(pid, SIGKILL);
+       wait(NULL);
+       return 0;
+}
diff --git a/tools/testing/selftests/nsfs/pidns.c b/tools/testing/selftests/nsfs/pidns.c

new file mode 100644 (file)

index 0000000..ae3a0d6
--- /dev/null
+++ b/tools/testing/selftests/nsfs/pidns.c
@@ -0,0 +1,78 @@
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+/*
+ * Report a failure on stderr and evaluate to 1, so a caller can write
+ * "return pr_err(...)" to log the problem and leave with a non-zero status.
+ *
+ * The message is prefixed with the calling function and line number and is
+ * followed by the "%m" conversion (glibc: the text for the current errno).
+ * fmt must be a string literal: it is pasted between prefix and suffix.
+ */
+#define pr_err(fmt, ...) \
+       (fprintf(stderr, "%s:%d:" fmt ": %m\n", \
+                __func__, __LINE__, ##__VA_ARGS__), 1)
+
+/*
+ * ioctl request numbers for namespace file descriptors, defined locally.
+ * NOTE(review): presumably these mirror include/uapi/linux/nsfs.h -- verify
+ * against that header.
+ *
+ * This test program expects NS_GET_PARENT to return a new descriptor for the
+ * parent namespace (see main()); NS_GET_USERNS is not used by pidns.c.
+ */
+#define NSIO   0xb7
+#define NS_GET_USERNS   _IO(NSIO, 0x1)
+#define NS_GET_PARENT   _IO(NSIO, 0x2)
+
+/* Alignment applied to the buffer used as the clone() child's stack. */
+#define __stack_aligned__      __attribute__((aligned(16)))
+
+/*
+ * Stack handed to clone() in main().
+ *
+ * stack_ptr occupies no storage, so its address is the first byte past
+ * stack[]; main() passes it as the child's initial stack pointer.  Since
+ * stack[] is 16-byte aligned and 128 bytes long, stack_ptr is 16-byte
+ * aligned as well.
+ * NOTE(review): passing the end of the buffer implies a downward-growing
+ * stack -- confirm for the architectures this test must run on.
+ */
+struct cr_clone_arg {
+       char stack[128] __stack_aligned__;
+       /* C99 flexible array member rather than the GNU "[0]" extension. */
+       char stack_ptr[];
+};
+
+/*
+ * Entry point of the cloned child.
+ *
+ * Requests SIGKILL as the parent-death signal, then sleeps in one-second
+ * steps forever; the process only ends when it is killed.  The argument
+ * passed through clone() is ignored.
+ */
+static int child(void *args)
+{
+       (void)args;
+
+       prctl(PR_SET_PDEATHSIG, SIGKILL);
+       for (;;) {
+               sleep(1);
+       }
+
+       /* Never reached: the loop above has no exit. */
+       exit(0);
+}
+
+/*
+ * Verify NS_GET_PARENT for one namespace type ("pid" or "user") of @pid.
+ *
+ * Opens /proc/<pid>/ns/<type>, asks for its parent namespace and checks that
+ *   - the returned descriptor has the same inode number as
+ *     /proc/self/ns/<type>, i.e. the namespace of the calling process;
+ *   - asking for the parent of that namespace fails with EPERM.
+ *
+ * Every descriptor opened here is closed before returning.  The close()
+ * calls come after pr_err() so that its "%m" still reports the errno of the
+ * call that failed.
+ *
+ * Returns 0 on success, 1 after printing a message on failure.
+ */
+static int check_parent_ns(pid_t pid, const char *type)
+{
+       /* Fits "/proc/<pid>/ns/<type>" for any int pid and both type names. */
+       char path[64];
+       struct stat st1, st2;
+       int ns, pns, gpns;
+       int ret = 0;
+
+       snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, type);
+       ns = open(path, O_RDONLY);
+       if (ns < 0)
+               return pr_err("Unable to open %s", path);
+
+       pns = ioctl(ns, NS_GET_PARENT);
+       if (pns < 0) {
+               ret = pr_err("Unable to get a parent %s namespace", type);
+               goto out_close_ns;
+       }
+
+       snprintf(path, sizeof(path), "/proc/self/ns/%s", type);
+       if (stat(path, &st2)) {
+               ret = pr_err("Unable to stat %s", path);
+               goto out_close_pns;
+       }
+       if (fstat(pns, &st1)) {
+               ret = pr_err("Unable to stat the parent %s namespace", type);
+               goto out_close_pns;
+       }
+       if (st1.st_ino != st2.st_ino) {
+               ret = pr_err("NS_GET_PARENT returned a wrong namespace");
+               goto out_close_pns;
+       }
+
+       /* The parent of our own namespace must not be reachable. */
+       gpns = ioctl(pns, NS_GET_PARENT);
+       if (gpns >= 0 || errno != EPERM) {
+               ret = pr_err("Don't get EPERM");
+               if (gpns >= 0)
+                       close(gpns);
+       }
+
+out_close_pns:
+       close(pns);
+out_close_ns:
+       close(ns);
+       return ret;
+}
+
+/*
+ * Check NS_GET_PARENT on the pid and user namespaces of a cloned child.
+ *
+ * A child is created with clone() in new user and pid namespaces and each
+ * namespace type is verified with check_parent_ns(), stopping at the first
+ * failure.  The child is killed and reaped on every path once clone() has
+ * succeeded, so a failed check does not leave it behind.
+ *
+ * Returns 0 when all checks pass, 1 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+       static const char *const ns_strs[] = {"pid", "user"};
+       const int nr_ns = sizeof(ns_strs) / sizeof(ns_strs[0]);
+       struct cr_clone_arg ca;
+       int i, ret = 0;
+       pid_t pid;
+
+       (void)argc;
+       (void)argv;
+
+       pid = clone(child, ca.stack_ptr, CLONE_NEWUSER | CLONE_NEWPID | SIGCHLD, NULL);
+       if (pid < 0)
+               return pr_err("clone");
+
+       for (i = 0; i < nr_ns && !ret; i++)
+               ret = check_parent_ns(pid, ns_strs[i]);
+
+       kill(pid, SIGKILL);
+       wait(NULL);
+       return ret;
+}
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 6 Oct 2016 16:52:23 +0000 (09:52 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 6 Oct 2016 16:52:23 +0000 (09:52 -0700)
Documentation/sysctl/README		patch \| blob \| history
Documentation/sysctl/fs.txt		patch \| blob \| history
Documentation/sysctl/user.txt	[new file with mode: 0644]	patch \| blob
fs/autofs4/waitq.c		patch \| blob \| history
fs/mount.h		patch \| blob \| history
fs/namespace.c		patch \| blob \| history
fs/nsfs.c		patch \| blob \| history
fs/pnode.c		patch \| blob \| history
fs/pnode.h		patch \| blob \| history
fs/proc/proc_sysctl.c		patch \| blob \| history
include/linux/cgroup.h		patch \| blob \| history
include/linux/ipc_namespace.h		patch \| blob \| history
include/linux/mount.h		patch \| blob \| history
include/linux/pid_namespace.h		patch \| blob \| history
include/linux/proc_ns.h		patch \| blob \| history
include/linux/sysctl.h		patch \| blob \| history
include/linux/user_namespace.h		patch \| blob \| history
include/linux/utsname.h		patch \| blob \| history
include/net/net_namespace.h		patch \| blob \| history
include/uapi/linux/nsfs.h	[new file with mode: 0644]	patch \| blob
ipc/namespace.c		patch \| blob \| history
kernel/Makefile		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/pid_namespace.c		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
kernel/ucount.c	[new file with mode: 0644]	patch \| blob
kernel/user_namespace.c		patch \| blob \| history
kernel/utsname.c		patch \| blob \| history
net/core/net_namespace.c		patch \| blob \| history
net/sysctl_net.c		patch \| blob \| history
tools/testing/selftests/Makefile		patch \| blob \| history
tools/testing/selftests/nsfs/Makefile	[new file with mode: 0644]	patch \| blob
tools/testing/selftests/nsfs/owner.c	[new file with mode: 0644]	patch \| blob
tools/testing/selftests/nsfs/pidns.c	[new file with mode: 0644]	patch \| blob