Merge branch 'stable-4.8' of git://git.infradead.org/users/pcmoore/audit
author    Linus Torvalds <torvalds@linux-foundation.org>    Thu, 1 Sep 2016 22:55:56 +0000 (15:55 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>    Thu, 1 Sep 2016 22:55:56 +0000 (15:55 -0700)
Pull audit fixes from Paul Moore:
 "Two small patches to fix some bugs with the audit-by-executable
  functionality we introduced back in v4.3 (both patches are marked
  for the stable folks)"

* 'stable-4.8' of git://git.infradead.org/users/pcmoore/audit:
  audit: fix exe_file access in audit_exe_compare
  mm: introduce get_task_exe_file
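
For context, the second patch replaces the unlocked exe_file access in
audit_exe_compare() (an rcu_dereference of tsk->mm->exe_file, with no
guarantee that tsk->mm stays valid) with a helper that takes a proper
reference. A minimal sketch of the resulting pattern, mirroring the
kernel/audit_watch.c hunk below rather than adding anything new:

    struct file *exe_file;
    unsigned long ino;
    dev_t dev;

    exe_file = get_task_exe_file(tsk);      /* counted reference, taken under task_lock() */
    if (!exe_file)
            return 0;                       /* no mm, or a kernel thread with a borrowed mm */
    ino = exe_file->f_inode->i_ino;
    dev = exe_file->f_inode->i_sb->s_dev;
    fput(exe_file);                         /* release the reference when done */

The helper itself lives in kernel/fork.c and simply wraps get_mm_exe_file()
with task_lock()/task_unlock() around the task->mm access, skipping kernel
threads that merely borrow an mm.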

fs/proc/base.c
include/linux/mm.h
kernel/audit_watch.c
kernel/fork.c

diff --combined fs/proc/base.c
@@@ -579,8 -579,11 +579,8 @@@ static int proc_oom_score(struct seq_fi
        unsigned long totalpages = totalram_pages + total_swap_pages;
        unsigned long points = 0;
  
 -      read_lock(&tasklist_lock);
 -      if (pid_alive(task))
 -              points = oom_badness(task, NULL, NULL, totalpages) *
 -                                              1000 / totalpages;
 -      read_unlock(&tasklist_lock);
 +      points = oom_badness(task, NULL, NULL, totalpages) *
 +                                      1000 / totalpages;
        seq_printf(m, "%lu\n", points);
  
        return 0;
@@@ -1021,107 -1024,23 +1021,107 @@@ static ssize_t oom_adj_read(struct fil
        char buffer[PROC_NUMBUF];
        int oom_adj = OOM_ADJUST_MIN;
        size_t len;
 -      unsigned long flags;
  
        if (!task)
                return -ESRCH;
 -      if (lock_task_sighand(task, &flags)) {
 -              if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
 -                      oom_adj = OOM_ADJUST_MAX;
 -              else
 -                      oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
 -                                OOM_SCORE_ADJ_MAX;
 -              unlock_task_sighand(task, &flags);
 -      }
 +      if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
 +              oom_adj = OOM_ADJUST_MAX;
 +      else
 +              oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
 +                        OOM_SCORE_ADJ_MAX;
        put_task_struct(task);
        len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
  }
  
 +static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
 +{
 +      static DEFINE_MUTEX(oom_adj_mutex);
 +      struct mm_struct *mm = NULL;
 +      struct task_struct *task;
 +      int err = 0;
 +
 +      task = get_proc_task(file_inode(file));
 +      if (!task)
 +              return -ESRCH;
 +
 +      mutex_lock(&oom_adj_mutex);
 +      if (legacy) {
 +              if (oom_adj < task->signal->oom_score_adj &&
 +                              !capable(CAP_SYS_RESOURCE)) {
 +                      err = -EACCES;
 +                      goto err_unlock;
 +              }
 +              /*
 +               * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
 +               * /proc/pid/oom_score_adj instead.
 +               */
 +              pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
 +                        current->comm, task_pid_nr(current), task_pid_nr(task),
 +                        task_pid_nr(task));
 +      } else {
 +              if ((short)oom_adj < task->signal->oom_score_adj_min &&
 +                              !capable(CAP_SYS_RESOURCE)) {
 +                      err = -EACCES;
 +                      goto err_unlock;
 +              }
 +      }
 +
 +      /*
 +       * Make sure we will check other processes sharing the mm if this is
 +       * not vfork, which wants its own oom_score_adj.
 +       * Pin the mm so it doesn't go away and get reused after task_unlock.
 +       */
 +      if (!task->vfork_done) {
 +              struct task_struct *p = find_lock_task_mm(task);
 +
 +              if (p) {
 +                      if (atomic_read(&p->mm->mm_users) > 1) {
 +                              mm = p->mm;
 +                              atomic_inc(&mm->mm_count);
 +                      }
 +                      task_unlock(p);
 +              }
 +      }
 +
 +      task->signal->oom_score_adj = oom_adj;
 +      if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
 +              task->signal->oom_score_adj_min = (short)oom_adj;
 +      trace_oom_score_adj_update(task);
 +
 +      if (mm) {
 +              struct task_struct *p;
 +
 +              rcu_read_lock();
 +              for_each_process(p) {
 +                      if (same_thread_group(task, p))
 +                              continue;
 +
 +                      /* do not touch kernel threads or the global init */
 +                      if (p->flags & PF_KTHREAD || is_global_init(p))
 +                              continue;
 +
 +                      task_lock(p);
 +                      if (!p->vfork_done && process_shares_mm(p, mm)) {
 +                              pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
 +                                              task_pid_nr(p), p->comm,
 +                                              p->signal->oom_score_adj, oom_adj,
 +                                              task_pid_nr(task), task->comm);
 +                              p->signal->oom_score_adj = oom_adj;
 +                              if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
 +                                      p->signal->oom_score_adj_min = (short)oom_adj;
 +                      }
 +                      task_unlock(p);
 +              }
 +              rcu_read_unlock();
 +              mmdrop(mm);
 +      }
 +err_unlock:
 +      mutex_unlock(&oom_adj_mutex);
 +      put_task_struct(task);
 +      return err;
 +}
 +
  /*
   * /proc/pid/oom_adj exists solely for backwards compatibility with previous
   * kernels.  The effective policy is defined by oom_score_adj, which has a
  static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
  {
 -      struct task_struct *task;
        char buffer[PROC_NUMBUF];
        int oom_adj;
 -      unsigned long flags;
        int err;
  
        memset(buffer, 0, sizeof(buffer));
                goto out;
        }
  
 -      task = get_proc_task(file_inode(file));
 -      if (!task) {
 -              err = -ESRCH;
 -              goto out;
 -      }
 -
 -      task_lock(task);
 -      if (!task->mm) {
 -              err = -EINVAL;
 -              goto err_task_lock;
 -      }
 -
 -      if (!lock_task_sighand(task, &flags)) {
 -              err = -ESRCH;
 -              goto err_task_lock;
 -      }
 -
        /*
         * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
         * value is always attainable.
        else
                oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
  
 -      if (oom_adj < task->signal->oom_score_adj &&
 -          !capable(CAP_SYS_RESOURCE)) {
 -              err = -EACCES;
 -              goto err_sighand;
 -      }
 -
 -      /*
 -       * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
 -       * /proc/pid/oom_score_adj instead.
 -       */
 -      pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
 -                current->comm, task_pid_nr(current), task_pid_nr(task),
 -                task_pid_nr(task));
 -
 -      task->signal->oom_score_adj = oom_adj;
 -      trace_oom_score_adj_update(task);
 -err_sighand:
 -      unlock_task_sighand(task, &flags);
 -err_task_lock:
 -      task_unlock(task);
 -      put_task_struct(task);
 +      err = __set_oom_adj(file, oom_adj, true);
  out:
        return err < 0 ? err : count;
  }
@@@ -1182,11 -1140,15 +1182,11 @@@ static ssize_t oom_score_adj_read(struc
        struct task_struct *task = get_proc_task(file_inode(file));
        char buffer[PROC_NUMBUF];
        short oom_score_adj = OOM_SCORE_ADJ_MIN;
 -      unsigned long flags;
        size_t len;
  
        if (!task)
                return -ESRCH;
 -      if (lock_task_sighand(task, &flags)) {
 -              oom_score_adj = task->signal->oom_score_adj;
 -              unlock_task_sighand(task, &flags);
 -      }
 +      oom_score_adj = task->signal->oom_score_adj;
        put_task_struct(task);
        len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
        return simple_read_from_buffer(buf, count, ppos, buffer, len);
  static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                                        size_t count, loff_t *ppos)
  {
 -      struct task_struct *task;
        char buffer[PROC_NUMBUF];
 -      unsigned long flags;
        int oom_score_adj;
        int err;
  
                goto out;
        }
  
 -      task = get_proc_task(file_inode(file));
 -      if (!task) {
 -              err = -ESRCH;
 -              goto out;
 -      }
 -
 -      task_lock(task);
 -      if (!task->mm) {
 -              err = -EINVAL;
 -              goto err_task_lock;
 -      }
 -
 -      if (!lock_task_sighand(task, &flags)) {
 -              err = -ESRCH;
 -              goto err_task_lock;
 -      }
 -
 -      if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
 -                      !capable(CAP_SYS_RESOURCE)) {
 -              err = -EACCES;
 -              goto err_sighand;
 -      }
 -
 -      task->signal->oom_score_adj = (short)oom_score_adj;
 -      if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
 -              task->signal->oom_score_adj_min = (short)oom_score_adj;
 -      trace_oom_score_adj_update(task);
 -
 -err_sighand:
 -      unlock_task_sighand(task, &flags);
 -err_task_lock:
 -      task_unlock(task);
 -      put_task_struct(task);
 +      err = __set_oom_adj(file, oom_score_adj, false);
  out:
        return err < 0 ? err : count;
  }
@@@ -1556,18 -1552,13 +1556,13 @@@ static const struct file_operations pro
  static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
  {
        struct task_struct *task;
-       struct mm_struct *mm;
        struct file *exe_file;
  
        task = get_proc_task(d_inode(dentry));
        if (!task)
                return -ENOENT;
-       mm = get_task_mm(task);
+       exe_file = get_task_exe_file(task);
        put_task_struct(task);
-       if (!mm)
-               return -ENOENT;
-       exe_file = get_mm_exe_file(mm);
-       mmput(mm);
        if (exe_file) {
                *exe_path = exe_file->f_path;
                path_get(&exe_file->f_path);
@@@ -1824,17 -1815,12 +1819,17 @@@ bool proc_fill_cache(struct file *file
  
        child = d_hash_and_lookup(dir, &qname);
        if (!child) {
 -              child = d_alloc(dir, &qname);
 -              if (!child)
 -                      goto end_instantiate;
 -              if (instantiate(d_inode(dir), child, task, ptr) < 0) {
 -                      dput(child);
 +              DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 +              child = d_alloc_parallel(dir, &qname, &wq);
 +              if (IS_ERR(child))
                        goto end_instantiate;
 +              if (d_in_lookup(child)) {
 +                      int err = instantiate(d_inode(dir), child, task, ptr);
 +                      d_lookup_done(child);
 +                      if (err < 0) {
 +                              dput(child);
 +                              goto end_instantiate;
 +                      }
                }
        }
        inode = d_inode(child);
@@@ -2164,8 -2150,8 +2159,8 @@@ out
  
  static const struct file_operations proc_map_files_operations = {
        .read           = generic_read_dir,
 -      .iterate        = proc_map_files_readdir,
 -      .llseek         = default_llseek,
 +      .iterate_shared = proc_map_files_readdir,
 +      .llseek         = generic_file_llseek,
  };
  
  #ifdef CONFIG_CHECKPOINT_RESTORE
@@@ -2512,8 -2498,8 +2507,8 @@@ static int proc_attr_dir_readdir(struc
  
  static const struct file_operations proc_attr_dir_operations = {
        .read           = generic_read_dir,
 -      .iterate        = proc_attr_dir_readdir,
 -      .llseek         = default_llseek,
 +      .iterate_shared = proc_attr_dir_readdir,
 +      .llseek         = generic_file_llseek,
  };
  
  static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@@ -2920,8 -2906,8 +2915,8 @@@ static int proc_tgid_base_readdir(struc
  
  static const struct file_operations proc_tgid_base_operations = {
        .read           = generic_read_dir,
 -      .iterate        = proc_tgid_base_readdir,
 -      .llseek         = default_llseek,
 +      .iterate_shared = proc_tgid_base_readdir,
 +      .llseek         = generic_file_llseek,
  };
  
  static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@@ -3166,44 -3152,6 +3161,44 @@@ int proc_pid_readdir(struct file *file
        return 0;
  }
  
 +/*
 + * proc_tid_comm_permission is a special permission function exclusively
 + * used for the node /proc/<pid>/task/<tid>/comm.
 + * It bypasses generic permission checks in the case where a task of the same
 + * task group attempts to access the node.
 + * The rationale behind this is that glibc and bionic access this node for
 + * cross thread naming (pthread_set/getname_np(!self)). However, if
 + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
 + * which locks out the cross thread naming implementation.
 + * This function makes sure that the node is always accessible for members of
 + * same thread group.
 + */
 +static int proc_tid_comm_permission(struct inode *inode, int mask)
 +{
 +      bool is_same_tgroup;
 +      struct task_struct *task;
 +
 +      task = get_proc_task(inode);
 +      if (!task)
 +              return -ESRCH;
 +      is_same_tgroup = same_thread_group(current, task);
 +      put_task_struct(task);
 +
 +      if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
 +              /* This file (/proc/<pid>/task/<tid>/comm) can always be
 +               * read or written by the members of the corresponding
 +               * thread group.
 +               */
 +              return 0;
 +      }
 +
 +      return generic_permission(inode, mask);
 +}
 +
 +static const struct inode_operations proc_tid_comm_inode_operations = {
 +              .permission = proc_tid_comm_permission,
 +};
 +
  /*
   * Tasks
   */
@@@ -3222,9 -3170,7 +3217,9 @@@ static const struct pid_entry tid_base_
  #ifdef CONFIG_SCHED_DEBUG
        REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
  #endif
 -      REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
 +      NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
 +                       &proc_tid_comm_inode_operations,
 +                       &proc_pid_set_comm_operations, {}),
  #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
        ONE("syscall",   S_IRUSR, proc_pid_syscall),
  #endif
@@@ -3308,8 -3254,8 +3303,8 @@@ static struct dentry *proc_tid_base_loo
  
  static const struct file_operations proc_tid_base_operations = {
        .read           = generic_read_dir,
 -      .iterate        = proc_tid_base_readdir,
 -      .llseek         = default_llseek,
 +      .iterate_shared = proc_tid_base_readdir,
 +      .llseek         = generic_file_llseek,
  };
  
  static const struct inode_operations proc_tid_base_inode_operations = {
@@@ -3519,6 -3465,6 +3514,6 @@@ static const struct inode_operations pr
  
  static const struct file_operations proc_task_operations = {
        .read           = generic_read_dir,
 -      .iterate        = proc_task_readdir,
 -      .llseek         = default_llseek,
 +      .iterate_shared = proc_task_readdir,
 +      .llseek         = generic_file_llseek,
  };
diff --combined include/linux/mm.h
@@@ -72,10 -72,6 +72,10 @@@ extern int mmap_rnd_compat_bits __read_
  #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
  #endif
  
 +#ifndef page_to_virt
 +#define page_to_virt(x)       __va(PFN_PHYS(page_to_pfn(x)))
 +#endif
 +
  /*
   * To prevent common memory management code establishing
   * a zero page mapping on a read fault.
@@@ -303,40 -299,10 +303,40 @@@ struct vm_fault 
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
 -      /* for ->map_pages() only */
 -      pgoff_t max_pgoff;              /* map pages for offset from pgoff till
 -                                       * max_pgoff inclusive */
 -      pte_t *pte;                     /* pte entry associated with ->pgoff */
 +      void *entry;                    /* ->fault handler can alternatively
 +                                       * return locked DAX entry. In that
 +                                       * case handler should return
 +                                       * VM_FAULT_DAX_LOCKED and fill in
 +                                       * entry here.
 +                                       */
 +};
 +
 +/*
 + * Page fault context: passes through the page fault handler instead of an endless list
 + * of function arguments.
 + */
 +struct fault_env {
 +      struct vm_area_struct *vma;     /* Target VMA */
 +      unsigned long address;          /* Faulting virtual address */
 +      unsigned int flags;             /* FAULT_FLAG_xxx flags */
 +      pmd_t *pmd;                     /* Pointer to pmd entry matching
 +                                       * the 'address'
 +                                       */
 +      pte_t *pte;                     /* Pointer to pte entry matching
 +                                       * the 'address'. NULL if the page
 +                                       * table hasn't been allocated.
 +                                       */
 +      spinlock_t *ptl;                /* Page table lock.
 +                                       * Protects pte page table if 'pte'
 +                                       * is not NULL, otherwise pmd.
 +                                       */
 +      pgtable_t prealloc_pte;         /* Pre-allocated pte page table.
 +                                       * vm_ops->map_pages() calls
 +                                       * alloc_set_pte() from atomic context.
 +                                       * do_fault_around() pre-allocates
 +                                       * page table to avoid allocation from
 +                                       * atomic context.
 +                                       */
  };
  
  /*
@@@ -351,8 -317,7 +351,8 @@@ struct vm_operations_struct 
        int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
        int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
                                                pmd_t *, unsigned int flags);
 -      void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 +      void (*map_pages)(struct fault_env *fe,
 +                      pgoff_t start_pgoff, pgoff_t end_pgoff);
  
        /* notification that a previously read-only page is about to become
         * writable, if an error is returned it will cause a SIGBUS */
@@@ -478,14 -443,14 +478,14 @@@ unsigned long vmalloc_to_pfn(const voi
   * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
   * is no special casing required.
   */
 -static inline int is_vmalloc_addr(const void *x)
 +static inline bool is_vmalloc_addr(const void *x)
  {
  #ifdef CONFIG_MMU
        unsigned long addr = (unsigned long)x;
  
        return addr >= VMALLOC_START && addr < VMALLOC_END;
  #else
 -      return 0;
 +      return false;
  #endif
  }
  #ifdef CONFIG_MMU
@@@ -506,7 -471,8 +506,7 @@@ static inline atomic_t *compound_mapcou
  
  static inline int compound_mapcount(struct page *page)
  {
 -      if (!PageCompound(page))
 -              return 0;
 +      VM_BUG_ON_PAGE(!PageCompound(page), page);
        page = compound_head(page);
        return atomic_read(compound_mapcount_ptr(page)) + 1;
  }
@@@ -562,6 -528,7 +562,6 @@@ void __put_page(struct page *page)
  void put_pages_list(struct list_head *pages);
  
  void split_page(struct page *page, unsigned int order);
 -int split_free_page(struct page *page);
  
  /*
   * Compound pages have a destructor function.  Provide a
@@@ -625,8 -592,8 +625,8 @@@ static inline pte_t maybe_mkwrite(pte_
        return pte;
  }
  
 -void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 -              struct page *page, pte_t *pte, bool write, bool anon);
 +int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
 +              struct page *page);
  #endif
  
  /*
@@@ -763,7 -730,7 +763,7 @@@ static inline void get_page(struct pag
        page = compound_head(page);
        /*
         * Getting a normal page or the head of a compound page
 -       * requires to already have an elevated page->_count.
 +       * requires to already have an elevated page->_refcount.
         */
        VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
        page_ref_inc(page);
@@@ -879,7 -846,10 +879,7 @@@ extern int page_cpupid_xchg_last(struc
  
  static inline void page_cpupid_reset_last(struct page *page)
  {
 -      int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
 -
 -      page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
 -      page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
 +      page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
  }
  #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
  #else /* !CONFIG_NUMA_BALANCING */
@@@ -933,11 -903,6 +933,11 @@@ static inline struct zone *page_zone(co
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
  }
  
 +static inline pg_data_t *page_pgdat(const struct page *page)
 +{
 +      return NODE_DATA(page_to_nid(page));
 +}
 +
  #ifdef SECTION_IN_PAGE_FLAGS
  static inline void set_page_section(struct page *page, unsigned long section)
  {
@@@ -978,21 -943,11 +978,21 @@@ static inline struct mem_cgroup *page_m
  {
        return page->mem_cgroup;
  }
 +static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
 +{
 +      WARN_ON_ONCE(!rcu_read_lock_held());
 +      return READ_ONCE(page->mem_cgroup);
 +}
  #else
  static inline struct mem_cgroup *page_memcg(struct page *page)
  {
        return NULL;
  }
 +static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
 +{
 +      WARN_ON_ONCE(!rcu_read_lock_held());
 +      return NULL;
 +}
  #endif
  
  /*
  
  static __always_inline void *lowmem_page_address(const struct page *page)
  {
 -      return __va(PFN_PHYS(page_to_pfn(page)));
 +      return page_to_virt(page);
  }
  
  #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
@@@ -1073,8 -1028,26 +1073,8 @@@ static inline pgoff_t page_file_index(s
        return page->index;
  }
  
 -/*
 - * Return true if this page is mapped into pagetables.
 - * For compound page it returns true if any subpage of compound page is mapped.
 - */
 -static inline bool page_mapped(struct page *page)
 -{
 -      int i;
 -      if (likely(!PageCompound(page)))
 -              return atomic_read(&page->_mapcount) >= 0;
 -      page = compound_head(page);
 -      if (atomic_read(compound_mapcount_ptr(page)) >= 0)
 -              return true;
 -      if (PageHuge(page))
 -              return false;
 -      for (i = 0; i < hpage_nr_pages(page); i++) {
 -              if (atomic_read(&page[i]._mapcount) >= 0)
 -                      return true;
 -      }
 -      return false;
 -}
 +bool page_mapped(struct page *page);
 +struct address_space *page_mapping(struct page *page);
  
  /*
   * Return true only if the page has been allocated with
@@@ -1122,7 -1095,6 +1122,7 @@@ static inline void clear_page_pfmemallo
  #define VM_FAULT_LOCKED       0x0200  /* ->fault locked the returned page */
  #define VM_FAULT_RETRY        0x0400  /* ->fault blocked, must retry */
  #define VM_FAULT_FALLBACK 0x0800      /* huge page fault failed, fall back to small */
 +#define VM_FAULT_DAX_LOCKED 0x1000    /* ->fault has locked DAX entry */
  
  #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
  
@@@ -1255,14 -1227,15 +1255,14 @@@ int generic_error_remove_page(struct ad
  int invalidate_inode_page(struct page *page);
  
  #ifdef CONFIG_MMU
 -extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 -                      unsigned long address, unsigned int flags);
 +extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 +              unsigned int flags);
  extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
  #else
 -static inline int handle_mm_fault(struct mm_struct *mm,
 -                      struct vm_area_struct *vma, unsigned long address,
 -                      unsigned int flags)
 +static inline int handle_mm_fault(struct vm_area_struct *vma,
 +              unsigned long address, unsigned int flags)
  {
        /* should never happen if there's no MMU */
        BUG();
@@@ -1809,7 -1782,7 +1809,7 @@@ extern void free_highmem_page(struct pa
  extern void adjust_managed_page_count(struct page *page, long count);
  extern void mem_init_print_info(const char *str);
  
 -extern void reserve_bootmem_region(unsigned long start, unsigned long end);
 +extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
  
  /* Free the reserved page into the buddy system, so it gets managed. */
  static inline void __free_reserved_page(struct page *page)
@@@ -2014,6 -1987,7 +2014,7 @@@ extern void mm_drop_all_locks(struct mm
  
  extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
  extern struct file *get_mm_exe_file(struct mm_struct *mm);
+ extern struct file *get_task_exe_file(struct task_struct *task);
  
  extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
  extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
@@@ -2057,9 -2031,9 +2058,9 @@@ static inline void mm_populate(unsigne
  #endif
  
  /* These take the mm semaphore themselves */
 -extern unsigned long vm_brk(unsigned long, unsigned long);
 +extern int __must_check vm_brk(unsigned long, unsigned long);
  extern int vm_munmap(unsigned long, size_t);
 -extern unsigned long vm_mmap(struct file *, unsigned long,
 +extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
          unsigned long, unsigned long,
          unsigned long, unsigned long);
  
@@@ -2102,8 -2076,7 +2103,8 @@@ extern void truncate_inode_pages_final(
  
  /* generic vm_area_ops exported for stackable file systems */
  extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
 -extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf);
 +extern void filemap_map_pages(struct fault_env *fe,
 +              pgoff_t start_pgoff, pgoff_t end_pgoff);
  extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
  
  /* mm/page-writeback.c */
@@@ -2299,8 -2272,6 +2300,8 @@@ static inline int in_gate_area(struct m
  }
  #endif        /* __HAVE_ARCH_GATE_AREA */
  
 +extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
 +
  #ifdef CONFIG_SYSCTL
  extern int sysctl_drop_caches;
  int drop_caches_sysctl_handler(struct ctl_table *, int,
@@@ -2435,9 -2406,6 +2436,9 @@@ static inline bool page_is_guard(struc
                return false;
  
        page_ext = lookup_page_ext(page);
 +      if (unlikely(!page_ext))
 +              return false;
 +
        return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
  }
  #else
diff --combined kernel/audit_watch.c
@@@ -19,6 -19,7 +19,7 @@@
   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
   */
  
+ #include <linux/file.h>
  #include <linux/kernel.h>
  #include <linux/audit.h>
  #include <linux/kthread.h>
@@@ -367,7 -368,7 +368,7 @@@ static int audit_get_nd(struct audit_wa
        inode_unlock(d_backing_inode(parent->dentry));
        if (d_is_positive(d)) {
                /* update watch filter fields */
 -              watch->dev = d_backing_inode(d)->i_sb->s_dev;
 +              watch->dev = d->d_sb->s_dev;
                watch->ino = d_backing_inode(d)->i_ino;
        }
        dput(d);
@@@ -544,10 -545,11 +545,11 @@@ int audit_exe_compare(struct task_struc
        unsigned long ino;
        dev_t dev;
  
-       rcu_read_lock();
-       exe_file = rcu_dereference(tsk->mm->exe_file);
+       exe_file = get_task_exe_file(tsk);
+       if (!exe_file)
+               return 0;
        ino = exe_file->f_inode->i_ino;
        dev = exe_file->f_inode->i_sb->s_dev;
-       rcu_read_unlock();
+       fput(exe_file);
        return audit_mark_compare(mark, ino, dev);
  }
diff --combined kernel/fork.c
@@@ -148,49 -148,57 +148,49 @@@ static inline void free_task_struct(str
  }
  #endif
  
 -void __weak arch_release_thread_info(struct thread_info *ti)
 +void __weak arch_release_thread_stack(unsigned long *stack)
  {
  }
  
 -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
 +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
  
  /*
   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
   * kmemcache based allocator.
   */
  # if THREAD_SIZE >= PAGE_SIZE
 -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                  int node)
  {
 -      struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
 -                                                THREAD_SIZE_ORDER);
 -
 -      if (page)
 -              memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
 -                                          1 << THREAD_SIZE_ORDER);
 +      struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 +                                           THREAD_SIZE_ORDER);
  
        return page ? page_address(page) : NULL;
  }
  
 -static inline void free_thread_info(struct thread_info *ti)
 +static inline void free_thread_stack(unsigned long *stack)
  {
 -      struct page *page = virt_to_page(ti);
 -
 -      memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
 -                                  -(1 << THREAD_SIZE_ORDER));
 -      __free_kmem_pages(page, THREAD_SIZE_ORDER);
 +      __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
  }
  # else
 -static struct kmem_cache *thread_info_cache;
 +static struct kmem_cache *thread_stack_cache;
  
 -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                  int node)
  {
 -      return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
 +      return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
  }
  
 -static void free_thread_info(struct thread_info *ti)
 +static void free_thread_stack(unsigned long *stack)
  {
 -      kmem_cache_free(thread_info_cache, ti);
 +      kmem_cache_free(thread_stack_cache, stack);
  }
  
 -void thread_info_cache_init(void)
 +void thread_stack_cache_init(void)
  {
 -      thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
 +      thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
                                              THREAD_SIZE, 0, NULL);
 -      BUG_ON(thread_info_cache == NULL);
 +      BUG_ON(thread_stack_cache == NULL);
  }
  # endif
  #endif
@@@ -213,24 -221,18 +213,24 @@@ struct kmem_cache *vm_area_cachep
  /* SLAB cache for mm_struct structures (tsk->mm) */
  static struct kmem_cache *mm_cachep;
  
 -static void account_kernel_stack(struct thread_info *ti, int account)
 +static void account_kernel_stack(unsigned long *stack, int account)
  {
 -      struct zone *zone = page_zone(virt_to_page(ti));
 +      /* All stack pages are in the same zone and belong to the same memcg. */
 +      struct page *first_page = virt_to_page(stack);
 +
 +      mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
 +                          THREAD_SIZE / 1024 * account);
  
 -      mod_zone_page_state(zone, NR_KERNEL_STACK, account);
 +      memcg_kmem_update_page_stat(
 +              first_page, MEMCG_KERNEL_STACK_KB,
 +              account * (THREAD_SIZE / 1024));
  }
  
  void free_task(struct task_struct *tsk)
  {
        account_kernel_stack(tsk->stack, -1);
 -      arch_release_thread_info(tsk->stack);
 -      free_thread_info(tsk->stack);
 +      arch_release_thread_stack(tsk->stack);
 +      free_thread_stack(tsk->stack);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
@@@ -338,27 -340,26 +338,27 @@@ void set_task_stack_end_magic(struct ta
        *stackend = STACK_END_MAGIC;    /* for overflow detection */
  }
  
 -static struct task_struct *dup_task_struct(struct task_struct *orig)
 +static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
  {
        struct task_struct *tsk;
 -      struct thread_info *ti;
 -      int node = tsk_fork_get_node(orig);
 +      unsigned long *stack;
        int err;
  
 +      if (node == NUMA_NO_NODE)
 +              node = tsk_fork_get_node(orig);
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;
  
 -      ti = alloc_thread_info_node(tsk, node);
 -      if (!ti)
 +      stack = alloc_thread_stack_node(tsk, node);
 +      if (!stack)
                goto free_tsk;
  
        err = arch_dup_task_struct(tsk, orig);
        if (err)
 -              goto free_ti;
 +              goto free_stack;
  
 -      tsk->stack = ti;
 +      tsk->stack = stack;
  #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
  
 -      account_kernel_stack(ti, 1);
 +      account_kernel_stack(stack, 1);
  
        kcov_task_init(tsk);
  
        return tsk;
  
 -free_ti:
 -      free_thread_info(ti);
 +free_stack:
 +      free_thread_stack(stack);
  free_tsk:
        free_task_struct(tsk);
        return NULL;
@@@ -412,10 -413,7 +412,10 @@@ static int dup_mmap(struct mm_struct *m
        unsigned long charge;
  
        uprobe_start_dup_mmap();
 -      down_write(&oldmm->mmap_sem);
 +      if (down_write_killable(&oldmm->mmap_sem)) {
 +              retval = -EINTR;
 +              goto fail_uprobe_end;
 +      }
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
@@@ -527,7 -525,6 +527,7 @@@ out
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
 +fail_uprobe_end:
        uprobe_end_dup_mmap();
        return retval;
  fail_nomem_anon_vma_fork:
@@@ -702,26 -699,6 +702,26 @@@ void __mmdrop(struct mm_struct *mm
  }
  EXPORT_SYMBOL_GPL(__mmdrop);
  
 +static inline void __mmput(struct mm_struct *mm)
 +{
 +      VM_BUG_ON(atomic_read(&mm->mm_users));
 +
 +      uprobe_clear_state(mm);
 +      exit_aio(mm);
 +      ksm_exit(mm);
 +      khugepaged_exit(mm); /* must run before exit_mmap */
 +      exit_mmap(mm);
 +      set_mm_exe_file(mm, NULL);
 +      if (!list_empty(&mm->mmlist)) {
 +              spin_lock(&mmlist_lock);
 +              list_del(&mm->mmlist);
 +              spin_unlock(&mmlist_lock);
 +      }
 +      if (mm->binfmt)
 +              module_put(mm->binfmt->module);
 +      mmdrop(mm);
 +}
 +
  /*
   * Decrement the use count and release all resources for an mm.
   */
@@@ -729,26 -706,24 +729,26 @@@ void mmput(struct mm_struct *mm
  {
        might_sleep();
  
 +      if (atomic_dec_and_test(&mm->mm_users))
 +              __mmput(mm);
 +}
 +EXPORT_SYMBOL_GPL(mmput);
 +
 +#ifdef CONFIG_MMU
 +static void mmput_async_fn(struct work_struct *work)
 +{
 +      struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
 +      __mmput(mm);
 +}
 +
 +void mmput_async(struct mm_struct *mm)
 +{
        if (atomic_dec_and_test(&mm->mm_users)) {
 -              uprobe_clear_state(mm);
 -              exit_aio(mm);
 -              ksm_exit(mm);
 -              khugepaged_exit(mm); /* must run before exit_mmap */
 -              exit_mmap(mm);
 -              set_mm_exe_file(mm, NULL);
 -              if (!list_empty(&mm->mmlist)) {
 -                      spin_lock(&mmlist_lock);
 -                      list_del(&mm->mmlist);
 -                      spin_unlock(&mmlist_lock);
 -              }
 -              if (mm->binfmt)
 -                      module_put(mm->binfmt->module);
 -              mmdrop(mm);
 +              INIT_WORK(&mm->async_put_work, mmput_async_fn);
 +              schedule_work(&mm->async_put_work);
        }
  }
 -EXPORT_SYMBOL_GPL(mmput);
 +#endif
  
  /**
   * set_mm_exe_file - change a reference to the mm's executable file
@@@ -797,6 -772,29 +797,29 @@@ struct file *get_mm_exe_file(struct mm_
        return exe_file;
  }
  EXPORT_SYMBOL(get_mm_exe_file);
+ /**
+  * get_task_exe_file - acquire a reference to the task's executable file
+  *
+  * Returns %NULL if task's mm (if any) has no associated executable file or
+  * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+  * User must release file via fput().
+  */
+ struct file *get_task_exe_file(struct task_struct *task)
+ {
+       struct file *exe_file = NULL;
+       struct mm_struct *mm;
+       task_lock(task);
+       mm = task->mm;
+       if (mm) {
+               if (!(task->flags & PF_KTHREAD))
+                       exe_file = get_mm_exe_file(mm);
+       }
+       task_unlock(task);
+       return exe_file;
+ }
+ EXPORT_SYMBOL(get_task_exe_file);
  
  /**
   * get_task_mm - acquire a reference to the task's mm
@@@ -1281,8 -1279,7 +1304,8 @@@ static struct task_struct *copy_process
                                        int __user *child_tidptr,
                                        struct pid *pid,
                                        int trace,
 -                                      unsigned long tls)
 +                                      unsigned long tls,
 +                                      int node)
  {
        int retval;
        struct task_struct *p;
                goto fork_out;
  
        retval = -ENOMEM;
 -      p = dup_task_struct(current);
 +      p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
  
        p->real_start_time = ktime_get_boot_ns();
        p->io_context = NULL;
        p->audit_context = NULL;
 -      threadgroup_change_begin(current);
        cgroup_fork(p);
  #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
                pid = alloc_pid(p->nsproxy->pid_ns_for_children);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
 -                      goto bad_fork_cleanup_io;
 +                      goto bad_fork_cleanup_thread;
                }
        }
  
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
 -              p->sas_ss_sp = p->sas_ss_size = 0;
 +              sas_ss_reset(p);
  
        /*
         * Syscall tracing and stepping should be turned off in the
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
  
 +      threadgroup_change_begin(current);
        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
  bad_fork_cancel_cgroup:
        cgroup_cancel_fork(p);
  bad_fork_free_pid:
 +      threadgroup_change_end(current);
        if (pid != &init_struct_pid)
                free_pid(pid);
 +bad_fork_cleanup_thread:
 +      exit_thread(p);
  bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
@@@ -1689,6 -1683,7 +1712,6 @@@ bad_fork_cleanup_policy
        mpol_put(p->mempolicy);
  bad_fork_cleanup_threadgroup_lock:
  #endif
 -      threadgroup_change_end(current);
        delayacct_tsk_free(p);
  bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
@@@ -1712,8 -1707,7 +1735,8 @@@ static inline void init_idle_pids(struc
  struct task_struct *fork_idle(int cpu)
  {
        struct task_struct *task;
 -      task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
 +      task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
 +                          cpu_to_node(cpu));
        if (!IS_ERR(task)) {
                init_idle_pids(task->pids);
                init_idle(task, cpu);
@@@ -1758,7 -1752,7 +1781,7 @@@ long _do_fork(unsigned long clone_flags
        }
  
        p = copy_process(clone_flags, stack_start, stack_size,
 -                       child_tidptr, NULL, trace, tls);
 +                       child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.