Merge branch 'stable-4.8' of git://git.infradead.org/users/pcmoore/audit

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 1 Sep 2016 22:55:56 +0000 (15:55 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 1 Sep 2016 22:55:56 +0000 (15:55 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Sep 2016 22:55:56 +0000 (15:55 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Sep 2016 22:55:56 +0000 (15:55 -0700)
diff --combined fs/proc/base.c

index 54e2702,da8b194..ac0df4d
--- 1/fs/proc/base.c
--- 2/fs/proc/base.c
+++ b/fs/proc/base.c
@@@ -579,8 -579,11 +579,8 @@@ static int proc_oom_score(struct seq_fi
         unsigned long totalpages = totalram_pages + total_swap_pages;
         unsigned long points = 0;
   
- -      read_lock(&tasklist_lock);
- -      if (pid_alive(task))
- -              points = oom_badness(task, NULL, NULL, totalpages) *
- -                                              1000 / totalpages;
- -      read_unlock(&tasklist_lock);
+ +      points = oom_badness(task, NULL, NULL, totalpages) *
+ +                                      1000 / totalpages;
         seq_printf(m, "%lu\n", points);
   
         return 0;
@@@ -1021,107 -1024,23 +1021,107 @@@ static ssize_t oom_adj_read(struct fil
         char buffer[PROC_NUMBUF];
         int oom_adj = OOM_ADJUST_MIN;
         size_t len;
- -      unsigned long flags;
   
         if (!task)
                 return -ESRCH;
- -      if (lock_task_sighand(task, &flags)) {
- -              if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
- -                      oom_adj = OOM_ADJUST_MAX;
- -              else
- -                      oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
- -                                OOM_SCORE_ADJ_MAX;
- -              unlock_task_sighand(task, &flags);
- -      }
+ +      if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+ +              oom_adj = OOM_ADJUST_MAX;
+ +      else
+ +              oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+ +                        OOM_SCORE_ADJ_MAX;
         put_task_struct(task);
         len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
         return simple_read_from_buffer(buf, count, ppos, buffer, len);
   }
   
+ +static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
+ +{
+ +      static DEFINE_MUTEX(oom_adj_mutex);
+ +      struct mm_struct *mm = NULL;
+ +      struct task_struct *task;
+ +      int err = 0;
+ +
+ +      task = get_proc_task(file_inode(file));
+ +      if (!task)
+ +              return -ESRCH;
+ +
+ +      mutex_lock(&oom_adj_mutex);
+ +      if (legacy) {
+ +              if (oom_adj < task->signal->oom_score_adj &&
+ +                              !capable(CAP_SYS_RESOURCE)) {
+ +                      err = -EACCES;
+ +                      goto err_unlock;
+ +              }
+ +              /*
+ +               * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+ +               * /proc/pid/oom_score_adj instead.
+ +               */
+ +              pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ +                        current->comm, task_pid_nr(current), task_pid_nr(task),
+ +                        task_pid_nr(task));
+ +      } else {
+ +              if ((short)oom_adj < task->signal->oom_score_adj_min &&
+ +                              !capable(CAP_SYS_RESOURCE)) {
+ +                      err = -EACCES;
+ +                      goto err_unlock;
+ +              }
+ +      }
+ +
+ +      /*
+ +       * Make sure we will check other processes sharing the mm if this is
+ +       * not vfork which wants its own oom_score_adj.
+ +       * pin the mm so it doesn't go away and get reused after task_unlock
+ +       */
+ +      if (!task->vfork_done) {
+ +              struct task_struct *p = find_lock_task_mm(task);
+ +
+ +              if (p) {
+ +                      if (atomic_read(&p->mm->mm_users) > 1) {
+ +                              mm = p->mm;
+ +                              atomic_inc(&mm->mm_count);
+ +                      }
+ +                      task_unlock(p);
+ +              }
+ +      }
+ +
+ +      task->signal->oom_score_adj = oom_adj;
+ +      if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ +              task->signal->oom_score_adj_min = (short)oom_adj;
+ +      trace_oom_score_adj_update(task);
+ +
+ +      if (mm) {
+ +              struct task_struct *p;
+ +
+ +              rcu_read_lock();
+ +              for_each_process(p) {
+ +                      if (same_thread_group(task, p))
+ +                              continue;
+ +
+ +                      /* do not touch kernel threads or the global init */
+ +                      if (p->flags & PF_KTHREAD || is_global_init(p))
+ +                              continue;
+ +
+ +                      task_lock(p);
+ +                      if (!p->vfork_done && process_shares_mm(p, mm)) {
+ +                              pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
+ +                                              task_pid_nr(p), p->comm,
+ +                                              p->signal->oom_score_adj, oom_adj,
+ +                                              task_pid_nr(task), task->comm);
+ +                              p->signal->oom_score_adj = oom_adj;
+ +                              if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ +                                      p->signal->oom_score_adj_min = (short)oom_adj;
+ +                      }
+ +                      task_unlock(p);
+ +              }
+ +              rcu_read_unlock();
+ +              mmdrop(mm);
+ +      }
+ +err_unlock:
+ +      mutex_unlock(&oom_adj_mutex);
+ +      put_task_struct(task);
+ +      return err;
+ +}
+ +
   /*
    * /proc/pid/oom_adj exists solely for backwards compatibility with previous
    * kernels.  The effective policy is defined by oom_score_adj, which has a
@@@ -1135,8 -1054,10 +1135,8 @@@
   static ssize_t oom_adj_write(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
   {
- -      struct task_struct *task;
         char buffer[PROC_NUMBUF];
         int oom_adj;
- -      unsigned long flags;
         int err;
   
         memset(buffer, 0, sizeof(buffer));
@@@ -1156,6 -1077,23 +1156,6 @@@
                 goto out;
         }
   
- -      task = get_proc_task(file_inode(file));
- -      if (!task) {
- -              err = -ESRCH;
- -              goto out;
- -      }
- -
- -      task_lock(task);
- -      if (!task->mm) {
- -              err = -EINVAL;
- -              goto err_task_lock;
- -      }
- -
- -      if (!lock_task_sighand(task, &flags)) {
- -              err = -ESRCH;
- -              goto err_task_lock;
- -      }
- -
         /*
          * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
          * value is always attainable.
@@@ -1165,7 -1103,27 +1165,7 @@@
         else
                 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
   
- -      if (oom_adj < task->signal->oom_score_adj &&
- -          !capable(CAP_SYS_RESOURCE)) {
- -              err = -EACCES;
- -              goto err_sighand;
- -      }
- -
- -      /*
- -       * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
- -       * /proc/pid/oom_score_adj instead.
- -       */
- -      pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
- -                current->comm, task_pid_nr(current), task_pid_nr(task),
- -                task_pid_nr(task));
- -
- -      task->signal->oom_score_adj = oom_adj;
- -      trace_oom_score_adj_update(task);
- -err_sighand:
- -      unlock_task_sighand(task, &flags);
- -err_task_lock:
- -      task_unlock(task);
- -      put_task_struct(task);
+ +      err = __set_oom_adj(file, oom_adj, true);
   out:
         return err < 0 ? err : count;
   }
@@@ -1182,11 -1140,15 +1182,11 @@@ static ssize_t oom_score_adj_read(struc
         struct task_struct *task = get_proc_task(file_inode(file));
         char buffer[PROC_NUMBUF];
         short oom_score_adj = OOM_SCORE_ADJ_MIN;
- -      unsigned long flags;
         size_t len;
   
         if (!task)
                 return -ESRCH;
- -      if (lock_task_sighand(task, &flags)) {
- -              oom_score_adj = task->signal->oom_score_adj;
- -              unlock_task_sighand(task, &flags);
- -      }
+ +      oom_score_adj = task->signal->oom_score_adj;
         put_task_struct(task);
         len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
         return simple_read_from_buffer(buf, count, ppos, buffer, len);
@@@ -1195,7 -1157,9 +1195,7 @@@
   static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
                                         size_t count, loff_t *ppos)
   {
- -      struct task_struct *task;
         char buffer[PROC_NUMBUF];
- -      unsigned long flags;
         int oom_score_adj;
         int err;
   
@@@ -1216,7 -1180,39 +1216,7 @@@
                 goto out;
         }
   
- -      task = get_proc_task(file_inode(file));
- -      if (!task) {
- -              err = -ESRCH;
- -              goto out;
- -      }
- -
- -      task_lock(task);
- -      if (!task->mm) {
- -              err = -EINVAL;
- -              goto err_task_lock;
- -      }
- -
- -      if (!lock_task_sighand(task, &flags)) {
- -              err = -ESRCH;
- -              goto err_task_lock;
- -      }
- -
- -      if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
- -                      !capable(CAP_SYS_RESOURCE)) {
- -              err = -EACCES;
- -              goto err_sighand;
- -      }
- -
- -      task->signal->oom_score_adj = (short)oom_score_adj;
- -      if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
- -              task->signal->oom_score_adj_min = (short)oom_score_adj;
- -      trace_oom_score_adj_update(task);
- -
- -err_sighand:
- -      unlock_task_sighand(task, &flags);
- -err_task_lock:
- -      task_unlock(task);
- -      put_task_struct(task);
+ +      err = __set_oom_adj(file, oom_score_adj, false);
   out:
         return err < 0 ? err : count;
   }
@@@ -1556,18 -1552,13 +1556,13 @@@ static const struct file_operations pro
   static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
   {
         struct task_struct *task;
-       struct mm_struct *mm;
         struct file *exe_file;
   
         task = get_proc_task(d_inode(dentry));
         if (!task)
                 return -ENOENT;
-       mm = get_task_mm(task);
+       exe_file = get_task_exe_file(task);
         put_task_struct(task);
-       if (!mm)
-               return -ENOENT;
-       exe_file = get_mm_exe_file(mm);
-       mmput(mm);
         if (exe_file) {
                 *exe_path = exe_file->f_path;
                 path_get(&exe_file->f_path);
@@@ -1824,17 -1815,12 +1819,17 @@@ bool proc_fill_cache(struct file *file
   
         child = d_hash_and_lookup(dir, &qname);
         if (!child) {
- -              child = d_alloc(dir, &qname);
- -              if (!child)
- -                      goto end_instantiate;
- -              if (instantiate(d_inode(dir), child, task, ptr) < 0) {
- -                      dput(child);
+ +              DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ +              child = d_alloc_parallel(dir, &qname, &wq);
+ +              if (IS_ERR(child))
                         goto end_instantiate;
+ +              if (d_in_lookup(child)) {
+ +                      int err = instantiate(d_inode(dir), child, task, ptr);
+ +                      d_lookup_done(child);
+ +                      if (err < 0) {
+ +                              dput(child);
+ +                              goto end_instantiate;
+ +                      }
                 }
         }
         inode = d_inode(child);
@@@ -2164,8 -2150,8 +2159,8 @@@ out
   
   static const struct file_operations proc_map_files_operations = {
         .read           = generic_read_dir,
- -      .iterate        = proc_map_files_readdir,
- -      .llseek         = default_llseek,
+ +      .iterate_shared = proc_map_files_readdir,
+ +      .llseek         = generic_file_llseek,
   };
   
   #ifdef CONFIG_CHECKPOINT_RESTORE
@@@ -2512,8 -2498,8 +2507,8 @@@ static int proc_attr_dir_readdir(struc
   
   static const struct file_operations proc_attr_dir_operations = {
         .read           = generic_read_dir,
- -      .iterate        = proc_attr_dir_readdir,
- -      .llseek         = default_llseek,
+ +      .iterate_shared = proc_attr_dir_readdir,
+ +      .llseek         = generic_file_llseek,
   };
   
   static struct dentry *proc_attr_dir_lookup(struct inode *dir,
@@@ -2920,8 -2906,8 +2915,8 @@@ static int proc_tgid_base_readdir(struc
   
   static const struct file_operations proc_tgid_base_operations = {
         .read           = generic_read_dir,
- -      .iterate        = proc_tgid_base_readdir,
- -      .llseek         = default_llseek,
+ +      .iterate_shared = proc_tgid_base_readdir,
+ +      .llseek         = generic_file_llseek,
   };
   
   static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
@@@ -3166,44 -3152,6 +3161,44 @@@ int proc_pid_readdir(struct file *file
         return 0;
   }
   
+ +/*
+ + * proc_tid_comm_permission is a special permission function exclusively
+ + * used for the node /proc/<pid>/task/<tid>/comm.
+ + * It bypasses generic permission checks in the case where a task of the same
+ + * task group attempts to access the node.
+ + * The rationale behind this is that glibc and bionic access this node for
+ + * cross thread naming (pthread_set/getname_np(!self)). However, if
+ + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0,
+ + * which locks out the cross thread naming implementation.
+ + * This function makes sure that the node is always accessible for members of
+ + * same thread group.
+ + */
+ +static int proc_tid_comm_permission(struct inode *inode, int mask)
+ +{
+ +      bool is_same_tgroup;
+ +      struct task_struct *task;
+ +
+ +      task = get_proc_task(inode);
+ +      if (!task)
+ +              return -ESRCH;
+ +      is_same_tgroup = same_thread_group(current, task);
+ +      put_task_struct(task);
+ +
+ +      if (likely(is_same_tgroup && !(mask & MAY_EXEC))) {
+ +              /* This file (/proc/<pid>/task/<tid>/comm) can always be
+ +               * read or written by the members of the corresponding
+ +               * thread group.
+ +               */
+ +              return 0;
+ +      }
+ +
+ +      return generic_permission(inode, mask);
+ +}
+ +
+ +static const struct inode_operations proc_tid_comm_inode_operations = {
+ +              .permission = proc_tid_comm_permission,
+ +};
+ +
   /*
    * Tasks
    */
@@@ -3222,9 -3170,7 +3217,9 @@@ static const struct pid_entry tid_base_
   #ifdef CONFIG_SCHED_DEBUG
         REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
   #endif
- -      REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
+ +      NOD("comm",      S_IFREG|S_IRUGO|S_IWUSR,
+ +                       &proc_tid_comm_inode_operations,
+ +                       &proc_pid_set_comm_operations, {}),
   #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
         ONE("syscall",   S_IRUSR, proc_pid_syscall),
   #endif
@@@ -3308,8 -3254,8 +3303,8 @@@ static struct dentry *proc_tid_base_loo
   
   static const struct file_operations proc_tid_base_operations = {
         .read           = generic_read_dir,
- -      .iterate        = proc_tid_base_readdir,
- -      .llseek         = default_llseek,
+ +      .iterate_shared = proc_tid_base_readdir,
+ +      .llseek         = generic_file_llseek,
   };
   
   static const struct inode_operations proc_tid_base_inode_operations = {
@@@ -3519,6 -3465,6 +3514,6 @@@ static const struct inode_operations pr
   
   static const struct file_operations proc_task_operations = {
         .read           = generic_read_dir,
- -      .iterate        = proc_task_readdir,
- -      .llseek         = default_llseek,
+ +      .iterate_shared = proc_task_readdir,
+ +      .llseek         = generic_file_llseek,
   };
diff --combined include/linux/mm.h

index 08ed53e,004c73a..ef815b9
--- 1/include/linux/mm.h
--- 2/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -72,10 -72,6 +72,10 @@@ extern int mmap_rnd_compat_bits __read_
   #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
   #endif
   
+ +#ifndef page_to_virt
+ +#define page_to_virt(x)       __va(PFN_PHYS(page_to_pfn(x)))
+ +#endif
+ +
   /*
    * To prevent common memory management code establishing
    * a zero page mapping on a read fault.
@@@ -303,40 -299,10 +303,40 @@@ struct vm_fault 
                                          * is set (which is also implied by
                                          * VM_FAULT_ERROR).
                                          */
- -      /* for ->map_pages() only */
- -      pgoff_t max_pgoff;              /* map pages for offset from pgoff till
- -                                       * max_pgoff inclusive */
- -      pte_t *pte;                     /* pte entry associated with ->pgoff */
+ +      void *entry;                    /* ->fault handler can alternatively
+ +                                       * return locked DAX entry. In that
+ +                                       * case handler should return
+ +                                       * VM_FAULT_DAX_LOCKED and fill in
+ +                                       * entry here.
+ +                                       */
+ +};
+ +
+ +/*
+ + * Page fault context: passes through page fault handler instead of endless list
+ + * of function arguments.
+ + */
+ +struct fault_env {
+ +      struct vm_area_struct *vma;     /* Target VMA */
+ +      unsigned long address;          /* Faulting virtual address */
+ +      unsigned int flags;             /* FAULT_FLAG_xxx flags */
+ +      pmd_t *pmd;                     /* Pointer to pmd entry matching
+ +                                       * the 'address'
+ +                                       */
+ +      pte_t *pte;                     /* Pointer to pte entry matching
+ +                                       * the 'address'. NULL if the page
+ +                                       * table hasn't been allocated.
+ +                                       */
+ +      spinlock_t *ptl;                /* Page table lock.
+ +                                       * Protects pte page table if 'pte'
+ +                                       * is not NULL, otherwise pmd.
+ +                                       */
+ +      pgtable_t prealloc_pte;         /* Pre-allocated pte page table.
+ +                                       * vm_ops->map_pages() calls
+ +                                       * alloc_set_pte() from atomic context.
+ +                                       * do_fault_around() pre-allocates
+ +                                       * page table to avoid allocation from
+ +                                       * atomic context.
+ +                                       */
   };
   
   /*
@@@ -351,8 -317,7 +351,8 @@@ struct vm_operations_struct 
         int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
         int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
                                                 pmd_t *, unsigned int flags);
- -      void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ +      void (*map_pages)(struct fault_env *fe,
+ +                      pgoff_t start_pgoff, pgoff_t end_pgoff);
   
         /* notification that a previously read-only page is about to become
          * writable, if an error is returned it will cause a SIGBUS */
@@@ -478,14 -443,14 +478,14 @@@ unsigned long vmalloc_to_pfn(const voi
    * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
    * is no special casing required.
    */
- -static inline int is_vmalloc_addr(const void *x)
+ +static inline bool is_vmalloc_addr(const void *x)
   {
   #ifdef CONFIG_MMU
         unsigned long addr = (unsigned long)x;
   
         return addr >= VMALLOC_START && addr < VMALLOC_END;
   #else
- -      return 0;
+ +      return false;
   #endif
   }
   #ifdef CONFIG_MMU
@@@ -506,7 -471,8 +506,7 @@@ static inline atomic_t *compound_mapcou
   
   static inline int compound_mapcount(struct page *page)
   {
- -      if (!PageCompound(page))
- -              return 0;
+ +      VM_BUG_ON_PAGE(!PageCompound(page), page);
         page = compound_head(page);
         return atomic_read(compound_mapcount_ptr(page)) + 1;
   }
@@@ -562,6 -528,7 +562,6 @@@ void __put_page(struct page *page)
   void put_pages_list(struct list_head *pages);
   
   void split_page(struct page *page, unsigned int order);
- -int split_free_page(struct page *page);
   
   /*
    * Compound pages have a destructor function.  Provide a
@@@ -625,8 -592,8 +625,8 @@@ static inline pte_t maybe_mkwrite(pte_
         return pte;
   }
   
- -void do_set_pte(struct vm_area_struct *vma, unsigned long address,
- -              struct page *page, pte_t *pte, bool write, bool anon);
+ +int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+ +              struct page *page);
   #endif
   
   /*
@@@ -763,7 -730,7 +763,7 @@@ static inline void get_page(struct pag
         page = compound_head(page);
         /*
          * Getting a normal page or the head of a compound page
- -       * requires to already have an elevated page->_count.
+ +       * requires to already have an elevated page->_refcount.
          */
         VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
         page_ref_inc(page);
@@@ -879,7 -846,10 +879,7 @@@ extern int page_cpupid_xchg_last(struc
   
   static inline void page_cpupid_reset_last(struct page *page)
   {
- -      int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
- -
- -      page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
- -      page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+ +      page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
   }
   #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
   #else /* !CONFIG_NUMA_BALANCING */
@@@ -933,11 -903,6 +933,11 @@@ static inline struct zone *page_zone(co
         return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
   }
   
+ +static inline pg_data_t *page_pgdat(const struct page *page)
+ +{
+ +      return NODE_DATA(page_to_nid(page));
+ +}
+ +
   #ifdef SECTION_IN_PAGE_FLAGS
   static inline void set_page_section(struct page *page, unsigned long section)
   {
@@@ -978,21 -943,11 +978,21 @@@ static inline struct mem_cgroup *page_m
   {
         return page->mem_cgroup;
   }
+ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+ +{
+ +      WARN_ON_ONCE(!rcu_read_lock_held());
+ +      return READ_ONCE(page->mem_cgroup);
+ +}
   #else
   static inline struct mem_cgroup *page_memcg(struct page *page)
   {
         return NULL;
   }
+ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+ +{
+ +      WARN_ON_ONCE(!rcu_read_lock_held());
+ +      return NULL;
+ +}
   #endif
   
   /*
@@@ -1002,7 -957,7 +1002,7 @@@
   
   static __always_inline void *lowmem_page_address(const struct page *page)
   {
- -      return __va(PFN_PHYS(page_to_pfn(page)));
+ +      return page_to_virt(page);
   }
   
   #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
@@@ -1073,8 -1028,26 +1073,8 @@@ static inline pgoff_t page_file_index(s
         return page->index;
   }
   
- -/*
- - * Return true if this page is mapped into pagetables.
- - * For compound page it returns true if any subpage of compound page is mapped.
- - */
- -static inline bool page_mapped(struct page *page)
- -{
- -      int i;
- -      if (likely(!PageCompound(page)))
- -              return atomic_read(&page->_mapcount) >= 0;
- -      page = compound_head(page);
- -      if (atomic_read(compound_mapcount_ptr(page)) >= 0)
- -              return true;
- -      if (PageHuge(page))
- -              return false;
- -      for (i = 0; i < hpage_nr_pages(page); i++) {
- -              if (atomic_read(&page[i]._mapcount) >= 0)
- -                      return true;
- -      }
- -      return false;
- -}
+ +bool page_mapped(struct page *page);
+ +struct address_space *page_mapping(struct page *page);
   
   /*
    * Return true only if the page has been allocated with
@@@ -1122,7 -1095,6 +1122,7 @@@ static inline void clear_page_pfmemallo
   #define VM_FAULT_LOCKED       0x0200  /* ->fault locked the returned page */
   #define VM_FAULT_RETRY        0x0400  /* ->fault blocked, must retry */
   #define VM_FAULT_FALLBACK 0x0800      /* huge page fault failed, fall back to small */
+ +#define VM_FAULT_DAX_LOCKED 0x1000    /* ->fault has locked DAX entry */
   
   #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
   
@@@ -1255,14 -1227,15 +1255,14 @@@ int generic_error_remove_page(struct ad
   int invalidate_inode_page(struct page *page);
   
   #ifdef CONFIG_MMU
- -extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- -                      unsigned long address, unsigned int flags);
+ +extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+ +              unsigned int flags);
   extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                             unsigned long address, unsigned int fault_flags,
                             bool *unlocked);
   #else
- -static inline int handle_mm_fault(struct mm_struct *mm,
- -                      struct vm_area_struct *vma, unsigned long address,
- -                      unsigned int flags)
+ +static inline int handle_mm_fault(struct vm_area_struct *vma,
+ +              unsigned long address, unsigned int flags)
   {
         /* should never happen if there's no MMU */
         BUG();
@@@ -1809,7 -1782,7 +1809,7 @@@ extern void free_highmem_page(struct pa
   extern void adjust_managed_page_count(struct page *page, long count);
   extern void mem_init_print_info(const char *str);
   
- -extern void reserve_bootmem_region(unsigned long start, unsigned long end);
+ +extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
   
   /* Free the reserved page into the buddy system, so it gets managed. */
   static inline void __free_reserved_page(struct page *page)
@@@ -2014,6 -1987,7 +2014,7 @@@ extern void mm_drop_all_locks(struct mm
   
   extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
   extern struct file *get_mm_exe_file(struct mm_struct *mm);
+ extern struct file *get_task_exe_file(struct task_struct *task);
   
   extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
   extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);
@@@ -2057,9 -2031,9 +2058,9 @@@ static inline void mm_populate(unsigne
   #endif
   
   /* These take the mm semaphore themselves */
- -extern unsigned long vm_brk(unsigned long, unsigned long);
+ +extern int __must_check vm_brk(unsigned long, unsigned long);
   extern int vm_munmap(unsigned long, size_t);
- -extern unsigned long vm_mmap(struct file *, unsigned long,
+ +extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
           unsigned long, unsigned long,
           unsigned long, unsigned long);
   
@@@ -2102,8 -2076,7 +2103,8 @@@ extern void truncate_inode_pages_final(
   
   /* generic vm_area_ops exported for stackable file systems */
   extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
- -extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf);
+ +extern void filemap_map_pages(struct fault_env *fe,
+ +              pgoff_t start_pgoff, pgoff_t end_pgoff);
   extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
   
   /* mm/page-writeback.c */
@@@ -2299,8 -2272,6 +2300,8 @@@ static inline int in_gate_area(struct m
   }
   #endif        /* __HAVE_ARCH_GATE_AREA */
   
+ +extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
+ +
   #ifdef CONFIG_SYSCTL
   extern int sysctl_drop_caches;
   int drop_caches_sysctl_handler(struct ctl_table *, int,
@@@ -2435,9 -2406,6 +2436,9 @@@ static inline bool page_is_guard(struc
                 return false;
   
         page_ext = lookup_page_ext(page);
+ +      if (unlikely(!page_ext))
+ +              return false;
+ +
         return test_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
   }
   #else
diff --combined kernel/audit_watch.c

index d6709eb,4846691..0d302a8
--- 1/kernel/audit_watch.c
--- 2/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@@ -19,6 -19,7 +19,7 @@@
    * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    */
   
+ #include <linux/file.h>
   #include <linux/kernel.h>
   #include <linux/audit.h>
   #include <linux/kthread.h>
@@@ -367,7 -368,7 +368,7 @@@ static int audit_get_nd(struct audit_wa
         inode_unlock(d_backing_inode(parent->dentry));
         if (d_is_positive(d)) {
                 /* update watch filter fields */
- -              watch->dev = d_backing_inode(d)->i_sb->s_dev;
+ +              watch->dev = d->d_sb->s_dev;
                 watch->ino = d_backing_inode(d)->i_ino;
         }
         dput(d);
@@@ -544,10 -545,11 +545,11 @@@ int audit_exe_compare(struct task_struc
         unsigned long ino;
         dev_t dev;
   
-       rcu_read_lock();
-       exe_file = rcu_dereference(tsk->mm->exe_file);
+       exe_file = get_task_exe_file(tsk);
+       if (!exe_file)
+               return 0;
         ino = exe_file->f_inode->i_ino;
         dev = exe_file->f_inode->i_sb->s_dev;
-       rcu_read_unlock();
+       fput(exe_file);
         return audit_mark_compare(mark, ino, dev);
   }
diff --combined kernel/fork.c

index aaf7823,42451ae..36c0daa
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -148,49 -148,57 +148,49 @@@ static inline void free_task_struct(str
   }
   #endif
   
- -void __weak arch_release_thread_info(struct thread_info *ti)
+ +void __weak arch_release_thread_stack(unsigned long *stack)
   {
   }
   
- -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+ +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
   
   /*
    * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
    * kmemcache based allocator.
    */
   # if THREAD_SIZE >= PAGE_SIZE
- -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                   int node)
   {
- -      struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
- -                                                THREAD_SIZE_ORDER);
- -
- -      if (page)
- -              memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- -                                          1 << THREAD_SIZE_ORDER);
+ +      struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ +                                           THREAD_SIZE_ORDER);
   
         return page ? page_address(page) : NULL;
   }
   
- -static inline void free_thread_info(struct thread_info *ti)
+ +static inline void free_thread_stack(unsigned long *stack)
   {
- -      struct page *page = virt_to_page(ti);
- -
- -      memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
- -                                  -(1 << THREAD_SIZE_ORDER));
- -      __free_kmem_pages(page, THREAD_SIZE_ORDER);
+ +      __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
   }
   # else
- -static struct kmem_cache *thread_info_cache;
+ +static struct kmem_cache *thread_stack_cache;
   
- -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                   int node)
   {
- -      return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+ +      return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
   }
   
- -static void free_thread_info(struct thread_info *ti)
+ +static void free_thread_stack(unsigned long *stack)
   {
- -      kmem_cache_free(thread_info_cache, ti);
+ +      kmem_cache_free(thread_stack_cache, stack);
   }
   
- -void thread_info_cache_init(void)
+ +void thread_stack_cache_init(void)
   {
- -      thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ +      thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
                                               THREAD_SIZE, 0, NULL);
- -      BUG_ON(thread_info_cache == NULL);
+ +      BUG_ON(thread_stack_cache == NULL);
   }
   # endif
   #endif
@@@ -213,24 -221,18 +213,24 @@@ struct kmem_cache *vm_area_cachep
   /* SLAB cache for mm_struct structures (tsk->mm) */
   static struct kmem_cache *mm_cachep;
   
- -static void account_kernel_stack(struct thread_info *ti, int account)
+ +static void account_kernel_stack(unsigned long *stack, int account)
   {
- -      struct zone *zone = page_zone(virt_to_page(ti));
+ +      /* All stack pages are in the same zone and belong to the same memcg. */
+ +      struct page *first_page = virt_to_page(stack);
+ +
+ +      mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+ +                          THREAD_SIZE / 1024 * account);
   
- -      mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+ +      memcg_kmem_update_page_stat(
+ +              first_page, MEMCG_KERNEL_STACK_KB,
+ +              account * (THREAD_SIZE / 1024));
   }
   
   void free_task(struct task_struct *tsk)
   {
         account_kernel_stack(tsk->stack, -1);
- -      arch_release_thread_info(tsk->stack);
- -      free_thread_info(tsk->stack);
+ +      arch_release_thread_stack(tsk->stack);
+ +      free_thread_stack(tsk->stack);
         rt_mutex_debug_task_free(tsk);
         ftrace_graph_exit_task(tsk);
         put_seccomp_filter(tsk);
@@@ -338,27 -340,26 +338,27 @@@ void set_task_stack_end_magic(struct ta
         *stackend = STACK_END_MAGIC;    /* for overflow detection */
   }
   
- -static struct task_struct *dup_task_struct(struct task_struct *orig)
+ +static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
   {
         struct task_struct *tsk;
- -      struct thread_info *ti;
- -      int node = tsk_fork_get_node(orig);
+ +      unsigned long *stack;
         int err;
   
+ +      if (node == NUMA_NO_NODE)
+ +              node = tsk_fork_get_node(orig);
         tsk = alloc_task_struct_node(node);
         if (!tsk)
                 return NULL;
   
- -      ti = alloc_thread_info_node(tsk, node);
- -      if (!ti)
+ +      stack = alloc_thread_stack_node(tsk, node);
+ +      if (!stack)
                 goto free_tsk;
   
         err = arch_dup_task_struct(tsk, orig);
         if (err)
- -              goto free_ti;
+ +              goto free_stack;
   
- -      tsk->stack = ti;
+ +      tsk->stack = stack;
   #ifdef CONFIG_SECCOMP
         /*
          * We must handle setting up seccomp filters once we're under
@@@ -390,14 -391,14 +390,14 @@@
         tsk->task_frag.page = NULL;
         tsk->wake_q.next = NULL;
   
- -      account_kernel_stack(ti, 1);
+ +      account_kernel_stack(stack, 1);
   
         kcov_task_init(tsk);
   
         return tsk;
   
- -free_ti:
- -      free_thread_info(ti);
+ +free_stack:
+ +      free_thread_stack(stack);
   free_tsk:
         free_task_struct(tsk);
         return NULL;
@@@ -412,10 -413,7 +412,10 @@@ static int dup_mmap(struct mm_struct *m
         unsigned long charge;
   
         uprobe_start_dup_mmap();
- -      down_write(&oldmm->mmap_sem);
+ +      if (down_write_killable(&oldmm->mmap_sem)) {
+ +              retval = -EINTR;
+ +              goto fail_uprobe_end;
+ +      }
         flush_cache_dup_mm(oldmm);
         uprobe_dup_mmap(oldmm, mm);
         /*
@@@ -527,7 -525,6 +527,7 @@@ out
         up_write(&mm->mmap_sem);
         flush_tlb_mm(oldmm);
         up_write(&oldmm->mmap_sem);
+ +fail_uprobe_end:
         uprobe_end_dup_mmap();
         return retval;
   fail_nomem_anon_vma_fork:
@@@ -702,26 -699,6 +702,26 @@@ void __mmdrop(struct mm_struct *mm
   }
   EXPORT_SYMBOL_GPL(__mmdrop);
   
+ +static inline void __mmput(struct mm_struct *mm)
+ +{
+ +      VM_BUG_ON(atomic_read(&mm->mm_users));
+ +
+ +      uprobe_clear_state(mm);
+ +      exit_aio(mm);
+ +      ksm_exit(mm);
+ +      khugepaged_exit(mm); /* must run before exit_mmap */
+ +      exit_mmap(mm);
+ +      set_mm_exe_file(mm, NULL);
+ +      if (!list_empty(&mm->mmlist)) {
+ +              spin_lock(&mmlist_lock);
+ +              list_del(&mm->mmlist);
+ +              spin_unlock(&mmlist_lock);
+ +      }
+ +      if (mm->binfmt)
+ +              module_put(mm->binfmt->module);
+ +      mmdrop(mm);
+ +}
+ +
   /*
    * Decrement the use count and release all resources for an mm.
    */
@@@ -729,26 -706,24 +729,26 @@@ void mmput(struct mm_struct *mm
   {
         might_sleep();
   
+ +      if (atomic_dec_and_test(&mm->mm_users))
+ +              __mmput(mm);
+ +}
+ +EXPORT_SYMBOL_GPL(mmput);
+ +
+ +#ifdef CONFIG_MMU
+ +static void mmput_async_fn(struct work_struct *work)
+ +{
+ +      struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+ +      __mmput(mm);
+ +}
+ +
+ +void mmput_async(struct mm_struct *mm)
+ +{
         if (atomic_dec_and_test(&mm->mm_users)) {
- -              uprobe_clear_state(mm);
- -              exit_aio(mm);
- -              ksm_exit(mm);
- -              khugepaged_exit(mm); /* must run before exit_mmap */
- -              exit_mmap(mm);
- -              set_mm_exe_file(mm, NULL);
- -              if (!list_empty(&mm->mmlist)) {
- -                      spin_lock(&mmlist_lock);
- -                      list_del(&mm->mmlist);
- -                      spin_unlock(&mmlist_lock);
- -              }
- -              if (mm->binfmt)
- -                      module_put(mm->binfmt->module);
- -              mmdrop(mm);
+ +              INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ +              schedule_work(&mm->async_put_work);
         }
   }
- -EXPORT_SYMBOL_GPL(mmput);
+ +#endif
   
   /**
    * set_mm_exe_file - change a reference to the mm's executable file
@@@ -797,6 -772,29 +797,29 @@@ struct file *get_mm_exe_file(struct mm_
         return exe_file;
   }
   EXPORT_SYMBOL(get_mm_exe_file);
+ 
+ /**
+  * get_task_exe_file - acquire a reference to the task's executable file
+  *
+  * Returns %NULL if task's mm (if any) has no associated executable file or
+  * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+  * User must release file via fput().
+  */
+ struct file *get_task_exe_file(struct task_struct *task)
+ {
+       struct file *exe_file = NULL;
+       struct mm_struct *mm;
+ 
+       task_lock(task);
+       mm = task->mm;
+       if (mm) {
+               if (!(task->flags & PF_KTHREAD))
+                       exe_file = get_mm_exe_file(mm);
+       }
+       task_unlock(task);
+       return exe_file;
+ }
+ EXPORT_SYMBOL(get_task_exe_file);
   
   /**
    * get_task_mm - acquire a reference to the task's mm
@@@ -1281,8 -1279,7 +1304,8 @@@ static struct task_struct *copy_process
                                         int __user *child_tidptr,
                                         struct pid *pid,
                                         int trace,
- -                                      unsigned long tls)
+ +                                      unsigned long tls,
+ +                                      int node)
   {
         int retval;
         struct task_struct *p;
@@@ -1334,7 -1331,7 +1357,7 @@@
                 goto fork_out;
   
         retval = -ENOMEM;
- -      p = dup_task_struct(current);
+ +      p = dup_task_struct(current, node);
         if (!p)
                 goto fork_out;
   
@@@ -1404,6 -1401,7 +1427,6 @@@
         p->real_start_time = ktime_get_boot_ns();
         p->io_context = NULL;
         p->audit_context = NULL;
- -      threadgroup_change_begin(current);
         cgroup_fork(p);
   #ifdef CONFIG_NUMA
         p->mempolicy = mpol_dup(p->mempolicy);
@@@ -1495,7 -1493,7 +1518,7 @@@
                 pid = alloc_pid(p->nsproxy->pid_ns_for_children);
                 if (IS_ERR(pid)) {
                         retval = PTR_ERR(pid);
- -                      goto bad_fork_cleanup_io;
+ +                      goto bad_fork_cleanup_thread;
                 }
         }
   
@@@ -1519,7 -1517,7 +1542,7 @@@
          * sigaltstack should be cleared when sharing the same VM
          */
         if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
- -              p->sas_ss_sp = p->sas_ss_size = 0;
+ +              sas_ss_reset(p);
   
         /*
          * Syscall tracing and stepping should be turned off in the
@@@ -1555,7 -1553,6 +1578,7 @@@
         INIT_LIST_HEAD(&p->thread_group);
         p->task_works = NULL;
   
+ +      threadgroup_change_begin(current);
         /*
          * Ensure that the cgroup subsystem policies allow the new process to be
          * forked. It should be noted that the new process's css_set can be changed
@@@ -1656,11 -1653,8 +1679,11 @@@
   bad_fork_cancel_cgroup:
         cgroup_cancel_fork(p);
   bad_fork_free_pid:
+ +      threadgroup_change_end(current);
         if (pid != &init_struct_pid)
                 free_pid(pid);
+ +bad_fork_cleanup_thread:
+ +      exit_thread(p);
   bad_fork_cleanup_io:
         if (p->io_context)
                 exit_io_context(p);
@@@ -1689,6 -1683,7 +1712,6 @@@ bad_fork_cleanup_policy
         mpol_put(p->mempolicy);
   bad_fork_cleanup_threadgroup_lock:
   #endif
- -      threadgroup_change_end(current);
         delayacct_tsk_free(p);
   bad_fork_cleanup_count:
         atomic_dec(&p->cred->user->processes);
@@@ -1712,8 -1707,7 +1735,8 @@@ static inline void init_idle_pids(struc
   struct task_struct *fork_idle(int cpu)
   {
         struct task_struct *task;
- -      task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
+ +      task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
+ +                          cpu_to_node(cpu));
         if (!IS_ERR(task)) {
                 init_idle_pids(task->pids);
                 init_idle(task, cpu);
@@@ -1758,7 -1752,7 +1781,7 @@@ long _do_fork(unsigned long clone_flags
         }
   
         p = copy_process(clone_flags, stack_start, stack_size,
- -                       child_tidptr, NULL, trace, tls);
+ +                       child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
         /*
          * Do this prior to waking up the new thread - the thread pointer
          * might get invalid after that point, if the thread exits quickly.
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 1 Sep 2016 22:55:56 +0000 (15:55 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 1 Sep 2016 22:55:56 +0000 (15:55 -0700)
		1	2
fs/proc/base.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/mm.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/audit_watch.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history