Merge branch 'stable-4.8' of git://git.infradead.org/users/pcmoore/audit
diff --git a/kernel/fork.c b/kernel/fork.c
index 42451ae..36c0daa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -148,57 +148,49 @@ static inline void free_task_struct(struct task_struct *tsk)
 }
 #endif
 
-void __weak arch_release_thread_info(struct thread_info *ti)
+void __weak arch_release_thread_stack(unsigned long *stack)
 {
 }
 
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
 
 /*
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
 # if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                  int node)
 {
-       struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
-                                                 THREAD_SIZE_ORDER);
-
-       if (page)
-               memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-                                           1 << THREAD_SIZE_ORDER);
+       struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+                                            THREAD_SIZE_ORDER);
 
        return page ? page_address(page) : NULL;
 }
 
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_stack(unsigned long *stack)
 {
-       struct page *page = virt_to_page(ti);
-
-       memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
-                                   -(1 << THREAD_SIZE_ORDER));
-       __free_kmem_pages(page, THREAD_SIZE_ORDER);
+       __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
 }
 # else
-static struct kmem_cache *thread_info_cache;
+static struct kmem_cache *thread_stack_cache;
 
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
                                                  int node)
 {
-       return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+       return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_info(struct thread_info *ti)
+static void free_thread_stack(unsigned long *stack)
 {
-       kmem_cache_free(thread_info_cache, ti);
+       kmem_cache_free(thread_stack_cache, stack);
 }
 
-void thread_info_cache_init(void)
+void thread_stack_cache_init(void)
 {
-       thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+       thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
                                              THREAD_SIZE, 0, NULL);
-       BUG_ON(thread_info_cache == NULL);
+       BUG_ON(thread_stack_cache == NULL);
 }
 # endif
 #endif
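
The hunk above renames the thread_info allocator to a thread_stack allocator; the allocator split itself is unchanged: one order-THREAD_SIZE_ORDER page block when the stack is at least a page, a kmem_cache otherwise. A standalone sketch of the order arithmetic, assuming 4 KB pages and a 16 KB stack (illustrative values, not stated in this patch):

#include <stdio.h>

#define PAGE_SIZE   4096UL
#define THREAD_SIZE (4UL * PAGE_SIZE)   /* assumed 16 KB stack */

int main(void)
{
	unsigned long order = 0;

	/* smallest order such that PAGE_SIZE << order covers THREAD_SIZE */
	while ((PAGE_SIZE << order) < THREAD_SIZE)
		order++;

	/* with the assumed sizes this prints: order 2 = 4 pages = 16 KB */
	printf("order %lu = %lu pages = %lu KB\n",
	       order, 1UL << order, (PAGE_SIZE << order) / 1024);
	return 0;
}

Only when THREAD_SIZE drops below a page does the kmem_cache branch in the hunk apply.
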
@@ -221,18 +213,24 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(unsigned long *stack, int account)
 {
-       struct zone *zone = page_zone(virt_to_page(ti));
+       /* All stack pages are in the same zone and belong to the same memcg. */
+       struct page *first_page = virt_to_page(stack);
 
-       mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+       mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                           THREAD_SIZE / 1024 * account);
+
+       memcg_kmem_update_page_stat(
+               first_page, MEMCG_KERNEL_STACK_KB,
+               account * (THREAD_SIZE / 1024));
 }
 
 void free_task(struct task_struct *tsk)
 {
        account_kernel_stack(tsk->stack, -1);
-       arch_release_thread_info(tsk->stack);
-       free_thread_info(tsk->stack);
+       arch_release_thread_stack(tsk->stack);
+       free_thread_stack(tsk->stack);
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
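
account_kernel_stack() now counts kilobytes (NR_KERNEL_STACK_KB, MEMCG_KERNEL_STACK_KB) rather than whole stacks, so the same counter is meaningful whatever THREAD_SIZE happens to be. A minimal user-space sketch of the arithmetic, again assuming a 16 KB stack for illustration:

#include <stdio.h>

#define THREAD_SIZE (16 * 1024)        /* assumed, for illustration */

static long nr_kernel_stack_kb;        /* stand-in for the per-zone counter */

static void account_kernel_stack(int account)
{
	/* account is +1 on allocation, -1 on free, scaled to kilobytes */
	nr_kernel_stack_kb += THREAD_SIZE / 1024 * account;
}

int main(void)
{
	account_kernel_stack(1);   /* dup_task_struct() */
	account_kernel_stack(1);   /* a second task */
	account_kernel_stack(-1);  /* free_task() */
	printf("%ld KB of kernel stacks accounted\n", nr_kernel_stack_kb); /* 16 */
	return 0;
}
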
@@ -340,26 +338,27 @@ void set_task_stack_end_magic(struct task_struct *tsk)
        *stackend = STACK_END_MAGIC;    /* for overflow detection */
 }
 
-static struct task_struct *dup_task_struct(struct task_struct *orig)
+static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
        struct task_struct *tsk;
-       struct thread_info *ti;
-       int node = tsk_fork_get_node(orig);
+       unsigned long *stack;
        int err;
 
+       if (node == NUMA_NO_NODE)
+               node = tsk_fork_get_node(orig);
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;
 
-       ti = alloc_thread_info_node(tsk, node);
-       if (!ti)
+       stack = alloc_thread_stack_node(tsk, node);
+       if (!stack)
                goto free_tsk;
 
        err = arch_dup_task_struct(tsk, orig);
        if (err)
-               goto free_ti;
+               goto free_stack;
 
-       tsk->stack = ti;
+       tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
@@ -391,14 +390,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
 
-       account_kernel_stack(ti, 1);
+       account_kernel_stack(stack, 1);
 
        kcov_task_init(tsk);
 
        return tsk;
 
-free_ti:
-       free_thread_info(ti);
+free_stack:
+       free_thread_stack(stack);
 free_tsk:
        free_task_struct(tsk);
        return NULL;
@@ -413,7 +412,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
        unsigned long charge;
 
        uprobe_start_dup_mmap();
-       down_write(&oldmm->mmap_sem);
+       if (down_write_killable(&oldmm->mmap_sem)) {
+               retval = -EINTR;
+               goto fail_uprobe_end;
+       }
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
@@ -525,6 +527,7 @@ out:
        up_write(&mm->mmap_sem);
        flush_tlb_mm(oldmm);
        up_write(&oldmm->mmap_sem);
+fail_uprobe_end:
        uprobe_end_dup_mmap();
        return retval;
 fail_nomem_anon_vma_fork:
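
The two hunks above let dup_mmap() take the old mm's mmap_sem with down_write_killable(), so forking a large address space can be aborted by a fatal signal instead of blocking unkillably, and the new fail_uprobe_end label keeps uprobe_end_dup_mmap() on that early-exit path. A standalone sketch of just that control flow (the signal check is a stub, not the real semantics):

#include <stdio.h>

#define EINTR 4

/* stub: pretend no fatal signal is pending */
static int fatal_signal_pending_stub(void) { return 0; }

/* models down_write_killable(): 0 on success, -EINTR if interrupted */
static int lock_killable_stub(void)
{
	return fatal_signal_pending_stub() ? -EINTR : 0;
}

static int dup_mmap_sketch(void)
{
	int retval = 0;

	/* uprobe_start_dup_mmap() would go here */
	if (lock_killable_stub()) {
		retval = -EINTR;
		goto fail_uprobe_end;
	}

	/* ... copy the VMAs, then drop the lock ... */

fail_uprobe_end:
	/* the end-of-dup hook runs on both the success and the -EINTR path */
	return retval;
}

int main(void)
{
	printf("dup_mmap_sketch() = %d\n", dup_mmap_sketch());
	return 0;
}
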
@@ -699,6 +702,26 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
 
+static inline void __mmput(struct mm_struct *mm)
+{
+       VM_BUG_ON(atomic_read(&mm->mm_users));
+
+       uprobe_clear_state(mm);
+       exit_aio(mm);
+       ksm_exit(mm);
+       khugepaged_exit(mm); /* must run before exit_mmap */
+       exit_mmap(mm);
+       set_mm_exe_file(mm, NULL);
+       if (!list_empty(&mm->mmlist)) {
+               spin_lock(&mmlist_lock);
+               list_del(&mm->mmlist);
+               spin_unlock(&mmlist_lock);
+       }
+       if (mm->binfmt)
+               module_put(mm->binfmt->module);
+       mmdrop(mm);
+}
+
 /*
  * Decrement the use count and release all resources for an mm.
  */
@@ -706,24 +729,26 @@ void mmput(struct mm_struct *mm)
 {
        might_sleep();
 
+       if (atomic_dec_and_test(&mm->mm_users))
+               __mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+       struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+       __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
        if (atomic_dec_and_test(&mm->mm_users)) {
-               uprobe_clear_state(mm);
-               exit_aio(mm);
-               ksm_exit(mm);
-               khugepaged_exit(mm); /* must run before exit_mmap */
-               exit_mmap(mm);
-               set_mm_exe_file(mm, NULL);
-               if (!list_empty(&mm->mmlist)) {
-                       spin_lock(&mmlist_lock);
-                       list_del(&mm->mmlist);
-                       spin_unlock(&mmlist_lock);
-               }
-               if (mm->binfmt)
-                       module_put(mm->binfmt->module);
-               mmdrop(mm);
+               INIT_WORK(&mm->async_put_work, mmput_async_fn);
+               schedule_work(&mm->async_put_work);
        }
 }
-EXPORT_SYMBOL_GPL(mmput);
+#endif
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
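
mmput() keeps its semantics, with the teardown factored into __mmput(); the new mmput_async() is for callers that may hold the last mm_users reference but run in a context that cannot sleep, so the heavy teardown is deferred to a workqueue via mm->async_put_work. A standalone model of that split (a pthread stands in for the workqueue; none of this is kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct obj { atomic_int users; };

static void teardown(struct obj *o)            /* plays the role of __mmput() */
{
	printf("tearing down %p\n", (void *)o);
}

static void put(struct obj *o)                 /* like mmput(): may block */
{
	if (atomic_fetch_sub(&o->users, 1) == 1)
		teardown(o);
}

static void *deferred(void *arg)
{
	teardown(arg);
	return NULL;
}

static void put_async(struct obj *o)           /* like mmput_async(): defer */
{
	if (atomic_fetch_sub(&o->users, 1) == 1) {
		pthread_t t;
		pthread_create(&t, NULL, deferred, o);
		pthread_join(t, NULL);         /* join only so the demo exits cleanly */
	}
}

int main(void)
{
	struct obj a = { 2 };
	put(&a);        /* not the last user: nothing happens */
	put_async(&a);  /* last user: teardown runs in the helper thread */
	return 0;
}
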
@@ -1279,7 +1304,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                                        int __user *child_tidptr,
                                        struct pid *pid,
                                        int trace,
-                                       unsigned long tls)
+                                       unsigned long tls,
+                                       int node)
 {
        int retval;
        struct task_struct *p;
@@ -1331,7 +1357,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                goto fork_out;
 
        retval = -ENOMEM;
-       p = dup_task_struct(current);
+       p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
 
@@ -1401,7 +1427,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->real_start_time = ktime_get_boot_ns();
        p->io_context = NULL;
        p->audit_context = NULL;
-       threadgroup_change_begin(current);
        cgroup_fork(p);
 #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
@@ -1493,7 +1518,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                pid = alloc_pid(p->nsproxy->pid_ns_for_children);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
-                       goto bad_fork_cleanup_io;
+                       goto bad_fork_cleanup_thread;
                }
        }
 
@@ -1517,7 +1542,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
-               p->sas_ss_sp = p->sas_ss_size = 0;
+               sas_ss_reset(p);
 
        /*
         * Syscall tracing and stepping should be turned off in the
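
sas_ss_reset() replaces the open-coded clearing of the alternate-signal-stack fields above. Its body is not part of this patch; the sketch below is the presumed definition from the same kernel series (introduced alongside the SS_AUTODISARM work) and is shown only to make clear that it also resets sas_ss_flags. Treat it as an assumption, not a quotation of this patch:

/* presumed helper body; see include/linux/sched.h in this series */
static inline void sas_ss_reset(struct task_struct *p)
{
	p->sas_ss_sp = 0;
	p->sas_ss_size = 0;
	p->sas_ss_flags = SS_DISABLE;
}
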
@@ -1553,6 +1578,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
 
+       threadgroup_change_begin(current);
        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
@@ -1653,8 +1679,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_cancel_cgroup:
        cgroup_cancel_fork(p);
 bad_fork_free_pid:
+       threadgroup_change_end(current);
        if (pid != &init_struct_pid)
                free_pid(pid);
+bad_fork_cleanup_thread:
+       exit_thread(p);
 bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
@@ -1683,7 +1712,6 @@ bad_fork_cleanup_policy:
        mpol_put(p->mempolicy);
 bad_fork_cleanup_threadgroup_lock:
 #endif
-       threadgroup_change_end(current);
        delayacct_tsk_free(p);
 bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
@@ -1707,7 +1735,8 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct *fork_idle(int cpu)
 {
        struct task_struct *task;
-       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
+       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
+                           cpu_to_node(cpu));
        if (!IS_ERR(task)) {
                init_idle_pids(task->pids);
                init_idle(task, cpu);
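
The node argument threaded through copy_process() and dup_task_struct() earlier in the patch lands here: fork_idle() pins the idle task's allocations to the CPU's own node via cpu_to_node(cpu), while _do_fork() below passes NUMA_NO_NODE to keep the existing tsk_fork_get_node() behaviour. A standalone sketch of that fallback logic (the policy value 0 and node 3 are made up for the demo):

#include <stdio.h>

#define NUMA_NO_NODE (-1)

/* stand-in for tsk_fork_get_node(): whatever the task's policy says */
static int policy_node_stub(void) { return 0; }

static int pick_stack_node(int node)
{
	if (node == NUMA_NO_NODE)
		node = policy_node_stub();
	return node;
}

int main(void)
{
	printf("_do_fork():  node %d\n", pick_stack_node(NUMA_NO_NODE));
	printf("fork_idle(): node %d\n", pick_stack_node(3)); /* cpu_to_node(cpu) */
	return 0;
}
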
@@ -1752,7 +1781,7 @@ long _do_fork(unsigned long clone_flags,
        }
 
        p = copy_process(clone_flags, stack_start, stack_size,
-                        child_tidptr, NULL, trace, tls);
+                        child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
        /*
         * Do this prior to waking up the new thread - the thread pointer
         * might become invalid after that point, if the thread exits quickly.