Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm...
[cascardo/linux.git] / kernel / fork.c
index 3cb4853..9a05bd9 100644 (file)
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack)
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-                                                 int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush.  Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
+#ifdef CONFIG_VMAP_STACK
+       void *stack;
+       int i;
+
+       local_irq_disable();
+       for (i = 0; i < NR_CACHED_STACKS; i++) {
+               struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+               if (!s)
+                       continue;
+               this_cpu_write(cached_stacks[i], NULL);
+
+               tsk->stack_vm_area = s;
+               local_irq_enable();
+               return s->addr;
+       }
+       local_irq_enable();
+
+       stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+                                    VMALLOC_START, VMALLOC_END,
+                                    THREADINFO_GFP | __GFP_HIGHMEM,
+                                    PAGE_KERNEL,
+                                    0, node, __builtin_return_address(0));
+
+       /*
+        * We can't call find_vm_area() in interrupt context, and
+        * free_thread_stack() can be called in interrupt context,
+        * so cache the vm_struct.
+        */
+       if (stack)
+               tsk->stack_vm_area = find_vm_area(stack);
+       return stack;
+#else
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
 
        return page ? page_address(page) : NULL;
+#endif
 }
 
-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
 {
-       __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+       if (task_stack_vm_area(tsk)) {
+               unsigned long flags;
+               int i;
+
+               local_irq_save(flags);
+               for (i = 0; i < NR_CACHED_STACKS; i++) {
+                       if (this_cpu_read(cached_stacks[i]))
+                               continue;
+
+                       this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+                       local_irq_restore(flags);
+                       return;
+               }
+               local_irq_restore(flags);
+
+               vfree(tsk->stack);
+               return;
+       }
+#endif
+
+       __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
        return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
 {
-       kmem_cache_free(thread_stack_cache, stack);
+       kmem_cache_free(thread_stack_cache, tsk->stack);
 }
 
 void thread_stack_cache_init(void)
@@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-       /* All stack pages are in the same zone and belong to the same memcg. */
-       struct page *first_page = virt_to_page(stack);
+       void *stack = task_stack_page(tsk);
+       struct vm_struct *vm = task_stack_vm_area(tsk);
+
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+       if (vm) {
+               int i;
+
+               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+                       mod_zone_page_state(page_zone(vm->pages[i]),
+                                           NR_KERNEL_STACK_KB,
+                                           PAGE_SIZE / 1024 * account);
+               }
+
+               /* All stack pages belong to the same memcg. */
+               memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       } else {
+               /*
+                * All stack pages are in the same zone and belong to the
+                * same memcg.
+                */
+               struct page *first_page = virt_to_page(stack);
 
-       mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-                           THREAD_SIZE / 1024 * account);
+               mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                                   THREAD_SIZE / 1024 * account);
 
-       memcg_kmem_update_page_stat(
-               first_page, MEMCG_KERNEL_STACK_KB,
-               account * (THREAD_SIZE / 1024));
+               memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       }
 }
 
-void free_task(struct task_struct *tsk)
+static void release_task_stack(struct task_struct *tsk)
 {
-       account_kernel_stack(tsk->stack, -1);
+       account_kernel_stack(tsk, -1);
        arch_release_thread_stack(tsk->stack);
-       free_thread_stack(tsk->stack);
+       free_thread_stack(tsk);
+       tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = NULL;
+#endif
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+       if (atomic_dec_and_test(&tsk->stack_refcount))
+               release_task_stack(tsk);
+}
+#endif
+
+void free_task(struct task_struct *tsk)
+{
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+       /*
+        * The task is finally done with both the stack and thread_info,
+        * so free both.
+        */
+       release_task_stack(tsk);
+#else
+       /*
+        * If the task had a separate stack allocation, it should be gone
+        * by now.
+        */
+       WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
@@ -347,6 +463,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
        struct task_struct *tsk;
        unsigned long *stack;
+       struct vm_struct *stack_vm_area;
        int err;
 
        if (node == NUMA_NO_NODE)
@@ -359,11 +476,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        if (!stack)
                goto free_tsk;
 
+       stack_vm_area = task_stack_vm_area(tsk);
+
        err = arch_dup_task_struct(tsk, orig);
+
+       /*
+        * arch_dup_task_struct() clobbers the stack-related fields.  Make
+        * sure they're properly initialized before using any stack-related
+        * functions again.
+        */
+       tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       atomic_set(&tsk->stack_refcount, 1);
+#endif
+
        if (err)
                goto free_stack;
 
-       tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
@@ -395,14 +527,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
 
-       account_kernel_stack(stack, 1);
+       account_kernel_stack(tsk, 1);
 
        kcov_task_init(tsk);
 
        return tsk;
 
 free_stack:
-       free_thread_stack(stack);
+       free_thread_stack(tsk);
 free_tsk:
        free_task_struct(tsk);
        return NULL;
@@ -803,6 +935,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(get_mm_exe_file);
 
+/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+       struct file *exe_file = NULL;
+       struct mm_struct *mm;
+
+       task_lock(task);
+       mm = task->mm;
+       if (mm) {
+               if (!(task->flags & PF_KTHREAD))
+                       exe_file = get_mm_exe_file(mm);
+       }
+       task_unlock(task);
+       return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
+
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
@@ -918,14 +1073,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
        deactivate_mm(tsk, mm);
 
        /*
-        * If we're exiting normally, clear a user-space tid field if
-        * requested.  We leave this alone when dying by signal, to leave
-        * the value intact in a core dump, and to save the unnecessary
-        * trouble, say, a killed vfork parent shouldn't touch this mm.
-        * Userland only wants this done for a sys_exit.
+        * Signal userspace if we're not exiting with a core dump
+        * because we want to leave the value intact for debugging
+        * purposes.
         */
        if (tsk->clear_child_tid) {
-               if (!(tsk->flags & PF_SIGNALED) &&
+               if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
                    atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
@@ -1409,7 +1562,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        p->real_start_time = ktime_get_boot_ns();
        p->io_context = NULL;
        p->audit_context = NULL;
-       threadgroup_change_begin(current);
        cgroup_fork(p);
 #ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
@@ -1561,6 +1713,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        INIT_LIST_HEAD(&p->thread_group);
        p->task_works = NULL;
 
+       threadgroup_change_begin(current);
        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
@@ -1661,6 +1814,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_cancel_cgroup:
        cgroup_cancel_fork(p);
 bad_fork_free_pid:
+       threadgroup_change_end(current);
        if (pid != &init_struct_pid)
                free_pid(pid);
 bad_fork_cleanup_thread:
@@ -1693,12 +1847,12 @@ bad_fork_cleanup_policy:
        mpol_put(p->mempolicy);
 bad_fork_cleanup_threadgroup_lock:
 #endif
-       threadgroup_change_end(current);
        delayacct_tsk_free(p);
 bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
        exit_creds(p);
 bad_fork_free:
+       put_task_stack(p);
        free_task(p);
 fork_out:
        return ERR_PTR(retval);