diff --git a/kernel/fork.c b/kernel/fork.c
index 0539388..623259f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack)
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-                                                 int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush.  Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
+#ifdef CONFIG_VMAP_STACK
+       void *stack;
+       int i;
+
+       local_irq_disable();
+       for (i = 0; i < NR_CACHED_STACKS; i++) {
+               struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+               if (!s)
+                       continue;
+               this_cpu_write(cached_stacks[i], NULL);
+
+               tsk->stack_vm_area = s;
+               local_irq_enable();
+               return s->addr;
+       }
+       local_irq_enable();
+
+       stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+                                    VMALLOC_START, VMALLOC_END,
+                                    THREADINFO_GFP | __GFP_HIGHMEM,
+                                    PAGE_KERNEL,
+                                    0, node, __builtin_return_address(0));
+
+       /*
+        * We can't call find_vm_area() in interrupt context, and
+        * free_thread_stack() can be called in interrupt context,
+        * so cache the vm_struct.
+        */
+       if (stack)
+               tsk->stack_vm_area = find_vm_area(stack);
+       return stack;
+#else
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
 
        return page ? page_address(page) : NULL;
+#endif
 }
 
-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
 {
-       __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+       if (task_stack_vm_area(tsk)) {
+               unsigned long flags;
+               int i;
+
+               local_irq_save(flags);
+               for (i = 0; i < NR_CACHED_STACKS; i++) {
+                       if (this_cpu_read(cached_stacks[i]))
+                               continue;
+
+                       this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+                       local_irq_restore(flags);
+                       return;
+               }
+               local_irq_restore(flags);
+
+               vfree(tsk->stack);
+               return;
+       }
+#endif
+
+       __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
        return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
 {
-       kmem_cache_free(thread_stack_cache, stack);
+       kmem_cache_free(thread_stack_cache, tsk->stack);
 }
 
 void thread_stack_cache_init(void)
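
For reference: free_thread_stack() and account_kernel_stack() below rely on the task_stack_vm_area() helper added to <linux/sched.h> in this series; a rough sketch of its shape (not part of this hunk):

        #ifdef CONFIG_VMAP_STACK
        static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
        {
                /* set by alloc_thread_stack_node(), via find_vm_area() or the per-CPU cache */
                return t->stack_vm_area;
        }
        #else
        static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
        {
                return NULL;
        }
        #endif

With !CONFIG_VMAP_STACK the helper returns NULL, so the vm-aware branches compile away and the existing page- and kmem_cache-based paths behave as before.
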
@@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-       /* All stack pages are in the same zone and belong to the same memcg. */
-       struct page *first_page = virt_to_page(stack);
+       void *stack = task_stack_page(tsk);
+       struct vm_struct *vm = task_stack_vm_area(tsk);
+
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+       if (vm) {
+               int i;
+
+               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+                       mod_zone_page_state(page_zone(vm->pages[i]),
+                                           NR_KERNEL_STACK_KB,
+                                           PAGE_SIZE / 1024 * account);
+               }
+
+               /* All stack pages belong to the same memcg. */
+               memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       } else {
+               /*
+                * All stack pages are in the same zone and belong to the
+                * same memcg.
+                */
+               struct page *first_page = virt_to_page(stack);
 
-       mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-                           THREAD_SIZE / 1024 * account);
+               mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                                   THREAD_SIZE / 1024 * account);
 
-       memcg_kmem_update_page_stat(
-               first_page, MEMCG_KERNEL_STACK_KB,
-               account * (THREAD_SIZE / 1024));
+               memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       }
 }
 
-void free_task(struct task_struct *tsk)
+static void release_task_stack(struct task_struct *tsk)
 {
-       account_kernel_stack(tsk->stack, -1);
+       account_kernel_stack(tsk, -1);
        arch_release_thread_stack(tsk->stack);
-       free_thread_stack(tsk->stack);
+       free_thread_stack(tsk);
+       tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = NULL;
+#endif
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+       if (atomic_dec_and_test(&tsk->stack_refcount))
+               release_task_stack(tsk);
+}
+#endif
+
+void free_task(struct task_struct *tsk)
+{
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+       /*
+        * The task is finally done with both the stack and thread_info,
+        * so free both.
+        */
+       release_task_stack(tsk);
+#else
+       /*
+        * If the task had a separate stack allocation, it should be gone
+        * by now.
+        */
+       WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
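
put_task_stack() above is the release half of the new stack reference count; the acquire half, try_get_task_stack(), lives in <linux/sched.h>. A sketch of the pair as it looks with CONFIG_THREAD_INFO_IN_TASK (without that option both collapse to direct access and a no-op):

        static inline void *try_get_task_stack(struct task_struct *tsk)
        {
                /* only hand out the stack if it has not been freed yet */
                return atomic_inc_not_zero(&tsk->stack_refcount) ?
                        task_stack_page(tsk) : NULL;
        }

        extern void put_task_stack(struct task_struct *tsk);

Code that walks another task's stack (e.g. stack tracing) takes a reference around the walk so a vmapped stack cannot be freed and recycled underneath it.
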
@@ -243,6 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig)
 {
        taskstats_tgid_free(sig);
        sched_autogroup_exit(sig);
+       /*
+        * __mmdrop is not safe to call from softirq context on x86 due to
+        * pgd_dtor so postpone it to the async context
+        */
+       if (sig->oom_mm)
+               mmdrop_async(sig->oom_mm);
        kmem_cache_free(signal_cachep, sig);
 }
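
mmdrop_async() is used here because free_signal_struct() can run from an RCU/softirq context, where the pgd_dtor work done by __mmdrop() could deadlock on x86; the drop is therefore punted to a workqueue. Roughly, as added a little earlier in fork.c in this series:

        static void mmdrop_async_fn(struct work_struct *work)
        {
                struct mm_struct *mm;

                mm = container_of(work, struct mm_struct, async_put_work);
                __mmdrop(mm);
        }

        static void mmdrop_async(struct mm_struct *mm)
        {
                /* drop mm_count; if it hits zero, free from process context instead */
                if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
                        INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
                        schedule_work(&mm->async_put_work);
                }
        }
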
 
@@ -302,6 +424,7 @@ int arch_task_struct_size __read_mostly;
 
 void __init fork_init(void)
 {
+       int i;
 #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN     L1_CACHE_BYTES
@@ -321,6 +444,10 @@ void __init fork_init(void)
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];
+
+       for (i = 0; i < UCOUNT_COUNTS; i++) {
+               init_user_ns.ucount_max[i] = max_threads/2;
+       }
 }
 
 int __weak arch_dup_task_struct(struct task_struct *dst,
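
The new loop in fork_init() caps every per-user-namespace object count at the same default used for RLIMIT_NPROC. For orientation, the counters indexed by ucount_max[] are, roughly, the enum ucount_type entries from <linux/user_namespace.h> at this point in the series:

        enum ucount_type {
                UCOUNT_USER_NAMESPACES,
                UCOUNT_PID_NAMESPACES,
                UCOUNT_UTS_NAMESPACES,
                UCOUNT_IPC_NAMESPACES,
                UCOUNT_NET_NAMESPACES,
                UCOUNT_MNT_NAMESPACES,
                UCOUNT_CGROUP_NAMESPACES,
                UCOUNT_COUNTS,
        };

so each namespace type starts out limited to max_threads/2 instances per user namespace.
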
@@ -342,6 +469,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
        struct task_struct *tsk;
        unsigned long *stack;
+       struct vm_struct *stack_vm_area;
        int err;
 
        if (node == NUMA_NO_NODE)
@@ -354,11 +482,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        if (!stack)
                goto free_tsk;
 
+       stack_vm_area = task_stack_vm_area(tsk);
+
        err = arch_dup_task_struct(tsk, orig);
+
+       /*
+        * arch_dup_task_struct() clobbers the stack-related fields.  Make
+        * sure they're properly initialized before using any stack-related
+        * functions again.
+        */
+       tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       atomic_set(&tsk->stack_refcount, 1);
+#endif
+
        if (err)
                goto free_stack;
 
-       tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
@@ -390,14 +533,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
 
-       account_kernel_stack(stack, 1);
+       account_kernel_stack(tsk, 1);
 
        kcov_task_init(tsk);
 
        return tsk;
 
 free_stack:
-       free_thread_stack(stack);
+       free_thread_stack(tsk);
 free_tsk:
        free_task_struct(tsk);
        return NULL;
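
The reordering in dup_task_struct() matters because the generic arch_dup_task_struct() is a whole-struct copy that overwrites dst->stack (and the new stack fields) with the parent's values; the weak default, from the unmodified part of this file, is simply:

        int __weak arch_dup_task_struct(struct task_struct *dst,
                                        struct task_struct *src)
        {
                *dst = *src;
                return 0;
        }

Hence tsk->stack, tsk->stack_vm_area and the stack refcount are (re)set only after the copy, and before anything on the error path could touch or free the stack.
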
@@ -712,6 +855,7 @@ static inline void __mmput(struct mm_struct *mm)
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
+       mm_put_huge_zero_page(mm);
        set_mm_exe_file(mm, NULL);
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
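
mm_put_huge_zero_page() drops the mm's cached reference on the shared huge zero page now that the address space is gone, letting the shrinker reclaim the page once no mm still uses it. Its interface, roughly as declared in <linux/huge_mm.h> (with a no-op stub when transparent hugepages are disabled):

        void mm_put_huge_zero_page(struct mm_struct *mm);
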
@@ -720,6 +864,7 @@ static inline void __mmput(struct mm_struct *mm)
        }
        if (mm->binfmt)
                module_put(mm->binfmt->module);
+       set_bit(MMF_OOM_SKIP, &mm->flags);
        mmdrop(mm);
 }
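
Setting MMF_OOM_SKIP once __mmput() has torn the address space down tells the OOM killer that killing this mm's owner again cannot release any more memory. A hypothetical illustration of the consumer side (the real check lives in mm/oom_kill.c):

        /* illustrative only: skip victims whose address space is already released */
        if (test_bit(MMF_OOM_SKIP, &mm->flags))
                goto next_candidate;
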
 
@@ -1717,6 +1862,7 @@ bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
        exit_creds(p);
 bad_fork_free:
+       put_task_stack(p);
        free_task(p);
 fork_out:
        return ERR_PTR(retval);
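
With CONFIG_THREAD_INFO_IN_TASK the stack is reference-counted and free_task() no longer releases it (see the WARN_ON_ONCE above), so the bad_fork_free path has to drop the last stack reference itself before freeing the task. The regular teardown pairs the calls the same way; roughly, as in the scheduler's final task drop with this series applied:

        put_task_stack(tsk);    /* frees the stack if this was the last reference */
        put_task_struct(tsk);   /* then drop the task_struct itself */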