X-Git-Url: http://git.cascardo.info/?a=blobdiff_plain;f=kernel%2Ffork.c;h=6d42242485cb2863e940ae0d54983a2d34789096;hb=904763e1fb5eebf8249ec41a2019e5e32246df2f;hp=beb31725f7e2746fb17cdd305c4193ab2c70e551;hpb=72a9cdd083005900f15934e8568f1ac43a6bb755;p=cascardo%2Flinux.git diff --git a/kernel/fork.c b/kernel/fork.c index beb31725f7e2..6d42242485cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack) * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ -# if THREAD_SIZE >= PAGE_SIZE -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) + +#ifdef CONFIG_VMAP_STACK +/* + * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB + * flush. Try to minimize the number of calls by caching stacks. + */ +#define NR_CACHED_STACKS 2 +static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +#endif + +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { +#ifdef CONFIG_VMAP_STACK + void *stack; + int i; + + local_irq_disable(); + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *s = this_cpu_read(cached_stacks[i]); + + if (!s) + continue; + this_cpu_write(cached_stacks[i], NULL); + + tsk->stack_vm_area = s; + local_irq_enable(); + return s->addr; + } + local_irq_enable(); + + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, __builtin_return_address(0)); + + /* + * We can't call find_vm_area() in interrupt context, and + * free_thread_stack() can be called in interrupt context, + * so cache the vm_struct. + */ + if (stack) + tsk->stack_vm_area = find_vm_area(stack); + return stack; +#else struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; +#endif } -static inline void free_thread_stack(unsigned long *stack) +static inline void free_thread_stack(struct task_struct *tsk) { - __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); +#ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; + int i; + + local_irq_save(flags); + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_read(cached_stacks[i])) + continue; + + this_cpu_write(cached_stacks[i], tsk->stack_vm_area); + local_irq_restore(flags); + return; + } + local_irq_restore(flags); + + vfree(tsk->stack); + return; + } +#endif + + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_stack(unsigned long *stack) +static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, stack); + kmem_cache_free(thread_stack_cache, tsk->stack); } void thread_stack_cache_init(void) @@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(unsigned long *stack, int account) +static void account_kernel_stack(struct task_struct *tsk, int account) { - /* All stack pages are in the same zone and belong to the same memcg. */ - struct page *first_page = virt_to_page(stack); + void *stack = task_stack_page(tsk); + struct vm_struct *vm = task_stack_vm_area(tsk); + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + + if (vm) { + int i; + + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_zone_page_state(page_zone(vm->pages[i]), + NR_KERNEL_STACK_KB, + PAGE_SIZE / 1024 * account); + } + + /* All stack pages belong to the same memcg. */ + memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } else { + /* + * All stack pages are in the same zone and belong to the + * same memcg. + */ + struct page *first_page = virt_to_page(stack); - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat( - first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } } -void free_task(struct task_struct *tsk) +static void release_task_stack(struct task_struct *tsk) { - account_kernel_stack(tsk->stack, -1); + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); - free_thread_stack(tsk->stack); + free_thread_stack(tsk); + tsk->stack = NULL; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = NULL; +#endif +} + +#ifdef CONFIG_THREAD_INFO_IN_TASK +void put_task_stack(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->stack_refcount)) + release_task_stack(tsk); +} +#endif + +void free_task(struct task_struct *tsk) +{ +#ifndef CONFIG_THREAD_INFO_IN_TASK + /* + * The task is finally done with both the stack and thread_info, + * so free both. + */ + release_task_stack(tsk); +#else + /* + * If the task had a separate stack allocation, it should be gone + * by now. + */ + WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); +#endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -243,6 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + /* + * __mmdrop is not safe to call from softirq context on x86 due to + * pgd_dtor so postpone it to the async context + */ + if (sig->oom_mm) + mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } @@ -302,6 +424,7 @@ int arch_task_struct_size __read_mostly; void __init fork_init(void) { + int i; #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES @@ -321,6 +444,10 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; + + for (i = 0; i < UCOUNT_COUNTS; i++) { + init_user_ns.ucount_max[i] = max_threads/2; + } } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -342,6 +469,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; unsigned long *stack; + struct vm_struct *stack_vm_area; int err; if (node == NUMA_NO_NODE) @@ -354,11 +482,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + stack_vm_area = task_stack_vm_area(tsk); + err = arch_dup_task_struct(tsk, orig); + + /* + * arch_dup_task_struct() clobbers the stack-related fields. Make + * sure they're properly initialized before using any stack-related + * functions again. + */ + tsk->stack = stack; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; +#endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + atomic_set(&tsk->stack_refcount, 1); +#endif + if (err) goto free_stack; - tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -390,14 +533,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(stack, 1); + account_kernel_stack(tsk, 1); kcov_task_init(tsk); return tsk; free_stack: - free_thread_stack(stack); + free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; @@ -711,6 +854,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -719,6 +863,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } @@ -1715,6 +1860,7 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + put_task_stack(p); free_task(p); fork_out: return ERR_PTR(retval);