sched/x86: Pass kernel thread parameters in 'struct fork_frame'
authorBrian Gerst <brgerst@gmail.com>
Sat, 13 Aug 2016 16:38:20 +0000 (12:38 -0400)
committerIngo Molnar <mingo@kernel.org>
Wed, 24 Aug 2016 10:31:50 +0000 (12:31 +0200)
Instead of setting up a fake pt_regs context, put the kernel thread
function pointer and arg into the unused callee-restored registers
of 'struct fork_frame'.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Reviewed-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1471106302-10159-6-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64.S
arch/x86/include/asm/switch_to.h
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c

index bf8f221..b75a8bc 100644 (file)
@@ -240,35 +240,34 @@ END(__switch_to_asm)
  * A newly forked process directly context switches into this address.
  *
  * eax: prev task we switched from
+ * ebx: kernel thread func (NULL for user thread)
+ * edi: kernel thread arg
  */
 ENTRY(ret_from_fork)
        pushl   %eax
        call    schedule_tail
        popl    %eax
 
+       testl   %ebx, %ebx
+       jnz     1f              /* kernel threads are uncommon */
+
+2:
        /* When we fork, we trace the syscall return in the child, too. */
        movl    %esp, %eax
        call    syscall_return_slowpath
        jmp     restore_all
-END(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
-       pushl   %eax
-       call    schedule_tail
-       popl    %eax
-       movl    PT_EBP(%esp), %eax
-       call    *PT_EBX(%esp)
-       movl    $0, PT_EAX(%esp)
 
+       /* kernel thread */
+1:     movl    %edi, %eax
+       call    *%ebx
        /*
-        * Kernel threads return to userspace as if returning from a syscall.
-        * We should check whether anything actually uses this path and, if so,
-        * consider switching it over to ret_from_fork.
+        * A kernel thread is allowed to return here after successfully
+        * calling do_execve().  Exit to userspace to complete the execve()
+        * syscall.
         */
-       movl    %esp, %eax
-       call    syscall_return_slowpath
-       jmp     restore_all
-ENDPROC(ret_from_kernel_thread)
+       movl    $0, PT_EAX(%esp)
+       jmp     2b
+END(ret_from_fork)
 
 /*
  * Return to user mode is not as complex as all this looks,
index c1af8ac..c0373d6 100644 (file)
@@ -407,37 +407,34 @@ END(__switch_to_asm)
  * A newly forked process directly context switches into this address.
  *
  * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
  */
 ENTRY(ret_from_fork)
        movq    %rax, %rdi
        call    schedule_tail                   /* rdi: 'prev' task parameter */
 
-       testb   $3, CS(%rsp)                    /* from kernel_thread? */
-       jnz     1f
-
-       /*
-        * We came from kernel_thread.  This code path is quite twisted, and
-        * someone should clean it up.
-        *
-        * copy_thread_tls stashes the function pointer in RBX and the
-        * parameter to be passed in RBP.  The called function is permitted
-        * to call do_execve and thereby jump to user mode.
-        */
-       movq    RBP(%rsp), %rdi
-       call    *RBX(%rsp)
-       movl    $0, RAX(%rsp)
-
-       /*
-        * Fall through as though we're exiting a syscall.  This makes a
-        * twisted sort of sense if we just called do_execve.
-        */
+       testq   %rbx, %rbx                      /* from kernel_thread? */
+       jnz     1f                              /* kernel threads are uncommon */
 
-1:
+2:
        movq    %rsp, %rdi
        call    syscall_return_slowpath /* returns with IRQs disabled */
        TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
        SWAPGS
        jmp     restore_regs_and_iret
+
+1:
+       /* kernel thread */
+       movq    %r12, %rdi
+       call    *%rbx
+       /*
+        * A kernel thread is allowed to return here after successfully
+        * calling do_execve().  Exit to userspace to complete the execve()
+        * syscall.
+        */
+       movq    $0, RAX(%rsp)
+       jmp     2b
 END(ret_from_fork)
 
 /*
index 886d5ea..5cb436a 100644 (file)
@@ -34,6 +34,8 @@ static inline void prepare_switch_to(struct task_struct *prev,
 #endif
 }
 
+asmlinkage void ret_from_fork(void);
+
 /* data that is pointed to by thread.sp */
 struct inactive_task_frame {
 #ifdef CONFIG_X86_64
index 4bedbc0..18714a1 100644 (file)
@@ -55,9 +55,6 @@
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
 
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
-
 /*
  * Return saved PC of a blocked thread.
  */
@@ -139,6 +136,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        int err;
 
        frame->bp = 0;
+       frame->ret_addr = (unsigned long) ret_from_fork;
        p->thread.sp = (unsigned long) fork_frame;
        p->thread.sp0 = (unsigned long) (childregs+1);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -146,25 +144,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
-               frame->ret_addr = (unsigned long) ret_from_kernel_thread;
-               task_user_gs(p) = __KERNEL_STACK_CANARY;
-               childregs->ds = __USER_DS;
-               childregs->es = __USER_DS;
-               childregs->fs = __KERNEL_PERCPU;
-               childregs->bx = sp;     /* function */
-               childregs->bp = arg;
-               childregs->orig_ax = -1;
-               childregs->cs = __KERNEL_CS | get_kernel_rpl();
-               childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+               frame->bx = sp;         /* function */
+               frame->di = arg;
                p->thread.io_bitmap_ptr = NULL;
                return 0;
        }
+       frame->bx = 0;
        *childregs = *current_pt_regs();
        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;
 
-       frame->ret_addr = (unsigned long) ret_from_fork;
        task_user_gs(p) = get_user_gs(current_pt_regs());
 
        p->thread.io_bitmap_ptr = NULL;
index 827eeed..b812cd0 100644 (file)
@@ -50,8 +50,6 @@
 #include <asm/switch_to.h>
 #include <asm/xen/hypervisor.h>
 
-asmlinkage extern void ret_from_fork(void);
-
 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
 /* Prints also some state that isn't saved in the pt_regs */
@@ -165,15 +163,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
-               childregs->sp = (unsigned long)childregs;
-               childregs->ss = __KERNEL_DS;
-               childregs->bx = sp; /* function */
-               childregs->bp = arg;
-               childregs->orig_ax = -1;
-               childregs->cs = __KERNEL_CS | get_kernel_rpl();
-               childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+               frame->bx = sp;         /* function */
+               frame->r12 = arg;
                return 0;
        }
+       frame->bx = 0;
        *childregs = *current_pt_regs();
 
        childregs->ax = 0;