X-Git-Url: http://git.cascardo.info/?p=cascardo%2Flinux.git;a=blobdiff_plain;f=arch%2Fx86%2Fentry%2Fentry_64.S;h=ef766a358b37dd355e8d5d75c10a61d47a83e04f;hp=766d48db95453e337f2fb95f137a80aa83d7124c;hb=84d69848c97faab0c25aa2667b273404d2e2a64a;hpb=590abbdd273304b55824bcb9ea91840ea375575d diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 766d48db9545..ef766a358b37 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -180,7 +180,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) * If we need to do entry work or if we guess we'll need to do * exit work, go straight to the slow path. */ - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + movq PER_CPU_VAR(current_task), %r11 + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz entry_SYSCALL64_slow_path entry_SYSCALL_64_fastpath: @@ -218,7 +219,8 @@ entry_SYSCALL_64_fastpath: */ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + movq PER_CPU_VAR(current_task), %r11 + testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz 1f LOCKDEP_SYS_EXIT @@ -289,11 +291,15 @@ return_from_SYSCALL_64: jne opportunistic_sysret_failed /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot + * restore RF properly. If the slowpath sets it for whatever reason, we + * need to restore it correctly. + * + * SYSRET can restore TF, but unlike IRET, restoring TF results in a + * trap from userspace immediately after SYSRET. This would cause an + * infinite loop whenever #DB happens with register state that satisfies + * the opportunistic SYSRET conditions. For example, single-stepping + * this user code: * * movq $stuck_here, %rcx * pushfq @@ -348,8 +354,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - /* Called from C */ - jmp *%rax /* called from C */ + jmp *%rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func @@ -365,42 +370,74 @@ END(ptregs_\func) #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym) #include +/* + * %rdi: prev task + * %rsi: next task + */ +ENTRY(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in inactive_task_frame + */ + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + /* switch stack */ + movq %rsp, TASK_threadsp(%rdi) + movq TASK_threadsp(%rsi), %rsp + +#ifdef CONFIG_CC_STACKPROTECTOR + movq TASK_stack_canary(%rsi), %rbx + movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset +#endif + + /* restore callee-saved registers */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + + jmp __switch_to +END(__switch_to_asm) + /* * A newly forked process directly context switches into this address. * - * rdi: prev task we switched from + * rax: prev task we switched from + * rbx: kernel thread func (NULL for user thread) + * r12: kernel thread arg */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) - + movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ - testb $3, CS(%rsp) /* from kernel_thread? */ - jnz 1f - - /* - * We came from kernel_thread. This code path is quite twisted, and - * someone should clean it up. - * - * copy_thread_tls stashes the function pointer in RBX and the - * parameter to be passed in RBP. The called function is permitted - * to call do_execve and thereby jump to user mode. - */ - movq RBP(%rsp), %rdi - call *RBX(%rsp) - movl $0, RAX(%rsp) - - /* - * Fall through as though we're exiting a syscall. This makes a - * twisted sort of sense if we just called do_execve. - */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ -1: +2: movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS jmp restore_regs_and_iret + +1: + /* kernel thread */ + movq %r12, %rdi + call *%rbx + /* + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. + */ + movq $0, RAX(%rsp) + jmp 2b END(ret_from_fork) /* @@ -552,27 +589,69 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq %rax - pushq %rdi + /* + * We are running with user GSBASE. All GPRs contain their user + * values. We have a percpu ESPFIX stack that is eight slots + * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom + * of the ESPFIX stack. + * + * We clobber RAX and RDI in this code. We stash RDI on the + * normal stack and RAX on the ESPFIX stack. + * + * The ESPFIX stack layout we set up looks like this: + * + * --- top of ESPFIX stack --- + * SS + * RSP + * RFLAGS + * CS + * RIP <-- RSP points here when we're done + * RAX <-- espfix_waddr points here + * --- bottom of ESPFIX stack --- + */ + + pushq %rdi /* Stash user RDI */ SWAPGS movq PER_CPU_VAR(espfix_waddr), %rdi - movq %rax, (0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (0*8)(%rdi) /* user RAX */ + movq (1*8)(%rsp), %rax /* user RIP */ movq %rax, (1*8)(%rdi) - movq (3*8)(%rsp), %rax /* CS */ + movq (2*8)(%rsp), %rax /* user CS */ movq %rax, (2*8)(%rdi) - movq (4*8)(%rsp), %rax /* RFLAGS */ + movq (3*8)(%rsp), %rax /* user RFLAGS */ movq %rax, (3*8)(%rdi) - movq (6*8)(%rsp), %rax /* SS */ + movq (5*8)(%rsp), %rax /* user SS */ movq %rax, (5*8)(%rdi) - movq (5*8)(%rsp), %rax /* RSP */ + movq (4*8)(%rsp), %rax /* user RSP */ movq %rax, (4*8)(%rdi) - andl $0xffff0000, %eax - popq %rdi + /* Now RAX == RSP. */ + + andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ + popq %rdi /* Restore user RDI */ + + /* + * espfix_stack[31:16] == 0. The page tables are set up such that + * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of + * espfix_waddr for any X. That is, there are 65536 RO aliases of + * the same page. Set up RSP so that RSP[31:16] contains the + * respective 16 bits of the /userspace/ RSP and RSP nonetheless + * still points to an RO alias of the ESPFIX stack. + */ orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp - popq %rax + + /* + * At this point, we cannot write to the stack any more, but we can + * still read. + */ + popq %rax /* Restore user RAX */ + + /* + * RSP now points to an ordinary IRET frame, except that the page + * is read-only and RSP[31:16] are preloaded with the userspace + * values. We can now IRET back to userspace. + */ jmp native_irq_return_iret #endif END(common_interrupt) @@ -602,9 +681,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym) .endm #endif +/* Make sure APIC interrupt handlers end up in the irqentry section: */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +# define POP_SECTION_IRQENTRY .popsection +#else +# define PUSH_SECTION_IRQENTRY +# define POP_SECTION_IRQENTRY +#endif + .macro apicinterrupt num sym do_sym +PUSH_SECTION_IRQENTRY apicinterrupt3 \num \sym \do_sym trace_apicinterrupt \num \sym +POP_SECTION_IRQENTRY .endm #ifdef CONFIG_SMP @@ -989,7 +1079,6 @@ ENTRY(error_entry) testb $3, CS+8(%rsp) jz .Lerror_kernelspace -.Lerror_entry_from_usermode_swapgs: /* * We entered from user mode or we're pretending to have entered * from user mode due to an IRET fault. @@ -1032,7 +1121,8 @@ ENTRY(error_entry) * gsbase and proceed. We'll fix up the exception and land in * .Lgs_change's error handler with kernel gsbase. */ - jmp .Lerror_entry_from_usermode_swapgs + SWAPGS + jmp .Lerror_entry_done .Lbstep_iret: /* Fix truncated RIP */ @@ -1060,7 +1150,7 @@ END(error_entry) /* - * On entry, EBS is a "return to kernel mode" flag: + * On entry, EBX is a "return to kernel mode" flag: * 1: already in kernel mode, don't need SWAPGS * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */