Merge branch 'x86/urgent' into x86/asm
author Thomas Gleixner <tglx@linutronix.de>
Fri, 30 Sep 2016 10:38:28 +0000 (12:38 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Fri, 30 Sep 2016 10:38:28 +0000 (12:38 +0200)
Get the cr4 fixes so we can apply the final cleanup

108 files changed:
Documentation/trace/ftrace-design.txt
arch/Kconfig
arch/arm/kernel/ftrace.c
arch/arm64/kernel/entry-ftrace.S
arch/arm64/kernel/ftrace.c
arch/blackfin/kernel/ftrace-entry.S
arch/blackfin/kernel/ftrace.c
arch/ia64/include/asm/thread_info.h
arch/microblaze/kernel/ftrace.c
arch/mips/kernel/ftrace.c
arch/parisc/kernel/ftrace.c
arch/powerpc/kernel/ftrace.c
arch/s390/kernel/ftrace.c
arch/sh/kernel/ftrace.c
arch/sparc/Kconfig
arch/sparc/include/asm/ftrace.h
arch/sparc/kernel/ftrace.c
arch/tile/kernel/ftrace.c
arch/x86/Kconfig
arch/x86/entry/common.c
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64.S
arch/x86/events/core.c
arch/x86/include/asm/alternative.h
arch/x86/include/asm/desc.h
arch/x86/include/asm/fpu/xstate.h
arch/x86/include/asm/ftrace.h
arch/x86/include/asm/kaslr.h
arch/x86/include/asm/kdebug.h
arch/x86/include/asm/pgtable_64_types.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/realmode.h
arch/x86/include/asm/smp.h
arch/x86/include/asm/stacktrace.h
arch/x86/include/asm/switch_to.h
arch/x86/include/asm/syscall.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/traps.h
arch/x86/include/asm/unwind.h [new file with mode: 0644]
arch/x86/kernel/Makefile
arch/x86/kernel/acpi/sleep.c
arch/x86/kernel/apic/apic_flat_64.c
arch/x86/kernel/apic/apic_noop.c
arch/x86/kernel/apic/bigsmp_32.c
arch/x86/kernel/apic/msi.c
arch/x86/kernel/apic/probe_32.c
arch/x86/kernel/apic/x2apic_cluster.c
arch/x86/kernel/apic/x2apic_phys.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/asm-offsets_32.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/cpu/mtrr/mtrr.h
arch/x86/kernel/dumpstack.c
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/fpu/init.c
arch/x86/kernel/ftrace.c
arch/x86/kernel/head_32.S
arch/x86/kernel/head_64.S
arch/x86/kernel/irq_64.c
arch/x86/kernel/kgdb.c
arch/x86/kernel/ksysfs.c
arch/x86/kernel/kvmclock.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/signal.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/stacktrace.c
arch/x86/kernel/traps.c
arch/x86/kernel/unwind_frame.c [new file with mode: 0644]
arch/x86/kernel/unwind_guess.c [new file with mode: 0644]
arch/x86/kernel/x86_init.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/mm/fault.c
arch/x86/mm/kaslr.c
arch/x86/mm/tlb.c
arch/x86/oprofile/backtrace.c
arch/x86/pci/pcbios.c
arch/x86/um/ptrace_32.c
drivers/iommu/amd_iommu.c
drivers/iommu/amd_iommu_types.h
fs/proc/base.c
include/linux/ftrace.h
include/linux/init_task.h
include/linux/sched.h
include/linux/thread_info.h
init/Kconfig
init/init_task.c
kernel/fork.c
kernel/kthread.c
kernel/sched/core.c
kernel/sched/sched.h
kernel/trace/Kconfig
kernel/trace/trace_functions_graph.c
lib/dma-debug.c
lib/syscall.c
tools/testing/selftests/x86/ptrace_syscall.c
tools/testing/selftests/x86/sigreturn.c

index dd5f916..a273dd0 100644 (file)
@@ -203,6 +203,17 @@ along to ftrace_push_return_trace() instead of a stub value of 0.
 
 Similarly, when you call ftrace_return_to_handler(), pass it the frame pointer.
 
+HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+--------------------------------
+
+An arch may pass in a pointer to the return address on the stack.  This
+prevents potential stack unwinding issues where the unwinder gets out of
+sync with ret_stack and the wrong addresses are reported by
+ftrace_graph_ret_addr().
+
+Adding support for it is easy: just define the macro in asm/ftrace.h and
+pass the return address pointer as the 'retp' argument to
+ftrace_push_return_trace().
 
 HAVE_FTRACE_NMI_ENTER
 ---------------------
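
Regarding the HAVE_FUNCTION_GRAPH_RET_ADDR_PTR section added above: here is a minimal sketch of an arch hook that opts in, shaped after the powerpc/sparc hunks below but illustrative only, not code from this series. The point is that 'parent' already is the address of the return-address slot on the stack, so it doubles as the new 'retp' argument:

#include <linux/ftrace.h>

extern void return_to_handler(void);

/* Illustrative arch hook (not from this series).  Passing 'parent' as
 * 'retp' lets ftrace_graph_ret_addr() later match this stack slot and
 * keep an unwinder in sync with ret_stack. */
void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
{
	struct ftrace_graph_ent trace;
	unsigned long old = *parent;

	trace.func = self_addr;
	trace.depth = current->curr_ret_stack + 1;

	/* Only trace if the calling function expects to. */
	if (!ftrace_graph_entry(&trace))
		return;

	if (ftrace_push_return_trace(old, self_addr, &trace.depth,
				     0, parent) == -EBUSY)
		return;

	*parent = (unsigned long)return_to_handler;
}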
index fd6e971..180ea33 100644 (file)
@@ -696,4 +696,38 @@ config ARCH_NO_COHERENT_DMA_MMAP
 config CPU_NO_EFFICIENT_FFS
        def_bool n
 
+config HAVE_ARCH_VMAP_STACK
+       def_bool n
+       help
+         An arch should select this symbol if it can support kernel stacks
+         in vmalloc space.  This means:
+
+         - vmalloc space must be large enough to hold many kernel stacks.
+           This may rule out many 32-bit architectures.
+
+         - Stacks in vmalloc space need to work reliably.  For example, if
+           vmap page tables are created on demand, either this mechanism
+           needs to work while the stack points to a virtual address with
+           unpopulated page tables or arch code (switch_to() and switch_mm(),
+           most likely) needs to ensure that the stack's page table entries
+           are populated before running on a possibly unpopulated stack.
+
+         - If the stack overflows into a guard page, something reasonable
+           should happen.  The definition of "reasonable" is flexible, but
+           instantly rebooting without logging anything would be unfriendly.
+
+config VMAP_STACK
+       default y
+       bool "Use a virtually-mapped stack"
+       depends on HAVE_ARCH_VMAP_STACK && !KASAN
+       ---help---
+         Enable this if you want to use virtually-mapped kernel stacks
+         with guard pages.  This causes kernel stack overflows to be
+         caught immediately rather than causing difficult-to-diagnose
+         corruption.
+
+         This is presently incompatible with KASAN because KASAN expects
+         the stack to map directly to the KASAN shadow map using a formula
+         that is incorrect if the stack is in vmalloc space.
+
 source "kernel/gcov/Kconfig"
index 709ee1d..3f17594 100644 (file)
@@ -218,7 +218,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
        }
 
        err = ftrace_push_return_trace(old, self_addr, &trace.depth,
-                                      frame_pointer);
+                                      frame_pointer, NULL);
        if (err == -EBUSY) {
                *parent = old;
                return;
index 0f03a8f..aef02d2 100644 (file)
@@ -219,7 +219,7 @@ ENDPROC(ftrace_graph_caller)
  *
  * Run ftrace_return_to_handler() before going back to parent.
  * @fp is checked against the value passed by ftrace_graph_caller()
- * only when CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST is enabled.
+ * only when HAVE_FUNCTION_GRAPH_FP_TEST is enabled.
  */
 ENTRY(return_to_handler)
        save_return_regs
index ebecf9a..40ad08a 100644 (file)
@@ -138,7 +138,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
                return;
 
        err = ftrace_push_return_trace(old, self_addr, &trace.depth,
-                                      frame_pointer);
+                                      frame_pointer, NULL);
        if (err == -EBUSY)
                return;
        else
index 28d0595..3b8bdcb 100644 (file)
@@ -169,7 +169,7 @@ ENTRY(_ftrace_graph_caller)
        r0 = sp;        /* unsigned long *parent */
        r1 = [sp];      /* unsigned long self_addr */
 # endif
-# ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+# ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        r2 = fp;        /* unsigned long frame_pointer */
 # endif
        r0 += 16;       /* skip the 4 local regs on stack */
@@ -190,7 +190,7 @@ ENTRY(_return_to_handler)
        [--sp] = r1;
 
        /* get original return address */
-# ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+# ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        r0 = fp;        /* Blackfin is sane, so omit this */
 # endif
        call _ftrace_return_to_handler;
index 095de0f..8dad758 100644 (file)
@@ -107,7 +107,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
                return;
 
        if (ftrace_push_return_trace(*parent, self_addr, &trace.depth,
-                                    frame_pointer) == -EBUSY)
+                                    frame_pointer, NULL) == -EBUSY)
                return;
 
        trace.func = self_addr;
index 29bd597..c702642 100644 (file)
@@ -56,7 +56,7 @@ struct thread_info {
 #define alloc_thread_stack_node(tsk, node)     ((unsigned long *) 0)
 #define task_thread_info(tsk)  ((struct thread_info *) 0)
 #endif
-#define free_thread_stack(ti)  /* nothing */
+#define free_thread_stack(tsk) /* nothing */
 #define task_stack_page(tsk)   ((void *)(tsk))
 
 #define __HAVE_THREAD_FUNCTIONS
index fc7b48a..d57563c 100644 (file)
@@ -63,7 +63,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
                return;
        }
 
-       err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0);
+       err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0, NULL);
        if (err == -EBUSY) {
                *parent = old;
                return;
index 937c54b..30a3b75 100644 (file)
@@ -382,8 +382,8 @@ void prepare_ftrace_return(unsigned long *parent_ra_addr, unsigned long self_ra,
        if (unlikely(faulted))
                goto out;
 
-       if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp)
-           == -EBUSY) {
+       if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp,
+                                    NULL) == -EBUSY) {
                *parent_ra_addr = old_parent_ra;
                return;
        }
index a828a0a..5a5506a 100644 (file)
@@ -48,7 +48,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent,
                return;
 
         if (ftrace_push_return_trace(old, self_addr, &trace.depth,
-                       0) == -EBUSY)
+                                    0, NULL) == -EBUSY)
                 return;
 
        /* activate parisc_return_to_handler() as return point */
index cc52d97..a95639b 100644 (file)
@@ -593,7 +593,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
        if (!ftrace_graph_entry(&trace))
                goto out;
 
-       if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY)
+       if (ftrace_push_return_trace(parent, ip, &trace.depth, 0,
+                                    NULL) == -EBUSY)
                goto out;
 
        parent = return_hooker;
index 0f7bfeb..60a8a4e 100644 (file)
@@ -209,7 +209,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip)
        /* Only trace if the calling function expects to. */
        if (!ftrace_graph_entry(&trace))
                goto out;
-       if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY)
+       if (ftrace_push_return_trace(parent, ip, &trace.depth, 0,
+                                    NULL) == -EBUSY)
                goto out;
        parent = (unsigned long) return_to_handler;
 out:
index 38993e0..95eccd4 100644 (file)
@@ -382,7 +382,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
                return;
        }
 
-       err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0);
+       err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0, NULL);
        if (err == -EBUSY) {
                __raw_writel(old, parent);
                return;
index 59b0960..f5d60f1 100644 (file)
@@ -56,7 +56,6 @@ config SPARC64
        def_bool 64BIT
        select HAVE_FUNCTION_TRACER
        select HAVE_FUNCTION_GRAPH_TRACER
-       select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_KRETPROBES
        select HAVE_KPROBES
        select HAVE_RCU_TABLE_FREE if SMP
index 3192a8e..62755a3 100644 (file)
@@ -9,6 +9,10 @@
 void _mcount(void);
 #endif
 
+#endif /* CONFIG_MCOUNT */
+
+#if defined(CONFIG_SPARC64) && !defined(CC_USE_FENTRY)
+#define HAVE_FUNCTION_GRAPH_FP_TEST
 #endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE
index 0a2d2dd..6bcff69 100644 (file)
@@ -131,7 +131,7 @@ unsigned long prepare_ftrace_return(unsigned long parent,
                return parent + 8UL;
 
        if (ftrace_push_return_trace(parent, self_addr, &trace.depth,
-                                    frame_pointer) == -EBUSY)
+                                    frame_pointer, NULL) == -EBUSY)
                return parent + 8UL;
 
        trace.func = self_addr;
index 4a57208..b827a41 100644 (file)
@@ -184,7 +184,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
        *parent = return_hooker;
 
        err = ftrace_push_return_trace(old, self_addr, &trace.depth,
-                                      frame_pointer);
+                                      frame_pointer, NULL);
        if (err == -EBUSY) {
                *parent = old;
                return;
index 2a1f0ce..2a83bc8 100644 (file)
@@ -93,6 +93,7 @@ config X86
        select HAVE_ARCH_TRANSPARENT_HUGEPAGE
        select HAVE_ARCH_WITHIN_STACK_FRAMES
        select HAVE_EBPF_JIT                    if X86_64
+       select HAVE_ARCH_VMAP_STACK             if X86_64
        select HAVE_CC_STACKPROTECTOR
        select HAVE_CMPXCHG_DOUBLE
        select HAVE_CMPXCHG_LOCAL
@@ -109,7 +110,6 @@ config X86
        select HAVE_EXIT_THREAD
        select HAVE_FENTRY                      if X86_64
        select HAVE_FTRACE_MCOUNT_RECORD
-       select HAVE_FUNCTION_GRAPH_FP_TEST
        select HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACER
        select HAVE_GCC_PLUGINS
@@ -157,6 +157,7 @@ config X86
        select SPARSE_IRQ
        select SRCU
        select SYSCTL_EXCEPTION_TRACE
+       select THREAD_INFO_IN_TASK
        select USER_STACKTRACE_SUPPORT
        select VIRT_TO_BUS
        select X86_DEV_DMA_OPS                  if X86_64
index 1433f6b..bdd9cc5 100644 (file)
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
-static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
-{
-       unsigned long top_of_stack =
-               (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
-       return (struct thread_info *)(top_of_stack - THREAD_SIZE);
-}
-
 #ifdef CONFIG_CONTEXT_TRACKING
 /* Called on entry from user mode with IRQs off. */
 __visible inline void enter_from_user_mode(void)
@@ -71,7 +64,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
 {
        u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
 
-       struct thread_info *ti = pt_regs_to_thread_info(regs);
+       struct thread_info *ti = current_thread_info();
        unsigned long ret = 0;
        bool emulated = false;
        u32 work;
@@ -173,18 +166,17 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
                /* Disable IRQs and retry */
                local_irq_disable();
 
-               cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+               cached_flags = READ_ONCE(current_thread_info()->flags);
 
                if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
                        break;
-
        }
 }
 
 /* Called with IRQs disabled. */
 __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 {
-       struct thread_info *ti = pt_regs_to_thread_info(regs);
+       struct thread_info *ti = current_thread_info();
        u32 cached_flags;
 
        if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -209,7 +201,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
         * special case only applies after poking regs and before the
         * very next return to user mode.
         */
-       ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
+       current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
 #endif
 
        user_enter_irqoff();
@@ -247,7 +239,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
  */
 __visible inline void syscall_return_slowpath(struct pt_regs *regs)
 {
-       struct thread_info *ti = pt_regs_to_thread_info(regs);
+       struct thread_info *ti = current_thread_info();
        u32 cached_flags = READ_ONCE(ti->flags);
 
        CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
@@ -270,7 +262,7 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
 #ifdef CONFIG_X86_64
 __visible void do_syscall_64(struct pt_regs *regs)
 {
-       struct thread_info *ti = pt_regs_to_thread_info(regs);
+       struct thread_info *ti = current_thread_info();
        unsigned long nr = regs->orig_ax;
 
        enter_from_user_mode();
@@ -303,11 +295,11 @@ __visible void do_syscall_64(struct pt_regs *regs)
  */
 static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
 {
-       struct thread_info *ti = pt_regs_to_thread_info(regs);
+       struct thread_info *ti = current_thread_info();
        unsigned int nr = (unsigned int)regs->orig_ax;
 
 #ifdef CONFIG_IA32_EMULATION
-       ti->status |= TS_COMPAT;
+       current->thread.status |= TS_COMPAT;
 #endif
 
        if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
index 0b56666..b75a8bc 100644 (file)
        POP_GS_EX
 .endm
 
+/*
+ * %eax: prev task
+ * %edx: next task
+ */
+ENTRY(__switch_to_asm)
+       /*
+        * Save callee-saved registers
+        * This must match the order in struct inactive_task_frame
+        */
+       pushl   %ebp
+       pushl   %ebx
+       pushl   %edi
+       pushl   %esi
+
+       /* switch stack */
+       movl    %esp, TASK_threadsp(%eax)
+       movl    TASK_threadsp(%edx), %esp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+       movl    TASK_stack_canary(%edx), %ebx
+       movl    %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+#endif
+
+       /* restore callee-saved registers */
+       popl    %esi
+       popl    %edi
+       popl    %ebx
+       popl    %ebp
+
+       jmp     __switch_to
+END(__switch_to_asm)
+
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * eax: prev task we switched from
+ * ebx: kernel thread func (NULL for user thread)
+ * edi: kernel thread arg
+ */
 ENTRY(ret_from_fork)
        pushl   %eax
        call    schedule_tail
        popl    %eax
 
+       testl   %ebx, %ebx
+       jnz     1f              /* kernel threads are uncommon */
+
+2:
        /* When we fork, we trace the syscall return in the child, too. */
        movl    %esp, %eax
        call    syscall_return_slowpath
        jmp     restore_all
-END(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
-       pushl   %eax
-       call    schedule_tail
-       popl    %eax
-       movl    PT_EBP(%esp), %eax
-       call    *PT_EBX(%esp)
-       movl    $0, PT_EAX(%esp)
 
+       /* kernel thread */
+1:     movl    %edi, %eax
+       call    *%ebx
        /*
-        * Kernel threads return to userspace as if returning from a syscall.
-        * We should check whether anything actually uses this path and, if so,
-        * consider switching it over to ret_from_fork.
+        * A kernel thread is allowed to return here after successfully
+        * calling do_execve().  Exit to userspace to complete the execve()
+        * syscall.
         */
-       movl    %esp, %eax
-       call    syscall_return_slowpath
-       jmp     restore_all
-ENDPROC(ret_from_kernel_thread)
+       movl    $0, PT_EAX(%esp)
+       jmp     2b
+END(ret_from_fork)
 
 /*
  * Return to user mode is not as complex as all this looks,
index d172c61..80ab68a 100644 (file)
@@ -179,7 +179,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
         * If we need to do entry work or if we guess we'll need to do
         * exit work, go straight to the slow path.
         */
-       testl   $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+       movq    PER_CPU_VAR(current_task), %r11
+       testl   $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
        jnz     entry_SYSCALL64_slow_path
 
 entry_SYSCALL_64_fastpath:
@@ -217,7 +218,8 @@ entry_SYSCALL_64_fastpath:
         */
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       testl   $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+       movq    PER_CPU_VAR(current_task), %r11
+       testl   $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11)
        jnz     1f
 
        LOCKDEP_SYS_EXIT
@@ -351,8 +353,7 @@ ENTRY(stub_ptregs_64)
        jmp     entry_SYSCALL64_slow_path
 
 1:
-       /* Called from C */
-       jmp     *%rax                           /* called from C */
+       jmp     *%rax                           /* Called from C */
 END(stub_ptregs_64)
 
 .macro ptregs_stub func
@@ -368,42 +369,74 @@ END(ptregs_\func)
 #define __SYSCALL_64(nr, sym, qual) __SYSCALL_64_QUAL_##qual(sym)
 #include <asm/syscalls_64.h>
 
+/*
+ * %rdi: prev task
+ * %rsi: next task
+ */
+ENTRY(__switch_to_asm)
+       /*
+        * Save callee-saved registers
+        * This must match the order in inactive_task_frame
+        */
+       pushq   %rbp
+       pushq   %rbx
+       pushq   %r12
+       pushq   %r13
+       pushq   %r14
+       pushq   %r15
+
+       /* switch stack */
+       movq    %rsp, TASK_threadsp(%rdi)
+       movq    TASK_threadsp(%rsi), %rsp
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+       movq    TASK_stack_canary(%rsi), %rbx
+       movq    %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset
+#endif
+
+       /* restore callee-saved registers */
+       popq    %r15
+       popq    %r14
+       popq    %r13
+       popq    %r12
+       popq    %rbx
+       popq    %rbp
+
+       jmp     __switch_to
+END(__switch_to_asm)
+
 /*
  * A newly forked process directly context switches into this address.
  *
- * rdi: prev task we switched from
+ * rax: prev task we switched from
+ * rbx: kernel thread func (NULL for user thread)
+ * r12: kernel thread arg
  */
 ENTRY(ret_from_fork)
-       LOCK ; btr $TIF_FORK, TI_flags(%r8)
-
+       movq    %rax, %rdi
        call    schedule_tail                   /* rdi: 'prev' task parameter */
 
-       testb   $3, CS(%rsp)                    /* from kernel_thread? */
-       jnz     1f
+       testq   %rbx, %rbx                      /* from kernel_thread? */
+       jnz     1f                              /* kernel threads are uncommon */
 
-       /*
-        * We came from kernel_thread.  This code path is quite twisted, and
-        * someone should clean it up.
-        *
-        * copy_thread_tls stashes the function pointer in RBX and the
-        * parameter to be passed in RBP.  The called function is permitted
-        * to call do_execve and thereby jump to user mode.
-        */
-       movq    RBP(%rsp), %rdi
-       call    *RBX(%rsp)
-       movl    $0, RAX(%rsp)
-
-       /*
-        * Fall through as though we're exiting a syscall.  This makes a
-        * twisted sort of sense if we just called do_execve.
-        */
-
-1:
+2:
        movq    %rsp, %rdi
        call    syscall_return_slowpath /* returns with IRQs disabled */
        TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
        SWAPGS
        jmp     restore_regs_and_iret
+
+1:
+       /* kernel thread */
+       movq    %r12, %rdi
+       call    *%rbx
+       /*
+        * A kernel thread is allowed to return here after successfully
+        * calling do_execve().  Exit to userspace to complete the execve()
+        * syscall.
+        */
+       movq    $0, RAX(%rsp)
+       jmp     2b
 END(ret_from_fork)
 
 /*
@@ -555,27 +588,69 @@ native_irq_return_iret:
 
 #ifdef CONFIG_X86_ESPFIX64
 native_irq_return_ldt:
-       pushq   %rax
-       pushq   %rdi
+       /*
+        * We are running with user GSBASE.  All GPRs contain their user
+        * values.  We have a percpu ESPFIX stack that is eight slots
+        * long (see ESPFIX_STACK_SIZE).  espfix_waddr points to the bottom
+        * of the ESPFIX stack.
+        *
+        * We clobber RAX and RDI in this code.  We stash RDI on the
+        * normal stack and RAX on the ESPFIX stack.
+        *
+        * The ESPFIX stack layout we set up looks like this:
+        *
+        * --- top of ESPFIX stack ---
+        * SS
+        * RSP
+        * RFLAGS
+        * CS
+        * RIP  <-- RSP points here when we're done
+        * RAX  <-- espfix_waddr points here
+        * --- bottom of ESPFIX stack ---
+        */
+
+       pushq   %rdi                            /* Stash user RDI */
        SWAPGS
        movq    PER_CPU_VAR(espfix_waddr), %rdi
-       movq    %rax, (0*8)(%rdi)               /* RAX */
-       movq    (2*8)(%rsp), %rax               /* RIP */
+       movq    %rax, (0*8)(%rdi)               /* user RAX */
+       movq    (1*8)(%rsp), %rax               /* user RIP */
        movq    %rax, (1*8)(%rdi)
-       movq    (3*8)(%rsp), %rax               /* CS */
+       movq    (2*8)(%rsp), %rax               /* user CS */
        movq    %rax, (2*8)(%rdi)
-       movq    (4*8)(%rsp), %rax               /* RFLAGS */
+       movq    (3*8)(%rsp), %rax               /* user RFLAGS */
        movq    %rax, (3*8)(%rdi)
-       movq    (6*8)(%rsp), %rax               /* SS */
+       movq    (5*8)(%rsp), %rax               /* user SS */
        movq    %rax, (5*8)(%rdi)
-       movq    (5*8)(%rsp), %rax               /* RSP */
+       movq    (4*8)(%rsp), %rax               /* user RSP */
        movq    %rax, (4*8)(%rdi)
-       andl    $0xffff0000, %eax
-       popq    %rdi
+       /* Now RAX == RSP. */
+
+       andl    $0xffff0000, %eax               /* RAX = (RSP & 0xffff0000) */
+       popq    %rdi                            /* Restore user RDI */
+
+       /*
+        * espfix_stack[31:16] == 0.  The page tables are set up such that
+        * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of
+        * espfix_waddr for any X.  That is, there are 65536 RO aliases of
+        * the same page.  Set up RSP so that RSP[31:16] contains the
+        * respective 16 bits of the /userspace/ RSP and RSP nonetheless
+        * still points to an RO alias of the ESPFIX stack.
+        */
        orq     PER_CPU_VAR(espfix_stack), %rax
        SWAPGS
        movq    %rax, %rsp
-       popq    %rax
+
+       /*
+        * At this point, we cannot write to the stack any more, but we can
+        * still read.
+        */
+       popq    %rax                            /* Restore user RAX */
+
+       /*
+        * RSP now points to an ordinary IRET frame, except that the page
+        * is read-only and RSP[31:16] are preloaded with the userspace
+        * values.  We can now IRET back to userspace.
+        */
        jmp     native_irq_return_iret
 #endif
 END(common_interrupt)
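
The address computation in the new native_irq_return_ldt comments boils down to a single OR; in C, with hypothetical names:

/* espfix_stack has bits 31:16 clear, and the espfix page tables map the
 * same read-only page at every value of those bits, so the result still
 * aliases the ESPFIX stack while RSP[31:16] carries the respective bits
 * of the userspace RSP. */
static unsigned long espfix_alias_rsp(unsigned long espfix_stack,
				      unsigned long user_rsp)
{
	return espfix_stack | (user_rsp & 0xffff0000UL);
}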
index d0efb5c..0a8bd7f 100644 (file)
@@ -37,6 +37,7 @@
 #include <asm/timer.h>
 #include <asm/desc.h>
 #include <asm/ldt.h>
+#include <asm/unwind.h>
 
 #include "perf_event.h"
 
@@ -2247,39 +2248,26 @@ void arch_perf_update_userpage(struct perf_event *event,
        cyc2ns_read_end(data);
 }
 
-/*
- * callchain support
- */
-
-static int backtrace_stack(void *data, char *name)
-{
-       return 0;
-}
-
-static int backtrace_address(void *data, unsigned long addr, int reliable)
-{
-       struct perf_callchain_entry_ctx *entry = data;
-
-       return perf_callchain_store(entry, addr);
-}
-
-static const struct stacktrace_ops backtrace_ops = {
-       .stack                  = backtrace_stack,
-       .address                = backtrace_address,
-       .walk_stack             = print_context_stack_bp,
-};
-
 void
 perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
+       struct unwind_state state;
+       unsigned long addr;
+
        if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
                /* TODO: We don't support guest os callchain now */
                return;
        }
 
-       perf_callchain_store(entry, regs->ip);
+       if (perf_callchain_store(entry, regs->ip))
+               return;
 
-       dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
+       for (unwind_start(&state, current, regs, NULL); !unwind_done(&state);
+            unwind_next_frame(&state)) {
+               addr = unwind_get_return_address(&state);
+               if (!addr || perf_callchain_store(entry, addr))
+                       return;
+       }
 }
 
 static inline int
index e77a644..1b02038 100644 (file)
@@ -217,10 +217,14 @@ static inline int alternatives_text_reserved(void *start, void *end)
  */
 #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2,   \
                           output, input...)                                  \
+{                                                                            \
+       register void *__sp asm(_ASM_SP);                                     \
        asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
                "call %P[new2]", feature2)                                    \
-               : output : [old] "i" (oldfunc), [new1] "i" (newfunc1),        \
-               [new2] "i" (newfunc2), ## input)
+               : output, "+r" (__sp)                                         \
+               : [old] "i" (oldfunc), [new1] "i" (newfunc1),                 \
+                 [new2] "i" (newfunc2), ## input);                           \
+}
 
 /*
  * use this macro(s) if you need more than one output parameter
index 4e10d73..12080d8 100644 (file)
@@ -36,7 +36,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 extern struct desc_ptr idt_descr;
 extern gate_desc idt_table[];
-extern struct desc_ptr debug_idt_descr;
+extern const struct desc_ptr debug_idt_descr;
 extern gate_desc debug_idt_table[];
 
 struct gdt_page {
index ae55a43..d4957ac 100644 (file)
@@ -45,7 +45,8 @@
 extern u64 xfeatures_mask;
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
-extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
+extern void __init update_regset_xstate_info(unsigned int size,
+                                            u64 xstate_mask);
 
 void fpu__xstate_clear_all_cpu_caps(void);
 void *get_xsave_addr(struct xregs_state *xsave, int xstate);
index a4820d4..eccd0ac 100644 (file)
@@ -6,6 +6,7 @@
 # define MCOUNT_ADDR           ((unsigned long)(__fentry__))
 #else
 # define MCOUNT_ADDR           ((unsigned long)(mcount))
+# define HAVE_FUNCTION_GRAPH_FP_TEST
 #endif
 #define MCOUNT_INSN_SIZE       5 /* sizeof mcount call */
 
@@ -13,6 +14,8 @@
 #define ARCH_SUPPORTS_FTRACE_OPS 1
 #endif
 
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+
 #ifndef __ASSEMBLY__
 extern void mcount(void);
 extern atomic_t modifying_ftrace_code;
index 2674ee3..1052a79 100644 (file)
@@ -6,6 +6,7 @@ unsigned long kaslr_get_random_long(const char *purpose);
 #ifdef CONFIG_RANDOMIZE_MEMORY
 extern unsigned long page_offset_base;
 extern unsigned long vmalloc_base;
+extern unsigned long vmemmap_base;
 
 void kernel_randomize_memory(void);
 #else
index 1ef9d58..d318811 100644 (file)
@@ -24,8 +24,6 @@ enum die_val {
 extern void printk_address(unsigned long address);
 extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
-extern void show_trace(struct task_struct *t, struct pt_regs *regs,
-                      unsigned long *sp, unsigned long bp);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void __show_regs(struct pt_regs *regs, int all);
 extern unsigned long oops_begin(void);
index 6fdef9e..3a26420 100644 (file)
@@ -57,11 +57,13 @@ typedef struct { pteval_t pte; } pte_t;
 #define MAXMEM         _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 #define VMALLOC_SIZE_TB        _AC(32, UL)
 #define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define VMEMMAP_START  _AC(0xffffea0000000000, UL)
+#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
 #ifdef CONFIG_RANDOMIZE_MEMORY
 #define VMALLOC_START  vmalloc_base
+#define VMEMMAP_START  vmemmap_base
 #else
 #define VMALLOC_START  __VMALLOC_BASE
+#define VMEMMAP_START  __VMEMMAP_BASE
 #endif /* CONFIG_RANDOMIZE_MEMORY */
 #define VMALLOC_END    (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
 #define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
index 63def95..984a7bf 100644 (file)
@@ -389,9 +389,9 @@ struct thread_struct {
        unsigned short          fsindex;
        unsigned short          gsindex;
 #endif
-#ifdef CONFIG_X86_32
-       unsigned long           ip;
-#endif
+
+       u32                     status;         /* thread synchronous flags */
+
 #ifdef CONFIG_X86_64
        unsigned long           fsbase;
        unsigned long           gsbase;
@@ -437,6 +437,15 @@ struct thread_struct {
         */
 };
 
+/*
+ * Thread-synchronous status.
+ *
+ * This is different from the flags in that nobody else
+ * ever touches our thread-synchronous status, so we don't
+ * have to worry about atomic accesses.
+ */
+#define TS_COMPAT              0x0002  /* 32bit syscall active (64BIT)*/
+
 /*
  * Set IOPL bits in EFLAGS from given mask
  */
@@ -724,8 +733,6 @@ static inline void spin_lock_prefetch(const void *x)
        .addr_limit             = KERNEL_DS,                              \
 }
 
-extern unsigned long thread_saved_pc(struct task_struct *tsk);
-
 /*
  * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
  * This is necessary to guarantee that the entire "struct pt_regs"
@@ -776,17 +783,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
        .addr_limit             = KERNEL_DS,                    \
 }
 
-/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-#define thread_saved_pc(t)     READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8))
-
 #define task_pt_regs(tsk)      ((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
 #endif /* CONFIG_X86_64 */
 
+extern unsigned long thread_saved_pc(struct task_struct *tsk);
+
 extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
                                               unsigned long new_sp);
 
index b2988c0..230e190 100644 (file)
@@ -44,9 +44,9 @@ struct trampoline_header {
 extern struct real_mode_header *real_mode_header;
 extern unsigned char real_mode_blob_end[];
 
-extern unsigned long init_rsp;
 extern unsigned long initial_code;
 extern unsigned long initial_gs;
+extern unsigned long initial_stack;
 
 extern unsigned char real_mode_blob[];
 extern unsigned char real_mode_relocs[];
index ebd0c16..19980b3 100644 (file)
@@ -39,9 +39,6 @@ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
 #endif
 
-/* Static state in head.S used to set up a CPU */
-extern unsigned long stack_start; /* Initial stack pointer address */
-
 struct task_struct;
 
 struct smp_ops {
index 0944218..37f2e0b 100644 (file)
@@ -8,86 +8,86 @@
 
 #include <linux/uaccess.h>
 #include <linux/ptrace.h>
+#include <asm/switch_to.h>
+
+enum stack_type {
+       STACK_TYPE_UNKNOWN,
+       STACK_TYPE_TASK,
+       STACK_TYPE_IRQ,
+       STACK_TYPE_SOFTIRQ,
+       STACK_TYPE_EXCEPTION,
+       STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
+};
 
-extern int kstack_depth_to_print;
-
-struct thread_info;
-struct stacktrace_ops;
-
-typedef unsigned long (*walk_stack_t)(struct task_struct *task,
-                                     unsigned long *stack,
-                                     unsigned long bp,
-                                     const struct stacktrace_ops *ops,
-                                     void *data,
-                                     unsigned long *end,
-                                     int *graph);
-
-extern unsigned long
-print_context_stack(struct task_struct *task,
-                   unsigned long *stack, unsigned long bp,
-                   const struct stacktrace_ops *ops, void *data,
-                   unsigned long *end, int *graph);
-
-extern unsigned long
-print_context_stack_bp(struct task_struct *task,
-                      unsigned long *stack, unsigned long bp,
-                      const struct stacktrace_ops *ops, void *data,
-                      unsigned long *end, int *graph);
-
-/* Generic stack tracer with callbacks */
-
-struct stacktrace_ops {
-       int (*address)(void *data, unsigned long address, int reliable);
-       /* On negative return stop dumping */
-       int (*stack)(void *data, char *name);
-       walk_stack_t    walk_stack;
+struct stack_info {
+       enum stack_type type;
+       unsigned long *begin, *end, *next_sp;
 };
 
-void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
-               const struct stacktrace_ops *ops, void *data);
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+                  struct stack_info *info);
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+                  struct stack_info *info, unsigned long *visit_mask);
+
+void stack_type_str(enum stack_type type, const char **begin,
+                   const char **end);
+
+static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
+{
+       void *begin = info->begin;
+       void *end   = info->end;
+
+       return (info->type != STACK_TYPE_UNKNOWN &&
+               addr >= begin && addr < end &&
+               addr + len > begin && addr + len <= end);
+}
+
+extern int kstack_depth_to_print;
 
 #ifdef CONFIG_X86_32
 #define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
 #else
 #define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
 #ifdef CONFIG_FRAME_POINTER
-static inline unsigned long
-stack_frame(struct task_struct *task, struct pt_regs *regs)
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
 {
-       unsigned long bp;
-
        if (regs)
-               return regs->bp;
+               return (unsigned long *)regs->bp;
 
-       if (task == current) {
-               /* Grab bp right from our regs */
-               get_bp(bp);
-               return bp;
-       }
+       if (task == current)
+               return __builtin_frame_address(0);
 
-       /* bp is the last reg pushed by switch_to */
-       return *(unsigned long *)task->thread.sp;
+       return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp;
 }
 #else
-static inline unsigned long
-stack_frame(struct task_struct *task, struct pt_regs *regs)
+static inline unsigned long *
+get_frame_pointer(struct task_struct *task, struct pt_regs *regs)
 {
-       return 0;
+       return NULL;
+}
+#endif /* CONFIG_FRAME_POINTER */
+
+static inline unsigned long *
+get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
+{
+       if (regs)
+               return (unsigned long *)kernel_stack_pointer(regs);
+
+       if (task == current)
+               return __builtin_frame_address(0);
+
+       return (unsigned long *)task->thread.sp;
 }
-#endif
 
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *stack, unsigned long bp, char *log_lvl);
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+                       unsigned long *stack, char *log_lvl);
 
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *sp, unsigned long bp, char *log_lvl);
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+                       unsigned long *sp, char *log_lvl);
 
 extern unsigned int code_bytes;
 
@@ -106,7 +106,7 @@ static inline unsigned long caller_frame_pointer(void)
 {
        struct stack_frame *frame;
 
-       get_bp(frame);
+       frame = __builtin_frame_address(0);
 
 #ifdef CONFIG_FRAME_POINTER
        frame = frame->next_frame;
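
The callback-based walk_stack/stacktrace_ops machinery removed above gives way to a queryable model; a hypothetical helper showing the intended get_stack_info()/on_stack() usage (this assumes get_stack_info() returns 0 on success, as the unwinder added later in this merge uses it):

/* Validate a candidate frame address before dereferencing it: it must
 * lie on a recognized stack, with room for a (saved bp, return address)
 * pair entirely inside that stack's bounds. */
static bool frame_addr_valid(unsigned long *frame)
{
	struct stack_info info;

	if (get_stack_info(frame, current, &info, NULL) != 0)
		return false;			/* not on any known stack */

	return on_stack(&info, frame, 2 * sizeof(long));
}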
index 8f321a1..5cb436a 100644 (file)
 #define _ASM_X86_SWITCH_TO_H
 
 struct task_struct; /* one of the stranger aspects of C forward declarations */
+
+struct task_struct *__switch_to_asm(struct task_struct *prev,
+                                   struct task_struct *next);
+
 __visible struct task_struct *__switch_to(struct task_struct *prev,
-                                          struct task_struct *next);
+                                         struct task_struct *next);
 struct tss_struct;
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                      struct tss_struct *tss);
 
-#ifdef CONFIG_X86_32
+/* This runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *prev,
+                                    struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * If we switch to a stack that has a top-level paging entry
+        * that is not present in the current mm, the resulting #PF will
+        * be promoted to a double-fault and we'll panic.  Probe
+        * the new stack now so that vmalloc_fault can fix up the page
+        * tables if needed.  This can only happen if we use a stack
+        * in vmap space.
+        *
+        * We assume that the stack is aligned so that it never spans
+        * more than one top-level paging entry.
+        *
+        * To minimize cache pollution, just follow the stack pointer.
+        */
+       READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
+asmlinkage void ret_from_fork(void);
+
+/* data that is pointed to by thread.sp */
+struct inactive_task_frame {
+#ifdef CONFIG_X86_64
+       unsigned long r15;
+       unsigned long r14;
+       unsigned long r13;
+       unsigned long r12;
+#else
+       unsigned long si;
+       unsigned long di;
+#endif
+       unsigned long bx;
+       unsigned long bp;
+       unsigned long ret_addr;
+};
 
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary                                                        \
-       "movl %P[task_canary](%[next]), %%ebx\n\t"                      \
-       "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
-#define __switch_canary_oparam                                         \
-       , [stack_canary] "=m" (stack_canary.canary)
-#define __switch_canary_iparam                                         \
-       , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else  /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
+struct fork_frame {
+       struct inactive_task_frame frame;
+       struct pt_regs regs;
+};
 
-/*
- * Saving eflags is important. It switches not only IOPL between tasks,
- * it also protects other tasks from NT leaking through sysenter etc.
- */
 #define switch_to(prev, next, last)                                    \
 do {                                                                   \
-       /*                                                              \
-        * Context-switching clobbers all registers, so we clobber      \
-        * them explicitly, via unused output variables.                \
-        * (EAX and EBP is not listed because EBP is saved/restored     \
-        * explicitly for wchan access and EAX is the return value of   \
-        * __switch_to())                                               \
-        */                                                             \
-       unsigned long ebx, ecx, edx, esi, edi;                          \
-                                                                       \
-       asm volatile("pushl %%ebp\n\t"          /* save    EBP   */     \
-                    "movl %%esp,%[prev_sp]\n\t"        /* save    ESP   */ \
-                    "movl %[next_sp],%%esp\n\t"        /* restore ESP   */ \
-                    "movl $1f,%[prev_ip]\n\t"  /* save    EIP   */     \
-                    "pushl %[next_ip]\n\t"     /* restore EIP   */     \
-                    __switch_canary                                    \
-                    "jmp __switch_to\n"        /* regparm call  */     \
-                    "1:\t"                                             \
-                    "popl %%ebp\n\t"           /* restore EBP   */     \
-                                                                       \
-                    /* output parameters */                            \
-                    : [prev_sp] "=m" (prev->thread.sp),                \
-                      [prev_ip] "=m" (prev->thread.ip),                \
-                      "=a" (last),                                     \
-                                                                       \
-                      /* clobbered output registers: */                \
-                      "=b" (ebx), "=c" (ecx), "=d" (edx),              \
-                      "=S" (esi), "=D" (edi)                           \
-                                                                       \
-                      __switch_canary_oparam                           \
-                                                                       \
-                      /* input parameters: */                          \
-                    : [next_sp]  "m" (next->thread.sp),                \
-                      [next_ip]  "m" (next->thread.ip),                \
-                                                                       \
-                      /* regparm parameters for __switch_to(): */      \
-                      [prev]     "a" (prev),                           \
-                      [next]     "d" (next)                            \
+       prepare_switch_to(prev, next);                                  \
                                                                        \
-                      __switch_canary_iparam                           \
-                                                                       \
-                    : /* reloaded segment registers */                 \
-                       "memory");                                      \
+       ((last) = __switch_to_asm((prev), (next)));                     \
 } while (0)
 
-#else /* CONFIG_X86_32 */
-
-/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT    "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
-
-#define __EXTRA_CLOBBER  \
-       , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
-         "r12", "r13", "r14", "r15", "flags"
-
-#ifdef CONFIG_CC_STACKPROTECTOR
-#define __switch_canary                                                          \
-       "movq %P[task_canary](%%rsi),%%r8\n\t"                            \
-       "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
-#define __switch_canary_oparam                                           \
-       , [gs_canary] "=m" (irq_stack_union.stack_canary)
-#define __switch_canary_iparam                                           \
-       , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
-#else  /* CC_STACKPROTECTOR */
-#define __switch_canary
-#define __switch_canary_oparam
-#define __switch_canary_iparam
-#endif /* CC_STACKPROTECTOR */
-
-/*
- * There is no need to save or restore flags, because flags are always
- * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
- * has no effect.
- */
-#define switch_to(prev, next, last) \
-       asm volatile(SAVE_CONTEXT                                         \
-            "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
-            "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
-            "call __switch_to\n\t"                                       \
-            "movq "__percpu_arg([current_task])",%%rsi\n\t"              \
-            __switch_canary                                              \
-            "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
-            "movq %%rax,%%rdi\n\t"                                       \
-            "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"                 \
-            "jnz   ret_from_fork\n\t"                                    \
-            RESTORE_CONTEXT                                              \
-            : "=a" (last)                                                \
-              __switch_canary_oparam                                     \
-            : [next] "S" (next), [prev] "D" (prev),                      \
-              [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
-              [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
-              [_tif_fork] "i" (_TIF_FORK),                               \
-              [thread_info] "i" (offsetof(struct task_struct, stack)),   \
-              [current_task] "m" (current_task)                          \
-              __switch_canary_iparam                                     \
-            : "memory", "cc" __EXTRA_CLOBBER)
-
-#endif /* CONFIG_X86_32 */
-
 #endif /* _ASM_X86_SWITCH_TO_H */
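
The other half of the new switch_to contract, setting up these frames at fork, lives in arch/x86/kernel/process_*.c and is not among the hunks shown here; a sketch of the idea, assumed from the struct layouts above:

/* Hypothetical sketch of the copy_thread_tls() side: carve a fork_frame
 * out of the child's stack so thread.sp points at an inactive_task_frame
 * whose saved return address is ret_from_fork.  __switch_to_asm() pops
 * the callee-saved slots and then "returns" into ret_from_fork. */
static void setup_fork_frame(struct task_struct *p)
{
	struct pt_regs *childregs = task_pt_regs(p);
	struct fork_frame *fork_frame = container_of(childregs,
						     struct fork_frame, regs);
	struct inactive_task_frame *frame = &fork_frame->frame;

	frame->bp = 0;			/* terminates frame-pointer walks */
	frame->ret_addr = (unsigned long)ret_from_fork;
	p->thread.sp = (unsigned long)fork_frame;
}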
index 4e23dd1..e3c95e8 100644 (file)
@@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task,
         * TS_COMPAT is set for 32-bit syscall entries and then
         * remains set until we return to user mode.
         */
-       if (task_thread_info(task)->status & (TS_COMPAT|TS_I386_REGS_POKED))
+       if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
                /*
                 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
                 * and will match correctly in comparisons.
@@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
                                         unsigned long *args)
 {
 # ifdef CONFIG_IA32_EMULATION
-       if (task_thread_info(task)->status & TS_COMPAT)
+       if (task->thread.status & TS_COMPAT)
                switch (i) {
                case 0:
                        if (!n--) break;
@@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task,
                                         const unsigned long *args)
 {
 # ifdef CONFIG_IA32_EMULATION
-       if (task_thread_info(task)->status & TS_COMPAT)
+       if (task->thread.status & TS_COMPAT)
                switch (i) {
                case 0:
                        if (!n--) break;
@@ -234,18 +234,8 @@ static inline void syscall_set_arguments(struct task_struct *task,
 
 static inline int syscall_get_arch(void)
 {
-#ifdef CONFIG_IA32_EMULATION
-       /*
-        * TS_COMPAT is set for 32-bit syscall entry and then
-        * remains set until we return to user mode.
-        *
-        * x32 tasks should be considered AUDIT_ARCH_X86_64.
-        */
-       if (task_thread_info(current)->status & TS_COMPAT)
-               return AUDIT_ARCH_I386;
-#endif
-       /* Both x32 and x86_64 are considered "64-bit". */
-       return AUDIT_ARCH_X86_64;
+       /* x32 tasks should be considered AUDIT_ARCH_X86_64. */
+       return in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
 }
 #endif /* CONFIG_X86_32 */
 
index 8b7c8d8..2aaca53 100644 (file)
@@ -52,21 +52,6 @@ struct task_struct;
 #include <asm/cpufeature.h>
 #include <linux/atomic.h>
 
-struct thread_info {
-       struct task_struct      *task;          /* main task structure */
-       __u32                   flags;          /* low level flags */
-       __u32                   status;         /* thread synchronous flags */
-       __u32                   cpu;            /* current CPU */
-};
-
-#define INIT_THREAD_INFO(tsk)                  \
-{                                              \
-       .task           = &tsk,                 \
-       .flags          = 0,                    \
-       .cpu            = 0,                    \
-}
-
-#define init_thread_info       (init_thread_union.thread_info)
 #define init_stack             (init_thread_union.stack)
 
 #else /* !__ASSEMBLY__ */
@@ -95,7 +80,6 @@ struct thread_info {
 #define TIF_UPROBE             12      /* breakpointed or singlestepping */
 #define TIF_NOTSC              16      /* TSC is not accessible in userland */
 #define TIF_IA32               17      /* IA32 compatibility process */
-#define TIF_FORK               18      /* ret_from_fork */
 #define TIF_NOHZ               19      /* in adaptive nohz mode */
 #define TIF_MEMDIE             20      /* is terminating due to OOM killer */
 #define TIF_POLLING_NRFLAG     21      /* idle is polling for TIF_NEED_RESCHED */
@@ -119,7 +103,6 @@ struct thread_info {
 #define _TIF_UPROBE            (1 << TIF_UPROBE)
 #define _TIF_NOTSC             (1 << TIF_NOTSC)
 #define _TIF_IA32              (1 << TIF_IA32)
-#define _TIF_FORK              (1 << TIF_FORK)
 #define _TIF_NOHZ              (1 << TIF_NOHZ)
 #define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP         (1 << TIF_IO_BITMAP)
@@ -160,11 +143,6 @@ struct thread_info {
  */
 #ifndef __ASSEMBLY__
 
-static inline struct thread_info *current_thread_info(void)
-{
-       return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
-}
-
 static inline unsigned long current_stack_pointer(void)
 {
        unsigned long sp;
@@ -226,60 +204,19 @@ static inline int arch_within_stack_frames(const void * const stack,
 # define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
 #endif
 
-/*
- * ASM operand which evaluates to a 'thread_info' address of
- * the current task, if it is known that "reg" is exactly "off"
- * bytes below the top of the stack currently.
- *
- * ( The kernel stack's size is known at build time, it is usually
- *   2 or 4 pages, and the bottom  of the kernel stack contains
- *   the thread_info structure. So to access the thread_info very
- *   quickly from assembly code we can calculate down from the
- *   top of the kernel stack to the bottom, using constant,
- *   build-time calculations only. )
- *
- * For example, to fetch the current thread_info->flags value into %eax
- * on x86-64 defconfig kernels, in syscall entry code where RSP is
- * currently at exactly SIZEOF_PTREGS bytes away from the top of the
- * stack:
- *
- *      mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
- *
- * will translate to:
- *
- *      8b 84 24 b8 c0 ff ff      mov    -0x3f48(%rsp), %eax
- *
- * which is below the current RSP by almost 16K.
- */
-#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
-
 #endif
 
-/*
- * Thread-synchronous status.
- *
- * This is different from the flags in that nobody else
- * ever touches our thread-synchronous status, so we don't
- * have to worry about atomic accesses.
- */
-#define TS_COMPAT              0x0002  /* 32bit syscall active (64BIT)*/
 #ifdef CONFIG_COMPAT
 #define TS_I386_REGS_POKED     0x0004  /* regs poked by 32-bit ptracer */
 #endif
-
 #ifndef __ASSEMBLY__
 
-static inline bool in_ia32_syscall(void)
-{
 #ifdef CONFIG_X86_32
-       return true;
-#endif
-#ifdef CONFIG_IA32_EMULATION
-       if (current_thread_info()->status & TS_COMPAT)
-               return true;
+#define in_ia32_syscall() true
+#else
+#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \
+                          current->thread.status & TS_COMPAT)
 #endif
-       return false;
-}
 
 /*
  * Force syscall return via IRET by making it look as if there was
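
With THREAD_INFO_IN_TASK selected (see the arch/x86/Kconfig hunk above), the x86 current_thread_info() removed here is replaced by the generic definition in <linux/thread_info.h>, which reduces to a cast because thread_info now sits at the very start of task_struct (sketch of the generic side, not shown among these hunks):

#ifdef CONFIG_THREAD_INFO_IN_TASK
/* thread_info is the first member of task_struct, so the two pointers
 * coincide and no stack arithmetic is needed. */
#define current_thread_info() ((struct thread_info *)current)
#endif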
index c349661..01fd0a7 100644 (file)
@@ -117,6 +117,12 @@ extern void ist_exit(struct pt_regs *regs);
 extern void ist_begin_non_atomic(struct pt_regs *regs);
 extern void ist_end_non_atomic(void);
 
+#ifdef CONFIG_VMAP_STACK
+void __noreturn handle_stack_overflow(const char *message,
+                                     struct pt_regs *regs,
+                                     unsigned long fault_address);
+#endif
+
 /* Interrupts/Exceptions */
 enum {
        X86_TRAP_DE = 0,        /*  0, Divide-by-zero */
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644 (file)
index 0000000..c4b6d1c
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,73 @@
+#ifndef _ASM_X86_UNWIND_H
+#define _ASM_X86_UNWIND_H
+
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/stacktrace.h>
+
+struct unwind_state {
+       struct stack_info stack_info;
+       unsigned long stack_mask;
+       struct task_struct *task;
+       int graph_idx;
+#ifdef CONFIG_FRAME_POINTER
+       unsigned long *bp;
+#else
+       unsigned long *sp;
+#endif
+};
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+                   struct pt_regs *regs, unsigned long *first_frame);
+
+bool unwind_next_frame(struct unwind_state *state);
+
+static inline bool unwind_done(struct unwind_state *state)
+{
+       return state->stack_info.type == STACK_TYPE_UNKNOWN;
+}
+
+static inline
+void unwind_start(struct unwind_state *state, struct task_struct *task,
+                 struct pt_regs *regs, unsigned long *first_frame)
+{
+       first_frame = first_frame ? : get_stack_pointer(task, regs);
+
+       __unwind_start(state, task, regs, first_frame);
+}
+
+#ifdef CONFIG_FRAME_POINTER
+
+static inline
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+       if (unwind_done(state))
+               return NULL;
+
+       return state->bp + 1;
+}
+
+unsigned long unwind_get_return_address(struct unwind_state *state);
+
+#else /* !CONFIG_FRAME_POINTER */
+
+static inline
+unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
+{
+       return NULL;
+}
+
+static inline
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+       if (unwind_done(state))
+               return 0;
+
+       return ftrace_graph_ret_addr(state->task, &state->graph_idx,
+                                    *state->sp, state->sp);
+}
+
+#endif /* CONFIG_FRAME_POINTER */
+
+#endif /* _ASM_X86_UNWIND_H */
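
As a usage sketch (the walker below is hypothetical; only the unwind_* API comes from this patch), a caller iterates frames roughly like this:

/* Hypothetical walker built on the new unwind API. */
static void example_dump_return_addresses(struct task_struct *task)
{
        struct unwind_state state;
        unsigned long addr;

        for (unwind_start(&state, task, NULL, NULL);
             !unwind_done(&state);
             unwind_next_frame(&state)) {
                addr = unwind_get_return_address(&state);
                if (addr)
                        printk("  [<%p>] %pS\n", (void *)addr, (void *)addr);
        }
}

With CONFIG_FRAME_POINTER=y, unwind_frame.o follows saved frame pointers and unwind_get_return_address_ptr() can vouch for an address; in the guess fallback it returns NULL, so callers such as the dump code later in this series treat every scanned address as unreliable.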
index 0503f5b..45257cf 100644 (file)
@@ -125,6 +125,12 @@ obj-$(CONFIG_EFI)                  += sysfb_efi.o
 obj-$(CONFIG_PERF_EVENTS)              += perf_regs.o
 obj-$(CONFIG_TRACING)                  += tracepoint.o
 
+ifdef CONFIG_FRAME_POINTER
+obj-y                                  += unwind_frame.o
+else
+obj-y                                  += unwind_guess.o
+endif
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
index adb3eaf..4858733 100644 (file)
@@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void)
        saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 #ifdef CONFIG_SMP
-       stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
+       initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
        early_gdt_descr.address =
                        (unsigned long)get_cpu_gdt_table(smp_processor_id());
        initial_gs = per_cpu_offset(smp_processor_id());
index 5b2ae10..8862da7 100644 (file)
@@ -25,7 +25,7 @@
 static struct apic apic_physflat;
 static struct apic apic_flat;
 
-struct apic __read_mostly *apic = &apic_flat;
+struct apic *apic __ro_after_init = &apic_flat;
 EXPORT_SYMBOL_GPL(apic);
 
 static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -154,7 +154,7 @@ static int flat_probe(void)
        return 1;
 }
 
-static struct apic apic_flat = {
+static struct apic apic_flat __ro_after_init = {
        .name                           = "flat",
        .probe                          = flat_probe,
        .acpi_madt_oem_check            = flat_acpi_madt_oem_check,
@@ -248,7 +248,7 @@ static int physflat_probe(void)
        return 0;
 }
 
-static struct apic apic_physflat = {
+static struct apic apic_physflat __ro_after_init = {
 
        .name                           = "physical flat",
        .probe                          = physflat_probe,
index c05688b..b109e43 100644 (file)
@@ -108,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v)
        WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic);
 }
 
-struct apic apic_noop = {
+struct apic apic_noop __ro_after_init = {
        .name                           = "noop",
        .probe                          = noop_probe,
        .acpi_madt_oem_check            = NULL,
index 06dbaa4..5601201 100644 (file)
@@ -142,7 +142,7 @@ static int probe_bigsmp(void)
        return dmi_bigsmp;
 }
 
-static struct apic apic_bigsmp = {
+static struct apic apic_bigsmp __ro_after_init = {
 
        .name                           = "bigsmp",
        .probe                          = probe_bigsmp,
index ade2532..015bbf3 100644 (file)
@@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
        hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
 }
 
-static struct irq_chip hpet_msi_controller = {
+static struct irq_chip hpet_msi_controller __ro_after_init = {
        .name = "HPET-MSI",
        .irq_unmask = hpet_msi_unmask,
        .irq_mask = hpet_msi_mask,
index 7c43e71..e5fb2f0 100644 (file)
@@ -72,7 +72,7 @@ static int probe_default(void)
        return 1;
 }
 
-static struct apic apic_default = {
+static struct apic apic_default __ro_after_init = {
 
        .name                           = "default",
        .probe                          = probe_default,
@@ -126,7 +126,7 @@ static struct apic apic_default = {
 
 apic_driver(apic_default);
 
-struct apic *apic = &apic_default;
+struct apic *apic __ro_after_init = &apic_default;
 EXPORT_SYMBOL_GPL(apic);
 
 static int cmdline_apic __initdata;
index 54f35d9..200af5a 100644 (file)
@@ -227,7 +227,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
                cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
 }
 
-static struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster __ro_after_init = {
 
        .name                           = "cluster x2apic",
        .probe                          = x2apic_cluster_probe,
index 4f13f54..ff111f0 100644 (file)
@@ -98,7 +98,7 @@ static int x2apic_phys_probe(void)
        return apic == &apic_x2apic_phys;
 }
 
-static struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys __ro_after_init = {
 
        .name                           = "physical x2apic",
        .probe                          = x2apic_phys_probe,
index cb0673c..b9f6157 100644 (file)
@@ -560,7 +560,7 @@ static int uv_probe(void)
        return apic == &apic_x2apic_uv_x;
 }
 
-static struct apic __refdata apic_x2apic_uv_x = {
+static struct apic apic_x2apic_uv_x __ro_after_init = {
 
        .name                           = "UV large system",
        .probe                          = uv_probe,
index 2bd5c6f..c62e015 100644 (file)
 
 void common(void) {
        BLANK();
-       OFFSET(TI_flags, thread_info, flags);
-       OFFSET(TI_status, thread_info, status);
+       OFFSET(TASK_threadsp, task_struct, thread.sp);
+#ifdef CONFIG_CC_STACKPROTECTOR
+       OFFSET(TASK_stack_canary, task_struct, stack_canary);
+#endif
 
        BLANK();
+       OFFSET(TASK_TI_flags, task_struct, thread_info.flags);
        OFFSET(TASK_addr_limit, task_struct, thread.addr_limit);
 
        BLANK();
index ecdc1d2..880aa09 100644 (file)
@@ -57,6 +57,11 @@ void foo(void)
        /* Size of SYSENTER_stack */
        DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+       BLANK();
+       OFFSET(stack_canary_offset, stack_canary, canary);
+#endif
+
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
        BLANK();
        OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
index d875f97..210927e 100644 (file)
@@ -56,6 +56,11 @@ int main(void)
        OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
        BLANK();
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+       DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary));
+       BLANK();
+#endif
+
        DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
        DEFINE(NR_syscalls, sizeof(syscalls_64));
 
index 809eda0..0691942 100644 (file)
@@ -1265,9 +1265,14 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
-struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
-struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
-                                   (unsigned long) debug_idt_table };
+struct desc_ptr idt_descr __ro_after_init = {
+       .size = NR_VECTORS * 16 - 1,
+       .address = (unsigned long) idt_table,
+};
+const struct desc_ptr debug_idt_descr = {
+       .size = NR_VECTORS * 16 - 1,
+       .address = (unsigned long) debug_idt_table,
+};
 
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
                     irq_stack_union) __aligned(PAGE_SIZE) __visible;
@@ -1281,7 +1286,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
 EXPORT_PER_CPU_SYMBOL(current_task);
 
 DEFINE_PER_CPU(char *, irq_stack_ptr) =
-       init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+       init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
 
 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 
@@ -1305,11 +1310,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
-       /*
-        * LSTAR and STAR live in a bit strange symbiosis.
-        * They both write to the same internal register. STAR allows to
-        * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
-        */
        wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
        wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
index 28f1b54..24e87e7 100644 (file)
@@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex);
 u64 size_or_mask, size_and_mask;
 static bool mtrr_aps_delayed_init;
 
-static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
+static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init;
 
 const struct mtrr_ops *mtrr_if;
 
 static void set_mtrr(unsigned int reg, unsigned long base,
                     unsigned long size, mtrr_type type);
 
-void set_mtrr_ops(const struct mtrr_ops *ops)
+void __init set_mtrr_ops(const struct mtrr_ops *ops)
 {
        if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
                mtrr_ops[ops->vendor] = ops;
index 6c7ced0..ad8bd76 100644 (file)
@@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index,
 bool get_mtrr_state(void);
 void mtrr_bp_pat_init(void);
 
-extern void set_mtrr_ops(const struct mtrr_ops *ops);
+extern void __init set_mtrr_ops(const struct mtrr_ops *ops);
 
 extern u64 size_or_mask, size_and_mask;
 extern const struct mtrr_ops *mtrr_if;
index 92e8f0a..9b7cf5c 100644 (file)
@@ -17,7 +17,7 @@
 #include <linux/sysfs.h>
 
 #include <asm/stacktrace.h>
-
+#include <asm/unwind.h>
 
 int panic_on_unrecovered_nmi;
 int panic_on_io_nmi;
@@ -25,11 +25,29 @@ unsigned int code_bytes = 64;
 int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
 static int die_counter;
 
+bool in_task_stack(unsigned long *stack, struct task_struct *task,
+                  struct stack_info *info)
+{
+       unsigned long *begin = task_stack_page(task);
+       unsigned long *end   = task_stack_page(task) + THREAD_SIZE;
+
+       if (stack < begin || stack >= end)
+               return false;
+
+       info->type      = STACK_TYPE_TASK;
+       info->begin     = begin;
+       info->end       = end;
+       info->next_sp   = NULL;
+
+       return true;
+}
+
 static void printk_stack_address(unsigned long address, int reliable,
-               void *data)
+                                char *log_lvl)
 {
+       touch_nmi_watchdog();
        printk("%s [<%p>] %s%pB\n",
-               (char *)data, (void *)address, reliable ? "" : "? ",
+               log_lvl, (void *)address, reliable ? "" : "? ",
                (void *)address);
 }
 
@@ -38,176 +56,120 @@ void printk_address(unsigned long address)
        pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
 }
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void
-print_ftrace_graph_addr(unsigned long addr, void *data,
-                       const struct stacktrace_ops *ops,
-                       struct task_struct *task, int *graph)
+void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+                       unsigned long *stack, char *log_lvl)
 {
-       unsigned long ret_addr;
-       int index;
-
-       if (addr != (unsigned long)return_to_handler)
-               return;
-
-       index = task->curr_ret_stack;
-
-       if (!task->ret_stack || index < *graph)
-               return;
-
-       index -= *graph;
-       ret_addr = task->ret_stack[index].ret;
-
-       ops->address(data, ret_addr, 1);
+       struct unwind_state state;
+       struct stack_info stack_info = {0};
+       unsigned long visit_mask = 0;
+       int graph_idx = 0;
 
-       (*graph)++;
-}
-#else
-static inline void
-print_ftrace_graph_addr(unsigned long addr, void *data,
-                       const struct stacktrace_ops *ops,
-                       struct task_struct *task, int *graph)
-{ }
-#endif
-
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-static inline int valid_stack_ptr(struct task_struct *task,
-                       void *p, unsigned int size, void *end)
-{
-       void *t = task_stack_page(task);
-       if (end) {
-               if (p < end && p >= (end-THREAD_SIZE))
-                       return 1;
-               else
-                       return 0;
-       }
-       return p >= t && p < t + THREAD_SIZE - size;
-}
+       printk("%sCall Trace:\n", log_lvl);
 
-unsigned long
-print_context_stack(struct task_struct *task,
-               unsigned long *stack, unsigned long bp,
-               const struct stacktrace_ops *ops, void *data,
-               unsigned long *end, int *graph)
-{
-       struct stack_frame *frame = (struct stack_frame *)bp;
+       unwind_start(&state, task, regs, stack);
 
        /*
-        * If we overflowed the stack into a guard page, jump back to the
-        * bottom of the usable stack.
+        * Iterate through the stacks, starting with the current stack pointer.
+        * Each stack has a pointer to the next one.
+        *
+        * x86-64 can have several stacks:
+        * - task stack
+        * - interrupt stack
+        * - HW exception stacks (double fault, nmi, debug, mce)
+        *
+        * x86-32 can have up to three stacks:
+        * - task stack
+        * - softirq stack
+        * - hardirq stack
         */
-       if ((unsigned long)task_stack_page(task) - (unsigned long)stack <
-           PAGE_SIZE)
-               stack = (unsigned long *)task_stack_page(task);
-
-       while (valid_stack_ptr(task, stack, sizeof(*stack), end)) {
-               unsigned long addr;
-
-               addr = *stack;
-               if (__kernel_text_address(addr)) {
-                       if ((unsigned long) stack == bp + sizeof(long)) {
-                               ops->address(data, addr, 1);
-                               frame = frame->next_frame;
-                               bp = (unsigned long) frame;
-                       } else {
-                               ops->address(data, addr, 0);
-                       }
-                       print_ftrace_graph_addr(addr, data, ops, task, graph);
-               }
-               stack++;
-       }
-       return bp;
-}
-EXPORT_SYMBOL_GPL(print_context_stack);
-
-unsigned long
-print_context_stack_bp(struct task_struct *task,
-                      unsigned long *stack, unsigned long bp,
-                      const struct stacktrace_ops *ops, void *data,
-                      unsigned long *end, int *graph)
-{
-       struct stack_frame *frame = (struct stack_frame *)bp;
-       unsigned long *ret_addr = &frame->return_address;
+       for (; stack; stack = stack_info.next_sp) {
+               const char *str_begin, *str_end;
 
-       while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) {
-               unsigned long addr = *ret_addr;
+               /*
+                * If we overflowed the task stack into a guard page, jump back
+                * to the bottom of the usable stack.
+                */
+               if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
+                       stack = task_stack_page(task);
 
-               if (!__kernel_text_address(addr))
+               if (get_stack_info(stack, task, &stack_info, &visit_mask))
                        break;
 
-               if (ops->address(data, addr, 1))
-                       break;
-               frame = frame->next_frame;
-               ret_addr = &frame->return_address;
-               print_ftrace_graph_addr(addr, data, ops, task, graph);
-       }
-
-       return (unsigned long)frame;
-}
-EXPORT_SYMBOL_GPL(print_context_stack_bp);
-
-static int print_trace_stack(void *data, char *name)
-{
-       printk("%s <%s> ", (char *)data, name);
-       return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static int print_trace_address(void *data, unsigned long addr, int reliable)
-{
-       touch_nmi_watchdog();
-       printk_stack_address(addr, reliable, data);
-       return 0;
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-       .stack                  = print_trace_stack,
-       .address                = print_trace_address,
-       .walk_stack             = print_context_stack,
-};
-
-void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl)
-{
-       printk("%sCall Trace:\n", log_lvl);
-       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
+               stack_type_str(stack_info.type, &str_begin, &str_end);
+               if (str_begin)
+                       printk("%s <%s> ", log_lvl, str_begin);
+
+               /*
+                * Scan the stack, printing any text addresses we find.  At the
+                * same time, follow proper stack frames with the unwinder.
+                *
+                * Addresses found during the scan which are not reported by
+                * the unwinder are considered to be additional clues which are
+                * sometimes useful for debugging and are prefixed with '?'.
+                * This also serves as a failsafe option in case the unwinder
+                * goes off in the weeds.
+                */
+               for (; stack < stack_info.end; stack++) {
+                       unsigned long real_addr;
+                       int reliable = 0;
+                       unsigned long addr = *stack;
+                       unsigned long *ret_addr_p =
+                               unwind_get_return_address_ptr(&state);
+
+                       if (!__kernel_text_address(addr))
+                               continue;
+
+                       if (stack == ret_addr_p)
+                               reliable = 1;
+
+                       /*
+                        * When function graph tracing is enabled for a
+                        * function, its return address on the stack is
+                        * replaced with the address of an ftrace handler
+                        * (return_to_handler).  In that case, before printing
+                        * the "real" address, we want to print the handler
+                        * address as an "unreliable" hint that function graph
+                        * tracing was involved.
+                        */
+                       real_addr = ftrace_graph_ret_addr(task, &graph_idx,
+                                                         addr, stack);
+                       if (real_addr != addr)
+                               printk_stack_address(addr, 0, log_lvl);
+                       printk_stack_address(real_addr, reliable, log_lvl);
+
+                       if (!reliable)
+                               continue;
+
+                       /*
+                        * Get the next frame from the unwinder.  No need to
+                        * check for an error: if anything goes wrong, the rest
+                        * of the addresses will just be printed as unreliable.
+                        */
+                       unwind_next_frame(&state);
+               }
 
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
-{
-       show_trace_log_lvl(task, regs, stack, bp, "");
+               if (str_end)
+                       printk("%s <%s> ", log_lvl, str_end);
+       }
 }
 
 void show_stack(struct task_struct *task, unsigned long *sp)
 {
-       unsigned long bp = 0;
-       unsigned long stack;
+       task = task ? : current;
 
        /*
         * Stack frames below this one aren't interesting.  Don't show them
         * if we're printing for %current.
         */
-       if (!sp && (!task || task == current)) {
-               sp = &stack;
-               bp = stack_frame(current, NULL);
-       }
+       if (!sp && task == current)
+               sp = get_stack_pointer(current, NULL);
 
-       show_stack_log_lvl(task, NULL, sp, bp, "");
+       show_stack_log_lvl(task, NULL, sp, "");
 }
 
 void show_stack_regs(struct pt_regs *regs)
 {
-       show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, "");
+       show_stack_log_lvl(current, regs, NULL, "");
 }
 
 static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
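
The new trace loop hops from each stack to its parent via stack_info.next_sp, with visit_mask ensuring no stack type is visited twice. A stripped-down sketch of the same pattern (the walker is hypothetical; get_stack_info() and struct stack_info are the ones added in this series):

/* Hypothetical: enumerate every stack reachable from 'sp'. */
static void example_walk_stacks(struct task_struct *task, unsigned long *sp)
{
        struct stack_info info = {0};
        unsigned long visit_mask = 0;

        for (; sp; sp = info.next_sp) {
                if (get_stack_info(sp, task, &info, &visit_mask))
                        break;  /* unknown stack type: stop walking */

                printk("stack type %d: %p..%p\n",
                       info.type, info.begin, info.end);
        }
}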
index 0967571..06eb322 100644 (file)
 
 #include <asm/stacktrace.h>
 
-static void *is_irq_stack(void *p, void *irq)
+void stack_type_str(enum stack_type type, const char **begin, const char **end)
 {
-       if (p < irq || p >= (irq + THREAD_SIZE))
-               return NULL;
-       return irq + THREAD_SIZE;
+       switch (type) {
+       case STACK_TYPE_IRQ:
+       case STACK_TYPE_SOFTIRQ:
+               *begin = "IRQ";
+               *end   = "EOI";
+               break;
+       default:
+               *begin = NULL;
+               *end   = NULL;
+       }
 }
 
-
-static void *is_hardirq_stack(unsigned long *stack, int cpu)
+static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
 {
-       void *irq = per_cpu(hardirq_stack, cpu);
+       unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack);
+       unsigned long *end   = begin + (THREAD_SIZE / sizeof(long));
 
-       return is_irq_stack(stack, irq);
-}
+       /*
+        * This is a software stack, so 'end' can be a valid stack pointer.
+        * It just means the stack is empty.
+        */
+       if (stack < begin || stack > end)
+               return false;
 
-static void *is_softirq_stack(unsigned long *stack, int cpu)
-{
-       void *irq = per_cpu(softirq_stack, cpu);
+       info->type      = STACK_TYPE_IRQ;
+       info->begin     = begin;
+       info->end       = end;
 
-       return is_irq_stack(stack, irq);
+       /*
+        * See irq_32.c -- the next stack pointer is stored at the beginning of
+        * the stack.
+        */
+       info->next_sp   = (unsigned long *)*begin;
+
+       return true;
 }
 
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
-               const struct stacktrace_ops *ops, void *data)
+static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
 {
-       const unsigned cpu = get_cpu();
-       int graph = 0;
-       u32 *prev_esp;
+       unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack);
+       unsigned long *end   = begin + (THREAD_SIZE / sizeof(long));
 
-       if (!task)
-               task = current;
+       /*
+        * This is a software stack, so 'end' can be a valid stack pointer.
+        * It just means the stack is empty.
+        */
+       if (stack < begin || stack > end)
+               return false;
 
-       if (!stack) {
-               unsigned long dummy;
+       info->type      = STACK_TYPE_SOFTIRQ;
+       info->begin     = begin;
+       info->end       = end;
 
-               stack = &dummy;
-               if (task != current)
-                       stack = (unsigned long *)task->thread.sp;
-       }
+       /*
+        * The next stack pointer is stored at the beginning of the stack.
+        * See irq_32.c.
+        */
+       info->next_sp   = (unsigned long *)*begin;
 
-       if (!bp)
-               bp = stack_frame(task, regs);
+       return true;
+}
 
-       for (;;) {
-               void *end_stack;
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+                  struct stack_info *info, unsigned long *visit_mask)
+{
+       if (!stack)
+               goto unknown;
 
-               end_stack = is_hardirq_stack(stack, cpu);
-               if (!end_stack)
-                       end_stack = is_softirq_stack(stack, cpu);
+       task = task ? : current;
 
-               bp = ops->walk_stack(task, stack, bp, ops, data,
-                                    end_stack, &graph);
+       if (in_task_stack(stack, task, info))
+               goto recursion_check;
 
-               /* Stop if not on irq stack */
-               if (!end_stack)
-                       break;
+       if (task != current)
+               goto unknown;
 
-               /* The previous esp is saved on the bottom of the stack */
-               prev_esp = (u32 *)(end_stack - THREAD_SIZE);
-               stack = (unsigned long *)*prev_esp;
-               if (!stack)
-                       break;
+       if (in_hardirq_stack(stack, info))
+               goto recursion_check;
 
-               if (ops->stack(data, "IRQ") < 0)
-                       break;
-               touch_nmi_watchdog();
+       if (in_softirq_stack(stack, info))
+               goto recursion_check;
+
+       goto unknown;
+
+recursion_check:
+       /*
+        * Make sure we don't iterate through any given stack more than once.
+        * If it comes up a second time then there's something wrong going on:
+        * just break out and report an unknown stack type.
+        */
+       if (visit_mask) {
+               if (*visit_mask & (1UL << info->type))
+                       goto unknown;
+               *visit_mask |= 1UL << info->type;
        }
-       put_cpu();
+
+       return 0;
+
+unknown:
+       info->type = STACK_TYPE_UNKNOWN;
+       return -EINVAL;
 }
-EXPORT_SYMBOL(dump_trace);
 
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *sp, unsigned long bp, char *log_lvl)
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+                       unsigned long *sp, char *log_lvl)
 {
        unsigned long *stack;
        int i;
 
-       if (sp == NULL) {
-               if (regs)
-                       sp = (unsigned long *)regs->sp;
-               else if (task)
-                       sp = (unsigned long *)task->thread.sp;
-               else
-                       sp = (unsigned long *)&sp;
-       }
+       if (!try_get_task_stack(task))
+               return;
+
+       sp = sp ? : get_stack_pointer(task, regs);
 
        stack = sp;
        for (i = 0; i < kstack_depth_to_print; i++) {
@@ -117,7 +145,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                touch_nmi_watchdog();
        }
        pr_cont("\n");
-       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+       show_trace_log_lvl(task, regs, sp, log_lvl);
+
+       put_task_stack(task);
 }
 
 
@@ -139,7 +169,7 @@ void show_regs(struct pt_regs *regs)
                u8 *ip;
 
                pr_emerg("Stack:\n");
-               show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
+               show_stack_log_lvl(current, regs, NULL, KERN_EMERG);
 
                pr_emerg("Code:");
 
index 9ee4520..36cf1a4 100644 (file)
 
 #include <asm/stacktrace.h>
 
+static char *exception_stack_names[N_EXCEPTION_STACKS] = {
+               [ DOUBLEFAULT_STACK-1   ]       = "#DF",
+               [ NMI_STACK-1           ]       = "NMI",
+               [ DEBUG_STACK-1         ]       = "#DB",
+               [ MCE_STACK-1           ]       = "#MC",
+};
 
-#define N_EXCEPTION_STACKS_END \
-               (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
-
-static char x86_stack_ids[][8] = {
-               [ DEBUG_STACK-1                 ]       = "#DB",
-               [ NMI_STACK-1                   ]       = "NMI",
-               [ DOUBLEFAULT_STACK-1           ]       = "#DF",
-               [ MCE_STACK-1                   ]       = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
-               [ N_EXCEPTION_STACKS ...
-                 N_EXCEPTION_STACKS_END        ]       = "#DB[?]"
-#endif
+static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
+       [0 ... N_EXCEPTION_STACKS - 1]          = EXCEPTION_STKSZ,
+       [DEBUG_STACK - 1]                       = DEBUG_STKSZ
 };
 
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
-                                        unsigned *usedp, char **idp)
+void stack_type_str(enum stack_type type, const char **begin, const char **end)
 {
-       unsigned k;
-
-       /*
-        * Iterate over all exception stacks, and figure out whether
-        * 'stack' is in one of them:
-        */
-       for (k = 0; k < N_EXCEPTION_STACKS; k++) {
-               unsigned long end = per_cpu(orig_ist, cpu).ist[k];
-               /*
-                * Is 'stack' above this exception frame's end?
-                * If yes then skip to the next frame.
-                */
-               if (stack >= end)
-                       continue;
-               /*
-                * Is 'stack' above this exception frame's start address?
-                * If yes then we found the right frame.
-                */
-               if (stack >= end - EXCEPTION_STKSZ) {
-                       /*
-                        * Make sure we only iterate through an exception
-                        * stack once. If it comes up for the second time
-                        * then there's something wrong going on - just
-                        * break out and return NULL:
-                        */
-                       if (*usedp & (1U << k))
-                               break;
-                       *usedp |= 1U << k;
-                       *idp = x86_stack_ids[k];
-                       return (unsigned long *)end;
-               }
-               /*
-                * If this is a debug stack, and if it has a larger size than
-                * the usual exception stacks, then 'stack' might still
-                * be within the lower portion of the debug stack:
-                */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
-               if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
-                       unsigned j = N_EXCEPTION_STACKS - 1;
-
-                       /*
-                        * Black magic. A large debug stack is composed of
-                        * multiple exception stack entries, which we
-                        * iterate through now. Dont look:
-                        */
-                       do {
-                               ++j;
-                               end -= EXCEPTION_STKSZ;
-                               x86_stack_ids[j][4] = '1' +
-                                               (j - N_EXCEPTION_STACKS);
-                       } while (stack < end - EXCEPTION_STKSZ);
-                       if (*usedp & (1U << j))
-                               break;
-                       *usedp |= 1U << j;
-                       *idp = x86_stack_ids[j];
-                       return (unsigned long *)end;
-               }
-#endif
+       BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+
+       switch (type) {
+       case STACK_TYPE_IRQ:
+               *begin = "IRQ";
+               *end   = "EOI";
+               break;
+       case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST:
+               *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION];
+               *end   = "EOE";
+               break;
+       default:
+               *begin = NULL;
+               *end   = NULL;
        }
-       return NULL;
 }
 
-static inline int
-in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
-            unsigned long *irq_stack_end)
+static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
 {
-       return (stack >= irq_stack && stack < irq_stack_end);
-}
-
-static const unsigned long irq_stack_size =
-       (IRQ_STACK_SIZE - 64) / sizeof(unsigned long);
-
-enum stack_type {
-       STACK_IS_UNKNOWN,
-       STACK_IS_NORMAL,
-       STACK_IS_EXCEPTION,
-       STACK_IS_IRQ,
-};
-
-static enum stack_type
-analyze_stack(int cpu, struct task_struct *task, unsigned long *stack,
-             unsigned long **stack_end, unsigned long *irq_stack,
-             unsigned *used, char **id)
-{
-       unsigned long addr;
+       unsigned long *begin, *end;
+       struct pt_regs *regs;
+       unsigned k;
 
-       addr = ((unsigned long)stack & (~(THREAD_SIZE - 1)));
-       if ((unsigned long)task_stack_page(task) == addr)
-               return STACK_IS_NORMAL;
+       BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
 
-       *stack_end = in_exception_stack(cpu, (unsigned long)stack,
-                                       used, id);
-       if (*stack_end)
-               return STACK_IS_EXCEPTION;
+       for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+               end   = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k];
+               begin = end - (exception_stack_sizes[k] / sizeof(long));
+               regs  = (struct pt_regs *)end - 1;
 
-       if (!irq_stack)
-               return STACK_IS_NORMAL;
+               if (stack < begin || stack >= end)
+                       continue;
 
-       *stack_end = irq_stack;
-       irq_stack = irq_stack - irq_stack_size;
+               info->type      = STACK_TYPE_EXCEPTION + k;
+               info->begin     = begin;
+               info->end       = end;
+               info->next_sp   = (unsigned long *)regs->sp;
 
-       if (in_irq_stack(stack, irq_stack, *stack_end))
-               return STACK_IS_IRQ;
+               return true;
+       }
 
-       return STACK_IS_UNKNOWN;
+       return false;
 }
 
-/*
- * x86-64 can have up to three kernel stacks:
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp,
-               const struct stacktrace_ops *ops, void *data)
+static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
 {
-       const unsigned cpu = get_cpu();
-       unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu);
-       unsigned long dummy;
-       unsigned used = 0;
-       int graph = 0;
-       int done = 0;
-
-       if (!task)
-               task = current;
-
-       if (!stack) {
-               if (regs)
-                       stack = (unsigned long *)regs->sp;
-               else if (task != current)
-                       stack = (unsigned long *)task->thread.sp;
-               else
-                       stack = &dummy;
-       }
+       unsigned long *end   = (unsigned long *)this_cpu_read(irq_stack_ptr);
+       unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
 
-       if (!bp)
-               bp = stack_frame(task, regs);
        /*
-        * Print function call entries in all stacks, starting at the
-        * current stack address. If the stacks consist of nested
-        * exceptions
+        * This is a software stack, so 'end' can be a valid stack pointer.
+        * It just means the stack is empty.
         */
-       while (!done) {
-               unsigned long *stack_end;
-               enum stack_type stype;
-               char *id;
+       if (stack < begin || stack > end)
+               return false;
 
-               stype = analyze_stack(cpu, task, stack, &stack_end,
-                                     irq_stack, &used, &id);
+       info->type      = STACK_TYPE_IRQ;
+       info->begin     = begin;
+       info->end       = end;
 
-               /* Default finish unless specified to continue */
-               done = 1;
+       /*
+        * The next stack pointer is the first thing pushed by the entry code
+        * after switching to the irq stack.
+        */
+       info->next_sp = (unsigned long *)*(end - 1);
 
-               switch (stype) {
+       return true;
+}
 
-               /* Break out early if we are on the thread stack */
-               case STACK_IS_NORMAL:
-                       break;
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+                  struct stack_info *info, unsigned long *visit_mask)
+{
+       if (!stack)
+               goto unknown;
 
-               case STACK_IS_EXCEPTION:
+       task = task ? : current;
 
-                       if (ops->stack(data, id) < 0)
-                               break;
+       if (in_task_stack(stack, task, info))
+               goto recursion_check;
 
-                       bp = ops->walk_stack(task, stack, bp, ops,
-                                            data, stack_end, &graph);
-                       ops->stack(data, "<EOE>");
-                       /*
-                        * We link to the next stack via the
-                        * second-to-last pointer (index -2 to end) in the
-                        * exception stack:
-                        */
-                       stack = (unsigned long *) stack_end[-2];
-                       done = 0;
-                       break;
+       if (task != current)
+               goto unknown;
 
-               case STACK_IS_IRQ:
+       if (in_exception_stack(stack, info))
+               goto recursion_check;
 
-                       if (ops->stack(data, "IRQ") < 0)
-                               break;
-                       bp = ops->walk_stack(task, stack, bp,
-                                    ops, data, stack_end, &graph);
-                       /*
-                        * We link to the next stack (which would be
-                        * the process stack normally) the last
-                        * pointer (index -1 to end) in the IRQ stack:
-                        */
-                       stack = (unsigned long *) (stack_end[-1]);
-                       irq_stack = NULL;
-                       ops->stack(data, "EOI");
-                       done = 0;
-                       break;
+       if (in_irq_stack(stack, info))
+               goto recursion_check;
 
-               case STACK_IS_UNKNOWN:
-                       ops->stack(data, "UNK");
-                       break;
-               }
-       }
+       goto unknown;
 
+recursion_check:
        /*
-        * This handles the process stack:
+        * Make sure we don't iterate through any given stack more than once.
+        * If it comes up a second time then there's something wrong going on:
+        * just break out and report an unknown stack type.
         */
-       bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph);
-       put_cpu();
+       if (visit_mask) {
+               if (*visit_mask & (1UL << info->type))
+                       goto unknown;
+               *visit_mask |= 1UL << info->type;
+       }
+
+       return 0;
+
+unknown:
+       info->type = STACK_TYPE_UNKNOWN;
+       return -EINVAL;
 }
-EXPORT_SYMBOL(dump_trace);
 
-void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-                  unsigned long *sp, unsigned long bp, char *log_lvl)
+void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+                       unsigned long *sp, char *log_lvl)
 {
        unsigned long *irq_stack_end;
        unsigned long *irq_stack;
        unsigned long *stack;
-       int cpu;
        int i;
 
-       preempt_disable();
-       cpu = smp_processor_id();
+       if (!try_get_task_stack(task))
+               return;
 
-       irq_stack_end   = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
-       irq_stack       = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
+       irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr);
+       irq_stack     = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
 
-       /*
-        * Debugging aid: "show_stack(NULL, NULL);" prints the
-        * back trace for this cpu:
-        */
-       if (sp == NULL) {
-               if (regs)
-                       sp = (unsigned long *)regs->sp;
-               else if (task)
-                       sp = (unsigned long *)task->thread.sp;
-               else
-                       sp = (unsigned long *)&sp;
-       }
+       sp = sp ? : get_stack_pointer(task, regs);
 
        stack = sp;
        for (i = 0; i < kstack_depth_to_print; i++) {
@@ -299,18 +183,17 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                stack++;
                touch_nmi_watchdog();
        }
-       preempt_enable();
 
        pr_cont("\n");
-       show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+       show_trace_log_lvl(task, regs, sp, log_lvl);
+
+       put_task_stack(task);
 }
 
 void show_regs(struct pt_regs *regs)
 {
        int i;
-       unsigned long sp;
 
-       sp = regs->sp;
        show_regs_print_info(KERN_DEFAULT);
        __show_regs(regs, 1);
 
@@ -325,8 +208,7 @@ void show_regs(struct pt_regs *regs)
                u8 *ip;
 
                printk(KERN_DEFAULT "Stack:\n");
-               show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
-                                  0, KERN_DEFAULT);
+               show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT);
 
                printk(KERN_DEFAULT "Code: ");
 
index 93982ae..2f2b8c7 100644 (file)
@@ -317,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void)
        on_boot_cpu = 0;
 
        WARN_ON_FPU(current->thread.fpu.fpstate_active);
-       current_thread_info()->status = 0;
 
        if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
                eagerfpu = ENABLE;
index d036cfb..8639bb2 100644 (file)
@@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
        }
 
        if (ftrace_push_return_trace(old, self_addr, &trace.depth,
-                   frame_pointer) == -EBUSY) {
+                                    frame_pointer, parent) == -EBUSY) {
                *parent = old;
                return;
        }
index 6f8902b..5f40126 100644 (file)
@@ -94,7 +94,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
  */
 __HEAD
 ENTRY(startup_32)
-       movl pa(stack_start),%ecx
+       movl pa(initial_stack),%ecx
        
        /* test KEEP_SEGMENTS flag to see if the bootloader is asking
                us to not reload segments */
@@ -286,7 +286,7 @@ num_subarch_entries = (. - subarch_entries) / 4
  * start_secondary().
  */
 ENTRY(start_cpu0)
-       movl stack_start, %ecx
+       movl initial_stack, %ecx
        movl %ecx, %esp
        jmp  *(initial_code)
 ENDPROC(start_cpu0)
@@ -307,7 +307,7 @@ ENTRY(startup_32_smp)
        movl %eax,%es
        movl %eax,%fs
        movl %eax,%gs
-       movl pa(stack_start),%ecx
+       movl pa(initial_stack),%ecx
        movl %eax,%ss
        leal -__PAGE_OFFSET(%ecx),%esp
 
@@ -703,7 +703,7 @@ ENTRY(initial_page_table)
 
 .data
 .balign 4
-ENTRY(stack_start)
+ENTRY(initial_stack)
        .long init_thread_union+THREAD_SIZE
 
 __INITRODATA
index 9f8efc9..c98a559 100644 (file)
@@ -66,7 +66,7 @@ startup_64:
         */
 
        /*
-        * Setup stack for verify_cpu(). "-8" because stack_start is defined
+        * Setup stack for verify_cpu(). "-8" because initial_stack is defined
         * this way, see below. Our best guess is a NULL ptr for stack
         * termination heuristics and we don't want to break anything which
         * might depend on it (kgdb, ...).
@@ -226,7 +226,7 @@ ENTRY(secondary_startup_64)
        movq    %rax, %cr0
 
        /* Setup a boot time stack */
-       movq stack_start(%rip), %rsp
+       movq initial_stack(%rip), %rsp
 
        /* zero EFLAGS after setting rsp */
        pushq $0
@@ -310,7 +310,7 @@ ENDPROC(secondary_startup_64)
  * start_secondary().
  */
 ENTRY(start_cpu0)
-       movq stack_start(%rip),%rsp
+       movq initial_stack(%rip),%rsp
        movq    initial_code(%rip),%rax
        pushq   $0              # fake return address to stop unwinder
        pushq   $__KERNEL_CS    # set correct cs
@@ -319,17 +319,15 @@ ENTRY(start_cpu0)
 ENDPROC(start_cpu0)
 #endif
 
-       /* SMP bootup changes these two */
+       /* Both SMP bootup and ACPI suspend change these variables */
        __REFDATA
        .balign 8
        GLOBAL(initial_code)
        .quad   x86_64_start_kernel
        GLOBAL(initial_gs)
        .quad   INIT_PER_CPU_VAR(irq_stack_union)
-
-       GLOBAL(stack_start)
+       GLOBAL(initial_stack)
        .quad  init_thread_union+THREAD_SIZE-8
-       .word  0
        __FINITDATA
 
 bad_address:
index 4a79037..9ebd0b0 100644 (file)
@@ -40,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
        if (user_mode(regs))
                return;
 
-       if (regs->sp >= curbase + sizeof(struct thread_info) +
-                                 sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+       if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
            regs->sp <= curbase + THREAD_SIZE)
                return;
 
index 04cde52..8e36f24 100644 (file)
@@ -50,6 +50,7 @@
 #include <asm/apicdef.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
+#include <asm/switch_to.h>
 
 struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
 {
@@ -166,21 +167,19 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
        gdb_regs[GDB_DX]        = 0;
        gdb_regs[GDB_SI]        = 0;
        gdb_regs[GDB_DI]        = 0;
-       gdb_regs[GDB_BP]        = *(unsigned long *)p->thread.sp;
+       gdb_regs[GDB_BP]        = ((struct inactive_task_frame *)p->thread.sp)->bp;
 #ifdef CONFIG_X86_32
        gdb_regs[GDB_DS]        = __KERNEL_DS;
        gdb_regs[GDB_ES]        = __KERNEL_DS;
        gdb_regs[GDB_PS]        = 0;
        gdb_regs[GDB_CS]        = __KERNEL_CS;
-       gdb_regs[GDB_PC]        = p->thread.ip;
        gdb_regs[GDB_SS]        = __KERNEL_DS;
        gdb_regs[GDB_FS]        = 0xFFFF;
        gdb_regs[GDB_GS]        = 0xFFFF;
 #else
-       gdb_regs32[GDB_PS]      = *(unsigned long *)(p->thread.sp + 8);
+       gdb_regs32[GDB_PS]      = 0;
        gdb_regs32[GDB_CS]      = __KERNEL_CS;
        gdb_regs32[GDB_SS]      = __KERNEL_DS;
-       gdb_regs[GDB_PC]        = 0;
        gdb_regs[GDB_R8]        = 0;
        gdb_regs[GDB_R9]        = 0;
        gdb_regs[GDB_R10]       = 0;
@@ -190,6 +189,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
        gdb_regs[GDB_R14]       = 0;
        gdb_regs[GDB_R15]       = 0;
 #endif
+       gdb_regs[GDB_PC]        = 0;
        gdb_regs[GDB_SP]        = p->thread.sp;
 }
 
index c2bedae..4afc67f 100644 (file)
@@ -184,7 +184,7 @@ out:
 
 static struct kobj_attribute type_attr = __ATTR_RO(type);
 
-static struct bin_attribute data_attr = {
+static struct bin_attribute data_attr __ro_after_init = {
        .attr = {
                .name = "data",
                .mode = S_IRUGO,
index 3692249..60b9949 100644 (file)
@@ -29,7 +29,7 @@
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
 
-static int kvmclock = 1;
+static int kvmclock __ro_after_init = 1;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 static cycle_t kvm_sched_clock_offset;
index 1acfd76..bef3400 100644 (file)
@@ -389,7 +389,7 @@ NOKPROBE_SYMBOL(native_load_idt);
 #define PTE_IDENT      __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
 #endif
 
-struct pv_mmu_ops pv_mmu_ops = {
+struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
        .read_cr2 = native_read_cr2,
        .write_cr2 = native_write_cr2,
index 62c0b0e..4002b47 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/tlbflush.h>
 #include <asm/mce.h>
 #include <asm/vm86.h>
+#include <asm/switch_to.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -512,6 +513,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
 }
 
+/*
+ * Return saved PC of a blocked thread.
+ * What is this good for? It will always be the scheduler or ret_from_fork.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+       struct inactive_task_frame *frame =
+               (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
+       return READ_ONCE_NOCHECK(frame->ret_addr);
+}
+
 /*
  * Called from fs/proc with a reference on @p to find the function
  * which called into schedule(). This needs to be done carefully
@@ -520,15 +532,18 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
  */
 unsigned long get_wchan(struct task_struct *p)
 {
-       unsigned long start, bottom, top, sp, fp, ip;
+       unsigned long start, bottom, top, sp, fp, ip, ret = 0;
        int count = 0;
 
        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
 
+       if (!try_get_task_stack(p))
+               return 0;
+
        start = (unsigned long)task_stack_page(p);
        if (!start)
-               return 0;
+               goto out;
 
        /*
         * Layout of the stack page:
@@ -537,9 +552,7 @@ unsigned long get_wchan(struct task_struct *p)
         * PADDING
         * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
         * stack
-        * ----------- bottom = start + sizeof(thread_info)
-        * thread_info
-        * ----------- start
+        * ----------- bottom = start
         *
         * The tasks stack pointer points at the location where the
         * framepointer is stored. The data on the stack is:
@@ -550,20 +563,25 @@ unsigned long get_wchan(struct task_struct *p)
         */
        top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
        top -= 2 * sizeof(unsigned long);
-       bottom = start + sizeof(struct thread_info);
+       bottom = start;
 
        sp = READ_ONCE(p->thread.sp);
        if (sp < bottom || sp > top)
-               return 0;
+               goto out;
 
-       fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
+       fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
        do {
                if (fp < bottom || fp > top)
-                       return 0;
+                       goto out;
                ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
-               if (!in_sched_functions(ip))
-                       return ip;
+               if (!in_sched_functions(ip)) {
+                       ret = ip;
+                       goto out;
+               }
                fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
        } while (count++ < 16 && p->state != TASK_RUNNING);
-       return 0;
+
+out:
+       put_task_stack(p);
+       return ret;
 }
index d86be29..404efdf 100644 (file)
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
 
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
-
-/*
- * Return saved PC of a blocked thread.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
-       return ((unsigned long *)tsk->thread.sp)[3];
-}
-
 void __show_regs(struct pt_regs *regs, int all)
 {
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -133,35 +122,31 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        unsigned long arg, struct task_struct *p, unsigned long tls)
 {
        struct pt_regs *childregs = task_pt_regs(p);
+       struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs);
+       struct inactive_task_frame *frame = &fork_frame->frame;
        struct task_struct *tsk;
        int err;
 
-       p->thread.sp = (unsigned long) childregs;
+       frame->bp = 0;
+       frame->ret_addr = (unsigned long) ret_from_fork;
+       p->thread.sp = (unsigned long) fork_frame;
        p->thread.sp0 = (unsigned long) (childregs+1);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
-               p->thread.ip = (unsigned long) ret_from_kernel_thread;
-               task_user_gs(p) = __KERNEL_STACK_CANARY;
-               childregs->ds = __USER_DS;
-               childregs->es = __USER_DS;
-               childregs->fs = __KERNEL_PERCPU;
-               childregs->bx = sp;     /* function */
-               childregs->bp = arg;
-               childregs->orig_ax = -1;
-               childregs->cs = __KERNEL_CS | get_kernel_rpl();
-               childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+               frame->bx = sp;         /* function */
+               frame->di = arg;
                p->thread.io_bitmap_ptr = NULL;
                return 0;
        }
+       frame->bx = 0;
        *childregs = *current_pt_regs();
        childregs->ax = 0;
        if (sp)
                childregs->sp = sp;
 
-       p->thread.ip = (unsigned long) ret_from_fork;
        task_user_gs(p) = get_user_gs(current_pt_regs());
 
        p->thread.io_bitmap_ptr = NULL;
index 63236d8..de9acaf 100644 (file)
@@ -50,8 +50,6 @@
 #include <asm/switch_to.h>
 #include <asm/xen/hypervisor.h>
 
-asmlinkage extern void ret_from_fork(void);
-
 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
 /* Prints also some state that isn't saved in the pt_regs */
@@ -141,12 +139,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 {
        int err;
        struct pt_regs *childregs;
+       struct fork_frame *fork_frame;
+       struct inactive_task_frame *frame;
        struct task_struct *me = current;
 
        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        childregs = task_pt_regs(p);
-       p->thread.sp = (unsigned long) childregs;
-       set_tsk_thread_flag(p, TIF_FORK);
+       fork_frame = container_of(childregs, struct fork_frame, regs);
+       frame = &fork_frame->frame;
+       frame->bp = 0;
+       frame->ret_addr = (unsigned long) ret_from_fork;
+       p->thread.sp = (unsigned long) fork_frame;
        p->thread.io_bitmap_ptr = NULL;
 
        savesegment(gs, p->thread.gsindex);
@@ -160,15 +163,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        if (unlikely(p->flags & PF_KTHREAD)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
-               childregs->sp = (unsigned long)childregs;
-               childregs->ss = __KERNEL_DS;
-               childregs->bx = sp; /* function */
-               childregs->bp = arg;
-               childregs->orig_ax = -1;
-               childregs->cs = __KERNEL_CS | get_kernel_rpl();
-               childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+               frame->bx = sp;         /* function */
+               frame->r12 = arg;
                return 0;
        }
+       frame->bx = 0;
        *childregs = *current_pt_regs();
 
        childregs->ax = 0;
@@ -511,7 +510,7 @@ void set_personality_ia32(bool x32)
                current->personality &= ~READ_IMPLIES_EXEC;
                /* in_compat_syscall() uses the presence of the x32
                   syscall bit flag to determine compat status */
-               current_thread_info()->status &= ~TS_COMPAT;
+               current->thread.status &= ~TS_COMPAT;
        } else {
                set_thread_flag(TIF_IA32);
                clear_thread_flag(TIF_X32);
@@ -519,7 +518,7 @@ void set_personality_ia32(bool x32)
                        current->mm->context.ia32_compat = TIF_IA32;
                current->personality |= force_personality32;
                /* Prepare the first "return" to user space */
-               current_thread_info()->status |= TS_COMPAT;
+               current->thread.status |= TS_COMPAT;
        }
 }
 EXPORT_SYMBOL_GPL(set_personality_ia32);
index f79576a..ce94c38 100644 (file)
@@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs)
                return sp;
 
        prev_esp = (u32 *)(context);
-       if (prev_esp)
-               return (unsigned long)prev_esp;
+       if (*prev_esp)
+               return (unsigned long)*prev_esp;
 
        return (unsigned long)regs;
 }
@@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
                 */
                regs->orig_ax = value;
                if (syscall_get_nr(child, regs) >= 0)
-                       task_thread_info(child)->status |= TS_I386_REGS_POKED;
+                       child->thread.status |= TS_I386_REGS_POKED;
                break;
 
        case offsetof(struct user32, regs.eflags):
@@ -1250,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 #ifdef CONFIG_X86_64
 
-static struct user_regset x86_64_regsets[] __read_mostly = {
+static struct user_regset x86_64_regsets[] __ro_after_init = {
        [REGSET_GENERAL] = {
                .core_note_type = NT_PRSTATUS,
                .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1291,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = {
 #endif /* CONFIG_X86_64 */
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-static struct user_regset x86_32_regsets[] __read_mostly = {
+static struct user_regset x86_32_regsets[] __ro_after_init = {
        [REGSET_GENERAL] = {
                .core_note_type = NT_PRSTATUS,
                .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1344,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = {
  */
 u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
-void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
+void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
 {
 #ifdef CONFIG_X86_64
        x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
index 63bf27d..e244c19 100644 (file)
@@ -705,7 +705,7 @@ static void native_machine_power_off(void)
        tboot_shutdown(TB_SHUTDOWN_HALT);
 }
 
-struct machine_ops machine_ops = {
+struct machine_ops machine_ops __ro_after_init = {
        .power_off = native_machine_power_off,
        .shutdown = native_machine_shutdown,
        .emergency_restart = native_machine_emergency_restart,
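All of these __ro_after_init conversions follow one pattern: the variable is written only during boot and must never change afterwards. A minimal sketch of the attribute's contract (variable and initcall names are hypothetical):

    #include <linux/init.h>
    #include <linux/cache.h>                /* __ro_after_init */

    static int boot_mode __ro_after_init;   /* hypothetical example */

    static int __init detect_boot_mode(void)
    {
            boot_mode = 1;  /* fine: runs before mark_rodata_ro() */
            return 0;
    }
    early_initcall(detect_boot_mode);

    /* Any later write would fault: after init the section is mapped
     * read-only, like const data. */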
index 98c9cd6..87f2330 100644 (file)
@@ -210,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data);
 
 
 #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
-__visible unsigned long mmu_cr4_features;
+__visible unsigned long mmu_cr4_features __ro_after_init;
 #else
-__visible unsigned long mmu_cr4_features = X86_CR4_PAE;
+__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
 #endif
 
 /* Boot loader ID and version as integers, for the benefit of proc_dointvec */
index 7a40e06..2bbd27f 100644 (file)
@@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number);
 DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
 EXPORT_PER_CPU_SYMBOL(this_cpu_off);
 
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
        [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
 };
 EXPORT_SYMBOL(__per_cpu_offset);
@@ -246,7 +246,7 @@ void __init setup_per_cpu_areas(void)
 #ifdef CONFIG_X86_64
                per_cpu(irq_stack_ptr, cpu) =
                        per_cpu(irq_stack_union.irq_stack, cpu) +
-                       IRQ_STACK_SIZE - 64;
+                       IRQ_STACK_SIZE;
 #endif
 #ifdef CONFIG_NUMA
                per_cpu(x86_cpu_to_node_map, cpu) =
index 04cb321..da20ecb 100644 (file)
@@ -783,7 +783,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
         * than the tracee.
         */
 #ifdef CONFIG_IA32_EMULATION
-       if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED))
+       if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED))
                return __NR_ia32_restart_syscall;
 #endif
 #ifdef CONFIG_X86_X32_ABI
index 4296beb..7e52f83 100644 (file)
@@ -942,7 +942,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
        per_cpu(cpu_current_top_of_stack, cpu) =
                (unsigned long)task_stack_page(idle) + THREAD_SIZE;
 #else
-       clear_tsk_thread_flag(idle, TIF_FORK);
        initial_gs = per_cpu_offset(cpu);
 #endif
 }
@@ -969,7 +968,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 
        early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
        initial_code = (unsigned long)start_secondary;
-       stack_start  = idle->thread.sp;
+       initial_stack  = idle->thread.sp;
 
        /*
         * Enable the espfix hack for this CPU
index 4738f5e..0653788 100644 (file)
@@ -8,80 +8,69 @@
 #include <linux/export.h>
 #include <linux/uaccess.h>
 #include <asm/stacktrace.h>
+#include <asm/unwind.h>
 
-static int save_stack_stack(void *data, char *name)
+static int save_stack_address(struct stack_trace *trace, unsigned long addr,
+                             bool nosched)
 {
-       return 0;
-}
-
-static int
-__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
-{
-       struct stack_trace *trace = data;
-#ifdef CONFIG_FRAME_POINTER
-       if (!reliable)
-               return 0;
-#endif
        if (nosched && in_sched_functions(addr))
                return 0;
+
        if (trace->skip > 0) {
                trace->skip--;
                return 0;
        }
-       if (trace->nr_entries < trace->max_entries) {
-               trace->entries[trace->nr_entries++] = addr;
-               return 0;
-       } else {
-               return -1; /* no more room, stop walking the stack */
-       }
-}
 
-static int save_stack_address(void *data, unsigned long addr, int reliable)
-{
-       return __save_stack_address(data, addr, reliable, false);
+       if (trace->nr_entries >= trace->max_entries)
+               return -1;
+
+       trace->entries[trace->nr_entries++] = addr;
+       return 0;
 }
 
-static int
-save_stack_address_nosched(void *data, unsigned long addr, int reliable)
+static void __save_stack_trace(struct stack_trace *trace,
+                              struct task_struct *task, struct pt_regs *regs,
+                              bool nosched)
 {
-       return __save_stack_address(data, addr, reliable, true);
-}
+       struct unwind_state state;
+       unsigned long addr;
 
-static const struct stacktrace_ops save_stack_ops = {
-       .stack          = save_stack_stack,
-       .address        = save_stack_address,
-       .walk_stack     = print_context_stack,
-};
+       if (regs)
+               save_stack_address(trace, regs->ip, nosched);
 
-static const struct stacktrace_ops save_stack_ops_nosched = {
-       .stack          = save_stack_stack,
-       .address        = save_stack_address_nosched,
-       .walk_stack     = print_context_stack,
-};
+       for (unwind_start(&state, task, regs, NULL); !unwind_done(&state);
+            unwind_next_frame(&state)) {
+               addr = unwind_get_return_address(&state);
+               if (!addr || save_stack_address(trace, addr, nosched))
+                       break;
+       }
+
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
 
 /*
  * Save stack-backtrace addresses into a stack_trace buffer.
  */
 void save_stack_trace(struct stack_trace *trace)
 {
-       dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
-       if (trace->nr_entries < trace->max_entries)
-               trace->entries[trace->nr_entries++] = ULONG_MAX;
+       __save_stack_trace(trace, current, NULL, false);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace);
 
 void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 {
-       dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
-       if (trace->nr_entries < trace->max_entries)
-               trace->entries[trace->nr_entries++] = ULONG_MAX;
+       __save_stack_trace(trace, current, regs, false);
 }
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
-       dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
-       if (trace->nr_entries < trace->max_entries)
-               trace->entries[trace->nr_entries++] = ULONG_MAX;
+       if (!try_get_task_stack(tsk))
+               return;
+
+       __save_stack_trace(trace, tsk, NULL, true);
+
+       put_task_stack(tsk);
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
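The external API is unchanged by this rewrite; a usage sketch for callers with a fixed-size buffer (the function name is hypothetical):

    #include <linux/kernel.h>
    #include <linux/stacktrace.h>

    static void dump_my_stack(void)
    {
            unsigned long entries[16];
            struct stack_trace trace = {
                    .entries        = entries,
                    .max_entries    = ARRAY_SIZE(entries),
            };

            save_stack_trace(&trace);
            print_stack_trace(&trace, 0);   /* one symbolized line per entry */
    }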
 
index b70ca12..bd4e3d4 100644 (file)
@@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present", segment_not_present)
 DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",            stack_segment)
 DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",          alignment_check)
 
+#ifdef CONFIG_VMAP_STACK
+__visible void __noreturn handle_stack_overflow(const char *message,
+                                               struct pt_regs *regs,
+                                               unsigned long fault_address)
+{
+       printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+                (void *)fault_address, current->stack,
+                (char *)current->stack + THREAD_SIZE - 1);
+       die(message, regs, 0);
+
+       /* Be absolutely certain we don't return. */
+       panic(message);
+}
+#endif
+
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 {
        static const char str[] = "double fault";
        struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+       unsigned long cr2;
+#endif
 
 #ifdef CONFIG_X86_ESPFIX64
        extern unsigned char native_irq_return_iret[];
@@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
        tsk->thread.error_code = error_code;
        tsk->thread.trap_nr = X86_TRAP_DF;
 
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * If we overflow the stack into a guard page, the CPU will fail
+        * to deliver #PF and will send #DF instead.  Similarly, if we
+        * take any non-IST exception while too close to the bottom of
+        * the stack, the processor will get a page fault while
+        * delivering the exception and will generate a double fault.
+        *
+        * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+        * Page-Fault Exception (#PF)"):
+        *
+        *   Processors update CR2 whenever a page fault is detected. If a
+        *   second page fault occurs while an earlier page fault is being
+        *   delivered, the faulting linear address of the second fault will
+        *   overwrite the contents of CR2 (replacing the previous
+        *   address). These updates to CR2 occur even if the page fault
+        *   results in a double fault or occurs during the delivery of a
+        *   double fault.
+        *
+        * The logic below has a small possibility of incorrectly diagnosing
+        * some errors as stack overflows.  For example, if the IDT or GDT
+        * gets corrupted such that #GP delivery fails due to a bad
+        * descriptor, and CR2 coincidentally points to the stack guard
+        * page, we'll think we overflowed the stack.  Given that we're
+        * going to panic one way or another if this happens, this isn't
+        * necessarily worth fixing.
+        *
+        * If necessary, we could improve the test by only diagnosing
+        * a stack overflow if the saved RSP points within 47 bytes of
+        * the bottom of the stack: if RSP == tsk_stack + 48 and we
+        * take an exception, the stack is already aligned and there
+        * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
+        * possible error code, so a stack overflow would *not* double
+        * fault.  With any less space left, exception delivery could
+        * fail, and, as a practical matter, we've overflowed the
+        * stack even if the actual trigger for the double fault was
+        * something else.
+        */
+       cr2 = read_cr2();
+       if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+               handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
 #ifdef CONFIG_DOUBLEFAULT
        df_debug(regs, error_code);
 #endif
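The unsigned comparison used for the guard-page test above is a compact range check; a worked example with illustrative numbers:

    /*
     * Let stack_base = task_stack_page(tsk).  The test
     *
     *         stack_base - 1 - cr2 < PAGE_SIZE
     *
     * holds (as an unsigned comparison) exactly when cr2 lies in
     * [stack_base - PAGE_SIZE, stack_base - 1], i.e. in the guard page
     * directly below the stack.  For example, with PAGE_SIZE = 4096,
     * stack_base = 0xffffc90000008000 and cr2 = 0xffffc90000007ff8:
     * 0x8000 - 1 - 0x7ff8 = 7 < 4096, so the overflow path is taken.
     * Any cr2 at or above stack_base wraps to a huge unsigned value
     * and the test correctly fails.
     */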
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
new file mode 100644 (file)
index 0000000..a2456d4
--- /dev/null
@@ -0,0 +1,93 @@
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+
+unsigned long unwind_get_return_address(struct unwind_state *state)
+{
+       unsigned long addr;
+       unsigned long *addr_p = unwind_get_return_address_ptr(state);
+
+       if (unwind_done(state))
+               return 0;
+
+       addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
+                                    addr_p);
+
+       return __kernel_text_address(addr) ? addr : 0;
+}
+EXPORT_SYMBOL_GPL(unwind_get_return_address);
+
+static bool update_stack_state(struct unwind_state *state, void *addr,
+                              size_t len)
+{
+       struct stack_info *info = &state->stack_info;
+
+       /*
+        * If addr isn't on the current stack, switch to the next one.
+        *
+        * We may have to traverse multiple stacks to deal with the possibility
+        * that 'info->next_sp' could point to an empty stack and 'addr' could
+        * be on a subsequent stack.
+        */
+       while (!on_stack(info, addr, len))
+               if (get_stack_info(info->next_sp, state->task, info,
+                                  &state->stack_mask))
+                       return false;
+
+       return true;
+}
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+       unsigned long *next_bp;
+
+       if (unwind_done(state))
+               return false;
+
+       next_bp = (unsigned long *)*state->bp;
+
+       /* make sure the next frame's data is accessible */
+       if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
+               return false;
+
+       /* move to the next frame */
+       state->bp = next_bp;
+       return true;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+                   struct pt_regs *regs, unsigned long *first_frame)
+{
+       memset(state, 0, sizeof(*state));
+       state->task = task;
+
+       /* don't even attempt to start from user mode regs */
+       if (regs && user_mode(regs)) {
+               state->stack_info.type = STACK_TYPE_UNKNOWN;
+               return;
+       }
+
+       /* set up the starting stack frame */
+       state->bp = get_frame_pointer(task, regs);
+
+       /* initialize stack info and make sure the frame data is accessible */
+       get_stack_info(state->bp, state->task, &state->stack_info,
+                      &state->stack_mask);
+       update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+
+       /*
+        * The caller can provide the address of the first frame directly
+        * (first_frame) or indirectly (regs->sp) to indicate which stack frame
+        * to start unwinding at.  Skip ahead until we reach it.
+        */
+       while (!unwind_done(state) &&
+              (!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
+                       state->bp < first_frame))
+               unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
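The expected calling convention for both unwinder flavours, mirroring the stacktrace.c and oprofile conversions elsewhere in this diff (unwind_start() is assumed to be the inline wrapper around __unwind_start() from the new asm/unwind.h):

    struct unwind_state state;
    unsigned long addr;

    for (unwind_start(&state, current, NULL, NULL);
         !unwind_done(&state); unwind_next_frame(&state)) {
            addr = unwind_get_return_address(&state);
            if (!addr)
                    break;
            pr_info("%pB\n", (void *)addr);
    }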
diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c
new file mode 100644 (file)
index 0000000..b5a834c
--- /dev/null
@@ -0,0 +1,43 @@
+#include <linux/sched.h>
+#include <linux/ftrace.h>
+#include <asm/ptrace.h>
+#include <asm/bitops.h>
+#include <asm/stacktrace.h>
+#include <asm/unwind.h>
+
+bool unwind_next_frame(struct unwind_state *state)
+{
+       struct stack_info *info = &state->stack_info;
+
+       if (unwind_done(state))
+               return false;
+
+       do {
+               for (state->sp++; state->sp < info->end; state->sp++)
+                       if (__kernel_text_address(*state->sp))
+                               return true;
+
+               state->sp = info->next_sp;
+
+       } while (!get_stack_info(state->sp, state->task, info,
+                                &state->stack_mask));
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(unwind_next_frame);
+
+void __unwind_start(struct unwind_state *state, struct task_struct *task,
+                   struct pt_regs *regs, unsigned long *first_frame)
+{
+       memset(state, 0, sizeof(*state));
+
+       state->task = task;
+       state->sp   = first_frame;
+
+       get_stack_info(first_frame, state->task, &state->stack_info,
+                      &state->stack_mask);
+
+       if (!__kernel_text_address(*first_frame))
+               unwind_next_frame(state);
+}
+EXPORT_SYMBOL_GPL(__unwind_start);
index 76c5e52..0bd9f12 100644 (file)
@@ -91,7 +91,7 @@ struct x86_cpuinit_ops x86_cpuinit = {
 static void default_nmi_init(void) { };
 static int default_i8042_detect(void) { return 1; };
 
-struct x86_platform_ops x86_platform = {
+struct x86_platform_ops x86_platform __ro_after_init = {
        .calibrate_cpu                  = native_calibrate_cpu,
        .calibrate_tsc                  = native_calibrate_tsc,
        .get_wallclock                  = mach_get_cmos_time,
@@ -108,7 +108,7 @@ struct x86_platform_ops x86_platform = {
 EXPORT_SYMBOL_GPL(x86_platform);
 
 #if defined(CONFIG_PCI_MSI)
-struct x86_msi_ops x86_msi = {
+struct x86_msi_ops x86_msi __ro_after_init = {
        .setup_msi_irqs         = native_setup_msi_irqs,
        .teardown_msi_irq       = native_teardown_msi_irq,
        .teardown_msi_irqs      = default_teardown_msi_irqs,
@@ -137,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
 }
 #endif
 
-struct x86_io_apic_ops x86_io_apic_ops = {
+struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = {
        .read                   = native_io_apic_read,
        .disable                = native_disable_io_apic,
 };
index af523d8..1e6b84b 100644 (file)
@@ -4961,7 +4961,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
        avic_handle_ldr_update(vcpu);
 }
 
-static struct kvm_x86_ops svm_x86_ops = {
+static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
        .hardware_setup = svm_hardware_setup,
index 5cede40..121fdf6 100644 (file)
@@ -11177,7 +11177,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
                        ~FEATURE_CONTROL_LMCE;
 }
 
-static struct kvm_x86_ops vmx_x86_ops = {
+static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
        .hardware_setup = hardware_setup,
index dc80230..0b92fce 100644 (file)
@@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code,
                return;
        }
 
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * Stack overflow?  During boot, we can fault near the initial
+        * stack in the direct map, but that's not an overflow -- check
+        * that we're in vmalloc space to avoid this.
+        */
+       if (is_vmalloc_addr((void *)address) &&
+           (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
+            address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+               register void *__sp asm("rsp");
+               unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
+               /*
+                * We're likely to be running with very little stack space
+                * left.  It's plausible that we'd hit this condition but
+                * double-fault even before we get this far, in which case
+                * we're fine: the double-fault handler will deal with it.
+                *
+                * We don't want to make it all the way into the oops code
+                * and then double-fault, though, because we're likely to
+                * break the console driver and lose most of the stack dump.
+                */
+               asm volatile ("movq %[stack], %%rsp\n\t"
+                             "call handle_stack_overflow\n\t"
+                             "1: jmp 1b"
+                             : "+r" (__sp)
+                             : "D" ("kernel stack overflow (page fault)"),
+                               "S" (regs), "d" (address),
+                               [stack] "rm" (stack));
+               unreachable();
+       }
+#endif
+
        /*
         * 32-bit:
         *
index bda8d5e..ddd2661 100644 (file)
  * You need to add an if/def entry if you introduce a new memory region
  * compatible with KASLR. Your entry must be in logical order with memory
  * layout. For example, ESPFIX is before EFI because its virtual address is
- * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to
+ * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to
  * ensure that this order is correct and won't be changed.
  */
 static const unsigned long vaddr_start = __PAGE_OFFSET_BASE;
-static const unsigned long vaddr_end = VMEMMAP_START;
+
+#if defined(CONFIG_X86_ESPFIX64)
+static const unsigned long vaddr_end = ESPFIX_BASE_ADDR;
+#elif defined(CONFIG_EFI)
+static const unsigned long vaddr_end = EFI_VA_START;
+#else
+static const unsigned long vaddr_end = __START_KERNEL_map;
+#endif
 
 /* Default values */
 unsigned long page_offset_base = __PAGE_OFFSET_BASE;
 EXPORT_SYMBOL(page_offset_base);
 unsigned long vmalloc_base = __VMALLOC_BASE;
 EXPORT_SYMBOL(vmalloc_base);
+unsigned long vmemmap_base = __VMEMMAP_BASE;
+EXPORT_SYMBOL(vmemmap_base);
 
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
@@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region {
 } kaslr_regions[] = {
        { &page_offset_base, 64/* Maximum */ },
        { &vmalloc_base, VMALLOC_SIZE_TB },
+       { &vmemmap_base, 1 },
 };
 
 /* Get size in bytes used by the memory region */
@@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void)
        struct rnd_state rand_state;
        unsigned long remain_entropy;
 
+       /*
+        * All these BUILD_BUG_ON() checks ensure that the memory layout is
+        * consistent with the vaddr_start/vaddr_end variables.
+        */
+       BUILD_BUG_ON(vaddr_start >= vaddr_end);
+       BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) &&
+                    vaddr_end >= EFI_VA_START);
+       BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) ||
+                     config_enabled(CONFIG_EFI)) &&
+                    vaddr_end >= __START_KERNEL_map);
+       BUILD_BUG_ON(vaddr_end > __START_KERNEL_map);
+
        if (!kaslr_memory_enabled())
                return;
 
index 4dbe656..a7655f6 100644 (file)
@@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
        unsigned cpu = smp_processor_id();
 
        if (likely(prev != next)) {
+               if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+                       /*
+                        * If our current stack is in vmalloc space and isn't
+                        * mapped in the new pgd, we'll double-fault.  Forcibly
+                        * map it.
+                        */
+                       unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+
+                       pgd_t *pgd = next->pgd + stack_pgd_index;
+
+                       if (unlikely(pgd_none(*pgd)))
+                               set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+               }
+
 #ifdef CONFIG_SMP
                this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                this_cpu_write(cpu_tlbstate.active_mm, next);
 #endif
+
                cpumask_set_cpu(cpu, mm_cpumask(next));
 
                /*
index cb31a44..a2488b6 100644 (file)
 
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
-
-static int backtrace_stack(void *data, char *name)
-{
-       /* Yes, we want all stacks */
-       return 0;
-}
-
-static int backtrace_address(void *data, unsigned long addr, int reliable)
-{
-       unsigned int *depth = data;
-
-       if ((*depth)--)
-               oprofile_add_trace(addr);
-       return 0;
-}
-
-static struct stacktrace_ops backtrace_ops = {
-       .stack          = backtrace_stack,
-       .address        = backtrace_address,
-       .walk_stack     = print_context_stack,
-};
+#include <asm/unwind.h>
 
 #ifdef CONFIG_COMPAT
 static struct stack_frame_ia32 *
@@ -113,10 +93,29 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
        struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
 
        if (!user_mode(regs)) {
-               unsigned long stack = kernel_stack_pointer(regs);
-               if (depth)
-                       dump_trace(NULL, regs, (unsigned long *)stack, 0,
-                                  &backtrace_ops, &depth);
+               struct unwind_state state;
+               unsigned long addr;
+
+               if (!depth)
+                       return;
+
+               oprofile_add_trace(regs->ip);
+
+               if (!--depth)
+                       return;
+
+               for (unwind_start(&state, current, regs, NULL);
+                    !unwind_done(&state); unwind_next_frame(&state)) {
+                       addr = unwind_get_return_address(&state);
+                       if (!addr)
+                               break;
+
+                       oprofile_add_trace(addr);
+
+                       if (!--depth)
+                               break;
+               }
+
                return;
        }
 
index 9770e55..1d97cea 100644 (file)
@@ -120,9 +120,12 @@ static unsigned long __init bios32_service(unsigned long service)
 static struct {
        unsigned long address;
        unsigned short segment;
-} pci_indirect = { 0, __KERNEL_CS };
+} pci_indirect __ro_after_init = {
+       .address = 0,
+       .segment = __KERNEL_CS,
+};
 
-static int pci_bios_present;
+static int pci_bios_present __ro_after_init;
 
 static int __init check_pcibios(void)
 {
index a7ef7b1..5766ead 100644 (file)
@@ -194,7 +194,7 @@ int peek_user(struct task_struct *child, long addr, long data)
 
 static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
 {
-       int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+       int err, n, cpu = task_cpu(child);
        struct user_i387_struct fpregs;
 
        err = save_i387_registers(userspace_pid[cpu],
@@ -211,7 +211,7 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
 
 static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
 {
-       int n, cpu = ((struct thread_info *) child->stack)->cpu;
+       int n, cpu = task_cpu(child);
        struct user_i387_struct fpregs;
 
        n = copy_from_user(&fpregs, buf, sizeof(fpregs));
@@ -224,7 +224,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c
 
 static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
 {
-       int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+       int err, n, cpu = task_cpu(child);
        struct user_fxsr_struct fpregs;
 
        err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs);
@@ -240,7 +240,7 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *
 
 static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
 {
-       int n, cpu = ((struct thread_info *) child->stack)->cpu;
+       int n, cpu = task_cpu(child);
        struct user_fxsr_struct fpregs;
 
        n = copy_from_user(&fpregs, buf, sizeof(fpregs));
index 96de97a..4025291 100644 (file)
@@ -940,15 +940,13 @@ static void build_inv_irt(struct iommu_cmd *cmd, u16 devid)
  * Writes the command to the IOMMUs command buffer and informs the
  * hardware about the new command.
  */
-static int iommu_queue_command_sync(struct amd_iommu *iommu,
-                                   struct iommu_cmd *cmd,
-                                   bool sync)
+static int __iommu_queue_command_sync(struct amd_iommu *iommu,
+                                     struct iommu_cmd *cmd,
+                                     bool sync)
 {
        u32 left, tail, head, next_tail;
-       unsigned long flags;
 
 again:
-       spin_lock_irqsave(&iommu->lock, flags);
 
        head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
        tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
@@ -957,15 +955,14 @@ again:
 
        if (left <= 2) {
                struct iommu_cmd sync_cmd;
-               volatile u64 sem = 0;
                int ret;
 
-               build_completion_wait(&sync_cmd, (u64)&sem);
-               copy_cmd_to_buffer(iommu, &sync_cmd, tail);
+               iommu->cmd_sem = 0;
 
-               spin_unlock_irqrestore(&iommu->lock, flags);
+               build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem);
+               copy_cmd_to_buffer(iommu, &sync_cmd, tail);
 
-               if ((ret = wait_on_sem(&sem)) != 0)
+               if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0)
                        return ret;
 
                goto again;
@@ -976,9 +973,21 @@ again:
        /* We need to sync now to make sure all commands are processed */
        iommu->need_sync = sync;
 
+       return 0;
+}
+
+static int iommu_queue_command_sync(struct amd_iommu *iommu,
+                                   struct iommu_cmd *cmd,
+                                   bool sync)
+{
+       unsigned long flags;
+       int ret;
+
+       spin_lock_irqsave(&iommu->lock, flags);
+       ret = __iommu_queue_command_sync(iommu, cmd, sync);
        spin_unlock_irqrestore(&iommu->lock, flags);
 
-       return 0;
+       return ret;
 }
 
 static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
@@ -993,19 +1002,29 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 static int iommu_completion_wait(struct amd_iommu *iommu)
 {
        struct iommu_cmd cmd;
-       volatile u64 sem = 0;
+       unsigned long flags;
        int ret;
 
        if (!iommu->need_sync)
                return 0;
 
-       build_completion_wait(&cmd, (u64)&sem);
 
-       ret = iommu_queue_command_sync(iommu, &cmd, false);
+       build_completion_wait(&cmd, (u64)&iommu->cmd_sem);
+
+       spin_lock_irqsave(&iommu->lock, flags);
+
+       iommu->cmd_sem = 0;
+
+       ret = __iommu_queue_command_sync(iommu, &cmd, false);
        if (ret)
-               return ret;
+               goto out_unlock;
+
+       ret = wait_on_sem(&iommu->cmd_sem);
 
-       return wait_on_sem(&sem);
+out_unlock:
+       spin_unlock_irqrestore(&iommu->lock, flags);
+
+       return ret;
 }
 
 static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
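The reason the completion-wait semaphore moves off the stack (an assumption drawn from the VMAP_STACK work elsewhere in this merge): the IOMMU hardware writes the semaphore through its physical address, and __pa() is not valid for vmalloc-backed stack memory. A sketch of the existing helper, reproduced from memory rather than from this hunk:

    static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
    {
            WARN_ON(address & 0x7ULL);      /* hardware wants 8-byte alignment */

            memset(cmd, 0, sizeof(*cmd));
            cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
            cmd->data[1] = upper_32_bits(__pa(address));
            cmd->data[2] = 1;
            CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
    }

Keeping cmd_sem inside struct amd_iommu guarantees a direct-mapped, 8-byte-aligned target regardless of how the caller's stack was allocated.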
index caf5e38..9652848 100644 (file)
@@ -524,6 +524,8 @@ struct amd_iommu {
        struct irq_domain *ir_domain;
        struct irq_domain *msi_domain;
 #endif
+
+       volatile u64 __aligned(8) cmd_sem;
 };
 
 #define ACPIHID_UID_LEN 256
index ac0df4d..3b792ab 100644 (file)
@@ -483,7 +483,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
                save_stack_trace_tsk(task, &trace);
 
                for (i = 0; i < trace.nr_entries; i++) {
-                       seq_printf(m, "[<%pK>] %pS\n",
+                       seq_printf(m, "[<%pK>] %pB\n",
                                   (void *)entries[i], (void *)entries[i]);
                }
                unlock_trace(task);
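The %pS -> %pB switch matters specifically for saved return addresses: %pB symbolizes address - 1, so a return address that happens to equal the first byte of the *next* function (e.g. after a call at the very end of a function) is still attributed to the actual caller. A contrived sketch with made-up offsets:

    /*
     * Assume foo() ends in a call, so the saved return address equals
     * the first byte of the next function bar():
     *
     *   printk("%pS", ret);  ->  "bar+0x0/0x40"   (misleading)
     *   printk("%pB", ret);  ->  "foo+0x28/0x28"  (correct caller)
     */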
index 7d565af..6f93ac4 100644 (file)
@@ -795,7 +795,12 @@ struct ftrace_ret_stack {
        unsigned long func;
        unsigned long long calltime;
        unsigned long long subtime;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        unsigned long fp;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+       unsigned long *retp;
+#endif
 };
 
 /*
@@ -807,7 +812,10 @@ extern void return_to_handler(void);
 
 extern int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
-                        unsigned long frame_pointer);
+                        unsigned long frame_pointer, unsigned long *retp);
+
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+                                   unsigned long ret, unsigned long *retp);
 
 /*
  * Sometimes we don't want to trace a function with the function
@@ -870,6 +878,13 @@ static inline int task_curr_ret_stack(struct task_struct *tsk)
        return -1;
 }
 
+static inline unsigned long
+ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret,
+                     unsigned long *retp)
+{
+       return ret;
+}
+
 static inline void pause_graph_tracing(void) { }
 static inline void unpause_graph_tracing(void) { }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
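On the arch side, retp is expected to be the address of the stack (or register-save) slot whose return address is being hijacked. A simplified sketch modeled on x86's prepare_ftrace_return(); the real version writes the hooker first with a fault-tolerant probe and restores it on failure:

    void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
                               unsigned long frame_pointer)
    {
            unsigned long old = *parent;    /* the real return address */
            int depth = 0;

            /* 'parent' doubles as the new 'retp' argument: */
            if (ftrace_push_return_trace(old, self_addr, &depth,
                                         frame_pointer, parent) == -EBUSY)
                    return;

            *parent = (unsigned long)&return_to_handler;
    }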
index f8834f8..325f649 100644 (file)
@@ -15,6 +15,8 @@
 #include <net/net_namespace.h>
 #include <linux/sched/rt.h>
 
+#include <asm/thread_info.h>
+
 #ifdef CONFIG_SMP
 # define INIT_PUSHABLE_TASKS(tsk)                                      \
        .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
@@ -183,12 +185,21 @@ extern struct task_group root_task_group;
 # define INIT_KASAN(tsk)
 #endif
 
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+# define INIT_TASK_TI(tsk)                     \
+       .thread_info = INIT_THREAD_INFO(tsk),   \
+       .stack_refcount = ATOMIC_INIT(1),
+#else
+# define INIT_TASK_TI(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
 #define INIT_TASK(tsk) \
 {                                                                      \
+       INIT_TASK_TI(tsk)                                               \
        .state          = 0,                                            \
        .stack          = init_stack,                                   \
        .usage          = ATOMIC_INIT(2),                               \
index 62c68e5..abb795a 100644 (file)
@@ -1458,6 +1458,13 @@ struct tlbflush_unmap_batch {
 };
 
 struct task_struct {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       /*
+        * For reasons of header soup (see current_thread_info()), this
+        * must be the first element of task_struct.
+        */
+       struct thread_info thread_info;
+#endif
        volatile long state;    /* -1 unrunnable, 0 runnable, >0 stopped */
        void *stack;
        atomic_t usage;
@@ -1467,6 +1474,9 @@ struct task_struct {
 #ifdef CONFIG_SMP
        struct llist_node wake_entry;
        int on_cpu;
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       unsigned int cpu;       /* current CPU */
+#endif
        unsigned int wakee_flips;
        unsigned long wakee_flip_decay_ts;
        struct task_struct *last_wakee;
@@ -1923,6 +1933,13 @@ struct task_struct {
 #ifdef CONFIG_MMU
        struct task_struct *oom_reaper_list;
 #endif
+#ifdef CONFIG_VMAP_STACK
+       struct vm_struct *stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       /* A live task holds one reference. */
+       atomic_t stack_refcount;
+#endif
 /* CPU-specific state of this task */
        struct thread_struct thread;
 /*
@@ -1939,6 +1956,18 @@ extern int arch_task_struct_size __read_mostly;
 # define arch_task_struct_size (sizeof(struct task_struct))
 #endif
 
+#ifdef CONFIG_VMAP_STACK
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+       return t->stack_vm_area;
+}
+#else
+static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
+{
+       return NULL;
+}
+#endif
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
@@ -2573,7 +2602,9 @@ extern void set_curr_task(int cpu, struct task_struct *p);
 void yield(void);
 
 union thread_union {
+#ifndef CONFIG_THREAD_INFO_IN_TASK
        struct thread_info thread_info;
+#endif
        unsigned long stack[THREAD_SIZE/sizeof(long)];
 };
 
@@ -3061,10 +3092,34 @@ static inline void threadgroup_change_end(struct task_struct *tsk)
        cgroup_threadgroup_change_end(tsk);
 }
 
-#ifndef __HAVE_THREAD_FUNCTIONS
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+
+static inline struct thread_info *task_thread_info(struct task_struct *task)
+{
+       return &task->thread_info;
+}
+
+/*
+ * When accessing the stack of a non-current task that might exit, use
+ * try_get_task_stack() instead.  task_stack_page will return a pointer
+ * that could get freed out from under you.
+ */
+static inline void *task_stack_page(const struct task_struct *task)
+{
+       return task->stack;
+}
+
+#define setup_thread_stack(new,old)    do { } while(0)
+
+static inline unsigned long *end_of_stack(const struct task_struct *task)
+{
+       return task->stack;
+}
+
+#elif !defined(__HAVE_THREAD_FUNCTIONS)
 
 #define task_thread_info(task) ((struct thread_info *)(task)->stack)
-#define task_stack_page(task)  ((task)->stack)
+#define task_stack_page(task)  ((void *)(task)->stack)
 
 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
 {
@@ -3091,6 +3146,24 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
 }
 
 #endif
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+static inline void *try_get_task_stack(struct task_struct *tsk)
+{
+       return atomic_inc_not_zero(&tsk->stack_refcount) ?
+               task_stack_page(tsk) : NULL;
+}
+
+extern void put_task_stack(struct task_struct *tsk);
+#else
+static inline void *try_get_task_stack(struct task_struct *tsk)
+{
+       return task_stack_page(tsk);
+}
+
+static inline void put_task_stack(struct task_struct *tsk) {}
+#endif
+
 #define task_stack_end_corrupted(task) \
                (*(end_of_stack(task)) != STACK_END_MAGIC)
 
@@ -3364,7 +3437,11 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
 
 static inline unsigned int task_cpu(const struct task_struct *p)
 {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       return p->cpu;
+#else
        return task_thread_info(p)->cpu;
+#endif
 }
 
 static inline int task_node(const struct task_struct *p)
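The intended usage pattern for the new pair, also visible in the lib/syscall.c and kernel/stacktrace.c hunks of this series (the caller below is hypothetical):

    void inspect_task(struct task_struct *tsk)
    {
            void *stack = try_get_task_stack(tsk);

            if (!stack)
                    return;         /* task already released its stack */

            /* ... safe to use task_stack_page(tsk) here ... */

            put_task_stack(tsk);    /* may free the stack if last reference */
    }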
index 2b5b10e..45f004e 100644 (file)
 struct timespec;
 struct compat_timespec;
 
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+struct thread_info {
+       unsigned long           flags;          /* low level flags */
+};
+
+#define INIT_THREAD_INFO(tsk)                  \
+{                                              \
+       .flags          = 0,                    \
+}
+#endif
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+#define current_thread_info() ((struct thread_info *)current)
+#endif
+
 /*
  * System call restart block.
  */
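The bare cast in current_thread_info() only works because of the placement rule spelled out in the task_struct hunk above; the invariant it depends on can be pinned down in one line (a sketch of a check an arch might add under CONFIG_THREAD_INFO_IN_TASK):

    /* thread_info must remain the first member of task_struct: */
    BUILD_BUG_ON(offsetof(struct task_struct, thread_info) != 0);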
index cac3f09..3b9a47f 100644 (file)
@@ -26,6 +26,16 @@ config IRQ_WORK
 config BUILDTIME_EXTABLE_SORT
        bool
 
+config THREAD_INFO_IN_TASK
+       bool
+       help
+         Select this to move thread_info off the stack into task_struct.  To
+         make this work, an arch will need to remove all thread_info fields
+         except flags and fix any runtime bugs.
+
+         One subtle change that will be needed is to use try_get_task_stack()
+         and put_task_stack() in save_stack_trace_tsk() and get_wchan().
+
 menu "General setup"
 
 config BROKEN
index ba0a7f3..11f83be 100644 (file)
@@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task);
  * Initial thread structure. Alignment of this is handled by a special
  * linker map entry.
  */
-union thread_union init_thread_union __init_task_data =
-       { INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data = {
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+       INIT_THREAD_INFO(init_task)
+#endif
+};
index beb3172..c060c7e 100644 (file)
@@ -158,19 +158,83 @@ void __weak arch_release_thread_stack(unsigned long *stack)
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
-# if THREAD_SIZE >= PAGE_SIZE
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-                                                 int node)
+# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
+
+#ifdef CONFIG_VMAP_STACK
+/*
+ * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
+ * flush.  Try to minimize the number of calls by caching stacks.
+ */
+#define NR_CACHED_STACKS 2
+static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+#endif
+
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
+#ifdef CONFIG_VMAP_STACK
+       void *stack;
+       int i;
+
+       local_irq_disable();
+       for (i = 0; i < NR_CACHED_STACKS; i++) {
+               struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+
+               if (!s)
+                       continue;
+               this_cpu_write(cached_stacks[i], NULL);
+
+               tsk->stack_vm_area = s;
+               local_irq_enable();
+               return s->addr;
+       }
+       local_irq_enable();
+
+       stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
+                                    VMALLOC_START, VMALLOC_END,
+                                    THREADINFO_GFP | __GFP_HIGHMEM,
+                                    PAGE_KERNEL,
+                                    0, node, __builtin_return_address(0));
+
+       /*
+        * We can't call find_vm_area() in interrupt context, and
+        * free_thread_stack() can be called in interrupt context,
+        * so cache the vm_struct.
+        */
+       if (stack)
+               tsk->stack_vm_area = find_vm_area(stack);
+       return stack;
+#else
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
 
        return page ? page_address(page) : NULL;
+#endif
 }
 
-static inline void free_thread_stack(unsigned long *stack)
+static inline void free_thread_stack(struct task_struct *tsk)
 {
-       __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER);
+#ifdef CONFIG_VMAP_STACK
+       if (task_stack_vm_area(tsk)) {
+               unsigned long flags;
+               int i;
+
+               local_irq_save(flags);
+               for (i = 0; i < NR_CACHED_STACKS; i++) {
+                       if (this_cpu_read(cached_stacks[i]))
+                               continue;
+
+                       this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
+                       local_irq_restore(flags);
+                       return;
+               }
+               local_irq_restore(flags);
+
+               vfree(tsk->stack);
+               return;
+       }
+#endif
+
+       __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_stack_cache;
@@ -181,9 +245,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
        return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_stack(unsigned long *stack)
+static void free_thread_stack(struct task_struct *tsk)
 {
-       kmem_cache_free(thread_stack_cache, stack);
+       kmem_cache_free(thread_stack_cache, tsk->stack);
 }
 
 void thread_stack_cache_init(void)
@@ -213,24 +277,76 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(unsigned long *stack, int account)
+static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-       /* All stack pages are in the same zone and belong to the same memcg. */
-       struct page *first_page = virt_to_page(stack);
+       void *stack = task_stack_page(tsk);
+       struct vm_struct *vm = task_stack_vm_area(tsk);
+
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
+       if (vm) {
+               int i;
 
-       mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-                           THREAD_SIZE / 1024 * account);
+               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
 
-       memcg_kmem_update_page_stat(
-               first_page, MEMCG_KERNEL_STACK_KB,
-               account * (THREAD_SIZE / 1024));
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+                       mod_zone_page_state(page_zone(vm->pages[i]),
+                                           NR_KERNEL_STACK_KB,
+                                           PAGE_SIZE / 1024 * account);
+               }
+
+               /* All stack pages belong to the same memcg. */
+               memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       } else {
+               /*
+                * All stack pages are in the same zone and belong to the
+                * same memcg.
+                */
+               struct page *first_page = virt_to_page(stack);
+
+               mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
+                                   THREAD_SIZE / 1024 * account);
+
+               memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
+                                           account * (THREAD_SIZE / 1024));
+       }
 }
 
-void free_task(struct task_struct *tsk)
+static void release_task_stack(struct task_struct *tsk)
 {
-       account_kernel_stack(tsk->stack, -1);
+       account_kernel_stack(tsk, -1);
        arch_release_thread_stack(tsk->stack);
-       free_thread_stack(tsk->stack);
+       free_thread_stack(tsk);
+       tsk->stack = NULL;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = NULL;
+#endif
+}
+
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+void put_task_stack(struct task_struct *tsk)
+{
+       if (atomic_dec_and_test(&tsk->stack_refcount))
+               release_task_stack(tsk);
+}
+#endif
+
+void free_task(struct task_struct *tsk)
+{
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+       /*
+        * The task is finally done with both the stack and thread_info,
+        * so free both.
+        */
+       release_task_stack(tsk);
+#else
+       /*
+        * If the task had a separate stack allocation, it should be gone
+        * by now.
+        */
+       WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0);
+#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        put_seccomp_filter(tsk);
@@ -342,6 +458,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
        struct task_struct *tsk;
        unsigned long *stack;
+       struct vm_struct *stack_vm_area;
        int err;
 
        if (node == NUMA_NO_NODE)
@@ -354,11 +471,26 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        if (!stack)
                goto free_tsk;
 
+       stack_vm_area = task_stack_vm_area(tsk);
+
        err = arch_dup_task_struct(tsk, orig);
+
+       /*
+        * arch_dup_task_struct() clobbers the stack-related fields.  Make
+        * sure they're properly initialized before using any stack-related
+        * functions again.
+        */
+       tsk->stack = stack;
+#ifdef CONFIG_VMAP_STACK
+       tsk->stack_vm_area = stack_vm_area;
+#endif
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       atomic_set(&tsk->stack_refcount, 1);
+#endif
+
        if (err)
                goto free_stack;
 
-       tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
@@ -390,14 +522,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
 
-       account_kernel_stack(stack, 1);
+       account_kernel_stack(tsk, 1);
 
        kcov_task_init(tsk);
 
        return tsk;
 
 free_stack:
-       free_thread_stack(stack);
+       free_thread_stack(tsk);
 free_tsk:
        free_task_struct(tsk);
        return NULL;
@@ -1715,6 +1847,7 @@ bad_fork_cleanup_count:
        atomic_dec(&p->cred->user->processes);
        exit_creds(p);
 bad_fork_free:
+       put_task_stack(p);
        free_task(p);
 fork_out:
        return ERR_PTR(retval);
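A sketch of the stack cache's behaviour over a hypothetical CPU-local timeline, with NR_CACHED_STACKS = 2 as defined above:

    /*
     * CPU0 timeline (illustrative only):
     *   task A exits -> stack parked in slot 0   (no vfree, no TLB flush)
     *   task B exits -> stack parked in slot 1
     *   task C exits -> both slots full -> vfree(C's stack)
     *   fork()       -> slot 0 reused for the new task (no vmalloc)
     *
     * Interrupts are disabled around slot accesses because, as noted
     * above, free_thread_stack() can be called in interrupt context.
     */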
index 9ff173d..4ab4c37 100644 (file)
@@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 static struct kthread *to_live_kthread(struct task_struct *k)
 {
        struct completion *vfork = ACCESS_ONCE(k->vfork_done);
-       if (likely(vfork))
+       if (likely(vfork) && try_get_task_stack(k))
                return __to_kthread(vfork);
        return NULL;
 }
@@ -425,8 +425,10 @@ void kthread_unpark(struct task_struct *k)
 {
        struct kthread *kthread = to_live_kthread(k);
 
-       if (kthread)
+       if (kthread) {
                __kthread_unpark(k, kthread);
+               put_task_stack(k);
+       }
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
 
@@ -455,6 +457,7 @@ int kthread_park(struct task_struct *k)
                                wait_for_completion(&kthread->parked);
                        }
                }
+               put_task_stack(k);
                ret = 0;
        }
        return ret;
@@ -490,6 +493,7 @@ int kthread_stop(struct task_struct *k)
                __kthread_unpark(k, kthread);
                wake_up_process(k);
                wait_for_completion(&kthread->exited);
+               put_task_stack(k);
        }
        ret = k->exit_code;
        put_task_struct(k);
index 44817c6..23c6037 100644 (file)
@@ -2772,6 +2772,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                 * task and put them back on the free list.
                 */
                kprobe_flush_task(prev);
+
+               /* Task is done with its stack. */
+               put_task_stack(prev);
+
                put_task_struct(prev);
        }
 
@@ -3403,7 +3407,6 @@ static void __sched notrace __schedule(bool preempt)
 
        balance_callback(rq);
 }
-STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
index c64fc51..3655c96 100644 (file)
@@ -1000,7 +1000,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
         * per-task data have been completed by this moment.
         */
        smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+       p->cpu = cpu;
+#else
        task_thread_info(p)->cpu = cpu;
+#endif
        p->wake_cpu = cpu;
 #endif
 }
index f4b86e8..ba33267 100644 (file)
@@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER
        help
          See Documentation/trace/ftrace-design.txt
 
-config HAVE_FUNCTION_GRAPH_FP_TEST
-       bool
-       help
-         See Documentation/trace/ftrace-design.txt
-
 config HAVE_DYNAMIC_FTRACE
        bool
        help
index 7363ccf..0cbe38a 100644 (file)
@@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,
 /* Add a function return address to the trace stack on thread info.*/
 int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
-                        unsigned long frame_pointer)
+                        unsigned long frame_pointer, unsigned long *retp)
 {
        unsigned long long calltime;
        int index;
@@ -171,7 +171,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
        current->ret_stack[index].func = func;
        current->ret_stack[index].calltime = calltime;
        current->ret_stack[index].subtime = 0;
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        current->ret_stack[index].fp = frame_pointer;
+#endif
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+       current->ret_stack[index].retp = retp;
+#endif
        *depth = current->curr_ret_stack;
 
        return 0;
@@ -204,7 +209,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
                return;
        }
 
-#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
        /*
         * The arch may choose to record the frame pointer used
         * and check it here to make sure that it is what we expect it
@@ -279,6 +284,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
        return ret;
 }
 
+/**
+ * ftrace_graph_ret_addr - convert a potentially modified stack return address
+ *                        to its original value
+ *
+ * This function can be called by stack unwinding code to convert a found stack
+ * return address ('ret') to its original value, in case the function graph
+ * tracer has modified it to be 'return_to_handler'.  If the address hasn't
+ * been modified, the unchanged value of 'ret' is returned.
+ *
+ * 'idx' is a state variable which should be initialized by the caller to zero
+ * before the first call.
+ *
+ * 'retp' is a pointer to the return address on the stack.  It's ignored if
+ * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined.
+ */
+#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+                                   unsigned long ret, unsigned long *retp)
+{
+       int index = task->curr_ret_stack;
+       int i;
+
+       if (ret != (unsigned long)return_to_handler)
+               return ret;
+
+       if (index < -1)
+               index += FTRACE_NOTRACE_DEPTH;
+
+       if (index < 0)
+               return ret;
+
+       for (i = 0; i <= index; i++)
+               if (task->ret_stack[i].retp == retp)
+                       return task->ret_stack[i].ret;
+
+       return ret;
+}
+#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
+unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
+                                   unsigned long ret, unsigned long *retp)
+{
+       int task_idx;
+
+       if (ret != (unsigned long)return_to_handler)
+               return ret;
+
+       task_idx = task->curr_ret_stack;
+
+       if (!task->ret_stack || task_idx < *idx)
+               return ret;
+
+       task_idx -= *idx;
+       (*idx)++;
+
+       return task->ret_stack[task_idx].ret;
+}
+#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */
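A worked example of the fallback variant's bookkeeping (no retp available, so entries are consumed top-down, one per call):

    /*
     * Suppose curr_ret_stack == 2 and ret_stack[].ret holds { f0, f1, f2 }.
     * An unwinder that initialized idx = 0 and calls the helper once per
     * 'return_to_handler' it encounters sees:
     *
     *   call 1: task_idx = 2 - 0 = 2 -> returns f2, idx becomes 1
     *   call 2: task_idx = 2 - 1 = 1 -> returns f1, idx becomes 2
     *   call 3: task_idx = 2 - 2 = 0 -> returns f0, idx becomes 3
     *
     * This is only a positional heuristic; the retp-based variant above
     * matches the exact stack slot instead.
     */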
+
 int __trace_graph_entry(struct trace_array *tr,
                                struct ftrace_graph_ent *trace,
                                unsigned long flags,
index fcfa193..06f02f6 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/stacktrace.h>
 #include <linux/dma-debug.h>
 #include <linux/spinlock.h>
+#include <linux/vmalloc.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/export.h>
@@ -1164,11 +1165,32 @@ static void check_unmap(struct dma_debug_entry *ref)
        put_hash_bucket(bucket, &flags);
 }
 
-static void check_for_stack(struct device *dev, void *addr)
+static void check_for_stack(struct device *dev,
+                           struct page *page, size_t offset)
 {
-       if (object_is_on_stack(addr))
-               err_printk(dev, NULL, "DMA-API: device driver maps memory from "
-                               "stack [addr=%p]\n", addr);
+       void *addr;
+       struct vm_struct *stack_vm_area = task_stack_vm_area(current);
+
+       if (!stack_vm_area) {
+               /* Stack is direct-mapped. */
+               if (PageHighMem(page))
+                       return;
+               addr = page_address(page) + offset;
+               if (object_is_on_stack(addr))
+                       err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr);
+       } else {
+               /* Stack is vmalloced. */
+               int i;
+
+               for (i = 0; i < stack_vm_area->nr_pages; i++) {
+                       if (page != stack_vm_area->pages[i])
+                               continue;
+
+                       addr = (u8 *)current->stack + i * PAGE_SIZE + offset;
+                       err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr);
+                       break;
+               }
+       }
 }
 
 static inline bool overlap(void *addr, unsigned long len, void *start, void *end)
@@ -1291,10 +1313,11 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
        if (map_single)
                entry->type = dma_debug_single;
 
+       check_for_stack(dev, page, offset);
+
        if (!PageHighMem(page)) {
                void *addr = page_address(page) + offset;
 
-               check_for_stack(dev, addr);
                check_for_illegal_area(dev, addr, size);
        }
 
@@ -1386,8 +1409,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
                entry->sg_call_ents   = nents;
                entry->sg_mapped_ents = mapped_ents;
 
+               check_for_stack(dev, sg_page(s), s->offset);
+
                if (!PageHighMem(sg_page(s))) {
-                       check_for_stack(dev, sg_virt(s));
                        check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
                }
 
index e30e039..63239e0 100644 (file)
@@ -7,9 +7,19 @@ static int collect_syscall(struct task_struct *target, long *callno,
                           unsigned long args[6], unsigned int maxargs,
                           unsigned long *sp, unsigned long *pc)
 {
-       struct pt_regs *regs = task_pt_regs(target);
-       if (unlikely(!regs))
+       struct pt_regs *regs;
+
+       if (!try_get_task_stack(target)) {
+               /* Task has no stack, so the task isn't in a syscall. */
+               *callno = -1;
+               return 0;
+       }
+
+       regs = task_pt_regs(target);
+       if (unlikely(!regs)) {
+               put_task_stack(target);
                return -EAGAIN;
+       }
 
        *sp = user_stack_pointer(regs);
        *pc = instruction_pointer(regs);
@@ -18,6 +28,7 @@ static int collect_syscall(struct task_struct *target, long *callno,
        if (*callno != -1L && maxargs > 0)
                syscall_get_arguments(target, regs, 0, maxargs, args);
 
+       put_task_stack(target);
        return 0;
 }
 
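
The lib/syscall.c hunks address the same lifetime problem from the consumer side: once a task's stack can be freed before the task struct itself, task_pt_regs() may point into released memory. collect_syscall() therefore pins the stack with try_get_task_stack() first, reports "not in a syscall" (callno == -1) when the pin fails, and drops the pin with put_task_stack() on every exit path. Here is a sketch of that pin/use/unpin pattern, modeled with a plain userspace refcount; all names below are illustrative, not the kernel's:

	#include <stdbool.h>
	#include <stdio.h>

	struct stack_ref {
		int refcount;	/* 0 means the stack is already gone */
	};

	/* Mirrors try_get_task_stack(): succeeds only while live. */
	static bool try_pin(struct stack_ref *s)
	{
		if (s->refcount == 0)
			return false;
		s->refcount++;
		return true;
	}

	/* Mirrors put_task_stack(): drop the pin taken above. */
	static void unpin(struct stack_ref *s)
	{
		s->refcount--;
	}

	static int inspect(struct stack_ref *s, long *callno)
	{
		if (!try_pin(s)) {
			*callno = -1;	/* no stack: not in a syscall */
			return 0;
		}

		/* ...safe to read stack-backed state (pt_regs) here... */
		*callno = 42;	/* pretend syscall number */

		unpin(s);
		return 0;
	}

	int main(void)
	{
		struct stack_ref live = { 1 }, dead = { 0 };
		long a, b;

		inspect(&live, &a);
		inspect(&dead, &b);
		printf("live callno = %ld, dead callno = %ld\n", a, b);
		return 0;
	}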
index 4214567..b037ce9 100644 (file)
@@ -147,7 +147,7 @@ static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *))
        if (args.nr != getpid() ||
            args.arg0 != 10 || args.arg1 != 11 || args.arg2 != 12 ||
            args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) {
-               printf("[FAIL]\tgetpid() failed to preseve regs\n");
+               printf("[FAIL]\tgetpid() failed to preserve regs\n");
                nerrs++;
        } else {
                printf("[OK]\tgetpid() preserves regs\n");
@@ -162,7 +162,7 @@ static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *))
        if (args.nr != 0 ||
            args.arg0 != getpid() || args.arg1 != SIGUSR1 || args.arg2 != 12 ||
            args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) {
-               printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preseve regs\n");
+               printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preserve regs\n");
                nerrs++;
        } else {
                printf("[OK]\tkill(getpid(), SIGUSR1) preserves regs\n");
index 8a577e7..246145b 100644 (file)
@@ -106,7 +106,7 @@ asm (".pushsection .text\n\t"
      ".type int3, @function\n\t"
      ".align 4096\n\t"
      "int3:\n\t"
-     "mov %ss,%eax\n\t"
+     "mov %ss,%ecx\n\t"
      "int3\n\t"
      ".size int3, . - int3\n\t"
      ".align 4096, 0xcc\n\t"
@@ -306,7 +306,7 @@ static volatile sig_atomic_t sig_corrupt_final_ss;
 #ifdef __x86_64__
 # define REG_IP REG_RIP
 # define REG_SP REG_RSP
-# define REG_AX REG_RAX
+# define REG_CX REG_RCX
 
 struct selectors {
        unsigned short cs, gs, fs, ss;
@@ -326,7 +326,7 @@ static unsigned short *csptr(ucontext_t *ctx)
 #else
 # define REG_IP REG_EIP
 # define REG_SP REG_ESP
-# define REG_AX REG_EAX
+# define REG_CX REG_ECX
 
 static greg_t *ssptr(ucontext_t *ctx)
 {
@@ -457,10 +457,10 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
        ctx->uc_mcontext.gregs[REG_IP] =
                sig_cs == code16_sel ? 0 : (unsigned long)&int3;
        ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL;
-       ctx->uc_mcontext.gregs[REG_AX] = 0;
+       ctx->uc_mcontext.gregs[REG_CX] = 0;
 
        memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
-       requested_regs[REG_AX] = *ssptr(ctx);   /* The asm code does this. */
+       requested_regs[REG_CX] = *ssptr(ctx);   /* The asm code does this. */
 
        return;
 }
@@ -482,7 +482,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
        unsigned short ss;
        asm ("mov %%ss,%0" : "=r" (ss));
 
-       greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX];
+       greg_t asm_ss = ctx->uc_mcontext.gregs[REG_CX];
        if (asm_ss != sig_ss && sig == SIGTRAP) {
                /* Sanity check failure. */
                printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n",
@@ -654,8 +654,8 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss)
 #endif
 
                /* Sanity check on the kernel */
-               if (i == REG_AX && requested_regs[i] != resulting_regs[i]) {
-                       printf("[FAIL]\tAX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n",
+               if (i == REG_CX && requested_regs[i] != resulting_regs[i]) {
+                       printf("[FAIL]\tCX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n",
                               (unsigned long long)requested_regs[i],
                               (unsigned long long)resulting_regs[i]);
                        nerrs++;
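
The sigreturn.c change is mechanical: the test's int3 stub stashes the SS selector in a scratch general-purpose register so the SIGTRAP handler can compare it with the selector in the signal frame, and the scratch register switches from AX to CX on both 32-bit and 64-bit (REG_AX/REG_RAX become REG_CX/REG_RCX to match). A standalone sketch of the underlying trick, assuming an x86 Linux target and GCC-style inline asm; it only reads SS into CX and prints it, without the int3/SIGTRAP round trip:

	#include <stdio.h>

	int main(void)
	{
		unsigned long cx;

		/* Same idea as the test's int3 stub: mov %ss into CX. */
		asm volatile ("mov %%ss, %0" : "=c" (cx));

		printf("ss = 0x%04lx\n", cx & 0xffff);
		return 0;
	}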