Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 10 Feb 2015 01:16:44 +0000 (17:16 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 10 Feb 2015 01:16:44 +0000 (17:16 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 10 Feb 2015 01:16:44 +0000 (17:16 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 10 Feb 2015 01:16:44 +0000 (17:16 -0800)
diff --git a/Documentation/x86/entry_64.txt b/Documentation/x86/entry_64.txt

index 4a1c5c2..9132b86 100644 (file)
--- a/Documentation/x86/entry_64.txt
+++ b/Documentation/x86/entry_64.txt
@@ -78,9 +78,6 @@ The expensive (paranoid) way is to read back the MSR_GS_BASE value
         xorl %ebx,%ebx
  1:     ret
  
-and the whole paranoid non-paranoid macro complexity is about whether
-to suffer that RDMSR cost.
-
  If we are at an interrupt or user-trap/gate-alike boundary then we can
  use the faster check: the stack will be a reliable indicator of
  whether SWAPGS was already done: if we see that we are a secondary
@@ -93,6 +90,15 @@ which might have triggered right after a normal entry wrote CS to the
  stack but before we executed SWAPGS, then the only safe way to check
  for GS is the slower method: the RDMSR.
  
-So we try only to mark those entry methods 'paranoid' that absolutely
-need the more expensive check for the GS base - and we generate all
-'normal' entry points with the regular (faster) entry macros.
+Therefore, super-atomic entries (except NMI, which is handled separately)
+must use idtentry with paranoid=1 to handle gsbase correctly.  This
+triggers three main behavior changes:
+
+ - Interrupt entry will use the slower gsbase check.
+ - Interrupt entry from user mode will switch off the IST stack.
+ - Interrupt exit to kernel mode will not attempt to reschedule.
+
+We try to only use IST entries and the paranoid entry code for vectors
+that absolutely need the more expensive check for the GS base - and we
+generate all 'normal' entry points with the regular (faster) paranoid=0
+variant.
diff --git a/Documentation/x86/x86_64/kernel-stacks b/Documentation/x86/x86_64/kernel-stacks

index a01eec5..e3c8a49 100644 (file)
--- a/Documentation/x86/x86_64/kernel-stacks
+++ b/Documentation/x86/x86_64/kernel-stacks
@@ -40,9 +40,11 @@ An IST is selected by a non-zero value in the IST field of an
  interrupt-gate descriptor.  When an interrupt occurs and the hardware
  loads such a descriptor, the hardware automatically sets the new stack
  pointer based on the IST value, then invokes the interrupt handler.  If
-software wants to allow nested IST interrupts then the handler must
-adjust the IST values on entry to and exit from the interrupt handler.
-(This is occasionally done, e.g. for debug exceptions.)
+the interrupt came from user mode, then the interrupt handler prologue
+will switch back to the per-thread stack.  If software wants to allow
+nested IST interrupts then the handler must adjust the IST values on
+entry to and exit from the interrupt handler.  (This is occasionally
+done, e.g. for debug exceptions.)
  
  Events with different IST codes (i.e. with different stacks) can be
  nested.  For example, a debug interrupt can safely be interrupted by an
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S

index 82e8a1d..156ebca 100644 (file)
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -179,8 +179,8 @@ sysenter_dispatch:
  sysexit_from_sys_call:
         andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
         /* clear IF, that popfq doesn't enable interrupts early */
-       andl  $~0x200,EFLAGS-R11(%rsp) 
-       movl    RIP-R11(%rsp),%edx              /* User %eip */
+       andl    $~0x200,EFLAGS-ARGOFFSET(%rsp)
+       movl    RIP-ARGOFFSET(%rsp),%edx                /* User %eip */
         CFI_REGISTER rip,rdx
         RESTORE_ARGS 0,24,0,0,0,0
         xorq    %r8,%r8
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h

index 76659b6..1f1297b 100644 (file)
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
  #define SS             160
  
  #define ARGOFFSET      R11
-#define SWFRAME                ORIG_RAX
  
         .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
         subq  $9*8+\addskip, %rsp
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h

index 51b26e8..9b3de99 100644 (file)
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -190,7 +190,6 @@ enum mcp_flags {
  void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
  
  int mce_notify_irq(void);
-void mce_notify_process(void);
  
  DECLARE_PER_CPU(struct mce, injectm);
  
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h

index 547e344..e82e95a 100644 (file)
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -75,7 +75,6 @@ struct thread_info {
  #define TIF_SYSCALL_EMU                6       /* syscall emulation active */
  #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
  #define TIF_SECCOMP            8       /* secure computing */
-#define TIF_MCE_NOTIFY         10      /* notify userspace of an MCE */
  #define TIF_USER_RETURN_NOTIFY 11      /* notify kernel of userspace return */
  #define TIF_UPROBE             12      /* breakpointed or singlestepping */
  #define TIF_NOTSC              16      /* TSC is not accessible in userland */
@@ -100,7 +99,6 @@ struct thread_info {
  #define _TIF_SYSCALL_EMU       (1 << TIF_SYSCALL_EMU)
  #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
  #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
-#define _TIF_MCE_NOTIFY                (1 << TIF_MCE_NOTIFY)
  #define _TIF_USER_RETURN_NOTIFY        (1 << TIF_USER_RETURN_NOTIFY)
  #define _TIF_UPROBE            (1 << TIF_UPROBE)
  #define _TIF_NOTSC             (1 << TIF_NOTSC)
@@ -140,7 +138,7 @@ struct thread_info {
  
  /* Only used for 64 bit */
  #define _TIF_DO_NOTIFY_MASK                                            \
-       (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME |       \
+       (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |                         \
          _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
  
  /* flags to check in __switch_to() */
@@ -170,6 +168,17 @@ static inline struct thread_info *current_thread_info(void)
         return ti;
  }
  
+static inline unsigned long current_stack_pointer(void)
+{
+       unsigned long sp;
+#ifdef CONFIG_X86_64
+       asm("mov %%rsp,%0" : "=g" (sp));
+#else
+       asm("mov %%esp,%0" : "=g" (sp));
+#endif
+       return sp;
+}
+
  #else /* !__ASSEMBLY__ */
  
  /* how to get the thread information struct from ASM */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h

index 707adc6..4e49d7d 100644 (file)
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,7 @@
  #ifndef _ASM_X86_TRAPS_H
  #define _ASM_X86_TRAPS_H
  
+#include <linux/context_tracking_state.h>
  #include <linux/kprobes.h>
  
  #include <asm/debugreg.h>
@@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void);
  asmlinkage void mce_threshold_interrupt(void);
  #endif
  
+extern enum ctx_state ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_begin_non_atomic(struct pt_regs *regs);
+extern void ist_end_non_atomic(void);
+
  /* Interrupts/Exceptions */
  enum {
         X86_TRAP_DE = 0,        /*  0, Divide-by-zero */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c

index d2c6116..d231799 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
  #include <linux/export.h>
  
  #include <asm/processor.h>
+#include <asm/traps.h>
  #include <asm/mce.h>
  #include <asm/msr.h>
  
@@ -1002,51 +1003,6 @@ static void mce_clear_state(unsigned long *toclear)
         }
  }
  
-/*
- * Need to save faulting physical address associated with a process
- * in the machine check handler some place where we can grab it back
- * later in mce_notify_process()
- */
-#define        MCE_INFO_MAX    16
-
-struct mce_info {
-       atomic_t                inuse;
-       struct task_struct      *t;
-       __u64                   paddr;
-       int                     restartable;
-} mce_info[MCE_INFO_MAX];
-
-static void mce_save_info(__u64 addr, int c)
-{
-       struct mce_info *mi;
-
-       for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
-               if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
-                       mi->t = current;
-                       mi->paddr = addr;
-                       mi->restartable = c;
-                       return;
-               }
-       }
-
-       mce_panic("Too many concurrent recoverable errors", NULL, NULL);
-}
-
-static struct mce_info *mce_find_info(void)
-{
-       struct mce_info *mi;
-
-       for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
-               if (atomic_read(&mi->inuse) && mi->t == current)
-                       return mi;
-       return NULL;
-}
-
-static void mce_clear_info(struct mce_info *mi)
-{
-       atomic_set(&mi->inuse, 0);
-}
-
  /*
   * The actual machine check handler. This only handles real
   * exceptions when something got corrupted coming in through int 18.
@@ -1063,6 +1019,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
  {
         struct mca_config *cfg = &mca_cfg;
         struct mce m, *final;
+       enum ctx_state prev_state;
         int i;
         int worst = 0;
         int severity;
@@ -1084,6 +1041,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         DECLARE_BITMAP(toclear, MAX_NR_BANKS);
         DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
         char *msg = "Unknown";
+       u64 recover_paddr = ~0ull;
+       int flags = MF_ACTION_REQUIRED;
+
+       prev_state = ist_enter(regs);
  
         this_cpu_inc(mce_exception_count);
  
@@ -1203,9 +1164,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
                 if (no_way_out)
                         mce_panic("Fatal machine check on current CPU", &m, msg);
                 if (worst == MCE_AR_SEVERITY) {
-                       /* schedule action before return to userland */
-                       mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
-                       set_thread_flag(TIF_MCE_NOTIFY);
+                       recover_paddr = m.addr;
+                       if (!(m.mcgstatus & MCG_STATUS_RIPV))
+                               flags |= MF_MUST_KILL;
                 } else if (kill_it) {
                         force_sig(SIGBUS, current);
                 }
@@ -1216,6 +1177,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
         mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out:
         sync_core();
+
+       if (recover_paddr == ~0ull)
+               goto done;
+
+       pr_err("Uncorrected hardware memory error in user-access at %llx",
+                recover_paddr);
+       /*
+        * We must call memory_failure() here even if the current process is
+        * doomed. We still need to mark the page as poisoned and alert any
+        * other users of the page.
+        */
+       ist_begin_non_atomic(regs);
+       local_irq_enable();
+       if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
+               pr_err("Memory error not recovered");
+               force_sig(SIGBUS, current);
+       }
+       local_irq_disable();
+       ist_end_non_atomic();
+done:
+       ist_exit(regs, prev_state);
  }
  EXPORT_SYMBOL_GPL(do_machine_check);
  
@@ -1232,42 +1214,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
  }
  #endif
  
-/*
- * Called in process context that interrupted by MCE and marked with
- * TIF_MCE_NOTIFY, just before returning to erroneous userland.
- * This code is allowed to sleep.
- * Attempt possible recovery such as calling the high level VM handler to
- * process any corrupted pages, and kill/signal current process if required.
- * Action required errors are handled here.
- */
-void mce_notify_process(void)
-{
-       unsigned long pfn;
-       struct mce_info *mi = mce_find_info();
-       int flags = MF_ACTION_REQUIRED;
-
-       if (!mi)
-               mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
-       pfn = mi->paddr >> PAGE_SHIFT;
-
-       clear_thread_flag(TIF_MCE_NOTIFY);
-
-       pr_err("Uncorrected hardware memory error in user-access at %llx",
-                mi->paddr);
-       /*
-        * We must call memory_failure() here even if the current process is
-        * doomed. We still need to mark the page as poisoned and alert any
-        * other users of the page.
-        */
-       if (!mi->restartable)
-               flags |= MF_MUST_KILL;
-       if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
-               pr_err("Memory error not recovered");
-               force_sig(SIGBUS, current);
-       }
-       mce_clear_info(mi);
-}
-
  /*
   * Action optional processing happens here (picking up
   * from the list of faulting pages that do_machine_check()
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c

index a304298..ec2663a 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,7 @@
  #include <linux/smp.h>
  
  #include <asm/processor.h>
+#include <asm/traps.h>
  #include <asm/mce.h>
  #include <asm/msr.h>
  
@@ -17,8 +18,11 @@ int mce_p5_enabled __read_mostly;
  /* Machine check handler for Pentium class Intel CPUs: */
  static void pentium_machine_check(struct pt_regs *regs, long error_code)
  {
+       enum ctx_state prev_state;
         u32 loaddr, hi, lotype;
  
+       prev_state = ist_enter(regs);
+
         rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
         rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
  
@@ -33,6 +37,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
         }
  
         add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+       ist_exit(regs, prev_state);
  }
  
  /* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c

index 7dc5564..bd5d46a 100644 (file)
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,14 +7,19 @@
  #include <linux/types.h>
  
  #include <asm/processor.h>
+#include <asm/traps.h>
  #include <asm/mce.h>
  #include <asm/msr.h>
  
  /* Machine check handler for WinChip C6: */
  static void winchip_machine_check(struct pt_regs *regs, long error_code)
  {
+       enum ctx_state prev_state = ist_enter(regs);
+
         printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
         add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+       ist_exit(regs, prev_state);
  }
  
  /* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S

index 9ebaf63..db13655 100644 (file)
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64)
         movq \tmp,RSP+\offset(%rsp)
         movq $__USER_DS,SS+\offset(%rsp)
         movq $__USER_CS,CS+\offset(%rsp)
-       movq $-1,RCX+\offset(%rsp)
+       movq RIP+\offset(%rsp),\tmp  /* get rip */
+       movq \tmp,RCX+\offset(%rsp)  /* copy it to rcx as sysret would do */
         movq R11+\offset(%rsp),\tmp  /* get eflags */
         movq \tmp,EFLAGS+\offset(%rsp)
         .endm
@@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64)
         movq \tmp,R11+\offset(%rsp)
         .endm
  
-       .macro FAKE_STACK_FRAME child_rip
-       /* push in order ss, rsp, eflags, cs, rip */
-       xorl %eax, %eax
-       pushq_cfi $__KERNEL_DS /* ss */
-       /*CFI_REL_OFFSET        ss,0*/
-       pushq_cfi %rax /* rsp */
-       CFI_REL_OFFSET  rsp,0
-       pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
-       /*CFI_REL_OFFSET        rflags,0*/
-       pushq_cfi $__KERNEL_CS /* cs */
-       /*CFI_REL_OFFSET        cs,0*/
-       pushq_cfi \child_rip /* rip */
-       CFI_REL_OFFSET  rip,0
-       pushq_cfi %rax /* orig rax */
-       .endm
-
-       .macro UNFAKE_STACK_FRAME
-       addq $8*6, %rsp
-       CFI_ADJUST_CFA_OFFSET   -(6*8)
-       .endm
-
  /*
   * initial frame state for interrupts (and exceptions without error code)
   */
@@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
         CFI_REL_OFFSET r15, R15+\offset
         .endm
  
-/* save partial stack frame */
-       .macro SAVE_ARGS_IRQ
-       cld
-       /* start from rbp in pt_regs and jump over */
-       movq_cfi rdi, (RDI-RBP)
-       movq_cfi rsi, (RSI-RBP)
-       movq_cfi rdx, (RDX-RBP)
-       movq_cfi rcx, (RCX-RBP)
-       movq_cfi rax, (RAX-RBP)
-       movq_cfi  r8,  (R8-RBP)
-       movq_cfi  r9,  (R9-RBP)
-       movq_cfi r10, (R10-RBP)
-       movq_cfi r11, (R11-RBP)
-
-       /* Save rbp so that we can unwind from get_irq_regs() */
-       movq_cfi rbp, 0
-
-       /* Save previous stack value */
-       movq %rsp, %rsi
-
-       leaq -RBP(%rsp),%rdi    /* arg1 for handler */
-       testl $3, CS-RBP(%rsi)
-       je 1f
-       SWAPGS
-       /*
-        * irq_count is used to check if a CPU is already on an interrupt stack
-        * or not. While this is essentially redundant with preempt_count it is
-        * a little cheaper to use a separate counter in the PDA (short of
-        * moving irq_enter into assembly, which would be too much work)
-        */
-1:     incl PER_CPU_VAR(irq_count)
-       cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
-       CFI_DEF_CFA_REGISTER    rsi
-
-       /* Store previous stack value */
-       pushq %rsi
-       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
-                       0x77 /* DW_OP_breg7 */, 0, \
-                       0x06 /* DW_OP_deref */, \
-                       0x08 /* DW_OP_const1u */, SS+8-RBP, \
-                       0x22 /* DW_OP_plus */
-       /* We entered an interrupt context - irqs are off: */
-       TRACE_IRQS_OFF
-       .endm
-
  ENTRY(save_paranoid)
         XCPT_FRAME 1 RDI+8
         cld
@@ -426,15 +361,12 @@ system_call_fastpath:
   * Has incomplete stack frame and undefined top of stack.
   */
  ret_from_sys_call:
-       movl $_TIF_ALLWORK_MASK,%edi
-       /* edi: flagmask */
-sysret_check:
+       testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+       jnz int_ret_from_sys_call_fixup /* Go the the slow path */
+
         LOCKDEP_SYS_EXIT
         DISABLE_INTERRUPTS(CLBR_NONE)
         TRACE_IRQS_OFF
-       movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
-       andl %edi,%edx
-       jnz  sysret_careful
         CFI_REMEMBER_STATE
         /*
          * sysretq will re-enable interrupts:
@@ -448,49 +380,10 @@ sysret_check:
         USERGS_SYSRET64
  
         CFI_RESTORE_STATE
-       /* Handle reschedules */
-       /* edx: work, edi: workmask */
-sysret_careful:
-       bt $TIF_NEED_RESCHED,%edx
-       jnc sysret_signal
-       TRACE_IRQS_ON
-       ENABLE_INTERRUPTS(CLBR_NONE)
-       pushq_cfi %rdi
-       SCHEDULE_USER
-       popq_cfi %rdi
-       jmp sysret_check
  
-       /* Handle a signal */
-sysret_signal:
-       TRACE_IRQS_ON
-       ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
-       bt $TIF_SYSCALL_AUDIT,%edx
-       jc sysret_audit
-#endif
-       /*
-        * We have a signal, or exit tracing or single-step.
-        * These all wind up with the iret return path anyway,
-        * so just join that path right now.
-        */
+int_ret_from_sys_call_fixup:
         FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-       jmp int_check_syscall_exit_work
-
-#ifdef CONFIG_AUDITSYSCALL
-       /*
-        * Return fast path for syscall audit.  Call __audit_syscall_exit()
-        * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
-        * masked off.
-        */
-sysret_audit:
-       movq RAX-ARGOFFSET(%rsp),%rsi   /* second arg, syscall return value */
-       cmpq $-MAX_ERRNO,%rsi   /* is it < -MAX_ERRNO? */
-       setbe %al               /* 1 if so, 0 if not */
-       movzbl %al,%edi         /* zero-extend that into %edi */
-       call __audit_syscall_exit
-       movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-       jmp sysret_check
-#endif /* CONFIG_AUDITSYSCALL */
+       jmp int_ret_from_sys_call
  
         /* Do syscall tracing */
  tracesys:
@@ -626,19 +519,6 @@ END(\label)
         FORK_LIKE  vfork
         FIXED_FRAME stub_iopl, sys_iopl
  
-ENTRY(ptregscall_common)
-       DEFAULT_FRAME 1 8       /* offset 8: return address */
-       RESTORE_TOP_OF_STACK %r11, 8
-       movq_cfi_restore R15+8, r15
-       movq_cfi_restore R14+8, r14
-       movq_cfi_restore R13+8, r13
-       movq_cfi_restore R12+8, r12
-       movq_cfi_restore RBP+8, rbp
-       movq_cfi_restore RBX+8, rbx
-       ret $REST_SKIP          /* pop extended registers */
-       CFI_ENDPROC
-END(ptregscall_common)
-
  ENTRY(stub_execve)
         CFI_STARTPROC
         addq $8, %rsp
@@ -779,7 +659,48 @@ END(interrupt)
         /* reserve pt_regs for scratch regs and rbp */
         subq $ORIG_RAX-RBP, %rsp
         CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
-       SAVE_ARGS_IRQ
+       cld
+       /* start from rbp in pt_regs and jump over */
+       movq_cfi rdi, (RDI-RBP)
+       movq_cfi rsi, (RSI-RBP)
+       movq_cfi rdx, (RDX-RBP)
+       movq_cfi rcx, (RCX-RBP)
+       movq_cfi rax, (RAX-RBP)
+       movq_cfi  r8,  (R8-RBP)
+       movq_cfi  r9,  (R9-RBP)
+       movq_cfi r10, (R10-RBP)
+       movq_cfi r11, (R11-RBP)
+
+       /* Save rbp so that we can unwind from get_irq_regs() */
+       movq_cfi rbp, 0
+
+       /* Save previous stack value */
+       movq %rsp, %rsi
+
+       leaq -RBP(%rsp),%rdi    /* arg1 for handler */
+       testl $3, CS-RBP(%rsi)
+       je 1f
+       SWAPGS
+       /*
+        * irq_count is used to check if a CPU is already on an interrupt stack
+        * or not. While this is essentially redundant with preempt_count it is
+        * a little cheaper to use a separate counter in the PDA (short of
+        * moving irq_enter into assembly, which would be too much work)
+        */
+1:     incl PER_CPU_VAR(irq_count)
+       cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+       CFI_DEF_CFA_REGISTER    rsi
+
+       /* Store previous stack value */
+       pushq %rsi
+       CFI_ESCAPE      0x0f /* DW_CFA_def_cfa_expression */, 6, \
+                       0x77 /* DW_OP_breg7 */, 0, \
+                       0x06 /* DW_OP_deref */, \
+                       0x08 /* DW_OP_const1u */, SS+8-RBP, \
+                       0x22 /* DW_OP_plus */
+       /* We entered an interrupt context - irqs are off: */
+       TRACE_IRQS_OFF
+
         call \func
         .endm
  
@@ -831,6 +752,60 @@ retint_swapgs:             /* return to user-space */
          */
         DISABLE_INTERRUPTS(CLBR_ANY)
         TRACE_IRQS_IRETQ
+
+       /*
+        * Try to use SYSRET instead of IRET if we're returning to
+        * a completely clean 64-bit userspace context.
+        */
+       movq (RCX-R11)(%rsp), %rcx
+       cmpq %rcx,(RIP-R11)(%rsp)               /* RCX == RIP */
+       jne opportunistic_sysret_failed
+
+       /*
+        * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+        * in kernel space.  This essentially lets the user take over
+        * the kernel, since userspace controls RSP.  It's not worth
+        * testing for canonicalness exactly -- this check detects any
+        * of the 17 high bits set, which is true for non-canonical
+        * or kernel addresses.  (This will pessimize vsyscall=native.
+        * Big deal.)
+        *
+        * If virtual addresses ever become wider, this will need
+        * to be updated to remain correct on both old and new CPUs.
+        */
+       .ifne __VIRTUAL_MASK_SHIFT - 47
+       .error "virtual address width changed -- sysret checks need update"
+       .endif
+       shr $__VIRTUAL_MASK_SHIFT, %rcx
+       jnz opportunistic_sysret_failed
+
+       cmpq $__USER_CS,(CS-R11)(%rsp)          /* CS must match SYSRET */
+       jne opportunistic_sysret_failed
+
+       movq (R11-ARGOFFSET)(%rsp), %r11
+       cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)      /* R11 == RFLAGS */
+       jne opportunistic_sysret_failed
+
+       testq $X86_EFLAGS_RF,%r11               /* sysret can't restore RF */
+       jnz opportunistic_sysret_failed
+
+       /* nothing to check for RSP */
+
+       cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)    /* SS must match SYSRET */
+       jne opportunistic_sysret_failed
+
+       /*
+        * We win!  This label is here just for ease of understanding
+        * perf profiles.  Nothing jumps here.
+        */
+irq_return_via_sysret:
+       CFI_REMEMBER_STATE
+       RESTORE_ARGS 1,8,1
+       movq (RSP-RIP)(%rsp),%rsp
+       USERGS_SYSRET64
+       CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
         SWAPGS
         jmp restore_args
  
@@ -1048,6 +1023,11 @@ ENTRY(\sym)
         CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
  
         .if \paranoid
+       .if \paranoid == 1
+       CFI_REMEMBER_STATE
+       testl $3, CS(%rsp)              /* If coming from userspace, switch */
+       jnz 1f                          /* stacks. */
+       .endif
         call save_paranoid
         .else
         call error_entry
@@ -1088,6 +1068,36 @@ ENTRY(\sym)
         jmp error_exit                  /* %ebx: no swapgs flag */
         .endif
  
+       .if \paranoid == 1
+       CFI_RESTORE_STATE
+       /*
+        * Paranoid entry from userspace.  Switch stacks and treat it
+        * as a normal entry.  This means that paranoid handlers
+        * run in real process context if user_mode(regs).
+        */
+1:
+       call error_entry
+
+       DEFAULT_FRAME 0
+
+       movq %rsp,%rdi                  /* pt_regs pointer */
+       call sync_regs
+       movq %rax,%rsp                  /* switch stack */
+
+       movq %rsp,%rdi                  /* pt_regs pointer */
+
+       .if \has_error_code
+       movq ORIG_RAX(%rsp),%rsi        /* get error code */
+       movq $-1,ORIG_RAX(%rsp)         /* no syscall to restart */
+       .else
+       xorl %esi,%esi                  /* no error code */
+       .endif
+
+       call \do_sym
+
+       jmp error_exit                  /* %ebx: no swapgs flag */
+       .endif
+
         CFI_ENDPROC
  END(\sym)
  .endm
@@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0
  idtentry bounds do_bounds has_error_code=0
  idtentry invalid_op do_invalid_op has_error_code=0
  idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
  idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
  idtentry invalid_TSS do_invalid_TSS has_error_code=1
  idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1289,16 +1299,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
  #endif
  
         /*
-        * "Paranoid" exit path from exception stack.
-        * Paranoid because this is used by NMIs and cannot take
-        * any kernel state for granted.
-        * We don't do kernel preemption checks here, because only
-        * NMI should be common and it does not enable IRQs and
-        * cannot get reschedule ticks.
+        * "Paranoid" exit path from exception stack.  This is invoked
+        * only on return from non-NMI IST interrupts that came
+        * from kernel space.
          *
-        * "trace" is 0 for the NMI handler only, because irq-tracing
-        * is fundamentally NMI-unsafe. (we cannot change the soft and
-        * hard flags at once, atomically)
+        * We may be returning to very strange contexts (e.g. very early
+        * in syscall entry), so checking for preemption here would
+        * be complicated.  Fortunately, we there's no good reason
+        * to try to handle preemption here.
          */
  
         /* ebx: no swapgs flag */
@@ -1308,43 +1316,14 @@ ENTRY(paranoid_exit)
         TRACE_IRQS_OFF_DEBUG
         testl %ebx,%ebx                         /* swapgs needed? */
         jnz paranoid_restore
-       testl $3,CS(%rsp)
-       jnz   paranoid_userspace
-paranoid_swapgs:
         TRACE_IRQS_IRETQ 0
         SWAPGS_UNSAFE_STACK
         RESTORE_ALL 8
-       jmp irq_return
+       INTERRUPT_RETURN
  paranoid_restore:
         TRACE_IRQS_IRETQ_DEBUG 0
         RESTORE_ALL 8
-       jmp irq_return
-paranoid_userspace:
-       GET_THREAD_INFO(%rcx)
-       movl TI_flags(%rcx),%ebx
-       andl $_TIF_WORK_MASK,%ebx
-       jz paranoid_swapgs
-       movq %rsp,%rdi                  /* &pt_regs */
-       call sync_regs
-       movq %rax,%rsp                  /* switch stack for scheduling */
-       testl $_TIF_NEED_RESCHED,%ebx
-       jnz paranoid_schedule
-       movl %ebx,%edx                  /* arg3: thread flags */
-       TRACE_IRQS_ON
-       ENABLE_INTERRUPTS(CLBR_NONE)
-       xorl %esi,%esi                  /* arg2: oldset */
-       movq %rsp,%rdi                  /* arg1: &pt_regs */
-       call do_notify_resume
-       DISABLE_INTERRUPTS(CLBR_NONE)
-       TRACE_IRQS_OFF
-       jmp paranoid_userspace
-paranoid_schedule:
-       TRACE_IRQS_ON
-       ENABLE_INTERRUPTS(CLBR_ANY)
-       SCHEDULE_USER
-       DISABLE_INTERRUPTS(CLBR_ANY)
-       TRACE_IRQS_OFF
-       jmp paranoid_userspace
+       INTERRUPT_RETURN
         CFI_ENDPROC
  END(paranoid_exit)
  
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c

index 63ce838..28d28f5 100644 (file)
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
                      : "memory", "cc", "edx", "ecx", "eax");
  }
  
-/* how to get the current stack pointer from C */
-#define current_stack_pointer ({               \
-       unsigned long sp;                       \
-       asm("mov %%esp,%0" : "=g" (sp));        \
-       sp;                                     \
-})
-
  static inline void *current_stack(void)
  {
-       return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+       return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
  }
  
  static inline int
@@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
  
         /* Save the next esp at the bottom of the stack */
         prev_esp = (u32 *)irqstk;
-       *prev_esp = current_stack_pointer;
+       *prev_esp = current_stack_pointer();
  
         if (unlikely(overflow))
                 call_on_stack(print_stack_overflow, isp);
@@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
  
         /* Push the previous esp onto the stack */
         prev_esp = (u32 *)irqstk;
-       *prev_esp = current_stack_pointer;
+       *prev_esp = current_stack_pointer();
  
         call_on_stack(__do_softirq, isp);
  }
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c

index ed37a76..2a33c8f 100644 (file)
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
  {
         user_exit();
  
-#ifdef CONFIG_X86_MCE
-       /* notify userspace of pending MCEs */
-       if (thread_info_flags & _TIF_MCE_NOTIFY)
-               mce_notify_process();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
         if (thread_info_flags & _TIF_UPROBE)
                 uprobe_notify_resume(regs);
  
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c

index 88900e2..c74f2f5 100644 (file)
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
         preempt_count_dec();
  }
  
+enum ctx_state ist_enter(struct pt_regs *regs)
+{
+       enum ctx_state prev_state;
+
+       if (user_mode_vm(regs)) {
+               /* Other than that, we're just an exception. */
+               prev_state = exception_enter();
+       } else {
+               /*
+                * We might have interrupted pretty much anything.  In
+                * fact, if we're a machine check, we can even interrupt
+                * NMI processing.  We don't want in_nmi() to return true,
+                * but we need to notify RCU.
+                */
+               rcu_nmi_enter();
+               prev_state = IN_KERNEL;  /* the value is irrelevant. */
+       }
+
+       /*
+        * We are atomic because we're on the IST stack (or we're on x86_32,
+        * in which case we still shouldn't schedule).
+        *
+        * This must be after exception_enter(), because exception_enter()
+        * won't do anything if in_interrupt() returns true.
+        */
+       preempt_count_add(HARDIRQ_OFFSET);
+
+       /* This code is a bit fragile.  Test it. */
+       rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+
+       return prev_state;
+}
+
+void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+{
+       /* Must be before exception_exit. */
+       preempt_count_sub(HARDIRQ_OFFSET);
+
+       if (user_mode_vm(regs))
+               return exception_exit(prev_state);
+       else
+               rcu_nmi_exit();
+}
+
+/**
+ * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
+ * @regs:      regs passed to the IST exception handler
+ *
+ * IST exception handlers normally cannot schedule.  As a special
+ * exception, if the exception interrupted userspace code (i.e.
+ * user_mode_vm(regs) would return true) and the exception was not
+ * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
+ * begins a non-atomic section within an ist_enter()/ist_exit() region.
+ * Callers are responsible for enabling interrupts themselves inside
+ * the non-atomic section, and callers must call is_end_non_atomic()
+ * before ist_exit().
+ */
+void ist_begin_non_atomic(struct pt_regs *regs)
+{
+       BUG_ON(!user_mode_vm(regs));
+
+       /*
+        * Sanity check: we need to be on the normal thread stack.  This
+        * will catch asm bugs and any attempt to use ist_preempt_enable
+        * from double_fault.
+        */
+       BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
+               & ~(THREAD_SIZE - 1)) != 0);
+
+       preempt_count_sub(HARDIRQ_OFFSET);
+}
+
+/**
+ * ist_end_non_atomic() - begin a non-atomic section in an IST exception
+ *
+ * Ends a non-atomic section started with ist_begin_non_atomic().
+ */
+void ist_end_non_atomic(void)
+{
+       preempt_count_add(HARDIRQ_OFFSET);
+}
+
  static nokprobe_inline int
  do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
                   struct pt_regs *regs, long error_code)
@@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
          * end up promoting it to a doublefault.  In that case, modify
          * the stack to make it look like we just entered the #GP
          * handler from user space, similar to bad_iret.
+        *
+        * No need for ist_enter here because we don't use RCU.
          */
         if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
                 regs->cs == __KERNEL_CS &&
@@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
                 normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
                 regs->ip = (unsigned long)general_protection;
                 regs->sp = (unsigned long)&normal_regs->orig_ax;
+
                 return;
         }
  #endif
  
-       exception_enter();
-       /* Return not checked because double check cannot be ignored */
+       ist_enter(regs);  /* Discard prev_state because we won't return. */
         notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
  
         tsk->thread.error_code = error_code;
@@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
         if (poke_int3_handler(regs))
                 return;
  
-       prev_state = exception_enter();
+       prev_state = ist_enter(regs);
  #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
         if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
                                 SIGTRAP) == NOTIFY_STOP)
@@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
         preempt_conditional_cli(regs);
         debug_stack_usage_dec();
  exit:
-       exception_exit(prev_state);
+       ist_exit(regs, prev_state);
  }
  NOKPROBE_SYMBOL(do_int3);
  
  #ifdef CONFIG_X86_64
  /*
- * Help handler running on IST stack to switch back to user stack
- * for scheduling or signal handling. The actual stack switch is done in
- * entry.S
+ * Help handler running on IST stack to switch off the IST stack if the
+ * interrupted code was in user mode. The actual stack switch is done in
+ * entry_64.S
   */
  asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
  {
-       struct pt_regs *regs = eregs;
-       /* Did already sync */
-       if (eregs == (struct pt_regs *)eregs->sp)
-               ;
-       /* Exception from user space */
-       else if (user_mode(eregs))
-               regs = task_pt_regs(current);
-       /*
-        * Exception from kernel and interrupts are enabled. Move to
-        * kernel process stack.
-        */
-       else if (eregs->flags & X86_EFLAGS_IF)
-               regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
-       if (eregs != regs)
-               *regs = *eregs;
+       struct pt_regs *regs = task_pt_regs(current);
+       *regs = *eregs;
         return regs;
  }
  NOKPROBE_SYMBOL(sync_regs);
@@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
         unsigned long dr6;
         int si_code;
  
-       prev_state = exception_enter();
+       prev_state = ist_enter(regs);
  
         get_debugreg(dr6, 6);
  
@@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
         debug_stack_usage_dec();
  
  exit:
-       exception_exit(prev_state);
+       ist_exit(regs, prev_state);
  }
  NOKPROBE_SYMBOL(do_debug);
  
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile

index 5a4affe..09297c8 100644 (file)
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -205,4 +205,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
  PHONY += vdso_install $(vdso_img_insttargets)
  vdso_install: $(vdso_img_insttargets) FORCE
  
-clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
+clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64*
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 10 Feb 2015 01:16:44 +0000 (17:16 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 10 Feb 2015 01:16:44 +0000 (17:16 -0800)
Documentation/x86/entry_64.txt		patch \| blob \| history
Documentation/x86/x86_64/kernel-stacks		patch \| blob \| history
arch/x86/ia32/ia32entry.S		patch \| blob \| history
arch/x86/include/asm/calling.h		patch \| blob \| history
arch/x86/include/asm/mce.h		patch \| blob \| history
arch/x86/include/asm/thread_info.h		patch \| blob \| history
arch/x86/include/asm/traps.h		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/mce.c		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/p5.c		patch \| blob \| history
arch/x86/kernel/cpu/mcheck/winchip.c		patch \| blob \| history
arch/x86/kernel/entry_64.S		patch \| blob \| history
arch/x86/kernel/irq_32.c		patch \| blob \| history
arch/x86/kernel/signal.c		patch \| blob \| history
arch/x86/kernel/traps.c		patch \| blob \| history
arch/x86/vdso/Makefile		patch \| blob \| history