x86/vdso: Only define prctl_map_vdso() if CONFIG_CHECKPOINT_RESTORE
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6cbab31..b26a009 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,7 @@
 #include <linux/user.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/ptrace.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
@@ -49,6 +49,7 @@
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
 #include <asm/xen/hypervisor.h>
+#include <asm/vdso.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -136,25 +137,6 @@ void release_thread(struct task_struct *dead_task)
        }
 }
 
-static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
-{
-       struct user_desc ud = {
-               .base_addr = addr,
-               .limit = 0xfffff,
-               .seg_32bit = 1,
-               .limit_in_pages = 1,
-               .useable = 1,
-       };
-       struct desc_struct *desc = t->thread.tls_array;
-       desc += tls;
-       fill_ldt(desc, &ud);
-}
-
-static inline u32 read_32bit_tls(struct task_struct *t, int tls)
-{
-       return get_desc_base(&t->thread.tls_array[tls]);
-}
-
 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
                unsigned long arg, struct task_struct *p, unsigned long tls)
 {
@@ -169,9 +151,9 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
        p->thread.io_bitmap_ptr = NULL;
 
        savesegment(gs, p->thread.gsindex);
-       p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
+       p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
        savesegment(fs, p->thread.fsindex);
-       p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
+       p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
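(The inheritance rule above: a nonzero saved selector means the base will be defined by the GDT/LDT entry on reload, so the cached MSR base is cleared; only a zero selector inherits the parent's MSR base. For reference, a sketch of the renamed x86-64 fields in thread_struct after this series, surrounding members elided; see arch/x86/include/asm/processor.h:)

  struct thread_struct {
          /* ... */
          unsigned short	es;
          unsigned short	ds;
          unsigned short	fsindex;
          unsigned short	gsindex;
          /* ... */
          unsigned long	fsbase;		/* was: fs */
          unsigned long	gsbase;		/* was: gs */
          /* ... */
  };
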
@@ -210,7 +192,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
         */
        if (clone_flags & CLONE_SETTLS) {
 #ifdef CONFIG_IA32_EMULATION
-               if (is_ia32_task())
+               if (in_ia32_syscall())
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)tls, 0);
                else
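(is_ia32_task() was renamed because it describes the current syscall, not an innate property of the task: a 64-bit task can make 32-bit syscalls and vice versa. Roughly, the helper reads as follows, sketched from <asm/thread_info.h> of this era:)

  static inline bool in_ia32_syscall(void)
  {
  #ifdef CONFIG_X86_32
          return true;
  #endif
  #ifdef CONFIG_IA32_EMULATION
          if (current_thread_info()->status & TS_COMPAT)
                  return true;
  #endif
          return false;
  }
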
@@ -282,7 +264,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-       unsigned fsindex, gsindex;
+       unsigned prev_fsindex, prev_gsindex;
        fpu_switch_t fpu_switch;
 
        fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
@@ -292,8 +274,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         *
         * (e.g. xen_load_tls())
         */
-       savesegment(fs, fsindex);
-       savesegment(gs, gsindex);
+       savesegment(fs, prev_fsindex);
+       savesegment(gs, prev_gsindex);
 
        /*
         * Load TLS before restoring any segments so that segment loads
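(The prev_ prefix makes it explicit that these hold the outgoing task's selectors, snapshotted before any segment register is touched. Note that savesegment() only reads the visible selector, never the hidden base; the macro from <asm/segment.h> is a single register move:)

  #define savesegment(seg, value)				\
          asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")
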
@@ -336,66 +318,104 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         * Switch FS and GS.
         *
         * These are even more complicated than DS and ES: they have
-        * 64-bit bases are that controlled by arch_prctl.  Those bases
-        * only differ from the values in the GDT or LDT if the selector
-        * is 0.
+        * 64-bit bases that are controlled by arch_prctl.  The bases
+        * don't necessarily match the selectors, as user code can do
+        * any number of things to cause them to be inconsistent.
         *
-        * Loading the segment register resets the hidden base part of
-        * the register to 0 or the value from the GDT / LDT.  If the
-        * next base address zero, writing 0 to the segment register is
-        * much faster than using wrmsr to explicitly zero the base.
+        * We don't promise to preserve the bases if the selectors are
+        * nonzero.  We also don't promise to preserve the base if the
+        * selector is zero and the base doesn't match whatever was
+        * most recently passed to ARCH_SET_FS/GS.  (If/when the
+        * FSGSBASE instructions are enabled, we'll need to offer
+        * stronger guarantees.)
         *
-        * The thread_struct.fs and thread_struct.gs values are 0
-        * if the fs and gs bases respectively are not overridden
-        * from the values implied by fsindex and gsindex.  They
-        * are nonzero, and store the nonzero base addresses, if
-        * the bases are overridden.
-        *
-        * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
-        * be impossible.
-        *
-        * Therefore we need to reload the segment registers if either
-        * the old or new selector is nonzero, and we need to override
-        * the base address if next thread expects it to be overridden.
-        *
-        * This code is unnecessarily slow in the case where the old and
-        * new indexes are zero and the new base is nonzero -- it will
-        * unnecessarily write 0 to the selector before writing the new
-        * base address.
-        *
-        * Note: This all depends on arch_prctl being the only way that
-        * user code can override the segment base.  Once wrfsbase and
-        * wrgsbase are enabled, most of this code will need to change.
+        * As an invariant,
+        * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
+        * impossible.
         */
-       if (unlikely(fsindex | next->fsindex | prev->fs)) {
+       if (next->fsindex) {
+               /* Loading a nonzero value into FS sets the index and base. */
                loadsegment(fs, next->fsindex);
-
-               /*
-                * If user code wrote a nonzero value to FS, then it also
-                * cleared the overridden base address.
-                *
-                * XXX: if user code wrote 0 to FS and cleared the base
-                * address itself, we won't notice and we'll incorrectly
-                * restore the prior base address next time we reschdule
-                * the process.
-                */
-               if (fsindex)
-                       prev->fs = 0;
+       } else {
+               if (next->fsbase) {
+                       /* Next index is zero but next base is nonzero. */
+                       if (prev_fsindex)
+                               loadsegment(fs, 0);
+                       wrmsrl(MSR_FS_BASE, next->fsbase);
+               } else {
+                       /* Next base and index are both zero. */
+                       if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+                               /*
+                                * We don't know the previous base and can't
+                                * find out without RDMSR.  Forcibly clear it.
+                                */
+                               loadsegment(fs, __USER_DS);
+                               loadsegment(fs, 0);
+                       } else {
+                               /*
+                                * If the previous index is zero and ARCH_SET_FS
+                                * didn't change the base, then the base is
+                                * also zero and we don't need to do anything.
+                                */
+                               if (prev->fsbase || prev_fsindex)
+                                       loadsegment(fs, 0);
+                       }
+               }
        }
-       if (next->fs)
-               wrmsrl(MSR_FS_BASE, next->fs);
-       prev->fsindex = fsindex;
+       /*
+        * Save the old state and preserve the invariant.
+        * NB: if prev_fsindex == 0, then we can't reliably learn the base
+        * without RDMSR because Intel user code can zero it without telling
+        * us and AMD user code can program any 32-bit value without telling
+        * us.
+        */
+       if (prev_fsindex)
+               prev->fsbase = 0;
+       prev->fsindex = prev_fsindex;
 
-       if (unlikely(gsindex | next->gsindex | prev->gs)) {
+       if (next->gsindex) {
+               /* Loading a nonzero value into GS sets the index and base. */
                load_gs_index(next->gsindex);
-
-               /* This works (and fails) the same way as fsindex above. */
-               if (gsindex)
-                       prev->gs = 0;
+       } else {
+               if (next->gsbase) {
+                       /* Next index is zero but next base is nonzero. */
+                       if (prev_gsindex)
+                               load_gs_index(0);
+                       wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
+               } else {
+                       /* Next base and index are both zero. */
+                       if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+                               /*
+                                * We don't know the previous base and can't
+                                * find out without RDMSR.  Forcibly clear it.
+                                *
+                                * This contains a pointless SWAPGS pair.
+                                * Fixing it would involve an explicit check
+                                * for Xen or a new pvop.
+                                */
+                               load_gs_index(__USER_DS);
+                               load_gs_index(0);
+                       } else {
+                               /*
+                                * If the previous index is zero and ARCH_SET_GS
+                                * didn't change the base, then the base is
+                                * also zero and we don't need to do anything.
+                                */
+                               if (prev->gsbase || prev_gsindex)
+                                       load_gs_index(0);
+                       }
+               }
        }
-       if (next->gs)
-               wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
-       prev->gsindex = gsindex;
+       /*
+        * Save the old state and preserve the invariant.
+        * NB: if prev_gsindex == 0, then we can't reliably learn the base
+        * without RDMSR because Intel user code can zero it without telling
+        * us and AMD user code can program any 32-bit value without telling
+        * us.
+        */
+       if (prev_gsindex)
+               prev->gsbase = 0;
+       prev->gsindex = prev_gsindex;
 
        switch_fpu_finish(next_fpu, fpu_switch);
 
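(The new comment's point that user code can desynchronize selector and base is easy to demonstrate from userspace. A minimal sketch, assuming the ARCH_SET_GS/ARCH_GET_GS values from asm/prctl.h; the result is CPU-dependent: parts that clear the hidden base on a null selector load print 0, parts with the null-segment quirk called X86_BUG_NULL_SEG above report the stale base:)

  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #define ARCH_SET_GS	0x1001
  #define ARCH_GET_GS	0x1004

  int main(void)
  {
          unsigned long base = 0;

          /* Install a GS base via arch_prctl; the selector stays 0. */
          syscall(SYS_arch_prctl, ARCH_SET_GS, 0x1234000UL);

          /* Reload the null selector behind the kernel's back. */
          asm volatile("mov %0, %%gs" : : "r" (0));

          syscall(SYS_arch_prctl, ARCH_GET_GS, &base);
          printf("gs base after null selector load: 0x%lx\n", base);
          return 0;
  }
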
@@ -505,6 +525,19 @@ void set_personality_ia32(bool x32)
 }
 EXPORT_SYMBOL_GPL(set_personality_ia32);
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+{
+       int ret;
+
+       ret = map_vdso_once(image, addr);
+       if (ret)
+               return ret;
+
+       return (long)image->size;
+}
+#endif
+
 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 {
        int ret = 0;
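(map_vdso_once() refuses to install a second vDSO blob, so the intended callers, checkpoint/restore tools such as CRIU, first unmap the existing [vdso] and [vvar] areas. A rough, heavily simplified userspace sketch; ARCH_MAP_VDSO_64 (0x2003) is from asm/prctl.h, the hint address is arbitrary, and nothing vDSO-backed may run while the blob is gone:)

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #ifndef ARCH_MAP_VDSO_64
  # define ARCH_MAP_VDSO_64 0x2003	/* from asm/prctl.h */
  #endif

  /* munmap every mapping whose /proc/self/maps line mentions @tag. */
  static void drop_mapping(const char *tag)
  {
          unsigned long start, end;
          char line[256];
          FILE *f = fopen("/proc/self/maps", "r");

          while (f && fgets(line, sizeof(line), f))
                  if (strstr(line, tag) &&
                      sscanf(line, "%lx-%lx", &start, &end) == 2)
                          munmap((void *)start, end - start);
          if (f)
                  fclose(f);
  }

  int main(void)
  {
          long ret;

          drop_mapping("[vdso]");
          drop_mapping("[vvar]");

          /* On success the vDSO image size comes back. */
          ret = syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, 0x70000000000UL);
          printf("arch_prctl(ARCH_MAP_VDSO_64) = %ld\n", ret);
          return ret > 0 ? 0 : 1;
  }
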
@@ -513,85 +546,64 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 
        switch (code) {
        case ARCH_SET_GS:
-               if (addr >= TASK_SIZE_OF(task))
+               if (addr >= TASK_SIZE_MAX)
                        return -EPERM;
                cpu = get_cpu();
-               /* handle small bases via the GDT because that's faster to
-                  switch. */
-               if (addr <= 0xffffffff) {
-                       set_32bit_tls(task, GS_TLS, addr);
-                       if (doit) {
-                               load_TLS(&task->thread, cpu);
-                               load_gs_index(GS_TLS_SEL);
-                       }
-                       task->thread.gsindex = GS_TLS_SEL;
-                       task->thread.gs = 0;
-               } else {
-                       task->thread.gsindex = 0;
-                       task->thread.gs = addr;
-                       if (doit) {
-                               load_gs_index(0);
-                               ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
-                       }
+               task->thread.gsindex = 0;
+               task->thread.gsbase = addr;
+               if (doit) {
+                       load_gs_index(0);
+                       ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /* Not strictly needed for fs, but do it for symmetry
                   with gs */
-               if (addr >= TASK_SIZE_OF(task))
+               if (addr >= TASK_SIZE_MAX)
                        return -EPERM;
                cpu = get_cpu();
-               /* handle small bases via the GDT because that's faster to
-                  switch. */
-               if (addr <= 0xffffffff) {
-                       set_32bit_tls(task, FS_TLS, addr);
-                       if (doit) {
-                               load_TLS(&task->thread, cpu);
-                               loadsegment(fs, FS_TLS_SEL);
-                       }
-                       task->thread.fsindex = FS_TLS_SEL;
-                       task->thread.fs = 0;
-               } else {
-                       task->thread.fsindex = 0;
-                       task->thread.fs = addr;
-                       if (doit) {
-                               /* set the selector to 0 to not confuse
-                                  __switch_to */
-                               loadsegment(fs, 0);
-                               ret = wrmsrl_safe(MSR_FS_BASE, addr);
-                       }
+               task->thread.fsindex = 0;
+               task->thread.fsbase = addr;
+               if (doit) {
+                       /* set the selector to 0 to not confuse __switch_to */
+                       loadsegment(fs, 0);
+                       ret = wrmsrl_safe(MSR_FS_BASE, addr);
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
-               if (task->thread.fsindex == FS_TLS_SEL)
-                       base = read_32bit_tls(task, FS_TLS);
-               else if (doit)
+               if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
-                       base = task->thread.fs;
+                       base = task->thread.fsbase;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
        case ARCH_GET_GS: {
                unsigned long base;
-               unsigned gsindex;
-               if (task->thread.gsindex == GS_TLS_SEL)
-                       base = read_32bit_tls(task, GS_TLS);
-               else if (doit) {
-                       savesegment(gs, gsindex);
-                       if (gsindex)
-                               rdmsrl(MSR_KERNEL_GS_BASE, base);
-                       else
-                               base = task->thread.gs;
-               } else
-                       base = task->thread.gs;
+               if (doit)
+                       rdmsrl(MSR_KERNEL_GS_BASE, base);
+               else
+                       base = task->thread.gsbase;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# ifdef CONFIG_X86_X32
+       case ARCH_MAP_VDSO_X32:
+               return prctl_map_vdso(&vdso_image_x32, addr);
+# endif
+# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+       case ARCH_MAP_VDSO_32:
+               return prctl_map_vdso(&vdso_image_32, addr);
+# endif
+       case ARCH_MAP_VDSO_64:
+               return prctl_map_vdso(&vdso_image_64, addr);
+#endif
+
        default:
                ret = -EINVAL;
                break;
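
(With the GDT fast path for sub-4GiB bases deleted, ARCH_SET_FS/GS always leave the selector at 0 and program the base MSR, and ARCH_GET_FS/GS read the MSR back. A quick round-trip check for the formerly special-cased small-base case, assuming the arch_prctl constants from asm/prctl.h:)

  #include <assert.h>
  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #define ARCH_SET_GS	0x1001
  #define ARCH_GET_GS	0x1004

  int main(void)
  {
          unsigned long base = 0;

          /* A sub-4GiB base: the deleted code would have used a GDT slot. */
          assert(syscall(SYS_arch_prctl, ARCH_SET_GS, 0xbeef000UL) == 0);
          assert(syscall(SYS_arch_prctl, ARCH_GET_GS, &base) == 0);

          printf("gs base round-trips as 0x%lx, selector stays 0\n", base);
          return base == 0xbeef000UL ? 0 : 1;
  }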