Merge branch 'x86/asm' into x86/mm, to resolve conflicts
author		Ingo Molnar <mingo@kernel.org>
		Fri, 15 Jul 2016 08:26:04 +0000 (10:26 +0200)
committer	Ingo Molnar <mingo@kernel.org>
		Fri, 15 Jul 2016 08:26:04 +0000 (10:26 +0200)
 Conflicts:
	tools/testing/selftests/x86/Makefile

Signed-off-by: Ingo Molnar <mingo@kernel.org>
19 files changed:
arch/x86/boot/boot.h
arch/x86/boot/cpu.c
arch/x86/boot/cpucheck.c
arch/x86/boot/cpuflags.c
arch/x86/boot/cpuflags.h
arch/x86/entry/vdso/vma.c
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/uaccess.h
arch/x86/mm/init_64.c
arch/x86/mm/pageattr.c
arch/x86/mm/pat.c
arch/x86/mm/pgtable_32.c
drivers/char/mem.c
include/linux/mm_types.h
mm/mmap.c
tools/testing/selftests/x86/Makefile
tools/testing/selftests/x86/test_mremap_vdso.c [new file with mode: 0644]

index 7c1495f..e5612f3 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -295,6 +295,7 @@ static inline int cmdline_find_option_bool(const char *option)
 
 /* cpu.c, cpucheck.c */
 int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
+int check_knl_erratum(void);
 int validate_cpu(void);
 
 /* early_serial_console.c */
index 29207f6..26240dd 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -93,6 +93,8 @@ int validate_cpu(void)
                show_cap_strs(err_flags);
                putchar('\n');
                return -1;
+       } else if (check_knl_erratum()) {
+               return -1;
        } else {
                return 0;
        }
index 1fd7d57..4ad7d70 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -24,6 +24,7 @@
 # include "boot.h"
 #endif
 #include <linux/types.h>
+#include <asm/intel-family.h>
 #include <asm/processor-flags.h>
 #include <asm/required-features.h>
 #include <asm/msr-index.h>
@@ -175,6 +176,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
                        puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n");
                }
        }
+       if (!err)
+               err = check_knl_erratum();
 
        if (err_flags_ptr)
                *err_flags_ptr = err ? err_flags : NULL;
@@ -185,3 +188,33 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr)
 
        return (cpu.level < req_level || err) ? -1 : 0;
 }
+
+int check_knl_erratum(void)
+{
+       /*
+        * First check for the affected model/family:
+        */
+       if (!is_intel() ||
+           cpu.family != 6 ||
+           cpu.model != INTEL_FAM6_XEON_PHI_KNL)
+               return 0;
+
+       /*
+        * This erratum affects the Accessed/Dirty bits, and can
+        * cause stray bits to be set in !Present PTEs.  We have
+        * enough bits in our 64-bit PTEs (which we have on real
+        * 64-bit mode or PAE) to avoid using these troublesome
+        * bits.  But, we do not have enough space in our 32-bit
+        * PTEs.  So, refuse to run on 32-bit non-PAE kernels.
+        */
+       if (IS_ENABLED(CONFIG_X86_64) || IS_ENABLED(CONFIG_X86_PAE))
+               return 0;
+
+       puts("This 32-bit kernel can not run on this Xeon Phi x200\n"
+            "processor due to a processor erratum.  Use a 64-bit\n"
+            "kernel, or enable PAE in this 32-bit kernel.\n\n");
+
+       return -1;
+}
+
+
index 431fa5f..6687ab9 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -102,6 +102,7 @@ void get_cpuflags(void)
                        cpuid(0x1, &tfms, &ignored, &cpu.flags[4],
                              &cpu.flags[0]);
                        cpu.level = (tfms >> 8) & 15;
+                       cpu.family = cpu.level;
                        cpu.model = (tfms >> 4) & 15;
                        if (cpu.level >= 6)
                                cpu.model += ((tfms >> 16) & 0xf) << 4;
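The family/model fields filled in here come straight from CPUID leaf 1: family in EAX[11:8], model in EAX[7:4], with the extended-model nibble (EAX[19:16]) folded in on family 6 parts. A minimal user-space sketch of the same decode, testing for the Xeon Phi x200 model that check_knl_erratum() refuses (INTEL_FAM6_XEON_PHI_KNL is 0x57 in <asm/intel-family.h>); illustrative only, not part of the patch:

    /* Illustrative sketch: decode family/model as boot/cpuflags.c does.
     * A complete check would also verify the vendor is GenuineIntel. */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx, family, model;

        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 1;

        family = (eax >> 8) & 0xf;
        model  = (eax >> 4) & 0xf;
        if (family >= 6)                /* fold in the extended model */
            model += ((eax >> 16) & 0xf) << 4;

        printf("family %u, model 0x%x -> %saffected\n", family, model,
               (family == 6 && model == 0x57) ? "" : "not ");
        return 0;
    }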
index 4cb404f..15ad56a 100644
--- a/arch/x86/boot/cpuflags.h
+++ b/arch/x86/boot/cpuflags.h
@@ -6,6 +6,7 @@
 
 struct cpu_features {
        int level;              /* Family, or 64 for x86-64 */
+       int family;             /* Family, always */
        int model;
        u32 flags[NCAPINTS];
 };
index ab220ac..3329844 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -12,6 +12,7 @@
 #include <linux/random.h>
 #include <linux/elf.h>
 #include <linux/cpu.h>
+#include <linux/ptrace.h>
 #include <asm/pvclock.h>
 #include <asm/vgtod.h>
 #include <asm/proto.h>
@@ -97,10 +98,40 @@ static int vdso_fault(const struct vm_special_mapping *sm,
        return 0;
 }
 
-static const struct vm_special_mapping text_mapping = {
-       .name = "[vdso]",
-       .fault = vdso_fault,
-};
+static void vdso_fix_landing(const struct vdso_image *image,
+               struct vm_area_struct *new_vma)
+{
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+       if (in_ia32_syscall() && image == &vdso_image_32) {
+               struct pt_regs *regs = current_pt_regs();
+               unsigned long vdso_land = image->sym_int80_landing_pad;
+               unsigned long old_land_addr = vdso_land +
+                       (unsigned long)current->mm->context.vdso;
+
+               /* Fixing userspace landing - look at do_fast_syscall_32 */
+               if (regs->ip == old_land_addr)
+                       regs->ip = new_vma->vm_start + vdso_land;
+       }
+#endif
+}
+
+static int vdso_mremap(const struct vm_special_mapping *sm,
+               struct vm_area_struct *new_vma)
+{
+       unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+       const struct vdso_image *image = current->mm->context.vdso_image;
+
+       if (image->size != new_size)
+               return -EINVAL;
+
+       if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
+               return -EFAULT;
+
+       vdso_fix_landing(image, new_vma);
+       current->mm->context.vdso = (void __user *)new_vma->vm_start;
+
+       return 0;
+}
 
 static int vvar_fault(const struct vm_special_mapping *sm,
                      struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -151,6 +182,12 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
        struct vm_area_struct *vma;
        unsigned long addr, text_start;
        int ret = 0;
+
+       static const struct vm_special_mapping vdso_mapping = {
+               .name = "[vdso]",
+               .fault = vdso_fault,
+               .mremap = vdso_mremap,
+       };
        static const struct vm_special_mapping vvar_mapping = {
                .name = "[vvar]",
                .fault = vvar_fault,
@@ -185,7 +222,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
                                       image->size,
                                       VM_READ|VM_EXEC|
                                       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-                                      &text_mapping);
+                                      &vdso_mapping);
 
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
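The landing-pad fixup above is plain base-plus-offset arithmetic: a task stopped at the int80 landing pad has its saved IP at old-vDSO-base + sym_int80_landing_pad, and vdso_fix_landing() rebases that onto the new VMA. A worked sketch with made-up addresses:

    /* Worked example; the addresses are invented for illustration. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long old_base = 0xf7f00000UL; /* old context.vdso      */
        unsigned long new_base = 0x70000000UL; /* new_vma->vm_start     */
        unsigned long landing  = 0xa50;        /* sym_int80_landing_pad */
        unsigned long ip       = old_base + landing; /* saved regs->ip  */

        if (ip == old_base + landing)   /* the vdso_fix_landing() test */
            ip = new_base + landing;

        printf("regs->ip rebased to %#lx\n", ip); /* prints 0x70000a50 */
        return 0;
    }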
index 1a27396..2815d26 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -480,7 +480,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
 
 static inline int pte_none(pte_t pte)
 {
-       return !pte.pte;
+       return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK));
 }
 
 #define __HAVE_ARCH_PTE_SAME
@@ -552,7 +552,8 @@ static inline int pmd_none(pmd_t pmd)
 {
        /* Only check low word on 32-bit platforms, since it might be
           out of sync with upper half. */
-       return (unsigned long)native_pmd_val(pmd) == 0;
+       unsigned long val = native_pmd_val(pmd);
+       return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0;
 }
 
 static inline unsigned long pmd_page_vaddr(pmd_t pmd)
@@ -616,7 +617,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
 #if CONFIG_PGTABLE_LEVELS > 2
 static inline int pud_none(pud_t pud)
 {
-       return native_pud_val(pud) == 0;
+       return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
 }
 
 static inline int pud_present(pud_t pud)
@@ -694,6 +695,12 @@ static inline int pgd_bad(pgd_t pgd)
 
 static inline int pgd_none(pgd_t pgd)
 {
+       /*
+        * There is no need to do a workaround for the KNL stray
+        * A/D bit erratum here.  PGDs only point to page tables
+        * except on 32-bit non-PAE which is not supported on
+        * KNL.
+        */
        return !native_pgd_val(pgd);
 }
 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
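The intent of the masked checks above: an entry that is logically empty, but on which the erratum has set stray Accessed/Dirty bits, must still read as none. A self-contained sketch (bit positions are the real x86 ones, _PAGE_ACCESSED at bit 5 and _PAGE_DIRTY at bit 6):

    /* Sketch, not kernel code: emptiness tests must ignore stray A/D. */
    #include <stdio.h>

    #define _PAGE_ACCESSED         (1UL << 5)
    #define _PAGE_DIRTY            (1UL << 6)
    #define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)

    static int pte_none(unsigned long pte)
    {
        return !(pte & ~_PAGE_KNL_ERRATUM_MASK);
    }

    int main(void)
    {
        /* a cleared PTE on which the hardware set stray A/D bits: */
        unsigned long pte = _PAGE_ACCESSED | _PAGE_DIRTY;

        printf("raw check: %s\n", pte ? "in use" : "none"); /* wrong */
        printf("pte_none : %s\n", pte_none(pte) ? "none" : "in use");
        return 0;
    }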
index 2ee7811..7e8ec7a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -140,18 +140,32 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
 #define pte_unmap(pte) ((void)(pte))/* NOP */
 
-/* Encode and de-code a swap entry */
+/*
+ * Encode and de-code a swap entry
+ *
+ * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2|1|0| <- bit number
+ * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
+ * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry
+ *
+ * G (8) is aliased and used as a PROT_NONE indicator for
+ * !present ptes.  We need to start storing swap entries above
+ * there.  We also need to avoid using A and D because of an
+ * erratum where they can be incorrectly set by hardware on
+ * non-present PTEs.
+ */
+#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
 #define SWP_TYPE_BITS 5
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
+/* Place the offset above the type: */
+#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1)
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
-#define __swp_type(x)                  (((x).val >> (_PAGE_BIT_PRESENT + 1)) \
+#define __swp_type(x)                  (((x).val >> (SWP_TYPE_FIRST_BIT)) \
                                         & ((1U << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x)                        ((x).val >> SWP_OFFSET_SHIFT)
+#define __swp_offset(x)                        ((x).val >> SWP_OFFSET_FIRST_BIT)
 #define __swp_entry(type, offset)      ((swp_entry_t) { \
-                                        ((type) << (_PAGE_BIT_PRESENT + 1)) \
-                                        | ((offset) << SWP_OFFSET_SHIFT) })
+                                        ((type) << (SWP_TYPE_FIRST_BIT)) \
+                                        | ((offset) << SWP_OFFSET_FIRST_BIT) })
 #define __pte_to_swp_entry(pte)                ((swp_entry_t) { pte_val((pte)) })
 #define __swp_entry_to_pte(x)          ((pte_t) { .pte = (x).val })
 
index 7b5efe2..d14d0a5 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
                         _PAGE_PKEY_BIT2 | \
                         _PAGE_PKEY_BIT3)
 
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED)
+#else
+#define _PAGE_KNL_ERRATUM_MASK 0
+#endif
+
 #ifdef CONFIG_KMEMCHECK
 #define _PAGE_HIDDEN   (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
 #else
index 2982387..d40ec72 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -342,7 +342,26 @@ do {                                                                       \
 } while (0)
 
 #ifdef CONFIG_X86_32
-#define __get_user_asm_u64(x, ptr, retval, errret)     (x) = __get_user_bad()
+#define __get_user_asm_u64(x, ptr, retval, errret)                     \
+({                                                                     \
+       __typeof__(ptr) __ptr = (ptr);                                  \
+       asm volatile(ASM_STAC "\n"                                      \
+                    "1:        movl %2,%%eax\n"                        \
+                    "2:        movl %3,%%edx\n"                        \
+                    "3: " ASM_CLAC "\n"                                \
+                    ".section .fixup,\"ax\"\n"                         \
+                    "4:        mov %4,%0\n"                            \
+                    "  xorl %%eax,%%eax\n"                             \
+                    "  xorl %%edx,%%edx\n"                             \
+                    "  jmp 3b\n"                                       \
+                    ".previous\n"                                      \
+                    _ASM_EXTABLE(1b, 4b)                               \
+                    _ASM_EXTABLE(2b, 4b)                               \
+                    : "=r" (retval), "=A"(x)                           \
+                    : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \
+                      "i" (errret), "0" (retval));                     \
+})
+
 #define __get_user_asm_ex_u64(x, ptr)                  (x) = __get_user_bad()
 #else
 #define __get_user_asm_u64(x, ptr, retval, errret) \
@@ -429,7 +448,7 @@ do {                                                                        \
 #define __get_user_nocheck(x, ptr, size)                               \
 ({                                                                     \
        int __gu_err;                                                   \
-       unsigned long __gu_val;                                         \
+       __inttype(*(ptr)) __gu_val;                                     \
        __uaccess_begin();                                              \
        __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT);    \
        __uaccess_end();                                                \
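The second hunk is what makes the first usable: __get_user_nocheck() previously funneled every size through an unsigned long temporary, which would truncate an 8-byte read on 32-bit kernels; __inttype(*(ptr)) sizes the temporary to the access. Together they let 64-bit get_user() work on i386. An illustrative kernel-side sketch (the helper and its caller are hypothetical, not from the patch):

    /* Hypothetical caller, for illustration only: */
    #include <linux/errno.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    static int read_user_u64(const u64 __user *ptr, u64 *out)
    {
        u64 val;

        if (get_user(val, ptr))  /* now compiles and works on 32-bit too */
            return -EFAULT;
        *out = val;
        return 0;
    }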
index bce2e5d..bb88fbc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -354,7 +354,7 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
                 * pagetable pages as RO. So assume someone who pre-setup
                 * these mappings are more intelligent.
                 */
-               if (pte_val(*pte)) {
+               if (!pte_none(*pte)) {
                        if (!after_bootmem)
                                pages++;
                        continue;
@@ -396,7 +396,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
                        continue;
                }
 
-               if (pmd_val(*pmd)) {
+               if (!pmd_none(*pmd)) {
                        if (!pmd_large(*pmd)) {
                                spin_lock(&init_mm.page_table_lock);
                                pte = (pte_t *)pmd_page_vaddr(*pmd);
@@ -470,7 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
                        continue;
                }
 
-               if (pud_val(*pud)) {
+               if (!pud_none(*pud)) {
                        if (!pud_large(*pud)) {
                                pmd = pmd_offset(pud, 0);
                                last_map_addr = phys_pmd_init(pmd, addr, end,
@@ -673,7 +673,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
 
        for (i = 0; i < PTRS_PER_PTE; i++) {
                pte = pte_start + i;
-               if (pte_val(*pte))
+               if (!pte_none(*pte))
                        return;
        }
 
@@ -691,7 +691,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
 
        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd = pmd_start + i;
-               if (pmd_val(*pmd))
+               if (!pmd_none(*pmd))
                        return;
        }
 
@@ -710,7 +710,7 @@ static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
 
        for (i = 0; i < PTRS_PER_PUD; i++) {
                pud = pud_start + i;
-               if (pud_val(*pud))
+               if (!pud_none(*pud))
                        return false;
        }
 
index 7a1f7bb..7514215 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1185,7 +1185,7 @@ repeat:
                return __cpa_process_fault(cpa, address, primary);
 
        old_pte = *kpte;
-       if (!pte_val(old_pte))
+       if (pte_none(old_pte))
                return __cpa_process_fault(cpa, address, primary);
 
        if (level == PG_LEVEL_4K) {
index fb0604f..db00e3e 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -755,11 +755,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
                return 1;
 
        while (cursor < to) {
-               if (!devmem_is_allowed(pfn)) {
-                       pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
-                               current->comm, from, to - 1);
+               if (!devmem_is_allowed(pfn))
                        return 0;
-               }
                cursor += PAGE_SIZE;
                pfn++;
        }
index 75cc097..e67ae0e 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -47,7 +47,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
-       if (pte_val(pteval))
+       if (!pte_none(pteval))
                set_pte_at(&init_mm, vaddr, pte, pteval);
        else
                pte_clear(&init_mm, vaddr, pte);
index 71025c2..d633974 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -66,12 +66,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
        u64 cursor = from;
 
        while (cursor < to) {
-               if (!devmem_is_allowed(pfn)) {
-                       printk(KERN_INFO
-               "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
-                               current->comm, from, to);
+               if (!devmem_is_allowed(pfn))
                        return 0;
-               }
                cursor += PAGE_SIZE;
                pfn++;
        }
index ca3e517..917f2b6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -594,6 +594,9 @@ struct vm_special_mapping {
        int (*fault)(const struct vm_special_mapping *sm,
                     struct vm_area_struct *vma,
                     struct vm_fault *vmf);
+
+       int (*mremap)(const struct vm_special_mapping *sm,
+                    struct vm_area_struct *new_vma);
 };
 
 enum tlb_flush_reason {
index de2c176..234edff 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2943,9 +2943,19 @@ static const char *special_mapping_name(struct vm_area_struct *vma)
        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
 }
 
+static int special_mapping_mremap(struct vm_area_struct *new_vma)
+{
+       struct vm_special_mapping *sm = new_vma->vm_private_data;
+
+       if (sm->mremap)
+               return sm->mremap(sm, new_vma);
+       return 0;
+}
+
 static const struct vm_operations_struct special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
+       .mremap = special_mapping_mremap,
        .name = special_mapping_name,
 };
 
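The hook is generic: anything installed through _install_special_mapping() can now supply .mremap to learn when user space moves the VMA, instead of silently ending up with stale per-mm state. A hedged sketch of a hypothetical user (my_mapping, my_fault, MY_MAPPING_SIZE and the context.my_base field are all invented for illustration):

    /* Hypothetical special mapping using the new hook (sketch only): */
    #include <linux/mm.h>
    #include <linux/mm_types.h>
    #include <linux/sched.h>

    #define MY_MAPPING_SIZE PAGE_SIZE   /* hypothetical fixed size */

    static int my_fault(const struct vm_special_mapping *sm,
                        struct vm_area_struct *vma, struct vm_fault *vmf)
    {
        return VM_FAULT_SIGBUS;         /* placeholder fault handler */
    }

    static int my_mremap(const struct vm_special_mapping *sm,
                         struct vm_area_struct *new_vma)
    {
        /* refuse resizing, but allow a pure move */
        if (new_vma->vm_end - new_vma->vm_start != MY_MAPPING_SIZE)
            return -EINVAL;

        /* track the new base; 'my_base' is an invented per-mm field */
        current->mm->context.my_base = new_vma->vm_start;
        return 0;
    }

    static const struct vm_special_mapping my_mapping = {
        .name   = "[mymap]",
        .fault  = my_fault,
        .mremap = my_mremap,
    };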
index abe9c35..4f747ee 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,7 +4,7 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
                        check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
                        test_FCMOV test_FCOMI test_FISTTP \
diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c
new file mode 100644
index 0000000..bf0d687
--- /dev/null
+++ b/tools/testing/selftests/x86/test_mremap_vdso.c
@@ -0,0 +1,111 @@
+/*
+ * 32-bit test to check vDSO mremap.
+ *
+ * Copyright (c) 2016 Dmitry Safonov
+ * Suggested-by: Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+/*
+ * Can be built statically:
+ * gcc -Os -Wall -static -m32 test_mremap_vdso.c
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <sys/mman.h>
+#include <sys/auxv.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+
+#define PAGE_SIZE      4096
+
+static int try_to_remap(void *vdso_addr, unsigned long size)
+{
+       void *dest_addr, *new_addr;
+
+       /* Searching for memory location where to remap */
+       dest_addr = mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+       if (dest_addr == MAP_FAILED) {
+               printf("[WARN]\tmmap failed (%d): %m\n", errno);
+               return 0;
+       }
+
+       printf("[NOTE]\tMoving vDSO: [%p, %#lx] -> [%p, %#lx]\n",
+               vdso_addr, (unsigned long)vdso_addr + size,
+               dest_addr, (unsigned long)dest_addr + size);
+       fflush(stdout);
+
+       new_addr = mremap(vdso_addr, size, size,
+                       MREMAP_FIXED|MREMAP_MAYMOVE, dest_addr);
+       if ((unsigned long)new_addr == (unsigned long)-1) {
+               munmap(dest_addr, size);
+               if (errno == EINVAL) {
+                       printf("[NOTE]\tvDSO partial move failed, will try with bigger size\n");
+                       return -1; /* Retry with larger */
+               }
+               printf("[FAIL]\tmremap failed (%d): %m\n", errno);
+               return 1;
+       }
+
+       return 0;
+
+}
+
+int main(int argc, char **argv, char **envp)
+{
+       pid_t child;
+
+       child = fork();
+       if (child == -1) {
+               printf("[WARN]\tfailed to fork (%d): %m\n", errno);
+               return 1;
+       }
+
+       if (child == 0) {
+               unsigned long vdso_size = PAGE_SIZE;
+               unsigned long auxval;
+               int ret = -1;
+
+               auxval = getauxval(AT_SYSINFO_EHDR);
+               printf("\tAT_SYSINFO_EHDR is %#lx\n", auxval);
+               if (!auxval || auxval == -ENOENT) {
+                       printf("[WARN]\tgetauxval failed\n");
+                       return 0;
+               }
+
+               /* Simpler than parsing ELF header */
+               while (ret < 0) {
+                       ret = try_to_remap((void *)auxval, vdso_size);
+                       vdso_size += PAGE_SIZE;
+               }
+
+               /* Glibc is likely to explode now - exit with raw syscall */
+               asm volatile ("int $0x80" : : "a" (__NR_exit), "b" (!!ret));
+       } else {
+               int status;
+
+               if (waitpid(child, &status, 0) != child ||
+                       !WIFEXITED(status)) {
+                       printf("[FAIL]\tmremap() of the vDSO does not work on this kernel!\n");
+                       return 1;
+               } else if (WEXITSTATUS(status) != 0) {
+                       printf("[FAIL]\tChild failed with %d\n",
+                                       WEXITSTATUS(status));
+                       return 1;
+               }
+               printf("[OK]\n");
+       }
+
+       return 0;
+}