Merge branch 'mm-pkeys-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Mon, 10 Oct 2016 18:01:51 +0000 (11:01 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Mon, 10 Oct 2016 18:01:51 +0000 (11:01 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 10 Oct 2016 18:01:51 +0000 (11:01 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 10 Oct 2016 18:01:51 +0000 (11:01 -0700)
diff --combined Documentation/kernel-parameters.txt

index ec8d814,3725976..705fb91
--- 1/Documentation/kernel-parameters.txt
--- 2/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@@ -460,15 -460,6 +460,15 @@@ bytes respectively. Such letter suffixe
                         driver will print ACPI tables for AMD IOMMU during
                         IOMMU initialization.
   
+ +      amd_iommu_intr= [HW,X86-64]
+ +                      Specifies one of the following AMD IOMMU interrupt
+ +                      remapping modes:
+ +                      legacy     - Use legacy interrupt remapping mode.
+ +                      vapic      - Use virtual APIC mode, which allows IOMMU
+ +                                   to inject interrupts directly into guest.
+ +                                   This mode requires kvm-amd.avic=1.
+ +                                   (Default when IOMMU HW support is present.)
+ +
         amijoy.map=     [HW,JOY] Amiga joystick support
                         Map of devices attached to JOY0DAT and JOY1DAT
                         Format: <a>,<b>
@@@ -707,15 -698,6 +707,15 @@@
                         loops can be debugged more effectively on production
                         systems.
   
+ +      clocksource.arm_arch_timer.fsl-a008585=
+ +                      [ARM64]
+ +                      Format: <bool>
+ +                      Enable/disable the workaround of Freescale/NXP
+ +                      erratum A-008585.  This can be useful for KVM
+ +                      guests, if the guest device tree doesn't show the
+ +                      erratum.  If unspecified, the workaround is
+ +                      enabled based on the device tree.
+ +
         clearcpuid=BITNUM [X86]
                         Disable CPUID feature X for the kernel. See
                         arch/x86/include/asm/cpufeatures.h for the valid bit
@@@ -1063,12 -1045,11 +1063,12 @@@
                         determined by the stdout-path property in device
                         tree's chosen node.
   
- -              cdns,<addr>
- -                      Start an early, polled-mode console on a cadence serial
- -                      port at the specified address. The cadence serial port
- -                      must already be setup and configured. Options are not
- -                      yet supported.
+ +              cdns,<addr>[,options]
+ +                      Start an early, polled-mode console on a Cadence
+ +                      (xuartps) serial port at the specified address. Only
+ +                      supported option is baud rate. If baud rate is not
+ +                      specified, the serial port must already be setup and
+ +                      configured.
   
                 uart[8250],io,<addr>[,options]
                 uart[8250],mmio,<addr>[,options]
@@@ -1383,10 -1364,6 +1383,10 @@@
                         Format: <unsigned int> such that (rxsize & ~0x1fffc0) == 0.
                         Default: 1024
   
+ +      gpio-mockup.gpio_mockup_ranges
+ +                      [HW] Sets the ranges of gpiochip for this device.
+ +                      Format: <start1>,<end1>,<start2>,<end2>...
+ +
         hardlockup_all_cpu_backtrace=
                         [KNL] Should the hard-lockup detector generate
                         backtraces on all cpus.
@@@ -1666,6 -1643,11 +1666,11 @@@
   
         initrd=         [BOOT] Specify the location of the initial ramdisk
   
+       init_pkru=      [X86] Specify the default memory protection keys rights
+                       register contents for all processes.  0x55555554 by
+                       default (disallow access to all but pkey 0).  Can
+                       override in debugfs after boot.
+ 
         inport.irq=     [HW] Inport (ATI XL and Microsoft) busmouse driver
                         Format: <irq>
   
@@@ -1711,7 -1693,7 +1716,7 @@@
   
         intel_idle.max_cstate=  [KNL,HW,ACPI,X86]
                         0       disables intel_idle and fall back on acpi_idle.
- -                      1 to 6  specify maximum depth of C-state.
+ +                      1 to 9  specify maximum depth of C-state.
   
         intel_pstate=  [X86]
                        disable
@@@ -2184,13 -2166,10 +2189,13 @@@
                         than or equal to this physical address is ignored.
   
         maxcpus=        [SMP] Maximum number of processors that an SMP kernel
- -                      should make use of.  maxcpus=n : n >= 0 limits the
- -                      kernel to using 'n' processors.  n=0 is a special case,
- -                      it is equivalent to "nosmp", which also disables
- -                      the IO APIC.
+ +                      will bring up during bootup.  maxcpus=n : n >= 0 limits
+ +                      the kernel to bringing up 'n' processors.  After bootup
+ +                      the remaining plugged CPUs can still be brought up with
+ +                      "echo 1 > /sys/devices/system/cpu/cpuX/online", so maxcpus
+ +                      only takes effect during system bootup.
+ +                      n=0 is a special case: it is equivalent to "nosmp",
+ +                      which also disables the IO APIC.
   
         max_loop=       [LOOP] The number of loop block devices that get
         (loop.max_loop) unconditionally pre-created at init time. The default
@@@ -2597,6 -2576,8 +2602,6 @@@
   
         nodelayacct     [KNL] Disable per-task delay accounting
   
- -      nodisconnect    [HW,SCSI,M68K] Disables SCSI disconnects.
- -
         nodsp           [SH] Disable hardware DSP at boot time.
   
         noefi           Disable EFI runtime services support.
@@@ -2797,12 -2778,9 +2802,12 @@@
   
         nr_cpus=        [SMP] Maximum number of processors that an SMP kernel
                         could support.  nr_cpus=n : n >= 1 limits the kernel to
- -                      supporting 'n' processors. Later in runtime you can not
- -                      use hotplug cpu feature to put more cpu back to online.
- -                      just like you compile the kernel NR_CPUS=n
+ +                      support 'n' processors. It could be larger than the
+ +                      number of already plugged CPUs during bootup; later in
+ +                      runtime you can physically add extra CPUs until it reaches
+ +                      n. So during boot up some boot time memory for per-cpu
+ +                      variables needs to be pre-allocated for later physical
+ +                      CPU hot plugging.
   
         nr_uarts=       [SERIAL] maximum number of UARTs to be registered.
   
@@@ -4265,8 -4243,6 +4270,8 @@@
                                 u = IGNORE_UAS (don't bind to the uas driver);
                                 w = NO_WP_DETECT (don't test whether the
                                         medium is write-protected).
+ +                              y = ALWAYS_SYNC (issue a SYNCHRONIZE_CACHE
+ +                                      even if the device claims no cache)
                         Example: quirks=0419:aaf5:rl,0421:0433:rc
   
         user_debug=     [KNL,ARM]
diff --combined arch/x86/kernel/process_64.c

index ee944bd,a21068e..b3760b3
--- 1/arch/x86/kernel/process_64.c
--- 2/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@@ -49,7 -49,8 +49,7 @@@
   #include <asm/debugreg.h>
   #include <asm/switch_to.h>
   #include <asm/xen/hypervisor.h>
- -
- -asmlinkage extern void ret_from_fork(void);
+ +#include <asm/vdso.h>
   
   __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
   
@@@ -109,12 -110,13 +109,13 @@@ void __show_regs(struct pt_regs *regs, 
         get_debugreg(d7, 7);
   
         /* Only print out debug registers if they are in their non-default state. */
-       if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
-           (d6 == DR6_RESERVED) && (d7 == 0x400))
-               return;
- 
-       printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
-       printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+       if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
+           (d6 == DR6_RESERVED) && (d7 == 0x400))) {
+               printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
+                      d0, d1, d2);
+               printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
+                      d3, d6, d7);
+       }
   
         if (boot_cpu_has(X86_FEATURE_OSPKE))
                 printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
@@@ -140,17 -142,12 +141,17 @@@ int copy_thread_tls(unsigned long clone
   {
         int err;
         struct pt_regs *childregs;
+ +      struct fork_frame *fork_frame;
+ +      struct inactive_task_frame *frame;
         struct task_struct *me = current;
   
         p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
         childregs = task_pt_regs(p);
- -      p->thread.sp = (unsigned long) childregs;
- -      set_tsk_thread_flag(p, TIF_FORK);
+ +      fork_frame = container_of(childregs, struct fork_frame, regs);
+ +      frame = &fork_frame->frame;
+ +      frame->bp = 0;
+ +      frame->ret_addr = (unsigned long) ret_from_fork;
+ +      p->thread.sp = (unsigned long) fork_frame;
         p->thread.io_bitmap_ptr = NULL;
   
         savesegment(gs, p->thread.gsindex);
@@@ -164,11 -161,15 +165,11 @@@
         if (unlikely(p->flags & PF_KTHREAD)) {
                 /* kernel thread */
                 memset(childregs, 0, sizeof(struct pt_regs));
- -              childregs->sp = (unsigned long)childregs;
- -              childregs->ss = __KERNEL_DS;
- -              childregs->bx = sp; /* function */
- -              childregs->bp = arg;
- -              childregs->orig_ax = -1;
- -              childregs->cs = __KERNEL_CS | get_kernel_rpl();
- -              childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
+ +              frame->bx = sp;         /* function */
+ +              frame->r12 = arg;
                 return 0;
         }
+ +      frame->bx = 0;
         *childregs = *current_pt_regs();
   
         childregs->ax = 0;
@@@ -511,7 -512,7 +512,7 @@@ void set_personality_ia32(bool x32
                 current->personality &= ~READ_IMPLIES_EXEC;
                 /* in_compat_syscall() uses the presence of the x32
                    syscall bit flag to determine compat status */
- -              current_thread_info()->status &= ~TS_COMPAT;
+ +              current->thread.status &= ~TS_COMPAT;
         } else {
                 set_thread_flag(TIF_IA32);
                 clear_thread_flag(TIF_X32);
@@@ -519,24 -520,11 +520,24 @@@
                         current->mm->context.ia32_compat = TIF_IA32;
                 current->personality |= force_personality32;
                 /* Prepare the first "return" to user space */
- -              current_thread_info()->status |= TS_COMPAT;
+ +              current->thread.status |= TS_COMPAT;
         }
   }
   EXPORT_SYMBOL_GPL(set_personality_ia32);
   
+ +#ifdef CONFIG_CHECKPOINT_RESTORE
+ +static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
+ +{
+ +      int ret;
+ +
+ +      ret = map_vdso_once(image, addr);
+ +      if (ret)
+ +              return ret;
+ +
+ +      return (long)image->size;
+ +}
+ +#endif
+ +
   long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
   {
         int ret = 0;
@@@ -590,19 -578,6 +591,19 @@@
                 break;
         }
   
+ +#ifdef CONFIG_CHECKPOINT_RESTORE
+ +# ifdef CONFIG_X86_X32_ABI
+ +      case ARCH_MAP_VDSO_X32:
+ +              return prctl_map_vdso(&vdso_image_x32, addr);
+ +# endif
+ +# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+ +      case ARCH_MAP_VDSO_32:
+ +              return prctl_map_vdso(&vdso_image_32, addr);
+ +# endif
+ +      case ARCH_MAP_VDSO_64:
+ +              return prctl_map_vdso(&vdso_image_64, addr);
+ +#endif
+ +
         default:
                 ret = -EINVAL;
                 break;
diff --combined arch/x86/mm/fault.c

index 1e52512,b88d8ac..4dc1334
--- 1/arch/x86/mm/fault.c
--- 2/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@@ -5,7 -5,7 +5,7 @@@
    */
   #include <linux/sched.h>              /* test_thread_flag(), ...      */
   #include <linux/kdebug.h>             /* oops_begin/end, ...          */
- -#include <linux/module.h>             /* search_exception_table       */
+ +#include <linux/extable.h>            /* search_exception_table       */
   #include <linux/bootmem.h>            /* max_low_pfn                  */
   #include <linux/kprobes.h>            /* NOKPROBE_SYMBOL, ...         */
   #include <linux/mmiotrace.h>          /* kmmio_handler, ...           */
@@@ -753,38 -753,6 +753,38 @@@ no_context(struct pt_regs *regs, unsign
                 return;
         }
   
+ +#ifdef CONFIG_VMAP_STACK
+ +      /*
+ +       * Stack overflow?  During boot, we can fault near the initial
+ +       * stack in the direct map, but that's not an overflow -- check
+ +       * that we're in vmalloc space to avoid this.
+ +       */
+ +      if (is_vmalloc_addr((void *)address) &&
+ +          (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
+ +           address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
+ +              register void *__sp asm("rsp");
+ +              unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
+ +              /*
+ +               * We're likely to be running with very little stack space
+ +               * left.  It's plausible that we'd hit this condition but
+ +               * double-fault even before we get this far, in which case
+ +               * we're fine: the double-fault handler will deal with it.
+ +               *
+ +               * We don't want to make it all the way into the oops code
+ +               * and then double-fault, though, because we're likely to
+ +               * break the console driver and lose most of the stack dump.
+ +               */
+ +              asm volatile ("movq %[stack], %%rsp\n\t"
+ +                            "call handle_stack_overflow\n\t"
+ +                            "1: jmp 1b"
+ +                            : "+r" (__sp)
+ +                            : "D" ("kernel stack overflow (page fault)"),
+ +                              "S" (regs), "d" (address),
+ +                              [stack] "rm" (stack));
+ +              unreachable();
+ +      }
+ +#endif
+ +
         /*
          * 32-bit:
          *
@@@ -1144,6 -1112,15 +1144,15 @@@ access_error(unsigned long error_code, 
   {
         /* This is only called for the current mm, so: */
         bool foreign = false;
+ 
+       /*
+        * Read or write was blocked by protection keys.  This is
+        * always an unconditional error and can never result in
+        * a follow-up action to resolve the fault, like a COW.
+        */
+       if (error_code & PF_PK)
+               return 1;
+ 
         /*
          * Make sure to check the VMA so that we do not perform
          * faults just to hit a PF_PK as soon as we fill in a
diff --combined mm/mprotect.c

index ec91dfd,7b35ee3..bcdbe62
--- 1/mm/mprotect.c
--- 2/mm/mprotect.c
+++ b/mm/mprotect.c
@@@ -23,11 -23,13 +23,13 @@@
   #include <linux/mmu_notifier.h>
   #include <linux/migrate.h>
   #include <linux/perf_event.h>
+ #include <linux/pkeys.h>
   #include <linux/ksm.h>
   #include <linux/pkeys.h>
   #include <asm/uaccess.h>
   #include <asm/pgtable.h>
   #include <asm/cacheflush.h>
+ #include <asm/mmu_context.h>
   #include <asm/tlbflush.h>
   
   #include "internal.h"
@@@ -304,7 -306,6 +306,7 @@@ mprotect_fixup(struct vm_area_struct *v
                            vma->vm_userfaultfd_ctx);
         if (*pprev) {
                 vma = *pprev;
+ +              VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
                 goto success;
         }
   
@@@ -328,7 -329,7 +330,7 @@@ success
          * held in write mode.
          */
         vma->vm_flags = newflags;
- -      dirty_accountable = vma_wants_writenotify(vma);
+ +      dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
         vma_set_page_prot(vma);
   
         change_protection(vma, start, end, vma->vm_page_prot,
@@@ -353,8 -354,11 +355,11 @@@ fail
         return error;
   }
   
- SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
-               unsigned long, prot)
+ /*
+  * pkey==-1 when doing a legacy mprotect()
+  */
+ static int do_mprotect_pkey(unsigned long start, size_t len,
+               unsigned long prot, int pkey)
   {
         unsigned long nstart, end, tmp, reqprot;
         struct vm_area_struct *vma, *prev;
@@@ -383,6 -387,14 +388,14 @@@
         if (down_write_killable(&current->mm->mmap_sem))
                 return -EINTR;
   
+       /*
+        * If userspace did not allocate the pkey, do not let
+        * them use it here.
+        */
+       error = -EINVAL;
+       if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
+               goto out;
+ 
         vma = find_vma(current->mm, start);
         error = -ENOMEM;
         if (!vma)
@@@ -409,8 -421,9 +422,9 @@@
                 prev = vma;
   
         for (nstart = start ; ; ) {
+               unsigned long mask_off_old_flags;
                 unsigned long newflags;
-               int pkey = arch_override_mprotect_pkey(vma, prot, -1);
+               int new_vma_pkey;
   
                 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
   
@@@ -418,8 -431,17 +432,17 @@@
                 if (rier && (vma->vm_flags & VM_MAYEXEC))
                         prot |= PROT_EXEC;
   
-               newflags = calc_vm_prot_bits(prot, pkey);
-               newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
+               /*
+                * Each mprotect() call explicitly passes r/w/x permissions.
+                * If a permission is not passed to mprotect(), it must be
+                * cleared from the VMA.
+                */
+               mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC |
+                                       ARCH_VM_PKEY_FLAGS;
+ 
+               new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey);
+               newflags = calc_vm_prot_bits(prot, new_vma_pkey);
+               newflags |= (vma->vm_flags & ~mask_off_old_flags);
   
                 /* newflags >> 4 shift VM_MAY% in place of VM_% */
                 if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
@@@ -455,3 -477,60 +478,60 @@@ out
         up_write(&current->mm->mmap_sem);
         return error;
   }
+ 
+ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
+               unsigned long, prot)
+ {
+       return do_mprotect_pkey(start, len, prot, -1);
+ }
+ 
+ SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
+               unsigned long, prot, int, pkey)
+ {
+       return do_mprotect_pkey(start, len, prot, pkey);
+ }
+ 
+ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
+ {
+       int pkey;
+       int ret;
+ 
+       /* No flags supported yet. */
+       if (flags)
+               return -EINVAL;
+       /* check for unsupported init values */
+       if (init_val & ~PKEY_ACCESS_MASK)
+               return -EINVAL;
+ 
+       down_write(&current->mm->mmap_sem);
+       pkey = mm_pkey_alloc(current->mm);
+ 
+       ret = -ENOSPC;
+       if (pkey == -1)
+               goto out;
+ 
+       ret = arch_set_user_pkey_access(current, pkey, init_val);
+       if (ret) {
+               mm_pkey_free(current->mm, pkey);
+               goto out;
+       }
+       ret = pkey;
+ out:
+       up_write(&current->mm->mmap_sem);
+       return ret;
+ }
+ 
+ SYSCALL_DEFINE1(pkey_free, int, pkey)
+ {
+       int ret;
+ 
+       down_write(&current->mm->mmap_sem);
+       ret = mm_pkey_free(current->mm, pkey);
+       up_write(&current->mm->mmap_sem);
+ 
+       /*
+        * We could provide warnings or errors if any VMA still
+        * has the pkey set here.
+        */
+       return ret;
+ }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 10 Oct 2016 18:01:51 +0000 (11:01 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 10 Oct 2016 18:01:51 +0000 (11:01 -0700)
		1	2
Documentation/kernel-parameters.txt	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/fault.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/mprotect.c	patch \|	diff1 \|	diff2 \|	blob \| history