diff --git a/mm/mmap.c b/mm/mmap.c
index 234edff..1af87c1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -25,6 +25,7 @@
 #include <linux/personality.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
 #include <linux/profile.h>
 #include <linux/export.h>
 #include <linux/mount.h>
@@ -87,6 +88,11 @@ static void unmap_region(struct mm_struct *mm,
  *             w: (no) no      w: (no) no      w: (copy) copy  w: (no) no
  *             x: (no) no      x: (no) yes     x: (no) yes     x: (yes) yes
  *
+ * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
+ * MAP_PRIVATE:
+ *                                                             r: (no) no
+ *                                                             w: (no) no
+ *                                                             x: (yes) yes
  */
 pgprot_t protection_map[16] = {
        __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
@@ -110,13 +116,15 @@ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
 void vma_set_page_prot(struct vm_area_struct *vma)
 {
        unsigned long vm_flags = vma->vm_flags;
+       pgprot_t vm_page_prot;
 
-       vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
-       if (vma_wants_writenotify(vma)) {
+       vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
+       if (vma_wants_writenotify(vma, vm_page_prot)) {
                vm_flags &= ~VM_SHARED;
-               vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
-                                                    vm_flags);
+               vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
        }
+       /* remove_migration_ptes reads vma->vm_page_prot without mmap_sem */
+       WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
 }
 
 /*
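
vma_set_page_prot() now stages both vm_pgprot_modify() passes in a local
variable and publishes the result with a single WRITE_ONCE() store, so rmap
walkers that read vma->vm_page_prot without mmap_sem never observe the
intermediate value. A minimal user-space sketch of this compute-then-publish
pattern (assumption: simplified volatile-cast WRITE_ONCE/READ_ONCE macros
rather than the kernel's full definitions):

#include <stdio.h>

#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))

struct area { unsigned long prot; };

/* writer: stage every adjustment locally, then publish in one store */
static void area_set_prot(struct area *a, unsigned long flags)
{
	unsigned long prot = a->prot;

	prot |= flags;			/* intermediate state stays local */
	if (flags & 0x2UL)
		prot &= ~0x4UL;		/* second adjustment, still local */

	WRITE_ONCE(a->prot, prot);	/* readers see old or new, never between */
}

int main(void)
{
	struct area a = { .prot = 0x5UL };

	area_set_prot(&a, 0x2UL);
	printf("prot = %#lx\n", READ_ONCE(a.prot));	/* prints prot = 0x3 */
	return 0;
}
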
@@ -394,14 +402,8 @@ static inline void vma_rb_insert(struct vm_area_struct *vma,
        rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
-static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
 {
-       /*
-        * All rb_subtree_gap values must be consistent prior to erase,
-        * with the possible exception of the vma being erased.
-        */
-       validate_mm_rb(root, vma);
-
        /*
         * Note rb_erase_augmented is a fairly large inline function,
         * so make sure we instantiate it only once with our desired
@@ -410,6 +412,32 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
        rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
+static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
+                                               struct rb_root *root,
+                                               struct vm_area_struct *ignore)
+{
+       /*
+        * All rb_subtree_gap values must be consistent prior to erase,
+        * with the possible exception of the "next" vma being erased if
+        * next->vm_start was reduced.
+        */
+       validate_mm_rb(root, ignore);
+
+       __vma_rb_erase(vma, root);
+}
+
+static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
+                                        struct rb_root *root)
+{
+       /*
+        * All rb_subtree_gap values must be consistent prior to erase,
+        * with the possible exception of the vma being erased.
+        */
+       validate_mm_rb(root, vma);
+
+       __vma_rb_erase(vma, root);
+}
+
 /*
  * vma has some anon_vma assigned, and is already inserted on that
  * anon_vma's interval trees.
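
The erase path is split so that rb_erase_augmented(), a deliberately large
inline, is instantiated only once inside __vma_rb_erase(), while the two
__always_inline wrappers differ solely in which vma the debug check
validate_mm_rb() is told may carry a stale rb_subtree_gap. A toy user-space
sketch of that wrapper shape (hypothetical names; the real code relies on
__always_inline so the wrappers cost nothing):

#include <assert.h>
#include <stdio.h>

struct node { int val; };

/* debug-only check; "ignore" is the one node allowed to be inconsistent */
static void validate_tree(struct node *ignore)
{
	printf("validate, ignoring node %d\n", ignore->val);
}

/* stand-in for the large rb_erase_augmented() expansion: one copy only */
static void __erase(struct node *n)
{
	n->val = -1;
}

static inline void erase_ignore(struct node *n, struct node *ignore)
{
	validate_tree(ignore);	/* caller names the node whose gap is stale */
	__erase(n);
}

static inline void erase(struct node *n)
{
	validate_tree(n);	/* common case: only n itself may be stale */
	__erase(n);
}

int main(void)
{
	struct node a = { 1 }, b = { 2 };

	erase(&a);
	erase_ignore(&b, &a);	/* b goes away, but a's gap was the stale one */
	assert(a.val == -1 && b.val == -1);
	return 0;
}
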
@@ -593,14 +621,25 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
        mm->map_count++;
 }
 
-static inline void
-__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
-               struct vm_area_struct *prev)
+static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+                                               struct vm_area_struct *vma,
+                                               struct vm_area_struct *prev,
+                                               bool has_prev,
+                                               struct vm_area_struct *ignore)
 {
        struct vm_area_struct *next;
 
-       vma_rb_erase(vma, &mm->mm_rb);
-       prev->vm_next = next = vma->vm_next;
+       vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
+       next = vma->vm_next;
+       if (has_prev)
+               prev->vm_next = next;
+       else {
+               prev = vma->vm_prev;
+               if (prev)
+                       prev->vm_next = next;
+               else
+                       mm->mmap = next;
+       }
        if (next)
                next->vm_prev = prev;
 
@@ -608,6 +647,13 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
        vmacache_invalidate(mm);
 }
 
+static inline void __vma_unlink_prev(struct mm_struct *mm,
+                                    struct vm_area_struct *vma,
+                                    struct vm_area_struct *prev)
+{
+       __vma_unlink_common(mm, vma, prev, true, vma);
+}
+
 /*
  * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
  * is already present in an i_mmap tree without adjusting the tree.
@@ -615,12 +661,12 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
  * are necessary.  The "insert" vma (if any) is to be inserted
  * before we drop the necessary locks.
  */
-int vma_adjust(struct vm_area_struct *vma, unsigned long start,
-       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+       unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+       struct vm_area_struct *expand)
 {
        struct mm_struct *mm = vma->vm_mm;
-       struct vm_area_struct *next = vma->vm_next;
-       struct vm_area_struct *importer = NULL;
+       struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
        struct address_space *mapping = NULL;
        struct rb_root *root = NULL;
        struct anon_vma *anon_vma = NULL;
@@ -630,17 +676,54 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        int remove_next = 0;
 
        if (next && !insert) {
-               struct vm_area_struct *exporter = NULL;
+               struct vm_area_struct *exporter = NULL, *importer = NULL;
 
                if (end >= next->vm_end) {
                        /*
                         * vma expands, overlapping all the next, and
                         * perhaps the one after too (mprotect case 6).
+                        * The only other cases that get here are
+                        * case 1, case 7 and case 8.
                         */
-again:                 remove_next = 1 + (end > next->vm_end);
-                       end = next->vm_end;
+                       if (next == expand) {
+                               /*
+                                * The only case where we don't expand "vma"
+                                * and we expand "next" instead is case 8.
+                                */
+                               VM_WARN_ON(end != next->vm_end);
+                               /*
+                                * remove_next == 3 means we're
+                                * removing "vma" and that to do so we
+                                * swapped "vma" and "next".
+                                */
+                               remove_next = 3;
+                               VM_WARN_ON(file != next->vm_file);
+                               swap(vma, next);
+                       } else {
+                               VM_WARN_ON(expand != vma);
+                               /*
+                                * case 1, 6, 7, remove_next == 2 is case 6,
+                                * remove_next == 1 is case 1 or 7.
+                                */
+                               remove_next = 1 + (end > next->vm_end);
+                               VM_WARN_ON(remove_next == 2 &&
+                                          end != next->vm_next->vm_end);
+                               VM_WARN_ON(remove_next == 1 &&
+                                          end != next->vm_end);
+                               /* trim end to next, for case 6 first pass */
+                               end = next->vm_end;
+                       }
+
                        exporter = next;
                        importer = vma;
+
+                       /*
+                        * If next doesn't have anon_vma, import from vma after
+                        * next, if the vma overlaps with it.
+                        */
+                       if (remove_next == 2 && !next->anon_vma)
+                               exporter = next->vm_next;
+
                } else if (end > next->vm_start) {
                        /*
                         * vma expands, overlapping part of the next:
@@ -649,6 +732,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
                        adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
                        exporter = next;
                        importer = vma;
+                       VM_WARN_ON(expand != importer);
                } else if (end < vma->vm_end) {
                        /*
                         * vma shrinks, and !insert tells it's not
@@ -658,6 +742,7 @@ again:                      remove_next = 1 + (end > next->vm_end);
                        adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
                        exporter = vma;
                        importer = next;
+                       VM_WARN_ON(expand != importer);
                }
 
                /*
@@ -674,6 +759,8 @@ again:                      remove_next = 1 + (end > next->vm_end);
                                return error;
                }
        }
+again:
+       vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
 
        if (file) {
                mapping = file->f_mapping;
@@ -695,14 +782,12 @@ again:                    remove_next = 1 + (end > next->vm_end);
                }
        }
 
-       vma_adjust_trans_huge(vma, start, end, adjust_next);
-
        anon_vma = vma->anon_vma;
        if (!anon_vma && adjust_next)
                anon_vma = next->anon_vma;
        if (anon_vma) {
-               VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
-                         anon_vma != next->anon_vma, next);
+               VM_WARN_ON(adjust_next && next->anon_vma &&
+                          anon_vma != next->anon_vma);
                anon_vma_lock_write(anon_vma);
                anon_vma_interval_tree_pre_update_vma(vma);
                if (adjust_next)
@@ -742,7 +827,19 @@ again:                     remove_next = 1 + (end > next->vm_end);
                 * vma_merge has merged next into vma, and needs
                 * us to remove next before dropping the locks.
                 */
-               __vma_unlink(mm, next, vma);
+               if (remove_next != 3)
+                       __vma_unlink_prev(mm, next, vma);
+               else
+                       /*
+                        * vma is not before next if they've been
+                        * swapped.
+                        *
+                        * pre-swap() next->vm_start was reduced so
+                        * tell validate_mm_rb to ignore pre-swap()
+                        * "next" (which is stored in post-swap()
+                        * "vma").
+                        */
+                       __vma_unlink_common(mm, next, NULL, false, vma);
                if (file)
                        __remove_shared_vm_struct(next, file, mapping);
        } else if (insert) {
@@ -794,13 +891,56 @@ again:                    remove_next = 1 + (end > next->vm_end);
                 * we must remove another next too. It would clutter
                 * up the code too much to do both in one go.
                 */
-               next = vma->vm_next;
-               if (remove_next == 2)
+               if (remove_next != 3) {
+                       /*
+                        * If "next" was removed and vma->vm_end was
+                        * expanded (up) over it, in turn
+                        * expanded (up) over it, then
+                        * "vma->vm_next" gap must be updated.
+                        */
+                       next = vma->vm_next;
+               } else {
+                       /*
+                        * For the scope of this comment, "next" and
+                        * "vma" are considered pre-swap(): if "vma" was
+                        * removed, next->vm_start was expanded (down)
+                        * over it and the "next" gap must be updated.
+                        * Because of the swap(), the post-swap() "vma"
+                        * actually points to pre-swap() "next"
+                        * (post-swap() "next", by contrast, is now a
+                        * dangling pointer).
+                        */
+                       next = vma;
+               }
+               if (remove_next == 2) {
+                       remove_next = 1;
+                       end = next->vm_end;
                        goto again;
+               }
                else if (next)
                        vma_gap_update(next);
-               else
-                       mm->highest_vm_end = end;
+               else {
+                       /*
+                        * If remove_next == 2 we obviously can't
+                        * reach this path.
+                        *
+                        * If remove_next == 3 we can't reach this
+                        * path because pre-swap() next is never
+                        * NULL. pre-swap() "next" is not being
+                        * removed and its next->vm_end is not altered
+                        * (and furthermore "end" already matches
+                        * next->vm_end in remove_next == 3).
+                        *
+                        * We reach this only in the remove_next == 1
+                        * case if the "next" vma that was removed was
+                        * the highest vma of the mm. However in such
+                        * case next->vm_end == "end" and the extended
+                        * "vma" has vma->vm_end == next->vm_end so
+                        * mm->highest_vm_end doesn't need any update
+                        * in remove_next == 1 case.
+                        */
+                       VM_WARN_ON(mm->highest_vm_end != end);
+               }
        }
        if (insert && file)
                uprobe_mmap(insert);
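
For mprotect case 6 (remove_next == 2) the two vmas that "vma" grows over are
removed one per pass: the first pass trims "end" down to next->vm_end, and
once that vma is freed the code jumps back to "again" with remove_next
downgraded to 1. A stand-alone sketch of that one-at-a-time absorption on a
toy range list (struct range and expand() are illustrative, not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct range {
	unsigned long start, end;
	struct range *next;
};

/* grow "r" to new_end, absorbing at most two successors one at a time,
 * mirroring remove_next = 1 + (end > next->vm_end) above */
static void expand(struct range *r, unsigned long new_end)
{
	unsigned long end = new_end;
	struct range *next = r->next;
	int remove_next = 1 + (end > next->end);

	if (remove_next == 2)
		end = next->end;	/* first pass stops at next->end */
again:
	r->end = end;
	r->next = next->next;
	free(next);
	if (remove_next == 2) {
		remove_next = 1;
		next = r->next;
		end = next->end;	/* second pass: the real end */
		goto again;
	}
}

int main(void)
{
	struct range *c = malloc(sizeof(*c));
	struct range *b = malloc(sizeof(*b));
	struct range a = { 0, 10, b };

	*b = (struct range){ 20, 30, c };
	*c = (struct range){ 30, 40, NULL };

	expand(&a, 40);		/* absorbs b, then c, in two passes */
	printf("[%lu, %lu) next=%p\n", a.start, a.end, (void *)a.next);
	return 0;
}
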
@@ -920,13 +1060,24 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  *    cannot merge    might become    might become    might become
  *                    PPNNNNNNNNNN    PPPPPPPPPPNN    PPPPPPPPPPPP 6 or
  *    mmap, brk or    case 4 below    case 5 below    PPPPPPPPXXXX 7 or
- *    mremap move:                                    PPPPNNNNNNNN 8
+ *    mremap move:                                    PPPPXXXXXXXX 8
  *        AAAA
  *    PPPP    NNNN    PPPPPPPPPPPP    PPPPPPPPNNNN    PPPPNNNNNNNN
  *    might become    case 1 below    case 2 below    case 3 below
  *
- * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
- * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
+ * It is important for case 8 that the vma NNNN overlapping the
+ * region AAAA is never going to be extended over XXXX. Instead XXXX must
+ * be extended in region AAAA and NNNN must be removed. This way in
+ * all cases where vma_merge succeeds, the moment vma_adjust drops the
+ * rmap_locks, the properties of the merged vma will already be
+ * correct for the whole merged range. Some of those properties like
+ * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
+ * be correct for the whole merged range immediately after the
+ * rmap_locks are released. Otherwise if XXXX would be removed and
+ * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * or other rmap walkers (if working on addresses beyond the "end"
+ * parameter) may establish ptes with the wrong permissions of NNNN
+ * instead of the right permissions of XXXX.
  */
 struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        struct vm_area_struct *prev, unsigned long addr,
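
The rewritten comment is the core rationale of this patch: in case 8 the
pre-existing vma XXXX must be the one extended over AAAA, so that the instant
the rmap locks are dropped, every pte in the merged range is governed by
XXXX's vm_flags/vm_page_prot. Merging itself is easy to observe from user
space; a sketch (assumes Linux /proc/self/maps, and that adjacent anonymous
mappings with identical properties merge, which is exactly what this code
path implements):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t pg = (size_t)sysconf(_SC_PAGESIZE);
	char line[256], prefix[32];
	char *base;
	FILE *f;

	/* reserve a two-page window, then map each half separately with
	 * identical protections: vma_merge() should fuse them */
	base = mmap(NULL, 2 * pg, PROT_NONE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (base == MAP_FAILED)
		return 1;
	mmap(base, pg, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
	mmap(base + pg, pg, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	snprintf(prefix, sizeof(prefix), "%lx-", (unsigned long)base);
	f = fopen("/proc/self/maps", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, prefix, strlen(prefix)))
			fputs(line, stdout);	/* one entry spanning both pages */
	fclose(f);
	return 0;
}
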
@@ -951,9 +1102,14 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
        else
                next = mm->mmap;
        area = next;
-       if (next && next->vm_end == end)                /* cases 6, 7, 8 */
+       if (area && area->vm_end == end)                /* cases 6, 7, 8 */
                next = next->vm_next;
 
+       /* verify some invariants that must be enforced by the caller */
+       VM_WARN_ON(prev && addr <= prev->vm_start);
+       VM_WARN_ON(area && end > area->vm_end);
+       VM_WARN_ON(addr >= end);
+
        /*
         * Can it merge with the predecessor?
         */
@@ -974,11 +1130,12 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               next->vm_end, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                                        next->vm_end, prev->vm_pgoff, NULL,
+                                        prev);
                } else                                  /* cases 2, 5, 7 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               end, prev->vm_pgoff, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                                        end, prev->vm_pgoff, NULL, prev);
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(prev, vm_flags);
@@ -994,11 +1151,18 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                             anon_vma, file, pgoff+pglen,
                                             vm_userfaultfd_ctx)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
-                       err = vma_adjust(prev, prev->vm_start,
-                               addr, prev->vm_pgoff, NULL);
-               else                                    /* cases 3, 8 */
-                       err = vma_adjust(area, addr, next->vm_end,
-                               next->vm_pgoff - pglen, NULL);
+                       err = __vma_adjust(prev, prev->vm_start,
+                                        addr, prev->vm_pgoff, NULL, next);
+               else {                                  /* cases 3, 8 */
+                       err = __vma_adjust(area, addr, next->vm_end,
+                                        next->vm_pgoff - pglen, NULL, next);
+                       /*
+                        * In case 3 area is already equal to next and
+                        * this is a noop, but in case 8 "area" has
+                        * been removed and next was expanded over it.
+                        */
+                       area = next;
+               }
                if (err)
                        return NULL;
                khugepaged_enter_vma_merge(area, vm_flags);
@@ -1370,7 +1534,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
  * to the private version (using protection_map[] without the
  * VM_SHARED bit).
  */
-int vma_wants_writenotify(struct vm_area_struct *vma)
+int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
 {
        vm_flags_t vm_flags = vma->vm_flags;
        const struct vm_operations_struct *vm_ops = vma->vm_ops;
@@ -1385,8 +1549,8 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 
        /* The open routine did something to the protections that pgprot_modify
         * won't preserve? */
-       if (pgprot_val(vma->vm_page_prot) !=
-           pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
+       if (pgprot_val(vm_page_prot) !=
+           pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
                return 0;
 
        /* Do we need to track softdirty? */
@@ -1897,8 +2061,19 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                return -ENOMEM;
 
        get_area = current->mm->get_unmapped_area;
-       if (file && file->f_op->get_unmapped_area)
-               get_area = file->f_op->get_unmapped_area;
+       if (file) {
+               if (file->f_op->get_unmapped_area)
+                       get_area = file->f_op->get_unmapped_area;
+       } else if (flags & MAP_SHARED) {
+               /*
+                * mmap_region() will call shmem_zero_setup() to create a file,
+                * so use shmem's get_unmapped_area in case it can be huge.
+                * do_mmap_pgoff() will clear pgoff, so match alignment.
+                */
+               pgoff = 0;
+               get_area = shmem_get_unmapped_area;
+       }
+
        addr = get_area(file, addr, len, pgoff, flags);
        if (IS_ERR_VALUE(addr))
                return addr;
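
With this hunk, anonymous MAP_SHARED requests (which mmap_region() later
backs with a shmem file via shmem_zero_setup()) are routed through
shmem_get_unmapped_area(), giving them a chance at huge-page-aligned
addresses. A hedged user-space probe (the 2MB alignment only materializes on
kernels built and configured for huge shmem/tmpfs pages; elsewhere the
address may be arbitrary):

#include <stdio.h>
#include <sys/mman.h>

#define HPAGE (2UL << 20)	/* assumption: x86-64 PMD size */

int main(void)
{
	/* anonymous shared memory is internally a shmem file */
	void *p = mmap(NULL, 4 * HPAGE, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	printf("addr %p, 2MB-aligned: %s\n", p,
	       ((unsigned long)p & (HPAGE - 1)) ? "no" : "yes");
	munmap(p, 4 * HPAGE);
	return 0;
}
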
@@ -2591,6 +2766,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                /* drop PG_Mlocked flag for over-mapped range */
                for (tmp = vma; tmp->vm_start >= start + size;
                                tmp = tmp->vm_next) {
+                       /*
+                        * Split pmd and munlock page on the border
+                        * of the range.
+                        */
+                       vma_adjust_trans_huge(tmp, start, start + size, 0);
+
                        munlock_vma_pages_range(tmp,
                                        max(tmp->vm_start, start),
                                        min(tmp->vm_end, start + size));
@@ -2625,16 +2806,18 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static int do_brk(unsigned long addr, unsigned long len)
+static int do_brk(unsigned long addr, unsigned long request)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
-       unsigned long flags;
+       unsigned long flags, len;
        struct rb_node **rb_link, *rb_parent;
        pgoff_t pgoff = addr >> PAGE_SHIFT;
        int error;
 
-       len = PAGE_ALIGN(len);
+       len = PAGE_ALIGN(request);
+       if (len < request)
+               return -ENOMEM;
        if (!len)
                return 0;
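
do_brk() now aligns the request into a separate variable and returns -ENOMEM
when PAGE_ALIGN() wrapped past ULONG_MAX; the old code would silently round a
near-ULONG_MAX request down to a tiny length and report success. A small
demonstration of the wraparound the new check catches (page_align() is a
local stand-in for the kernel's PAGE_ALIGN(), assuming 4KiB pages):

#include <limits.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumption: 4KiB pages */

/* the same rounding PAGE_ALIGN() performs */
static unsigned long page_align(unsigned long x)
{
	return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int main(void)
{
	unsigned long request = ULONG_MAX - 100;	/* absurd brk growth */
	unsigned long len = page_align(request);

	/* the addition wrapped: len is 0 here instead of huge */
	printf("request=%#lx aligned=%#lx wrapped=%d\n",
	       request, len, len < request);
	return 0;
}
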
 
@@ -3033,6 +3216,14 @@ out:
        return ERR_PTR(ret);
 }
 
+bool vma_is_special_mapping(const struct vm_area_struct *vma,
+       const struct vm_special_mapping *sm)
+{
+       return vma->vm_private_data == sm &&
+               (vma->vm_ops == &special_mapping_vmops ||
+                vma->vm_ops == &legacy_special_mapping_vmops);
+}
+
 /*
  * Called with mm->mmap_sem held for writing.
  * Insert a new vma covering the given region, with the given flags.
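
vma_is_special_mapping() deliberately matches both vm_private_data and one of
the two known vm_ops tables, so a vma whose private data merely happens to
equal the cookie cannot be misidentified; arch code can use it to recognize a
special mapping it installed (the vdso, for instance) without poking at
vm_ops directly. A user-space analogue of that double check (struct area and
the ops tables are illustrative, not kernel API):

#include <stdbool.h>
#include <stdio.h>

struct ops { void (*close)(void); };
static const struct ops special_ops, legacy_special_ops;

struct area {
	const struct ops *ops;
	const void *private_data;
};

/* match the cookie AND one of the two known vtables */
static bool area_is_special(const struct area *a, const void *cookie)
{
	return a->private_data == cookie &&
	       (a->ops == &special_ops || a->ops == &legacy_special_ops);
}

int main(void)
{
	static int cookie;
	struct area good = { &special_ops, &cookie };
	struct area fake = { NULL, &cookie };	/* right cookie, wrong ops */

	printf("%d %d\n", area_is_special(&good, &cookie),
	       area_is_special(&fake, &cookie));	/* prints: 1 0 */
	return 0;
}
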