mremap: enforce rmap src/dst vma ordering in case of vma_merge() succeeding in copy_vma()

author Andrea Arcangeli <aarcange@redhat.com>

Tue, 10 Jan 2012 23:08:05 +0000 (15:08 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Wed, 11 Jan 2012 00:30:44 +0000 (16:30 -0800)
author Andrea Arcangeli <aarcange@redhat.com>
Tue, 10 Jan 2012 23:08:05 +0000 (15:08 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 11 Jan 2012 00:30:44 +0000 (16:30 -0800)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h

index 2148b12..1afb995 100644 (file)
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -120,6 +120,7 @@ void anon_vma_init(void);   /* create anon_vma_cachep */
  int  anon_vma_prepare(struct vm_area_struct *);
  void unlink_anon_vmas(struct vm_area_struct *);
  int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
+void anon_vma_moveto_tail(struct vm_area_struct *);
  int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
  void __anon_vma_link(struct vm_area_struct *);
  
diff --git a/mm/mmap.c b/mm/mmap.c

index eae90af..adea3b8 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2322,13 +2322,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
         struct vm_area_struct *new_vma, *prev;
         struct rb_node **rb_link, *rb_parent;
         struct mempolicy *pol;
+       bool faulted_in_anon_vma = true;
  
         /*
          * If anonymous vma has not yet been faulted, update new pgoff
          * to match new location, to increase its chance of merging.
          */
-       if (!vma->vm_file && !vma->anon_vma)
+       if (unlikely(!vma->vm_file && !vma->anon_vma)) {
                 pgoff = addr >> PAGE_SHIFT;
+               faulted_in_anon_vma = false;
+       }
  
         find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
         new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
@@ -2337,9 +2340,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                 /*
                  * Source vma may have been merged into new_vma
                  */
-               if (vma_start >= new_vma->vm_start &&
-                   vma_start < new_vma->vm_end)
+               if (unlikely(vma_start >= new_vma->vm_start &&
+                            vma_start < new_vma->vm_end)) {
+                       /*
+                        * The only way we can get a vma_merge with
+                        * self during an mremap is if the vma hasn't
+                        * been faulted in yet and we were allowed to
+                        * reset the dst vma->vm_pgoff to the
+                        * destination address of the mremap to allow
+                        * the merge to happen. mremap must change the
+                        * vm_pgoff linearity between src and dst vmas
+                        * (in turn preventing a vma_merge) to be
+                        * safe. It is only safe to keep the vm_pgoff
+                        * linear if there are no pages mapped yet.
+                        */
+                       VM_BUG_ON(faulted_in_anon_vma);
                         *vmap = new_vma;
+               } else
+                       anon_vma_moveto_tail(new_vma);
         } else {
                 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
                 if (new_vma) {
diff --git a/mm/mremap.c b/mm/mremap.c

index d6959cb..87bb839 100644 (file)
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -220,6 +220,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
  
         moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
         if (moved_len < old_len) {
+               /*
+                * Before moving the page tables from the new vma to
+                * the old vma, we need to be sure the old vma is
+                * queued after new vma in the same_anon_vma list to
+                * prevent SMP races with rmap_walk (that could lead
+                * rmap_walk to miss some page table).
+                */
+               anon_vma_moveto_tail(vma);
+
                 /*
                  * On error, move entries back from new area to old,
                  * which will succeed since page tables still there,
diff --git a/mm/rmap.c b/mm/rmap.c

index a4fd368..a2e5ce1 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -271,6 +271,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
         return -ENOMEM;
  }
  
+/*
+ * Some rmap walks that need to find all ptes/hugepmds without false
+ * negatives (like migrate and split_huge_page) may run concurrently
+ * with operations that copy or move pagetables (like mremap() and
+ * fork()). To be safe, they depend on the anon_vma "same_anon_vma"
+ * list being in a certain order: the dst_vma must be placed after the
+ * src_vma in the list. This is always guaranteed by fork() but
+ * mremap() needs to call this function to enforce it in case the
+ * dst_vma isn't newly allocated and chained with the anon_vma_clone()
+ * function but just an extension of a pre-existing vma through
+ * vma_merge.
+ *
+ * NOTE: the same_anon_vma list can still be changed by other
+ * processes while mremap runs because mremap doesn't hold the
+ * anon_vma mutex to prevent modifications to the list while it
+ * runs. All we need to enforce is that the relative order of this
+ * process vmas isn't changing (we don't care about other vmas
+ * order). Each vma corresponds to an anon_vma_chain structure so
+ * there's no risk that other processes calling anon_vma_moveto_tail()
+ * and changing the same_anon_vma list under mremap() will screw with
+ * the relative order of this process vmas in the list, because
+ * they can't alter the order of any vma that belongs to this
+ * process. And there can't be another anon_vma_moveto_tail() running
+ * concurrently with mremap() coming from this process because we hold
+ * the mmap_sem for the whole mremap(). fork() ordering dependency
+ * also shouldn't be affected because fork() only cares that the
+ * parent vmas are placed in the list before the child vmas and
+ * anon_vma_moveto_tail() won't reorder vmas from either the fork()
+ * parent or child.
+ */
+void anon_vma_moveto_tail(struct vm_area_struct *dst)
+{
+       struct anon_vma_chain *pavc;
+       struct anon_vma *root = NULL;
+
+       list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
+               struct anon_vma *anon_vma = pavc->anon_vma;
+               VM_BUG_ON(pavc->vma != dst);
+               root = lock_anon_vma_root(root, anon_vma);
+               list_del(&pavc->same_anon_vma);
+               list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
+       }
+       unlock_anon_vma_root(root);
+}
+
  /*
   * Attach vma to its own anon_vma, as well as to the anon_vmas that
   * the corresponding VMA in the parent process is attached to.
author	Andrea Arcangeli <aarcange@redhat.com>
	Tue, 10 Jan 2012 23:08:05 +0000 (15:08 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Wed, 11 Jan 2012 00:30:44 +0000 (16:30 -0800)
include/linux/rmap.h		patch \| blob \| history
mm/mmap.c		patch \| blob \| history
mm/mremap.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history