rmap: drop support of non-linear mappings

author Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tue, 10 Feb 2015 22:09:59 +0000 (14:09 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 10 Feb 2015 22:30:31 +0000 (14:30 -0800)

diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt

index d79b008..3f9f808 100644 (file)
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -317,10 +317,10 @@ maps this page at its virtual address.
         about doing this.
  
         The idea is, first at flush_dcache_page() time, if
-       page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear
-       an empty list, just mark the architecture private page flag bit.
-       Later, in update_mmu_cache(), a check is made of this flag bit,
-       and if set the flush is done and the flag bit is cleared.
+       page->mapping->i_mmap is an empty tree, just mark the architecture
+       private page flag bit.  Later, in update_mmu_cache(), a check is
+       made of this flag bit, and if set the flush is done and the flag
+       bit is cleared.
  
         IMPORTANT NOTE: It is often important, if you defer the flush,
                         that the actual flush occurs on the same CPU
diff --git a/fs/inode.c b/fs/inode.c

index aa149e7..c760fac 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -355,7 +355,6 @@ void address_space_init_once(struct address_space *mapping)
         INIT_LIST_HEAD(&mapping->private_list);
         spin_lock_init(&mapping->private_lock);
         mapping->i_mmap = RB_ROOT;
-       INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
  }
  EXPORT_SYMBOL(address_space_init_once);
  
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 47f557c..60acab2 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -401,7 +401,6 @@ struct address_space {
         spinlock_t              tree_lock;      /* and lock protecting it */
         atomic_t                i_mmap_writable;/* count VM_SHARED mappings */
         struct rb_root          i_mmap;         /* tree of private and shared mappings */
-       struct list_head        i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
         struct rw_semaphore     i_mmap_rwsem;   /* protect tree, count, list */
         /* Protected by tree_lock together with the radix tree */
         unsigned long           nrpages;        /* number of total pages */
@@ -493,8 +492,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping)
   */
  static inline int mapping_mapped(struct address_space *mapping)
  {
-       return  !RB_EMPTY_ROOT(&mapping->i_mmap) ||
-               !list_empty(&mapping->i_mmap_nonlinear);
+       return  !RB_EMPTY_ROOT(&mapping->i_mmap);
  }
  
  /*
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 2ddd9d1..18391ee 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1796,12 +1796,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
         for (vma = vma_interval_tree_iter_first(root, start, last);     \
              vma; vma = vma_interval_tree_iter_next(vma, start, last))
  
-static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
-                                       struct list_head *list)
-{
-       list_add_tail(&vma->shared.nonlinear, list);
-}
-
  void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                    struct rb_root *root);
  void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index 6d34aa2..3b1d20f 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -273,15 +273,13 @@ struct vm_area_struct {
  
         /*
          * For areas with an address space and backing store,
-        * linkage into the address_space->i_mmap interval tree, or
-        * linkage of vma in the address_space->i_mmap_nonlinear list.
+        * linkage into the address_space->i_mmap interval tree.
          */
         union {
                 struct {
                         struct rb_node rb;
                         unsigned long rb_subtree_last;
                 } linear;
-               struct list_head nonlinear;
         } shared;
  
         /*
diff --git a/include/linux/rmap.h b/include/linux/rmap.h

index d9d7e7e..b38f559 100644 (file)
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -246,7 +246,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
   * arg: passed to rmap_one() and invalid_vma()
   * rmap_one: executed on each vma where page is mapped
   * done: for checking traversing termination condition
- * file_nonlinear: for handling file nonlinear mapping
   * anon_lock: for getting anon_lock by optimized way rather than default
   * invalid_vma: for skipping uninterested vma
   */
@@ -255,7 +254,6 @@ struct rmap_walk_control {
         int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
                                         unsigned long addr, void *arg);
         int (*done)(struct page *page);
-       int (*file_nonlinear)(struct page *, struct address_space *, void *arg);
         struct anon_vma *(*anon_lock)(struct page *page);
         bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
  };
diff --git a/kernel/fork.c b/kernel/fork.c

index 4dc2dda..b379d9a 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                                 atomic_inc(&mapping->i_mmap_writable);
                         flush_dcache_mmap_lock(mapping);
                         /* insert tmp into the share list, just after mpnt */
-                       if (unlikely(tmp->vm_flags & VM_NONLINEAR))
-                               vma_nonlinear_insert(tmp,
-                                               &mapping->i_mmap_nonlinear);
-                       else
-                               vma_interval_tree_insert_after(tmp, mpnt,
-                                                       &mapping->i_mmap);
+                       vma_interval_tree_insert_after(tmp, mpnt,
+                                       &mapping->i_mmap);
                         flush_dcache_mmap_unlock(mapping);
                         i_mmap_unlock_write(mapping);
                 }
diff --git a/mm/migrate.c b/mm/migrate.c

index 344cdf6..6e284bc 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -178,37 +178,6 @@ out:
         return SWAP_AGAIN;
  }
  
-/*
- * Congratulations to trinity for discovering this bug.
- * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
- * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
- * replace the specified range by file ptes throughout (maybe populated after).
- * If page migration finds a page within that range, while it's still located
- * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
- * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
- * But if the migrating page is in a part of the vma outside the range to be
- * remapped, then it will not be cleared, and remove_migration_ptes() needs to
- * deal with it.  Fortunately, this part of the vma is of course still linear,
- * so we just need to use linear location on the nonlinear list.
- */
-static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
-               struct address_space *mapping, void *arg)
-{
-       struct vm_area_struct *vma;
-       /* hugetlbfs does not support remap_pages, so no huge pgoff worries */
-       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-       unsigned long addr;
-
-       list_for_each_entry(vma,
-               &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
-               addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-               if (addr >= vma->vm_start && addr < vma->vm_end)
-                       remove_migration_pte(page, vma, addr, arg);
-       }
-       return SWAP_AGAIN;
-}
-
  /*
   * Get rid of all migration entries and replace them by
   * references to the indicated page.
@@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new)
         struct rmap_walk_control rwc = {
                 .rmap_one = remove_migration_pte,
                 .arg = old,
-               .file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
         };
  
         rmap_walk(new, &rwc);
diff --git a/mm/mmap.c b/mm/mmap.c

index e023dc5..14d8466 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                 mapping_unmap_writable(mapping);
  
         flush_dcache_mmap_lock(mapping);
-       if (unlikely(vma->vm_flags & VM_NONLINEAR))
-               list_del_init(&vma->shared.nonlinear);
-       else
-               vma_interval_tree_remove(vma, &mapping->i_mmap);
+       vma_interval_tree_remove(vma, &mapping->i_mmap);
         flush_dcache_mmap_unlock(mapping);
  }
  
@@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
                         atomic_inc(&mapping->i_mmap_writable);
  
                 flush_dcache_mmap_lock(mapping);
-               if (unlikely(vma->vm_flags & VM_NONLINEAR))
-                       vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-               else
-                       vma_interval_tree_insert(vma, &mapping->i_mmap);
+               vma_interval_tree_insert(vma, &mapping->i_mmap);
                 flush_dcache_mmap_unlock(mapping);
         }
  }
@@ -789,14 +783,11 @@ again:                    remove_next = 1 + (end > next->vm_end);
  
         if (file) {
                 mapping = file->f_mapping;
-               if (!(vma->vm_flags & VM_NONLINEAR)) {
-                       root = &mapping->i_mmap;
-                       uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+               root = &mapping->i_mmap;
+               uprobe_munmap(vma, vma->vm_start, vma->vm_end);
  
-                       if (adjust_next)
-                               uprobe_munmap(next, next->vm_start,
-                                                       next->vm_end);
-               }
+               if (adjust_next)
+                       uprobe_munmap(next, next->vm_start, next->vm_end);
  
                 i_mmap_lock_write(mapping);
                 if (insert) {
@@ -3177,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
   *
   * mmap_sem in write mode is required in order to block all operations
   * that could modify pagetables and free pages without need of
- * altering the vma layout (for example populate_range() with
- * nonlinear vmas). It's also needed in write mode to avoid new
+ * altering the vma layout. It's also needed in write mode to avoid new
   * anon_vmas to be associated with existing vmas.
   *
   * A single task can't take more than one mm_take_all_locks() in a row
diff --git a/mm/rmap.c b/mm/rmap.c

index 71cd5bd..70b3249 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
                 if (!vma->anon_vma || !page__anon_vma ||
                     vma->anon_vma->root != page__anon_vma->root)
                         return -EFAULT;
-       } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
-               if (!vma->vm_file ||
-                   vma->vm_file->f_mapping != page->mapping)
+       } else if (page->mapping) {
+               if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
                         return -EFAULT;
         } else
                 return -EFAULT;
@@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                 if (pte_soft_dirty(pteval))
                         swp_pte = pte_swp_mksoft_dirty(swp_pte);
                 set_pte_at(mm, address, pte, swp_pte);
-               BUG_ON(pte_file(*pte));
         } else if (IS_ENABLED(CONFIG_MIGRATION) &&
                    (flags & TTU_MIGRATION)) {
                 /* Establish migration entry for a file page */
@@ -1316,211 +1314,6 @@ out_mlock:
         return ret;
  }
  
-/*
- * objrmap doesn't work for nonlinear VMAs because the assumption that
- * offset-into-file correlates with offset-into-virtual-addresses does not hold.
- * Consequently, given a particular page and its ->index, we cannot locate the
- * ptes which are mapping that page without an exhaustive linear search.
- *
- * So what this code does is a mini "virtual scan" of each nonlinear VMA which
- * maps the file to which the target page belongs.  The ->vm_private_data field
- * holds the current cursor into that scan.  Successive searches will circulate
- * around the vma's virtual address space.
- *
- * So as more replacement pressure is applied to the pages in a nonlinear VMA,
- * more scanning pressure is placed against them as well.   Eventually pages
- * will become fully unmapped and are eligible for eviction.
- *
- * For very sparsely populated VMAs this is a little inefficient - chances are
- * there there won't be many ptes located within the scan cluster.  In this case
- * maybe we could scan further - to the end of the pte page, perhaps.
- *
- * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
- * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
- * rather than unmapping them.  If we encounter the "check_page" that vmscan is
- * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
- */
-#define CLUSTER_SIZE   min(32*PAGE_SIZE, PMD_SIZE)
-#define CLUSTER_MASK   (~(CLUSTER_SIZE - 1))
-
-static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
-               struct vm_area_struct *vma, struct page *check_page)
-{
-       struct mm_struct *mm = vma->vm_mm;
-       pmd_t *pmd;
-       pte_t *pte;
-       pte_t pteval;
-       spinlock_t *ptl;
-       struct page *page;
-       unsigned long address;
-       unsigned long mmun_start;       /* For mmu_notifiers */
-       unsigned long mmun_end;         /* For mmu_notifiers */
-       unsigned long end;
-       int ret = SWAP_AGAIN;
-       int locked_vma = 0;
-
-       address = (vma->vm_start + cursor) & CLUSTER_MASK;
-       end = address + CLUSTER_SIZE;
-       if (address < vma->vm_start)
-               address = vma->vm_start;
-       if (end > vma->vm_end)
-               end = vma->vm_end;
-
-       pmd = mm_find_pmd(mm, address);
-       if (!pmd)
-               return ret;
-
-       mmun_start = address;
-       mmun_end   = end;
-       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
-       /*
-        * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
-        * keep the sem while scanning the cluster for mlocking pages.
-        */
-       if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
-               locked_vma = (vma->vm_flags & VM_LOCKED);
-               if (!locked_vma)
-                       up_read(&vma->vm_mm->mmap_sem); /* don't need it */
-       }
-
-       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-       /* Update high watermark before we lower rss */
-       update_hiwater_rss(mm);
-
-       for (; address < end; pte++, address += PAGE_SIZE) {
-               if (!pte_present(*pte))
-                       continue;
-               page = vm_normal_page(vma, address, *pte);
-               BUG_ON(!page || PageAnon(page));
-
-               if (locked_vma) {
-                       if (page == check_page) {
-                               /* we know we have check_page locked */
-                               mlock_vma_page(page);
-                               ret = SWAP_MLOCK;
-                       } else if (trylock_page(page)) {
-                               /*
-                                * If we can lock the page, perform mlock.
-                                * Otherwise leave the page alone, it will be
-                                * eventually encountered again later.
-                                */
-                               mlock_vma_page(page);
-                               unlock_page(page);
-                       }
-                       continue;       /* don't unmap */
-               }
-
-               /*
-                * No need for _notify because we're within an
-                * mmu_notifier_invalidate_range_ {start|end} scope.
-                */
-               if (ptep_clear_flush_young(vma, address, pte))
-                       continue;
-
-               /* Nuke the page table entry. */
-               flush_cache_page(vma, address, pte_pfn(*pte));
-               pteval = ptep_clear_flush_notify(vma, address, pte);
-
-               /* If nonlinear, store the file page offset in the pte. */
-               if (page->index != linear_page_index(vma, address)) {
-                       pte_t ptfile = pgoff_to_pte(page->index);
-                       if (pte_soft_dirty(pteval))
-                               ptfile = pte_file_mksoft_dirty(ptfile);
-                       set_pte_at(mm, address, pte, ptfile);
-               }
-
-               /* Move the dirty bit to the physical page now the pte is gone. */
-               if (pte_dirty(pteval))
-                       set_page_dirty(page);
-
-               page_remove_rmap(page);
-               page_cache_release(page);
-               dec_mm_counter(mm, MM_FILEPAGES);
-               (*mapcount)--;
-       }
-       pte_unmap_unlock(pte - 1, ptl);
-       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-       if (locked_vma)
-               up_read(&vma->vm_mm->mmap_sem);
-       return ret;
-}
-
-static int try_to_unmap_nonlinear(struct page *page,
-               struct address_space *mapping, void *arg)
-{
-       struct vm_area_struct *vma;
-       int ret = SWAP_AGAIN;
-       unsigned long cursor;
-       unsigned long max_nl_cursor = 0;
-       unsigned long max_nl_size = 0;
-       unsigned int mapcount;
-
-       list_for_each_entry(vma,
-               &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
-               cursor = (unsigned long) vma->vm_private_data;
-               if (cursor > max_nl_cursor)
-                       max_nl_cursor = cursor;
-               cursor = vma->vm_end - vma->vm_start;
-               if (cursor > max_nl_size)
-                       max_nl_size = cursor;
-       }
-
-       if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
-               return SWAP_FAIL;
-       }
-
-       /*
-        * We don't try to search for this page in the nonlinear vmas,
-        * and page_referenced wouldn't have found it anyway.  Instead
-        * just walk the nonlinear vmas trying to age and unmap some.
-        * The mapcount of the page we came in with is irrelevant,
-        * but even so use it as a guide to how hard we should try?
-        */
-       mapcount = page_mapcount(page);
-       if (!mapcount)
-               return ret;
-
-       cond_resched();
-
-       max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
-       if (max_nl_cursor == 0)
-               max_nl_cursor = CLUSTER_SIZE;
-
-       do {
-               list_for_each_entry(vma,
-                       &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
-                       cursor = (unsigned long) vma->vm_private_data;
-                       while (cursor < max_nl_cursor &&
-                               cursor < vma->vm_end - vma->vm_start) {
-                               if (try_to_unmap_cluster(cursor, &mapcount,
-                                               vma, page) == SWAP_MLOCK)
-                                       ret = SWAP_MLOCK;
-                               cursor += CLUSTER_SIZE;
-                               vma->vm_private_data = (void *) cursor;
-                               if ((int)mapcount <= 0)
-                                       return ret;
-                       }
-                       vma->vm_private_data = (void *) max_nl_cursor;
-               }
-               cond_resched();
-               max_nl_cursor += CLUSTER_SIZE;
-       } while (max_nl_cursor <= max_nl_size);
-
-       /*
-        * Don't loop forever (perhaps all the remaining pages are
-        * in locked vmas).  Reset cursor on all unreserved nonlinear
-        * vmas, now forgetting on which ones it had fallen behind.
-        */
-       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
-               vma->vm_private_data = NULL;
-
-       return ret;
-}
-
  bool is_vma_temporary_stack(struct vm_area_struct *vma)
  {
         int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
                 .rmap_one = try_to_unmap_one,
                 .arg = (void *)flags,
                 .done = page_not_mapped,
-               .file_nonlinear = try_to_unmap_nonlinear,
                 .anon_lock = page_lock_anon_vma_read,
         };
  
@@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page)
                 .rmap_one = try_to_unmap_one,
                 .arg = (void *)TTU_MUNLOCK,
                 .done = page_not_mapped,
-               /*
-                * We don't bother to try to find the munlocked page in
-                * nonlinears. It's costly. Instead, later, page reclaim logic
-                * may call try_to_unmap() and recover PG_mlocked lazily.
-                */
-               .file_nonlinear = NULL,
                 .anon_lock = page_lock_anon_vma_read,
  
         };
@@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
                         goto done;
         }
  
-       if (!rwc->file_nonlinear)
-               goto done;
-
-       if (list_empty(&mapping->i_mmap_nonlinear))
-               goto done;
-
-       ret = rwc->file_nonlinear(page, mapping, rwc->arg);
  done:
         i_mmap_unlock_read(mapping);
         return ret;
diff --git a/mm/swap.c b/mm/swap.c

index 8a12b33..5b30872 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1140,10 +1140,8 @@ void __init swap_setup(void)
  
         if (bdi_init(swapper_spaces[0].backing_dev_info))
                 panic("Failed to init swap bdi");
-       for (i = 0; i < MAX_SWAPFILES; i++) {
+       for (i = 0; i < MAX_SWAPFILES; i++)
                 spin_lock_init(&swapper_spaces[i].tree_lock);
-               INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
-       }
  #endif
  
         /* Use a smaller cluster for small-memory machines */
author	Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
	Tue, 10 Feb 2015 22:09:59 +0000 (14:09 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 10 Feb 2015 22:30:31 +0000 (14:30 -0800)
Documentation/cachetlb.txt		patch | blob | history
fs/inode.c		patch | blob | history
include/linux/fs.h		patch | blob | history
include/linux/mm.h		patch | blob | history
include/linux/mm_types.h		patch | blob | history
include/linux/rmap.h		patch | blob | history
kernel/fork.c		patch | blob | history
mm/migrate.c		patch | blob | history
mm/mmap.c		patch | blob | history
mm/rmap.c		patch | blob | history
mm/swap.c		patch | blob | history