thp: allow mlocked THP again

author Kirill A. Shutemov <kirill.shutemov@linux.intel.com>

Sat, 16 Jan 2016 00:54:33 +0000 (16:54 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 16 Jan 2016 01:56:32 +0000 (17:56 -0800)
author Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Sat, 16 Jan 2016 00:54:33 +0000 (16:54 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 16 Jan 2016 01:56:32 +0000 (17:56 -0800)
diff --git a/mm/gup.c b/mm/gup.c

index 70d65e4..e95b0cb 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -143,6 +143,10 @@ retry:
                 mark_page_accessed(page);
         }
         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /* Do not mlock pte-mapped THP */
+               if (PageTransCompound(page))
+                       goto out;
+
                 /*
                  * The preliminary mapping check is mainly to avoid the
                  * pointless overhead of lock_page on the ZERO_PAGE
@@ -920,8 +924,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
         gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
         if (vma->vm_flags & VM_LOCKONFAULT)
                 gup_flags &= ~FOLL_POPULATE;
-       if (vma->vm_flags & VM_LOCKED)
-               gup_flags |= FOLL_SPLIT;
         /*
          * We want to touch writable mappings with a write fault in order
          * to break COW, except for shared mappings because these don't COW
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 4acf55b..f283cb7 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -874,8 +874,6 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
  
         if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
                 return VM_FAULT_FALLBACK;
-       if (vma->vm_flags & VM_LOCKED)
-               return VM_FAULT_FALLBACK;
         if (unlikely(anon_vma_prepare(vma)))
                 return VM_FAULT_OOM;
         if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
@@ -1344,7 +1342,20 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                         update_mmu_cache_pmd(vma, addr, pmd);
         }
         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
-               if (page->mapping && trylock_page(page)) {
+               /*
+                * We don't mlock() pte-mapped THPs. This way we can avoid
+                * leaking mlocked pages into non-VM_LOCKED VMAs.
+                *
+                * In most cases the pmd is the only mapping of the page as we
+                * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+                * writable private mappings in populate_vma_page_range().
+                *
+                * The only scenario when we have the page shared here is if we
+                * are mlocking a read-only mapping shared over fork(). We skip
+                * mlocking such pages.
+                */
+               if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+                               page->mapping && trylock_page(page)) {
                         lru_add_drain();
                         if (page->mapping)
                                 mlock_vma_page(page);
@@ -2209,8 +2220,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
         if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
             (vma->vm_flags & VM_NOHUGEPAGE))
                 return false;
-       if (vma->vm_flags & VM_LOCKED)
-               return false;
         if (!vma->anon_vma || vma->vm_ops)
                 return false;
         if (is_vma_temporary_stack(vma))
@@ -2851,14 +2860,28 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
  {
         spinlock_t *ptl;
         struct mm_struct *mm = vma->vm_mm;
+       struct page *page = NULL;
         unsigned long haddr = address & HPAGE_PMD_MASK;
  
         mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
         ptl = pmd_lock(mm, pmd);
-       if (likely(pmd_trans_huge(*pmd)))
-               __split_huge_pmd_locked(vma, pmd, haddr, false);
+       if (unlikely(!pmd_trans_huge(*pmd)))
+               goto out;
+       page = pmd_page(*pmd);
+       __split_huge_pmd_locked(vma, pmd, haddr, false);
+       if (PageMlocked(page))
+               get_page(page);
+       else
+               page = NULL;
+out:
         spin_unlock(ptl);
         mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+       if (page) {
+               lock_page(page);
+               munlock_vma_page(page);
+               unlock_page(page);
+               put_page(page);
+       }
  }
  
  static void split_huge_pmd_address(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c

index 9d5b408..5a73c6e 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2160,15 +2160,15 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
  
         pte_unmap_unlock(page_table, ptl);
         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-       /* THP pages are never mlocked */
-       if (old_page && !PageTransCompound(old_page)) {
+       if (old_page) {
                 /*
                  * Don't let another task, with possibly unlocked vma,
                  * keep the mlocked page.
                  */
                 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                         lock_page(old_page);    /* LRU manipulation */
-                       munlock_vma_page(old_page);
+                       if (PageMlocked(old_page))
+                               munlock_vma_page(old_page);
                         unlock_page(old_page);
                 }
                 page_cache_release(old_page);
diff --git a/mm/mlock.c b/mm/mlock.c

index c6b139a..9197b67 100644 (file)
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
         /* Serialize with page migration */
         BUG_ON(!PageLocked(page));
  
+       VM_BUG_ON_PAGE(PageTail(page), page);
+       VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
         if (!TestSetPageMlocked(page)) {
                 mod_zone_page_state(page_zone(page), NR_MLOCK,
                                     hpage_nr_pages(page));
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page)
         /* For try_to_munlock() and to serialize with page migration */
         BUG_ON(!PageLocked(page));
  
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
         /*
          * Serialize with any parallel __split_huge_page_refcount() which
          * might otherwise copy PageMlocked to part of the tail pages before
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
                 if (!page || page_zone_id(page) != zoneid)
                         break;
  
+               /*
+                * Do not use pagevec for PTE-mapped THP,
+                * munlock_vma_pages_range() will handle them.
+                */
+               if (PageTransCompound(page))
+                       break;
+
                 get_page(page);
                 /*
                  * Increase the address that will be returned *before* the
@@ -443,29 +455,43 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
                 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
                                 &page_mask);
  
-               if (page && !IS_ERR(page) && !PageTransCompound(page)) {
-                       /*
-                        * Non-huge pages are handled in batches via
-                        * pagevec. The pin from follow_page_mask()
-                        * prevents them from collapsing by THP.
-                        */
-                       pagevec_add(&pvec, page);
-                       zone = page_zone(page);
-                       zoneid = page_zone_id(page);
+               if (page && !IS_ERR(page)) {
+                       if (PageTransTail(page)) {
+                               VM_BUG_ON_PAGE(PageMlocked(page), page);
+                               put_page(page); /* follow_page_mask() */
+                       } else if (PageTransHuge(page)) {
+                               lock_page(page);
+                               /*
+                                * Any THP page found by follow_page_mask() may
+                                * have gotten split before reaching
+                                * munlock_vma_page(), so we need to recompute
+                                * the page_mask here.
+                                */
+                               page_mask = munlock_vma_page(page);
+                               unlock_page(page);
+                               put_page(page); /* follow_page_mask() */
+                       } else {
+                               /*
+                                * Non-huge pages are handled in batches via
+                                * pagevec. The pin from follow_page_mask()
+                                * prevents them from collapsing by THP.
+                                */
+                               pagevec_add(&pvec, page);
+                               zone = page_zone(page);
+                               zoneid = page_zone_id(page);
  
-                       /*
-                        * Try to fill the rest of pagevec using fast
-                        * pte walk. This will also update start to
-                        * the next page to process. Then munlock the
-                        * pagevec.
-                        */
-                       start = __munlock_pagevec_fill(&pvec, vma,
-                                       zoneid, start, end);
-                       __munlock_pagevec(&pvec, zone);
-                       goto next;
+                               /*
+                                * Try to fill the rest of pagevec using fast
+                                * pte walk. This will also update start to
+                                * the next page to process. Then munlock the
+                                * pagevec.
+                                */
+                               start = __munlock_pagevec_fill(&pvec, vma,
+                                               zoneid, start, end);
+                               __munlock_pagevec(&pvec, zone);
+                               goto next;
+                       }
                 }
-               /* It's a bug to munlock in the middle of a THP page */
-               VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
                 page_increm = 1 + page_mask;
                 start += page_increm * PAGE_SIZE;
  next:
diff --git a/mm/rmap.c b/mm/rmap.c

index 84271cc..31d8866 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1282,6 +1282,9 @@ static void page_remove_anon_compound_rmap(struct page *page)
                 nr = HPAGE_PMD_NR;
         }
  
+       if (unlikely(PageMlocked(page)))
+               clear_page_mlock(page);
+
         if (nr) {
                 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
                 deferred_split_huge_page(page);
diff --git a/mm/swap.c b/mm/swap.c

index 3d65480..abffc33 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -358,6 +358,7 @@ static void __lru_cache_activate_page(struct page *page)
   */
  void mark_page_accessed(struct page *page)
  {
+       page = compound_head(page);
         if (!PageActive(page) && !PageUnevictable(page) &&
                         PageReferenced(page)) {
author	Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
	Sat, 16 Jan 2016 00:54:33 +0000 (16:54 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 16 Jan 2016 01:56:32 +0000 (17:56 -0800)
mm/gup.c		patch | blob | history
mm/huge_memory.c		patch | blob | history
mm/memory.c		patch | blob | history
mm/mlock.c		patch | blob | history
mm/rmap.c		patch | blob | history
mm/swap.c		patch | blob | history