net: drop tcp_memcontrol.c

[cascardo/linux.git] / mm / gup.c
diff --git a/mm/gup.c b/mm/gup.c

index 7017abe..b64a361 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -4,6 +4,7 @@
  #include <linux/spinlock.h>
  
  #include <linux/mm.h>
+#include <linux/memremap.h>
  #include <linux/pagemap.h>
  #include <linux/rmap.h>
  #include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
                 unsigned long address, pmd_t *pmd, unsigned int flags)
  {
         struct mm_struct *mm = vma->vm_mm;
+       struct dev_pagemap *pgmap = NULL;
         struct page *page;
         spinlock_t *ptl;
         pte_t *ptep, pte;
@@ -98,7 +100,17 @@ retry:
         }
  
         page = vm_normal_page(vma, address, pte);
-       if (unlikely(!page)) {
+       if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+               /*
+                * Only return device mapping pages in the FOLL_GET case since
+                * they are only valid while holding the pgmap reference.
+                */
+               pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+               if (pgmap)
+                       page = pte_page(pte);
+               else
+                       goto no_page;
+       } else if (unlikely(!page)) {
                 if (flags & FOLL_DUMP) {
                         /* Avoid special (like zero) pages in core dumps */
                         page = ERR_PTR(-EFAULT);
@@ -129,8 +141,15 @@ retry:
                 goto retry;
         }
  
-       if (flags & FOLL_GET)
+       if (flags & FOLL_GET) {
                 get_page(page);
+
+               /* drop the pgmap reference now that we hold the page */
+               if (pgmap) {
+                       put_dev_pagemap(pgmap);
+                       pgmap = NULL;
+               }
+       }
         if (flags & FOLL_TOUCH) {
                 if ((flags & FOLL_WRITE) &&
                     !pte_dirty(pte) && !PageDirty(page))
@@ -143,6 +162,10 @@ retry:
                 mark_page_accessed(page);
         }
         if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               /* Do not mlock pte-mapped THP */
+               if (PageTransCompound(page))
+                       goto out;
+
                 /*
                  * The preliminary mapping check is mainly to avoid the
                  * pointless overhead of lock_page on the ZERO_PAGE
@@ -233,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
         }
         if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
                 return no_page_table(vma, flags);
+       if (pmd_devmap(*pmd)) {
+               ptl = pmd_lock(mm, pmd);
+               page = follow_devmap_pmd(vma, address, pmd, flags);
+               spin_unlock(ptl);
+               if (page)
+                       return page;
+       }
         if (likely(!pmd_trans_huge(*pmd)))
                 return follow_page_pte(vma, address, pmd, flags);
  
@@ -241,13 +271,6 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
                 spin_unlock(ptl);
                 return follow_page_pte(vma, address, pmd, flags);
         }
-
-       if (unlikely(pmd_trans_splitting(*pmd))) {
-               spin_unlock(ptl);
-               wait_split_huge_page(vma->anon_vma, pmd);
-               return follow_page_pte(vma, address, pmd, flags);
-       }
-
         if (flags & FOLL_SPLIT) {
                 int ret;
                 page = pmd_page(*pmd);
@@ -595,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages);
   * @mm:                mm_struct of target mm
   * @address:   user address
   * @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked:  did we unlock the mmap_sem while retrying, may be NULL if caller
+ *             does not allow retry
   *
   * This is meant to be called in the specific scenario where for locking reasons
   * we try to access user memory in atomic context (within a pagefault_disable()
@@ -606,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages);
   * The main difference with get_user_pages() is that this function will
   * unconditionally call handle_mm_fault() which will in turn perform all the
   * necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
+ * get_user_pages() only guarantees to update these in the struct page.
   *
   * This is important for some architectures where those bits also gate the
   * access permission to the page because they are maintained in software.  On
   * such architectures, gup() will not be enough to make a subsequent access
   * succeed.
   *
- * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ * This function will not return with an unlocked mmap_sem, so it does not have
+ * the same semantics wrt the @mm->mmap_sem as filemap_fault() does.
   */
  int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
-                    unsigned long address, unsigned int fault_flags)
+                    unsigned long address, unsigned int fault_flags,
+                    bool *unlocked)
  {
         struct vm_area_struct *vma;
         vm_flags_t vm_flags;
-       int ret;
+       int ret, major = 0;
+
+       if (unlocked)
+               fault_flags |= FAULT_FLAG_ALLOW_RETRY;
  
+retry:
         vma = find_extend_vma(mm, address);
         if (!vma || address < vma->vm_start)
                 return -EFAULT;
@@ -631,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                 return -EFAULT;
  
         ret = handle_mm_fault(mm, vma, address, fault_flags);
+       major |= ret & VM_FAULT_MAJOR;
         if (ret & VM_FAULT_ERROR) {
                 if (ret & VM_FAULT_OOM)
                         return -ENOMEM;
@@ -640,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                         return -EFAULT;
                 BUG();
         }
+
+       if (ret & VM_FAULT_RETRY) {
+               down_read(&mm->mmap_sem);
+               if (!(fault_flags & FAULT_FLAG_TRIED)) {
+                       *unlocked = true;
+                       fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+                       fault_flags |= FAULT_FLAG_TRIED;
+                       goto retry;
+               }
+       }
+
         if (tsk) {
-               if (ret & VM_FAULT_MAJOR)
+               if (major)
                         tsk->maj_flt++;
                 else
                         tsk->min_flt++;
@@ -927,8 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
         gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
         if (vma->vm_flags & VM_LOCKONFAULT)
                 gup_flags &= ~FOLL_POPULATE;
-       if (vma->vm_flags & VM_LOCKED)
-               gup_flags |= FOLL_SPLIT;
         /*
          * We want to touch writable mappings with a write fault in order
          * to break COW, except for shared mappings because these don't COW
@@ -1068,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr)
   *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
   *      pages containing page tables.
   *
- *  *) THP splits will broadcast an IPI, this can be achieved by overriding
- *      pmdp_splitting_flush.
- *
   *  *) ptes can be read atomically by the architecture.
   *
   *  *) access_ok is sufficient to validate userspace address ranges.
@@ -1267,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                 pmd_t pmd = READ_ONCE(*pmdp);
  
                 next = pmd_addr_end(addr, end);
-               if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+               if (pmd_none(pmd))
                         return 0;
  
                 if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {