Merge tag 'v3.15-rc1' into perf/urgent
diff --git a/mm/memory.c b/mm/memory.c
index 22dfa61..d0f0bef 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -60,6 +60,7 @@
 #include <linux/migrate.h>
 #include <linux/string.h>
 #include <linux/dma-debug.h>
+#include <linux/debugfs.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -1320,9 +1321,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
                         * It is undesirable to test vma->vm_file as it
                         * should be non-null for valid hugetlb area.
                         * However, vm_file will be NULL in the error
-                        * cleanup path of do_mmap_pgoff. When
+                        * cleanup path of mmap_region. When
                         * hugetlbfs ->mmap method fails,
-                        * do_mmap_pgoff() nullifies vma->vm_file
+                        * mmap_region() nullifies vma->vm_file
                         * before calling this function to clean up.
                         * Since no pte has actually been setup, it is
                         * safe to do nothing in this case.
@@ -1705,15 +1706,6 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
        VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 
-       /* 
-        * Require read or write permissions.
-        * If FOLL_FORCE is set, we only require the "MAY" flags.
-        */
-       vm_flags  = (gup_flags & FOLL_WRITE) ?
-                       (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-       vm_flags &= (gup_flags & FOLL_FORCE) ?
-                       (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
-
        /*
         * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
         * would be called on PROT_NONE ranges. We must never invoke
@@ -1741,7 +1733,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
                        /* user gate pages are read-only */
                        if (gup_flags & FOLL_WRITE)
-                               return i ? : -EFAULT;
+                               goto efault;
                        if (pg > TASK_SIZE)
                                pgd = pgd_offset_k(pg);
                        else
@@ -1751,12 +1743,12 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        BUG_ON(pud_none(*pud));
                        pmd = pmd_offset(pud, pg);
                        if (pmd_none(*pmd))
-                               return i ? : -EFAULT;
+                               goto efault;
                        VM_BUG_ON(pmd_trans_huge(*pmd));
                        pte = pte_offset_map(pmd, pg);
                        if (pte_none(*pte)) {
                                pte_unmap(pte);
-                               return i ? : -EFAULT;
+                               goto efault;
                        }
                        vma = get_gate_vma(mm);
                        if (pages) {
@@ -1769,7 +1761,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                                page = pte_page(*pte);
                                        else {
                                                pte_unmap(pte);
-                                               return i ? : -EFAULT;
+                                               goto efault;
                                        }
                                }
                                pages[i] = page;
@@ -1780,10 +1772,42 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        goto next_page;
                }
 
-               if (!vma ||
-                   (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
-                   !(vm_flags & vma->vm_flags))
-                       return i ? : -EFAULT;
+               if (!vma)
+                       goto efault;
+               vm_flags = vma->vm_flags;
+               if (vm_flags & (VM_IO | VM_PFNMAP))
+                       goto efault;
+
+               if (gup_flags & FOLL_WRITE) {
+                       if (!(vm_flags & VM_WRITE)) {
+                               if (!(gup_flags & FOLL_FORCE))
+                                       goto efault;
+                               /*
+                                * We used to let the write,force case do COW
+                                * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so
+                                * ptrace could set a breakpoint in a read-only
+                                * mapping of an executable, without corrupting
+                                * the file (yet only when that file had been
+                                * opened for writing!).  Anon pages in shared
+                                * mappings are surprising: now just reject it.
+                                */
+                               if (!is_cow_mapping(vm_flags)) {
+                                       WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+                                       goto efault;
+                               }
+                       }
+               } else {
+                       if (!(vm_flags & VM_READ)) {
+                               if (!(gup_flags & FOLL_FORCE))
+                                       goto efault;
+                               /*
+                                * Is there actually any vma we can reach here
+                                * which does not have VM_MAYREAD set?
+                                */
+                               if (!(vm_flags & VM_MAYREAD))
+                                       goto efault;
+                       }
+               }
 
                if (is_vm_hugetlb_page(vma)) {
                        i = follow_hugetlb_page(mm, vma, pages, vmas,
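The hunk above replaces the old up-front vm_flags mask with an explicit per-vma check: a plain read or write needs VM_READ or VM_WRITE, FOLL_FORCE falls back to the MAY* bits, and a forced write is now honoured only for private (COW-able) mappings. Below is a minimal userspace sketch of that decision, not kernel code; the flag values mirror the kernel's but do not affect the logic, and is_cow_mapping() follows the kernel's definition (MAYWRITE set, SHARED clear).

#include <stdbool.h>
#include <stdio.h>

#define VM_READ     0x01
#define VM_WRITE    0x02
#define VM_SHARED   0x08
#define VM_MAYREAD  0x10
#define VM_MAYWRITE 0x20

#define FOLL_WRITE  0x01
#define FOLL_FORCE  0x10

static bool is_cow_mapping(unsigned long vm_flags)
{
        return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

/* true: __get_user_pages() proceeds; false: it takes the efault path */
static bool gup_vma_permitted(unsigned int gup_flags, unsigned long vm_flags)
{
        if (gup_flags & FOLL_WRITE) {
                if (vm_flags & VM_WRITE)
                        return true;
                if (!(gup_flags & FOLL_FORCE))
                        return false;
                /* forced write: only COW-able private mappings qualify now */
                return is_cow_mapping(vm_flags);
        }
        if (vm_flags & VM_READ)
                return true;
        if (!(gup_flags & FOLL_FORCE))
                return false;
        return !!(vm_flags & VM_MAYREAD);
}

int main(void)
{
        /* write+force into a shared, read-only mapping: rejected (prints 0) */
        printf("%d\n", gup_vma_permitted(FOLL_WRITE | FOLL_FORCE,
                       VM_READ | VM_MAYREAD | VM_MAYWRITE | VM_SHARED));
        return 0;
}

This is the behavioural change called out in the comment: the old write+force COW into a VM_MAYWRITE shared mapping is refused (the kernel additionally fires WARN_ON_ONCE there), instead of silently planting an anonymous page behind the file.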
@@ -1837,7 +1861,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                                        return -EFAULT;
                                        }
                                        if (ret & VM_FAULT_SIGBUS)
-                                               return i ? i : -EFAULT;
+                                               goto efault;
                                        BUG();
                                }
 
@@ -1895,6 +1919,8 @@ next_page:
                } while (nr_pages && start < vma->vm_end);
        } while (nr_pages);
        return i;
+efault:
+       return i ? : -EFAULT;
 }
 EXPORT_SYMBOL(__get_user_pages);
 
@@ -1962,9 +1988,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
  * @start:     starting user address
  * @nr_pages:  number of pages from start to pin
  * @write:     whether pages will be written to by the caller
- * @force:     whether to force write access even if user mapping is
- *             readonly. This will result in the page being COWed even
- *             in MAP_SHARED mappings. You do not want this.
+ * @force:     whether to force access even when user mapping is currently
+ *             protected (but never forces write access to shared mapping).
  * @pages:     array that receives pointers to the pages pinned.
  *             Should be at least nr_pages long. Or NULL, if caller
  *             only intends to ensure the pages are faulted in.
@@ -2586,6 +2611,38 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
                copy_user_highpage(dst, src, va, vma);
 }
 
+/*
+ * Notify the address space that the page is about to become writable so that
+ * it can prohibit this or wait for the page to get into an appropriate state.
+ *
+ * We do this without the lock held, so that it can sleep if it needs to.
+ */
+static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+              unsigned long address)
+{
+       struct vm_fault vmf;
+       int ret;
+
+       vmf.virtual_address = (void __user *)(address & PAGE_MASK);
+       vmf.pgoff = page->index;
+       vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+       vmf.page = page;
+
+       ret = vma->vm_ops->page_mkwrite(vma, &vmf);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+               return ret;
+       if (unlikely(!(ret & VM_FAULT_LOCKED))) {
+               lock_page(page);
+               if (!page->mapping) {
+                       unlock_page(page);
+                       return 0; /* retry */
+               }
+               ret |= VM_FAULT_LOCKED;
+       } else
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
+       return ret;
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -2668,42 +2725,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 * get_user_pages(.write=1, .force=1).
                 */
                if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
-                       struct vm_fault vmf;
                        int tmp;
-
-                       vmf.virtual_address = (void __user *)(address &
-                                                               PAGE_MASK);
-                       vmf.pgoff = old_page->index;
-                       vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
-                       vmf.page = old_page;
-
-                       /*
-                        * Notify the address space that the page is about to
-                        * become writable so that it can prohibit this or wait
-                        * for the page to get into an appropriate state.
-                        *
-                        * We do this without the lock held, so that it can
-                        * sleep if it needs to.
-                        */
                        page_cache_get(old_page);
                        pte_unmap_unlock(page_table, ptl);
-
-                       tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
-                       if (unlikely(tmp &
-                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-                               ret = tmp;
-                               goto unwritable_page;
+                       tmp = do_page_mkwrite(vma, old_page, address);
+                       if (unlikely(!tmp || (tmp &
+                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+                               page_cache_release(old_page);
+                               return tmp;
                        }
-                       if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
-                               lock_page(old_page);
-                               if (!old_page->mapping) {
-                                       ret = 0; /* retry the fault */
-                                       unlock_page(old_page);
-                                       goto unwritable_page;
-                               }
-                       } else
-                               VM_BUG_ON_PAGE(!PageLocked(old_page), old_page);
-
                        /*
                         * Since we dropped the lock we need to revalidate
                         * the PTE as someone else may have changed it.  If
@@ -2748,11 +2778,11 @@ reuse:
                 * bit after it clear all dirty ptes, but before a racing
                 * do_wp_page installs a dirty pte.
                 *
-                * __do_fault is protected similarly.
+                * do_shared_fault is protected similarly.
                 */
                if (!page_mkwrite) {
                        wait_on_page_locked(dirty_page);
-                       set_page_dirty_balance(dirty_page, page_mkwrite);
+                       set_page_dirty_balance(dirty_page);
                        /* file_update_time outside page_lock */
                        if (vma->vm_file)
                                file_update_time(vma->vm_file);
@@ -2798,7 +2828,7 @@ gotten:
        }
        __SetPageUptodate(new_page);
 
-       if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
+       if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
 
        mmun_start  = address & PAGE_MASK;
@@ -2892,10 +2922,6 @@ oom:
        if (old_page)
                page_cache_release(old_page);
        return VM_FAULT_OOM;
-
-unwritable_page:
-       page_cache_release(old_page);
-       return ret;
 }
 
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -3255,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        __SetPageUptodate(page);
 
-       if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+       if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
                goto oom_free_page;
 
        entry = mk_pte(page, vma->vm_page_prot);
@@ -3286,53 +3312,11 @@ oom:
        return VM_FAULT_OOM;
 }
 
-/*
- * __do_fault() tries to create a new page mapping. It aggressively
- * tries to share with existing pages, but makes a separate copy if
- * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
- * the next page fault.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte neither mapped nor locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- */
-static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pmd_t *pmd,
-               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+static int __do_fault(struct vm_area_struct *vma, unsigned long address,
+               pgoff_t pgoff, unsigned int flags, struct page **page)
 {
-       pte_t *page_table;
-       spinlock_t *ptl;
-       struct page *page;
-       struct page *cow_page;
-       pte_t entry;
-       int anon = 0;
-       struct page *dirty_page = NULL;
        struct vm_fault vmf;
        int ret;
-       int page_mkwrite = 0;
-
-       /*
-        * If we do COW later, allocate page befor taking lock_page()
-        * on the file cache page. This will reduce lock holding time.
-        */
-       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
-
-               if (unlikely(anon_vma_prepare(vma)))
-                       return VM_FAULT_OOM;
-
-               cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-               if (!cow_page)
-                       return VM_FAULT_OOM;
-
-               if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
-                       page_cache_release(cow_page);
-                       return VM_FAULT_OOM;
-               }
-       } else
-               cow_page = NULL;
 
        vmf.virtual_address = (void __user *)(address & PAGE_MASK);
        vmf.pgoff = pgoff;
@@ -3340,151 +3324,304 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        vmf.page = NULL;
 
        ret = vma->vm_ops->fault(vma, &vmf);
-       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
-                           VM_FAULT_RETRY)))
-               goto uncharge_out;
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               return ret;
 
        if (unlikely(PageHWPoison(vmf.page))) {
                if (ret & VM_FAULT_LOCKED)
                        unlock_page(vmf.page);
-               ret = VM_FAULT_HWPOISON;
                page_cache_release(vmf.page);
-               goto uncharge_out;
+               return VM_FAULT_HWPOISON;
        }
 
-       /*
-        * For consistency in subsequent calls, make the faulted page always
-        * locked.
-        */
        if (unlikely(!(ret & VM_FAULT_LOCKED)))
                lock_page(vmf.page);
        else
                VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
+       *page = vmf.page;
+       return ret;
+}
+
+/**
+ * do_set_pte - set up a new PTE entry for the given page and add a reverse page mapping.
+ *
+ * @vma: virtual memory area
+ * @address: user virtual address
+ * @page: page to map
+ * @pte: pointer to target page table entry
+ * @write: true, if new entry is writable
+ * @anon: true, if it's anonymous page
+ *
+ * Caller must hold page table lock relevant for @pte.
+ *
+ * Target users are the page fault handler itself and implementations of
+ * vm_ops->map_pages.
+ */
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+               struct page *page, pte_t *pte, bool write, bool anon)
+{
+       pte_t entry;
+
+       flush_icache_page(vma, page);
+       entry = mk_pte(page, vma->vm_page_prot);
+       if (write)
+               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
+               entry = pte_mksoft_dirty(entry);
+       if (anon) {
+               inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+               page_add_new_anon_rmap(page, vma, address);
+       } else {
+               inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
+               page_add_file_rmap(page);
+       }
+       set_pte_at(vma->vm_mm, address, pte, entry);
+
+       /* no need to invalidate: a not-present page won't be cached */
+       update_mmu_cache(vma, address, pte);
+}
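The write/anon pair passed to do_set_pte() is what distinguishes the three fault handlers introduced below; the calls as they appear later in this patch:

        do_set_pte(vma, address, fault_page, pte, false, false); /* do_read_fault: clean file pte */
        do_set_pte(vma, address, new_page, pte, true, true);     /* do_cow_fault: private copy, anon rmap */
        do_set_pte(vma, address, fault_page, pte, true, false);  /* do_shared_fault: writable file pte */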
+
+#define FAULT_AROUND_ORDER 4
+
+#ifdef CONFIG_DEBUG_FS
+static unsigned int fault_around_order = FAULT_AROUND_ORDER;
+
+static int fault_around_order_get(void *data, u64 *val)
+{
+       *val = fault_around_order;
+       return 0;
+}
+
+static int fault_around_order_set(void *data, u64 val)
+{
+       BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE);
+       if (1UL << val > PTRS_PER_PTE)
+               return -EINVAL;
+       fault_around_order = val;
+       return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops,
+               fault_around_order_get, fault_around_order_set, "%llu\n");
+
+static int __init fault_around_debugfs(void)
+{
+       void *ret;
+
+       ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL,
+                       &fault_around_order_fops);
+       if (!ret)
+               pr_warn("Failed to create fault_around_order in debugfs\n");
+       return 0;
+}
+late_initcall(fault_around_debugfs);
+
+static inline unsigned long fault_around_pages(void)
+{
+       return 1UL << fault_around_order;
+}
+
+static inline unsigned long fault_around_mask(void)
+{
+       return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1);
+}
+#else
+static inline unsigned long fault_around_pages(void)
+{
+       unsigned long nr_pages;
+
+       nr_pages = 1UL << FAULT_AROUND_ORDER;
+       BUILD_BUG_ON(nr_pages > PTRS_PER_PTE);
+       return nr_pages;
+}
+
+static inline unsigned long fault_around_mask(void)
+{
+       return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
+}
+#endif
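With FAULT_AROUND_ORDER 4 the handler maps up to 16 pages per fault, i.e. one naturally aligned 64KB window on 4KB pages; fault_around_mask() rounds the faulting address down to that window. When CONFIG_DEBUG_FS is enabled the order is runtime-tunable through the fault_around_order file created at the debugfs root (typically /sys/kernel/debug/fault_around_order once debugfs is mounted there). A small standalone sketch of the sizing, assuming PAGE_SHIFT of 12:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumed: 4KB pages */
#define FAULT_AROUND_ORDER 4

int main(void)
{
        unsigned long pages = 1UL << FAULT_AROUND_ORDER;
        unsigned long mask  = ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);

        /* 16 pages, 65536-byte window, mask 0xffffffffffff0000 on 64-bit */
        printf("pages=%lu window=%lu bytes mask=%#lx\n",
               pages, pages << PAGE_SHIFT, mask);
        return 0;
}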
+
+static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
+               pte_t *pte, pgoff_t pgoff, unsigned int flags)
+{
+       unsigned long start_addr;
+       pgoff_t max_pgoff;
+       struct vm_fault vmf;
+       int off;
+
+       start_addr = max(address & fault_around_mask(), vma->vm_start);
+       off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+       pte -= off;
+       pgoff -= off;
+
        /*
-        * Should we do an early C-O-W break?
+        *  max_pgoff is either the end of the page table, the end of the vma,
+        *  or fault_around_pages() from pgoff, whichever is nearest.
         */
-       page = vmf.page;
-       if (flags & FAULT_FLAG_WRITE) {
-               if (!(vma->vm_flags & VM_SHARED)) {
-                       page = cow_page;
-                       anon = 1;
-                       copy_user_highpage(page, vmf.page, address, vma);
-                       __SetPageUptodate(page);
-               } else {
-                       /*
-                        * If the page will be shareable, see if the backing
-                        * address space wants to know that the page is about
-                        * to become writable
-                        */
-                       if (vma->vm_ops->page_mkwrite) {
-                               int tmp;
-
-                               unlock_page(page);
-                               vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
-                               tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
-                               if (unlikely(tmp &
-                                         (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
-                                       ret = tmp;
-                                       goto unwritable_page;
-                               }
-                               if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
-                                       lock_page(page);
-                                       if (!page->mapping) {
-                                               ret = 0; /* retry the fault */
-                                               unlock_page(page);
-                                               goto unwritable_page;
-                                       }
-                               } else
-                                       VM_BUG_ON_PAGE(!PageLocked(page), page);
-                               page_mkwrite = 1;
-                       }
-               }
-
+       max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+               PTRS_PER_PTE - 1;
+       max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
+                       pgoff + fault_around_pages() - 1);
+
+       /* Check if it makes any sense to call ->map_pages */
+       while (!pte_none(*pte)) {
+               if (++pgoff > max_pgoff)
+                       return;
+               start_addr += PAGE_SIZE;
+               if (start_addr >= vma->vm_end)
+                       return;
+               pte++;
        }
 
-       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+       vmf.virtual_address = (void __user *) start_addr;
+       vmf.pte = pte;
+       vmf.pgoff = pgoff;
+       vmf.max_pgoff = max_pgoff;
+       vmf.flags = flags;
+       vma->vm_ops->map_pages(vma, &vmf);
+}
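do_fault_around() clamps the window three ways, per the comment above: it may not cross the page table (PTRS_PER_PTE entries), may not run past the vma, and may not exceed fault_around_pages(); it also bails out early if it hits an already-populated pte. A minimal userspace model of just the window arithmetic (no page-table walk), using a made-up vma and faulting address and assuming 4KB pages, 512 ptes per table and the default order of 4:

#include <stdio.h>

#define PAGE_SHIFT   12                 /* assumed: 4KB pages */
#define PTRS_PER_PTE 512UL              /* assumed: x86-64 */
#define FAULT_AROUND_ORDER 4

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a < b ? a : b;
        return m < c ? m : c;
}

int main(void)
{
        /* hypothetical vma: maps file offset 0 at 0x12300000..0x12400000 */
        unsigned long vm_start = 0x12300000UL, vm_end = 0x12400000UL;
        unsigned long vm_pgoff = 0;
        unsigned long nr_vma_pages = (vm_end - vm_start) >> PAGE_SHIFT;
        unsigned long address = 0x12345678UL;           /* faulting address */
        unsigned long pgoff = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

        unsigned long mask = ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
        unsigned long start_addr = address & mask;
        if (start_addr < vm_start)
                start_addr = vm_start;

        unsigned long off = ((address - start_addr) >> PAGE_SHIFT) &
                            (PTRS_PER_PTE - 1);
        pgoff -= off;

        unsigned long max_pgoff = pgoff -
                ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
                PTRS_PER_PTE - 1;
        max_pgoff = min3ul(max_pgoff, nr_vma_pages + vm_pgoff - 1,
                           pgoff + (1UL << FAULT_AROUND_ORDER) - 1);

        /* prints: start_addr=0x12340000 pgoff=64 max_pgoff=79 */
        printf("start_addr=%#lx pgoff=%lu max_pgoff=%lu\n",
               start_addr, pgoff, max_pgoff);
        return 0;
}

Here the 64KB-aligned window around the fault fits inside both the vma and a single page table, so the fault_around_pages() limit wins and ->map_pages() is asked to fill pgoff 64 through 79.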
+
+static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmd,
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+       struct page *fault_page;
+       spinlock_t *ptl;
+       pte_t *pte;
+       int ret = 0;
 
        /*
-        * This silly early PAGE_DIRTY setting removes a race
-        * due to the bad i386 page protection. But it's valid
-        * for other architectures too.
-        *
-        * Note that if FAULT_FLAG_WRITE is set, we either now have
-        * an exclusive copy of the page, or this is a shared mapping,
-        * so we can make it writable and dirty to avoid having to
-        * handle that later.
+        * Let's call ->map_pages() first and use ->fault() as fallback
+        * if page by the offset is not ready to be mapped (cold cache or
+        * something).
         */
-       /* Only go through if we didn't race with anybody else... */
-       if (likely(pte_same(*page_table, orig_pte))) {
-               flush_icache_page(vma, page);
-               entry = mk_pte(page, vma->vm_page_prot);
-               if (flags & FAULT_FLAG_WRITE)
-                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte))
-                       pte_mksoft_dirty(entry);
-               if (anon) {
-                       inc_mm_counter_fast(mm, MM_ANONPAGES);
-                       page_add_new_anon_rmap(page, vma, address);
-               } else {
-                       inc_mm_counter_fast(mm, MM_FILEPAGES);
-                       page_add_file_rmap(page);
-                       if (flags & FAULT_FLAG_WRITE) {
-                               dirty_page = page;
-                               get_page(dirty_page);
-                       }
-               }
-               set_pte_at(mm, address, page_table, entry);
+       if (vma->vm_ops->map_pages) {
+               pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+               do_fault_around(vma, address, pte, pgoff, flags);
+               if (!pte_same(*pte, orig_pte))
+                       goto unlock_out;
+               pte_unmap_unlock(pte, ptl);
+       }
 
-               /* no need to invalidate: a not-present page won't be cached */
-               update_mmu_cache(vma, address, page_table);
-       } else {
-               if (cow_page)
-                       mem_cgroup_uncharge_page(cow_page);
-               if (anon)
-                       page_cache_release(page);
-               else
-                       anon = 1; /* no anon but release faulted_page */
+       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               return ret;
+
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (unlikely(!pte_same(*pte, orig_pte))) {
+               pte_unmap_unlock(pte, ptl);
+               unlock_page(fault_page);
+               page_cache_release(fault_page);
+               return ret;
        }
+       do_set_pte(vma, address, fault_page, pte, false, false);
+       unlock_page(fault_page);
+unlock_out:
+       pte_unmap_unlock(pte, ptl);
+       return ret;
+}
 
-       pte_unmap_unlock(page_table, ptl);
+static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmd,
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+       struct page *fault_page, *new_page;
+       spinlock_t *ptl;
+       pte_t *pte;
+       int ret;
 
-       if (dirty_page) {
-               struct address_space *mapping = page->mapping;
-               int dirtied = 0;
+       if (unlikely(anon_vma_prepare(vma)))
+               return VM_FAULT_OOM;
 
-               if (set_page_dirty(dirty_page))
-                       dirtied = 1;
-               unlock_page(dirty_page);
-               put_page(dirty_page);
-               if ((dirtied || page_mkwrite) && mapping) {
-                       /*
-                        * Some device drivers do not set page.mapping but still
-                        * dirty their pages
-                        */
-                       balance_dirty_pages_ratelimited(mapping);
-               }
+       new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+       if (!new_page)
+               return VM_FAULT_OOM;
 
-               /* file_update_time outside page_lock */
-               if (vma->vm_file && !page_mkwrite)
-                       file_update_time(vma->vm_file);
-       } else {
-               unlock_page(vmf.page);
-               if (anon)
-                       page_cache_release(vmf.page);
+       if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
+               page_cache_release(new_page);
+               return VM_FAULT_OOM;
        }
 
-       return ret;
+       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               goto uncharge_out;
 
-unwritable_page:
-       page_cache_release(page);
+       copy_user_highpage(new_page, fault_page, address, vma);
+       __SetPageUptodate(new_page);
+
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (unlikely(!pte_same(*pte, orig_pte))) {
+               pte_unmap_unlock(pte, ptl);
+               unlock_page(fault_page);
+               page_cache_release(fault_page);
+               goto uncharge_out;
+       }
+       do_set_pte(vma, address, new_page, pte, true, true);
+       pte_unmap_unlock(pte, ptl);
+       unlock_page(fault_page);
+       page_cache_release(fault_page);
        return ret;
 uncharge_out:
-       /* fs's fault handler get error */
-       if (cow_page) {
-               mem_cgroup_uncharge_page(cow_page);
-               page_cache_release(cow_page);
+       mem_cgroup_uncharge_page(new_page);
+       page_cache_release(new_page);
+       return ret;
+}
+
+static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pmd_t *pmd,
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
+{
+       struct page *fault_page;
+       struct address_space *mapping;
+       spinlock_t *ptl;
+       pte_t *pte;
+       int dirtied = 0;
+       int ret, tmp;
+
+       ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+               return ret;
+
+       /*
+        * Check if the backing address space wants to know that the page is
+        * about to become writable
+        */
+       if (vma->vm_ops->page_mkwrite) {
+               unlock_page(fault_page);
+               tmp = do_page_mkwrite(vma, fault_page, address);
+               if (unlikely(!tmp ||
+                               (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+                       page_cache_release(fault_page);
+                       return tmp;
+               }
        }
+
+       pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (unlikely(!pte_same(*pte, orig_pte))) {
+               pte_unmap_unlock(pte, ptl);
+               unlock_page(fault_page);
+               page_cache_release(fault_page);
+               return ret;
+       }
+       do_set_pte(vma, address, fault_page, pte, true, false);
+       pte_unmap_unlock(pte, ptl);
+
+       if (set_page_dirty(fault_page))
+               dirtied = 1;
+       mapping = fault_page->mapping;
+       unlock_page(fault_page);
+       if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
+               /*
+                * Some device drivers do not set page.mapping but still
+                * dirty their pages
+                */
+               balance_dirty_pages_ratelimited(mapping);
+       }
+
+       /* file_update_time outside page_lock */
+       if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+               file_update_time(vma->vm_file);
+
        return ret;
 }
 
@@ -3496,7 +3633,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 
        pte_unmap(page_table);
-       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+       if (!(flags & FAULT_FLAG_WRITE))
+               return do_read_fault(mm, vma, address, pmd, pgoff, flags,
+                               orig_pte);
+       if (!(vma->vm_flags & VM_SHARED))
+               return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
+                               orig_pte);
+       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
 /*
@@ -3528,10 +3671,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        pgoff = pte_to_pgoff(orig_pte);
-       return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
+       if (!(flags & FAULT_FLAG_WRITE))
+               return do_read_fault(mm, vma, address, pmd, pgoff, flags,
+                               orig_pte);
+       if (!(vma->vm_flags & VM_SHARED))
+               return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
+                               orig_pte);
+       return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
                                unsigned long addr, int page_nid,
                                int *flags)
 {
@@ -3546,7 +3695,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
        return mpol_misplaced(page, vma, addr);
 }
 
-int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                   unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
 {
        struct page *page = NULL;