diff --git a/mm/memory.c b/mm/memory.c
index 411144f..22e037e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
        /*
         * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
         */
-       if (vma->vm_ops)
-               printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
-                      vma->vm_ops->fault);
-       if (vma->vm_file)
-               printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
-                      vma->vm_file->f_op->mmap);
+       pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+                vma->vm_file,
+                vma->vm_ops ? vma->vm_ops->fault : NULL,
+                vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+                mapping ? mapping->a_ops->readpage : NULL);
        dump_stack();
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 }
@@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 }
 
 /*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
- *
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
- *
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
+ * Handle write page faults for pages that can be reused in the current vma
  *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either because the mapping carries the VM_SHARED flag, or
+ * because we hold the last remaining reference to the page. In either case,
+ * all we need to do here is mark the page as writable and update any related
+ * book-keeping.
  */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, pte_t *page_table, pmd_t *pmd,
-               spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+                       struct vm_area_struct *vma, unsigned long address,
+                       pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+                       struct page *page, int page_mkwrite,
+                       int dirty_shared)
        __releases(ptl)
 {
-       struct page *old_page, *new_page = NULL;
        pte_t entry;
-       int ret = 0;
-       int page_mkwrite = 0;
-       bool dirty_shared = false;
-       unsigned long mmun_start = 0;   /* For mmu_notifiers */
-       unsigned long mmun_end = 0;     /* For mmu_notifiers */
-       struct mem_cgroup *memcg;
-
-       old_page = vm_normal_page(vma, address, orig_pte);
-       if (!old_page) {
-               /*
-                * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
-                * VM_PFNMAP VMA.
-                *
-                * We should not cow pages in a shared writeable mapping.
-                * Just mark the pages writable as we can't do any dirty
-                * accounting on raw pfn maps.
-                */
-               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-                                    (VM_WRITE|VM_SHARED))
-                       goto reuse;
-               goto gotten;
-       }
-
        /*
-        * Take out anonymous pages first, anonymous shared vmas are
-        * not dirty accountable.
+        * Clear the page's cpupid information as the existing
+        * information potentially belongs to a now completely
+        * unrelated process.
         */
-       if (PageAnon(old_page) && !PageKsm(old_page)) {
-               if (!trylock_page(old_page)) {
-                       page_cache_get(old_page);
-                       pte_unmap_unlock(page_table, ptl);
-                       lock_page(old_page);
-                       page_table = pte_offset_map_lock(mm, pmd, address,
-                                                        &ptl);
-                       if (!pte_same(*page_table, orig_pte)) {
-                               unlock_page(old_page);
-                               goto unlock;
-                       }
-                       page_cache_release(old_page);
-               }
-               if (reuse_swap_page(old_page)) {
-                       /*
-                        * The page is all ours.  Move it to our anon_vma so
-                        * the rmap code will not search our parent or siblings.
-                        * Protected against the rmap code by the page lock.
-                        */
-                       page_move_anon_rmap(old_page, vma, address);
-                       unlock_page(old_page);
-                       goto reuse;
-               }
-               unlock_page(old_page);
-       } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-                                       (VM_WRITE|VM_SHARED))) {
-               page_cache_get(old_page);
-               /*
-                * Only catch write-faults on shared writable pages,
-                * read-only shared pages can get COWed by
-                * get_user_pages(.write=1, .force=1).
-                */
-               if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
-                       int tmp;
-
-                       pte_unmap_unlock(page_table, ptl);
-                       tmp = do_page_mkwrite(vma, old_page, address);
-                       if (unlikely(!tmp || (tmp &
-                                       (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-                               page_cache_release(old_page);
-                               return tmp;
-                       }
-                       /*
-                        * Since we dropped the lock we need to revalidate
-                        * the PTE as someone else may have changed it.  If
-                        * they did, we just return, as we can count on the
-                        * MMU to tell us if they didn't also make it writable.
-                        */
-                       page_table = pte_offset_map_lock(mm, pmd, address,
-                                                        &ptl);
-                       if (!pte_same(*page_table, orig_pte)) {
-                               unlock_page(old_page);
-                               goto unlock;
-                       }
-                       page_mkwrite = 1;
-               }
-
-               dirty_shared = true;
-
-reuse:
-               /*
-                * Clear the pages cpupid information as the existing
-                * information potentially belongs to a now completely
-                * unrelated process.
-                */
-               if (old_page)
-                       page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
-               flush_cache_page(vma, address, pte_pfn(orig_pte));
-               entry = pte_mkyoung(orig_pte);
-               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               if (ptep_set_access_flags(vma, address, page_table, entry,1))
-                       update_mmu_cache(vma, address, page_table);
-               pte_unmap_unlock(page_table, ptl);
-               ret |= VM_FAULT_WRITE;
+       if (page)
+               page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
 
-               if (dirty_shared) {
-                       struct address_space *mapping;
-                       int dirtied;
+       flush_cache_page(vma, address, pte_pfn(orig_pte));
+       entry = pte_mkyoung(orig_pte);
+       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+       if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+               update_mmu_cache(vma, address, page_table);
+       pte_unmap_unlock(page_table, ptl);
 
-                       if (!page_mkwrite)
-                               lock_page(old_page);
+       if (dirty_shared) {
+               struct address_space *mapping;
+               int dirtied;
 
-                       dirtied = set_page_dirty(old_page);
-                       VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
-                       mapping = old_page->mapping;
-                       unlock_page(old_page);
-                       page_cache_release(old_page);
+               if (!page_mkwrite)
+                       lock_page(page);
 
-                       if ((dirtied || page_mkwrite) && mapping) {
-                               /*
-                                * Some device drivers do not set page.mapping
-                                * but still dirty their pages
-                                */
-                               balance_dirty_pages_ratelimited(mapping);
-                       }
+               dirtied = set_page_dirty(page);
+               VM_BUG_ON_PAGE(PageAnon(page), page);
+               mapping = page->mapping;
+               unlock_page(page);
+               page_cache_release(page);
 
-                       if (!page_mkwrite)
-                               file_update_time(vma->vm_file);
+               if ((dirtied || page_mkwrite) && mapping) {
+                       /*
+                        * Some device drivers do not set page.mapping
+                        * but still dirty their pages
+                        */
+                       balance_dirty_pages_ratelimited(mapping);
                }
 
-               return ret;
+               if (!page_mkwrite)
+                       file_update_time(vma->vm_file);
        }
 
-       /*
-        * Ok, we need to copy. Oh, well..
-        */
-       page_cache_get(old_page);
-gotten:
-       pte_unmap_unlock(page_table, ptl);
+       return VM_FAULT_WRITE;
+}
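
The three callers added below differ only in the (page, page_mkwrite,
dirty_shared) arguments they hand to wp_page_reuse(); side by side, the reuse
cases introduced by this patch are:

        /* pfn/mixed map: no struct page to dirty (wp_pfn_shared) */
        wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, NULL, 0, 0);

        /* exclusively owned anonymous page (do_wp_page) */
        wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0);

        /* shared writable file page, after ->page_mkwrite (wp_page_shared) */
        wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page,
                      page_mkwrite, 1);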
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle bookkeeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ *   relevant references. This includes dropping the reference the page-table
+ *   held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pte_t *page_table, pmd_t *pmd,
+                       pte_t orig_pte, struct page *old_page)
+{
+       struct page *new_page = NULL;
+       spinlock_t *ptl = NULL;
+       pte_t entry;
+       int page_copied = 0;
+       const unsigned long mmun_start = address & PAGE_MASK;   /* For mmu_notifiers */
+       const unsigned long mmun_end = mmun_start + PAGE_SIZE;  /* For mmu_notifiers */
+       struct mem_cgroup *memcg;
 
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
@@ -2163,8 +2086,6 @@ gotten:
        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
                goto oom_free_new;
 
-       mmun_start  = address & PAGE_MASK;
-       mmun_end    = mmun_start + PAGE_SIZE;
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
        /*
@@ -2177,8 +2098,9 @@ gotten:
                                dec_mm_counter_fast(mm, MM_FILEPAGES);
                                inc_mm_counter_fast(mm, MM_ANONPAGES);
                        }
-               } else
+               } else {
                        inc_mm_counter_fast(mm, MM_ANONPAGES);
+               }
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2149,29 @@ gotten:
 
                /* Free the old page.. */
                new_page = old_page;
-               ret |= VM_FAULT_WRITE;
-       } else
+               page_copied = 1;
+       } else {
                mem_cgroup_cancel_charge(new_page, memcg);
+       }
 
        if (new_page)
                page_cache_release(new_page);
-unlock:
+
        pte_unmap_unlock(page_table, ptl);
-       if (mmun_end > mmun_start)
-               mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        if (old_page) {
                /*
                 * Don't let another task, with possibly unlocked vma,
                 * keep the mlocked page.
                 */
-               if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+               if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                        lock_page(old_page);    /* LRU manipulation */
                        munlock_vma_page(old_page);
                        unlock_page(old_page);
                }
                page_cache_release(old_page);
        }
-       return ret;
+       return page_copied ? VM_FAULT_WRITE : 0;
 oom_free_new:
        page_cache_release(new_page);
 oom:
@@ -2258,6 +2180,179 @@ oom:
        return VM_FAULT_OOM;
 }
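
For orientation, here is a compressed sketch of the flow described in the
comment above wp_page_copy(). It is illustrative only and not the patch's
code: cgroup charging, mmu-notifier start/end, LRU insertion, mlock handling
and most error paths are omitted, and the helpers are used with the
signatures they have in this kernel generation.

static int wp_page_copy_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *page_table, pmd_t *pmd,
                        pte_t orig_pte, struct page *old_page)
{
        struct page *new_page;
        spinlock_t *ptl;
        pte_t entry;
        int page_copied = 0;

        if (unlikely(anon_vma_prepare(vma)))
                return VM_FAULT_OOM;

        /* Allocate the new page and fill it from the old one (or zero it). */
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
        if (!new_page)
                return VM_FAULT_OOM;
        if (old_page)
                copy_user_highpage(new_page, old_page, address, vma);
        else
                clear_user_highpage(new_page, address);

        /* Re-take the pte lock; only install the copy if nothing changed. */
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (pte_same(*page_table, orig_pte)) {
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                ptep_clear_flush_notify(vma, address, page_table);
                page_add_new_anon_rmap(new_page, vma, address);
                set_pte_at(mm, address, page_table, entry);
                update_mmu_cache(vma, address, page_table);
                if (old_page)
                        page_remove_rmap(old_page);     /* no longer mapped here */
                page_copied = 1;
        } else {
                /* Raced with another fault: throw our copy away. */
                page_cache_release(new_page);
        }
        pte_unmap_unlock(page_table, ptl);
        if (old_page)
                page_cache_release(old_page);   /* drop the caller's reference */
        return page_copied ? VM_FAULT_WRITE : 0;
}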
 
+/*
+ * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
+ * mapping
+ */
+static int wp_pfn_shared(struct mm_struct *mm,
+                       struct vm_area_struct *vma, unsigned long address,
+                       pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+                       pmd_t *pmd)
+{
+       if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
+               struct vm_fault vmf = {
+                       .page = NULL,
+                       .pgoff = linear_page_index(vma, address),
+                       .virtual_address = (void __user *)(address & PAGE_MASK),
+                       .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
+               };
+               int ret;
+
+               pte_unmap_unlock(page_table, ptl);
+               ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
+               if (ret & VM_FAULT_ERROR)
+                       return ret;
+               page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+               /*
+                * We might have raced with another page fault while we
+                * released the pte_offset_map_lock.
+                */
+               if (!pte_same(*page_table, orig_pte)) {
+                       pte_unmap_unlock(page_table, ptl);
+                       return 0;
+               }
+       }
+       return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
+                            NULL, 0, 0);
+}
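
wp_pfn_shared() gives the driver behind a shared VM_PFNMAP/VM_MIXEDMAP
mapping a chance to react before the pte is made writable. A minimal handler
might look like the sketch below; the mydrv_* names and the dirty bitmap are
hypothetical, only the ->pfn_mkwrite() callback and its signature come from
the kernel.

struct mydrv_region {                           /* hypothetical driver state */
        unsigned long *dirty_bitmap;            /* one bit per page of the mapping */
};

static int mydrv_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct mydrv_region *r = vma->vm_private_data;

        /* Remember that the pfn backing this file offset is being dirtied. */
        set_bit(vmf->pgoff, r->dirty_bitmap);

        /*
         * Returning 0 (no VM_FAULT_ERROR bit set) lets wp_pfn_shared() go on
         * to wp_page_reuse() and mark the pte writable.
         */
        return 0;
}

static const struct vm_operations_struct mydrv_vm_ops = {
        .pfn_mkwrite    = mydrv_pfn_mkwrite,
};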
+
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+                         unsigned long address, pte_t *page_table,
+                         pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+                         struct page *old_page)
+       __releases(ptl)
+{
+       int page_mkwrite = 0;
+
+       page_cache_get(old_page);
+
+       /*
+        * Only catch write-faults on shared writable pages,
+        * read-only shared pages can get COWed by
+        * get_user_pages(.write=1, .force=1).
+        */
+       if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+               int tmp;
+
+               pte_unmap_unlock(page_table, ptl);
+               tmp = do_page_mkwrite(vma, old_page, address);
+               if (unlikely(!tmp || (tmp &
+                                     (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+                       page_cache_release(old_page);
+                       return tmp;
+               }
+               /*
+                * Since we dropped the lock we need to revalidate
+                * the PTE as someone else may have changed it.  If
+                * they did, we just return, as we can count on the
+                * MMU to tell us if they didn't also make it writable.
+                */
+               page_table = pte_offset_map_lock(mm, pmd, address,
+                                                &ptl);
+               if (!pte_same(*page_table, orig_pte)) {
+                       unlock_page(old_page);
+                       pte_unmap_unlock(page_table, ptl);
+                       page_cache_release(old_page);
+                       return 0;
+               }
+               page_mkwrite = 1;
+       }
+
+       return wp_page_reuse(mm, vma, address, page_table, ptl,
+                            orig_pte, old_page, page_mkwrite, 1);
+}
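
Both wp_page_shared() above and wp_pfn_shared() rely on the same revalidation
idiom whenever the pte lock has to be dropped around sleepable work; stripped
of the surrounding details, the pattern is:

        pte_unmap_unlock(page_table, ptl);
        /* ... sleepable work: lock_page(), ->page_mkwrite(), ->pfn_mkwrite() ... */
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte_same(*page_table, orig_pte)) {
                /*
                 * Someone else changed the pte while it was unlocked; undo
                 * and return 0 so the hardware refaults if the write still
                 * needs handling.
                 */
                pte_unmap_unlock(page_table, ptl);
                return 0;
        }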
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               spinlock_t *ptl, pte_t orig_pte)
+       __releases(ptl)
+{
+       struct page *old_page;
+
+       old_page = vm_normal_page(vma, address, orig_pte);
+       if (!old_page) {
+               /*
+                * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+                * VM_PFNMAP VMA.
+                *
+                * We should not cow pages in a shared writeable mapping.
+                * Just mark the pages writable and/or call ops->pfn_mkwrite.
+                */
+               if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+                                    (VM_WRITE|VM_SHARED))
+                       return wp_pfn_shared(mm, vma, address, page_table, ptl,
+                                            orig_pte, pmd);
+
+               pte_unmap_unlock(page_table, ptl);
+               return wp_page_copy(mm, vma, address, page_table, pmd,
+                                   orig_pte, old_page);
+       }
+
+       /*
+        * Take out anonymous pages first, anonymous shared vmas are
+        * not dirty accountable.
+        */
+       if (PageAnon(old_page) && !PageKsm(old_page)) {
+               if (!trylock_page(old_page)) {
+                       page_cache_get(old_page);
+                       pte_unmap_unlock(page_table, ptl);
+                       lock_page(old_page);
+                       page_table = pte_offset_map_lock(mm, pmd, address,
+                                                        &ptl);
+                       if (!pte_same(*page_table, orig_pte)) {
+                               unlock_page(old_page);
+                               pte_unmap_unlock(page_table, ptl);
+                               page_cache_release(old_page);
+                               return 0;
+                       }
+                       page_cache_release(old_page);
+               }
+               if (reuse_swap_page(old_page)) {
+                       /*
+                        * The page is all ours.  Move it to our anon_vma so
+                        * the rmap code will not search our parent or siblings.
+                        * Protected against the rmap code by the page lock.
+                        */
+                       page_move_anon_rmap(old_page, vma, address);
+                       unlock_page(old_page);
+                       return wp_page_reuse(mm, vma, address, page_table, ptl,
+                                            orig_pte, old_page, 0, 0);
+               }
+               unlock_page(old_page);
+       } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+                                       (VM_WRITE|VM_SHARED))) {
+               return wp_page_shared(mm, vma, address, page_table, pmd,
+                                     ptl, orig_pte, old_page);
+       }
+
+       /*
+        * Ok, we need to copy. Oh, well..
+        */
+       page_cache_get(old_page);
+
+       pte_unmap_unlock(page_table, ptl);
+       return wp_page_copy(mm, vma, address, page_table, pmd,
+                           orig_pte, old_page);
+}
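
As a userspace illustration of the behaviour do_wp_page() implements (not
part of the patch): after fork(), the first write to a MAP_PRIVATE anonymous
page is exactly the write fault that sends the writing process down the copy
path, so the two processes' views of the page diverge.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        strcpy(p, "parent");                    /* first fault: plain anonymous page */
        if (fork() == 0) {
                strcpy(p, "child");             /* write fault -> do_wp_page() -> copy */
                printf("child sees:  %s\n", p);
                _exit(0);
        }
        wait(NULL);
        printf("parent sees: %s\n", p);         /* still "parent": pages have diverged */
        return 0;
}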
+
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
@@ -2784,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
        struct vm_fault vmf;
        int off;
 
-       nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
+       nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
        mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
 
        start_addr = max(address & mask, vma->vm_start);
@@ -3035,6 +3130,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        int last_cpupid;
        int target_nid;
        bool migrated = false;
+       bool was_writable = pte_write(pte);
        int flags = 0;
 
        /* A PROT_NONE fault should not end up here */
@@ -3059,6 +3155,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        /* Make it present again */
        pte = pte_modify(pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
+       if (was_writable)
+               pte = pte_mkwrite(pte);
        set_pte_at(mm, addr, ptep, pte);
        update_mmu_cache(vma, addr, ptep);
 
@@ -3069,16 +3167,14 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        /*
-        * Avoid grouping on DSO/COW pages in specific and RO pages
-        * in general, RO pages shouldn't hurt as much anyway since
-        * they can be in shared cache state.
-        *
-        * FIXME! This checks "pmd_dirty()" as an approximation of
-        * "is this a read-only page", since checking "pmd_write()"
-        * is even more broken. We haven't actually turned this into
-        * a writable page, so pmd_write() will always be false.
+        * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
+        * much anyway since they can be in shared cache state. This misses
+        * the case where a mapping is writable but the process never writes
+        * to it: pte_write gets cleared during protection updates and
+        * pte_dirty has unpredictable behaviour between PTE scan updates,
+        * background writeback, dirty balancing and application behaviour.
         */
-       if (!pte_dirty(pte))
+       if (!(vma->vm_flags & VM_WRITE))
                flags |= TNF_NO_GROUP;
 
        /*
@@ -3102,7 +3198,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (migrated) {
                page_nid = target_nid;
                flags |= TNF_MIGRATED;
-       }
+       } else
+               flags |= TNF_MIGRATE_FAIL;
 
 out:
        if (page_nid != -1)