mm: create a separate slab for page->ptl allocation
diff --git a/mm/memory.c b/mm/memory.c
index d176154..0409e8f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
 
 #include "internal.h"
 
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
        pgtable_t token = pmd_pgtable(*pmd);
        pmd_clear(pmd);
        pte_free_tlb(tlb, token, addr);
-       tlb->mm->nr_ptes--;
+       atomic_long_dec(&tlb->mm->nr_ptes);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 
 /*
  * This function frees user-level page tables of a process.
- *
- * Must be called with pagetable lock held.
  */
 void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
@@ -552,6 +550,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                pmd_t *pmd, unsigned long address)
 {
+       spinlock_t *ptl;
        pgtable_t new = pte_alloc_one(mm, address);
        int wait_split_huge_page;
        if (!new)
@@ -572,15 +571,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
-       spin_lock(&mm->page_table_lock);
+       ptl = pmd_lock(mm, pmd);
        wait_split_huge_page = 0;
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
-               mm->nr_ptes++;
+               atomic_long_inc(&mm->nr_ptes);
                pmd_populate(mm, pmd, new);
                new = NULL;
        } else if (unlikely(pmd_trans_splitting(*pmd)))
                wait_split_huge_page = 1;
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(ptl);
        if (new)
                pte_free(mm, new);
        if (wait_split_huge_page)
@@ -681,7 +680,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
        if (vma->vm_ops)
                printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
                       vma->vm_ops->fault);
-       if (vma->vm_file && vma->vm_file->f_op)
+       if (vma->vm_file)
                printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
                       vma->vm_file->f_op->mmap);
        dump_stack();
@@ -1518,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
                        split_huge_page_pmd(vma, address, pmd);
                        goto split_fallthrough;
                }
-               spin_lock(&mm->page_table_lock);
+               ptl = pmd_lock(mm, pmd);
                if (likely(pmd_trans_huge(*pmd))) {
                        if (unlikely(pmd_trans_splitting(*pmd))) {
-                               spin_unlock(&mm->page_table_lock);
+                               spin_unlock(ptl);
                                wait_split_huge_page(vma->anon_vma, pmd);
                        } else {
                                page = follow_trans_huge_pmd(vma, address,
                                                             pmd, flags);
-                               spin_unlock(&mm->page_table_lock);
+                               spin_unlock(ptl);
                                *page_mask = HPAGE_PMD_NR - 1;
                                goto out;
                        }
                } else
-                       spin_unlock(&mm->page_table_lock);
+                       spin_unlock(ptl);
                /* fall through */
        }
 split_fallthrough:
@@ -2721,6 +2720,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                get_page(dirty_page);
 
 reuse:
+               /*
+                * Clear the page's cpupid information as the existing
+                * information potentially belongs to a now completely
+                * unrelated process.
+                */
+               if (old_page)
+                       page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = pte_mkyoung(orig_pte);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3521,13 +3528,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-                               unsigned long addr, int page_nid)
+                               unsigned long addr, int page_nid,
+                               int *flags)
 {
        get_page(page);
 
        count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (page_nid == numa_node_id())
+       if (page_nid == numa_node_id()) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+               *flags |= TNF_FAULT_LOCAL;
+       }
 
        return mpol_misplaced(page, vma, addr);
 }
@@ -3538,8 +3548,10 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        struct page *page = NULL;
        spinlock_t *ptl;
        int page_nid = -1;
+       int last_cpupid;
        int target_nid;
        bool migrated = false;
+       int flags = 0;
 
        /*
        * The "pte" at this point cannot be used safely without
@@ -3566,9 +3578,26 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pte_unmap_unlock(ptep, ptl);
                return 0;
        }
+       BUG_ON(is_zero_pfn(page_to_pfn(page)));
+
+       /*
+        * Avoid grouping on DSO/COW pages specifically, and on RO pages
+        * in general; RO pages shouldn't hurt as much anyway since
+        * they can be in shared cache state.
+        */
+       if (!pte_write(pte))
+               flags |= TNF_NO_GROUP;
+
+       /*
+        * Flag if the page is shared between multiple address spaces. This
+        * is later used when determining whether to group tasks together
+        */
+       if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+               flags |= TNF_SHARED;
 
+       last_cpupid = page_cpupid_last(page);
        page_nid = page_to_nid(page);
-       target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+       target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
        pte_unmap_unlock(ptep, ptl);
        if (target_nid == -1) {
                put_page(page);
@@ -3576,102 +3605,17 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        /* Migrate to the requested node */
-       migrated = migrate_misplaced_page(page, target_nid);
-       if (migrated)
+       migrated = migrate_misplaced_page(page, vma, target_nid);
+       if (migrated) {
                page_nid = target_nid;
+               flags |= TNF_MIGRATED;
+       }
 
 out:
        if (page_nid != -1)
-               task_numa_fault(page_nid, 1, migrated);
-       return 0;
-}
-
-/* NUMA hinting page fault entry point for regular pmds */
-#ifdef CONFIG_NUMA_BALANCING
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                    unsigned long addr, pmd_t *pmdp)
-{
-       pmd_t pmd;
-       pte_t *pte, *orig_pte;
-       unsigned long _addr = addr & PMD_MASK;
-       unsigned long offset;
-       spinlock_t *ptl;
-       bool numa = false;
-
-       spin_lock(&mm->page_table_lock);
-       pmd = *pmdp;
-       if (pmd_numa(pmd)) {
-               set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
-               numa = true;
-       }
-       spin_unlock(&mm->page_table_lock);
-
-       if (!numa)
-               return 0;
-
-       /* we're in a page fault so some vma must be in the range */
-       BUG_ON(!vma);
-       BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
-       offset = max(_addr, vma->vm_start) & ~PMD_MASK;
-       VM_BUG_ON(offset >= PMD_SIZE);
-       orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
-       pte += offset >> PAGE_SHIFT;
-       for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
-               pte_t pteval = *pte;
-               struct page *page;
-               int page_nid = -1;
-               int target_nid;
-               bool migrated = false;
-
-               if (!pte_present(pteval))
-                       continue;
-               if (!pte_numa(pteval))
-                       continue;
-               if (addr >= vma->vm_end) {
-                       vma = find_vma(mm, addr);
-                       /* there's a pte present so there must be a vma */
-                       BUG_ON(!vma);
-                       BUG_ON(addr < vma->vm_start);
-               }
-               if (pte_numa(pteval)) {
-                       pteval = pte_mknonnuma(pteval);
-                       set_pte_at(mm, addr, pte, pteval);
-               }
-               page = vm_normal_page(vma, addr, pteval);
-               if (unlikely(!page))
-                       continue;
-               /* only check non-shared pages */
-               if (unlikely(page_mapcount(page) != 1))
-                       continue;
-
-               page_nid = page_to_nid(page);
-               target_nid = numa_migrate_prep(page, vma, addr, page_nid);
-               pte_unmap_unlock(pte, ptl);
-               if (target_nid != -1) {
-                       migrated = migrate_misplaced_page(page, target_nid);
-                       if (migrated)
-                               page_nid = target_nid;
-               } else {
-                       put_page(page);
-               }
-
-               if (page_nid != -1)
-                       task_numa_fault(page_nid, 1, migrated);
-
-               pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-       }
-       pte_unmap_unlock(orig_pte, ptl);
-
-       return 0;
-}
-#else
-static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                    unsigned long addr, pmd_t *pmdp)
-{
-       BUG();
+               task_numa_fault(last_cpupid, page_nid, 1, flags);
        return 0;
 }
-#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * These routines also need to handle stuff like marking pages dirty
@@ -3811,8 +3755,8 @@ retry:
                }
        }
 
-       if (pmd_numa(*pmd))
-               return do_pmd_numa_page(mm, vma, address, pmd);
+       /* THP should already have been handled */
+       BUG_ON(pmd_numa(*pmd));
 
        /*
         * Use __pte_alloc instead of pte_alloc_map, because we can't
@@ -4326,3 +4270,28 @@ void copy_user_huge_page(struct page *dst, struct page *src,
        }
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
+
+#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
+static struct kmem_cache *page_ptl_cachep;
+void __init ptlock_cache_init(void)
+{
+       page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
+                       SLAB_PANIC, NULL);
+}
+
+bool ptlock_alloc(struct page *page)
+{
+       spinlock_t *ptl;
+
+       ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
+       if (!ptl)
+               return false;
+       page->ptl = ptl;
+       return true;
+}
+
+void ptlock_free(struct page *page)
+{
+       kmem_cache_free(page_ptl_cachep, page->ptl);
+}
+#endif
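
For reference, a minimal sketch of the caller-side pattern the hunks above convert to. It is illustrative only and not part of the patch (the helper name example_populate_pmd is invented); it merely restates the __pte_alloc() locking skeleton from the diff: pmd_lock() returns the spinlock guarding that particular pmd (which may still be &mm->page_table_lock, depending on configuration), the caller releases it with spin_unlock() on the returned pointer, and mm->nr_ptes is manipulated with the atomic_long_* helpers.

#include <linux/mm.h>
#include <linux/spinlock.h>

/*
 * Illustrative only; not part of the patch.  A stripped-down version of
 * the __pte_alloc() locking pattern shown in the diff above.
 */
static int example_populate_pmd(struct mm_struct *mm, pmd_t *pmd,
                                pgtable_t new)
{
        spinlock_t *ptl;

        ptl = pmd_lock(mm, pmd);                /* was: spin_lock(&mm->page_table_lock) */
        if (likely(pmd_none(*pmd))) {           /* has another thread populated it? */
                atomic_long_inc(&mm->nr_ptes);  /* nr_ptes is an atomic_long_t now */
                pmd_populate(mm, pmd, new);
                new = NULL;
        }
        spin_unlock(ptl);                       /* unlock whatever pmd_lock() returned */

        if (new)                                /* lost the race: drop the unused table */
                pte_free(mm, new);
        return 0;
}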