diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6817b03..078832c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
+static void khugepaged_slab_exit(void);
 
 #define MM_SLOTS_HASH_BITS 10
 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
        int nr_zones = 0;
        unsigned long recommended_min;
 
-       if (!khugepaged_enabled())
-               return 0;
-
        for_each_populated_zone(zone)
                nr_zones++;
 
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
        setup_per_zone_wmarks();
        return 0;
 }
-late_initcall(set_recommended_min_free_kbytes);
 
-static int start_khugepaged(void)
+static int start_stop_khugepaged(void)
 {
        int err = 0;
        if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
                        pr_err("khugepaged: kthread_run(khugepaged) failed\n");
                        err = PTR_ERR(khugepaged_thread);
                        khugepaged_thread = NULL;
+                       goto fail;
                }
 
                if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
                kthread_stop(khugepaged_thread);
                khugepaged_thread = NULL;
        }
-
+fail:
        return err;
 }
 
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
        struct page *zero_page;
 retry:
        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
-               return ACCESS_ONCE(huge_zero_page);
+               return READ_ONCE(huge_zero_page);
 
        zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                        HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
        /* We take additional reference here. It will be put back by shrinker */
        atomic_set(&huge_zero_refcount, 2);
        preempt_enable();
-       return ACCESS_ONCE(huge_zero_page);
+       return READ_ONCE(huge_zero_page);
 }
 
 static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
                int err;
 
                mutex_lock(&khugepaged_mutex);
-               err = start_khugepaged();
+               err = start_stop_khugepaged();
                mutex_unlock(&khugepaged_mutex);
 
                if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
 
        err = hugepage_init_sysfs(&hugepage_kobj);
        if (err)
-               return err;
+               goto err_sysfs;
 
        err = khugepaged_slab_init();
        if (err)
-               goto out;
+               goto err_slab;
 
-       register_shrinker(&huge_zero_page_shrinker);
+       err = register_shrinker(&huge_zero_page_shrinker);
+       if (err)
+               goto err_hzp_shrinker;
 
        /*
         * By default disable transparent hugepages on smaller systems,
         * where the extra memory used could hurt more than TLB overhead
         * is likely to save.  The admin can still enable it through /sys.
         */
-       if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+       if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
                transparent_hugepage_flags = 0;
+               return 0;
+       }
 
-       start_khugepaged();
+       err = start_stop_khugepaged();
+       if (err)
+               goto err_khugepaged;
 
        return 0;
-out:
+err_khugepaged:
+       unregister_shrinker(&huge_zero_page_shrinker);
+err_hzp_shrinker:
+       khugepaged_slab_exit();
+err_slab:
        hugepage_exit_sysfs(hugepage_kobj);
+err_sysfs:
        return err;
 }
 subsys_initcall(hugepage_init);
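
The hugepage_init() hunk above turns the old single "out:" cleanup into a full unwind ladder: register_shrinker()'s return value is now checked, and every failure jumps to a label that tears down only the steps that already succeeded, in reverse order (khugepaged, shrinker, slab cache, sysfs). A standalone sketch of the pattern, in plain C with hypothetical stand-in names, purely for illustration and not part of the patch:

	#include <stdio.h>

	/* Stand-ins for the real setup/teardown steps in hugepage_init(). */
	static int  init_sysfs(void)    { return 0; }	/* hugepage_init_sysfs()    */
	static void exit_sysfs(void)    { }
	static int  init_slab(void)     { return 0; }	/* khugepaged_slab_init()   */
	static void exit_slab(void)     { }
	static int  init_shrinker(void) { return -1; }	/* fails, to show the unwind */
	static void exit_shrinker(void) { }
	static int  start_thread(void)  { return 0; }	/* start_stop_khugepaged()  */

	static int example_init(void)
	{
		int err;

		err = init_sysfs();
		if (err)
			goto err_sysfs;
		err = init_slab();
		if (err)
			goto err_slab;
		err = init_shrinker();
		if (err)
			goto err_shrinker;
		err = start_thread();
		if (err)
			goto err_thread;
		return 0;

	err_thread:
		exit_shrinker();
	err_shrinker:
		exit_slab();
	err_slab:
		exit_sysfs();
	err_sysfs:
		return err;
	}

	int main(void)
	{
		printf("example_init() = %d\n", example_init());
		return 0;
	}

Each label is named after the step that failed, and execution falls through the earlier labels, so every teardown runs exactly once and only for steps that completed.
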
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long haddr, pmd_t *pmd,
-                                       struct page *page)
+                                       struct page *page, gfp_t gfp)
 {
        struct mem_cgroup *memcg;
        pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+       if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
                return VM_FAULT_OOM;
 
        pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
        }
-       if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+       if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                return VM_FAULT_FALLBACK;
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long haddr;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
+       gfp_t huge_gfp;                 /* for allocation and charge */
 
        ptl = pmd_lockptr(mm, pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 alloc:
        if (transparent_hugepage_enabled(vma) &&
            !transparent_hugepage_debug_cow()) {
-               gfp_t gfp;
-
-               gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
-               new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+               huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
        } else
                new_page = NULL;
 
@@ -1130,8 +1138,7 @@ alloc:
                goto out;
        }
 
-       if (unlikely(mem_cgroup_try_charge(new_page, mm,
-                                          GFP_TRANSHUGE, &memcg))) {
+       if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
                put_page(new_page);
                if (page) {
                        split_huge_page(page);
@@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                          pmd, _pmd,  1))
                        update_mmu_cache_pmd(vma, addr, pmd);
        }
-       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+       if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
                if (page->mapping && trylock_page(page)) {
                        lru_add_drain();
                        if (page->mapping)
@@ -1976,6 +1983,11 @@ static int __init khugepaged_slab_init(void)
        return 0;
 }
 
+static void __init khugepaged_slab_exit(void)
+{
+       kmem_cache_destroy(mm_slot_cache);
+}
+
 static inline struct mm_slot *alloc_mm_slot(void)
 {
        if (!mm_slot_cache)     /* initialization failed */
@@ -2109,7 +2121,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
 {
        while (--_pte >= pte) {
                pte_t pteval = *_pte;
-               if (!pte_none(pteval))
+               if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
                        release_pte_page(pte_page(pteval));
        }
 }
@@ -2120,13 +2132,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 {
        struct page *page;
        pte_t *_pte;
-       int none = 0;
+       int none_or_zero = 0;
        bool referenced = false, writable = false;
        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = *_pte;
-               if (pte_none(pteval)) {
-                       if (++none <= khugepaged_max_ptes_none)
+               if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+                       if (++none_or_zero <= khugepaged_max_ptes_none)
                                continue;
                        else
                                goto out;
@@ -2207,9 +2219,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
                pte_t pteval = *_pte;
                struct page *src_page;
 
-               if (pte_none(pteval)) {
+               if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        clear_user_highpage(page, address);
                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+                       if (is_zero_pfn(pte_pfn(pteval))) {
+                               /*
+                                * ptl mostly unnecessary.
+                                */
+                               spin_lock(ptl);
+                               /*
+                                * paravirt calls inside pte_clear here are
+                                * superfluous.
+                                */
+                               pte_clear(vma->vm_mm, address, _pte);
+                               spin_unlock(ptl);
+                       }
                } else {
                        src_page = pte_page(pteval);
                        copy_user_highpage(page, src_page, address, vma);
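
The zero-page hunks above (release_pte_pages(), __collapse_huge_page_isolate(), __collapse_huge_page_copy(), plus the khugepaged_scan_pmd() hunk further down) all apply one rule: a pte mapping the shared zero page is treated like an empty pte, so it counts against the same khugepaged_max_ptes_none budget, is never isolated or released, and during copy is simply cleared and backed by a freshly zeroed part of the new huge page. A minimal kernel-style sketch of that shared test (the helper name is hypothetical; the patch open-codes it at each site):

	#include <linux/mm.h>		/* is_zero_pfn() */
	#include <asm/pgtable.h>	/* pte_t, pte_none(), pte_pfn() */

	/* Hypothetical helper, not introduced by the patch: a pte is "empty
	 * enough" to collapse over if it maps nothing or the shared zero page. */
	static inline bool khugepaged_pte_none_or_zero(pte_t pteval)
	{
		return pte_none(pteval) || is_zero_pfn(pte_pfn(pteval));
	}

Clearing the zero-page pte under ptl in the copy loop mirrors the normal-page branch, but no rmap or reference-count bookkeeping is needed because the shared zero page is never isolated.
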
@@ -2311,8 +2335,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
        return true;
 }
 
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
                       struct vm_area_struct *vma, unsigned long address,
                       int node)
 {
@@ -2326,8 +2350,7 @@ static struct page
         */
        up_read(&mm->mmap_sem);
 
-       *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
-               khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+       *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
        if (unlikely(!*hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *hpage = ERR_PTR(-ENOMEM);
@@ -2380,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
        return true;
 }
 
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
                       struct vm_area_struct *vma, unsigned long address,
                       int node)
 {
        up_read(&mm->mmap_sem);
        VM_BUG_ON(!*hpage);
+
        return  *hpage;
 }
 #endif
@@ -2421,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm,
        struct mem_cgroup *memcg;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
+       gfp_t gfp;
 
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
+       /* Only allocate from the target node */
+       gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+               __GFP_THISNODE;
+
        /* release the mmap_sem read lock. */
-       new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+       new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
        if (!new_page)
                return;
 
        if (unlikely(mem_cgroup_try_charge(new_page, mm,
-                                          GFP_TRANSHUGE, &memcg)))
+                                          gfp, &memcg)))
                return;
 
        /*
@@ -2543,7 +2572,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 {
        pmd_t *pmd;
        pte_t *pte, *_pte;
-       int ret = 0, none = 0;
+       int ret = 0, none_or_zero = 0;
        struct page *page;
        unsigned long _address;
        spinlock_t *ptl;
@@ -2561,8 +2590,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
        for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = *_pte;
-               if (pte_none(pteval)) {
-                       if (++none <= khugepaged_max_ptes_none)
+               if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+                       if (++none_or_zero <= khugepaged_max_ptes_none)
                                continue;
                        else
                                goto out_unmap;