diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 05b2836..d1b315e 100644
@@ -90,6 +90,7 @@
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
 #include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
        .flags = MPOL_F_LOCAL,
 };
 
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+       struct mempolicy *pol = p->mempolicy;
+       int node;
+
+       if (!pol) {
+               node = numa_node_id();
+               if (node != -1)
+                       pol = &preferred_node_policy[node];
+
+               /* preferred_node_policy is not initialised early in boot */
+               if (!pol->mode)
+                       pol = NULL;
+       }
+
+       return pol;
+}
+
 static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        /*
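
This helper is what gives every task a memory policy even when none was
set explicitly: tasks without one fall back to the per-node
MPOL_PREFERRED entries initialised in numa_policy_init() further down
(with MPOL_F_MOF | MPOL_F_MORON set). The fallback is kernel-internal
and does not change what the syscalls report. A minimal user-space
probe, assuming libnuma's <numaif.h> (link with -lnuma), showing that
get_mempolicy(2) still sees MPOL_DEFAULT for such a task:

    #include <numaif.h>
    #include <stdio.h>

    int main(void)
    {
            int mode = -1;

            /* No set_mempolicy() was called; the per-node fallback
             * above is not visible through this interface. */
            if (get_mempolicy(&mode, NULL, 0, NULL, 0) == 0)
                    printf("mode = %d (MPOL_DEFAULT is 0)\n", mode);
            return 0;
    }
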
@@ -212,9 +233,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
        if (pol == NULL)
                return 0;
-       /* Check N_HIGH_MEMORY */
+       /* Check N_MEMORY */
        nodes_and(nsc->mask1,
-                 cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
+                 cpuset_current_mems_allowed, node_states[N_MEMORY]);
 
        VM_BUG_ON(!nodes);
        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
        if (mode == MPOL_DEFAULT) {
                if (nodes && !nodes_empty(*nodes))
                        return ERR_PTR(-EINVAL);
-               return NULL;    /* simply delete any existing policy */
+               return NULL;
        }
        VM_BUG_ON(!nodes);
 
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                             (flags & MPOL_F_RELATIVE_NODES)))
                                return ERR_PTR(-EINVAL);
                }
+       } else if (mode == MPOL_LOCAL) {
+               if (!nodes_empty(*nodes))
+                       return ERR_PTR(-EINVAL);
+               mode = MPOL_PREFERRED;
        } else if (nodes_empty(*nodes))
                return ERR_PTR(-EINVAL);
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
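
The new branch makes MPOL_LOCAL a real mode accepted by mpol_new()
rather than a parse-time alias: it must come with an empty nodemask and
is translated to MPOL_PREFERRED, which with no nodes means
MPOL_F_LOCAL. A hedged user-space sketch; the MPOL_LOCAL value is
defined below as an assumption for older <numaif.h> headers that lack
it:

    #include <numaif.h>
    #include <stdio.h>

    #ifndef MPOL_LOCAL
    #define MPOL_LOCAL 4    /* assumed: follows MPOL_INTERLEAVE in the uapi enum */
    #endif

    int main(void)
    {
            /* The nodemask must be NULL/empty, matching the
             * nodes_empty() check in mpol_new(). */
            if (set_mempolicy(MPOL_LOCAL, NULL, 0) != 0)
                    perror("set_mempolicy(MPOL_LOCAL)");
            return 0;
    }
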
@@ -511,7 +536,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               split_huge_page_pmd(vma->vm_mm, pmd);
+               split_huge_page_pmd(vma, addr, pmd);
                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        continue;
                if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
        return 0;
 }
 
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * This is used to mark a range of virtual addresses as inaccessible.
+ * The markers are later cleared by NUMA hinting faults and, depending
+ * on those faults, pages may be migrated for better NUMA placement.
+ *
+ * This assumes that NUMA faults are handled using PROT_NONE. If an
+ * architecture makes a different choice, it will need further changes
+ * to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+                       unsigned long addr, unsigned long end)
+{
+       int nr_updated;
+       BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+
+       nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+       if (nr_updated)
+               count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+       return nr_updated;
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+                       unsigned long addr, unsigned long end)
+{
+       return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
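
change_prot_numa() is the producer half of the hinting machinery: the
PTEs in the range are made PROT_NONE so the next access faults. A
sketch of the consumer half, simplified from the handle_pte_fault()
hook added elsewhere in this series (pte_numa() and do_numa_page() are
assumed from those companion patches):

    /* In handle_pte_fault(), before the usual access handling: */
    if (pte_numa(entry))
            return do_numa_page(mm, vma, address, entry, pte, pmd);
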
 /*
  * Check if all pages in a range are on a set of nodes.
  * If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                return ERR_PTR(-EFAULT);
        prev = NULL;
        for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+               unsigned long endvma = vma->vm_end;
+
+               if (endvma > end)
+                       endvma = end;
+               if (vma->vm_start > start)
+                       start = vma->vm_start;
+
                if (!(flags & MPOL_MF_DISCONTIG_OK)) {
                        if (!vma->vm_next && vma->vm_end < end)
                                return ERR_PTR(-EFAULT);
                        if (prev && prev->vm_end < vma->vm_start)
                                return ERR_PTR(-EFAULT);
                }
-               if (!is_vm_hugetlb_page(vma) &&
-                   ((flags & MPOL_MF_STRICT) ||
+
+               if (is_vm_hugetlb_page(vma))
+                       goto next;
+
+               if (flags & MPOL_MF_LAZY) {
+                       change_prot_numa(vma, start, endvma);
+                       goto next;
+               }
+
+               if ((flags & MPOL_MF_STRICT) ||
                     ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-                               vma_migratable(vma)))) {
-                       unsigned long endvma = vma->vm_end;
+                     vma_migratable(vma))) {
 
-                       if (endvma > end)
-                               endvma = end;
-                       if (vma->vm_start > start)
-                               start = vma->vm_start;
                        err = check_pgd_range(vma, start, endvma, nodes,
                                                flags, private);
                        if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                                break;
                        }
                }
+next:
                prev = vma;
        }
        return first;
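
From user space, the lazy path is reached by passing the new
MPOL_MF_LAZY flag to mbind(2): the range is only marked for hinting
faults, and pages migrate on next touch instead of being isolated and
moved synchronously. A hedged sketch (the flag's value is assumed from
this series' uapi header, since contemporary <numaif.h> may not carry
it):

    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>

    #ifndef MPOL_MF_LAZY
    #define MPOL_MF_LAZY (1 << 3)   /* assumed value from the series */
    #endif

    int main(void)
    {
            size_t len = 64 * 4096;
            void *buf = aligned_alloc(4096, len);
            unsigned long mask = 1UL << 1;  /* node 1 */

            if (!buf)
                    return 1;
            /* Rebind to node 1, deferring the copies until the pages
             * are next touched. */
            if (mbind(buf, len, MPOL_BIND, &mask, 8 * sizeof(mask),
                      MPOL_MF_MOVE | MPOL_MF_LAZY) != 0)
                    perror("mbind");
            return 0;
    }
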
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 
        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, new_node_page, dest,
-                                                       false, MIGRATE_SYNC);
+                                                       false, MIGRATE_SYNC,
+                                                       MR_SYSCALL);
                if (err)
                        putback_lru_pages(&pagelist);
        }
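
The extra migrate_pages() argument threads a reason through to the new
mm_migrate_pages tracepoint so migrations can be attributed. A hedged
sketch of the enum the companion patch adds to include/linux/migrate.h:

    enum migrate_reason {
            MR_COMPACTION,
            MR_MEMORY_FAILURE,
            MR_MEMORY_HOTPLUG,
            MR_SYSCALL,             /* migrate_pages() and move_pages() syscalls */
            MR_MEMPOLICY_MBIND,
            MR_NUMA_MISPLACED,
            MR_CMA
    };
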
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
        int err;
        LIST_HEAD(pagelist);
 
-       if (flags & ~(unsigned long)(MPOL_MF_STRICT |
-                                    MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+       if (flags & ~(unsigned long)MPOL_MF_VALID)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;
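
Folding the open-coded mask into MPOL_MF_VALID is what admits the new
flag here. A hedged sketch of the uapi flag definitions this check now
relies on (values assumed from the series):

    #define MPOL_MF_STRICT   (1 << 0)  /* verify existing pages in the mapping */
    #define MPOL_MF_MOVE     (1 << 1)  /* move pages owned by this process */
    #define MPOL_MF_MOVE_ALL (1 << 2)  /* move every page in the mapping */
    #define MPOL_MF_LAZY     (1 << 3)  /* modifies _MOVE: lazy migrate on fault */

    #define MPOL_MF_VALID    (MPOL_MF_STRICT | MPOL_MF_MOVE | \
                              MPOL_MF_MOVE_ALL | MPOL_MF_LAZY)
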
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
        if (IS_ERR(new))
                return PTR_ERR(new);
 
+       if (flags & MPOL_MF_LAZY)
+               new->flags |= MPOL_F_MOF;
+
        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
        vma = check_range(mm, start, end, nmask,
                          flags | MPOL_MF_INVERT, &pagelist);
 
-       err = PTR_ERR(vma);
-       if (!IS_ERR(vma)) {
-               int nr_failed = 0;
-
+       err = PTR_ERR(vma);     /* tentative; overwritten below unless check_range() failed */
+       if (!IS_ERR(vma))
                err = mbind_range(mm, start, end, new);
 
+       if (!err) {
+               int nr_failed = 0;
+
                if (!list_empty(&pagelist)) {
+                       WARN_ON_ONCE(flags & MPOL_MF_LAZY);
                        nr_failed = migrate_pages(&pagelist, new_vma_page,
                                                (unsigned long)vma,
-                                               false, MIGRATE_SYNC);
+                                               false, MIGRATE_SYNC,
+                                               MR_MEMPOLICY_MBIND);
                        if (nr_failed)
                                putback_lru_pages(&pagelist);
                }
 
-               if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+               if (nr_failed && (flags & MPOL_MF_STRICT))
                        err = -EIO;
        } else
                putback_lru_pages(&pagelist);
@@ -1388,7 +1460,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                goto out_put;
        }
 
-       if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
+       if (!nodes_subset(*new, node_states[N_MEMORY])) {
                err = -EINVAL;
                goto out_put;
        }
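
The N_HIGH_MEMORY -> N_MEMORY conversions sprinkled through this patch
come from the concurrent movable-node work: N_MEMORY means "the node
has memory" of any kind. A hedged sketch of the node_states enum from
include/linux/nodemask.h in that series:

    enum node_states {
            N_POSSIBLE,             /* the node could become online */
            N_ONLINE,               /* the node is online */
            N_NORMAL_MEMORY,        /* the node has regular memory */
    #ifdef CONFIG_HIGHMEM
            N_HIGH_MEMORY,          /* the node has regular or high memory */
    #else
            N_HIGH_MEMORY = N_NORMAL_MEMORY,
    #endif
    #ifdef CONFIG_MOVABLE_NODE
            N_MEMORY,               /* regular, high or movable memory */
    #else
            N_MEMORY = N_HIGH_MEMORY,
    #endif
            N_CPU,                  /* the node has one or more cpus */
            NR_NODE_STATES
    };
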
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 struct mempolicy *get_vma_policy(struct task_struct *task,
                struct vm_area_struct *vma, unsigned long addr)
 {
-       struct mempolicy *pol = task->mempolicy;
+       struct mempolicy *pol = get_task_policy(task);
 
        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1956,7 +2028,7 @@ retry_cpuset:
  */
 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
-       struct mempolicy *pol = current->mempolicy;
+       struct mempolicy *pol = get_task_policy(current);
        struct page *page;
        unsigned int cpuset_mems_cookie;
 
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n)
        kmem_cache_free(sn_cache, n);
 }
 
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page: page to be checked
+ * @vma: vm area where page mapped
+ * @addr: virtual address where page mapped
+ *
+ * Look up the current policy node id for vma,addr and compare it
+ * against the page's node id.
+ *
+ * Returns:
+ *     -1      - not misplaced, page is in the right node
+ *     node    - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+       struct mempolicy *pol;
+       struct zone *zone;
+       int curnid = page_to_nid(page);
+       unsigned long pgoff;
+       int polnid = -1;
+       int ret = -1;
+
+       BUG_ON(!vma);
+
+       pol = get_vma_policy(current, vma, addr);
+       if (!(pol->flags & MPOL_F_MOF))
+               goto out;
+
+       switch (pol->mode) {
+       case MPOL_INTERLEAVE:
+               BUG_ON(addr >= vma->vm_end);
+               BUG_ON(addr < vma->vm_start);
+
+               pgoff = vma->vm_pgoff;
+               pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+               polnid = offset_il_node(pol, vma, pgoff);
+               break;
+
+       case MPOL_PREFERRED:
+               if (pol->flags & MPOL_F_LOCAL)
+                       polnid = numa_node_id();
+               else
+                       polnid = pol->v.preferred_node;
+               break;
+
+       case MPOL_BIND:
+               /*
+                * MPOL_BIND allows binding to multiple nodes.
+                * Use the current page's node if it is in the policy
+                * nodemask, else select the nearest allowed node, if
+                * any. If there are no allowed nodes, keep the current
+                * node (i.e. the page is not misplaced).
+                */
+               if (node_isset(curnid, pol->v.nodes))
+                       goto out;
+               (void)first_zones_zonelist(
+                               node_zonelist(numa_node_id(), GFP_HIGHUSER),
+                               gfp_zone(GFP_HIGHUSER),
+                               &pol->v.nodes, &zone);
+               polnid = zone->node;
+               break;
+
+       default:
+               BUG();
+       }
+
+       /* Migrate the page towards the node whose CPU is referencing it */
+       if (pol->flags & MPOL_F_MORON) {
+               int last_nid;
+
+               polnid = numa_node_id();
+
+               /*
+                * Multi-stage node selection is used in conjunction
+                * with a periodic migration fault to build a temporal
+                * task<->page relation. By using a two-stage filter we
+                * remove short/unlikely relations.
+                *
+                * Using P(p) ~ n_p / n_t as per frequentist
+                * probability, we can equate a task's usage of a
+                * particular page (n_p) per total usage of this
+                * page (n_t) (in a given time-span) to a probability.
+                *
+                * Our periodic faults will sample this probability and
+                * getting the same result twice in a row, given these
+                * samples are fully independent, is then given by
+                * P(p)^2, provided our sample period is sufficiently
+                * short compared to the usage pattern.
+                *
+                * This squaring squishes small probabilities, making
+                * it less likely we act on an unlikely task<->page
+                * relation.
+                */
+               last_nid = page_xchg_last_nid(page, polnid);
+               if (last_nid != polnid)
+                       goto out;
+       }
+
+       if (curnid != polnid)
+               ret = polnid;
+out:
+       mpol_cond_put(pol);
+
+       return ret;
+}
+
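
The MPOL_F_MORON branch above only trusts a task<->page relation once
two consecutive hinting faults on the page report the same node. A
small user-space model of that filter (hypothetical helper names; the
kernel keeps last_nid packed into page->flags and swaps it with
page_xchg_last_nid()):

    /* Returns non-zero only when the same node touches the page twice
     * in a row; a relation seen with probability p therefore triggers
     * migration with probability about p^2. */
    static int two_stage_filter(int *page_last_nid, int this_nid)
    {
            int last_nid = *page_last_nid;

            *page_last_nid = this_nid;      /* mirrors page_xchg_last_nid() */
            return last_nid == this_nid;
    }
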
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
        pr_debug("deleting %lx-%lx\n", n->start, n->end);
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)
        mutex_unlock(&p->mutex);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+static bool __initdata numabalancing_override;
+
+static void __init check_numabalancing_enable(void)
+{
+       bool numabalancing_default = false;
+
+       if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+               numabalancing_default = true;
+
+       if (nr_node_ids > 1 && !numabalancing_override) {
+               printk(KERN_INFO "Enabling automatic NUMA balancing. "
+                       "Configure with numa_balancing= or sysctl");
+               set_numabalancing_state(numabalancing_default);
+       }
+}
+
+static int __init setup_numabalancing(char *str)
+{
+       int ret = 0;
+       if (!str)
+               goto out;
+       numabalancing_override = true;
+
+       if (!strcmp(str, "enable")) {
+               set_numabalancing_state(true);
+               ret = 1;
+       } else if (!strcmp(str, "disable")) {
+               set_numabalancing_state(false);
+               ret = 1;
+       }
+out:
+       if (!ret)
+               printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+
+       return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
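
numa_balancing= accepts exactly "enable" or "disable"; anything else
leaves ret at 0 and trips the warning above. At this point the runtime
knobs are the numa_balancing_scan_* sysctls; later kernels also expose
the on/off switch as /proc/sys/kernel/numa_balancing (an assumption if
run against this tree), which can be probed like so:

    #include <stdio.h>

    int main(void)
    {
            char buf[8] = "";
            FILE *f = fopen("/proc/sys/kernel/numa_balancing", "r");

            if (f && fgets(buf, sizeof(buf), f))
                    printf("kernel.numa_balancing = %s", buf);
            if (f)
                    fclose(f);
            return 0;
    }
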
+
 /* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {
@@ -2320,13 +2545,22 @@ void __init numa_policy_init(void)
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);
 
+       for_each_node(nid) {
+               preferred_node_policy[nid] = (struct mempolicy) {
+                       .refcnt = ATOMIC_INIT(1),
+                       .mode = MPOL_PREFERRED,
+                       .flags = MPOL_F_MOF | MPOL_F_MORON,
+                       .v = { .preferred_node = nid, },
+               };
+       }
+
        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(interleave_nodes);
-       for_each_node_state(nid, N_HIGH_MEMORY) {
+       for_each_node_state(nid, N_MEMORY) {
                unsigned long total_pages = node_present_pages(nid);
 
                /* Preserve the largest node */
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void)
 
        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
                printk("numa_policy_init: interleaving failed\n");
+
+       check_numabalancing_enable();
 }
 
 /* Reset policy of current process to default */
@@ -2362,14 +2598,13 @@ void numa_default_policy(void)
  * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
  * Used only for mpol_parse_str() and mpol_to_str()
  */
-#define MPOL_LOCAL MPOL_MAX
 static const char * const policy_modes[] =
 {
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
-       [MPOL_LOCAL]      = "local"
+       [MPOL_LOCAL]      = "local",
 };
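
With MPOL_LOCAL in the uapi enum, policy_modes[] indexes "local"
directly and mpol_parse_str() below can loop to MPOL_MAX. The parser
backs the tmpfs mpol= mount option, so the "local" spelling can be
exercised via mount(2); a hedged sketch (/mnt/t is an assumed,
pre-existing mount point):

    #include <sys/mount.h>
    #include <stdio.h>

    int main(void)
    {
            if (mount("tmpfs", "/mnt/t", "tmpfs", 0, "mpol=local") != 0)
                    perror("mount tmpfs mpol=local");
            return 0;
    }
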
 
 
@@ -2407,7 +2642,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
-               if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
+               if (!nodes_subset(nodes, node_states[N_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
        if (flags)
                *flags++ = '\0';        /* terminate mode string */
 
-       for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+       for (mode = 0; mode < MPOL_MAX; mode++) {
                if (!strcmp(str, policy_modes[mode])) {
                        break;
                }
        }
-       if (mode > MPOL_LOCAL)
+       if (mode >= MPOL_MAX)
                goto out;
 
        switch (mode) {
@@ -2441,7 +2676,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
-                       nodes = node_states[N_HIGH_MEMORY];
+                       nodes = node_states[N_MEMORY];
                break;
        case MPOL_LOCAL:
                /*