mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case NUMA_NO_NODE here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  69
  70 #include <linux/mempolicy.h>
  71 #include <linux/mm.h>
  72 #include <linux/highmem.h>
  73 #include <linux/hugetlb.h>
  74 #include <linux/kernel.h>
  75 #include <linux/sched.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/export.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/ksm.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92 #include <linux/syscalls.h>
  93 #include <linux/ctype.h>
  94 #include <linux/mm_inline.h>
  95 #include <linux/mmu_notifier.h>
  96 #include <linux/printk.h>
  97
  98 #include <asm/tlbflush.h>
  99 #include <asm/uaccess.h>
 100 #include <linux/random.h>
 101
 102 #include "internal.h"
 103
 104 /* Internal flags */
 105 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 106 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 107
 108 static struct kmem_cache *policy_cache;
 109 static struct kmem_cache *sn_cache;
 110
  111 /* Highest zone. A specific allocation for a zone below that is not
 112    policied. */
 113 enum zone_type policy_zone = 0;
 114
 115 /*
 116  * run-time system-wide default policy => local allocation
 117  */
 118 static struct mempolicy default_policy = {
 119         .refcnt = ATOMIC_INIT(1), /* never free it */
 120         .mode = MPOL_PREFERRED,
 121         .flags = MPOL_F_LOCAL,
 122 };
 123
 124 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
 125
 126 static struct mempolicy *get_task_policy(struct task_struct *p)
 127 {
 128         struct mempolicy *pol = p->mempolicy;
 129
 130         if (!pol) {
 131                 int node = numa_node_id();
 132
 133                 if (node != NUMA_NO_NODE) {
 134                         pol = &preferred_node_policy[node];
 135                         /*
 136                          * preferred_node_policy is not initialised early in
 137                          * boot
 138                          */
 139                         if (!pol->mode)
 140                                 pol = NULL;
 141                 }
 142         }
 143
 144         return pol;
 145 }
 146
/*
 * Per-mode callbacks, indexed by policy mode.  This is only the type plus a
 * forward declaration of mpol_ops[]; the initialised table appears further
 * down in this file.
 */
static const struct mempolicy_operations {
        /*
         * Install @nodes into the freshly allocated policy @pol.
         * mpol_set_nodemask() calls this with either a computed nodemask or
         * NULL; it returns 0 on success or a negative errno.
         */
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        /*
         * If the read-side task has no lock to protect task->mempolicy, the
         * write-side task rebinds task->mempolicy in two steps.  The first
         * step sets all the newly allowed nodes, and the second step clears
         * all the disallowed nodes.  This way we avoid finding no node to
         * allocate a page from.
         * If we have a lock to protect task->mempolicy on the read side, we
         * rebind directly.
         *
         * step:
         *      MPOL_REBIND_ONCE - do rebind work at once
         *      MPOL_REBIND_STEP1 - set all the newly nodes
         *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
         */
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
                        enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
 166
 167 /* Check that the nodemask contains at least one populated zone */
 168 static int is_valid_nodemask(const nodemask_t *nodemask)
 169 {
 170         return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 171 }
 172
 173 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 174 {
 175         return pol->flags & MPOL_MODE_FLAGS;
 176 }
 177
 178 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 179                                    const nodemask_t *rel)
 180 {
 181         nodemask_t tmp;
 182         nodes_fold(tmp, *orig, nodes_weight(*rel));
 183         nodes_onto(*ret, tmp, *rel);
 184 }
 185
 186 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 187 {
 188         if (nodes_empty(*nodes))
 189                 return -EINVAL;
 190         pol->v.nodes = *nodes;
 191         return 0;
 192 }
 193
 194 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 195 {
 196         if (!nodes)
 197                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 198         else if (nodes_empty(*nodes))
 199                 return -EINVAL;                 /*  no allowed nodes */
 200         else
 201                 pol->v.preferred_node = first_node(*nodes);
 202         return 0;
 203 }
 204
 205 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 206 {
 207         if (!is_valid_nodemask(nodes))
 208                 return -EINVAL;
 209         pol->v.nodes = *nodes;
 210         return 0;
 211 }
 212
 213 /*
 214  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 215  * any, for the new policy.  mpol_new() has already validated the nodes
 216  * parameter with respect to the policy mode and flags.  But, we need to
 217  * handle an empty nodemask with MPOL_PREFERRED here.
 218  *
 219  * Must be called holding task's alloc_lock to protect task's mems_allowed
 220  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 221  */
 222 static int mpol_set_nodemask(struct mempolicy *pol,
 223                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 224 {
 225         int ret;
 226
 227         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 228         if (pol == NULL)
 229                 return 0;
 230         /* Check N_MEMORY */
 231         nodes_and(nsc->mask1,
 232                   cpuset_current_mems_allowed, node_states[N_MEMORY]);
 233
 234         VM_BUG_ON(!nodes);
 235         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 236                 nodes = NULL;   /* explicit local allocation */
 237         else {
 238                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 239                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 240                 else
 241                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 242
 243                 if (mpol_store_user_nodemask(pol))
 244                         pol->w.user_nodemask = *nodes;
 245                 else
 246                         pol->w.cpuset_mems_allowed =
 247                                                 cpuset_current_mems_allowed;
 248         }
 249
 250         if (nodes)
 251                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 252         else
 253                 ret = mpol_ops[pol->mode].create(pol, NULL);
 254         return ret;
 255 }
 256
 257 /*
 258  * This function just creates a new policy, does some check and simple
 259  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 260  */
 261 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 262                                   nodemask_t *nodes)
 263 {
 264         struct mempolicy *policy;
 265
 266         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 267                  mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
 268
 269         if (mode == MPOL_DEFAULT) {
 270                 if (nodes && !nodes_empty(*nodes))
 271                         return ERR_PTR(-EINVAL);
 272                 return NULL;
 273         }
 274         VM_BUG_ON(!nodes);
 275
 276         /*
 277          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 278          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 279          * All other modes require a valid pointer to a non-empty nodemask.
 280          */
 281         if (mode == MPOL_PREFERRED) {
 282                 if (nodes_empty(*nodes)) {
 283                         if (((flags & MPOL_F_STATIC_NODES) ||
 284                              (flags & MPOL_F_RELATIVE_NODES)))
 285                                 return ERR_PTR(-EINVAL);
 286                 }
 287         } else if (mode == MPOL_LOCAL) {
 288                 if (!nodes_empty(*nodes))
 289                         return ERR_PTR(-EINVAL);
 290                 mode = MPOL_PREFERRED;
 291         } else if (nodes_empty(*nodes))
 292                 return ERR_PTR(-EINVAL);
 293         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 294         if (!policy)
 295                 return ERR_PTR(-ENOMEM);
 296         atomic_set(&policy->refcnt, 1);
 297         policy->mode = mode;
 298         policy->flags = flags;
 299
 300         return policy;
 301 }
 302
 303 /* Slow path of a mpol destructor. */
 304 void __mpol_put(struct mempolicy *p)
 305 {
 306         if (!atomic_dec_and_test(&p->refcnt))
 307                 return;
 308         kmem_cache_free(policy_cache, p);
 309 }
 310
/*
 * ->rebind() for MPOL_DEFAULT.  Intentionally empty: it touches neither
 * @pol nor @nodes, and exists so the MPOL_DEFAULT slot of mpol_ops[] has a
 * callable ->rebind.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
                                enum mpol_rebind_step step)
{
}
 315
/*
 * ->rebind() for the modes whose state is a nodemask in pol->v.nodes
 * (MPOL_INTERLEAVE and MPOL_BIND in mpol_ops[]).
 *
 * @pol:   policy to rebind
 * @nodes: the new set of allowed nodes
 * @step:
 *      MPOL_REBIND_ONCE  - do rebind work at once
 *      MPOL_REBIND_STEP1 - set all the newly nodes
 *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 *
 * Any other @step value hits BUG().
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
                                 enum mpol_rebind_step step)
{
        nodemask_t tmp;

        /* Work out the candidate new nodemask in tmp. */
        if (pol->flags & MPOL_F_STATIC_NODES)
                nodes_and(tmp, pol->w.user_nodemask, *nodes);
        else if (pol->flags & MPOL_F_RELATIVE_NODES)
                mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
        else {
                /*
                 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
                 * result
                 */
                if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
                        nodes_remap(tmp, pol->v.nodes,
                                        pol->w.cpuset_mems_allowed, *nodes);
                        /*
                         * "step" is used as a boolean here: STEP1 caches the
                         * remapped mask, ONCE stores the new mask directly.
                         * NOTE(review): relies on MPOL_REBIND_ONCE being 0 -
                         * confirm against the enum definition.
                         */
                        pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
                } else if (step == MPOL_REBIND_STEP2) {
                        /* Pick up the result cached by STEP1. */
                        tmp = pol->w.cpuset_mems_allowed;
                        pol->w.cpuset_mems_allowed = *nodes;
                } else
                        BUG();
        }

        /* Never end up with an empty mask: fall back to all of @nodes. */
        if (nodes_empty(tmp))
                tmp = *nodes;

        /* STEP1 only adds nodes; ONCE and STEP2 replace the mask. */
        if (step == MPOL_REBIND_STEP1)
                nodes_or(pol->v.nodes, pol->v.nodes, tmp);
        else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
                pol->v.nodes = tmp;
        else
                BUG();

        /*
         * Make sure current->il_next lies inside tmp: advance to the next set
         * node, wrap to the first one, and as a last resort use
         * numa_node_id().  Note this always updates "current", whichever
         * policy is being rebound.
         */
        if (!node_isset(current->il_next, tmp)) {
                current->il_next = next_node(current->il_next, tmp);
                if (current->il_next >= MAX_NUMNODES)
                        current->il_next = first_node(tmp);
                if (current->il_next >= MAX_NUMNODES)
                        current->il_next = numa_node_id();
        }
}
 365
 366 static void mpol_rebind_preferred(struct mempolicy *pol,
 367                                   const nodemask_t *nodes,
 368                                   enum mpol_rebind_step step)
 369 {
 370         nodemask_t tmp;
 371
 372         if (pol->flags & MPOL_F_STATIC_NODES) {
 373                 int node = first_node(pol->w.user_nodemask);
 374
 375                 if (node_isset(node, *nodes)) {
 376                         pol->v.preferred_node = node;
 377                         pol->flags &= ~MPOL_F_LOCAL;
 378                 } else
 379                         pol->flags |= MPOL_F_LOCAL;
 380         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 381                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 382                 pol->v.preferred_node = first_node(tmp);
 383         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 384                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 385                                                    pol->w.cpuset_mems_allowed,
 386                                                    *nodes);
 387                 pol->w.cpuset_mems_allowed = *nodes;
 388         }
 389 }
 390
 391 /*
 392  * mpol_rebind_policy - Migrate a policy to a different set of nodes
 393  *
 394  * If read-side task has no lock to protect task->mempolicy, write-side
 395  * task will rebind the task->mempolicy by two step. The first step is
 396  * setting all the newly nodes, and the second step is cleaning all the
 397  * disallowed nodes. In this way, we can avoid finding no node to alloc
 398  * page.
 399  * If we have a lock to protect task->mempolicy in read-side, we do
 400  * rebind directly.
 401  *
 402  * step:
 403  *      MPOL_REBIND_ONCE  - do rebind work at once
 404  *      MPOL_REBIND_STEP1 - set all the newly nodes
 405  *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
 406  */
 407 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
 408                                 enum mpol_rebind_step step)
 409 {
 410         if (!pol)
 411                 return;
 412         if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
 413             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 414                 return;
 415
 416         if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
 417                 return;
 418
 419         if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
 420                 BUG();
 421
 422         if (step == MPOL_REBIND_STEP1)
 423                 pol->flags |= MPOL_F_REBINDING;
 424         else if (step == MPOL_REBIND_STEP2)
 425                 pol->flags &= ~MPOL_F_REBINDING;
 426         else if (step >= MPOL_REBIND_NSTEP)
 427                 BUG();
 428
 429         mpol_ops[pol->mode].rebind(pol, newmask, step);
 430 }
 431
 432 /*
 433  * Wrapper for mpol_rebind_policy() that just requires task
 434  * pointer, and updates task mempolicy.
 435  *
 436  * Called with task's alloc_lock held.
 437  */
 438
 439 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
 440                         enum mpol_rebind_step step)
 441 {
 442         mpol_rebind_policy(tsk->mempolicy, new, step);
 443 }
 444
 445 /*
 446  * Rebind each vma in mm to new nodemask.
 447  *
 448  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 449  */
 450
 451 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 452 {
 453         struct vm_area_struct *vma;
 454
 455         down_write(&mm->mmap_sem);
 456         for (vma = mm->mmap; vma; vma = vma->vm_next)
 457                 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
 458         up_write(&mm->mmap_sem);
 459 }
 460
/*
 * Per-mode callback table, indexed by policy mode.  Slots and members not
 * named here are zero-initialised, so they hold NULL callbacks.
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        /* No ->create: mpol_new() returns NULL for MPOL_DEFAULT. */
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_interleave,
                .rebind = mpol_rebind_nodemask,
        },
        /* Also serves MPOL_LOCAL, which mpol_new() turns into this mode. */
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_bind,
                .rebind = mpol_rebind_nodemask,
        },
};
 478
/*
 * Forward declaration; the definition is not in this part of the file.
 * queue_pages_pte_range() calls it for each page selected for migration,
 * passing its @private argument as @pagelist.
 */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags);
 481
 482 /*
 483  * Scan through pages checking if pages follow certain conditions,
 484  * and move them to the pagelist if they do.
 485  */
 486 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 487                 unsigned long addr, unsigned long end,
 488                 const nodemask_t *nodes, unsigned long flags,
 489                 void *private)
 490 {
 491         pte_t *orig_pte;
 492         pte_t *pte;
 493         spinlock_t *ptl;
 494
 495         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 496         do {
 497                 struct page *page;
 498                 int nid;
 499
 500                 if (!pte_present(*pte))
 501                         continue;
 502                 page = vm_normal_page(vma, addr, *pte);
 503                 if (!page)
 504                         continue;
 505                 /*
 506                  * vm_normal_page() filters out zero pages, but there might
 507                  * still be PageReserved pages to skip, perhaps in a VDSO.
 508                  */
 509                 if (PageReserved(page))
 510                         continue;
 511                 nid = page_to_nid(page);
 512                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 513                         continue;
 514
 515                 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 516                         migrate_page_add(page, private, flags);
 517                 else
 518                         break;
 519         } while (pte++, addr += PAGE_SIZE, addr != end);
 520         pte_unmap_unlock(orig_pte, ptl);
 521         return addr != end;
 522 }
 523
 524 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
 525                 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
 526                                     void *private)
 527 {
 528 #ifdef CONFIG_HUGETLB_PAGE
 529         int nid;
 530         struct page *page;
 531         spinlock_t *ptl;
 532
 533         ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
 534         page = pte_page(huge_ptep_get((pte_t *)pmd));
 535         nid = page_to_nid(page);
 536         if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 537                 goto unlock;
 538         /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
 539         if (flags & (MPOL_MF_MOVE_ALL) ||
 540             (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
 541                 isolate_huge_page(page, private);
 542 unlock:
 543         spin_unlock(ptl);
 544 #else
 545         BUG();
 546 #endif
 547 }
 548
/*
 * Walk the pmd entries covering [@addr, @end) under @pud.
 *
 * Entries that are not present are skipped.  Huge entries in a hugetlb vma
 * go to queue_pages_hugetlb_pmd_range().  Everything else is passed through
 * split_huge_page_pmd() and then scanned pte by pte.
 *
 * Returns 0, or -EIO as soon as queue_pages_pte_range() reports that it
 * stopped early.
 */
static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr, unsigned long end,
                const nodemask_t *nodes, unsigned long flags,
                void *private)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (!pmd_present(*pmd))
                        continue;
                if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
                        queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
                                                flags, private);
                        continue;
                }
                /* The split must come before the check that follows it. */
                split_huge_page_pmd(vma, addr, pmd);
                if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                        continue;
                if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
                                    flags, private))
                        return -EIO;
        } while (pmd++, addr = next, addr != end);
        return 0;
}
 576
 577 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 578                 unsigned long addr, unsigned long end,
 579                 const nodemask_t *nodes, unsigned long flags,
 580                 void *private)
 581 {
 582         pud_t *pud;
 583         unsigned long next;
 584
 585         pud = pud_offset(pgd, addr);
 586         do {
 587                 next = pud_addr_end(addr, end);
 588                 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
 589                         continue;
 590                 if (pud_none_or_clear_bad(pud))
 591                         continue;
 592                 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
 593                                     flags, private))
 594                         return -EIO;
 595         } while (pud++, addr = next, addr != end);
 596         return 0;
 597 }
 598
 599 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
 600                 unsigned long addr, unsigned long end,
 601                 const nodemask_t *nodes, unsigned long flags,
 602                 void *private)
 603 {
 604         pgd_t *pgd;
 605         unsigned long next;
 606
 607         pgd = pgd_offset(vma->vm_mm, addr);
 608         do {
 609                 next = pgd_addr_end(addr, end);
 610                 if (pgd_none_or_clear_bad(pgd))
 611                         continue;
 612                 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
 613                                     flags, private))
 614                         return -EIO;
 615         } while (pgd++, addr = next, addr != end);
 616         return 0;
 617 }
 618
 619 #ifdef CONFIG_NUMA_BALANCING
 620 /*
 621  * This is used to mark a range of virtual addresses to be inaccessible.
 622  * These are later cleared by a NUMA hinting fault. Depending on these
 623  * faults, pages may be migrated for better NUMA placement.
 624  *
 625  * This is assuming that NUMA faults are handled using PROT_NONE. If
 626  * an architecture makes a different choice, it will need further
 627  * changes to the core.
 628  */
 629 unsigned long change_prot_numa(struct vm_area_struct *vma,
 630                         unsigned long addr, unsigned long end)
 631 {
 632         int nr_updated;
 633
 634         nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
 635         if (nr_updated)
 636                 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
 637
 638         return nr_updated;
 639 }
 640 #else
 641 static unsigned long change_prot_numa(struct vm_area_struct *vma,
 642                         unsigned long addr, unsigned long end)
 643 {
 644         return 0;
 645 }
 646 #endif /* CONFIG_NUMA_BALANCING */
 647
 648 /*
 649  * Walk through page tables and collect pages to be migrated.
 650  *
 651  * If pages found in a given range are on a set of nodes (determined by
 652  * @nodes and @flags,) it's isolated and queued to the pagelist which is
 653  * passed via @private.)
 654  */
 655 static struct vm_area_struct *
 656 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 657                 const nodemask_t *nodes, unsigned long flags, void *private)
 658 {
 659         int err;
 660         struct vm_area_struct *first, *vma, *prev;
 661
 662
 663         first = find_vma(mm, start);
 664         if (!first)
 665                 return ERR_PTR(-EFAULT);
 666         prev = NULL;
 667         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 668                 unsigned long endvma = vma->vm_end;
 669
 670                 if (endvma > end)
 671                         endvma = end;
 672                 if (vma->vm_start > start)
 673                         start = vma->vm_start;
 674
 675                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 676                         if (!vma->vm_next && vma->vm_end < end)
 677                                 return ERR_PTR(-EFAULT);
 678                         if (prev && prev->vm_end < vma->vm_start)
 679                                 return ERR_PTR(-EFAULT);
 680                 }
 681
 682                 if (flags & MPOL_MF_LAZY) {
 683                         change_prot_numa(vma, start, endvma);
 684                         goto next;
 685                 }
 686
 687                 if ((flags & MPOL_MF_STRICT) ||
 688                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 689                       vma_migratable(vma))) {
 690
 691                         err = queue_pages_pgd_range(vma, start, endvma, nodes,
 692                                                 flags, private);
 693                         if (err) {
 694                                 first = ERR_PTR(err);
 695                                 break;
 696                         }
 697                 }
 698 next:
 699                 prev = vma;
 700         }
 701         return first;
 702 }
 703
 704 /*
 705  * Apply policy to a single VMA
 706  * This must be called with the mmap_sem held for writing.
 707  */
 708 static int vma_replace_policy(struct vm_area_struct *vma,
 709                                                 struct mempolicy *pol)
 710 {
 711         int err;
 712         struct mempolicy *old;
 713         struct mempolicy *new;
 714
 715         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 716                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 717                  vma->vm_ops, vma->vm_file,
 718                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 719
 720         new = mpol_dup(pol);
 721         if (IS_ERR(new))
 722                 return PTR_ERR(new);
 723
 724         if (vma->vm_ops && vma->vm_ops->set_policy) {
 725                 err = vma->vm_ops->set_policy(vma, new);
 726                 if (err)
 727                         goto err_out;
 728         }
 729
 730         old = vma->vm_policy;
 731         vma->vm_policy = new; /* protected by mmap_sem */
 732         mpol_put(old);
 733
 734         return 0;
 735  err_out:
 736         mpol_put(new);
 737         return err;
 738 }
 739
/*
 * Step 2: apply policy to a range and do splits.
 *
 * Installs @new_pol on every vma overlapping [@start, @end) in @mm.  For
 * each vma it first tries to merge the affected part with its neighbours
 * via vma_merge().  If that fails, the vma is split at the range boundaries
 * so that the policy covers exactly the requested part.
 *
 * Returns 0 on success, -EFAULT if no vma contains @start, or the error
 * returned by split_vma() or vma_replace_policy().
 */
static int mbind_range(struct mm_struct *mm, unsigned long start,
                       unsigned long end, struct mempolicy *new_pol)
{
        struct vm_area_struct *next;
        struct vm_area_struct *prev;
        struct vm_area_struct *vma;
        int err = 0;
        pgoff_t pgoff;
        unsigned long vmstart;
        unsigned long vmend;

        vma = find_vma(mm, start);
        if (!vma || vma->vm_start > start)
                return -EFAULT;

        /*
         * If the range starts inside the first vma, that vma is itself the
         * "previous" one as far as vma_merge() is concerned.
         */
        prev = vma->vm_prev;
        if (start > vma->vm_start)
                prev = vma;

        for (; vma && vma->vm_start < end; prev = vma, vma = next) {
                /* Remember the successor now; merge/split may change vma. */
                next = vma->vm_next;
                /* Clamp the requested range to this vma. */
                vmstart = max(start, vma->vm_start);
                vmend   = min(end, vma->vm_end);

                /* Already has the wanted policy: nothing to do. */
                if (mpol_equal(vma_policy(vma), new_pol))
                        continue;

                /* File offset (in pages) that corresponds to vmstart. */
                pgoff = vma->vm_pgoff +
                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                                  vma->anon_vma, vma->vm_file, pgoff,
                                  new_pol);
                if (prev) {
                        /* Merged: continue from the vma we got back. */
                        vma = prev;
                        next = vma->vm_next;
                        if (mpol_equal(vma_policy(vma), new_pol))
                                continue;
                        /* vma_merge() joined vma && vma->next, case 8 */
                        goto replace;
                }
                /* No merge possible: split off the parts outside the range. */
                if (vma->vm_start != vmstart) {
                        err = split_vma(vma->vm_mm, vma, vmstart, 1);
                        if (err)
                                goto out;
                }
                if (vma->vm_end != vmend) {
                        err = split_vma(vma->vm_mm, vma, vmend, 0);
                        if (err)
                                goto out;
                }
 replace:
                err = vma_replace_policy(vma, new_pol);
                if (err)
                        goto out;
        }

 out:
        return err;
}
 800
 801 /* Set the process memory policy */
 802 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 803                              nodemask_t *nodes)
 804 {
 805         struct mempolicy *new, *old;
 806         struct mm_struct *mm = current->mm;
 807         NODEMASK_SCRATCH(scratch);
 808         int ret;
 809
 810         if (!scratch)
 811                 return -ENOMEM;
 812
 813         new = mpol_new(mode, flags, nodes);
 814         if (IS_ERR(new)) {
 815                 ret = PTR_ERR(new);
 816                 goto out;
 817         }
 818         /*
 819          * prevent changing our mempolicy while show_numa_maps()
 820          * is using it.
 821          * Note:  do_set_mempolicy() can be called at init time
 822          * with no 'mm'.
 823          */
 824         if (mm)
 825                 down_write(&mm->mmap_sem);
 826         task_lock(current);
 827         ret = mpol_set_nodemask(new, nodes, scratch);
 828         if (ret) {
 829                 task_unlock(current);
 830                 if (mm)
 831                         up_write(&mm->mmap_sem);
 832                 mpol_put(new);
 833                 goto out;
 834         }
 835         old = current->mempolicy;
 836         current->mempolicy = new;
 837         if (new && new->mode == MPOL_INTERLEAVE &&
 838             nodes_weight(new->v.nodes))
 839                 current->il_next = first_node(new->v.nodes);
 840         task_unlock(current);
 841         if (mm)
 842                 up_write(&mm->mmap_sem);
 843
 844         mpol_put(old);
 845         ret = 0;
 846 out:
 847         NODEMASK_SCRATCH_FREE(scratch);
 848         return ret;
 849 }
 850
 851 /*
 852  * Return nodemask for policy for get_mempolicy() query
 853  *
 854  * Called with task's alloc_lock held
 855  */
 856 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 857 {
 858         nodes_clear(*nodes);
 859         if (p == &default_policy)
 860                 return;
 861
 862         switch (p->mode) {
 863         case MPOL_BIND:
 864                 /* Fall through */
 865         case MPOL_INTERLEAVE:
 866                 *nodes = p->v.nodes;
 867                 break;
 868         case MPOL_PREFERRED:
 869                 if (!(p->flags & MPOL_F_LOCAL))
 870                         node_set(p->v.preferred_node, *nodes);
 871                 /* else return empty node mask for local allocation */
 872                 break;
 873         default:
 874                 BUG();
 875         }
 876 }
 877
 878 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 879 {
 880         struct page *p;
 881         int err;
 882
 883         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 884         if (err >= 0) {
 885                 err = page_to_nid(p);
 886                 put_page(p);
 887         }
 888         return err;
 889 }
 890
 891 /* Retrieve NUMA policy */
 892 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 893                              unsigned long addr, unsigned long flags)
 894 {
 895         int err;
 896         struct mm_struct *mm = current->mm;
 897         struct vm_area_struct *vma = NULL;
 898         struct mempolicy *pol = current->mempolicy;
 899
 900         if (flags &
 901                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 902                 return -EINVAL;
 903
 904         if (flags & MPOL_F_MEMS_ALLOWED) {
 905                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 906                         return -EINVAL;
 907                 *policy = 0;    /* just so it's initialized */
 908                 task_lock(current);
 909                 *nmask  = cpuset_current_mems_allowed;
 910                 task_unlock(current);
 911                 return 0;
 912         }
 913
 914         if (flags & MPOL_F_ADDR) {
 915                 /*
 916                  * Do NOT fall back to task policy if the
 917                  * vma/shared policy at addr is NULL.  We
 918                  * want to return MPOL_DEFAULT in this case.
 919                  */
 920                 down_read(&mm->mmap_sem);
 921                 vma = find_vma_intersection(mm, addr, addr+1);
 922                 if (!vma) {
 923                         up_read(&mm->mmap_sem);
 924                         return -EFAULT;
 925                 }
 926                 if (vma->vm_ops && vma->vm_ops->get_policy)
 927                         pol = vma->vm_ops->get_policy(vma, addr);
 928                 else
 929                         pol = vma->vm_policy;
 930         } else if (addr)
 931                 return -EINVAL;
 932
 933         if (!pol)
 934                 pol = &default_policy;  /* indicates default behavior */
 935
 936         if (flags & MPOL_F_NODE) {
 937                 if (flags & MPOL_F_ADDR) {
 938                         err = lookup_node(mm, addr);
 939                         if (err < 0)
 940                                 goto out;
 941                         *policy = err;
 942                 } else if (pol == current->mempolicy &&
 943                                 pol->mode == MPOL_INTERLEAVE) {
 944                         *policy = current->il_next;
 945                 } else {
 946                         err = -EINVAL;
 947                         goto out;
 948                 }
 949         } else {
 950                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 951                                                 pol->mode;
 952                 /*
 953                  * Internal mempolicy flags must be masked off before exposing
 954                  * the policy to userspace.
 955                  */
 956                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 957         }
 958
 959         if (vma) {
 960                 up_read(&current->mm->mmap_sem);
 961                 vma = NULL;
 962         }
 963
 964         err = 0;
 965         if (nmask) {
 966                 if (mpol_store_user_nodemask(pol)) {
 967                         *nmask = pol->w.user_nodemask;
 968                 } else {
 969                         task_lock(current);
 970                         get_policy_nodemask(pol, nmask);
 971                         task_unlock(current);
 972                 }
 973         }
 974
 975  out:
 976         mpol_cond_put(pol);
 977         if (vma)
 978                 up_read(&current->mm->mmap_sem);
 979         return err;
 980 }
 981
 982 #ifdef CONFIG_MIGRATION
 983 /*
 984  * page migration
 985  */
 986 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 987                                 unsigned long flags)
 988 {
 989         /*
 990          * Avoid migrating a page that is shared with others.
 991          */
 992         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 993                 if (!isolate_lru_page(page)) {
 994                         list_add_tail(&page->lru, pagelist);
 995                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 996                                             page_is_file_cache(page));
 997                 }
 998         }
 999 }
1000
1001 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
1002 {
1003         if (PageHuge(page))
1004                 return alloc_huge_page_node(page_hstate(compound_head(page)),
1005                                         node);
1006         else
1007                 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1008 }
1009
1010 /*
1011  * Migrate pages from one node to a target node.
1012  * Returns error or the number of pages not migrated.
1013  */
1014 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1015                            int flags)
1016 {
1017         nodemask_t nmask;
1018         LIST_HEAD(pagelist);
1019         int err = 0;
1020
1021         nodes_clear(nmask);
1022         node_set(source, nmask);
1023
1024         /*
1025          * This does not "check" the range but isolates all pages that
1026          * need migration.  Between passing in the full user address
1027          * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1028          */
1029         VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1030         queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1031                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1032
1033         if (!list_empty(&pagelist)) {
1034                 err = migrate_pages(&pagelist, new_node_page, NULL, dest,
1035                                         MIGRATE_SYNC, MR_SYSCALL);
1036                 if (err)
1037                         putback_movable_pages(&pagelist);
1038         }
1039
1040         return err;
1041 }
1042
1043 /*
1044  * Move pages between the two nodesets so as to preserve the physical
1045  * layout as much as possible.
1046  *
1047  * Returns the number of page that could not be moved.
1048  */
1049 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1050                      const nodemask_t *to, int flags)
1051 {
1052         int busy = 0;
1053         int err;
1054         nodemask_t tmp;
1055
1056         err = migrate_prep();
1057         if (err)
1058                 return err;
1059
1060         down_read(&mm->mmap_sem);
1061
1062         err = migrate_vmas(mm, from, to, flags);
1063         if (err)
1064                 goto out;
1065
1066         /*
1067          * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1068          * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1069          * bit in 'tmp', and return that <source, dest> pair for migration.
1070          * The pair of nodemasks 'to' and 'from' define the map.
1071          *
1072          * If no pair of bits is found that way, fallback to picking some
1073          * pair of 'source' and 'dest' bits that are not the same.  If the
1074          * 'source' and 'dest' bits are the same, this represents a node
1075          * that will be migrating to itself, so no pages need move.
1076          *
1077          * If no bits are left in 'tmp', or if all remaining bits left
1078          * in 'tmp' correspond to the same bit in 'to', return false
1079          * (nothing left to migrate).
1080          *
1081          * This lets us pick a pair of nodes to migrate between, such that
1082          * if possible the dest node is not already occupied by some other
1083          * source node, minimizing the risk of overloading the memory on a
1084          * node that would happen if we migrated incoming memory to a node
1085          * before migrating outgoing memory source that same node.
1086          *
1087          * A single scan of tmp is sufficient.  As we go, we remember the
1088          * most recent <s, d> pair that moved (s != d).  If we find a pair
1089          * that not only moved, but what's better, moved to an empty slot
1090          * (d is not set in tmp), then we break out then, with that pair.
1091          * Otherwise when we finish scanning from_tmp, we at least have the
1092          * most recent <s, d> pair that moved.  If we get all the way through
1093          * the scan of tmp without finding any node that moved, much less
1094          * moved to an empty node, then there is nothing left worth migrating.
1095          */
1096
1097         tmp = *from;
1098         while (!nodes_empty(tmp)) {
1099                 int s,d;
1100                 int source = NUMA_NO_NODE;
1101                 int dest = 0;
1102
1103                 for_each_node_mask(s, tmp) {
1104
1105                         /*
1106                          * do_migrate_pages() tries to maintain the relative
1107                          * node relationship of the pages established between
1108                          * threads and memory areas.
1109                          *
1110                          * However if the number of source nodes is not equal to
1111                          * the number of destination nodes we can not preserve
1112                          * this node relative relationship.  In that case, skip
1113                          * copying memory from a node that is in the destination
1114                          * mask.
1115                          *
1116                          * Example: [2,3,4] -> [3,4,5] moves everything.
1117                          *          [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1118                          */
1119
1120                         if ((nodes_weight(*from) != nodes_weight(*to)) &&
1121                                                 (node_isset(s, *to)))
1122                                 continue;
1123
1124                         d = node_remap(s, *from, *to);
1125                         if (s == d)
1126                                 continue;
1127
1128                         source = s;     /* Node moved. Memorize */
1129                         dest = d;
1130
1131                         /* dest not in remaining from nodes? */
1132                         if (!node_isset(dest, tmp))
1133                                 break;
1134                 }
1135                 if (source == NUMA_NO_NODE)
1136                         break;
1137
1138                 node_clear(source, tmp);
1139                 err = migrate_to_node(mm, source, dest, flags);
1140                 if (err > 0)
1141                         busy += err;
1142                 if (err < 0)
1143                         break;
1144         }
1145 out:
1146         up_read(&mm->mmap_sem);
1147         if (err < 0)
1148                 return err;
1149         return busy;
1150
1151 }
1152
1153 /*
1154  * Allocate a new page for page migration based on vma policy.
1155  * Start assuming that page is mapped by vma pointed to by @private.
1156  * Search forward from there, if not.  N.B., this assumes that the
1157  * list of pages handed to migrate_pages()--which is how we get here--
1158  * is in virtual address order.
1159  */
1160 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1161 {
1162         struct vm_area_struct *vma = (struct vm_area_struct *)private;
1163         unsigned long uninitialized_var(address);
1164
1165         while (vma) {
1166                 address = page_address_in_vma(page, vma);
1167                 if (address != -EFAULT)
1168                         break;
1169                 vma = vma->vm_next;
1170         }
1171
1172         if (PageHuge(page)) {
1173                 BUG_ON(!vma);
1174                 return alloc_huge_page_noerr(vma, address, 1);
1175         }
1176         /*
1177          * if !vma, alloc_page_vma() will use task or system default policy
1178          */
1179         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1180 }
1181 #else
1182
/* Without CONFIG_MIGRATION there is nothing to queue: do nothing. */
static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
{
}
1187
1188 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1189                      const nodemask_t *to, int flags)
1190 {
1191         return -ENOSYS;
1192 }
1193
1194 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1195 {
1196         return NULL;
1197 }
1198 #endif
1199
1200 static long do_mbind(unsigned long start, unsigned long len,
1201                      unsigned short mode, unsigned short mode_flags,
1202                      nodemask_t *nmask, unsigned long flags)
1203 {
1204         struct vm_area_struct *vma;
1205         struct mm_struct *mm = current->mm;
1206         struct mempolicy *new;
1207         unsigned long end;
1208         int err;
1209         LIST_HEAD(pagelist);
1210
1211         if (flags & ~(unsigned long)MPOL_MF_VALID)
1212                 return -EINVAL;
1213         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1214                 return -EPERM;
1215
1216         if (start & ~PAGE_MASK)
1217                 return -EINVAL;
1218
1219         if (mode == MPOL_DEFAULT)
1220                 flags &= ~MPOL_MF_STRICT;
1221
1222         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1223         end = start + len;
1224
1225         if (end < start)
1226                 return -EINVAL;
1227         if (end == start)
1228                 return 0;
1229
1230         new = mpol_new(mode, mode_flags, nmask);
1231         if (IS_ERR(new))
1232                 return PTR_ERR(new);
1233
1234         if (flags & MPOL_MF_LAZY)
1235                 new->flags |= MPOL_F_MOF;
1236
1237         /*
1238          * If we are using the default policy then operation
1239          * on discontinuous address spaces is okay after all
1240          */
1241         if (!new)
1242                 flags |= MPOL_MF_DISCONTIG_OK;
1243
1244         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1245                  start, start + len, mode, mode_flags,
1246                  nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1247
1248         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1249
1250                 err = migrate_prep();
1251                 if (err)
1252                         goto mpol_out;
1253         }
1254         {
1255                 NODEMASK_SCRATCH(scratch);
1256                 if (scratch) {
1257                         down_write(&mm->mmap_sem);
1258                         task_lock(current);
1259                         err = mpol_set_nodemask(new, nmask, scratch);
1260                         task_unlock(current);
1261                         if (err)
1262                                 up_write(&mm->mmap_sem);
1263                 } else
1264                         err = -ENOMEM;
1265                 NODEMASK_SCRATCH_FREE(scratch);
1266         }
1267         if (err)
1268                 goto mpol_out;
1269
1270         vma = queue_pages_range(mm, start, end, nmask,
1271                           flags | MPOL_MF_INVERT, &pagelist);
1272
1273         err = PTR_ERR(vma);     /* maybe ... */
1274         if (!IS_ERR(vma))
1275                 err = mbind_range(mm, start, end, new);
1276
1277         if (!err) {
1278                 int nr_failed = 0;
1279
1280                 if (!list_empty(&pagelist)) {
1281                         WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1282                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1283                                         NULL, (unsigned long)vma,
1284                                         MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1285                         if (nr_failed)
1286                                 putback_movable_pages(&pagelist);
1287                 }
1288
1289                 if (nr_failed && (flags & MPOL_MF_STRICT))
1290                         err = -EIO;
1291         } else
1292                 putback_movable_pages(&pagelist);
1293
1294         up_write(&mm->mmap_sem);
1295  mpol_out:
1296         mpol_put(new);
1297         return err;
1298 }
1299
1300 /*
1301  * User space interface with variable sized bitmaps for nodelists.
1302  */
1303
1304 /* Copy a node mask from user space. */
1305 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1306                      unsigned long maxnode)
1307 {
1308         unsigned long k;
1309         unsigned long nlongs;
1310         unsigned long endmask;
1311
1312         --maxnode;
1313         nodes_clear(*nodes);
1314         if (maxnode == 0 || !nmask)
1315                 return 0;
1316         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1317                 return -EINVAL;
1318
1319         nlongs = BITS_TO_LONGS(maxnode);
1320         if ((maxnode % BITS_PER_LONG) == 0)
1321                 endmask = ~0UL;
1322         else
1323                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1324
1325         /* When the user specified more nodes than supported just check
1326            if the non supported part is all zero. */
1327         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1328                 if (nlongs > PAGE_SIZE/sizeof(long))
1329                         return -EINVAL;
1330                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1331                         unsigned long t;
1332                         if (get_user(t, nmask + k))
1333                                 return -EFAULT;
1334                         if (k == nlongs - 1) {
1335                                 if (t & endmask)
1336                                         return -EINVAL;
1337                         } else if (t)
1338                                 return -EINVAL;
1339                 }
1340                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1341                 endmask = ~0UL;
1342         }
1343
1344         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1345                 return -EFAULT;
1346         nodes_addr(*nodes)[nlongs-1] &= endmask;
1347         return 0;
1348 }
1349
1350 /* Copy a kernel node mask to user space */
1351 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1352                               nodemask_t *nodes)
1353 {
1354         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1355         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1356
1357         if (copy > nbytes) {
1358                 if (copy > PAGE_SIZE)
1359                         return -EINVAL;
1360                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1361                         return -EFAULT;
1362                 copy = nbytes;
1363         }
1364         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1365 }
1366
1367 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1368                 unsigned long, mode, const unsigned long __user *, nmask,
1369                 unsigned long, maxnode, unsigned, flags)
1370 {
1371         nodemask_t nodes;
1372         int err;
1373         unsigned short mode_flags;
1374
1375         mode_flags = mode & MPOL_MODE_FLAGS;
1376         mode &= ~MPOL_MODE_FLAGS;
1377         if (mode >= MPOL_MAX)
1378                 return -EINVAL;
1379         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1380             (mode_flags & MPOL_F_RELATIVE_NODES))
1381                 return -EINVAL;
1382         err = get_nodes(&nodes, nmask, maxnode);
1383         if (err)
1384                 return err;
1385         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1386 }
1387
1388 /* Set the process memory policy */
1389 SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1390                 unsigned long, maxnode)
1391 {
1392         int err;
1393         nodemask_t nodes;
1394         unsigned short flags;
1395
1396         flags = mode & MPOL_MODE_FLAGS;
1397         mode &= ~MPOL_MODE_FLAGS;
1398         if ((unsigned int)mode >= MPOL_MAX)
1399                 return -EINVAL;
1400         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1401                 return -EINVAL;
1402         err = get_nodes(&nodes, nmask, maxnode);
1403         if (err)
1404                 return err;
1405         return do_set_mempolicy(mode, flags, &nodes);
1406 }
1407
1408 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1409                 const unsigned long __user *, old_nodes,
1410                 const unsigned long __user *, new_nodes)
1411 {
1412         const struct cred *cred = current_cred(), *tcred;
1413         struct mm_struct *mm = NULL;
1414         struct task_struct *task;
1415         nodemask_t task_nodes;
1416         int err;
1417         nodemask_t *old;
1418         nodemask_t *new;
1419         NODEMASK_SCRATCH(scratch);
1420
1421         if (!scratch)
1422                 return -ENOMEM;
1423
1424         old = &scratch->mask1;
1425         new = &scratch->mask2;
1426
1427         err = get_nodes(old, old_nodes, maxnode);
1428         if (err)
1429                 goto out;
1430
1431         err = get_nodes(new, new_nodes, maxnode);
1432         if (err)
1433                 goto out;
1434
1435         /* Find the mm_struct */
1436         rcu_read_lock();
1437         task = pid ? find_task_by_vpid(pid) : current;
1438         if (!task) {
1439                 rcu_read_unlock();
1440                 err = -ESRCH;
1441                 goto out;
1442         }
1443         get_task_struct(task);
1444
1445         err = -EINVAL;
1446
1447         /*
1448          * Check if this process has the right to modify the specified
1449          * process. The right exists if the process has administrative
1450          * capabilities, superuser privileges or the same
1451          * userid as the target process.
1452          */
1453         tcred = __task_cred(task);
1454         if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1455             !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1456             !capable(CAP_SYS_NICE)) {
1457                 rcu_read_unlock();
1458                 err = -EPERM;
1459                 goto out_put;
1460         }
1461         rcu_read_unlock();
1462
1463         task_nodes = cpuset_mems_allowed(task);
1464         /* Is the user allowed to access the target nodes? */
1465         if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1466                 err = -EPERM;
1467                 goto out_put;
1468         }
1469
1470         if (!nodes_subset(*new, node_states[N_MEMORY])) {
1471                 err = -EINVAL;
1472                 goto out_put;
1473         }
1474
1475         err = security_task_movememory(task);
1476         if (err)
1477                 goto out_put;
1478
1479         mm = get_task_mm(task);
1480         put_task_struct(task);
1481
1482         if (!mm) {
1483                 err = -EINVAL;
1484                 goto out;
1485         }
1486
1487         err = do_migrate_pages(mm, old, new,
1488                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1489
1490         mmput(mm);
1491 out:
1492         NODEMASK_SCRATCH_FREE(scratch);
1493
1494         return err;
1495
1496 out_put:
1497         put_task_struct(task);
1498         goto out;
1499
1500 }
1501
1502
1503 /* Retrieve NUMA policy */
1504 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1505                 unsigned long __user *, nmask, unsigned long, maxnode,
1506                 unsigned long, addr, unsigned long, flags)
1507 {
1508         int err;
1509         int uninitialized_var(pval);
1510         nodemask_t nodes;
1511
1512         if (nmask != NULL && maxnode < MAX_NUMNODES)
1513                 return -EINVAL;
1514
1515         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1516
1517         if (err)
1518                 return err;
1519
1520         if (policy && put_user(pval, policy))
1521                 return -EFAULT;
1522
1523         if (nmask)
1524                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1525
1526         return err;
1527 }
1528
1529 #ifdef CONFIG_COMPAT
1530
1531 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1532                        compat_ulong_t __user *, nmask,
1533                        compat_ulong_t, maxnode,
1534                        compat_ulong_t, addr, compat_ulong_t, flags)
1535 {
1536         long err;
1537         unsigned long __user *nm = NULL;
1538         unsigned long nr_bits, alloc_size;
1539         DECLARE_BITMAP(bm, MAX_NUMNODES);
1540
1541         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1542         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1543
1544         if (nmask)
1545                 nm = compat_alloc_user_space(alloc_size);
1546
1547         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1548
1549         if (!err && nmask) {
1550                 unsigned long copy_size;
1551                 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1552                 err = copy_from_user(bm, nm, copy_size);
1553                 /* ensure entire bitmap is zeroed */
1554                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1555                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1556         }
1557
1558         return err;
1559 }
1560
1561 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1562                        compat_ulong_t, maxnode)
1563 {
1564         long err = 0;
1565         unsigned long __user *nm = NULL;
1566         unsigned long nr_bits, alloc_size;
1567         DECLARE_BITMAP(bm, MAX_NUMNODES);
1568
1569         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1570         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1571
1572         if (nmask) {
1573                 err = compat_get_bitmap(bm, nmask, nr_bits);
1574                 nm = compat_alloc_user_space(alloc_size);
1575                 err |= copy_to_user(nm, bm, alloc_size);
1576         }
1577
1578         if (err)
1579                 return -EFAULT;
1580
1581         return sys_set_mempolicy(mode, nm, nr_bits+1);
1582 }
1583
1584 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1585                        compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1586                        compat_ulong_t, maxnode, compat_ulong_t, flags)
1587 {
1588         long err = 0;
1589         unsigned long __user *nm = NULL;
1590         unsigned long nr_bits, alloc_size;
1591         nodemask_t bm;
1592
1593         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1594         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1595
1596         if (nmask) {
1597                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1598                 nm = compat_alloc_user_space(alloc_size);
1599                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1600         }
1601
1602         if (err)
1603                 return -EFAULT;
1604
1605         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1606 }
1607
1608 #endif
1609
1610 /*
1611  * get_vma_policy(@task, @vma, @addr)
1612  * @task: task for fallback if vma policy == default
1613  * @vma: virtual memory area whose policy is sought
1614  * @addr: address in @vma for shared policy lookup
1615  *
1616  * Returns effective policy for a VMA at specified address.
1617  * Falls back to @task or system default policy, as necessary.
1618  * Current or other task's task mempolicy and non-shared vma policies must be
1619  * protected by task_lock(task) by the caller.
1620  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1621  * count--added by the get_policy() vm_op, as appropriate--to protect against
1622  * freeing by another task.  It is the caller's responsibility to free the
1623  * extra reference for shared policies.
1624  */
1625 struct mempolicy *get_vma_policy(struct task_struct *task,
1626                 struct vm_area_struct *vma, unsigned long addr)
1627 {
1628         struct mempolicy *pol = get_task_policy(task);
1629
1630         if (vma) {
1631                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1632                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1633                                                                         addr);
1634                         if (vpol)
1635                                 pol = vpol;
1636                 } else if (vma->vm_policy) {
1637                         pol = vma->vm_policy;
1638
1639                         /*
1640                          * shmem_alloc_page() passes MPOL_F_SHARED policy with
1641                          * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1642                          * count on these policies which will be dropped by
1643                          * mpol_cond_put() later
1644                          */
1645                         if (mpol_needs_cond_ref(pol))
1646                                 mpol_get(pol);
1647                 }
1648         }
1649         if (!pol)
1650                 pol = &default_policy;
1651         return pol;
1652 }
1653
1654 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1655 {
1656         struct mempolicy *pol = get_task_policy(task);
1657         if (vma) {
1658                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1659                         bool ret = false;
1660
1661                         pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1662                         if (pol && (pol->flags & MPOL_F_MOF))
1663                                 ret = true;
1664                         mpol_cond_put(pol);
1665
1666                         return ret;
1667                 } else if (vma->vm_policy) {
1668                         pol = vma->vm_policy;
1669                 }
1670         }
1671
1672         if (!pol)
1673                 return default_policy.flags & MPOL_F_MOF;
1674
1675         return pol->flags & MPOL_F_MOF;
1676 }
1677
1678 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1679 {
1680         enum zone_type dynamic_policy_zone = policy_zone;
1681
1682         BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1683
1684         /*
1685          * if policy->v.nodes has movable memory only,
1686          * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1687          *
1688          * policy->v.nodes is intersect with node_states[N_MEMORY].
1689          * so if the following test faile, it implies
1690          * policy->v.nodes has movable memory only.
1691          */
1692         if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1693                 dynamic_policy_zone = ZONE_MOVABLE;
1694
1695         return zone >= dynamic_policy_zone;
1696 }
1697
1698 /*
1699  * Return a nodemask representing a mempolicy for filtering nodes for
1700  * page allocation
1701  */
1702 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1703 {
1704         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1705         if (unlikely(policy->mode == MPOL_BIND) &&
1706                         apply_policy_zone(policy, gfp_zone(gfp)) &&
1707                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1708                 return &policy->v.nodes;
1709
1710         return NULL;
1711 }
1712
1713 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1714 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1715         int nd)
1716 {
1717         switch (policy->mode) {
1718         case MPOL_PREFERRED:
1719                 if (!(policy->flags & MPOL_F_LOCAL))
1720                         nd = policy->v.preferred_node;
1721                 break;
1722         case MPOL_BIND:
1723                 /*
1724                  * Normally, MPOL_BIND allocations are node-local within the
1725                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1726                  * current node isn't part of the mask, we use the zonelist for
1727                  * the first node in the mask instead.
1728                  */
1729                 if (unlikely(gfp & __GFP_THISNODE) &&
1730                                 unlikely(!node_isset(nd, policy->v.nodes)))
1731                         nd = first_node(policy->v.nodes);
1732                 break;
1733         default:
1734                 BUG();
1735         }
1736         return node_zonelist(nd, gfp);
1737 }
1738
1739 /* Do dynamic interleaving for a process */
1740 static unsigned interleave_nodes(struct mempolicy *policy)
1741 {
1742         unsigned nid, next;
1743         struct task_struct *me = current;
1744
1745         nid = me->il_next;
1746         next = next_node(nid, policy->v.nodes);
1747         if (next >= MAX_NUMNODES)
1748                 next = first_node(policy->v.nodes);
1749         if (next < MAX_NUMNODES)
1750                 me->il_next = next;
1751         return nid;
1752 }
1753
1754 /*
1755  * Depending on the memory policy provide a node from which to allocate the
1756  * next slab entry.
1757  */
1758 unsigned int mempolicy_slab_node(void)
1759 {
1760         struct mempolicy *policy;
1761         int node = numa_mem_id();
1762
1763         if (in_interrupt())
1764                 return node;
1765
1766         policy = current->mempolicy;
1767         if (!policy || policy->flags & MPOL_F_LOCAL)
1768                 return node;
1769
1770         switch (policy->mode) {
1771         case MPOL_PREFERRED:
1772                 /*
1773                  * handled MPOL_F_LOCAL above
1774                  */
1775                 return policy->v.preferred_node;
1776
1777         case MPOL_INTERLEAVE:
1778                 return interleave_nodes(policy);
1779
1780         case MPOL_BIND: {
1781                 /*
1782                  * Follow bind policy behavior and start allocation at the
1783                  * first node.
1784                  */
1785                 struct zonelist *zonelist;
1786                 struct zone *zone;
1787                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1788                 zonelist = &NODE_DATA(node)->node_zonelists[0];
1789                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1790                                                         &policy->v.nodes,
1791                                                         &zone);
1792                 return zone ? zone->node : node;
1793         }
1794
1795         default:
1796                 BUG();
1797         }
1798 }
1799
1800 /* Do static interleaving for a VMA with known offset. */
1801 static unsigned offset_il_node(struct mempolicy *pol,
1802                 struct vm_area_struct *vma, unsigned long off)
1803 {
1804         unsigned nnodes = nodes_weight(pol->v.nodes);
1805         unsigned target;
1806         int c;
1807         int nid = NUMA_NO_NODE;
1808
1809         if (!nnodes)
1810                 return numa_node_id();
1811         target = (unsigned int)off % nnodes;
1812         c = 0;
1813         do {
1814                 nid = next_node(nid, pol->v.nodes);
1815                 c++;
1816         } while (c <= target);
1817         return nid;
1818 }
1819
1820 /* Determine a node number for interleave */
1821 static inline unsigned interleave_nid(struct mempolicy *pol,
1822                  struct vm_area_struct *vma, unsigned long addr, int shift)
1823 {
1824         if (vma) {
1825                 unsigned long off;
1826
1827                 /*
1828                  * for small pages, there is no difference between
1829                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1830                  * for huge pages, since vm_pgoff is in units of small
1831                  * pages, we need to shift off the always 0 bits to get
1832                  * a useful offset.
1833                  */
1834                 BUG_ON(shift < PAGE_SHIFT);
1835                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1836                 off += (addr - vma->vm_start) >> shift;
1837                 return offset_il_node(pol, vma, off);
1838         } else
1839                 return interleave_nodes(pol);
1840 }
1841
1842 /*
1843  * Return the bit number of a random bit set in the nodemask.
1844  * (returns NUMA_NO_NODE if nodemask is empty)
1845  */
1846 int node_random(const nodemask_t *maskp)
1847 {
1848         int w, bit = NUMA_NO_NODE;
1849
1850         w = nodes_weight(*maskp);
1851         if (w)
1852                 bit = bitmap_ord_to_pos(maskp->bits,
1853                         get_random_int() % w, MAX_NUMNODES);
1854         return bit;
1855 }
1856
1857 #ifdef CONFIG_HUGETLBFS
1858 /*
1859  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1860  * @vma: virtual memory area whose policy is sought
1861  * @addr: address in @vma for shared policy lookup and interleave policy
1862  * @gfp_flags: for requested zone
1863  * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1864  * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1865  *
1866  * Returns a zonelist suitable for a huge page allocation and a pointer
1867  * to the struct mempolicy for conditional unref after allocation.
1868  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1869  * @nodemask for filtering the zonelist.
1870  *
1871  * Must be protected by read_mems_allowed_begin()
1872  */
1873 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1874                                 gfp_t gfp_flags, struct mempolicy **mpol,
1875                                 nodemask_t **nodemask)
1876 {
1877         struct zonelist *zl;
1878
1879         *mpol = get_vma_policy(current, vma, addr);
1880         *nodemask = NULL;       /* assume !MPOL_BIND */
1881
1882         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1883                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1884                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1885         } else {
1886                 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1887                 if ((*mpol)->mode == MPOL_BIND)
1888                         *nodemask = &(*mpol)->v.nodes;
1889         }
1890         return zl;
1891 }
1892
1893 /*
1894  * init_nodemask_of_mempolicy
1895  *
1896  * If the current task's mempolicy is "default" [NULL], return 'false'
1897  * to indicate default policy.  Otherwise, extract the policy nodemask
1898  * for 'bind' or 'interleave' policy into the argument nodemask, or
1899  * initialize the argument nodemask to contain the single node for
1900  * 'preferred' or 'local' policy and return 'true' to indicate presence
1901  * of non-default mempolicy.
1902  *
1903  * We don't bother with reference counting the mempolicy [mpol_get/put]
1904  * because the current task is examining it's own mempolicy and a task's
1905  * mempolicy is only ever changed by the task itself.
1906  *
1907  * N.B., it is the caller's responsibility to free a returned nodemask.
1908  */
1909 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1910 {
1911         struct mempolicy *mempolicy;
1912         int nid;
1913
1914         if (!(mask && current->mempolicy))
1915                 return false;
1916
1917         task_lock(current);
1918         mempolicy = current->mempolicy;
1919         switch (mempolicy->mode) {
1920         case MPOL_PREFERRED:
1921                 if (mempolicy->flags & MPOL_F_LOCAL)
1922                         nid = numa_node_id();
1923                 else
1924                         nid = mempolicy->v.preferred_node;
1925                 init_nodemask_of_node(mask, nid);
1926                 break;
1927
1928         case MPOL_BIND:
1929                 /* Fall through */
1930         case MPOL_INTERLEAVE:
1931                 *mask =  mempolicy->v.nodes;
1932                 break;
1933
1934         default:
1935                 BUG();
1936         }
1937         task_unlock(current);
1938
1939         return true;
1940 }
1941 #endif
1942
1943 /*
1944  * mempolicy_nodemask_intersects
1945  *
1946  * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1947  * policy.  Otherwise, check for intersection between mask and the policy
1948  * nodemask for 'bind' or 'interleave' policy.  For 'perferred' or 'local'
1949  * policy, always return true since it may allocate elsewhere on fallback.
1950  *
1951  * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1952  */
1953 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1954                                         const nodemask_t *mask)
1955 {
1956         struct mempolicy *mempolicy;
1957         bool ret = true;
1958
1959         if (!mask)
1960                 return ret;
1961         task_lock(tsk);
1962         mempolicy = tsk->mempolicy;
1963         if (!mempolicy)
1964                 goto out;
1965
1966         switch (mempolicy->mode) {
1967         case MPOL_PREFERRED:
1968                 /*
1969                  * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1970                  * allocate from, they may fallback to other nodes when oom.
1971                  * Thus, it's possible for tsk to have allocated memory from
1972                  * nodes in mask.
1973                  */
1974                 break;
1975         case MPOL_BIND:
1976         case MPOL_INTERLEAVE:
1977                 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1978                 break;
1979         default:
1980                 BUG();
1981         }
1982 out:
1983         task_unlock(tsk);
1984         return ret;
1985 }
1986
1987 /* Allocate a page in interleaved policy.
1988    Own path because it needs to do special accounting. */
1989 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1990                                         unsigned nid)
1991 {
1992         struct zonelist *zl;
1993         struct page *page;
1994
1995         zl = node_zonelist(nid, gfp);
1996         page = __alloc_pages(gfp, order, zl);
1997         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1998                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1999         return page;
2000 }
2001
2002 /**
2003  *      alloc_pages_vma - Allocate a page for a VMA.
2004  *
2005  *      @gfp:
2006  *      %GFP_USER    user allocation.
2007  *      %GFP_KERNEL  kernel allocations,
2008  *      %GFP_HIGHMEM highmem/user allocations,
2009  *      %GFP_FS      allocation should not call back into a file system.
2010  *      %GFP_ATOMIC  don't sleep.
2011  *
2012  *      @order:Order of the GFP allocation.
2013  *      @vma:  Pointer to VMA or NULL if not available.
2014  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
2015  *
2016  *      This function allocates a page from the kernel page pool and applies
2017  *      a NUMA policy associated with the VMA or the current process.
2018  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
2019  *      mm_struct of the VMA to prevent it from going away. Should be used for
2020  *      all allocations for pages that will be mapped into
2021  *      user space. Returns NULL when no page can be allocated.
2022  *
2023  *      Should be called with the mm_sem of the vma hold.
2024  */
2025 struct page *
2026 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2027                 unsigned long addr, int node)
2028 {
2029         struct mempolicy *pol;
2030         struct page *page;
2031         unsigned int cpuset_mems_cookie;
2032
2033 retry_cpuset:
2034         pol = get_vma_policy(current, vma, addr);
2035         cpuset_mems_cookie = read_mems_allowed_begin();
2036
2037         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2038                 unsigned nid;
2039
2040                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2041                 mpol_cond_put(pol);
2042                 page = alloc_page_interleave(gfp, order, nid);
2043                 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2044                         goto retry_cpuset;
2045
2046                 return page;
2047         }
2048         page = __alloc_pages_nodemask(gfp, order,
2049                                       policy_zonelist(gfp, pol, node),
2050                                       policy_nodemask(gfp, pol));
2051         if (unlikely(mpol_needs_cond_ref(pol)))
2052                 __mpol_put(pol);
2053         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2054                 goto retry_cpuset;
2055         return page;
2056 }
2057
2058 /**
2059  *      alloc_pages_current - Allocate pages.
2060  *
2061  *      @gfp:
2062  *              %GFP_USER   user allocation,
2063  *              %GFP_KERNEL kernel allocation,
2064  *              %GFP_HIGHMEM highmem allocation,
2065  *              %GFP_FS     don't call back into a file system.
2066  *              %GFP_ATOMIC don't sleep.
2067  *      @order: Power of two of allocation size in pages. 0 is a single page.
2068  *
2069  *      Allocate a page from the kernel page pool.  When not in
2070  *      interrupt context and apply the current process NUMA policy.
2071  *      Returns NULL when no page can be allocated.
2072  *
2073  *      Don't call cpuset_update_task_memory_state() unless
2074  *      1) it's ok to take cpuset_sem (can WAIT), and
2075  *      2) allocating for current task (not interrupt).
2076  */
2077 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2078 {
2079         struct mempolicy *pol = get_task_policy(current);
2080         struct page *page;
2081         unsigned int cpuset_mems_cookie;
2082
2083         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2084                 pol = &default_policy;
2085
2086 retry_cpuset:
2087         cpuset_mems_cookie = read_mems_allowed_begin();
2088
2089         /*
2090          * No reference counting needed for current->mempolicy
2091          * nor system default_policy
2092          */
2093         if (pol->mode == MPOL_INTERLEAVE)
2094                 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2095         else
2096                 page = __alloc_pages_nodemask(gfp, order,
2097                                 policy_zonelist(gfp, pol, numa_node_id()),
2098                                 policy_nodemask(gfp, pol));
2099
2100         if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2101                 goto retry_cpuset;
2102
2103         return page;
2104 }
2105 EXPORT_SYMBOL(alloc_pages_current);
2106
2107 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2108 {
2109         struct mempolicy *pol = mpol_dup(vma_policy(src));
2110
2111         if (IS_ERR(pol))
2112                 return PTR_ERR(pol);
2113         dst->vm_policy = pol;
2114         return 0;
2115 }
2116
2117 /*
2118  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2119  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
2120  * with the mems_allowed returned by cpuset_mems_allowed().  This
2121  * keeps mempolicies cpuset relative after its cpuset moves.  See
2122  * further kernel/cpuset.c update_nodemask().
2123  *
2124  * current's mempolicy may be rebinded by the other task(the task that changes
2125  * cpuset's mems), so we needn't do rebind work for current task.
2126  */
2127
2128 /* Slow path of a mempolicy duplicate */
2129 struct mempolicy *__mpol_dup(struct mempolicy *old)
2130 {
2131         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2132
2133         if (!new)
2134                 return ERR_PTR(-ENOMEM);
2135
2136         /* task's mempolicy is protected by alloc_lock */
2137         if (old == current->mempolicy) {
2138                 task_lock(current);
2139                 *new = *old;
2140                 task_unlock(current);
2141         } else
2142                 *new = *old;
2143
2144         rcu_read_lock();
2145         if (current_cpuset_is_being_rebound()) {
2146                 nodemask_t mems = cpuset_mems_allowed(current);
2147                 if (new->flags & MPOL_F_REBINDING)
2148                         mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2149                 else
2150                         mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2151         }
2152         rcu_read_unlock();
2153         atomic_set(&new->refcnt, 1);
2154         return new;
2155 }
2156
2157 /* Slow path of a mempolicy comparison */
2158 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2159 {
2160         if (!a || !b)
2161                 return false;
2162         if (a->mode != b->mode)
2163                 return false;
2164         if (a->flags != b->flags)
2165                 return false;
2166         if (mpol_store_user_nodemask(a))
2167                 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2168                         return false;
2169
2170         switch (a->mode) {
2171         case MPOL_BIND:
2172                 /* Fall through */
2173         case MPOL_INTERLEAVE:
2174                 return !!nodes_equal(a->v.nodes, b->v.nodes);
2175         case MPOL_PREFERRED:
2176                 return a->v.preferred_node == b->v.preferred_node;
2177         default:
2178                 BUG();
2179                 return false;
2180         }
2181 }
2182
2183 /*
2184  * Shared memory backing store policy support.
2185  *
2186  * Remember policies even when nobody has shared memory mapped.
2187  * The policies are kept in Red-Black tree linked from the inode.
2188  * They are protected by the sp->lock spinlock, which should be held
2189  * for any accesses to the tree.
2190  */
2191
2192 /* lookup first element intersecting start-end */
2193 /* Caller holds sp->lock */
2194 static struct sp_node *
2195 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2196 {
2197         struct rb_node *n = sp->root.rb_node;
2198
2199         while (n) {
2200                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2201
2202                 if (start >= p->end)
2203                         n = n->rb_right;
2204                 else if (end <= p->start)
2205                         n = n->rb_left;
2206                 else
2207                         break;
2208         }
2209         if (!n)
2210                 return NULL;
2211         for (;;) {
2212                 struct sp_node *w = NULL;
2213                 struct rb_node *prev = rb_prev(n);
2214                 if (!prev)
2215                         break;
2216                 w = rb_entry(prev, struct sp_node, nd);
2217                 if (w->end <= start)
2218                         break;
2219                 n = prev;
2220         }
2221         return rb_entry(n, struct sp_node, nd);
2222 }
2223
2224 /* Insert a new shared policy into the list. */
2225 /* Caller holds sp->lock */
2226 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2227 {
2228         struct rb_node **p = &sp->root.rb_node;
2229         struct rb_node *parent = NULL;
2230         struct sp_node *nd;
2231
2232         while (*p) {
2233                 parent = *p;
2234                 nd = rb_entry(parent, struct sp_node, nd);
2235                 if (new->start < nd->start)
2236                         p = &(*p)->rb_left;
2237                 else if (new->end > nd->end)
2238                         p = &(*p)->rb_right;
2239                 else
2240                         BUG();
2241         }
2242         rb_link_node(&new->nd, parent, p);
2243         rb_insert_color(&new->nd, &sp->root);
2244         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2245                  new->policy ? new->policy->mode : 0);
2246 }
2247
2248 /* Find shared policy intersecting idx */
2249 struct mempolicy *
2250 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2251 {
2252         struct mempolicy *pol = NULL;
2253         struct sp_node *sn;
2254
2255         if (!sp->root.rb_node)
2256                 return NULL;
2257         spin_lock(&sp->lock);
2258         sn = sp_lookup(sp, idx, idx+1);
2259         if (sn) {
2260                 mpol_get(sn->policy);
2261                 pol = sn->policy;
2262         }
2263         spin_unlock(&sp->lock);
2264         return pol;
2265 }
2266
2267 static void sp_free(struct sp_node *n)
2268 {
2269         mpol_put(n->policy);
2270         kmem_cache_free(sn_cache, n);
2271 }
2272
2273 /**
2274  * mpol_misplaced - check whether current page node is valid in policy
2275  *
2276  * @page: page to be checked
2277  * @vma: vm area where page mapped
2278  * @addr: virtual address where page mapped
2279  *
2280  * Lookup current policy node id for vma,addr and "compare to" page's
2281  * node id.
2282  *
2283  * Returns:
2284  *      -1      - not misplaced, page is in the right node
2285  *      node    - node id where the page should be
2286  *
2287  * Policy determination "mimics" alloc_page_vma().
2288  * Called from fault path where we know the vma and faulting address.
2289  */
2290 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2291 {
2292         struct mempolicy *pol;
2293         struct zone *zone;
2294         int curnid = page_to_nid(page);
2295         unsigned long pgoff;
2296         int thiscpu = raw_smp_processor_id();
2297         int thisnid = cpu_to_node(thiscpu);
2298         int polnid = -1;
2299         int ret = -1;
2300
2301         BUG_ON(!vma);
2302
2303         pol = get_vma_policy(current, vma, addr);
2304         if (!(pol->flags & MPOL_F_MOF))
2305                 goto out;
2306
2307         switch (pol->mode) {
2308         case MPOL_INTERLEAVE:
2309                 BUG_ON(addr >= vma->vm_end);
2310                 BUG_ON(addr < vma->vm_start);
2311
2312                 pgoff = vma->vm_pgoff;
2313                 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2314                 polnid = offset_il_node(pol, vma, pgoff);
2315                 break;
2316
2317         case MPOL_PREFERRED:
2318                 if (pol->flags & MPOL_F_LOCAL)
2319                         polnid = numa_node_id();
2320                 else
2321                         polnid = pol->v.preferred_node;
2322                 break;
2323
2324         case MPOL_BIND:
2325                 /*
2326                  * allows binding to multiple nodes.
2327                  * use current page if in policy nodemask,
2328                  * else select nearest allowed node, if any.
2329                  * If no allowed nodes, use current [!misplaced].
2330                  */
2331                 if (node_isset(curnid, pol->v.nodes))
2332                         goto out;
2333                 (void)first_zones_zonelist(
2334                                 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2335                                 gfp_zone(GFP_HIGHUSER),
2336                                 &pol->v.nodes, &zone);
2337                 polnid = zone->node;
2338                 break;
2339
2340         default:
2341                 BUG();
2342         }
2343
2344         /* Migrate the page towards the node whose CPU is referencing it */
2345         if (pol->flags & MPOL_F_MORON) {
2346                 polnid = thisnid;
2347
2348                 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2349                         goto out;
2350         }
2351
2352         if (curnid != polnid)
2353                 ret = polnid;
2354 out:
2355         mpol_cond_put(pol);
2356
2357         return ret;
2358 }
2359
2360 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2361 {
2362         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2363         rb_erase(&n->nd, &sp->root);
2364         sp_free(n);
2365 }
2366
2367 static void sp_node_init(struct sp_node *node, unsigned long start,
2368                         unsigned long end, struct mempolicy *pol)
2369 {
2370         node->start = start;
2371         node->end = end;
2372         node->policy = pol;
2373 }
2374
2375 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2376                                 struct mempolicy *pol)
2377 {
2378         struct sp_node *n;
2379         struct mempolicy *newpol;
2380
2381         n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2382         if (!n)
2383                 return NULL;
2384
2385         newpol = mpol_dup(pol);
2386         if (IS_ERR(newpol)) {
2387                 kmem_cache_free(sn_cache, n);
2388                 return NULL;
2389         }
2390         newpol->flags |= MPOL_F_SHARED;
2391         sp_node_init(n, start, end, newpol);
2392
2393         return n;
2394 }
2395
2396 /* Replace a policy range. */
2397 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2398                                  unsigned long end, struct sp_node *new)
2399 {
2400         struct sp_node *n;
2401         struct sp_node *n_new = NULL;
2402         struct mempolicy *mpol_new = NULL;
2403         int ret = 0;
2404
2405 restart:
2406         spin_lock(&sp->lock);
2407         n = sp_lookup(sp, start, end);
2408         /* Take care of old policies in the same range. */
2409         while (n && n->start < end) {
2410                 struct rb_node *next = rb_next(&n->nd);
2411                 if (n->start >= start) {
2412                         if (n->end <= end)
2413                                 sp_delete(sp, n);
2414                         else
2415                                 n->start = end;
2416                 } else {
2417                         /* Old policy spanning whole new range. */
2418                         if (n->end > end) {
2419                                 if (!n_new)
2420                                         goto alloc_new;
2421
2422                                 *mpol_new = *n->policy;
2423                                 atomic_set(&mpol_new->refcnt, 1);
2424                                 sp_node_init(n_new, end, n->end, mpol_new);
2425                                 n->end = start;
2426                                 sp_insert(sp, n_new);
2427                                 n_new = NULL;
2428                                 mpol_new = NULL;
2429                                 break;
2430                         } else
2431                                 n->end = start;
2432                 }
2433                 if (!next)
2434                         break;
2435                 n = rb_entry(next, struct sp_node, nd);
2436         }
2437         if (new)
2438                 sp_insert(sp, new);
2439         spin_unlock(&sp->lock);
2440         ret = 0;
2441
2442 err_out:
2443         if (mpol_new)
2444                 mpol_put(mpol_new);
2445         if (n_new)
2446                 kmem_cache_free(sn_cache, n_new);
2447
2448         return ret;
2449
2450 alloc_new:
2451         spin_unlock(&sp->lock);
2452         ret = -ENOMEM;
2453         n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2454         if (!n_new)
2455                 goto err_out;
2456         mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2457         if (!mpol_new)
2458                 goto err_out;
2459         goto restart;
2460 }
2461
2462 /**
2463  * mpol_shared_policy_init - initialize shared policy for inode
2464  * @sp: pointer to inode shared policy
2465  * @mpol:  struct mempolicy to install
2466  *
2467  * Install non-NULL @mpol in inode's shared policy rb-tree.
2468  * On entry, the current task has a reference on a non-NULL @mpol.
2469  * This must be released on exit.
2470  * This is called at get_inode() calls and we can use GFP_KERNEL.
2471  */
2472 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2473 {
2474         int ret;
2475
2476         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
2477         spin_lock_init(&sp->lock);
2478
2479         if (mpol) {
2480                 struct vm_area_struct pvma;
2481                 struct mempolicy *new;
2482                 NODEMASK_SCRATCH(scratch);
2483
2484                 if (!scratch)
2485                         goto put_mpol;
2486                 /* contextualize the tmpfs mount point mempolicy */
2487                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2488                 if (IS_ERR(new))
2489                         goto free_scratch; /* no valid nodemask intersection */
2490
2491                 task_lock(current);
2492                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2493                 task_unlock(current);
2494                 if (ret)
2495                         goto put_new;
2496
2497                 /* Create pseudo-vma that contains just the policy */
2498                 memset(&pvma, 0, sizeof(struct vm_area_struct));
2499                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
2500                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2501
2502 put_new:
2503                 mpol_put(new);                  /* drop initial ref */
2504 free_scratch:
2505                 NODEMASK_SCRATCH_FREE(scratch);
2506 put_mpol:
2507                 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2508         }
2509 }
2510
2511 int mpol_set_shared_policy(struct shared_policy *info,
2512                         struct vm_area_struct *vma, struct mempolicy *npol)
2513 {
2514         int err;
2515         struct sp_node *new = NULL;
2516         unsigned long sz = vma_pages(vma);
2517
2518         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2519                  vma->vm_pgoff,
2520                  sz, npol ? npol->mode : -1,
2521                  npol ? npol->flags : -1,
2522                  npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2523
2524         if (npol) {
2525                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2526                 if (!new)
2527                         return -ENOMEM;
2528         }
2529         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2530         if (err && new)
2531                 sp_free(new);
2532         return err;
2533 }
2534
2535 /* Free a backing policy store on inode delete. */
2536 void mpol_free_shared_policy(struct shared_policy *p)
2537 {
2538         struct sp_node *n;
2539         struct rb_node *next;
2540
2541         if (!p->root.rb_node)
2542                 return;
2543         spin_lock(&p->lock);
2544         next = rb_first(&p->root);
2545         while (next) {
2546                 n = rb_entry(next, struct sp_node, nd);
2547                 next = rb_next(&n->nd);
2548                 sp_delete(p, n);
2549         }
2550         spin_unlock(&p->lock);
2551 }
2552
2553 #ifdef CONFIG_NUMA_BALANCING
/*
 * State of the "numa_balancing=" boot parameter, written by
 * setup_numabalancing(): 1 for "enable", -1 for "disable", and left at 0
 * when the parameter was not parsed successfully.
 */
static int __initdata numabalancing_override;
2555
2556 static void __init check_numabalancing_enable(void)
2557 {
2558         bool numabalancing_default = false;
2559
2560         if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2561                 numabalancing_default = true;
2562
2563         /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2564         if (numabalancing_override)
2565                 set_numabalancing_state(numabalancing_override == 1);
2566
2567         if (nr_node_ids > 1 && !numabalancing_override) {
2568                 pr_info("%s automatic NUMA balancing. "
2569                         "Configure with numa_balancing= or the "
2570                         "kernel.numa_balancing sysctl",
2571                         numabalancing_default ? "Enabling" : "Disabling");
2572                 set_numabalancing_state(numabalancing_default);
2573         }
2574 }
2575
2576 static int __init setup_numabalancing(char *str)
2577 {
2578         int ret = 0;
2579         if (!str)
2580                 goto out;
2581
2582         if (!strcmp(str, "enable")) {
2583                 numabalancing_override = 1;
2584                 ret = 1;
2585         } else if (!strcmp(str, "disable")) {
2586                 numabalancing_override = -1;
2587                 ret = 1;
2588         }
2589 out:
2590         if (!ret)
2591                 pr_warn("Unable to parse numa_balancing=\n");
2592
2593         return ret;
2594 }
2595 __setup("numa_balancing=", setup_numabalancing);
2596 #else
/* Without CONFIG_NUMA_BALANCING there is no balancing state to set up. */
static inline void __init check_numabalancing_enable(void)
{
}
2600 #endif /* CONFIG_NUMA_BALANCING */
2601
2602 /* assumes fs == KERNEL_DS */
2603 void __init numa_policy_init(void)
2604 {
2605         nodemask_t interleave_nodes;
2606         unsigned long largest = 0;
2607         int nid, prefer = 0;
2608
2609         policy_cache = kmem_cache_create("numa_policy",
2610                                          sizeof(struct mempolicy),
2611                                          0, SLAB_PANIC, NULL);
2612
2613         sn_cache = kmem_cache_create("shared_policy_node",
2614                                      sizeof(struct sp_node),
2615                                      0, SLAB_PANIC, NULL);
2616
2617         for_each_node(nid) {
2618                 preferred_node_policy[nid] = (struct mempolicy) {
2619                         .refcnt = ATOMIC_INIT(1),
2620                         .mode = MPOL_PREFERRED,
2621                         .flags = MPOL_F_MOF | MPOL_F_MORON,
2622                         .v = { .preferred_node = nid, },
2623                 };
2624         }
2625
2626         /*
2627          * Set interleaving policy for system init. Interleaving is only
2628          * enabled across suitably sized nodes (default is >= 16MB), or
2629          * fall back to the largest node if they're all smaller.
2630          */
2631         nodes_clear(interleave_nodes);
2632         for_each_node_state(nid, N_MEMORY) {
2633                 unsigned long total_pages = node_present_pages(nid);
2634
2635                 /* Preserve the largest node */
2636                 if (largest < total_pages) {
2637                         largest = total_pages;
2638                         prefer = nid;
2639                 }
2640
2641                 /* Interleave this node? */
2642                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2643                         node_set(nid, interleave_nodes);
2644         }
2645
2646         /* All too small, use the largest */
2647         if (unlikely(nodes_empty(interleave_nodes)))
2648                 node_set(prefer, interleave_nodes);
2649
2650         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2651                 pr_err("%s: interleaving failed\n", __func__);
2652
2653         check_numabalancing_enable();
2654 }
2655
/*
 * Reset policy of current process to default.
 *
 * Calls do_set_mempolicy() with MPOL_DEFAULT, no flags and no nodemask;
 * its return value is ignored.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
2661
2662 /*
2663  * Parse and format mempolicy from/to strings
2664  */
2665
/*
 * Printable names of the policy modes, indexed by the MPOL_* mode value.
 * Used both to parse a mode name and to format a policy as a string.
 *
 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
 */
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_LOCAL]      = "local",
};
2677
2678
2679 #ifdef CONFIG_TMPFS
2680 /**
2681  * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2682  * @str:  string containing mempolicy to parse
2683  * @mpol:  pointer to struct mempolicy pointer, returned on success.
2684  *
2685  * Format of input:
2686  *      <mode>[=<flags>][:<nodelist>]
2687  *
2688  * On success, returns 0, else 1
2689  */
2690 int mpol_parse_str(char *str, struct mempolicy **mpol)
2691 {
2692         struct mempolicy *new = NULL;
2693         unsigned short mode;
2694         unsigned short mode_flags;
2695         nodemask_t nodes;
2696         char *nodelist = strchr(str, ':');
2697         char *flags = strchr(str, '=');
2698         int err = 1;
2699
2700         if (nodelist) {
2701                 /* NUL-terminate mode or flags string */
2702                 *nodelist++ = '\0';
2703                 if (nodelist_parse(nodelist, nodes))
2704                         goto out;
2705                 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2706                         goto out;
2707         } else
2708                 nodes_clear(nodes);
2709
2710         if (flags)
2711                 *flags++ = '\0';        /* terminate mode string */
2712
2713         for (mode = 0; mode < MPOL_MAX; mode++) {
2714                 if (!strcmp(str, policy_modes[mode])) {
2715                         break;
2716                 }
2717         }
2718         if (mode >= MPOL_MAX)
2719                 goto out;
2720
2721         switch (mode) {
2722         case MPOL_PREFERRED:
2723                 /*
2724                  * Insist on a nodelist of one node only
2725                  */
2726                 if (nodelist) {
2727                         char *rest = nodelist;
2728                         while (isdigit(*rest))
2729                                 rest++;
2730                         if (*rest)
2731                                 goto out;
2732                 }
2733                 break;
2734         case MPOL_INTERLEAVE:
2735                 /*
2736                  * Default to online nodes with memory if no nodelist
2737                  */
2738                 if (!nodelist)
2739                         nodes = node_states[N_MEMORY];
2740                 break;
2741         case MPOL_LOCAL:
2742                 /*
2743                  * Don't allow a nodelist;  mpol_new() checks flags
2744                  */
2745                 if (nodelist)
2746                         goto out;
2747                 mode = MPOL_PREFERRED;
2748                 break;
2749         case MPOL_DEFAULT:
2750                 /*
2751                  * Insist on a empty nodelist
2752                  */
2753                 if (!nodelist)
2754                         err = 0;
2755                 goto out;
2756         case MPOL_BIND:
2757                 /*
2758                  * Insist on a nodelist
2759                  */
2760                 if (!nodelist)
2761                         goto out;
2762         }
2763
2764         mode_flags = 0;
2765         if (flags) {
2766                 /*
2767                  * Currently, we only support two mutually exclusive
2768                  * mode flags.
2769                  */
2770                 if (!strcmp(flags, "static"))
2771                         mode_flags |= MPOL_F_STATIC_NODES;
2772                 else if (!strcmp(flags, "relative"))
2773                         mode_flags |= MPOL_F_RELATIVE_NODES;
2774                 else
2775                         goto out;
2776         }
2777
2778         new = mpol_new(mode, mode_flags, &nodes);
2779         if (IS_ERR(new))
2780                 goto out;
2781
2782         /*
2783          * Save nodes for mpol_to_str() to show the tmpfs mount options
2784          * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2785          */
2786         if (mode != MPOL_PREFERRED)
2787                 new->v.nodes = nodes;
2788         else if (nodelist)
2789                 new->v.preferred_node = first_node(nodes);
2790         else
2791                 new->flags |= MPOL_F_LOCAL;
2792
2793         /*
2794          * Save nodes for contextualization: this will be used to "clone"
2795          * the mempolicy in a specific context [cpuset] at a later time.
2796          */
2797         new->w.user_nodemask = nodes;
2798
2799         err = 0;
2800
2801 out:
2802         /* Restore string for error message */
2803         if (nodelist)
2804                 *--nodelist = ':';
2805         if (flags)
2806                 *--flags = '=';
2807         if (!err)
2808                 *mpol = new;
2809         return err;
2810 }
2811 #endif /* CONFIG_TMPFS */
2812
2813 /**
2814  * mpol_to_str - format a mempolicy structure for printing
2815  * @buffer:  to contain formatted mempolicy string
2816  * @maxlen:  length of @buffer
2817  * @pol:  pointer to mempolicy to be formatted
2818  *
2819  * Convert @pol into a string.  If @buffer is too short, truncate the string.
2820  * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2821  * longest flag, "relative", and to display at least a few node ids.
2822  */
2823 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2824 {
2825         char *p = buffer;
2826         nodemask_t nodes = NODE_MASK_NONE;
2827         unsigned short mode = MPOL_DEFAULT;
2828         unsigned short flags = 0;
2829
2830         if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2831                 mode = pol->mode;
2832                 flags = pol->flags;
2833         }
2834
2835         switch (mode) {
2836         case MPOL_DEFAULT:
2837                 break;
2838         case MPOL_PREFERRED:
2839                 if (flags & MPOL_F_LOCAL)
2840                         mode = MPOL_LOCAL;
2841                 else
2842                         node_set(pol->v.preferred_node, nodes);
2843                 break;
2844         case MPOL_BIND:
2845         case MPOL_INTERLEAVE:
2846                 nodes = pol->v.nodes;
2847                 break;
2848         default:
2849                 WARN_ON_ONCE(1);
2850                 snprintf(p, maxlen, "unknown");
2851                 return;
2852         }
2853
2854         p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2855
2856         if (flags & MPOL_MODE_FLAGS) {
2857                 p += snprintf(p, buffer + maxlen - p, "=");
2858
2859                 /*
2860                  * Currently, the only defined flags are mutually exclusive
2861                  */
2862                 if (flags & MPOL_F_STATIC_NODES)
2863                         p += snprintf(p, buffer + maxlen - p, "static");
2864                 else if (flags & MPOL_F_RELATIVE_NODES)
2865                         p += snprintf(p, buffer + maxlen - p, "relative");
2866         }
2867
2868         if (!nodes_empty(nodes)) {
2869                 p += snprintf(p, buffer + maxlen - p, ":");
2870                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2871         }
2872 }