mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/nodemask.h>
  76 #include <linux/cpuset.h>
  77 #include <linux/gfp.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/module.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/rmap.h>
  90 #include <linux/security.h>
  91 #include <linux/syscalls.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 /* Highest zone. An specific allocation for a zone below that is not
 105    policied. */
 106 enum zone_type policy_zone = 0;
 107
 108 struct mempolicy default_policy = {
 109         .refcnt = ATOMIC_INIT(1), /* never free it */
 110         .policy = MPOL_DEFAULT,
 111 };
 112
 113 /* Check that the nodemask contains at least one populated zone */
 114 static int is_valid_nodemask(nodemask_t *nodemask)
 115 {
 116         int nd, k;
 117
 118         /* Check that there is something useful in this mask */
 119         k = policy_zone;
 120
 121         for_each_node_mask(nd, *nodemask) {
 122                 struct zone *z;
 123
 124                 for (k = 0; k <= policy_zone; k++) {
 125                         z = &NODE_DATA(nd)->node_zones[k];
 126                         if (z->present_pages > 0)
 127                                 return 1;
 128                 }
 129         }
 130
 131         return 0;
 132 }
 133
 134 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 135 {
 136         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 137 }
 138
 139 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 140                                    const nodemask_t *rel)
 141 {
 142         nodemask_t tmp;
 143         nodes_fold(tmp, *orig, nodes_weight(*rel));
 144         nodes_onto(*ret, tmp, *rel);
 145 }
 146
 147 /* Create a new policy */
 148 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 149                                   nodemask_t *nodes)
 150 {
 151         struct mempolicy *policy;
 152         nodemask_t cpuset_context_nmask;
 153
 154         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 155                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 156
 157         if (mode == MPOL_DEFAULT)
 158                 return (nodes && nodes_weight(*nodes)) ? ERR_PTR(-EINVAL) :
 159                                                          NULL;
 160         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 161         if (!policy)
 162                 return ERR_PTR(-ENOMEM);
 163         atomic_set(&policy->refcnt, 1);
 164         cpuset_update_task_memory_state();
 165         if (flags & MPOL_F_RELATIVE_NODES)
 166                 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 167                                        &cpuset_current_mems_allowed);
 168         else
 169                 nodes_and(cpuset_context_nmask, *nodes,
 170                           cpuset_current_mems_allowed);
 171         switch (mode) {
 172         case MPOL_INTERLEAVE:
 173                 if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
 174                         goto free;
 175                 policy->v.nodes = cpuset_context_nmask;
 176                 break;
 177         case MPOL_PREFERRED:
 178                 policy->v.preferred_node = first_node(cpuset_context_nmask);
 179                 if (policy->v.preferred_node >= MAX_NUMNODES)
 180                         goto free;
 181                 break;
 182         case MPOL_BIND:
 183                 if (!is_valid_nodemask(&cpuset_context_nmask))
 184                         goto free;
 185                 policy->v.nodes = cpuset_context_nmask;
 186                 break;
 187         default:
 188                 BUG();
 189         }
 190         policy->policy = mode;
 191         policy->flags = flags;
 192         if (mpol_store_user_nodemask(policy))
 193                 policy->w.user_nodemask = *nodes;
 194         else
 195                 policy->w.cpuset_mems_allowed = cpuset_mems_allowed(current);
 196         return policy;
 197
 198 free:
 199         kmem_cache_free(policy_cache, policy);
 200         return ERR_PTR(-EINVAL);
 201 }
 202
 203 /* Migrate a policy to a different set of nodes */
 204 static void mpol_rebind_policy(struct mempolicy *pol,
 205                                const nodemask_t *newmask)
 206 {
 207         nodemask_t tmp;
 208         int static_nodes;
 209         int relative_nodes;
 210
 211         if (!pol)
 212                 return;
 213         static_nodes = pol->flags & MPOL_F_STATIC_NODES;
 214         relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES;
 215         if (!mpol_store_user_nodemask(pol) &&
 216             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 217                 return;
 218
 219         switch (pol->policy) {
 220         case MPOL_DEFAULT:
 221                 break;
 222         case MPOL_BIND:
 223                 /* Fall through */
 224         case MPOL_INTERLEAVE:
 225                 if (static_nodes)
 226                         nodes_and(tmp, pol->w.user_nodemask, *newmask);
 227                 else if (relative_nodes)
 228                         mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
 229                                                newmask);
 230                 else {
 231                         nodes_remap(tmp, pol->v.nodes,
 232                                     pol->w.cpuset_mems_allowed, *newmask);
 233                         pol->w.cpuset_mems_allowed = *newmask;
 234                 }
 235                 pol->v.nodes = tmp;
 236                 if (!node_isset(current->il_next, tmp)) {
 237                         current->il_next = next_node(current->il_next, tmp);
 238                         if (current->il_next >= MAX_NUMNODES)
 239                                 current->il_next = first_node(tmp);
 240                         if (current->il_next >= MAX_NUMNODES)
 241                                 current->il_next = numa_node_id();
 242                 }
 243                 break;
 244         case MPOL_PREFERRED:
 245                 if (static_nodes) {
 246                         int node = first_node(pol->w.user_nodemask);
 247
 248                         if (node_isset(node, *newmask))
 249                                 pol->v.preferred_node = node;
 250                         else
 251                                 pol->v.preferred_node = -1;
 252                 } else if (relative_nodes) {
 253                         mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
 254                                                newmask);
 255                         pol->v.preferred_node = first_node(tmp);
 256                 } else {
 257                         pol->v.preferred_node = node_remap(pol->v.preferred_node,
 258                                         pol->w.cpuset_mems_allowed, *newmask);
 259                         pol->w.cpuset_mems_allowed = *newmask;
 260                 }
 261                 break;
 262         default:
 263                 BUG();
 264                 break;
 265         }
 266 }
 267
 268 /*
 269  * Wrapper for mpol_rebind_policy() that just requires task
 270  * pointer, and updates task mempolicy.
 271  */
 272
 273 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 274 {
 275         mpol_rebind_policy(tsk->mempolicy, new);
 276 }
 277
 278 /*
 279  * Rebind each vma in mm to new nodemask.
 280  *
 281  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 282  */
 283
 284 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 285 {
 286         struct vm_area_struct *vma;
 287
 288         down_write(&mm->mmap_sem);
 289         for (vma = mm->mmap; vma; vma = vma->vm_next)
 290                 mpol_rebind_policy(vma->vm_policy, new);
 291         up_write(&mm->mmap_sem);
 292 }
 293
 294 static void gather_stats(struct page *, void *, int pte_dirty);
 295 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 296                                 unsigned long flags);
 297
 298 /* Scan through pages checking if pages follow certain conditions. */
 299 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 300                 unsigned long addr, unsigned long end,
 301                 const nodemask_t *nodes, unsigned long flags,
 302                 void *private)
 303 {
 304         pte_t *orig_pte;
 305         pte_t *pte;
 306         spinlock_t *ptl;
 307
 308         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 309         do {
 310                 struct page *page;
 311                 int nid;
 312
 313                 if (!pte_present(*pte))
 314                         continue;
 315                 page = vm_normal_page(vma, addr, *pte);
 316                 if (!page)
 317                         continue;
 318                 /*
 319                  * The check for PageReserved here is important to avoid
 320                  * handling zero pages and other pages that may have been
 321                  * marked special by the system.
 322                  *
 323                  * If the PageReserved would not be checked here then f.e.
 324                  * the location of the zero page could have an influence
 325                  * on MPOL_MF_STRICT, zero pages would be counted for
 326                  * the per node stats, and there would be useless attempts
 327                  * to put zero pages on the migration list.
 328                  */
 329                 if (PageReserved(page))
 330                         continue;
 331                 nid = page_to_nid(page);
 332                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 333                         continue;
 334
 335                 if (flags & MPOL_MF_STATS)
 336                         gather_stats(page, private, pte_dirty(*pte));
 337                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 338                         migrate_page_add(page, private, flags);
 339                 else
 340                         break;
 341         } while (pte++, addr += PAGE_SIZE, addr != end);
 342         pte_unmap_unlock(orig_pte, ptl);
 343         return addr != end;
 344 }
 345
 346 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 347                 unsigned long addr, unsigned long end,
 348                 const nodemask_t *nodes, unsigned long flags,
 349                 void *private)
 350 {
 351         pmd_t *pmd;
 352         unsigned long next;
 353
 354         pmd = pmd_offset(pud, addr);
 355         do {
 356                 next = pmd_addr_end(addr, end);
 357                 if (pmd_none_or_clear_bad(pmd))
 358                         continue;
 359                 if (check_pte_range(vma, pmd, addr, next, nodes,
 360                                     flags, private))
 361                         return -EIO;
 362         } while (pmd++, addr = next, addr != end);
 363         return 0;
 364 }
 365
 366 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 367                 unsigned long addr, unsigned long end,
 368                 const nodemask_t *nodes, unsigned long flags,
 369                 void *private)
 370 {
 371         pud_t *pud;
 372         unsigned long next;
 373
 374         pud = pud_offset(pgd, addr);
 375         do {
 376                 next = pud_addr_end(addr, end);
 377                 if (pud_none_or_clear_bad(pud))
 378                         continue;
 379                 if (check_pmd_range(vma, pud, addr, next, nodes,
 380                                     flags, private))
 381                         return -EIO;
 382         } while (pud++, addr = next, addr != end);
 383         return 0;
 384 }
 385
 386 static inline int check_pgd_range(struct vm_area_struct *vma,
 387                 unsigned long addr, unsigned long end,
 388                 const nodemask_t *nodes, unsigned long flags,
 389                 void *private)
 390 {
 391         pgd_t *pgd;
 392         unsigned long next;
 393
 394         pgd = pgd_offset(vma->vm_mm, addr);
 395         do {
 396                 next = pgd_addr_end(addr, end);
 397                 if (pgd_none_or_clear_bad(pgd))
 398                         continue;
 399                 if (check_pud_range(vma, pgd, addr, next, nodes,
 400                                     flags, private))
 401                         return -EIO;
 402         } while (pgd++, addr = next, addr != end);
 403         return 0;
 404 }
 405
 406 /*
 407  * Check if all pages in a range are on a set of nodes.
 408  * If pagelist != NULL then isolate pages from the LRU and
 409  * put them on the pagelist.
 410  */
 411 static struct vm_area_struct *
 412 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 413                 const nodemask_t *nodes, unsigned long flags, void *private)
 414 {
 415         int err;
 416         struct vm_area_struct *first, *vma, *prev;
 417
 418         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 419
 420                 err = migrate_prep();
 421                 if (err)
 422                         return ERR_PTR(err);
 423         }
 424
 425         first = find_vma(mm, start);
 426         if (!first)
 427                 return ERR_PTR(-EFAULT);
 428         prev = NULL;
 429         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 430                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 431                         if (!vma->vm_next && vma->vm_end < end)
 432                                 return ERR_PTR(-EFAULT);
 433                         if (prev && prev->vm_end < vma->vm_start)
 434                                 return ERR_PTR(-EFAULT);
 435                 }
 436                 if (!is_vm_hugetlb_page(vma) &&
 437                     ((flags & MPOL_MF_STRICT) ||
 438                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 439                                 vma_migratable(vma)))) {
 440                         unsigned long endvma = vma->vm_end;
 441
 442                         if (endvma > end)
 443                                 endvma = end;
 444                         if (vma->vm_start > start)
 445                                 start = vma->vm_start;
 446                         err = check_pgd_range(vma, start, endvma, nodes,
 447                                                 flags, private);
 448                         if (err) {
 449                                 first = ERR_PTR(err);
 450                                 break;
 451                         }
 452                 }
 453                 prev = vma;
 454         }
 455         return first;
 456 }
 457
 458 /* Apply policy to a single VMA */
 459 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 460 {
 461         int err = 0;
 462         struct mempolicy *old = vma->vm_policy;
 463
 464         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 465                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 466                  vma->vm_ops, vma->vm_file,
 467                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 468
 469         if (vma->vm_ops && vma->vm_ops->set_policy)
 470                 err = vma->vm_ops->set_policy(vma, new);
 471         if (!err) {
 472                 mpol_get(new);
 473                 vma->vm_policy = new;
 474                 mpol_free(old);
 475         }
 476         return err;
 477 }
 478
 479 /* Step 2: apply policy to a range and do splits. */
 480 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 481                        unsigned long end, struct mempolicy *new)
 482 {
 483         struct vm_area_struct *next;
 484         int err;
 485
 486         err = 0;
 487         for (; vma && vma->vm_start < end; vma = next) {
 488                 next = vma->vm_next;
 489                 if (vma->vm_start < start)
 490                         err = split_vma(vma->vm_mm, vma, start, 1);
 491                 if (!err && vma->vm_end > end)
 492                         err = split_vma(vma->vm_mm, vma, end, 0);
 493                 if (!err)
 494                         err = policy_vma(vma, new);
 495                 if (err)
 496                         break;
 497         }
 498         return err;
 499 }
 500
 501 /*
 502  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 503  * mempolicy.  Allows more rapid checking of this (combined perhaps
 504  * with other PF_* flag bits) on memory allocation hot code paths.
 505  *
 506  * If called from outside this file, the task 'p' should -only- be
 507  * a newly forked child not yet visible on the task list, because
 508  * manipulating the task flags of a visible task is not safe.
 509  *
 510  * The above limitation is why this routine has the funny name
 511  * mpol_fix_fork_child_flag().
 512  *
 513  * It is also safe to call this with a task pointer of current,
 514  * which the static wrapper mpol_set_task_struct_flag() does,
 515  * for use within this file.
 516  */
 517
 518 void mpol_fix_fork_child_flag(struct task_struct *p)
 519 {
 520         if (p->mempolicy)
 521                 p->flags |= PF_MEMPOLICY;
 522         else
 523                 p->flags &= ~PF_MEMPOLICY;
 524 }
 525
 526 static void mpol_set_task_struct_flag(void)
 527 {
 528         mpol_fix_fork_child_flag(current);
 529 }
 530
 531 /* Set the process memory policy */
 532 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 533                              nodemask_t *nodes)
 534 {
 535         struct mempolicy *new;
 536
 537         new = mpol_new(mode, flags, nodes);
 538         if (IS_ERR(new))
 539                 return PTR_ERR(new);
 540         mpol_free(current->mempolicy);
 541         current->mempolicy = new;
 542         mpol_set_task_struct_flag();
 543         if (new && new->policy == MPOL_INTERLEAVE &&
 544             nodes_weight(new->v.nodes))
 545                 current->il_next = first_node(new->v.nodes);
 546         return 0;
 547 }
 548
 549 /* Fill a zone bitmap for a policy */
 550 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 551 {
 552         nodes_clear(*nodes);
 553         switch (p->policy) {
 554         case MPOL_DEFAULT:
 555                 break;
 556         case MPOL_BIND:
 557                 /* Fall through */
 558         case MPOL_INTERLEAVE:
 559                 *nodes = p->v.nodes;
 560                 break;
 561         case MPOL_PREFERRED:
 562                 /* or use current node instead of memory_map? */
 563                 if (p->v.preferred_node < 0)
 564                         *nodes = node_states[N_HIGH_MEMORY];
 565                 else
 566                         node_set(p->v.preferred_node, *nodes);
 567                 break;
 568         default:
 569                 BUG();
 570         }
 571 }
 572
 573 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 574 {
 575         struct page *p;
 576         int err;
 577
 578         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 579         if (err >= 0) {
 580                 err = page_to_nid(p);
 581                 put_page(p);
 582         }
 583         return err;
 584 }
 585
 586 /* Retrieve NUMA policy */
 587 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 588                              unsigned long addr, unsigned long flags)
 589 {
 590         int err;
 591         struct mm_struct *mm = current->mm;
 592         struct vm_area_struct *vma = NULL;
 593         struct mempolicy *pol = current->mempolicy;
 594
 595         cpuset_update_task_memory_state();
 596         if (flags &
 597                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 598                 return -EINVAL;
 599
 600         if (flags & MPOL_F_MEMS_ALLOWED) {
 601                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 602                         return -EINVAL;
 603                 *policy = 0;    /* just so it's initialized */
 604                 *nmask  = cpuset_current_mems_allowed;
 605                 return 0;
 606         }
 607
 608         if (flags & MPOL_F_ADDR) {
 609                 down_read(&mm->mmap_sem);
 610                 vma = find_vma_intersection(mm, addr, addr+1);
 611                 if (!vma) {
 612                         up_read(&mm->mmap_sem);
 613                         return -EFAULT;
 614                 }
 615                 if (vma->vm_ops && vma->vm_ops->get_policy)
 616                         pol = vma->vm_ops->get_policy(vma, addr);
 617                 else
 618                         pol = vma->vm_policy;
 619         } else if (addr)
 620                 return -EINVAL;
 621
 622         if (!pol)
 623                 pol = &default_policy;
 624
 625         if (flags & MPOL_F_NODE) {
 626                 if (flags & MPOL_F_ADDR) {
 627                         err = lookup_node(mm, addr);
 628                         if (err < 0)
 629                                 goto out;
 630                         *policy = err;
 631                 } else if (pol == current->mempolicy &&
 632                                 pol->policy == MPOL_INTERLEAVE) {
 633                         *policy = current->il_next;
 634                 } else {
 635                         err = -EINVAL;
 636                         goto out;
 637                 }
 638         } else
 639                 *policy = pol->policy | pol->flags;
 640
 641         if (vma) {
 642                 up_read(&current->mm->mmap_sem);
 643                 vma = NULL;
 644         }
 645
 646         err = 0;
 647         if (nmask)
 648                 get_zonemask(pol, nmask);
 649
 650  out:
 651         if (vma)
 652                 up_read(&current->mm->mmap_sem);
 653         return err;
 654 }
 655
 656 #ifdef CONFIG_MIGRATION
 657 /*
 658  * page migration
 659  */
 660 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 661                                 unsigned long flags)
 662 {
 663         /*
 664          * Avoid migrating a page that is shared with others.
 665          */
 666         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 667                 isolate_lru_page(page, pagelist);
 668 }
 669
 670 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 671 {
 672         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 673 }
 674
 675 /*
 676  * Migrate pages from one node to a target node.
 677  * Returns error or the number of pages not migrated.
 678  */
 679 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 680                            int flags)
 681 {
 682         nodemask_t nmask;
 683         LIST_HEAD(pagelist);
 684         int err = 0;
 685
 686         nodes_clear(nmask);
 687         node_set(source, nmask);
 688
 689         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 690                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 691
 692         if (!list_empty(&pagelist))
 693                 err = migrate_pages(&pagelist, new_node_page, dest);
 694
 695         return err;
 696 }
 697
 698 /*
 699  * Move pages between the two nodesets so as to preserve the physical
 700  * layout as much as possible.
 701  *
 702  * Returns the number of page that could not be moved.
 703  */
 704 int do_migrate_pages(struct mm_struct *mm,
 705         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 706 {
 707         LIST_HEAD(pagelist);
 708         int busy = 0;
 709         int err = 0;
 710         nodemask_t tmp;
 711
 712         down_read(&mm->mmap_sem);
 713
 714         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 715         if (err)
 716                 goto out;
 717
 718 /*
 719  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 720  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 721  * bit in 'tmp', and return that <source, dest> pair for migration.
 722  * The pair of nodemasks 'to' and 'from' define the map.
 723  *
 724  * If no pair of bits is found that way, fallback to picking some
 725  * pair of 'source' and 'dest' bits that are not the same.  If the
 726  * 'source' and 'dest' bits are the same, this represents a node
 727  * that will be migrating to itself, so no pages need move.
 728  *
 729  * If no bits are left in 'tmp', or if all remaining bits left
 730  * in 'tmp' correspond to the same bit in 'to', return false
 731  * (nothing left to migrate).
 732  *
 733  * This lets us pick a pair of nodes to migrate between, such that
 734  * if possible the dest node is not already occupied by some other
 735  * source node, minimizing the risk of overloading the memory on a
 736  * node that would happen if we migrated incoming memory to a node
 737  * before migrating outgoing memory source that same node.
 738  *
 739  * A single scan of tmp is sufficient.  As we go, we remember the
 740  * most recent <s, d> pair that moved (s != d).  If we find a pair
 741  * that not only moved, but what's better, moved to an empty slot
 742  * (d is not set in tmp), then we break out then, with that pair.
 743  * Otherwise when we finish scannng from_tmp, we at least have the
 744  * most recent <s, d> pair that moved.  If we get all the way through
 745  * the scan of tmp without finding any node that moved, much less
 746  * moved to an empty node, then there is nothing left worth migrating.
 747  */
 748
 749         tmp = *from_nodes;
 750         while (!nodes_empty(tmp)) {
 751                 int s,d;
 752                 int source = -1;
 753                 int dest = 0;
 754
 755                 for_each_node_mask(s, tmp) {
 756                         d = node_remap(s, *from_nodes, *to_nodes);
 757                         if (s == d)
 758                                 continue;
 759
 760                         source = s;     /* Node moved. Memorize */
 761                         dest = d;
 762
 763                         /* dest not in remaining from nodes? */
 764                         if (!node_isset(dest, tmp))
 765                                 break;
 766                 }
 767                 if (source == -1)
 768                         break;
 769
 770                 node_clear(source, tmp);
 771                 err = migrate_to_node(mm, source, dest, flags);
 772                 if (err > 0)
 773                         busy += err;
 774                 if (err < 0)
 775                         break;
 776         }
 777 out:
 778         up_read(&mm->mmap_sem);
 779         if (err < 0)
 780                 return err;
 781         return busy;
 782
 783 }
 784
 785 /*
 786  * Allocate a new page for page migration based on vma policy.
 787  * Start assuming that page is mapped by vma pointed to by @private.
 788  * Search forward from there, if not.  N.B., this assumes that the
 789  * list of pages handed to migrate_pages()--which is how we get here--
 790  * is in virtual address order.
 791  */
 792 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 793 {
 794         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 795         unsigned long uninitialized_var(address);
 796
 797         while (vma) {
 798                 address = page_address_in_vma(page, vma);
 799                 if (address != -EFAULT)
 800                         break;
 801                 vma = vma->vm_next;
 802         }
 803
 804         /*
 805          * if !vma, alloc_page_vma() will use task or system default policy
 806          */
 807         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 808 }
 809 #else
 810
 811 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 812                                 unsigned long flags)
 813 {
 814 }
 815
 816 int do_migrate_pages(struct mm_struct *mm,
 817         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 818 {
 819         return -ENOSYS;
 820 }
 821
 822 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 823 {
 824         return NULL;
 825 }
 826 #endif
 827
 828 static long do_mbind(unsigned long start, unsigned long len,
 829                      unsigned short mode, unsigned short mode_flags,
 830                      nodemask_t *nmask, unsigned long flags)
 831 {
 832         struct vm_area_struct *vma;
 833         struct mm_struct *mm = current->mm;
 834         struct mempolicy *new;
 835         unsigned long end;
 836         int err;
 837         LIST_HEAD(pagelist);
 838
 839         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 840                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 841                 return -EINVAL;
 842         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 843                 return -EPERM;
 844
 845         if (start & ~PAGE_MASK)
 846                 return -EINVAL;
 847
 848         if (mode == MPOL_DEFAULT)
 849                 flags &= ~MPOL_MF_STRICT;
 850
 851         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 852         end = start + len;
 853
 854         if (end < start)
 855                 return -EINVAL;
 856         if (end == start)
 857                 return 0;
 858
 859         new = mpol_new(mode, mode_flags, nmask);
 860         if (IS_ERR(new))
 861                 return PTR_ERR(new);
 862
 863         /*
 864          * If we are using the default policy then operation
 865          * on discontinuous address spaces is okay after all
 866          */
 867         if (!new)
 868                 flags |= MPOL_MF_DISCONTIG_OK;
 869
 870         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 871                  start, start + len, mode, mode_flags,
 872                  nmask ? nodes_addr(*nmask)[0] : -1);
 873
 874         down_write(&mm->mmap_sem);
 875         vma = check_range(mm, start, end, nmask,
 876                           flags | MPOL_MF_INVERT, &pagelist);
 877
 878         err = PTR_ERR(vma);
 879         if (!IS_ERR(vma)) {
 880                 int nr_failed = 0;
 881
 882                 err = mbind_range(vma, start, end, new);
 883
 884                 if (!list_empty(&pagelist))
 885                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 886                                                 (unsigned long)vma);
 887
 888                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 889                         err = -EIO;
 890         }
 891
 892         up_write(&mm->mmap_sem);
 893         mpol_free(new);
 894         return err;
 895 }
 896
 897 /*
 898  * User space interface with variable sized bitmaps for nodelists.
 899  */
 900
 901 /* Copy a node mask from user space. */
 902 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 903                      unsigned long maxnode)
 904 {
 905         unsigned long k;
 906         unsigned long nlongs;
 907         unsigned long endmask;
 908
 909         --maxnode;
 910         nodes_clear(*nodes);
 911         if (maxnode == 0 || !nmask)
 912                 return 0;
 913         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 914                 return -EINVAL;
 915
 916         nlongs = BITS_TO_LONGS(maxnode);
 917         if ((maxnode % BITS_PER_LONG) == 0)
 918                 endmask = ~0UL;
 919         else
 920                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 921
 922         /* When the user specified more nodes than supported just check
 923            if the non supported part is all zero. */
 924         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 925                 if (nlongs > PAGE_SIZE/sizeof(long))
 926                         return -EINVAL;
 927                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 928                         unsigned long t;
 929                         if (get_user(t, nmask + k))
 930                                 return -EFAULT;
 931                         if (k == nlongs - 1) {
 932                                 if (t & endmask)
 933                                         return -EINVAL;
 934                         } else if (t)
 935                                 return -EINVAL;
 936                 }
 937                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 938                 endmask = ~0UL;
 939         }
 940
 941         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 942                 return -EFAULT;
 943         nodes_addr(*nodes)[nlongs-1] &= endmask;
 944         return 0;
 945 }
 946
 947 /* Copy a kernel node mask to user space */
 948 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 949                               nodemask_t *nodes)
 950 {
 951         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 952         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 953
 954         if (copy > nbytes) {
 955                 if (copy > PAGE_SIZE)
 956                         return -EINVAL;
 957                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 958                         return -EFAULT;
 959                 copy = nbytes;
 960         }
 961         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 962 }
 963
 964 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 965                         unsigned long mode,
 966                         unsigned long __user *nmask, unsigned long maxnode,
 967                         unsigned flags)
 968 {
 969         nodemask_t nodes;
 970         int err;
 971         unsigned short mode_flags;
 972
 973         mode_flags = mode & MPOL_MODE_FLAGS;
 974         mode &= ~MPOL_MODE_FLAGS;
 975         if (mode >= MPOL_MAX)
 976                 return -EINVAL;
 977         if ((mode_flags & MPOL_F_STATIC_NODES) &&
 978             (mode_flags & MPOL_F_RELATIVE_NODES))
 979                 return -EINVAL;
 980         err = get_nodes(&nodes, nmask, maxnode);
 981         if (err)
 982                 return err;
 983         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
 984 }
 985
 986 /* Set the process memory policy */
 987 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 988                 unsigned long maxnode)
 989 {
 990         int err;
 991         nodemask_t nodes;
 992         unsigned short flags;
 993
 994         flags = mode & MPOL_MODE_FLAGS;
 995         mode &= ~MPOL_MODE_FLAGS;
 996         if ((unsigned int)mode >= MPOL_MAX)
 997                 return -EINVAL;
 998         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
 999                 return -EINVAL;
1000         err = get_nodes(&nodes, nmask, maxnode);
1001         if (err)
1002                 return err;
1003         return do_set_mempolicy(mode, flags, &nodes);
1004 }
1005
1006 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
1007                 const unsigned long __user *old_nodes,
1008                 const unsigned long __user *new_nodes)
1009 {
1010         struct mm_struct *mm;
1011         struct task_struct *task;
1012         nodemask_t old;
1013         nodemask_t new;
1014         nodemask_t task_nodes;
1015         int err;
1016
1017         err = get_nodes(&old, old_nodes, maxnode);
1018         if (err)
1019                 return err;
1020
1021         err = get_nodes(&new, new_nodes, maxnode);
1022         if (err)
1023                 return err;
1024
1025         /* Find the mm_struct */
1026         read_lock(&tasklist_lock);
1027         task = pid ? find_task_by_vpid(pid) : current;
1028         if (!task) {
1029                 read_unlock(&tasklist_lock);
1030                 return -ESRCH;
1031         }
1032         mm = get_task_mm(task);
1033         read_unlock(&tasklist_lock);
1034
1035         if (!mm)
1036                 return -EINVAL;
1037
1038         /*
1039          * Check if this process has the right to modify the specified
1040          * process. The right exists if the process has administrative
1041          * capabilities, superuser privileges or the same
1042          * userid as the target process.
1043          */
1044         if ((current->euid != task->suid) && (current->euid != task->uid) &&
1045             (current->uid != task->suid) && (current->uid != task->uid) &&
1046             !capable(CAP_SYS_NICE)) {
1047                 err = -EPERM;
1048                 goto out;
1049         }
1050
1051         task_nodes = cpuset_mems_allowed(task);
1052         /* Is the user allowed to access the target nodes? */
1053         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1054                 err = -EPERM;
1055                 goto out;
1056         }
1057
1058         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1059                 err = -EINVAL;
1060                 goto out;
1061         }
1062
1063         err = security_task_movememory(task);
1064         if (err)
1065                 goto out;
1066
1067         err = do_migrate_pages(mm, &old, &new,
1068                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1069 out:
1070         mmput(mm);
1071         return err;
1072 }
1073
1074
1075 /* Retrieve NUMA policy */
1076 asmlinkage long sys_get_mempolicy(int __user *policy,
1077                                 unsigned long __user *nmask,
1078                                 unsigned long maxnode,
1079                                 unsigned long addr, unsigned long flags)
1080 {
1081         int err;
1082         int uninitialized_var(pval);
1083         nodemask_t nodes;
1084
1085         if (nmask != NULL && maxnode < MAX_NUMNODES)
1086                 return -EINVAL;
1087
1088         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1089
1090         if (err)
1091                 return err;
1092
1093         if (policy && put_user(pval, policy))
1094                 return -EFAULT;
1095
1096         if (nmask)
1097                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1098
1099         return err;
1100 }
1101
1102 #ifdef CONFIG_COMPAT
1103
1104 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1105                                      compat_ulong_t __user *nmask,
1106                                      compat_ulong_t maxnode,
1107                                      compat_ulong_t addr, compat_ulong_t flags)
1108 {
1109         long err;
1110         unsigned long __user *nm = NULL;
1111         unsigned long nr_bits, alloc_size;
1112         DECLARE_BITMAP(bm, MAX_NUMNODES);
1113
1114         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1115         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1116
1117         if (nmask)
1118                 nm = compat_alloc_user_space(alloc_size);
1119
1120         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1121
1122         if (!err && nmask) {
1123                 err = copy_from_user(bm, nm, alloc_size);
1124                 /* ensure entire bitmap is zeroed */
1125                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1126                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1127         }
1128
1129         return err;
1130 }
1131
1132 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1133                                      compat_ulong_t maxnode)
1134 {
1135         long err = 0;
1136         unsigned long __user *nm = NULL;
1137         unsigned long nr_bits, alloc_size;
1138         DECLARE_BITMAP(bm, MAX_NUMNODES);
1139
1140         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1141         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1142
1143         if (nmask) {
1144                 err = compat_get_bitmap(bm, nmask, nr_bits);
1145                 nm = compat_alloc_user_space(alloc_size);
1146                 err |= copy_to_user(nm, bm, alloc_size);
1147         }
1148
1149         if (err)
1150                 return -EFAULT;
1151
1152         return sys_set_mempolicy(mode, nm, nr_bits+1);
1153 }
1154
1155 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1156                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1157                              compat_ulong_t maxnode, compat_ulong_t flags)
1158 {
1159         long err = 0;
1160         unsigned long __user *nm = NULL;
1161         unsigned long nr_bits, alloc_size;
1162         nodemask_t bm;
1163
1164         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1165         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1166
1167         if (nmask) {
1168                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1169                 nm = compat_alloc_user_space(alloc_size);
1170                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1171         }
1172
1173         if (err)
1174                 return -EFAULT;
1175
1176         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1177 }
1178
1179 #endif
1180
1181 /*
1182  * get_vma_policy(@task, @vma, @addr)
1183  * @task - task for fallback if vma policy == default
1184  * @vma   - virtual memory area whose policy is sought
1185  * @addr  - address in @vma for shared policy lookup
1186  *
1187  * Returns effective policy for a VMA at specified address.
1188  * Falls back to @task or system default policy, as necessary.
1189  * Returned policy has extra reference count if shared, vma,
1190  * or some other task's policy [show_numa_maps() can pass
1191  * @task != current].  It is the caller's responsibility to
1192  * free the reference in these cases.
1193  */
1194 static struct mempolicy * get_vma_policy(struct task_struct *task,
1195                 struct vm_area_struct *vma, unsigned long addr)
1196 {
1197         struct mempolicy *pol = task->mempolicy;
1198         int shared_pol = 0;
1199
1200         if (vma) {
1201                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1202                         pol = vma->vm_ops->get_policy(vma, addr);
1203                         shared_pol = 1; /* if pol non-NULL, add ref below */
1204                 } else if (vma->vm_policy &&
1205                                 vma->vm_policy->policy != MPOL_DEFAULT)
1206                         pol = vma->vm_policy;
1207         }
1208         if (!pol)
1209                 pol = &default_policy;
1210         else if (!shared_pol && pol != current->mempolicy)
1211                 mpol_get(pol);  /* vma or other task's policy */
1212         return pol;
1213 }
1214
1215 /* Return a nodemask representing a mempolicy */
1216 static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
1217 {
1218         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1219         if (unlikely(policy->policy == MPOL_BIND) &&
1220                         gfp_zone(gfp) >= policy_zone &&
1221                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1222                 return &policy->v.nodes;
1223
1224         return NULL;
1225 }
1226
1227 /* Return a zonelist representing a mempolicy */
1228 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1229 {
1230         int nd;
1231
1232         switch (policy->policy) {
1233         case MPOL_PREFERRED:
1234                 nd = policy->v.preferred_node;
1235                 if (nd < 0)
1236                         nd = numa_node_id();
1237                 break;
1238         case MPOL_BIND:
1239                 /*
1240                  * Normally, MPOL_BIND allocations node-local are node-local
1241                  * within the allowed nodemask. However, if __GFP_THISNODE is
1242                  * set and the current node is part of the mask, we use the
1243                  * the zonelist for the first node in the mask instead.
1244                  */
1245                 nd = numa_node_id();
1246                 if (unlikely(gfp & __GFP_THISNODE) &&
1247                                 unlikely(!node_isset(nd, policy->v.nodes)))
1248                         nd = first_node(policy->v.nodes);
1249                 break;
1250         case MPOL_INTERLEAVE: /* should not happen */
1251         case MPOL_DEFAULT:
1252                 nd = numa_node_id();
1253                 break;
1254         default:
1255                 nd = 0;
1256                 BUG();
1257         }
1258         return node_zonelist(nd, gfp);
1259 }
1260
1261 /* Do dynamic interleaving for a process */
1262 static unsigned interleave_nodes(struct mempolicy *policy)
1263 {
1264         unsigned nid, next;
1265         struct task_struct *me = current;
1266
1267         nid = me->il_next;
1268         next = next_node(nid, policy->v.nodes);
1269         if (next >= MAX_NUMNODES)
1270                 next = first_node(policy->v.nodes);
1271         if (next < MAX_NUMNODES)
1272                 me->il_next = next;
1273         return nid;
1274 }
1275
1276 /*
1277  * Depending on the memory policy provide a node from which to allocate the
1278  * next slab entry.
1279  */
1280 unsigned slab_node(struct mempolicy *policy)
1281 {
1282         unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
1283
1284         switch (pol) {
1285         case MPOL_INTERLEAVE:
1286                 return interleave_nodes(policy);
1287
1288         case MPOL_BIND: {
1289                 /*
1290                  * Follow bind policy behavior and start allocation at the
1291                  * first node.
1292                  */
1293                 struct zonelist *zonelist;
1294                 struct zone *zone;
1295                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1296                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1297                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1298                                                         &policy->v.nodes,
1299                                                         &zone);
1300                 return zone->node;
1301         }
1302
1303         case MPOL_PREFERRED:
1304                 if (policy->v.preferred_node >= 0)
1305                         return policy->v.preferred_node;
1306                 /* Fall through */
1307
1308         default:
1309                 return numa_node_id();
1310         }
1311 }
1312
1313 /* Do static interleaving for a VMA with known offset. */
1314 static unsigned offset_il_node(struct mempolicy *pol,
1315                 struct vm_area_struct *vma, unsigned long off)
1316 {
1317         unsigned nnodes = nodes_weight(pol->v.nodes);
1318         unsigned target;
1319         int c;
1320         int nid = -1;
1321
1322         if (!nnodes)
1323                 return numa_node_id();
1324         target = (unsigned int)off % nnodes;
1325         c = 0;
1326         do {
1327                 nid = next_node(nid, pol->v.nodes);
1328                 c++;
1329         } while (c <= target);
1330         return nid;
1331 }
1332
1333 /* Determine a node number for interleave */
1334 static inline unsigned interleave_nid(struct mempolicy *pol,
1335                  struct vm_area_struct *vma, unsigned long addr, int shift)
1336 {
1337         if (vma) {
1338                 unsigned long off;
1339
1340                 /*
1341                  * for small pages, there is no difference between
1342                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1343                  * for huge pages, since vm_pgoff is in units of small
1344                  * pages, we need to shift off the always 0 bits to get
1345                  * a useful offset.
1346                  */
1347                 BUG_ON(shift < PAGE_SHIFT);
1348                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1349                 off += (addr - vma->vm_start) >> shift;
1350                 return offset_il_node(pol, vma, off);
1351         } else
1352                 return interleave_nodes(pol);
1353 }
1354
1355 #ifdef CONFIG_HUGETLBFS
1356 /*
1357  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1358  * @vma = virtual memory area whose policy is sought
1359  * @addr = address in @vma for shared policy lookup and interleave policy
1360  * @gfp_flags = for requested zone
1361  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1362  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1363  *
1364  * Returns a zonelist suitable for a huge page allocation.
1365  * If the effective policy is 'BIND, returns pointer to local node's zonelist,
1366  * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1367  * If it is also a policy for which get_vma_policy() returns an extra
1368  * reference, we must hold that reference until after the allocation.
1369  * In that case, return policy via @mpol so hugetlb allocation can drop
1370  * the reference. For non-'BIND referenced policies, we can/do drop the
1371  * reference here, so the caller doesn't need to know about the special case
1372  * for default and current task policy.
1373  */
1374 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1375                                 gfp_t gfp_flags, struct mempolicy **mpol,
1376                                 nodemask_t **nodemask)
1377 {
1378         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1379         struct zonelist *zl;
1380
1381         *mpol = NULL;           /* probably no unref needed */
1382         *nodemask = NULL;       /* assume !MPOL_BIND */
1383         if (pol->policy == MPOL_BIND) {
1384                         *nodemask = &pol->v.nodes;
1385         } else if (pol->policy == MPOL_INTERLEAVE) {
1386                 unsigned nid;
1387
1388                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1389                 if (unlikely(pol != &default_policy &&
1390                                 pol != current->mempolicy))
1391                         __mpol_free(pol);       /* finished with pol */
1392                 return node_zonelist(nid, gfp_flags);
1393         }
1394
1395         zl = zonelist_policy(GFP_HIGHUSER, pol);
1396         if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1397                 if (pol->policy != MPOL_BIND)
1398                         __mpol_free(pol);       /* finished with pol */
1399                 else
1400                         *mpol = pol;    /* unref needed after allocation */
1401         }
1402         return zl;
1403 }
1404 #endif
1405
1406 /* Allocate a page in interleaved policy.
1407    Own path because it needs to do special accounting. */
1408 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1409                                         unsigned nid)
1410 {
1411         struct zonelist *zl;
1412         struct page *page;
1413
1414         zl = node_zonelist(nid, gfp);
1415         page = __alloc_pages(gfp, order, zl);
1416         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1417                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1418         return page;
1419 }
1420
1421 /**
1422  *      alloc_page_vma  - Allocate a page for a VMA.
1423  *
1424  *      @gfp:
1425  *      %GFP_USER    user allocation.
1426  *      %GFP_KERNEL  kernel allocations,
1427  *      %GFP_HIGHMEM highmem/user allocations,
1428  *      %GFP_FS      allocation should not call back into a file system.
1429  *      %GFP_ATOMIC  don't sleep.
1430  *
1431  *      @vma:  Pointer to VMA or NULL if not available.
1432  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1433  *
1434  *      This function allocates a page from the kernel page pool and applies
1435  *      a NUMA policy associated with the VMA or the current process.
1436  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1437  *      mm_struct of the VMA to prevent it from going away. Should be used for
1438  *      all allocations for pages that will be mapped into
1439  *      user space. Returns NULL when no page can be allocated.
1440  *
1441  *      Should be called with the mm_sem of the vma hold.
1442  */
1443 struct page *
1444 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1445 {
1446         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1447         struct zonelist *zl;
1448
1449         cpuset_update_task_memory_state();
1450
1451         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1452                 unsigned nid;
1453
1454                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1455                 if (unlikely(pol != &default_policy &&
1456                                 pol != current->mempolicy))
1457                         __mpol_free(pol);       /* finished with pol */
1458                 return alloc_page_interleave(gfp, 0, nid);
1459         }
1460         zl = zonelist_policy(gfp, pol);
1461         if (pol != &default_policy && pol != current->mempolicy) {
1462                 /*
1463                  * slow path: ref counted policy -- shared or vma
1464                  */
1465                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1466                                                 zl, nodemask_policy(gfp, pol));
1467                 __mpol_free(pol);
1468                 return page;
1469         }
1470         /*
1471          * fast path:  default or task policy
1472          */
1473         return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
1474 }
1475
1476 /**
1477  *      alloc_pages_current - Allocate pages.
1478  *
1479  *      @gfp:
1480  *              %GFP_USER   user allocation,
1481  *              %GFP_KERNEL kernel allocation,
1482  *              %GFP_HIGHMEM highmem allocation,
1483  *              %GFP_FS     don't call back into a file system.
1484  *              %GFP_ATOMIC don't sleep.
1485  *      @order: Power of two of allocation size in pages. 0 is a single page.
1486  *
1487  *      Allocate a page from the kernel page pool.  When not in
1488  *      interrupt context and apply the current process NUMA policy.
1489  *      Returns NULL when no page can be allocated.
1490  *
1491  *      Don't call cpuset_update_task_memory_state() unless
1492  *      1) it's ok to take cpuset_sem (can WAIT), and
1493  *      2) allocating for current task (not interrupt).
1494  */
1495 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1496 {
1497         struct mempolicy *pol = current->mempolicy;
1498
1499         if ((gfp & __GFP_WAIT) && !in_interrupt())
1500                 cpuset_update_task_memory_state();
1501         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1502                 pol = &default_policy;
1503         if (pol->policy == MPOL_INTERLEAVE)
1504                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1505         return __alloc_pages_nodemask(gfp, order,
1506                         zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
1507 }
1508 EXPORT_SYMBOL(alloc_pages_current);
1509
1510 /*
1511  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1512  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1513  * with the mems_allowed returned by cpuset_mems_allowed().  This
1514  * keeps mempolicies cpuset relative after its cpuset moves.  See
1515  * further kernel/cpuset.c update_nodemask().
1516  */
1517
1518 /* Slow path of a mempolicy copy */
1519 struct mempolicy *__mpol_copy(struct mempolicy *old)
1520 {
1521         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1522
1523         if (!new)
1524                 return ERR_PTR(-ENOMEM);
1525         if (current_cpuset_is_being_rebound()) {
1526                 nodemask_t mems = cpuset_mems_allowed(current);
1527                 mpol_rebind_policy(old, &mems);
1528         }
1529         *new = *old;
1530         atomic_set(&new->refcnt, 1);
1531         return new;
1532 }
1533
1534 static int mpol_match_intent(const struct mempolicy *a,
1535                              const struct mempolicy *b)
1536 {
1537         if (a->flags != b->flags)
1538                 return 0;
1539         if (!mpol_store_user_nodemask(a))
1540                 return 1;
1541         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1542 }
1543
1544 /* Slow path of a mempolicy comparison */
1545 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1546 {
1547         if (!a || !b)
1548                 return 0;
1549         if (a->policy != b->policy)
1550                 return 0;
1551         if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1552                 return 0;
1553         switch (a->policy) {
1554         case MPOL_DEFAULT:
1555                 return 1;
1556         case MPOL_BIND:
1557                 /* Fall through */
1558         case MPOL_INTERLEAVE:
1559                 return nodes_equal(a->v.nodes, b->v.nodes);
1560         case MPOL_PREFERRED:
1561                 return a->v.preferred_node == b->v.preferred_node;
1562         default:
1563                 BUG();
1564                 return 0;
1565         }
1566 }
1567
1568 /* Slow path of a mpol destructor. */
1569 void __mpol_free(struct mempolicy *p)
1570 {
1571         if (!atomic_dec_and_test(&p->refcnt))
1572                 return;
1573         p->policy = MPOL_DEFAULT;
1574         kmem_cache_free(policy_cache, p);
1575 }
1576
1577 /*
1578  * Shared memory backing store policy support.
1579  *
1580  * Remember policies even when nobody has shared memory mapped.
1581  * The policies are kept in Red-Black tree linked from the inode.
1582  * They are protected by the sp->lock spinlock, which should be held
1583  * for any accesses to the tree.
1584  */
1585
1586 /* lookup first element intersecting start-end */
1587 /* Caller holds sp->lock */
1588 static struct sp_node *
1589 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1590 {
1591         struct rb_node *n = sp->root.rb_node;
1592
1593         while (n) {
1594                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1595
1596                 if (start >= p->end)
1597                         n = n->rb_right;
1598                 else if (end <= p->start)
1599                         n = n->rb_left;
1600                 else
1601                         break;
1602         }
1603         if (!n)
1604                 return NULL;
1605         for (;;) {
1606                 struct sp_node *w = NULL;
1607                 struct rb_node *prev = rb_prev(n);
1608                 if (!prev)
1609                         break;
1610                 w = rb_entry(prev, struct sp_node, nd);
1611                 if (w->end <= start)
1612                         break;
1613                 n = prev;
1614         }
1615         return rb_entry(n, struct sp_node, nd);
1616 }
1617
1618 /* Insert a new shared policy into the list. */
1619 /* Caller holds sp->lock */
1620 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1621 {
1622         struct rb_node **p = &sp->root.rb_node;
1623         struct rb_node *parent = NULL;
1624         struct sp_node *nd;
1625
1626         while (*p) {
1627                 parent = *p;
1628                 nd = rb_entry(parent, struct sp_node, nd);
1629                 if (new->start < nd->start)
1630                         p = &(*p)->rb_left;
1631                 else if (new->end > nd->end)
1632                         p = &(*p)->rb_right;
1633                 else
1634                         BUG();
1635         }
1636         rb_link_node(&new->nd, parent, p);
1637         rb_insert_color(&new->nd, &sp->root);
1638         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1639                  new->policy ? new->policy->policy : 0);
1640 }
1641
1642 /* Find shared policy intersecting idx */
1643 struct mempolicy *
1644 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1645 {
1646         struct mempolicy *pol = NULL;
1647         struct sp_node *sn;
1648
1649         if (!sp->root.rb_node)
1650                 return NULL;
1651         spin_lock(&sp->lock);
1652         sn = sp_lookup(sp, idx, idx+1);
1653         if (sn) {
1654                 mpol_get(sn->policy);
1655                 pol = sn->policy;
1656         }
1657         spin_unlock(&sp->lock);
1658         return pol;
1659 }
1660
1661 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1662 {
1663         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1664         rb_erase(&n->nd, &sp->root);
1665         mpol_free(n->policy);
1666         kmem_cache_free(sn_cache, n);
1667 }
1668
1669 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1670                                 struct mempolicy *pol)
1671 {
1672         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1673
1674         if (!n)
1675                 return NULL;
1676         n->start = start;
1677         n->end = end;
1678         mpol_get(pol);
1679         n->policy = pol;
1680         return n;
1681 }
1682
1683 /* Replace a policy range. */
1684 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1685                                  unsigned long end, struct sp_node *new)
1686 {
1687         struct sp_node *n, *new2 = NULL;
1688
1689 restart:
1690         spin_lock(&sp->lock);
1691         n = sp_lookup(sp, start, end);
1692         /* Take care of old policies in the same range. */
1693         while (n && n->start < end) {
1694                 struct rb_node *next = rb_next(&n->nd);
1695                 if (n->start >= start) {
1696                         if (n->end <= end)
1697                                 sp_delete(sp, n);
1698                         else
1699                                 n->start = end;
1700                 } else {
1701                         /* Old policy spanning whole new range. */
1702                         if (n->end > end) {
1703                                 if (!new2) {
1704                                         spin_unlock(&sp->lock);
1705                                         new2 = sp_alloc(end, n->end, n->policy);
1706                                         if (!new2)
1707                                                 return -ENOMEM;
1708                                         goto restart;
1709                                 }
1710                                 n->end = start;
1711                                 sp_insert(sp, new2);
1712                                 new2 = NULL;
1713                                 break;
1714                         } else
1715                                 n->end = start;
1716                 }
1717                 if (!next)
1718                         break;
1719                 n = rb_entry(next, struct sp_node, nd);
1720         }
1721         if (new)
1722                 sp_insert(sp, new);
1723         spin_unlock(&sp->lock);
1724         if (new2) {
1725                 mpol_free(new2->policy);
1726                 kmem_cache_free(sn_cache, new2);
1727         }
1728         return 0;
1729 }
1730
1731 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1732                         unsigned short flags, nodemask_t *policy_nodes)
1733 {
1734         info->root = RB_ROOT;
1735         spin_lock_init(&info->lock);
1736
1737         if (policy != MPOL_DEFAULT) {
1738                 struct mempolicy *newpol;
1739
1740                 /* Falls back to MPOL_DEFAULT on any error */
1741                 newpol = mpol_new(policy, flags, policy_nodes);
1742                 if (!IS_ERR(newpol)) {
1743                         /* Create pseudo-vma that contains just the policy */
1744                         struct vm_area_struct pvma;
1745
1746                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1747                         /* Policy covers entire file */
1748                         pvma.vm_end = TASK_SIZE;
1749                         mpol_set_shared_policy(info, &pvma, newpol);
1750                         mpol_free(newpol);
1751                 }
1752         }
1753 }
1754
1755 int mpol_set_shared_policy(struct shared_policy *info,
1756                         struct vm_area_struct *vma, struct mempolicy *npol)
1757 {
1758         int err;
1759         struct sp_node *new = NULL;
1760         unsigned long sz = vma_pages(vma);
1761
1762         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1763                  vma->vm_pgoff,
1764                  sz, npol ? npol->policy : -1,
1765                  npol ? npol->flags : -1,
1766                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1767
1768         if (npol) {
1769                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1770                 if (!new)
1771                         return -ENOMEM;
1772         }
1773         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1774         if (err && new)
1775                 kmem_cache_free(sn_cache, new);
1776         return err;
1777 }
1778
1779 /* Free a backing policy store on inode delete. */
1780 void mpol_free_shared_policy(struct shared_policy *p)
1781 {
1782         struct sp_node *n;
1783         struct rb_node *next;
1784
1785         if (!p->root.rb_node)
1786                 return;
1787         spin_lock(&p->lock);
1788         next = rb_first(&p->root);
1789         while (next) {
1790                 n = rb_entry(next, struct sp_node, nd);
1791                 next = rb_next(&n->nd);
1792                 rb_erase(&n->nd, &p->root);
1793                 mpol_free(n->policy);
1794                 kmem_cache_free(sn_cache, n);
1795         }
1796         spin_unlock(&p->lock);
1797 }
1798
1799 /* assumes fs == KERNEL_DS */
1800 void __init numa_policy_init(void)
1801 {
1802         nodemask_t interleave_nodes;
1803         unsigned long largest = 0;
1804         int nid, prefer = 0;
1805
1806         policy_cache = kmem_cache_create("numa_policy",
1807                                          sizeof(struct mempolicy),
1808                                          0, SLAB_PANIC, NULL);
1809
1810         sn_cache = kmem_cache_create("shared_policy_node",
1811                                      sizeof(struct sp_node),
1812                                      0, SLAB_PANIC, NULL);
1813
1814         /*
1815          * Set interleaving policy for system init. Interleaving is only
1816          * enabled across suitably sized nodes (default is >= 16MB), or
1817          * fall back to the largest node if they're all smaller.
1818          */
1819         nodes_clear(interleave_nodes);
1820         for_each_node_state(nid, N_HIGH_MEMORY) {
1821                 unsigned long total_pages = node_present_pages(nid);
1822
1823                 /* Preserve the largest node */
1824                 if (largest < total_pages) {
1825                         largest = total_pages;
1826                         prefer = nid;
1827                 }
1828
1829                 /* Interleave this node? */
1830                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1831                         node_set(nid, interleave_nodes);
1832         }
1833
1834         /* All too small, use the largest */
1835         if (unlikely(nodes_empty(interleave_nodes)))
1836                 node_set(prefer, interleave_nodes);
1837
1838         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1839                 printk("numa_policy_init: interleaving failed\n");
1840 }
1841
1842 /* Reset policy of current process to default */
1843 void numa_default_policy(void)
1844 {
1845         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1846 }
1847
1848 /*
1849  * Display pages allocated per node and memory policy via /proc.
1850  */
1851
1852 static const char * const policy_types[] =
1853         { "default", "prefer", "bind", "interleave" };
1854
1855 /*
1856  * Convert a mempolicy into a string.
1857  * Returns the number of characters in buffer (if positive)
1858  * or an error (negative)
1859  */
1860 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1861 {
1862         char *p = buffer;
1863         int l;
1864         nodemask_t nodes;
1865         unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1866         unsigned short flags = pol ? pol->flags : 0;
1867
1868         switch (mode) {
1869         case MPOL_DEFAULT:
1870                 nodes_clear(nodes);
1871                 break;
1872
1873         case MPOL_PREFERRED:
1874                 nodes_clear(nodes);
1875                 node_set(pol->v.preferred_node, nodes);
1876                 break;
1877
1878         case MPOL_BIND:
1879                 /* Fall through */
1880         case MPOL_INTERLEAVE:
1881                 nodes = pol->v.nodes;
1882                 break;
1883
1884         default:
1885                 BUG();
1886                 return -EFAULT;
1887         }
1888
1889         l = strlen(policy_types[mode]);
1890         if (buffer + maxlen < p + l + 1)
1891                 return -ENOSPC;
1892
1893         strcpy(p, policy_types[mode]);
1894         p += l;
1895
1896         if (flags) {
1897                 int need_bar = 0;
1898
1899                 if (buffer + maxlen < p + 2)
1900                         return -ENOSPC;
1901                 *p++ = '=';
1902
1903                 if (flags & MPOL_F_STATIC_NODES)
1904                         p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
1905                 if (flags & MPOL_F_RELATIVE_NODES)
1906                         p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1907         }
1908
1909         if (!nodes_empty(nodes)) {
1910                 if (buffer + maxlen < p + 2)
1911                         return -ENOSPC;
1912                 *p++ = '=';
1913                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1914         }
1915         return p - buffer;
1916 }
1917
1918 struct numa_maps {
1919         unsigned long pages;
1920         unsigned long anon;
1921         unsigned long active;
1922         unsigned long writeback;
1923         unsigned long mapcount_max;
1924         unsigned long dirty;
1925         unsigned long swapcache;
1926         unsigned long node[MAX_NUMNODES];
1927 };
1928
1929 static void gather_stats(struct page *page, void *private, int pte_dirty)
1930 {
1931         struct numa_maps *md = private;
1932         int count = page_mapcount(page);
1933
1934         md->pages++;
1935         if (pte_dirty || PageDirty(page))
1936                 md->dirty++;
1937
1938         if (PageSwapCache(page))
1939                 md->swapcache++;
1940
1941         if (PageActive(page))
1942                 md->active++;
1943
1944         if (PageWriteback(page))
1945                 md->writeback++;
1946
1947         if (PageAnon(page))
1948                 md->anon++;
1949
1950         if (count > md->mapcount_max)
1951                 md->mapcount_max = count;
1952
1953         md->node[page_to_nid(page)]++;
1954 }
1955
1956 #ifdef CONFIG_HUGETLB_PAGE
1957 static void check_huge_range(struct vm_area_struct *vma,
1958                 unsigned long start, unsigned long end,
1959                 struct numa_maps *md)
1960 {
1961         unsigned long addr;
1962         struct page *page;
1963
1964         for (addr = start; addr < end; addr += HPAGE_SIZE) {
1965                 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
1966                 pte_t pte;
1967
1968                 if (!ptep)
1969                         continue;
1970
1971                 pte = *ptep;
1972                 if (pte_none(pte))
1973                         continue;
1974
1975                 page = pte_page(pte);
1976                 if (!page)
1977                         continue;
1978
1979                 gather_stats(page, md, pte_dirty(*ptep));
1980         }
1981 }
1982 #else
1983 static inline void check_huge_range(struct vm_area_struct *vma,
1984                 unsigned long start, unsigned long end,
1985                 struct numa_maps *md)
1986 {
1987 }
1988 #endif
1989
1990 int show_numa_map(struct seq_file *m, void *v)
1991 {
1992         struct proc_maps_private *priv = m->private;
1993         struct vm_area_struct *vma = v;
1994         struct numa_maps *md;
1995         struct file *file = vma->vm_file;
1996         struct mm_struct *mm = vma->vm_mm;
1997         struct mempolicy *pol;
1998         int n;
1999         char buffer[50];
2000
2001         if (!mm)
2002                 return 0;
2003
2004         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2005         if (!md)
2006                 return 0;
2007
2008         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2009         mpol_to_str(buffer, sizeof(buffer), pol);
2010         /*
2011          * unref shared or other task's mempolicy
2012          */
2013         if (pol != &default_policy && pol != current->mempolicy)
2014                 __mpol_free(pol);
2015
2016         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2017
2018         if (file) {
2019                 seq_printf(m, " file=");
2020                 seq_path(m, &file->f_path, "\n\t= ");
2021         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2022                 seq_printf(m, " heap");
2023         } else if (vma->vm_start <= mm->start_stack &&
2024                         vma->vm_end >= mm->start_stack) {
2025                 seq_printf(m, " stack");
2026         }
2027
2028         if (is_vm_hugetlb_page(vma)) {
2029                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2030                 seq_printf(m, " huge");
2031         } else {
2032                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2033                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2034         }
2035
2036         if (!md->pages)
2037                 goto out;
2038
2039         if (md->anon)
2040                 seq_printf(m," anon=%lu",md->anon);
2041
2042         if (md->dirty)
2043                 seq_printf(m," dirty=%lu",md->dirty);
2044
2045         if (md->pages != md->anon && md->pages != md->dirty)
2046                 seq_printf(m, " mapped=%lu", md->pages);
2047
2048         if (md->mapcount_max > 1)
2049                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2050
2051         if (md->swapcache)
2052                 seq_printf(m," swapcache=%lu", md->swapcache);
2053
2054         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2055                 seq_printf(m," active=%lu", md->active);
2056
2057         if (md->writeback)
2058                 seq_printf(m," writeback=%lu", md->writeback);
2059
2060         for_each_node_state(n, N_HIGH_MEMORY)
2061                 if (md->node[n])
2062                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2063 out:
2064         seq_putc(m, '\n');
2065         kfree(md);
2066
2067         if (m->count < m->size)
2068                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2069         return 0;
2070 }