mm: memcontrol: zap oom_info_lock

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2..36db05f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
        return (memcg == root_mem_cgroup);
 }
 
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX      USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
-       return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock().  The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
-       struct cgroup_subsys_state *css;
-
-       css = css_from_id(id, &memory_cgrp_subsys);
-       return mem_cgroup_from_css(css);
-}
-
 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
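
Note on the block removed above: mem_cgroup_id() and mem_cgroup_from_id() are presumably relocated to a shared header (include/linux/memcontrol.h is the natural home) rather than dropped, so code outside this file can map between a memcg and its 16-bit id. A minimal sketch of the header variants, assuming the bodies move unchanged:

    #define MEM_CGROUP_ID_MAX	USHRT_MAX

    static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
    {
    	/* css ids are restricted to [1, USHRT_MAX], so a short suffices */
    	return memcg->css.id;
    }

    /* Must be called under rcu_read_lock(); the caller is responsible for
     * css_tryget_online() if the result is used for charging. */
    static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
    {
    	struct cgroup_subsys_state *css;

    	css = css_from_id(id, &memory_cgrp_subsys);
    	return mem_cgroup_from_css(css);
    }
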
@@ -663,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 }
 
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-                                                 int nid,
-                                                 unsigned int lru_mask)
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+                                          int nid, unsigned int lru_mask)
 {
        unsigned long nr = 0;
        int zid;
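
The static qualifier is dropped from mem_cgroup_node_nr_lru_pages(), so some other file presumably starts calling it; that implies a matching declaration in a shared header, roughly:

    /* hypothetical declaration to accompany the now non-static definition */
    unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
    					       int nid, unsigned int lru_mask);
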
@@ -1176,12 +1150,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
  */
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
-       /* oom_info_lock ensures that parallel ooms do not interleave */
-       static DEFINE_MUTEX(oom_info_lock);
        struct mem_cgroup *iter;
        unsigned int i;
 
-       mutex_lock(&oom_info_lock);
        rcu_read_lock();
 
        if (p) {
@@ -1225,7 +1196,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 
                pr_cont("\n");
        }
-       mutex_unlock(&oom_info_lock);
 }
 
 /*
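
Dropping oom_info_lock relies on the OOM report only being reached with oom_lock already held, as in mem_cgroup_out_of_memory() below, so parallel reports cannot interleave anyway. A sketch of the assumed calling context:

    mutex_lock(&oom_lock);
    mem_cgroup_print_oom_info(memcg, p);	/* already serialized by oom_lock */
    mutex_unlock(&oom_lock);
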
@@ -1262,7 +1232,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
        return limit;
 }
 
-static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                     int order)
 {
        struct oom_control oc = {
@@ -1340,6 +1310,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
        }
 unlock:
        mutex_unlock(&oom_lock);
+       return chosen;
 }
 
 #if MAX_NUMNODES > 1
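
mem_cgroup_out_of_memory() now reports whether a victim was actually chosen, which lets the new memory.max write path further down stop retrying once the OOM killer has nothing left to kill:

    /* from the memory_max_write() hunk below */
    if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
    	break;	/* no victim found; applying more pressure is pointless */
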
@@ -1709,19 +1680,13 @@ cleanup:
 }
 
 /**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
  *
- *   memcg = mem_cgroup_begin_page_stat(page);
- *   if (TestClearPageState(page))
- *     mem_cgroup_update_page_stat(memcg, state, -1);
- *   mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
  */
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
 {
        struct mem_cgroup *memcg;
        unsigned long flags;
@@ -1730,25 +1695,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
         * The RCU lock is held throughout the transaction.  The fast
         * path can get away without acquiring the memcg->move_lock
         * because page moving starts with an RCU grace period.
-        *
-        * The RCU lock also protects the memcg from being freed when
-        * the page state that is going to change is the only thing
-        * preventing the page from being uncharged.
-        * E.g. end-writeback clearing PageWriteback(), which allows
-        * migration to go ahead and uncharge the page before the
-        * account transaction might be complete.
         */
        rcu_read_lock();
 
        if (mem_cgroup_disabled())
-               return NULL;
+               return;
 again:
        memcg = page->mem_cgroup;
        if (unlikely(!memcg))
-               return NULL;
+               return;
 
        if (atomic_read(&memcg->moving_account) <= 0)
-               return memcg;
+               return;
 
        spin_lock_irqsave(&memcg->move_lock, flags);
        if (memcg != page->mem_cgroup) {
@@ -1759,21 +1717,23 @@ again:
        /*
         * When charge migration first begins, we can have locked and
         * unlocked page stat updates happening concurrently.  Track
-        * the task who has the lock for mem_cgroup_end_page_stat().
+        * the task who has the lock for unlock_page_memcg().
         */
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
 
-       return memcg;
+       return;
 }
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
  */
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
 {
+       struct mem_cgroup *memcg = page->mem_cgroup;
+
        if (memcg && memcg->move_lock_task == current) {
                unsigned long flags = memcg->move_lock_flags;
 
@@ -1785,7 +1745,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
        rcu_read_unlock();
 }
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
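
For call sites, the begin/end page-stat transaction becomes a page-keyed lock/unlock pair. A before/after sketch based on the removed kernel-doc, assuming the page-stat helpers are switched to take the page elsewhere in the series:

    /* before */
    memcg = mem_cgroup_begin_page_stat(page);
    if (TestClearPageDirty(page))
    	mem_cgroup_update_page_stat(memcg, MEM_CGROUP_STAT_DIRTY, -1);
    mem_cgroup_end_page_stat(memcg);

    /* after */
    lock_page_memcg(page);
    if (TestClearPageDirty(page))
    	mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_DIRTY, -1);
    unlock_page_memcg(page);
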
@@ -2361,9 +2321,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
        struct page_counter *counter;
        int ret;
 
-       if (!memcg_kmem_online(memcg))
-               return 0;
-
        ret = try_charge(memcg, gfp, nr_pages);
        if (ret)
                return ret;
@@ -2382,10 +2339,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 {
        struct mem_cgroup *memcg;
-       int ret;
+       int ret = 0;
 
        memcg = get_mem_cgroup_from_mm(current->mm);
-       ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+       if (!mem_cgroup_is_root(memcg))
+               ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
        css_put(&memcg->css);
        return ret;
 }
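
The per-charge memcg_kmem_online() test (removed in the previous hunk) becomes unnecessary once kmem is brought up at css creation (see memcg_online_kmem() and mem_cgroup_css_alloc() below); the root group, which is never accounted, is filtered here instead. The cheap gating is still expected to sit in the header wrapper behind a static key; a sketch under that assumption:

    static __always_inline int memcg_kmem_charge(struct page *page,
    					         gfp_t gfp, int order)
    {
    	if (memcg_kmem_enabled())	/* static key: no cost when unused */
    		return __memcg_kmem_charge(page, gfp, order);
    	return 0;
    }
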
@@ -2755,39 +2713,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
        return retval;
 }
 
-static unsigned long tree_stat(struct mem_cgroup *memcg,
-                              enum mem_cgroup_stat_index idx)
+static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
 {
        struct mem_cgroup *iter;
-       unsigned long val = 0;
+       int i;
 
-       for_each_mem_cgroup_tree(iter, memcg)
-               val += mem_cgroup_read_stat(iter, idx);
+       memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
 
-       return val;
+       for_each_mem_cgroup_tree(iter, memcg) {
+               for (i = 0; i < MEMCG_NR_STAT; i++)
+                       stat[i] += mem_cgroup_read_stat(iter, i);
+       }
 }
 
-static unsigned long tree_events(struct mem_cgroup *memcg,
-                                enum mem_cgroup_events_index idx)
+static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
 {
        struct mem_cgroup *iter;
-       unsigned long val = 0;
+       int i;
 
-       for_each_mem_cgroup_tree(iter, memcg)
-               val += mem_cgroup_read_events(iter, idx);
+       memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
 
-       return val;
+       for_each_mem_cgroup_tree(iter, memcg) {
+               for (i = 0; i < MEMCG_NR_EVENTS; i++)
+                       events[i] += mem_cgroup_read_events(iter, i);
+       }
 }
 
 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
-       unsigned long val;
+       unsigned long val = 0;
 
        if (mem_cgroup_is_root(memcg)) {
-               val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
-               val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
-               if (swap)
-                       val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+               struct mem_cgroup *iter;
+
+               for_each_mem_cgroup_tree(iter, memcg) {
+                       val += mem_cgroup_read_stat(iter,
+                                       MEM_CGROUP_STAT_CACHE);
+                       val += mem_cgroup_read_stat(iter,
+                                       MEM_CGROUP_STAT_RSS);
+                       if (swap)
+                               val += mem_cgroup_read_stat(iter,
+                                               MEM_CGROUP_STAT_SWAP);
+               }
        } else {
                if (!swap)
                        val = page_counter_read(&memcg->memory);
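
tree_stat() and tree_events() switch from one hierarchy walk per counter to a single walk that accumulates every counter into a caller-supplied array; memory_stat_show() below consumes them like this (condensed excerpt):

    unsigned long stat[MEMCG_NR_STAT];
    unsigned long events[MEMCG_NR_EVENTS];

    tree_stat(memcg, stat);		/* one walk fills every stat slot */
    tree_events(memcg, events);
    seq_printf(m, "anon %llu\n", (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
    seq_printf(m, "pgfault %lu\n", events[MEM_CGROUP_EVENTS_PGFAULT]);
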
@@ -2853,6 +2820,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
        int memcg_id;
 
+       if (cgroup_memory_nokmem)
+               return 0;
+
        BUG_ON(memcg->kmemcg_id >= 0);
        BUG_ON(memcg->kmem_state);
 
@@ -2873,24 +2843,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
        return 0;
 }
 
-static int memcg_propagate_kmem(struct mem_cgroup *parent,
-                               struct mem_cgroup *memcg)
-{
-       int ret = 0;
-
-       mutex_lock(&memcg_limit_mutex);
-       /*
-        * If the parent cgroup is not kmem-online now, it cannot be
-        * onlined after this point, because it has at least one child
-        * already.
-        */
-       if (memcg_kmem_online(parent) ||
-           (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
-               ret = memcg_online_kmem(memcg);
-       mutex_unlock(&memcg_limit_mutex);
-       return ret;
-}
-
 static void memcg_offline_kmem(struct mem_cgroup *memcg)
 {
        struct cgroup_subsys_state *css;
@@ -2949,10 +2901,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
        }
 }
 #else
-static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
-{
-       return 0;
-}
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
        return 0;
@@ -2968,22 +2916,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
                                   unsigned long limit)
 {
-       int ret = 0;
+       int ret;
 
        mutex_lock(&memcg_limit_mutex);
-       /* Top-level cgroup doesn't propagate from root */
-       if (!memcg_kmem_online(memcg)) {
-               if (cgroup_is_populated(memcg->css.cgroup) ||
-                   (memcg->use_hierarchy && memcg_has_children(memcg)))
-                       ret = -EBUSY;
-               if (ret)
-                       goto out;
-               ret = memcg_online_kmem(memcg);
-               if (ret)
-                       goto out;
-       }
        ret = page_counter_limit(&memcg->kmem, limit);
-out:
        mutex_unlock(&memcg_limit_mutex);
        return ret;
 }
@@ -4234,7 +4170,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
                return &memcg->css;
        }
 
-       error = memcg_propagate_kmem(parent, memcg);
+       error = memcg_online_kmem(memcg);
        if (error)
                goto fail;
 
@@ -4318,9 +4254,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-       mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
-       mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
-       memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
        memcg->low = 0;
        memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4488,7 +4426,7 @@ static int mem_cgroup_move_account(struct page *page,
        VM_BUG_ON(compound && !PageTransHuge(page));
 
        /*
-        * Prevent mem_cgroup_replace_page() from looking at
+        * Prevent mem_cgroup_migrate() from looking at
         * page->mem_cgroup of its source page while we change it.
         */
        ret = -EBUSY;
@@ -4923,9 +4861,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 
        lru_add_drain_all();
        /*
-        * Signal mem_cgroup_begin_page_stat() to take the memcg's
-        * move_lock while we're moving its pages to another memcg.
-        * Then wait for already started RCU-only updates to finish.
+        * Signal lock_page_memcg() to take the memcg's move_lock
+        * while we're moving its pages to another memcg. Then wait
+        * for already started RCU-only updates to finish.
         */
        atomic_inc(&mc.from->moving_account);
        synchronize_rcu();
@@ -5051,6 +4989,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                                 char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned long nr_pages;
        unsigned long high;
        int err;
 
@@ -5061,6 +5000,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
        memcg->high = high;
 
+       nr_pages = page_counter_read(&memcg->memory);
+       if (nr_pages > high)
+               try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+                                            GFP_KERNEL, true);
+
        memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }
@@ -5082,6 +5026,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
                                char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+       bool drained = false;
        unsigned long max;
        int err;
 
@@ -5090,9 +5036,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
-       err = mem_cgroup_resize_limit(memcg, max);
-       if (err)
-               return err;
+       xchg(&memcg->memory.limit, max);
+
+       for (;;) {
+               unsigned long nr_pages = page_counter_read(&memcg->memory);
+
+               if (nr_pages <= max)
+                       break;
+
+               if (signal_pending(current)) {
+                       err = -EINTR;
+                       break;
+               }
+
+               if (!drained) {
+                       drain_all_stock(memcg);
+                       drained = true;
+                       continue;
+               }
+
+               if (nr_reclaims) {
+                       if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
+                                                         GFP_KERNEL, true))
+                               nr_reclaims--;
+                       continue;
+               }
+
+               mem_cgroup_events(memcg, MEMCG_OOM, 1);
+               if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+                       break;
+       }
 
        memcg_wb_domain_size_changed(memcg);
        return nbytes;
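
memory.max is now enforced at write time: the limit is installed with a bare xchg() so it may drop below current usage, and the loop then works the overage down, bailing out on a pending signal or once the OOM killer can no longer find a victim. memory.high above, by contrast, gets a single best-effort reclaim pass and never OOMs. The escalation order, condensed (with `excess` standing in for the current overage):

    drain_all_stock(memcg);				/* 1) flush per-cpu stock      */
    try_to_free_mem_cgroup_pages(memcg, excess,
    				 GFP_KERNEL, true);	/* 2) bounded direct reclaim   */
    mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0);	/* 3) memcg OOM kill, last     */
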
@@ -5113,6 +5086,8 @@ static int memory_events_show(struct seq_file *m, void *v)
 static int memory_stat_show(struct seq_file *m, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+       unsigned long stat[MEMCG_NR_STAT];
+       unsigned long events[MEMCG_NR_EVENTS];
        int i;
 
        /*
@@ -5126,22 +5101,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
         * Current memory state:
         */
 
+       tree_stat(memcg, stat);
+       tree_events(memcg, events);
+
        seq_printf(m, "anon %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
        seq_printf(m, "file %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+       seq_printf(m, "kernel_stack %llu\n",
+                  (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+       seq_printf(m, "slab %llu\n",
+                  (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
+                        stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
        seq_printf(m, "sock %llu\n",
-                  (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+                  (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
 
        seq_printf(m, "file_mapped %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
-                  PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
        seq_printf(m, "file_dirty %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
-                  PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
        seq_printf(m, "file_writeback %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
-                  PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
 
        for (i = 0; i < NR_LRU_LISTS; i++) {
                struct mem_cgroup *mi;
@@ -5153,12 +5133,17 @@ static int memory_stat_show(struct seq_file *m, void *v)
                           mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
        }
 
+       seq_printf(m, "slab_reclaimable %llu\n",
+                  (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+       seq_printf(m, "slab_unreclaimable %llu\n",
+                  (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+
        /* Accumulated memory events */
 
        seq_printf(m, "pgfault %lu\n",
-                  tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+                  events[MEM_CGROUP_EVENTS_PGFAULT]);
        seq_printf(m, "pgmajfault %lu\n",
-                  tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+                  events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
 
        return 0;
 }
@@ -5431,6 +5416,10 @@ static void uncharge_list(struct list_head *page_list)
        struct list_head *next;
        struct page *page;
 
+       /*
+        * Note that the list can be a single page->lru; hence the
+        * do-while loop instead of a simple list_for_each_entry().
+        */
        next = page_list->next;
        do {
                unsigned int nr_pages = 1;
@@ -5517,16 +5506,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
 }
 
 /**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
  *
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
  *
  * Both pages must be locked, @newpage->mapping must be set up.
- * Either or both pages might be on the LRU already.
  */
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
 {
        struct mem_cgroup *memcg;
        unsigned int nr_pages;
@@ -5559,7 +5548,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
                page_counter_charge(&memcg->memsw, nr_pages);
        css_get_many(&memcg->css, nr_pages);
 
-       commit_charge(newpage, memcg, true);
+       commit_charge(newpage, memcg, false);
 
        local_irq_disable();
        mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);