mm: memcontrol: zap oom_info_lock

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2..36db05f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
        return (memcg == root_mem_cgroup);
 }
 
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX      USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
-       return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock().  The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
-       struct cgroup_subsys_state *css;
-
-       css = css_from_id(id, &memory_cgrp_subsys);
-       return mem_cgroup_from_css(css);
-}
-
 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
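
Note on the block removed above: mem_cgroup_id() and mem_cgroup_from_id() are presumably relocated to a shared header (include/linux/memcontrol.h is the natural home) rather than dropped, so code outside this file can map between a memcg and its 16-bit id. A minimal sketch of the header variants, assuming the bodies move unchanged:

    #define MEM_CGROUP_ID_MAX	USHRT_MAX

    static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
    {
    	/* css ids are restricted to [1, USHRT_MAX], so a short suffices */
    	return memcg->css.id;
    }

    /* Must be called under rcu_read_lock(); the caller is responsible for
     * css_tryget_online() if the result is used for charging. */
    static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
    {
    	struct cgroup_subsys_state *css;

    	css = css_from_id(id, &memory_cgrp_subsys);
    	return mem_cgroup_from_css(css);
    }
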
@@ -663,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 }
 
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-                                                 int nid,
-                                                 unsigned int lru_mask)
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+                                          int nid, unsigned int lru_mask)
 {
        unsigned long nr = 0;
        int zid;
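
The static qualifier is dropped from mem_cgroup_node_nr_lru_pages(), so some other file presumably starts calling it; that implies a matching declaration in a shared header, roughly:

    /* hypothetical declaration to accompany the now non-static definition */
    unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
    					       int nid, unsigned int lru_mask);
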
@@ -1176,12 +1150,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
  */
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
-       /* oom_info_lock ensures that parallel ooms do not interleave */
-       static DEFINE_MUTEX(oom_info_lock);
        struct mem_cgroup *iter;
        unsigned int i;
 
-       mutex_lock(&oom_info_lock);
        rcu_read_lock();
 
        if (p) {
@@ -1225,7 +1196,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 
                pr_cont("\n");
        }
-       mutex_unlock(&oom_info_lock);
 }
 
 /*
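
Dropping oom_info_lock relies on the OOM report only being reached with oom_lock already held, as in mem_cgroup_out_of_memory() below, so parallel reports cannot interleave anyway. A sketch of the assumed calling context:

    mutex_lock(&oom_lock);
    mem_cgroup_print_oom_info(memcg, p);	/* already serialized by oom_lock */
    mutex_unlock(&oom_lock);
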
@@ -1262,7 +1232,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
        return limit;
 }
 
-static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                     int order)
 {
        struct oom_control oc = {
@@ -1340,6 +1310,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
        }
 unlock:
        mutex_unlock(&oom_lock);
+       return chosen;
 }
 
 #if MAX_NUMNODES > 1
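
mem_cgroup_out_of_memory() now reports whether a victim was actually chosen, which lets the new memory.max write path further down stop retrying once the OOM killer has nothing left to kill:

    /* from the memory_max_write() hunk below */
    if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
    	break;	/* no victim found; applying more pressure is pointless */
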
@@ -1709,19 +1680,13 @@ cleanup:
 }
 
 /**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
  *
- *   memcg = mem_cgroup_begin_page_stat(page);
- *   if (TestClearPageState(page))
- *     mem_cgroup_update_page_stat(memcg, state, -1);
- *   mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
  */
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
 {
        struct mem_cgroup *memcg;
        unsigned long flags;
@@ -1730,25 +1695,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
         * The RCU lock is held throughout the transaction.  The fast
         * path can get away without acquiring the memcg->move_lock
         * because page moving starts with an RCU grace period.
-        *
-        * The RCU lock also protects the memcg from being freed when
-        * the page state that is going to change is the only thing
-        * preventing the page from being uncharged.
-        * E.g. end-writeback clearing PageWriteback(), which allows
-        * migration to go ahead and uncharge the page before the
-        * account transaction might be complete.
         */
        rcu_read_lock();
 
        if (mem_cgroup_disabled())
-               return NULL;
+               return;
 again:
        memcg = page->mem_cgroup;
        if (unlikely(!memcg))
-               return NULL;
+               return;
 
        if (atomic_read(&memcg->moving_account) <= 0)
-               return memcg;
+               return;
 
        spin_lock_irqsave(&memcg->move_lock, flags);
        if (memcg != page->mem_cgroup) {
@@ -1759,21 +1717,23 @@ again:
        /*
         * When charge migration first begins, we can have locked and
         * unlocked page stat updates happening concurrently.  Track
-        * the task who has the lock for mem_cgroup_end_page_stat().
+        * the task who has the lock for unlock_page_memcg().
         */
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
 
-       return memcg;
+       return;
 }
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
  */
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
 {
+       struct mem_cgroup *memcg = page->mem_cgroup;
+
        if (memcg && memcg->move_lock_task == current) {
                unsigned long flags = memcg->move_lock_flags;
 
@@ -1785,7 +1745,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
        rcu_read_unlock();
 }
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
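
For call sites, the begin/end page-stat transaction becomes a page-keyed lock/unlock pair. A before/after sketch based on the removed kernel-doc, assuming the page-stat helpers are switched to take the page elsewhere in the series:

    /* before */
    memcg = mem_cgroup_begin_page_stat(page);
    if (TestClearPageDirty(page))
    	mem_cgroup_update_page_stat(memcg, MEM_CGROUP_STAT_DIRTY, -1);
    mem_cgroup_end_page_stat(memcg);

    /* after */
    lock_page_memcg(page);
    if (TestClearPageDirty(page))
    	mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_DIRTY, -1);
    unlock_page_memcg(page);
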
@@ -2361,9 +2321,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
        struct page_counter *counter;
        int ret;
 
-       if (!memcg_kmem_online(memcg))
-               return 0;
-
        ret = try_charge(memcg, gfp, nr_pages);
        if (ret)
                return ret;
@@ -2382,10 +2339,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 {
        struct mem_cgroup *memcg;
-       int ret;
+       int ret = 0;
 
        memcg = get_mem_cgroup_from_mm(current->mm);
-       ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+       if (!mem_cgroup_is_root(memcg))
+               ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
        css_put(&memcg->css);
        return ret;
 }
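
The per-charge memcg_kmem_online() test (removed in the previous hunk) becomes unnecessary once kmem is brought up at css creation (see memcg_online_kmem() and mem_cgroup_css_alloc() below); the root group, which is never accounted, is filtered here instead. The cheap gating is still expected to sit in the header wrapper behind a static key; a sketch under that assumption:

    static __always_inline int memcg_kmem_charge(struct page *page,
    					         gfp_t gfp, int order)
    {
    	if (memcg_kmem_enabled())	/* static key: no cost when unused */
    		return __memcg_kmem_charge(page, gfp, order);
    	return 0;
    }
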
@@ -2755,39 +2713,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
        return retval;
 }
 
-static unsigned long tree_stat(struct mem_cgroup *memcg,
-                              enum mem_cgroup_stat_index idx)
+static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
 {
        struct mem_cgroup *iter;
-       unsigned long val = 0;
+       int i;
 
-       for_each_mem_cgroup_tree(iter, memcg)
-               val += mem_cgroup_read_stat(iter, idx);
+       memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
 
-       return val;
+       for_each_mem_cgroup_tree(iter, memcg) {
+               for (i = 0; i < MEMCG_NR_STAT; i++)
+                       stat[i] += mem_cgroup_read_stat(iter, i);
+       }
 }
 
-static unsigned long tree_events(struct mem_cgroup *memcg,
-                                enum mem_cgroup_events_index idx)
+static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
 {
        struct mem_cgroup *iter;
-       unsigned long val = 0;
+       int i;
 
-       for_each_mem_cgroup_tree(iter, memcg)
-               val += mem_cgroup_read_events(iter, idx);
+       memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
 
-       return val;
+       for_each_mem_cgroup_tree(iter, memcg) {
+               for (i = 0; i < MEMCG_NR_EVENTS; i++)
+                       events[i] += mem_cgroup_read_events(iter, i);
+       }
 }
 
 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
-       unsigned long val;
+       unsigned long val = 0;
 
        if (mem_cgroup_is_root(memcg)) {
-               val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
-               val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
-               if (swap)
-                       val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+               struct mem_cgroup *iter;
+
+               for_each_mem_cgroup_tree(iter, memcg) {
+                       val += mem_cgroup_read_stat(iter,
+                                       MEM_CGROUP_STAT_CACHE);
+                       val += mem_cgroup_read_stat(iter,
+                                       MEM_CGROUP_STAT_RSS);
+                       if (swap)
+                               val += mem_cgroup_read_stat(iter,
+                                               MEM_CGROUP_STAT_SWAP);
+               }
        } else {
                if (!swap)
                        val = page_counter_read(&memcg->memory);
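
tree_stat() and tree_events() switch from one hierarchy walk per counter to a single walk that accumulates every counter into a caller-supplied array; memory_stat_show() below consumes them like this (condensed excerpt):

    unsigned long stat[MEMCG_NR_STAT];
    unsigned long events[MEMCG_NR_EVENTS];

    tree_stat(memcg, stat);		/* one walk fills every stat slot */
    tree_events(memcg, events);
    seq_printf(m, "anon %llu\n", (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
    seq_printf(m, "pgfault %lu\n", events[MEM_CGROUP_EVENTS_PGFAULT]);
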
@@ -2853,6 +2820,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
        int memcg_id;
 
+       if (cgroup_memory_nokmem)
+               return 0;
+
        BUG_ON(memcg->kmemcg_id >= 0);
        BUG_ON(memcg->kmem_state);
 
@@ -2873,24 +2843,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
        return 0;
 }
 
-static int memcg_propagate_kmem(struct mem_cgroup *parent,
-                               struct mem_cgroup *memcg)
-{
-       int ret = 0;
-
-       mutex_lock(&memcg_limit_mutex);
-       /*
-        * If the parent cgroup is not kmem-online now, it cannot be
-        * onlined after this point, because it has at least one child
-        * already.
-        */
-       if (memcg_kmem_online(parent) ||
-           (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
-               ret = memcg_online_kmem(memcg);
-       mutex_unlock(&memcg_limit_mutex);
-       return ret;
-}
-
 static void memcg_offline_kmem(struct mem_cgroup *memcg)
 {
        struct cgroup_subsys_state *css;
@@ -2949,10 +2901,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
        }
 }
 #else
-static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
-{
-       return 0;
-}
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
        return 0;
@@ -2968,22 +2916,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
                                   unsigned long limit)
 {
-       int ret = 0;
+       int ret;
 
        mutex_lock(&memcg_limit_mutex);
-       /* Top-level cgroup doesn't propagate from root */
-       if (!memcg_kmem_online(memcg)) {
-               if (cgroup_is_populated(memcg->css.cgroup) ||
-                   (memcg->use_hierarchy && memcg_has_children(memcg)))
-                       ret = -EBUSY;
-               if (ret)
-                       goto out;
-               ret = memcg_online_kmem(memcg);
-               if (ret)
-                       goto out;
-       }
        ret = page_counter_limit(&memcg->kmem, limit);
-out:
        mutex_unlock(&memcg_limit_mutex);
        return ret;
 }
@@ -4234,7 +4170,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
                return &memcg->css;
        }
 
-       error = memcg_propagate_kmem(parent, memcg);
+       error = memcg_online_kmem(memcg);
        if (error)
                goto fail;
 
@@ -4318,9 +4254,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-       mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
-       mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
-       memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
+       page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
        memcg->low = 0;
        memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4488,7 +4426,7 @@ static int mem_cgroup_move_account(struct page *page,
        VM_BUG_ON(compound && !PageTransHuge(page));
 
        /*
-        * Prevent mem_cgroup_replace_page() from looking at
+        * Prevent mem_cgroup_migrate() from looking at
         * page->mem_cgroup of its source page while we change it.
         */
        ret = -EBUSY;
@@ -4923,9 +4861,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
 
        lru_add_drain_all();
        /*
-        * Signal mem_cgroup_begin_page_stat() to take the memcg's
-        * move_lock while we're moving its pages to another memcg.
-        * Then wait for already started RCU-only updates to finish.
+        * Signal lock_page_memcg() to take the memcg's move_lock
+        * while we're moving its pages to another memcg. Then wait
+        * for already started RCU-only updates to finish.
         */
        atomic_inc(&mc.from->moving_account);
        synchronize_rcu();
@@ -5051,6 +4989,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                                 char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned long nr_pages;
        unsigned long high;
        int err;
 
@@ -5061,6 +5000,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
        memcg->high = high;
 
+       nr_pages = page_counter_read(&memcg->memory);
+       if (nr_pages > high)
+               try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+                                            GFP_KERNEL, true);
+
        memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }
@@ -5082,6 +5026,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
                                char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+       bool drained = false;
        unsigned long max;
        int err;
 
@@ -5090,9 +5036,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
-       err = mem_cgroup_resize_limit(memcg, max);
-       if (err)
-               return err;
+       xchg(&memcg->memory.limit, max);
+
+       for (;;) {
+               unsigned long nr_pages = page_counter_read(&memcg->memory);
+
+               if (nr_pages <= max)
+                       break;
+
+               if (signal_pending(current)) {
+                       err = -EINTR;
+                       break;
+               }
+
+               if (!drained) {
+                       drain_all_stock(memcg);
+                       drained = true;
+                       continue;
+               }
+
+               if (nr_reclaims) {
+                       if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
+                                                         GFP_KERNEL, true))
+                               nr_reclaims--;
+                       continue;
+               }
+
+               mem_cgroup_events(memcg, MEMCG_OOM, 1);
+               if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+                       break;
+       }
 
        memcg_wb_domain_size_changed(memcg);
        return nbytes;
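
memory.max is now enforced at write time: the limit is installed with a bare xchg() so it may drop below current usage, and the loop then works the overage down, bailing out on a pending signal or once the OOM killer can no longer find a victim. memory.high above, by contrast, gets a single best-effort reclaim pass and never OOMs. The escalation order, condensed (with `excess` standing in for the current overage):

    drain_all_stock(memcg);				/* 1) flush per-cpu stock      */
    try_to_free_mem_cgroup_pages(memcg, excess,
    				 GFP_KERNEL, true);	/* 2) bounded direct reclaim   */
    mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0);	/* 3) memcg OOM kill, last     */
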
@@ -5113,6 +5086,8 @@ static int memory_events_show(struct seq_file *m, void *v)
 static int memory_stat_show(struct seq_file *m, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+       unsigned long stat[MEMCG_NR_STAT];
+       unsigned long events[MEMCG_NR_EVENTS];
        int i;
 
        /*
@@ -5126,22 +5101,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
         * Current memory state:
         */
 
+       tree_stat(memcg, stat);
+       tree_events(memcg, events);
+
        seq_printf(m, "anon %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
        seq_printf(m, "file %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+       seq_printf(m, "kernel_stack %llu\n",
+                  (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+       seq_printf(m, "slab %llu\n",
+                  (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
+                        stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
        seq_printf(m, "sock %llu\n",
-                  (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+                  (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
 
        seq_printf(m, "file_mapped %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
-                  PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
        seq_printf(m, "file_dirty %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
-                  PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
        seq_printf(m, "file_writeback %llu\n",
-                  (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
-                  PAGE_SIZE);
+                  (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
 
        for (i = 0; i < NR_LRU_LISTS; i++) {
                struct mem_cgroup *mi;
@@ -5153,12 +5133,17 @@ static int memory_stat_show(struct seq_file *m, void *v)
                           mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
        }
 
+       seq_printf(m, "slab_reclaimable %llu\n",
+                  (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+       seq_printf(m, "slab_unreclaimable %llu\n",
+                  (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+
        /* Accumulated memory events */
 
        seq_printf(m, "pgfault %lu\n",
-                  tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+                  events[MEM_CGROUP_EVENTS_PGFAULT]);
        seq_printf(m, "pgmajfault %lu\n",
-                  tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+                  events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
 
        return 0;
 }
@@ -5431,6 +5416,10 @@ static void uncharge_list(struct list_head *page_list)
        struct list_head *next;
        struct page *page;
 
+       /*
+        * Note that the list can be a single page->lru; hence the
+        * do-while loop instead of a simple list_for_each_entry().
+        */
        next = page_list->next;
        do {
                unsigned int nr_pages = 1;
@@ -5517,16 +5506,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
 }
 
 /**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
  *
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
  *
  * Both pages must be locked, @newpage->mapping must be set up.
- * Either or both pages might be on the LRU already.
  */
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
 {
        struct mem_cgroup *memcg;
        unsigned int nr_pages;
@@ -5559,7 +5548,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
                page_counter_charge(&memcg->memsw, nr_pages);
        css_get_many(&memcg->css, nr_pages);
 
-       commit_charge(newpage, memcg, true);
+       commit_charge(newpage, memcg, false);
 
        local_irq_disable();
        mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);