mm: slub: introduce metadata_access_enable()/metadata_access_disable()
index fe376fe..37555ad 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
 #include <linux/proc_fs.h>
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
+#include <linux/kasan.h>
 #include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -467,13 +468,31 @@ static int slub_debug;
 static char *slub_debug_slabs;
 static int disable_higher_order_debug;
 
+/*
+ * slub is about to manipulate internal object metadata.  This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error.  metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+       kasan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+       kasan_enable_current();
+}
+
 /*
  * Object debugging
  */
 static void print_section(char *text, u8 *addr, unsigned int length)
 {
+       metadata_access_enable();
        print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
                        length, 1);
+       metadata_access_disable();
 }
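
For context, the pattern these helpers establish is to bracket every raw metadata access. A minimal illustrative sketch, assuming the slub.c context above (the helper peek_redzone_byte() is hypothetical and not part of this patch; only metadata_access_enable()/metadata_access_disable() and the struct kmem_cache field are real):

static u8 peek_redzone_byte(struct kmem_cache *s, u8 *object)
{
        u8 val;

        metadata_access_enable();       /* tell kasan the access is intentional */
        val = object[s->object_size];   /* first byte past the object proper */
        metadata_access_disable();      /* restore normal kasan checking */

        return val;
}

Without the bracketing, kasan would flag the read as out-of-bounds, since the byte lies beyond the allocated object.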
 
 static struct track *get_track(struct kmem_cache *s, void *object,
@@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object,
                trace.max_entries = TRACK_ADDRS_COUNT;
                trace.entries = p->addrs;
                trace.skip = 3;
+               metadata_access_enable();
                save_stack_trace(&trace);
+               metadata_access_disable();
 
                /* See rant in lockdep.c */
                if (trace.nr_entries != 0 &&
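
The save_stack_trace() call needs the same bracketing because trace.entries points at p->addrs, storage that lives in the object's tracking metadata rather than in the object itself, so the writes performed while saving the trace would otherwise be reported. As a side note, a minimal sketch of the stack_trace API used here (the helper name, buffer handling and skip count are illustrative assumptions, not from this patch):

static unsigned int capture_trace_example(unsigned long *store,
                                          unsigned int max_entries)
{
        struct stack_trace trace = {
                .nr_entries     = 0,
                .max_entries    = max_entries,
                .entries        = store,        /* caller-provided buffer */
                .skip           = 2,            /* drop the innermost frames */
        };

        save_stack_trace(&trace);
        return trace.nr_entries;
}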
@@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
        dump_stack();
 }
 
-static void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
                        u8 *object, char *reason)
 {
        slab_bug(s, "%s", reason);
@@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
        u8 *fault;
        u8 *end;
 
+       metadata_access_enable();
        fault = memchr_inv(start, value, bytes);
+       metadata_access_disable();
        if (!fault)
                return 1;
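
memchr_inv() returns a pointer to the first byte in the range that differs from the expected value, or NULL when every byte matches, which is why a NULL result above means the poison or red zone is intact. A small illustrative sketch of that behaviour (the helper is hypothetical; POISON_INUSE comes from linux/poison.h):

static bool poison_intact_example(void)
{
        u8 buf[8];
        u8 *fault;

        memset(buf, POISON_INUSE, sizeof(buf));
        buf[3] = 0;                     /* simulate a single corrupted byte */

        fault = memchr_inv(buf, POISON_INUSE, sizeof(buf));
        /* fault now points at buf[3]; NULL would have meant no corruption */
        return fault == NULL;
}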
 
@@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
        if (!remainder)
                return 1;
 
+       metadata_access_enable();
        fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+       metadata_access_disable();
        if (!fault)
                return 1;
        while (end > fault && end[-1] == POISON_INUSE)
@@ -2007,6 +2032,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
        int pages;
        int pobjects;
 
+       preempt_disable();
        do {
                pages = 0;
                pobjects = 0;
@@ -2040,6 +2066,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 
        } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
                                                                != oldpage);
+       if (unlikely(!s->cpu_partial)) {
+               unsigned long flags;
+
+               local_irq_save(flags);
+               unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+               local_irq_restore(flags);
+       }
+       preempt_enable();
 #endif
 }
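
The new preempt_disable()/preempt_enable() pair keeps the task on one CPU across both the this_cpu_cmpxchg() retry loop and the this_cpu_ptr()-based unfreeze, so the partial list being unfrozen is the one that was just updated. A stripped-down illustrative sketch of that per-cpu update pattern (hypothetical per-cpu list, not slub's own structures):

struct pcp_node_example {
        struct pcp_node_example *next;
};
static DEFINE_PER_CPU(struct pcp_node_example *, example_head);

static void pcp_push_example(struct pcp_node_example *new)
{
        struct pcp_node_example *old;

        preempt_disable();              /* stay on one CPU for the whole update */
        do {
                old = this_cpu_read(example_head);
                new->next = old;
        } while (this_cpu_cmpxchg(example_head, old, new) != old);
        preempt_enable();
}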
 
@@ -2398,13 +2432,24 @@ redo:
         * reading from one cpu area. That does not matter as long
         * as we end up on the original cpu again when doing the cmpxchg.
         *
-        * Preemption is disabled for the retrieval of the tid because that
-        * must occur from the current processor. We cannot allow rescheduling
-        * on a different processor between the determination of the pointer
-        * and the retrieval of the tid.
+        * We must guarantee that tid and kmem_cache_cpu are retrieved on
+        * the same cpu. They could come from different cpus if CONFIG_PREEMPT
+        * is enabled, so we check that they match and retry if they do not.
         */
-       preempt_disable();
-       c = this_cpu_ptr(s->cpu_slab);
+       do {
+               tid = this_cpu_read(s->cpu_slab->tid);
+               c = raw_cpu_ptr(s->cpu_slab);
+       } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
+
+       /*
+        * The irqless object alloc/free algorithm used here depends on the
+        * order in which cpu_slab's data is fetched. tid must be fetched
+        * before anything else on c so that an object and page associated
+        * with a previous tid are never used with the current tid. Because
+        * tid is fetched first, the object and page may instead belong to
+        * the next tid; in that case the alloc/free request simply fails
+        * and we retry, so there is no problem.
+        */
+       barrier();
 
        /*
         * The transaction ids are globally unique per cpu and per operation on
@@ -2412,8 +2457,6 @@ redo:
         * occurs on the right processor and that there was no operation on the
         * linked list in between.
         */
-       tid = c->tid;
-       preempt_enable();
 
        object = c->freelist;
        page = c->page;
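
Put together, the rewritten fast path samples tid, validates the per-cpu pointer against it, and still relies on the tid-keyed this_cpu_cmpxchg_double() to catch any operation that slipped in between. A condensed illustrative sketch of the allocation side under those assumptions (slow-path fallback, prefetching and statistics omitted; field and helper names follow slub.c):

static void *fastpath_alloc_sketch(struct kmem_cache *s)
{
        struct kmem_cache_cpu *c;
        unsigned long tid;
        void *object;

        do {
                tid = this_cpu_read(s->cpu_slab->tid);  /* sample tid first */
                c = raw_cpu_ptr(s->cpu_slab);
        } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));

        barrier();              /* keep the tid read ahead of c->freelist */

        object = c->freelist;
        if (unlikely(!object || !c->page))
                return NULL;    /* the real code falls back to __slab_alloc() */

        /* Succeeds only if freelist and tid are both still the sampled values. */
        if (!this_cpu_cmpxchg_double(s->cpu_slab->freelist, s->cpu_slab->tid,
                                     object, tid,
                                     get_freepointer(s, object), next_tid(tid)))
                return NULL;    /* lost the race; the real code retries */

        return object;
}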
@@ -2512,7 +2555,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
 #endif
 
 /*
- * Slow patch handling. This may still be called frequently since objects
+ * Slow path handling. This may still be called frequently since objects
  * have a longer lifetime than the cpu slabs in most processing loads.
  *
  * So we still attempt to reduce cache line usage. Just take the slab
@@ -2659,11 +2702,13 @@ redo:
         * data is retrieved via this pointer. If we are on the same cpu
         * during the cmpxchg then the free will succeed.
         */
-       preempt_disable();
-       c = this_cpu_ptr(s->cpu_slab);
+       do {
+               tid = this_cpu_read(s->cpu_slab->tid);
+               c = raw_cpu_ptr(s->cpu_slab);
+       } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
 
-       tid = c->tid;
-       preempt_enable();
+       /* Same as the comment on barrier() in slab_alloc_node() */
+       barrier();
 
        if (likely(page == c->page)) {
                set_freepointer(s, object, c->freelist);
@@ -3347,69 +3392,92 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);
 
+#define SHRINK_PROMOTE_MAX 32
+
 /*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
  *
  * The slabs with the least items are placed last. This results in them
  * being allocated from last, increasing the chance that the last objects
  * are freed in them.
  */
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
 {
        int node;
        int i;
        struct kmem_cache_node *n;
        struct page *page;
        struct page *t;
-       int objects = oo_objects(s->max);
-       struct list_head *slabs_by_inuse =
-               kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+       struct list_head discard;
+       struct list_head promote[SHRINK_PROMOTE_MAX];
        unsigned long flags;
+       int ret = 0;
 
-       if (!slabs_by_inuse)
-               return -ENOMEM;
+       if (deactivate) {
+               /*
+                * Disable empty slabs caching. Used to avoid pinning offline
+                * memory cgroups by kmem pages that can be freed.
+                */
+               s->cpu_partial = 0;
+               s->min_partial = 0;
+
+               /*
+                * s->cpu_partial is checked locklessly (see put_cpu_partial),
+                * so we have to make sure the change is visible.
+                */
+               kick_all_cpus_sync();
+       }
 
        flush_all(s);
        for_each_kmem_cache_node(s, node, n) {
-               if (!n->nr_partial)
-                       continue;
-
-               for (i = 0; i < objects; i++)
-                       INIT_LIST_HEAD(slabs_by_inuse + i);
+               INIT_LIST_HEAD(&discard);
+               for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+                       INIT_LIST_HEAD(promote + i);
 
                spin_lock_irqsave(&n->list_lock, flags);
 
                /*
-                * Build lists indexed by the items in use in each slab.
+                * Build lists of slabs to discard or promote.
                 *
                 * Note that concurrent frees may occur while we hold the
                 * list_lock. page->inuse here is the upper limit.
                 */
                list_for_each_entry_safe(page, t, &n->partial, lru) {
-                       list_move(&page->lru, slabs_by_inuse + page->inuse);
-                       if (!page->inuse)
+                       int free = page->objects - page->inuse;
+
+                       /* Do not reread page->inuse */
+                       barrier();
+
+                       /* We do not keep full slabs on the list */
+                       BUG_ON(free <= 0);
+
+                       if (free == page->objects) {
+                               list_move(&page->lru, &discard);
                                n->nr_partial--;
+                       } else if (free <= SHRINK_PROMOTE_MAX)
+                               list_move(&page->lru, promote + free - 1);
                }
 
                /*
-                * Rebuild the partial list with the slabs filled up most
-                * first and the least used slabs at the end.
+                * Promote the slabs filled up most to the head of the
+                * partial list.
                 */
-               for (i = objects - 1; i > 0; i--)
-                       list_splice(slabs_by_inuse + i, n->partial.prev);
+               for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+                       list_splice(promote + i, &n->partial);
 
                spin_unlock_irqrestore(&n->list_lock, flags);
 
                /* Release empty slabs */
-               list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+               list_for_each_entry_safe(page, t, &discard, lru)
                        discard_slab(s, page);
+
+               if (slabs_node(s, node))
+                       ret = 1;
        }
 
-       kfree(slabs_by_inuse);
-       return 0;
+       return ret;
 }
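
To make the new ordering concrete: a slab with one free object lands in promote[0], one with 30 free objects in promote[29], and a completely unused slab moves to the discard list; because the buckets are spliced onto the list head from index SHRINK_PROMOTE_MAX - 1 down to 0, the fullest slabs end up first on the partial list, while slabs with more than SHRINK_PROMOTE_MAX free objects keep their place behind them. An illustrative, self-contained sketch of that bucket-and-splice idea on plain list_heads (the item type is hypothetical, not slub's struct page):

struct shrink_item_example {
        struct list_head lru;
        int free;                       /* number of unused objects in the slab */
};

static void promote_sketch(struct list_head *partial)
{
        struct list_head promote[SHRINK_PROMOTE_MAX];
        struct shrink_item_example *item, *tmp;
        int i;

        for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
                INIT_LIST_HEAD(promote + i);

        /* Bucket each partially used slab by how many objects are still free. */
        list_for_each_entry_safe(item, tmp, partial, lru)
                if (item->free >= 1 && item->free <= SHRINK_PROMOTE_MAX)
                        list_move(&item->lru, promote + item->free - 1);

        /*
         * Splice the emptiest buckets first and the fullest (free == 1) last,
         * so the fullest slabs finish at the head of the partial list.
         */
        for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
                list_splice(promote + i, partial);
}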
 
 static int slab_mem_going_offline_callback(void *arg)
@@ -3418,7 +3486,7 @@ static int slab_mem_going_offline_callback(void *arg)
 
        mutex_lock(&slab_mutex);
        list_for_each_entry(s, &slab_caches, list)
-               __kmem_cache_shrink(s);
+               __kmem_cache_shrink(s, false);
        mutex_unlock(&slab_mutex);
 
        return 0;
@@ -3566,6 +3634,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
                        p->slab_cache = s;
 #endif
        }
+       slab_init_memcg_params(s);
        list_add(&s->list, &slab_caches);
        return s;
 }
@@ -3624,13 +3693,10 @@ struct kmem_cache *
 __kmem_cache_alias(const char *name, size_t size, size_t align,
                   unsigned long flags, void (*ctor)(void *))
 {
-       struct kmem_cache *s;
+       struct kmem_cache *s, *c;
 
        s = find_mergeable(size, align, flags, name, ctor);
        if (s) {
-               int i;
-               struct kmem_cache *c;
-
                s->refcount++;
 
                /*
@@ -3640,10 +3706,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
                s->object_size = max(s->object_size, (int)size);
                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
 
-               for_each_memcg_cache_index(i) {
-                       c = cache_from_memcg_idx(s, i);
-                       if (!c)
-                               continue;
+               for_each_memcg_cache(c, s) {
                        c->object_size = s->object_size;
                        c->inuse = max_t(int, c->inuse,
                                         ALIGN(size, sizeof(void *)));
@@ -4070,20 +4133,16 @@ static int list_locations(struct kmem_cache *s, char *buf,
 
                if (num_online_cpus() > 1 &&
                                !cpumask_empty(to_cpumask(l->cpus)) &&
-                               len < PAGE_SIZE - 60) {
-                       len += sprintf(buf + len, " cpus=");
-                       len += cpulist_scnprintf(buf + len,
-                                                PAGE_SIZE - len - 50,
-                                                to_cpumask(l->cpus));
-               }
+                               len < PAGE_SIZE - 60)
+                       len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+                                        " cpus=%*pbl",
+                                        cpumask_pr_args(to_cpumask(l->cpus)));
 
                if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
-                               len < PAGE_SIZE - 60) {
-                       len += sprintf(buf + len, " nodes=");
-                       len += nodelist_scnprintf(buf + len,
-                                                 PAGE_SIZE - len - 50,
-                                                 l->nodes);
-               }
+                               len < PAGE_SIZE - 60)
+                       len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+                                        " nodes=%*pbl",
+                                        nodemask_pr_args(&l->nodes));
 
                len += sprintf(buf + len, "\n");
        }
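
The %*pbl conversion prints a bitmap as a ranged list (for example "0-3,8"), with cpumask_pr_args()/nodemask_pr_args() expanding to the bit-count/pointer pair the format expects, so the old manual cpulist_scnprintf()/nodelist_scnprintf() calls and their separate length bookkeeping collapse into a single scnprintf(). A small illustrative use (the function, buffer and mask choice are arbitrary):

static void print_online_cpus_example(void)
{
        char line[64];

        /* Produces something like "cpus=0-3" on a four-CPU machine. */
        scnprintf(line, sizeof(line), "cpus=%*pbl",
                  cpumask_pr_args(cpu_online_mask));
        pr_info("%s\n", line);
}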
@@ -4680,12 +4739,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
 static ssize_t shrink_store(struct kmem_cache *s,
                        const char *buf, size_t length)
 {
-       if (buf[0] == '1') {
-               int rc = kmem_cache_shrink(s);
-
-               if (rc)
-                       return rc;
-       } else
+       if (buf[0] == '1')
+               kmem_cache_shrink(s);
+       else
                return -EINVAL;
        return length;
 }
@@ -4909,7 +4965,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
        err = attribute->store(s, buf, len);
 #ifdef CONFIG_MEMCG_KMEM
        if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
-               int i;
+               struct kmem_cache *c;
 
                mutex_lock(&slab_mutex);
                if (s->max_attr_size < len)
@@ -4932,11 +4988,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
                 * directly either failed or succeeded, in which case we loop
                 * through the descendants with best-effort propagation.
                 */
-               for_each_memcg_cache_index(i) {
-                       struct kmem_cache *c = cache_from_memcg_idx(s, i);
-                       if (c)
-                               attribute->store(c, buf, len);
-               }
+               for_each_memcg_cache(c, s)
+                       attribute->store(c, buf, len);
                mutex_unlock(&slab_mutex);
        }
 #endif
@@ -4953,7 +5006,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
        if (is_root_cache(s))
                return;
 
-       root_cache = s->memcg_params->root_cache;
+       root_cache = s->memcg_params.root_cache;
 
        /*
         * This means this cache had no attribute written. Therefore, no point
@@ -5033,7 +5086,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
 {
 #ifdef CONFIG_MEMCG_KMEM
        if (!is_root_cache(s))
-               return s->memcg_params->root_cache->memcg_kset;
+               return s->memcg_params.root_cache->memcg_kset;
 #endif
        return slab_kset;
 }