mm: slub: introduce metadata_access_enable()/metadata_access_disable()
index fe376fe..37555ad 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
 #include <linux/proc_fs.h>
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
+#include <linux/kasan.h>
 #include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -467,13 +468,31 @@ static int slub_debug;
 static char *slub_debug_slabs;
 static int disable_higher_order_debug;
 
+/*
+ * slub is about to manipulate internal object metadata.  This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error.  metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+       kasan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+       kasan_enable_current();
+}
+
 /*
  * Object debugging
  */
 static void print_section(char *text, u8 *addr, unsigned int length)
 {
+       metadata_access_enable();
        print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
                        length, 1);
+       metadata_access_disable();
 }
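
For context, the pattern these helpers establish is to bracket every raw metadata access. A minimal illustrative sketch, assuming the slub.c context above (the helper peek_redzone_byte() is hypothetical and not part of this patch; only metadata_access_enable()/metadata_access_disable() and the struct kmem_cache field are real):

static u8 peek_redzone_byte(struct kmem_cache *s, u8 *object)
{
        u8 val;

        metadata_access_enable();       /* tell kasan the access is intentional */
        val = object[s->object_size];   /* first byte past the object proper */
        metadata_access_disable();      /* restore normal kasan checking */

        return val;
}

Without the bracketing, kasan would flag the read as out-of-bounds, since the byte lies beyond the allocated object.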
 
 static struct track *get_track(struct kmem_cache *s, void *object,
@@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object,
                trace.max_entries = TRACK_ADDRS_COUNT;
                trace.entries = p->addrs;
                trace.skip = 3;
+               metadata_access_enable();
                save_stack_trace(&trace);
+               metadata_access_disable();
 
                /* See rant in lockdep.c */
                if (trace.nr_entries != 0 &&
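
The save_stack_trace() call needs the same bracketing because trace.entries points at p->addrs, storage that lives in the object's tracking metadata rather than in the object itself, so the writes performed while saving the trace would otherwise be reported. As a side note, a minimal sketch of the stack_trace API used here (the helper name, buffer handling and skip count are illustrative assumptions, not from this patch):

static unsigned int capture_trace_example(unsigned long *store,
                                          unsigned int max_entries)
{
        struct stack_trace trace = {
                .nr_entries     = 0,
                .max_entries    = max_entries,
                .entries        = store,        /* caller-provided buffer */
                .skip           = 2,            /* drop the innermost frames */
        };

        save_stack_trace(&trace);
        return trace.nr_entries;
}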
@@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
        dump_stack();
 }
 
-static void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
                        u8 *object, char *reason)
 {
        slab_bug(s, "%s", reason);
@@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
        u8 *fault;
        u8 *end;
 
+       metadata_access_enable();
        fault = memchr_inv(start, value, bytes);
+       metadata_access_disable();
        if (!fault)
                return 1;
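
memchr_inv() returns a pointer to the first byte in the range that differs from the expected value, or NULL when every byte matches, which is why a NULL result above means the poison or red zone is intact. A small illustrative sketch of that behaviour (the helper is hypothetical; POISON_INUSE comes from linux/poison.h):

static bool poison_intact_example(void)
{
        u8 buf[8];
        u8 *fault;

        memset(buf, POISON_INUSE, sizeof(buf));
        buf[3] = 0;                     /* simulate a single corrupted byte */

        fault = memchr_inv(buf, POISON_INUSE, sizeof(buf));
        /* fault now points at buf[3]; NULL would have meant no corruption */
        return fault == NULL;
}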
 
@@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
        if (!remainder)
                return 1;
 
+       metadata_access_enable();
        fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+       metadata_access_disable();
        if (!fault)
                return 1;
        while (end > fault && end[-1] == POISON_INUSE)
@@ -2007,6 +2032,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
        int pages;
        int pobjects;
 
+       preempt_disable();
        do {
                pages = 0;
                pobjects = 0;
@@ -2040,6 +2066,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 
        } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
                                                                != oldpage);
+       if (unlikely(!s->cpu_partial)) {
+               unsigned long flags;
+
+               local_irq_save(flags);
+               unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+               local_irq_restore(flags);
+       }
+       preempt_enable();
 #endif
 }
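
The new preempt_disable()/preempt_enable() pair keeps the task on one CPU across both the this_cpu_cmpxchg() retry loop and the this_cpu_ptr()-based unfreeze, so the partial list being unfrozen is the one that was just updated. A stripped-down illustrative sketch of that per-cpu update pattern (hypothetical per-cpu list, not slub's own structures):

struct pcp_node_example {
        struct pcp_node_example *next;
};
static DEFINE_PER_CPU(struct pcp_node_example *, example_head);

static void pcp_push_example(struct pcp_node_example *new)
{
        struct pcp_node_example *old;

        preempt_disable();              /* stay on one CPU for the whole update */
        do {
                old = this_cpu_read(example_head);
                new->next = old;
        } while (this_cpu_cmpxchg(example_head, old, new) != old);
        preempt_enable();
}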
 
@@ -2398,13 +2432,24 @@ redo:
         * reading from one cpu area. That does not matter as long
         * as we end up on the original cpu again when doing the cmpxchg.
         *
-        * Preemption is disabled for the retrieval of the tid because that
-        * must occur from the current processor. We cannot allow rescheduling
-        * on a different processor between the determination of the pointer
-        * and the retrieval of the tid.
+        * We must guarantee that tid and kmem_cache_cpu are retrieved on
+        * the same cpu. They could come from different cpus if CONFIG_PREEMPT
+        * is enabled, so we check that they match and retry if they do not.
         */
-       preempt_disable();
-       c = this_cpu_ptr(s->cpu_slab);
+       do {
+               tid = this_cpu_read(s->cpu_slab->tid);
+               c = raw_cpu_ptr(s->cpu_slab);
+       } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
+
+       /*
+        * The irqless object alloc/free algorithm used here depends on the
+        * order in which cpu_slab's data is fetched. tid must be fetched
+        * before anything else on c so that an object and page associated
+        * with a previous tid are never used with the current tid. Because
+        * tid is fetched first, the object and page may instead belong to
+        * the next tid; in that case the alloc/free request simply fails
+        * and we retry, so there is no problem.
+        */
+       barrier();
 
        /*
         * The transaction ids are globally unique per cpu and per operation on
@@ -2412,8 +2457,6 @@ redo:
         * occurs on the right processor and that there was no operation on the
         * linked list in between.
         */
-       tid = c->tid;
-       preempt_enable();
 
        object = c->freelist;
        page = c->page;
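
Put together, the rewritten fast path samples tid, validates the per-cpu pointer against it, and still relies on the tid-keyed this_cpu_cmpxchg_double() to catch any operation that slipped in between. A condensed illustrative sketch of the allocation side under those assumptions (slow-path fallback, prefetching and statistics omitted; field and helper names follow slub.c):

static void *fastpath_alloc_sketch(struct kmem_cache *s)
{
        struct kmem_cache_cpu *c;
        unsigned long tid;
        void *object;

        do {
                tid = this_cpu_read(s->cpu_slab->tid);  /* sample tid first */
                c = raw_cpu_ptr(s->cpu_slab);
        } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));

        barrier();              /* keep the tid read ahead of c->freelist */

        object = c->freelist;
        if (unlikely(!object || !c->page))
                return NULL;    /* the real code falls back to __slab_alloc() */

        /* Succeeds only if freelist and tid are both still the sampled values. */
        if (!this_cpu_cmpxchg_double(s->cpu_slab->freelist, s->cpu_slab->tid,
                                     object, tid,
                                     get_freepointer(s, object), next_tid(tid)))
                return NULL;    /* lost the race; the real code retries */

        return object;
}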
@@ -2512,7 +2555,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
 #endif
 
 /*
- * Slow patch handling. This may still be called frequently since objects
+ * Slow path handling. This may still be called frequently since objects
  * have a longer lifetime than the cpu slabs in most processing loads.
  *
  * So we still attempt to reduce cache line usage. Just take the slab
@@ -2659,11 +2702,13 @@ redo:
         * data is retrieved via this pointer. If we are on the same cpu
         * during the cmpxchg then the free will succeed.
         */
-       preempt_disable();
-       c = this_cpu_ptr(s->cpu_slab);
+       do {
+               tid = this_cpu_read(s->cpu_slab->tid);
+               c = raw_cpu_ptr(s->cpu_slab);
+       } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
 
-       tid = c->tid;
-       preempt_enable();
+       /* Same as the comment on barrier() in slab_alloc_node() */
+       barrier();
 
        if (likely(page == c->page)) {
                set_freepointer(s, object, c->freelist);
@@ -3347,69 +3392,92 @@ void kfree(const void *x)
 }
 EXPORT_SYMBOL(kfree);
 
+#define SHRINK_PROMOTE_MAX 32
+
 /*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
  *
  * The slabs with the least items are placed last. This results in them
  * being allocated from last, increasing the chance that the last objects
  * are freed in them.
  */
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
 {
        int node;
        int i;
        struct kmem_cache_node *n;
        struct page *page;
        struct page *t;
-       int objects = oo_objects(s->max);
-       struct list_head *slabs_by_inuse =
-               kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+       struct list_head discard;
+       struct list_head promote[SHRINK_PROMOTE_MAX];
        unsigned long flags;
+       int ret = 0;
 
-       if (!slabs_by_inuse)
-               return -ENOMEM;
+       if (deactivate) {
+               /*
+                * Disable empty slabs caching. Used to avoid pinning offline
+                * memory cgroups by kmem pages that can be freed.
+                */
+               s->cpu_partial = 0;
+               s->min_partial = 0;
+
+               /*
+                * s->cpu_partial is checked locklessly (see put_cpu_partial),
+                * so we have to make sure the change is visible.
+                */
+               kick_all_cpus_sync();
+       }
 
        flush_all(s);
        for_each_kmem_cache_node(s, node, n) {
-               if (!n->nr_partial)
-                       continue;
-
-               for (i = 0; i < objects; i++)
-                       INIT_LIST_HEAD(slabs_by_inuse + i);
+               INIT_LIST_HEAD(&discard);
+               for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+                       INIT_LIST_HEAD(promote + i);
 
                spin_lock_irqsave(&n->list_lock, flags);
 
                /*
-                * Build lists indexed by the items in use in each slab.
+                * Build lists of slabs to discard or promote.
                 *
                 * Note that concurrent frees may occur while we hold the
                 * list_lock. page->inuse here is the upper limit.
                 */
                list_for_each_entry_safe(page, t, &n->partial, lru) {
-                       list_move(&page->lru, slabs_by_inuse + page->inuse);
-                       if (!page->inuse)
+                       int free = page->objects - page->inuse;
+
+                       /* Do not reread page->inuse */
+                       barrier();
+
+                       /* We do not keep full slabs on the list */
+                       BUG_ON(free <= 0);
+
+                       if (free == page->objects) {
+                               list_move(&page->lru, &discard);
                                n->nr_partial--;
+                       } else if (free <= SHRINK_PROMOTE_MAX)
+                               list_move(&page->lru, promote + free - 1);
                }
 
                /*
-                * Rebuild the partial list with the slabs filled up most
-                * first and the least used slabs at the end.
+                * Promote the slabs filled up most to the head of the
+                * partial list.
                 */
-               for (i = objects - 1; i > 0; i--)
-                       list_splice(slabs_by_inuse + i, n->partial.prev);
+               for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+                       list_splice(promote + i, &n->partial);
 
                spin_unlock_irqrestore(&n->list_lock, flags);
 
                /* Release empty slabs */
-               list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+               list_for_each_entry_safe(page, t, &discard, lru)
                        discard_slab(s, page);
+
+               if (slabs_node(s, node))
+                       ret = 1;
        }
 
-       kfree(slabs_by_inuse);
-       return 0;
+       return ret;
 }
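
To make the new ordering concrete: a slab with one free object lands in promote[0], one with 30 free objects in promote[29], and a completely unused slab moves to the discard list; because the buckets are spliced onto the list head from index SHRINK_PROMOTE_MAX - 1 down to 0, the fullest slabs end up first on the partial list, while slabs with more than SHRINK_PROMOTE_MAX free objects keep their place behind them. An illustrative, self-contained sketch of that bucket-and-splice idea on plain list_heads (the item type is hypothetical, not slub's struct page):

struct shrink_item_example {
        struct list_head lru;
        int free;                       /* number of unused objects in the slab */
};

static void promote_sketch(struct list_head *partial)
{
        struct list_head promote[SHRINK_PROMOTE_MAX];
        struct shrink_item_example *item, *tmp;
        int i;

        for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
                INIT_LIST_HEAD(promote + i);

        /* Bucket each partially used slab by how many objects are still free. */
        list_for_each_entry_safe(item, tmp, partial, lru)
                if (item->free >= 1 && item->free <= SHRINK_PROMOTE_MAX)
                        list_move(&item->lru, promote + item->free - 1);

        /*
         * Splice the emptiest buckets first and the fullest (free == 1) last,
         * so the fullest slabs finish at the head of the partial list.
         */
        for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
                list_splice(promote + i, partial);
}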
 
 static int slab_mem_going_offline_callback(void *arg)
@@ -3418,7 +3486,7 @@ static int slab_mem_going_offline_callback(void *arg)
 
        mutex_lock(&slab_mutex);
        list_for_each_entry(s, &slab_caches, list)
-               __kmem_cache_shrink(s);
+               __kmem_cache_shrink(s, false);
        mutex_unlock(&slab_mutex);
 
        return 0;
@@ -3566,6 +3634,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
                        p->slab_cache = s;
 #endif
        }
+       slab_init_memcg_params(s);
        list_add(&s->list, &slab_caches);
        return s;
 }
@@ -3624,13 +3693,10 @@ struct kmem_cache *
 __kmem_cache_alias(const char *name, size_t size, size_t align,
                   unsigned long flags, void (*ctor)(void *))
 {
-       struct kmem_cache *s;
+       struct kmem_cache *s, *c;
 
        s = find_mergeable(size, align, flags, name, ctor);
        if (s) {
-               int i;
-               struct kmem_cache *c;
-
                s->refcount++;
 
                /*
@@ -3640,10 +3706,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
                s->object_size = max(s->object_size, (int)size);
                s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
 
-               for_each_memcg_cache_index(i) {
-                       c = cache_from_memcg_idx(s, i);
-                       if (!c)
-                               continue;
+               for_each_memcg_cache(c, s) {
                        c->object_size = s->object_size;
                        c->inuse = max_t(int, c->inuse,
                                         ALIGN(size, sizeof(void *)));
@@ -4070,20 +4133,16 @@ static int list_locations(struct kmem_cache *s, char *buf,
 
                if (num_online_cpus() > 1 &&
                                !cpumask_empty(to_cpumask(l->cpus)) &&
-                               len < PAGE_SIZE - 60) {
-                       len += sprintf(buf + len, " cpus=");
-                       len += cpulist_scnprintf(buf + len,
-                                                PAGE_SIZE - len - 50,
-                                                to_cpumask(l->cpus));
-               }
+                               len < PAGE_SIZE - 60)
+                       len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+                                        " cpus=%*pbl",
+                                        cpumask_pr_args(to_cpumask(l->cpus)));
 
                if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
-                               len < PAGE_SIZE - 60) {
-                       len += sprintf(buf + len, " nodes=");
-                       len += nodelist_scnprintf(buf + len,
-                                                 PAGE_SIZE - len - 50,
-                                                 l->nodes);
-               }
+                               len < PAGE_SIZE - 60)
+                       len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+                                        " nodes=%*pbl",
+                                        nodemask_pr_args(&l->nodes));
 
                len += sprintf(buf + len, "\n");
        }
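
The %*pbl conversion prints a bitmap as a ranged list (for example "0-3,8"), with cpumask_pr_args()/nodemask_pr_args() expanding to the bit-count/pointer pair the format expects, so the old manual cpulist_scnprintf()/nodelist_scnprintf() calls and their separate length bookkeeping collapse into a single scnprintf(). A small illustrative use (the function, buffer and mask choice are arbitrary):

static void print_online_cpus_example(void)
{
        char line[64];

        /* Produces something like "cpus=0-3" on a four-CPU machine. */
        scnprintf(line, sizeof(line), "cpus=%*pbl",
                  cpumask_pr_args(cpu_online_mask));
        pr_info("%s\n", line);
}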
@@ -4680,12 +4739,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
 static ssize_t shrink_store(struct kmem_cache *s,
                        const char *buf, size_t length)
 {
-       if (buf[0] == '1') {
-               int rc = kmem_cache_shrink(s);
-
-               if (rc)
-                       return rc;
-       } else
+       if (buf[0] == '1')
+               kmem_cache_shrink(s);
+       else
                return -EINVAL;
        return length;
 }
@@ -4909,7 +4965,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
        err = attribute->store(s, buf, len);
 #ifdef CONFIG_MEMCG_KMEM
        if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
-               int i;
+               struct kmem_cache *c;
 
                mutex_lock(&slab_mutex);
                if (s->max_attr_size < len)
@@ -4932,11 +4988,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
                 * directly either failed or succeeded, in which case we loop
                 * through the descendants with best-effort propagation.
                 */
-               for_each_memcg_cache_index(i) {
-                       struct kmem_cache *c = cache_from_memcg_idx(s, i);
-                       if (c)
-                               attribute->store(c, buf, len);
-               }
+               for_each_memcg_cache(c, s)
+                       attribute->store(c, buf, len);
                mutex_unlock(&slab_mutex);
        }
 #endif
@@ -4953,7 +5006,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
        if (is_root_cache(s))
                return;
 
-       root_cache = s->memcg_params->root_cache;
+       root_cache = s->memcg_params.root_cache;
 
        /*
         * This means this cache had no attribute written. Therefore, no point
@@ -5033,7 +5086,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
 {
 #ifdef CONFIG_MEMCG_KMEM
        if (!is_root_cache(s))
-               return s->memcg_params->root_cache->memcg_kset;
+               return s->memcg_params.root_cache->memcg_kset;
 #endif
        return slab_kset;
 }