#include <linux/proc_fs.h>
#include <linux/notifier.h>
#include <linux/seq_file.h>
+#include <linux/kasan.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
static char *slub_debug_slabs;
static int disable_higher_order_debug;
+/*
+ * slub is about to manipulate internal object metadata. This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error. metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+ kasan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+ kasan_enable_current();
+}
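A minimal usage sketch of the pair (hypothetical metadata_all_match() helper, not part of this patch): any access to poison bytes, red zones or tracking data outside the object proper is bracketed by enable/disable, so KASAN ignores only that access:

	static bool metadata_all_match(u8 *meta, u8 expected, size_t len)
	{
		u8 *fault;

		metadata_access_enable();
		/* memchr_inv() returns NULL when every byte equals 'expected' */
		fault = memchr_inv(meta, expected, len);
		metadata_access_disable();

		return fault == NULL;
	}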
+
/*
* Object debugging
*/
static void print_section(char *text, u8 *addr, unsigned int length)
{
+ metadata_access_enable();
print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
length, 1);
+ metadata_access_disable();
}
static struct track *get_track(struct kmem_cache *s, void *object,
trace.max_entries = TRACK_ADDRS_COUNT;
trace.entries = p->addrs;
trace.skip = 3;
+ metadata_access_enable();
save_stack_trace(&trace);
+ metadata_access_disable();
/* See rant in lockdep.c */
if (trace.nr_entries != 0 &&
dump_stack();
}
-static void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
slab_bug(s, "%s", reason);
u8 *fault;
u8 *end;
+ metadata_access_enable();
fault = memchr_inv(start, value, bytes);
+ metadata_access_disable();
if (!fault)
return 1;
if (!remainder)
return 1;
+ metadata_access_enable();
fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+ metadata_access_disable();
if (!fault)
return 1;
while (end > fault && end[-1] == POISON_INUSE)
int pages;
int pobjects;
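+ /*
+ * Stay on one CPU across the this_cpu_cmpxchg() loop and the
+ * this_cpu_ptr() check below: if s->cpu_partial was just set to 0,
+ * the page queued here must be unfrozen from this CPU's list.
+ */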
+ preempt_disable();
do {
pages = 0;
pobjects = 0;
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
+ if (unlikely(!s->cpu_partial)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ local_irq_restore(flags);
+ }
+ preempt_enable();
#endif
}
}
EXPORT_SYMBOL(kfree);
+#define SHRINK_PROMOTE_MAX 32
+
/*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
*
* The slabs with the least items are placed last. This results in them
 * being allocated from last, increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
{
int node;
int i;
struct kmem_cache_node *n;
struct page *page;
struct page *t;
- int objects = oo_objects(s->max);
- struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ struct list_head discard;
+ struct list_head promote[SHRINK_PROMOTE_MAX];
unsigned long flags;
+ int ret = 0;
- if (!slabs_by_inuse)
- return -ENOMEM;
+ if (deactivate) {
+ /*
+ * Disable caching of empty slabs. Used to avoid pinning offline
+ * memory cgroups by kmem pages that can be freed.
+ */
+ s->cpu_partial = 0;
+ s->min_partial = 0;
+
+ /*
+ * s->cpu_partial is checked locklessly (see put_cpu_partial),
+ * so we have to make sure the change is visible.
+ */
+ kick_all_cpus_sync();
+ }
flush_all(s);
for_each_kmem_cache_node(s, node, n) {
- if (!n->nr_partial)
- continue;
-
- for (i = 0; i < objects; i++)
- INIT_LIST_HEAD(slabs_by_inuse + i);
+ INIT_LIST_HEAD(&discard);
+ for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+ INIT_LIST_HEAD(promote + i);
spin_lock_irqsave(&n->list_lock, flags);
/*
- * Build lists indexed by the items in use in each slab.
+ * Build lists of slabs to discard or promote.
*
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- list_move(&page->lru, slabs_by_inuse + page->inuse);
- if (!page->inuse)
+ int free = page->objects - page->inuse;
+
+ /* Do not reread page->inuse */
+ barrier();
+
+ /* We do not keep full slabs on the list */
+ BUG_ON(free <= 0);
+
+ if (free == page->objects) {
+ list_move(&page->lru, &discard);
n->nr_partial--;
+ } else if (free <= SHRINK_PROMOTE_MAX)
+ list_move(&page->lru, promote + free - 1);
}
/*
- * Rebuild the partial list with the slabs filled up most
- * first and the least used slabs at the end.
+ * Promote the slabs filled up most to the head of the
+ * partial list.
*/
- for (i = objects - 1; i > 0; i--)
- list_splice(slabs_by_inuse + i, n->partial.prev);
+ for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+ list_splice(promote + i, &n->partial);
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ list_for_each_entry_safe(page, t, &discard, lru)
discard_slab(s, page);
+
+ if (slabs_node(s, node))
+ ret = 1;
}
- kfree(slabs_by_inuse);
- return 0;
+ return ret;
}
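The promote[] machinery above is effectively a bucket sort keyed by the number of free objects in each slab: a slab with N free objects lands in promote[N - 1], and splicing the buckets to the list head from the highest index down leaves the fullest slabs first. A standalone userspace sketch of that ordering (made-up free counts, kernel list plumbing stripped away):

	#include <stdio.h>

	#define SHRINK_PROMOTE_MAX 32

	int main(void)
	{
		/* free-object counts of some partial slabs, in arbitrary order */
		int free_counts[] = { 5, 1, 12, 1, 3 };
		int buckets[SHRINK_PROMOTE_MAX] = { 0 };
		int i, f;

		/* mirrors list_move(&page->lru, promote + free - 1) */
		for (i = 0; i < 5; i++)
			buckets[free_counts[i] - 1]++;

		/*
		 * Splicing promote[i] to the head for i = MAX-1 .. 0 means the
		 * lowest-free (fullest) bucket is spliced last and ends up first.
		 */
		printf("partial list order by free count:");
		for (f = 1; f <= SHRINK_PROMOTE_MAX; f++)
			for (i = 0; i < buckets[f - 1]; i++)
				printf(" %d", f);
		printf("\n");	/* -> 1 1 3 5 12 */
		return 0;
	}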
static int slab_mem_going_offline_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ __kmem_cache_shrink(s, false);
mutex_unlock(&slab_mutex);
return 0;
p->slab_cache = s;
#endif
}
+ slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
return s;
}
__kmem_cache_alias(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s;
+ struct kmem_cache *s, *c;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
- int i;
- struct kmem_cache *c;
-
s->refcount++;
/*
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
+ for_each_memcg_cache(c, s) {
c->object_size = s->object_size;
c->inuse = max_t(int, c->inuse,
ALIGN(size, sizeof(void *)));
if (num_online_cpus() > 1 &&
!cpumask_empty(to_cpumask(l->cpus)) &&
- len < PAGE_SIZE - 60) {
- len += sprintf(buf + len, " cpus=");
- len += cpulist_scnprintf(buf + len,
- PAGE_SIZE - len - 50,
- to_cpumask(l->cpus));
- }
+ len < PAGE_SIZE - 60)
+ len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+ " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
- len < PAGE_SIZE - 60) {
- len += sprintf(buf + len, " nodes=");
- len += nodelist_scnprintf(buf + len,
- PAGE_SIZE - len - 50,
- l->nodes);
- }
+ len < PAGE_SIZE - 60)
+ len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+ " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
len += sprintf(buf + len, "\n");
}
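%*pbl is the kernel's bitmap printf extension: it prints the mask as a ranged list such as 0-3,8, and cpumask_pr_args()/nodemask_pr_args() expand to the bit-count/pointer pair the specifier expects, so one scnprintf() call replaces the old sprintf() plus cpulist_scnprintf()/nodelist_scnprintf() sequence. A minimal sketch of the same pattern (hypothetical buffer size, any cpumask works):

	char buf[64];

	/* produces e.g. "cpus=0-3,6" for a mask with those bits set */
	scnprintf(buf, sizeof(buf), "cpus=%*pbl",
		  cpumask_pr_args(cpu_online_mask));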
static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- if (buf[0] == '1') {
- int rc = kmem_cache_shrink(s);
-
- if (rc)
- return rc;
- } else
+ if (buf[0] == '1')
+ kmem_cache_shrink(s);
+ else
return -EINVAL;
return length;
}
err = attribute->store(s, buf, len);
#ifdef CONFIG_MEMCG_KMEM
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- int i;
+ struct kmem_cache *c;
mutex_lock(&slab_mutex);
if (s->max_attr_size < len)
* directly either failed or succeeded, in which case we loop
* through the descendants with best-effort propagation.
*/
- for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
- if (c)
- attribute->store(c, buf, len);
- }
+ for_each_memcg_cache(c, s)
+ attribute->store(c, buf, len);
mutex_unlock(&slab_mutex);
}
#endif
if (is_root_cache(s))
return;
- root_cache = s->memcg_params->root_cache;
+ root_cache = s->memcg_params.root_cache;
/*
 * This means this cache had no attribute written. Therefore, no point
{
#ifdef CONFIG_MEMCG_KMEM
if (!is_root_cache(s))
- return s->memcg_params->root_cache->memcg_kset;
+ return s->memcg_params.root_cache->memcg_kset;
#endif
return slab_kset;
}