diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 3bd8b0a..08bd7a3 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
  */
 
 /*
- * This allocator is designed for use with zram. Thus, the allocator is
- * supposed to work well under low memory conditions. In particular, it
- * never attempts higher order page allocation which is very likely to
- * fail under memory pressure. On the other hand, if we just use single
- * (0-order) pages, it would suffer from very high fragmentation --
- * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
- * This was one of the major issues with its predecessor (xvmalloc).
- *
- * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
- * and links them together using various 'struct page' fields. These linked
- * pages act as a single higher-order page i.e. an object can span 0-order
- * page boundaries. The code refers to these linked pages as a single entity
- * called zspage.
- *
- * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
- * since this satisfies the requirements of all its current users (in the
- * worst case, page is incompressible and is thus stored "as-is" i.e. in
- * uncompressed form). For allocation requests larger than this size, failure
- * is returned (see zs_malloc).
- *
- * Additionally, zs_malloc() does not return a dereferenceable pointer.
- * Instead, it returns an opaque handle (unsigned long) which encodes actual
- * location of the allocated object. The reason for this indirection is that
- * zsmalloc does not keep zspages permanently mapped since that would cause
- * issues on 32-bit systems where the VA region for kernel space mappings
- * is very small. So, before using the allocating memory, the object has to
- * be mapped using zs_map_object() to get a usable pointer and subsequently
- * unmapped using zs_unmap_object().
- *
  * Following is how we use various fields and flags of underlying
  * struct page(s) to form a zspage.
  *
@@ -57,6 +28,8 @@
  *
  *     page->private (union with page->first_page): refers to the
  *             component page after the first page
+ *             If the page is first_page for huge object, it stores handle.
+ *             Look at size_class->huge.
  *     page->freelist: points to the first free object in zspage.
  *             Free objects are linked together using in-place
  *             metadata.
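
An aside, not part of the patch: the reason huge objects need page->private at all is space. Every other class prepends a small handle header to each chunk, but a class whose objects already fill a page has no room for one. Roughly, assuming 4K pages and ZS_HANDLE_SIZE == sizeof(unsigned long):

	/* Illustration of chunk layout (not from the patch):
	 *
	 *   normal class:  | handle | payload ...      |  header kept in-object,
	 *                                                  skipped by zs_map_object()
	 *   huge class:    | payload, up to PAGE_SIZE  |  no room for a header,
	 *                                                  handle kept in page->private
	 */
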
 #define ZS_MIN_ALLOC_SIZE \
        MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
 /* each chunk includes extra space to keep handle */
-#define ZS_MAX_ALLOC_SIZE      (PAGE_SIZE + ZS_HANDLE_SIZE)
+#define ZS_MAX_ALLOC_SIZE      PAGE_SIZE
 
 /*
  * On systems with 4K page size, this gives 255 size classes! There is a
@@ -195,6 +168,8 @@ enum fullness_group {
 enum zs_stat_type {
        OBJ_ALLOCATED,
        OBJ_USED,
+       CLASS_ALMOST_FULL,
+       CLASS_ALMOST_EMPTY,
        NR_ZS_STAT_TYPE,
 };
 
@@ -239,6 +214,8 @@ struct size_class {
 
        /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
        int pages_per_zspage;
+       /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+       bool huge;
 
 #ifdef CONFIG_ZSMALLOC_STAT
        struct zs_size_stat stats;
@@ -300,6 +277,7 @@ struct mapping_area {
 #endif
        char *vm_addr; /* address of kmap_atomic()'ed pages */
        enum zs_mapmode vm_mm; /* mapping mode */
+       bool huge;
 };
 
 static int create_handle_cache(struct zs_pool *pool)
@@ -407,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = {
 MODULE_ALIAS("zpool-zsmalloc");
 #endif /* CONFIG_ZPOOL */
 
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+       return pages_per_zspage * PAGE_SIZE / size;
+}
+
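
A quick worked example of the helper above (illustrative values, assuming a 4K PAGE_SIZE):

	/* get_maxobj_per_zspage(size, pages_per_zspage)
	 *	= pages_per_zspage * PAGE_SIZE / size
	 *
	 *   size = 2048, pages_per_zspage = 1:  1 * 4096 / 2048 = 2 objects
	 *   size = 3072, pages_per_zspage = 3:  3 * 4096 / 3072 = 4 objects (no wastage)
	 *   size = 4096, pages_per_zspage = 1:  1 * 4096 / 4096 = 1 object  (a "huge" class)
	 */
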
 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
 
@@ -457,9 +440,182 @@ static int get_size_class_index(int size)
                idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
                                ZS_SIZE_CLASS_DELTA);
 
-       return idx;
+       return min(zs_size_classes - 1, idx);
+}
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+                               enum zs_stat_type type, unsigned long cnt)
+{
+       class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+                               enum zs_stat_type type, unsigned long cnt)
+{
+       class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+                               enum zs_stat_type type)
+{
+       return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+       if (!debugfs_initialized())
+               return -ENODEV;
+
+       zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+       if (!zs_stat_root)
+               return -ENOMEM;
+
+       return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+       debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+       int i;
+       struct zs_pool *pool = s->private;
+       struct size_class *class;
+       int objs_per_zspage;
+       unsigned long class_almost_full, class_almost_empty;
+       unsigned long obj_allocated, obj_used, pages_used;
+       unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
+       unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+       seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+                       "class", "size", "almost_full", "almost_empty",
+                       "obj_allocated", "obj_used", "pages_used",
+                       "pages_per_zspage");
+
+       for (i = 0; i < zs_size_classes; i++) {
+               class = pool->size_class[i];
+
+               if (class->index != i)
+                       continue;
+
+               spin_lock(&class->lock);
+               class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
+               class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
+               obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+               obj_used = zs_stat_get(class, OBJ_USED);
+               spin_unlock(&class->lock);
+
+               objs_per_zspage = get_maxobj_per_zspage(class->size,
+                               class->pages_per_zspage);
+               pages_used = obj_allocated / objs_per_zspage *
+                               class->pages_per_zspage;
+
+               seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+                       i, class->size, class_almost_full, class_almost_empty,
+                       obj_allocated, obj_used, pages_used,
+                       class->pages_per_zspage);
+
+               total_class_almost_full += class_almost_full;
+               total_class_almost_empty += class_almost_empty;
+               total_objs += obj_allocated;
+               total_used_objs += obj_used;
+               total_pages += pages_used;
+       }
+
+       seq_puts(s, "\n");
+       seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+                       "Total", "", total_class_almost_full,
+                       total_class_almost_empty, total_objs,
+                       total_used_objs, total_pages);
+
+       return 0;
 }
 
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+       .open           = zs_stats_size_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+       struct dentry *entry;
+
+       if (!zs_stat_root)
+               return -ENODEV;
+
+       entry = debugfs_create_dir(name, zs_stat_root);
+       if (!entry) {
+               pr_warn("debugfs dir <%s> creation failed\n", name);
+               return -ENOMEM;
+       }
+       pool->stat_dentry = entry;
+
+       entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
+                       pool->stat_dentry, pool, &zs_stat_size_ops);
+       if (!entry) {
+               pr_warn("%s: debugfs file entry <%s> creation failed\n",
+                               name, "classes");
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+       debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+                               enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+                               enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+                               enum zs_stat_type type)
+{
+       return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+       return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+       return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
+
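
For orientation (assuming debugfs is mounted at its usual /sys/kernel/debug), the dentries created by zs_stat_init() and zs_pool_stat_create() above land here:

	/*
	 *   /sys/kernel/debug/zsmalloc/                 zs_stat_root
	 *   /sys/kernel/debug/zsmalloc/<name>/          pool->stat_dentry
	 *   /sys/kernel/debug/zsmalloc/<name>/classes   read-only file backed by
	 *                                               zs_stats_size_show()
	 */
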
 /*
  * For each size class, zspages are divided into different groups
  * depending on how "full" they are. This was done so that we could
@@ -509,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
                list_add_tail(&page->lru, &(*head)->lru);
 
        *head = page;
+       zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
+                       CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
 }
 
 /*
@@ -534,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
                                        struct page, lru);
 
        list_del_init(&page->lru);
+       zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
+                       CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
 }
 
 /*
@@ -571,7 +731,8 @@ out:
  * to form a zspage for each size class. This is important
  * to reduce wastage due to unusable space left at end of
  * each zspage which is given as:
- *     wastage = Zp - Zp % size_class
+ *     wastage = Zp % class_size
+ *     usage = Zp - wastage
  * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
  *
  * For example, for size class of 3/8 * PAGE_SIZE, we should
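
Working the corrected formula through the comment's own 3/8 * PAGE_SIZE example (1536-byte objects on a 4K-page system):

	/* wastage = Zp % class_size, class_size = 1536:
	 *
	 *   k = 1: Zp =  4096,  4096 % 1536 = 1024 wasted (2 objects)
	 *   k = 2: Zp =  8192,  8192 % 1536 =  512 wasted (5 objects)
	 *   k = 3: Zp = 12288, 12288 % 1536 =    0 wasted (8 objects)
	 *
	 * hence a 3-page zspage is the natural choice for this class.
	 */
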
@@ -666,9 +827,14 @@ static unsigned long handle_to_obj(unsigned long handle)
        return *(unsigned long *)handle;
 }
 
-unsigned long obj_to_head(void *obj)
+static unsigned long obj_to_head(struct size_class *class, struct page *page,
+                       void *obj)
 {
-       return *(unsigned long *)obj;
+       if (class->huge) {
+               VM_BUG_ON(!is_first_page(page));
+               return *(unsigned long *)page_private(page);
+       } else
+               return *(unsigned long *)obj;
 }
 
 static unsigned long obj_idx_to_offset(struct page *page,
@@ -954,9 +1120,12 @@ static void __zs_unmap_object(struct mapping_area *area,
        if (area->vm_mm == ZS_MM_RO)
                goto out;
 
-       buf = area->vm_buf + ZS_HANDLE_SIZE;
-       size -= ZS_HANDLE_SIZE;
-       off += ZS_HANDLE_SIZE;
+       buf = area->vm_buf;
+       if (!area->huge) {
+               buf = buf + ZS_HANDLE_SIZE;
+               size -= ZS_HANDLE_SIZE;
+               off += ZS_HANDLE_SIZE;
+       }
 
        sizes[0] = PAGE_SIZE - off;
        sizes[1] = size - sizes[0];
@@ -1044,11 +1213,6 @@ static void init_zs_size_classes(void)
        zs_size_classes = nr;
 }
 
-static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
-{
-       return pages_per_zspage * PAGE_SIZE / size;
-}
-
 static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
 {
        if (prev->pages_per_zspage != pages_per_zspage)
@@ -1068,166 +1232,6 @@ static bool zspage_full(struct page *page)
        return page->inuse == page->objects;
 }
 
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static inline void zs_stat_inc(struct size_class *class,
-                               enum zs_stat_type type, unsigned long cnt)
-{
-       class->stats.objs[type] += cnt;
-}
-
-static inline void zs_stat_dec(struct size_class *class,
-                               enum zs_stat_type type, unsigned long cnt)
-{
-       class->stats.objs[type] -= cnt;
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
-                               enum zs_stat_type type)
-{
-       return class->stats.objs[type];
-}
-
-static int __init zs_stat_init(void)
-{
-       if (!debugfs_initialized())
-               return -ENODEV;
-
-       zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
-       if (!zs_stat_root)
-               return -ENOMEM;
-
-       return 0;
-}
-
-static void __exit zs_stat_exit(void)
-{
-       debugfs_remove_recursive(zs_stat_root);
-}
-
-static int zs_stats_size_show(struct seq_file *s, void *v)
-{
-       int i;
-       struct zs_pool *pool = s->private;
-       struct size_class *class;
-       int objs_per_zspage;
-       unsigned long obj_allocated, obj_used, pages_used;
-       unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
-
-       seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
-                               "obj_allocated", "obj_used", "pages_used");
-
-       for (i = 0; i < zs_size_classes; i++) {
-               class = pool->size_class[i];
-
-               if (class->index != i)
-                       continue;
-
-               spin_lock(&class->lock);
-               obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
-               obj_used = zs_stat_get(class, OBJ_USED);
-               spin_unlock(&class->lock);
-
-               objs_per_zspage = get_maxobj_per_zspage(class->size,
-                               class->pages_per_zspage);
-               pages_used = obj_allocated / objs_per_zspage *
-                               class->pages_per_zspage;
-
-               seq_printf(s, " %5u %5u    %10lu %10lu %10lu\n", i,
-                       class->size, obj_allocated, obj_used, pages_used);
-
-               total_objs += obj_allocated;
-               total_used_objs += obj_used;
-               total_pages += pages_used;
-       }
-
-       seq_puts(s, "\n");
-       seq_printf(s, " %5s %5s    %10lu %10lu %10lu\n", "Total", "",
-                       total_objs, total_used_objs, total_pages);
-
-       return 0;
-}
-
-static int zs_stats_size_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, zs_stats_size_show, inode->i_private);
-}
-
-static const struct file_operations zs_stat_size_ops = {
-       .open           = zs_stats_size_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int zs_pool_stat_create(char *name, struct zs_pool *pool)
-{
-       struct dentry *entry;
-
-       if (!zs_stat_root)
-               return -ENODEV;
-
-       entry = debugfs_create_dir(name, zs_stat_root);
-       if (!entry) {
-               pr_warn("debugfs dir <%s> creation failed\n", name);
-               return -ENOMEM;
-       }
-       pool->stat_dentry = entry;
-
-       entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
-                       pool->stat_dentry, pool, &zs_stat_size_ops);
-       if (!entry) {
-               pr_warn("%s: debugfs file entry <%s> creation failed\n",
-                               name, "obj_in_classes");
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-static void zs_pool_stat_destroy(struct zs_pool *pool)
-{
-       debugfs_remove_recursive(pool->stat_dentry);
-}
-
-#else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
-                               enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
-                               enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
-                               enum zs_stat_type type)
-{
-       return 0;
-}
-
-static int __init zs_stat_init(void)
-{
-       return 0;
-}
-
-static void __exit zs_stat_exit(void)
-{
-}
-
-static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
-{
-       return 0;
-}
-
-static inline void zs_pool_stat_destroy(struct zs_pool *pool)
-{
-}
-
-#endif
-
 unsigned long zs_get_total_pages(struct zs_pool *pool)
 {
        return atomic_long_read(&pool->pages_allocated);
@@ -1295,7 +1299,10 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 
        ret = __zs_map_object(area, pages, off, class->size);
 out:
-       return ret + ZS_HANDLE_SIZE;
+       if (!class->huge)
+               ret += ZS_HANDLE_SIZE;
+
+       return ret;
 }
 EXPORT_SYMBOL_GPL(zs_map_object);
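
As a reminder of the handle-based API while reading these hunks, here is a minimal caller-side sketch (illustrative only, in the style of zram; the exported functions and mapping mode are real, the caller is made up and error handling is trimmed):

	static int example_store(struct zs_pool *pool, void *src, size_t len)
	{
		unsigned long handle;
		void *dst;

		handle = zs_malloc(pool, len);	/* opaque handle, not a pointer */
		if (!handle)
			return -ENOMEM;

		dst = zs_map_object(pool, handle, ZS_MM_WO);
		memcpy(dst, src, len);
		zs_unmap_object(pool, handle);	/* dst must not be used after this */

		/* ... and eventually: zs_free(pool, handle); */
		return 0;
	}
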
 
@@ -1352,8 +1359,12 @@ static unsigned long obj_malloc(struct page *first_page,
        vaddr = kmap_atomic(m_page);
        link = (struct link_free *)vaddr + m_offset / sizeof(*link);
        first_page->freelist = link->next;
-       /* record handle in the header of allocated chunk */
-       link->handle = handle;
+       if (!class->huge)
+               /* record handle in the header of allocated chunk */
+               link->handle = handle;
+       else
+               /* record handle in first_page->private */
+               set_page_private(first_page, handle);
        kunmap_atomic(vaddr);
        first_page->inuse++;
        zs_stat_inc(class, OBJ_USED, 1);
@@ -1377,7 +1388,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
        struct size_class *class;
        struct page *first_page;
 
-       if (unlikely(!size || (size + ZS_HANDLE_SIZE) > ZS_MAX_ALLOC_SIZE))
+       if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
                return 0;
 
        handle = alloc_handle(pool);
@@ -1442,6 +1453,8 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
        /* Insert this object in containing zspage's freelist */
        link = (struct link_free *)(vaddr + f_offset);
        link->next = first_page->freelist;
+       if (class->huge)
+               set_page_private(first_page, 0);
        kunmap_atomic(vaddr);
        first_page->freelist = (void *)obj;
        first_page->inuse--;
@@ -1519,7 +1532,12 @@ static void zs_object_copy(unsigned long src, unsigned long dst,
                if (written == class->size)
                        break;
 
-               if (s_off + size >= PAGE_SIZE) {
+               s_off += size;
+               s_size -= size;
+               d_off += size;
+               d_size -= size;
+
+               if (s_off >= PAGE_SIZE) {
                        kunmap_atomic(d_addr);
                        kunmap_atomic(s_addr);
                        s_page = get_next_page(s_page);
@@ -1528,21 +1546,15 @@ static void zs_object_copy(unsigned long src, unsigned long dst,
                        d_addr = kmap_atomic(d_page);
                        s_size = class->size - written;
                        s_off = 0;
-               } else {
-                       s_off += size;
-                       s_size -= size;
                }
 
-               if (d_off + size >= PAGE_SIZE) {
+               if (d_off >= PAGE_SIZE) {
                        kunmap_atomic(d_addr);
                        d_page = get_next_page(d_page);
                        BUG_ON(!d_page);
                        d_addr = kmap_atomic(d_page);
                        d_size = class->size - written;
                        d_off = 0;
-               } else {
-                       d_off += size;
-                       d_size -= size;
                }
        }
 
@@ -1567,7 +1579,7 @@ static unsigned long find_alloced_obj(struct page *page, int index,
        offset += class->size * index;
 
        while (offset < PAGE_SIZE) {
-               head = obj_to_head(addr + offset);
+               head = obj_to_head(class, page, addr + offset);
                if (head & OBJ_ALLOCATED_TAG) {
                        handle = head & ~OBJ_ALLOCATED_TAG;
                        if (trypin_tag(handle))
@@ -1661,14 +1673,14 @@ static struct page *alloc_target_page(struct size_class *class)
 static void putback_zspage(struct zs_pool *pool, struct size_class *class,
                                struct page *first_page)
 {
-       int class_idx;
        enum fullness_group fullness;
 
        BUG_ON(!is_first_page(first_page));
 
-       get_zspage_mapping(first_page, &class_idx, &fullness);
+       fullness = get_fullness_group(first_page);
        insert_zspage(first_page, class, fullness);
-       fullness = fix_fullness_group(class, first_page);
+       set_zspage_mapping(first_page, class->index, fullness);
+
        if (fullness == ZS_EMPTY) {
                zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
                        class->size, class->pages_per_zspage));
@@ -1699,8 +1711,6 @@ static unsigned long __zs_compact(struct zs_pool *pool,
        struct page *dst_page = NULL;
        unsigned long nr_total_migrated = 0;
 
-       cond_resched();
-
        spin_lock(&class->lock);
        while ((src_page = isolate_source_page(class))) {
 
@@ -1760,8 +1770,6 @@ unsigned long zs_compact(struct zs_pool *pool)
                nr_migrated += __zs_compact(pool, class);
        }
 
-       synchronize_rcu();
-
        return nr_migrated;
 }
 EXPORT_SYMBOL_GPL(zs_compact);
@@ -1837,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
                class->size = size;
                class->index = i;
                class->pages_per_zspage = pages_per_zspage;
+               if (pages_per_zspage == 1 &&
+                       get_maxobj_per_zspage(size, pages_per_zspage) == 1)
+                       class->huge = true;
                spin_lock_init(&class->lock);
                pool->size_class[i] = class;
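
The clearest class that trips the new huge flag is the largest one; a short illustration (assuming 4K pages, with get_pages_per_zspage() behaving as described in the wastage comment earlier):

	/* class->size = 4096 (an incompressible page stored as-is):
	 *   get_pages_per_zspage(4096)     -> 1   (a single page wastes nothing)
	 *   get_maxobj_per_zspage(4096, 1) -> 1 * 4096 / 4096 = 1
	 * so pages_per_zspage == 1 && maxobj_per_zspage == 1, class->huge = true,
	 * and such objects keep their handle in first_page->private.
	 */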