diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 79295c7..b0bc023 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
  * struct page(s) to form a zspage.
  *
  * Usage of struct page fields:
- *     page->private: points to the first component (0-order) page
- *     page->index (union with page->freelist): offset of the first object
- *             starting in this page. For the first page, this is
- *             always 0, so we use this field (aka freelist) to point
- *             to the first free object in zspage.
- *     page->lru: links together all component pages (except the first page)
- *             of a zspage
- *
- *     For _first_ page only:
- *
- *     page->private: refers to the component page after the first page
- *             If the page is first_page for huge object, it stores handle.
- *             Look at size_class->huge.
- *     page->freelist: points to the first free object in zspage.
- *             Free objects are linked together using in-place
- *             metadata.
- *     page->lru: links together first pages of various zspages.
- *             Basically forming list of zspages in a fullness group.
- *     page->mapping: class index and fullness group of the zspage
- *     page->inuse: the number of objects that are used in this zspage
+ *     page->private: points to zspage
+ *     page->freelist(index): links together all component pages of a zspage
+ *             For the huge page, this is always 0, so we use this field
+ *             to store handle.
+ *     page->units: first object offset in a subpage of zspage
  *
  * Usage of struct page flags:
  *     PG_private: identifies the first component page
  *     PG_private2: identifies the last component page
+ *     PG_owner_priv_1: identifies the huge component page
  *
  */
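
The comment block above is the heart of this change: per-page metadata shrinks to three fields (page->private as a back-pointer to the zspage, page->freelist as the link to the next component page, page->units for the first object offset), with PG_owner_priv_1 marking a huge-object page. As a rough illustration only, not kernel code, the following userspace C sketch models that linkage with toy structures (toy_page/toy_zspage are invented stand-ins for the real struct page and struct zspage):

#include <stdio.h>
#include <stddef.h>

/* Toy stand-ins for the kernel structures; field names mirror the comment. */
struct toy_zspage;

struct toy_page {
        struct toy_zspage *private;     /* page->private: points to zspage     */
        struct toy_page *freelist;      /* page->freelist: next component page */
        unsigned int units;             /* page->units: first object offset    */
        int huge;                       /* models PG_owner_priv_1              */
};

struct toy_zspage {
        struct toy_page *first_page;
        unsigned int inuse, freeobj;
};

/* Walk all component pages of a zspage via page->freelist. */
static void walk_zspage(struct toy_zspage *zspage)
{
        struct toy_page *page = zspage->first_page;

        do {
                printf("page with first object offset %u\n", page->units);
                /* a huge-object zspage has a single page and no successor */
                page = page->huge ? NULL : page->freelist;
        } while (page);
}

int main(void)
{
        struct toy_zspage zspage = { 0 };
        struct toy_page p0 = { &zspage, NULL, 0, 0 };
        struct toy_page p1 = { &zspage, NULL, 40, 0 };

        p0.freelist = &p1;
        zspage.first_page = &p0;
        walk_zspage(&zspage);
        return 0;
}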
 
 #include <linux/debugfs.h>
 #include <linux/zsmalloc.h>
 #include <linux/zpool.h>
+#include <linux/mount.h>
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+
+#define ZSPAGE_MAGIC   0x58
 
 /*
  * This must be power of 2 and greater than or equal to sizeof(link_free).
@@ -86,9 +77,7 @@
  * Object location (<PFN>, <obj_idx>) is encoded as
  * a single (unsigned long) handle value.
  *
- * Note that object index <obj_idx> is relative to system
- * page <PFN> it is stored in, so for each sub-page belonging
- * to a zspage, obj_idx starts with 0.
+ * Note that object index <obj_idx> starts from 0.
  *
  * This is made more complicated by various memory models and PAE.
  */
  *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
  *  (reason above)
  */
-#define ZS_SIZE_CLASS_DELTA    (PAGE_SIZE >> 8)
+#define ZS_SIZE_CLASS_DELTA    (PAGE_SIZE >> CLASS_BITS)
 
-/*
- * We do not maintain any list for completely empty or full pages
- */
 enum fullness_group {
-       ZS_ALMOST_FULL,
-       ZS_ALMOST_EMPTY,
-       _ZS_NR_FULLNESS_GROUPS,
-
        ZS_EMPTY,
-       ZS_FULL
+       ZS_ALMOST_EMPTY,
+       ZS_ALMOST_FULL,
+       ZS_FULL,
+       NR_ZS_FULLNESS,
 };
 
 enum zs_stat_type {
+       CLASS_EMPTY,
+       CLASS_ALMOST_EMPTY,
+       CLASS_ALMOST_FULL,
+       CLASS_FULL,
        OBJ_ALLOCATED,
        OBJ_USED,
-       CLASS_ALMOST_FULL,
-       CLASS_ALMOST_EMPTY,
+       NR_ZS_STAT_TYPE,
 };
 
-#ifdef CONFIG_ZSMALLOC_STAT
-#define NR_ZS_STAT_TYPE        (CLASS_ALMOST_EMPTY + 1)
-#else
-#define NR_ZS_STAT_TYPE        (OBJ_USED + 1)
-#endif
-
 struct zs_size_stat {
        unsigned long objs[NR_ZS_STAT_TYPE];
 };
@@ -182,6 +164,10 @@ struct zs_size_stat {
 static struct dentry *zs_stat_root;
 #endif
 
+#ifdef CONFIG_COMPACTION
+static struct vfsmount *zsmalloc_mnt;
+#endif
+
 /*
  * number of size_classes
  */
@@ -205,36 +191,49 @@ static const int fullness_threshold_frac = 4;
 
 struct size_class {
        spinlock_t lock;
-       struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
+       struct list_head fullness_list[NR_ZS_FULLNESS];
        /*
         * Size of objects stored in this class. Must be multiple
         * of ZS_ALIGN.
         */
        int size;
        int objs_per_zspage;
-       unsigned int index;
-
-       struct zs_size_stat stats;
-
        /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
        int pages_per_zspage;
-       /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
-       bool huge;
+
+       unsigned int index;
+       struct zs_size_stat stats;
 };
 
+/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+static void SetPageHugeObject(struct page *page)
+{
+       SetPageOwnerPriv1(page);
+}
+
+static void ClearPageHugeObject(struct page *page)
+{
+       ClearPageOwnerPriv1(page);
+}
+
+static int PageHugeObject(struct page *page)
+{
+       return PageOwnerPriv1(page);
+}
+
 /*
  * Placed within free objects to form a singly linked list.
- * For every zspage, first_page->freelist gives head of this list.
+ * For every zspage, zspage->freeobj gives head of this list.
  *
  * This must be power of 2 and less than or equal to ZS_ALIGN
  */
 struct link_free {
        union {
                /*
-                * Position of next free chunk (encodes <PFN, obj_idx>)
+                * Free object index;
                 * It's valid for non-allocated object
                 */
-               void *next;
+               unsigned long next;
                /*
                 * Handle of allocated object.
                 */
@@ -247,6 +246,7 @@ struct zs_pool {
 
        struct size_class **size_class;
        struct kmem_cache *handle_cachep;
+       struct kmem_cache *zspage_cachep;
 
        atomic_long_t pages_allocated;
 
@@ -262,16 +262,36 @@ struct zs_pool {
 #ifdef CONFIG_ZSMALLOC_STAT
        struct dentry *stat_dentry;
 #endif
+#ifdef CONFIG_COMPACTION
+       struct inode *inode;
+       struct work_struct free_work;
+#endif
 };
 
 /*
  * A zspage's class index and fullness group
  * are encoded in its (first)page->mapping
  */
-#define CLASS_IDX_BITS 28
-#define FULLNESS_BITS  4
-#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1)
-#define FULLNESS_MASK  ((1 << FULLNESS_BITS) - 1)
+#define FULLNESS_BITS  2
+#define CLASS_BITS     8
+#define ISOLATED_BITS  3
+#define MAGIC_VAL_BITS 8
+
+struct zspage {
+       struct {
+               unsigned int fullness:FULLNESS_BITS;
+               unsigned int class:CLASS_BITS;
+               unsigned int isolated:ISOLATED_BITS;
+               unsigned int magic:MAGIC_VAL_BITS;
+       };
+       unsigned int inuse;
+       unsigned int freeobj;
+       struct page *first_page;
+       struct list_head list; /* fullness list */
+#ifdef CONFIG_COMPACTION
+       rwlock_t lock;
+#endif
+};
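
The new struct zspage replaces the old trick of packing class index and fullness into (first)page->mapping. For orientation: the four bit-fields occupy 2 + 8 + 3 + 8 = 21 bits and therefore share a single 32-bit word, and CLASS_BITS doubles as the shift that defines ZS_SIZE_CLASS_DELTA earlier in the patch. A minimal standalone check of that packing (plain userspace C; struct zspage_flags is an illustrative name, not a kernel type):

#include <assert.h>
#include <stdio.h>

#define FULLNESS_BITS   2
#define CLASS_BITS      8
#define ISOLATED_BITS   3
#define MAGIC_VAL_BITS  8

struct zspage_flags {
        unsigned int fullness:FULLNESS_BITS;    /* one of the 4 fullness groups */
        unsigned int class:CLASS_BITS;          /* size class index             */
        unsigned int isolated:ISOLATED_BITS;    /* # of isolated subpages       */
        unsigned int magic:MAGIC_VAL_BITS;      /* ZSPAGE_MAGIC sanity value    */
};

int main(void)
{
        /* 2 + 8 + 3 + 8 = 21 bits, so all four fields share one 32-bit word */
        static_assert(FULLNESS_BITS + CLASS_BITS + ISOLATED_BITS +
                      MAGIC_VAL_BITS <= 32, "flags must fit in one word");
        printf("sizeof(struct zspage_flags) = %zu\n",
               sizeof(struct zspage_flags));
        return 0;
}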
 
 struct mapping_area {
 #ifdef CONFIG_PGTABLE_MAPPING
@@ -283,29 +303,74 @@ struct mapping_area {
        enum zs_mapmode vm_mm; /* mapping mode */
 };
 
-static int create_handle_cache(struct zs_pool *pool)
+#ifdef CONFIG_COMPACTION
+static int zs_register_migration(struct zs_pool *pool);
+static void zs_unregister_migration(struct zs_pool *pool);
+static void migrate_lock_init(struct zspage *zspage);
+static void migrate_read_lock(struct zspage *zspage);
+static void migrate_read_unlock(struct zspage *zspage);
+static void kick_deferred_free(struct zs_pool *pool);
+static void init_deferred_free(struct zs_pool *pool);
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
+#else
+static int zsmalloc_mount(void) { return 0; }
+static void zsmalloc_unmount(void) {}
+static int zs_register_migration(struct zs_pool *pool) { return 0; }
+static void zs_unregister_migration(struct zs_pool *pool) {}
+static void migrate_lock_init(struct zspage *zspage) {}
+static void migrate_read_lock(struct zspage *zspage) {}
+static void migrate_read_unlock(struct zspage *zspage) {}
+static void kick_deferred_free(struct zs_pool *pool) {}
+static void init_deferred_free(struct zs_pool *pool) {}
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
+#endif
+
+static int create_cache(struct zs_pool *pool)
 {
        pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
                                        0, 0, NULL);
-       return pool->handle_cachep ? 0 : 1;
+       if (!pool->handle_cachep)
+               return 1;
+
+       pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
+                                       0, 0, NULL);
+       if (!pool->zspage_cachep) {
+               kmem_cache_destroy(pool->handle_cachep);
+               pool->handle_cachep = NULL;
+               return 1;
+       }
+
+       return 0;
 }
 
-static void destroy_handle_cache(struct zs_pool *pool)
+static void destroy_cache(struct zs_pool *pool)
 {
        kmem_cache_destroy(pool->handle_cachep);
+       kmem_cache_destroy(pool->zspage_cachep);
 }
 
-static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
+static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
 {
        return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-                       gfp & ~__GFP_HIGHMEM);
+                       gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
 }
 
-static void free_handle(struct zs_pool *pool, unsigned long handle)
+static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
 {
        kmem_cache_free(pool->handle_cachep, (void *)handle);
 }
 
+static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
+{
+       return kmem_cache_alloc(pool->zspage_cachep,
+                       flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+};
+
+static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
+{
+       kmem_cache_free(pool->zspage_cachep, zspage);
+}
+
 static void record_obj(unsigned long handle, unsigned long obj)
 {
        /*
@@ -400,46 +465,79 @@ static struct zpool_driver zs_zpool_driver = {
 MODULE_ALIAS("zpool-zsmalloc");
 #endif /* CONFIG_ZPOOL */
 
-static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
-{
-       return pages_per_zspage * PAGE_SIZE / size;
-}
-
 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
 
+static bool is_zspage_isolated(struct zspage *zspage)
+{
+       return zspage->isolated;
+}
+
 static int is_first_page(struct page *page)
 {
        return PagePrivate(page);
 }
 
-static int is_last_page(struct page *page)
+/* Protected by class->lock */
+static inline int get_zspage_inuse(struct zspage *zspage)
+{
+       return zspage->inuse;
+}
+
+static inline void set_zspage_inuse(struct zspage *zspage, int val)
+{
+       zspage->inuse = val;
+}
+
+static inline void mod_zspage_inuse(struct zspage *zspage, int val)
+{
+       zspage->inuse += val;
+}
+
+static inline struct page *get_first_page(struct zspage *zspage)
+{
+       struct page *first_page = zspage->first_page;
+
+       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+       return first_page;
+}
+
+static inline int get_first_obj_offset(struct page *page)
+{
+       return page->units;
+}
+
+static inline void set_first_obj_offset(struct page *page, int offset)
+{
+       page->units = offset;
+}
+
+static inline unsigned int get_freeobj(struct zspage *zspage)
+{
+       return zspage->freeobj;
+}
+
+static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
 {
-       return PagePrivate2(page);
+       zspage->freeobj = obj;
 }
 
-static void get_zspage_mapping(struct page *first_page,
+static void get_zspage_mapping(struct zspage *zspage,
                                unsigned int *class_idx,
                                enum fullness_group *fullness)
 {
-       unsigned long m;
-       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+       BUG_ON(zspage->magic != ZSPAGE_MAGIC);
 
-       m = (unsigned long)first_page->mapping;
-       *fullness = m & FULLNESS_MASK;
-       *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
+       *fullness = zspage->fullness;
+       *class_idx = zspage->class;
 }
 
-static void set_zspage_mapping(struct page *first_page,
+static void set_zspage_mapping(struct zspage *zspage,
                                unsigned int class_idx,
                                enum fullness_group fullness)
 {
-       unsigned long m;
-       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
-       m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
-                       (fullness & FULLNESS_MASK);
-       first_page->mapping = (struct address_space *)m;
+       zspage->class = class_idx;
+       zspage->fullness = fullness;
 }
 
 /*
@@ -463,23 +561,19 @@ static int get_size_class_index(int size)
 static inline void zs_stat_inc(struct size_class *class,
                                enum zs_stat_type type, unsigned long cnt)
 {
-       if (type < NR_ZS_STAT_TYPE)
-               class->stats.objs[type] += cnt;
+       class->stats.objs[type] += cnt;
 }
 
 static inline void zs_stat_dec(struct size_class *class,
                                enum zs_stat_type type, unsigned long cnt)
 {
-       if (type < NR_ZS_STAT_TYPE)
-               class->stats.objs[type] -= cnt;
+       class->stats.objs[type] -= cnt;
 }
 
 static inline unsigned long zs_stat_get(struct size_class *class,
                                enum zs_stat_type type)
 {
-       if (type < NR_ZS_STAT_TYPE)
-               return class->stats.objs[type];
-       return 0;
+       return class->stats.objs[type];
 }
 
 #ifdef CONFIG_ZSMALLOC_STAT
@@ -534,8 +628,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
                freeable = zs_can_compact(class);
                spin_unlock(&class->lock);
 
-               objs_per_zspage = get_maxobj_per_zspage(class->size,
-                               class->pages_per_zspage);
+               objs_per_zspage = class->objs_per_zspage;
                pages_used = obj_allocated / objs_per_zspage *
                                class->pages_per_zspage;
 
@@ -623,6 +716,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
 }
 #endif
 
+
 /*
  * For each size class, zspages are divided into different groups
  * depending on how "full" they are. This was done so that we could
@@ -631,14 +725,12 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
  * status of the given page.
  */
 static enum fullness_group get_fullness_group(struct size_class *class,
-                                               struct page *first_page)
+                                               struct zspage *zspage)
 {
        int inuse, objs_per_zspage;
        enum fullness_group fg;
 
-       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
-       inuse = first_page->inuse;
+       inuse = get_zspage_inuse(zspage);
        objs_per_zspage = class->objs_per_zspage;
 
        if (inuse == 0)
@@ -660,32 +752,25 @@ static enum fullness_group get_fullness_group(struct size_class *class,
  * identified by <class, fullness_group>.
  */
 static void insert_zspage(struct size_class *class,
-                               enum fullness_group fullness,
-                               struct page *first_page)
+                               struct zspage *zspage,
+                               enum fullness_group fullness)
 {
-       struct page **head;
-
-       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
-       if (fullness >= _ZS_NR_FULLNESS_GROUPS)
-               return;
-
-       zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
-                       CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
-
-       head = &class->fullness_list[fullness];
-       if (!*head) {
-               *head = first_page;
-               return;
-       }
+       struct zspage *head;
 
+       zs_stat_inc(class, fullness, 1);
+       head = list_first_entry_or_null(&class->fullness_list[fullness],
+                                       struct zspage, list);
        /*
-        * We want to see more ZS_FULL pages and less almost
-        * empty/full. Put pages with higher ->inuse first.
+        * We want to see more ZS_FULL pages and fewer almost empty/full.
+        * Put pages with higher ->inuse first.
         */
-       list_add_tail(&first_page->lru, &(*head)->lru);
-       if (first_page->inuse >= (*head)->inuse)
-               *head = first_page;
+       if (head) {
+               if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
+                       list_add(&zspage->list, &head->list);
+                       return;
+               }
+       }
+       list_add(&zspage->list, &class->fullness_list[fullness]);
 }
 
 /*
@@ -693,27 +778,14 @@ static void insert_zspage(struct size_class *class,
  * by <class, fullness_group>.
  */
 static void remove_zspage(struct size_class *class,
-                               enum fullness_group fullness,
-                               struct page *first_page)
+                               struct zspage *zspage,
+                               enum fullness_group fullness)
 {
-       struct page **head;
-
-       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
-       if (fullness >= _ZS_NR_FULLNESS_GROUPS)
-               return;
+       VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
+       VM_BUG_ON(is_zspage_isolated(zspage));
 
-       head = &class->fullness_list[fullness];
-       VM_BUG_ON_PAGE(!*head, first_page);
-       if (list_empty(&(*head)->lru))
-               *head = NULL;
-       else if (*head == first_page)
-               *head = (struct page *)list_entry((*head)->lru.next,
-                                       struct page, lru);
-
-       list_del_init(&first_page->lru);
-       zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
-                       CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+       list_del_init(&zspage->list);
+       zs_stat_dec(class, fullness, 1);
 }
 
 /*
@@ -726,19 +798,22 @@ static void remove_zspage(struct size_class *class,
  * fullness group.
  */
 static enum fullness_group fix_fullness_group(struct size_class *class,
-                                               struct page *first_page)
+                                               struct zspage *zspage)
 {
        int class_idx;
        enum fullness_group currfg, newfg;
 
-       get_zspage_mapping(first_page, &class_idx, &currfg);
-       newfg = get_fullness_group(class, first_page);
+       get_zspage_mapping(zspage, &class_idx, &currfg);
+       newfg = get_fullness_group(class, zspage);
        if (newfg == currfg)
                goto out;
 
-       remove_zspage(class, currfg, first_page);
-       insert_zspage(class, newfg, first_page);
-       set_zspage_mapping(first_page, class_idx, newfg);
+       if (!is_zspage_isolated(zspage)) {
+               remove_zspage(class, zspage, currfg);
+               insert_zspage(class, zspage, newfg);
+       }
+
+       set_zspage_mapping(zspage, class_idx, newfg);
 
 out:
        return newfg;
@@ -780,64 +855,49 @@ static int get_pages_per_zspage(int class_size)
        return max_usedpc_order;
 }
 
-/*
- * A single 'zspage' is composed of many system pages which are
- * linked together using fields in struct page. This function finds
- * the first/head page, given any component page of a zspage.
- */
-static struct page *get_first_page(struct page *page)
+static struct zspage *get_zspage(struct page *page)
 {
-       if (is_first_page(page))
-               return page;
-       else
-               return (struct page *)page_private(page);
+       struct zspage *zspage = (struct zspage *)page->private;
+
+       BUG_ON(zspage->magic != ZSPAGE_MAGIC);
+       return zspage;
 }
 
 static struct page *get_next_page(struct page *page)
 {
-       struct page *next;
+       if (unlikely(PageHugeObject(page)))
+               return NULL;
 
-       if (is_last_page(page))
-               next = NULL;
-       else if (is_first_page(page))
-               next = (struct page *)page_private(page);
-       else
-               next = list_entry(page->lru.next, struct page, lru);
+       return page->freelist;
+}
 
-       return next;
+/**
+ * obj_to_location - get (<page>, <obj_idx>) from encoded object value
+ * @obj: the encoded object value
+ * @page: page that the object resides in
+ * @obj_idx: object index
+ */
+static void obj_to_location(unsigned long obj, struct page **page,
+                               unsigned int *obj_idx)
+{
+       obj >>= OBJ_TAG_BITS;
+       *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+       *obj_idx = (obj & OBJ_INDEX_MASK);
 }
 
-/*
- * Encode <page, obj_idx> as a single handle value.
- * We use the least bit of handle for tagging.
+/**
+ * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
+ * @page: page that the object resides in
+ * @obj_idx: object index
  */
-static void *location_to_obj(struct page *page, unsigned long obj_idx)
+static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
 {
        unsigned long obj;
 
-       if (!page) {
-               VM_BUG_ON(obj_idx);
-               return NULL;
-       }
-
        obj = page_to_pfn(page) << OBJ_INDEX_BITS;
-       obj |= ((obj_idx) & OBJ_INDEX_MASK);
+       obj |= obj_idx & OBJ_INDEX_MASK;
        obj <<= OBJ_TAG_BITS;
 
-       return (void *)obj;
-}
-
-/*
- * Decode <page, obj_idx> pair from the given object handle. We adjust the
- * decoded obj_idx back to its original value since it was adjusted in
- * location_to_obj().
- */
-static void obj_to_location(unsigned long obj, struct page **page,
-                               unsigned long *obj_idx)
-{
-       obj >>= OBJ_TAG_BITS;
-       *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
-       *obj_idx = (obj & OBJ_INDEX_MASK);
+       return obj;
 }
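
location_to_obj() and obj_to_location() above are exact inverses: the PFN goes in the high bits, the per-zspage object index in the low OBJ_INDEX_BITS, and the whole value is shifted up by OBJ_TAG_BITS so the low tag bit stays free for flags such as OBJ_ALLOCATED_TAG. The round trip can be modelled in plain userspace C; the widths below are illustrative assumptions (the kernel derives OBJ_INDEX_BITS from the physical address width), and a raw PFN stands in for page_to_pfn()/pfn_to_page():

#include <assert.h>
#include <stdio.h>

/* Illustrative widths only, not the kernel's actual values. */
#define OBJ_TAG_BITS    1
#define OBJ_INDEX_BITS  10
#define OBJ_INDEX_MASK  ((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_obj(unsigned long pfn, unsigned int obj_idx)
{
        unsigned long obj;

        obj = pfn << OBJ_INDEX_BITS;            /* PFN in the high bits      */
        obj |= obj_idx & OBJ_INDEX_MASK;        /* object index in low bits  */
        obj <<= OBJ_TAG_BITS;                   /* keep room for the tag bit */
        return obj;
}

static void obj_to_location(unsigned long obj, unsigned long *pfn,
                            unsigned int *obj_idx)
{
        obj >>= OBJ_TAG_BITS;
        *pfn = obj >> OBJ_INDEX_BITS;
        *obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
        unsigned long pfn;
        unsigned int idx;

        obj_to_location(location_to_obj(0x1234, 7), &pfn, &idx);
        assert(pfn == 0x1234 && idx == 7);
        printf("round trip ok: pfn=%#lx idx=%u\n", pfn, idx);
        return 0;
}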
 
 static unsigned long handle_to_obj(unsigned long handle)
@@ -845,109 +905,146 @@ static unsigned long handle_to_obj(unsigned long handle)
        return *(unsigned long *)handle;
 }
 
-static unsigned long obj_to_head(struct size_class *class, struct page *page,
-                       void *obj)
+static unsigned long obj_to_head(struct page *page, void *obj)
 {
-       if (class->huge) {
+       if (unlikely(PageHugeObject(page))) {
                VM_BUG_ON_PAGE(!is_first_page(page), page);
-               return page_private(page);
+               return page->index;
        } else
                return *(unsigned long *)obj;
 }
 
-static unsigned long obj_idx_to_offset(struct page *page,
-                               unsigned long obj_idx, int class_size)
+static inline int testpin_tag(unsigned long handle)
 {
-       unsigned long off = 0;
-
-       if (!is_first_page(page))
-               off = page->index;
-
-       return off + obj_idx * class_size;
+       return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
 }
 
 static inline int trypin_tag(unsigned long handle)
 {
-       unsigned long *ptr = (unsigned long *)handle;
-
-       return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+       return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
 }
 
 static void pin_tag(unsigned long handle)
 {
-       while (!trypin_tag(handle));
+       bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
 }
 
 static void unpin_tag(unsigned long handle)
 {
-       unsigned long *ptr = (unsigned long *)handle;
-
-       clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+       bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
 }
 
 static void reset_page(struct page *page)
 {
-       clear_bit(PG_private, &page->flags);
-       clear_bit(PG_private_2, &page->flags);
+       __ClearPageMovable(page);
+       ClearPagePrivate(page);
+       ClearPagePrivate2(page);
        set_page_private(page, 0);
-       page->mapping = NULL;
-       page->freelist = NULL;
        page_mapcount_reset(page);
+       ClearPageHugeObject(page);
+       page->freelist = NULL;
 }
 
-static void free_zspage(struct page *first_page)
+/*
+ * To prevent zspage destruction during migration, zspage freeing should
+ * hold locks of all pages in the zspage.
+ */
+void lock_zspage(struct zspage *zspage)
 {
-       struct page *nextp, *tmp, *head_extra;
+       struct page *page = get_first_page(zspage);
 
-       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-       VM_BUG_ON_PAGE(first_page->inuse, first_page);
+       do {
+               lock_page(page);
+       } while ((page = get_next_page(page)) != NULL);
+}
 
-       head_extra = (struct page *)page_private(first_page);
+int trylock_zspage(struct zspage *zspage)
+{
+       struct page *cursor, *fail;
 
-       reset_page(first_page);
-       __free_page(first_page);
+       for (cursor = get_first_page(zspage); cursor != NULL; cursor =
+                                       get_next_page(cursor)) {
+               if (!trylock_page(cursor)) {
+                       fail = cursor;
+                       goto unlock;
+               }
+       }
 
-       /* zspage with only 1 system page */
-       if (!head_extra)
-               return;
+       return 1;
+unlock:
+       for (cursor = get_first_page(zspage); cursor != fail; cursor =
+                                       get_next_page(cursor))
+               unlock_page(cursor);
+
+       return 0;
+}
+
+static void __free_zspage(struct zs_pool *pool, struct size_class *class,
+                               struct zspage *zspage)
+{
+       struct page *page, *next;
+       enum fullness_group fg;
+       unsigned int class_idx;
 
-       list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
-               list_del(&nextp->lru);
-               reset_page(nextp);
-               __free_page(nextp);
+       get_zspage_mapping(zspage, &class_idx, &fg);
+
+       assert_spin_locked(&class->lock);
+
+       VM_BUG_ON(get_zspage_inuse(zspage));
+       VM_BUG_ON(fg != ZS_EMPTY);
+
+       next = page = get_first_page(zspage);
+       do {
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
+               next = get_next_page(page);
+               reset_page(page);
+               unlock_page(page);
+               dec_zone_page_state(page, NR_ZSPAGES);
+               put_page(page);
+               page = next;
+       } while (page != NULL);
+
+       cache_free_zspage(pool, zspage);
+
+       zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
+       atomic_long_sub(class->pages_per_zspage,
+                                       &pool->pages_allocated);
+}
+
+static void free_zspage(struct zs_pool *pool, struct size_class *class,
+                               struct zspage *zspage)
+{
+       VM_BUG_ON(get_zspage_inuse(zspage));
+       VM_BUG_ON(list_empty(&zspage->list));
+
+       if (!trylock_zspage(zspage)) {
+               kick_deferred_free(pool);
+               return;
        }
-       reset_page(head_extra);
-       __free_page(head_extra);
+
+       remove_zspage(class, zspage, ZS_EMPTY);
+       __free_zspage(pool, class, zspage);
 }
 
 /* Initialize a newly allocated zspage */
-static void init_zspage(struct size_class *class, struct page *first_page)
+static void init_zspage(struct size_class *class, struct zspage *zspage)
 {
+       unsigned int freeobj = 1;
        unsigned long off = 0;
-       struct page *page = first_page;
-
-       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+       struct page *page = get_first_page(zspage);
 
        while (page) {
                struct page *next_page;
                struct link_free *link;
-               unsigned int i = 1;
                void *vaddr;
 
-               /*
-                * page->index stores offset of first object starting
-                * in the page. For the first page, this is always 0,
-                * so we use first_page->index (aka ->freelist) to store
-                * head of corresponding zspage's freelist.
-                */
-               if (page != first_page)
-                       page->index = off;
+               set_first_obj_offset(page, off);
 
                vaddr = kmap_atomic(page);
                link = (struct link_free *)vaddr + off / sizeof(*link);
 
                while ((off += class->size) < PAGE_SIZE) {
-                       link->next = location_to_obj(page, i++);
+                       link->next = freeobj++ << OBJ_TAG_BITS;
                        link += class->size / sizeof(*link);
                }
 
@@ -957,84 +1054,112 @@ static void init_zspage(struct size_class *class, struct page *first_page)
                 * page (if present)
                 */
                next_page = get_next_page(page);
-               link->next = location_to_obj(next_page, 0);
+               if (next_page) {
+                       link->next = freeobj++ << OBJ_TAG_BITS;
+               } else {
+                       /*
+                        * Clear the tag bits in the last link so it can
+                        * be told apart from an allocated object.
+                        */
+                       link->next = -1 << OBJ_TAG_BITS;
+               }
                kunmap_atomic(vaddr);
                page = next_page;
                off %= PAGE_SIZE;
        }
+
+       set_freeobj(zspage, 0);
 }
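
With this rework the in-page freelist stores plain object indices: each free slot's link_free.next holds the next free object's index shifted left by OBJ_TAG_BITS (so the allocated-tag bit stays clear), and the last slot is written as -1 << OBJ_TAG_BITS to mark the end. obj_malloc() later pops the head by reading zspage->freeobj and doing link->next >> OBJ_TAG_BITS. A small userspace model of that encoding (array indices stand in for in-page offsets; names and sizes are illustrative only):

#include <stdio.h>

#define OBJ_TAG_BITS    1

/* One link per object slot; the next free object's index is stored shifted. */
static unsigned long link_next[8];

static void init_freelist(unsigned int nr_objs)
{
        unsigned int i;

        for (i = 0; i < nr_objs - 1; i++)
                link_next[i] = (unsigned long)(i + 1) << OBJ_TAG_BITS;
        /* terminator: tag bit is clear, so it never looks like a handle */
        link_next[nr_objs - 1] = (unsigned long)-1 << OBJ_TAG_BITS;
}

int main(void)
{
        unsigned int freeobj = 0, nr_objs = 8;

        init_freelist(nr_objs);
        /* pop objects the way obj_malloc() advances zspage->freeobj */
        while (freeobj < nr_objs) {
                unsigned long next = link_next[freeobj] >> OBJ_TAG_BITS;

                printf("allocated object %u, next free %lu\n", freeobj, next);
                if (next >= nr_objs)    /* terminator reached */
                        break;
                freeobj = next;
        }
        return 0;
}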
 
-/*
- * Allocate a zspage for the given size class
- */
-static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
+static void create_page_chain(struct size_class *class, struct zspage *zspage,
+                               struct page *pages[])
 {
-       int i, error;
-       struct page *first_page = NULL, *uninitialized_var(prev_page);
+       int i;
+       struct page *page;
+       struct page *prev_page = NULL;
+       int nr_pages = class->pages_per_zspage;
 
        /*
         * Allocate individual pages and link them together as:
-        * 1. first page->private = first sub-page
-        * 2. all sub-pages are linked together using page->lru
-        * 3. each sub-page is linked to the first page using page->private
+        * 1. all pages are linked together using page->freelist
+        * 2. each sub-page points to the zspage using page->private
         *
-        * For each size class, First/Head pages are linked together using
-        * page->lru. Also, we set PG_private to identify the first page
-        * (i.e. no other sub-page has this flag set) and PG_private_2 to
-        * identify the last page.
+        * We set PG_private to identify the first page (i.e. no other sub-page
+        * has this flag set) and PG_private_2 to identify the last page.
         */
-       error = -ENOMEM;
-       for (i = 0; i < class->pages_per_zspage; i++) {
-               struct page *page;
-
-               page = alloc_page(flags);
-               if (!page)
-                       goto cleanup;
-
-               INIT_LIST_HEAD(&page->lru);
-               if (i == 0) {   /* first page */
+       for (i = 0; i < nr_pages; i++) {
+               page = pages[i];
+               set_page_private(page, (unsigned long)zspage);
+               page->freelist = NULL;
+               if (i == 0) {
+                       zspage->first_page = page;
                        SetPagePrivate(page);
-                       set_page_private(page, 0);
-                       first_page = page;
-                       first_page->inuse = 0;
+                       if (unlikely(class->objs_per_zspage == 1 &&
+                                       class->pages_per_zspage == 1))
+                               SetPageHugeObject(page);
+               } else {
+                       prev_page->freelist = page;
                }
-               if (i == 1)
-                       set_page_private(first_page, (unsigned long)page);
-               if (i >= 1)
-                       set_page_private(page, (unsigned long)first_page);
-               if (i >= 2)
-                       list_add(&page->lru, &prev_page->lru);
-               if (i == class->pages_per_zspage - 1)   /* last page */
+               if (i == nr_pages - 1)
                        SetPagePrivate2(page);
                prev_page = page;
        }
+}
 
-       init_zspage(class, first_page);
+/*
+ * Allocate a zspage for the given size class
+ */
+static struct zspage *alloc_zspage(struct zs_pool *pool,
+                                       struct size_class *class,
+                                       gfp_t gfp)
+{
+       int i;
+       struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
+       struct zspage *zspage = cache_alloc_zspage(pool, gfp);
+
+       if (!zspage)
+               return NULL;
+
+       memset(zspage, 0, sizeof(struct zspage));
+       zspage->magic = ZSPAGE_MAGIC;
+       migrate_lock_init(zspage);
+
+       for (i = 0; i < class->pages_per_zspage; i++) {
+               struct page *page;
 
-       first_page->freelist = location_to_obj(first_page, 0);
-       error = 0; /* Success */
+               page = alloc_page(gfp);
+               if (!page) {
+                       while (--i >= 0) {
+                               dec_zone_page_state(pages[i], NR_ZSPAGES);
+                               __free_page(pages[i]);
+                       }
+                       cache_free_zspage(pool, zspage);
+                       return NULL;
+               }
 
-cleanup:
-       if (unlikely(error) && first_page) {
-               free_zspage(first_page);
-               first_page = NULL;
+               inc_zone_page_state(page, NR_ZSPAGES);
+               pages[i] = page;
        }
 
-       return first_page;
+       create_page_chain(class, zspage, pages);
+       init_zspage(class, zspage);
+
+       return zspage;
 }
 
-static struct page *find_get_zspage(struct size_class *class)
+static struct zspage *find_get_zspage(struct size_class *class)
 {
        int i;
-       struct page *page;
+       struct zspage *zspage;
 
-       for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
-               page = class->fullness_list[i];
-               if (page)
+       for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
+               zspage = list_first_entry_or_null(&class->fullness_list[i],
+                               struct zspage, list);
+               if (zspage)
                        break;
        }
 
-       return page;
+       return zspage;
 }
 
 #ifdef CONFIG_PGTABLE_MAPPING
@@ -1216,7 +1341,7 @@ static void zs_unregister_cpu_notifier(void)
        cpu_notifier_register_done();
 }
 
-static void init_zs_size_classes(void)
+static void __init init_zs_size_classes(void)
 {
        int nr;
 
@@ -1227,23 +1352,19 @@ static void init_zs_size_classes(void)
        zs_size_classes = nr;
 }
 
-static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
+static bool can_merge(struct size_class *prev, int pages_per_zspage,
+                                       int objs_per_zspage)
 {
-       if (prev->pages_per_zspage != pages_per_zspage)
-               return false;
+       if (prev->pages_per_zspage == pages_per_zspage &&
+               prev->objs_per_zspage == objs_per_zspage)
+               return true;
 
-       if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
-               != get_maxobj_per_zspage(size, pages_per_zspage))
-               return false;
-
-       return true;
+       return false;
 }
 
-static bool zspage_full(struct size_class *class, struct page *first_page)
+static bool zspage_full(struct size_class *class, struct zspage *zspage)
 {
-       VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
-
-       return first_page->inuse == class->objs_per_zspage;
+       return get_zspage_inuse(zspage) == class->objs_per_zspage;
 }
 
 unsigned long zs_get_total_pages(struct zs_pool *pool)
@@ -1269,8 +1390,10 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
                        enum zs_mapmode mm)
 {
+       struct zspage *zspage;
        struct page *page;
-       unsigned long obj, obj_idx, off;
+       unsigned long obj, off;
+       unsigned int obj_idx;
 
        unsigned int class_idx;
        enum fullness_group fg;
@@ -1291,9 +1414,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 
        obj = handle_to_obj(handle);
        obj_to_location(obj, &page, &obj_idx);
-       get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+       zspage = get_zspage(page);
+
+       /* migration cannot move any subpage in this zspage */
+       migrate_read_lock(zspage);
+
+       get_zspage_mapping(zspage, &class_idx, &fg);
        class = pool->size_class[class_idx];
-       off = obj_idx_to_offset(page, obj_idx, class->size);
+       off = (class->size * obj_idx) & ~PAGE_MASK;
 
        area = &get_cpu_var(zs_map_area);
        area->vm_mm = mm;
@@ -1311,7 +1439,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 
        ret = __zs_map_object(area, pages, off, class->size);
 out:
-       if (!class->huge)
+       if (likely(!PageHugeObject(page)))
                ret += ZS_HANDLE_SIZE;
 
        return ret;
@@ -1320,8 +1448,10 @@ EXPORT_SYMBOL_GPL(zs_map_object);
 
 void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 {
+       struct zspage *zspage;
        struct page *page;
-       unsigned long obj, obj_idx, off;
+       unsigned long obj, off;
+       unsigned int obj_idx;
 
        unsigned int class_idx;
        enum fullness_group fg;
@@ -1330,9 +1460,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 
        obj = handle_to_obj(handle);
        obj_to_location(obj, &page, &obj_idx);
-       get_zspage_mapping(get_first_page(page), &class_idx, &fg);
+       zspage = get_zspage(page);
+       get_zspage_mapping(zspage, &class_idx, &fg);
        class = pool->size_class[class_idx];
-       off = obj_idx_to_offset(page, obj_idx, class->size);
+       off = (class->size * obj_idx) & ~PAGE_MASK;
 
        area = this_cpu_ptr(&zs_map_area);
        if (off + class->size <= PAGE_SIZE)
@@ -1347,38 +1478,50 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
                __zs_unmap_object(area, pages, off, class->size);
        }
        put_cpu_var(zs_map_area);
+
+       migrate_read_unlock(zspage);
        unpin_tag(handle);
 }
 EXPORT_SYMBOL_GPL(zs_unmap_object);
 
 static unsigned long obj_malloc(struct size_class *class,
-                               struct page *first_page, unsigned long handle)
+                               struct zspage *zspage, unsigned long handle)
 {
+       int i, nr_page, offset;
        unsigned long obj;
        struct link_free *link;
 
        struct page *m_page;
-       unsigned long m_objidx, m_offset;
+       unsigned long m_offset;
        void *vaddr;
 
        handle |= OBJ_ALLOCATED_TAG;
-       obj = (unsigned long)first_page->freelist;
-       obj_to_location(obj, &m_page, &m_objidx);
-       m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+       obj = get_freeobj(zspage);
+
+       offset = obj * class->size;
+       nr_page = offset >> PAGE_SHIFT;
+       m_offset = offset & ~PAGE_MASK;
+       m_page = get_first_page(zspage);
+
+       for (i = 0; i < nr_page; i++)
+               m_page = get_next_page(m_page);
 
        vaddr = kmap_atomic(m_page);
        link = (struct link_free *)vaddr + m_offset / sizeof(*link);
-       first_page->freelist = link->next;
-       if (!class->huge)
+       set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
+       if (likely(!PageHugeObject(m_page)))
                /* record handle in the header of allocated chunk */
                link->handle = handle;
        else
-               /* record handle in first_page->private */
-               set_page_private(first_page, handle);
+               /* record the handle in page->index */
+               zspage->first_page->index = handle;
+
        kunmap_atomic(vaddr);
-       first_page->inuse++;
+       mod_zspage_inuse(zspage, 1);
        zs_stat_inc(class, OBJ_USED, 1);
 
+       obj = location_to_obj(m_page, obj);
+
        return obj;
 }
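
Because object indices are now global to the zspage, obj_malloc() finds the slot with pure arithmetic: the byte offset is index * class->size, the target subpage is offset >> PAGE_SHIFT hops down the page chain, and the in-page offset is offset & ~PAGE_MASK (the same expression zs_map_object() and obj_free() now use for 'off'). A quick standalone check of that arithmetic, assuming 4 KiB pages and a made-up class size:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long class_size = 3264;        /* an example class size */
        unsigned int obj_idx;

        for (obj_idx = 0; obj_idx < 4; obj_idx++) {
                unsigned long offset = obj_idx * class_size;
                unsigned long nr_page = offset >> PAGE_SHIFT;   /* which subpage    */
                unsigned long m_offset = offset & ~PAGE_MASK;   /* offset inside it */

                printf("obj %u -> subpage %lu, offset %lu\n",
                       obj_idx, nr_page, m_offset);
        }
        return 0;
}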
 
@@ -1387,6 +1530,7 @@ static unsigned long obj_malloc(struct size_class *class,
  * zs_malloc - Allocate block of given size from pool.
  * @pool: pool to allocate from
  * @size: size of block to allocate
+ * @gfp: gfp flags when allocating object
  *
  * On success, handle to the allocated object is returned,
  * otherwise 0.
@@ -1396,12 +1540,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 {
        unsigned long handle, obj;
        struct size_class *class;
-       struct page *first_page;
+       enum fullness_group newfg;
+       struct zspage *zspage;
 
        if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
                return 0;
 
-       handle = alloc_handle(pool, gfp);
+       handle = cache_alloc_handle(pool, gfp);
        if (!handle)
                return 0;
 
@@ -1410,29 +1555,37 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
        class = pool->size_class[get_size_class_index(size)];
 
        spin_lock(&class->lock);
-       first_page = find_get_zspage(class);
-
-       if (!first_page) {
+       zspage = find_get_zspage(class);
+       if (likely(zspage)) {
+               obj = obj_malloc(class, zspage, handle);
+               /* Now move the zspage to another fullness group, if required */
+               fix_fullness_group(class, zspage);
+               record_obj(handle, obj);
                spin_unlock(&class->lock);
-               first_page = alloc_zspage(class, gfp);
-               if (unlikely(!first_page)) {
-                       free_handle(pool, handle);
-                       return 0;
-               }
 
-               set_zspage_mapping(first_page, class->index, ZS_EMPTY);
-               atomic_long_add(class->pages_per_zspage,
-                                       &pool->pages_allocated);
+               return handle;
+       }
 
-               spin_lock(&class->lock);
-               zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
-                               class->size, class->pages_per_zspage));
+       spin_unlock(&class->lock);
+
+       zspage = alloc_zspage(pool, class, gfp);
+       if (!zspage) {
+               cache_free_handle(pool, handle);
+               return 0;
        }
 
-       obj = obj_malloc(class, first_page, handle);
-       /* Now move the zspage to another fullness group, if required */
-       fix_fullness_group(class, first_page);
+       spin_lock(&class->lock);
+       obj = obj_malloc(class, zspage, handle);
+       newfg = get_fullness_group(class, zspage);
+       insert_zspage(class, zspage, newfg);
+       set_zspage_mapping(zspage, class->index, newfg);
        record_obj(handle, obj);
+       atomic_long_add(class->pages_per_zspage,
+                               &pool->pages_allocated);
+       zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
+
+       /* We completely set up the zspage, so mark it as movable */
+       SetZsPageMovable(pool, zspage);
        spin_unlock(&class->lock);
 
        return handle;
@@ -1442,36 +1595,38 @@ EXPORT_SYMBOL_GPL(zs_malloc);
 static void obj_free(struct size_class *class, unsigned long obj)
 {
        struct link_free *link;
-       struct page *first_page, *f_page;
-       unsigned long f_objidx, f_offset;
+       struct zspage *zspage;
+       struct page *f_page;
+       unsigned long f_offset;
+       unsigned int f_objidx;
        void *vaddr;
 
        obj &= ~OBJ_ALLOCATED_TAG;
        obj_to_location(obj, &f_page, &f_objidx);
-       first_page = get_first_page(f_page);
-
-       f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
+       f_offset = (class->size * f_objidx) & ~PAGE_MASK;
+       zspage = get_zspage(f_page);
 
        vaddr = kmap_atomic(f_page);
 
        /* Insert this object in containing zspage's freelist */
        link = (struct link_free *)(vaddr + f_offset);
-       link->next = first_page->freelist;
-       if (class->huge)
-               set_page_private(first_page, 0);
+       link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
        kunmap_atomic(vaddr);
-       first_page->freelist = (void *)obj;
-       first_page->inuse--;
+       set_freeobj(zspage, f_objidx);
+       mod_zspage_inuse(zspage, -1);
        zs_stat_dec(class, OBJ_USED, 1);
 }
 
 void zs_free(struct zs_pool *pool, unsigned long handle)
 {
-       struct page *first_page, *f_page;
-       unsigned long obj, f_objidx;
+       struct zspage *zspage;
+       struct page *f_page;
+       unsigned long obj;
+       unsigned int f_objidx;
        int class_idx;
        struct size_class *class;
        enum fullness_group fullness;
+       bool isolated;
 
        if (unlikely(!handle))
                return;
@@ -1479,25 +1634,31 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
        pin_tag(handle);
        obj = handle_to_obj(handle);
        obj_to_location(obj, &f_page, &f_objidx);
-       first_page = get_first_page(f_page);
+       zspage = get_zspage(f_page);
+
+       migrate_read_lock(zspage);
 
-       get_zspage_mapping(first_page, &class_idx, &fullness);
+       get_zspage_mapping(zspage, &class_idx, &fullness);
        class = pool->size_class[class_idx];
 
        spin_lock(&class->lock);
        obj_free(class, obj);
-       fullness = fix_fullness_group(class, first_page);
-       if (fullness == ZS_EMPTY) {
-               zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
-                               class->size, class->pages_per_zspage));
-               atomic_long_sub(class->pages_per_zspage,
-                               &pool->pages_allocated);
-               free_zspage(first_page);
+       fullness = fix_fullness_group(class, zspage);
+       if (fullness != ZS_EMPTY) {
+               migrate_read_unlock(zspage);
+               goto out;
        }
+
+       isolated = is_zspage_isolated(zspage);
+       migrate_read_unlock(zspage);
+       /* If zspage is isolated, zs_page_putback will free the zspage */
+       if (likely(!isolated))
+               free_zspage(pool, class, zspage);
+out:
+
        spin_unlock(&class->lock);
        unpin_tag(handle);
-
-       free_handle(pool, handle);
+       cache_free_handle(pool, handle);
 }
 EXPORT_SYMBOL_GPL(zs_free);
 
@@ -1505,7 +1666,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
                                unsigned long src)
 {
        struct page *s_page, *d_page;
-       unsigned long s_objidx, d_objidx;
+       unsigned int s_objidx, d_objidx;
        unsigned long s_off, d_off;
        void *s_addr, *d_addr;
        int s_size, d_size, size;
@@ -1516,8 +1677,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
        obj_to_location(src, &s_page, &s_objidx);
        obj_to_location(dst, &d_page, &d_objidx);
 
-       s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
-       d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+       s_off = (class->size * s_objidx) & ~PAGE_MASK;
+       d_off = (class->size * d_objidx) & ~PAGE_MASK;
 
        if (s_off + class->size > PAGE_SIZE)
                s_size = PAGE_SIZE - s_off;
@@ -1569,19 +1730,19 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
  * return handle.
  */
 static unsigned long find_alloced_obj(struct size_class *class,
-                                       struct page *page, int index)
+                                       struct page *page, int *obj_idx)
 {
        unsigned long head;
        int offset = 0;
+       int index = *obj_idx;
        unsigned long handle = 0;
        void *addr = kmap_atomic(page);
 
-       if (!is_first_page(page))
-               offset = page->index;
+       offset = get_first_obj_offset(page);
        offset += class->size * index;
 
        while (offset < PAGE_SIZE) {
-               head = obj_to_head(class, page, addr + offset);
+               head = obj_to_head(page, addr + offset);
                if (head & OBJ_ALLOCATED_TAG) {
                        handle = head & ~OBJ_ALLOCATED_TAG;
                        if (trypin_tag(handle))
@@ -1594,18 +1755,21 @@ static unsigned long find_alloced_obj(struct size_class *class,
        }
 
        kunmap_atomic(addr);
+
+       *obj_idx = index;
+
        return handle;
 }
 
 struct zs_compact_control {
-       /* Source page for migration which could be a subpage of zspage. */
+       /* Source page for migration which could be a subpage of zspage */
        struct page *s_page;
        /* Destination page for migration which should be a first page
         * of zspage. */
        struct page *d_page;
         /* Starting object index within @s_page which is used for live object
          * in the subpage. */
-       int index;
+       int obj_idx;
 };
 
 static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1615,30 +1779,30 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
        unsigned long handle;
        struct page *s_page = cc->s_page;
        struct page *d_page = cc->d_page;
-       unsigned long index = cc->index;
+       int obj_idx = cc->obj_idx;
        int ret = 0;
 
        while (1) {
-               handle = find_alloced_obj(class, s_page, index);
+               handle = find_alloced_obj(class, s_page, &obj_idx);
                if (!handle) {
                        s_page = get_next_page(s_page);
                        if (!s_page)
                                break;
-                       index = 0;
+                       obj_idx = 0;
                        continue;
                }
 
                /* Stop if there is no more space */
-               if (zspage_full(class, d_page)) {
+               if (zspage_full(class, get_zspage(d_page))) {
                        unpin_tag(handle);
                        ret = -ENOMEM;
                        break;
                }
 
                used_obj = handle_to_obj(handle);
-               free_obj = obj_malloc(class, d_page, handle);
+               free_obj = obj_malloc(class, get_zspage(d_page), handle);
                zs_object_copy(class, free_obj, used_obj);
-               index++;
+               obj_idx++;
                /*
                 * record_obj updates handle's value to free_obj and it will
                 * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
@@ -1653,73 +1817,426 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 
        /* Remember last position in this iteration */
        cc->s_page = s_page;
-       cc->index = index;
+       cc->obj_idx = obj_idx;
 
        return ret;
 }
 
-static struct page *isolate_target_page(struct size_class *class)
+static struct zspage *isolate_zspage(struct size_class *class, bool source)
 {
        int i;
-       struct page *page;
+       struct zspage *zspage;
+       enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
 
-       for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
-               page = class->fullness_list[i];
-               if (page) {
-                       remove_zspage(class, i, page);
-                       break;
+       if (!source) {
+               fg[0] = ZS_ALMOST_FULL;
+               fg[1] = ZS_ALMOST_EMPTY;
+       }
+
+       for (i = 0; i < 2; i++) {
+               zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
+                                                       struct zspage, list);
+               if (zspage) {
+                       VM_BUG_ON(is_zspage_isolated(zspage));
+                       remove_zspage(class, zspage, fg[i]);
+                       return zspage;
                }
        }
 
-       return page;
+       return zspage;
 }
 
 /*
- * putback_zspage - add @first_page into right class's fullness list
- * @pool: target pool
+ * putback_zspage - add @zspage into right class's fullness list
  * @class: destination class
- * @first_page: target page
+ * @zspage: target page
  *
- * Return @fist_page's fullness_group
+ * Return @zspage's fullness_group
  */
-static enum fullness_group putback_zspage(struct zs_pool *pool,
-                       struct size_class *class,
-                       struct page *first_page)
+static enum fullness_group putback_zspage(struct size_class *class,
+                       struct zspage *zspage)
 {
        enum fullness_group fullness;
 
-       fullness = get_fullness_group(class, first_page);
-       insert_zspage(class, fullness, first_page);
-       set_zspage_mapping(first_page, class->index, fullness);
+       VM_BUG_ON(is_zspage_isolated(zspage));
 
-       if (fullness == ZS_EMPTY) {
-               zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
-                       class->size, class->pages_per_zspage));
-               atomic_long_sub(class->pages_per_zspage,
-                               &pool->pages_allocated);
+       fullness = get_fullness_group(class, zspage);
+       insert_zspage(class, zspage, fullness);
+       set_zspage_mapping(zspage, class->index, fullness);
+
+       return fullness;
+}
+
+#ifdef CONFIG_COMPACTION
+static struct dentry *zs_mount(struct file_system_type *fs_type,
+                               int flags, const char *dev_name, void *data)
+{
+       static const struct dentry_operations ops = {
+               .d_dname = simple_dname,
+       };
 
-               free_zspage(first_page);
+       return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+}
+
+static struct file_system_type zsmalloc_fs = {
+       .name           = "zsmalloc",
+       .mount          = zs_mount,
+       .kill_sb        = kill_anon_super,
+};
+
+static int zsmalloc_mount(void)
+{
+       int ret = 0;
+
+       zsmalloc_mnt = kern_mount(&zsmalloc_fs);
+       if (IS_ERR(zsmalloc_mnt))
+               ret = PTR_ERR(zsmalloc_mnt);
+
+       return ret;
+}
+
+static void zsmalloc_unmount(void)
+{
+       kern_unmount(zsmalloc_mnt);
+}
+
+static void migrate_lock_init(struct zspage *zspage)
+{
+       rwlock_init(&zspage->lock);
+}
+
+static void migrate_read_lock(struct zspage *zspage)
+{
+       read_lock(&zspage->lock);
+}
+
+static void migrate_read_unlock(struct zspage *zspage)
+{
+       read_unlock(&zspage->lock);
+}
+
+static void migrate_write_lock(struct zspage *zspage)
+{
+       write_lock(&zspage->lock);
+}
+
+static void migrate_write_unlock(struct zspage *zspage)
+{
+       write_unlock(&zspage->lock);
+}
+
+/* Number of isolated subpages for *page migration* in this zspage */
+static void inc_zspage_isolation(struct zspage *zspage)
+{
+       zspage->isolated++;
+}
+
+static void dec_zspage_isolation(struct zspage *zspage)
+{
+       zspage->isolated--;
+}
+
+static void replace_sub_page(struct size_class *class, struct zspage *zspage,
+                               struct page *newpage, struct page *oldpage)
+{
+       struct page *page;
+       struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
+       int idx = 0;
+
+       page = get_first_page(zspage);
+       do {
+               if (page == oldpage)
+                       pages[idx] = newpage;
+               else
+                       pages[idx] = page;
+               idx++;
+       } while ((page = get_next_page(page)) != NULL);
+
+       create_page_chain(class, zspage, pages);
+       set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
+       if (unlikely(PageHugeObject(oldpage)))
+               newpage->index = oldpage->index;
+       __SetPageMovable(newpage, page_mapping(oldpage));
+}
+
+bool zs_page_isolate(struct page *page, isolate_mode_t mode)
+{
+       struct zs_pool *pool;
+       struct size_class *class;
+       int class_idx;
+       enum fullness_group fullness;
+       struct zspage *zspage;
+       struct address_space *mapping;
+
+       /*
+        * Page is locked so the zspage cannot be destroyed. For details, look at
+        * lock_zspage in free_zspage.
+        */
+       VM_BUG_ON_PAGE(!PageMovable(page), page);
+       VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+       zspage = get_zspage(page);
+
+       /*
+        * Without class lock, fullness could be stale while class_idx is okay
+        * because class_idx is constant unless page is freed so we should get
+        * fullness again under class lock.
+        */
+       get_zspage_mapping(zspage, &class_idx, &fullness);
+       mapping = page_mapping(page);
+       pool = mapping->private_data;
+       class = pool->size_class[class_idx];
+
+       spin_lock(&class->lock);
+       if (get_zspage_inuse(zspage) == 0) {
+               spin_unlock(&class->lock);
+               return false;
        }
 
-       return fullness;
+       /* zspage is isolated for object migration */
+       if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+               spin_unlock(&class->lock);
+               return false;
+       }
+
+       /*
+        * If this is the first isolation of the zspage, take it off its
+        * size_class fullness list to prevent further object allocation
+        * from it.
+        */
+       if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
+               get_zspage_mapping(zspage, &class_idx, &fullness);
+               remove_zspage(class, zspage, fullness);
+       }
+
+       inc_zspage_isolation(zspage);
+       spin_unlock(&class->lock);
+
+       return true;
+}
+
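+/*
+ * address_space_operations->migratepage callback: pin every allocated
+ * object in @page, copy the page contents to @newpage, rewrite the live
+ * handles to point at the new location, swap the subpage in the zspage's
+ * chain, and finally unpin everything again.
+ */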
+int zs_page_migrate(struct address_space *mapping, struct page *newpage,
+               struct page *page, enum migrate_mode mode)
+{
+       struct zs_pool *pool;
+       struct size_class *class;
+       int class_idx;
+       enum fullness_group fullness;
+       struct zspage *zspage;
+       struct page *dummy;
+       void *s_addr, *d_addr, *addr;
+       int offset, pos;
+       unsigned long handle, head;
+       unsigned long old_obj, new_obj;
+       unsigned int obj_idx;
+       int ret = -EAGAIN;
+
+       VM_BUG_ON_PAGE(!PageMovable(page), page);
+       VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+       zspage = get_zspage(page);
+
+       /* Concurrent compactor cannot migrate any subpage in zspage */
+       migrate_write_lock(zspage);
+       get_zspage_mapping(zspage, &class_idx, &fullness);
+       pool = mapping->private_data;
+       class = pool->size_class[class_idx];
+       offset = get_first_obj_offset(page);
+
+       spin_lock(&class->lock);
+       if (!get_zspage_inuse(zspage)) {
+               ret = -EBUSY;
+               goto unlock_class;
+       }
+
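+       /*
+        * Try to pin every allocated object in this page. If any pin
+        * fails, unwind and return -EAGAIN so migration retries later.
+        */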
+       pos = offset;
+       s_addr = kmap_atomic(page);
+       while (pos < PAGE_SIZE) {
+               head = obj_to_head(page, s_addr + pos);
+               if (head & OBJ_ALLOCATED_TAG) {
+                       handle = head & ~OBJ_ALLOCATED_TAG;
+                       if (!trypin_tag(handle))
+                               goto unpin_objects;
+               }
+               pos += class->size;
+       }
+
+       /*
+        * All allocated objects in this page are now pinned, so no user can
+        * access them; it is safe to copy the page contents.
+        */
+       d_addr = kmap_atomic(newpage);
+       memcpy(d_addr, s_addr, PAGE_SIZE);
+       kunmap_atomic(d_addr);
+
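+       /*
+        * Rewrite each live handle so it points at the object's new
+        * location in @newpage; keep the PIN bit set so the unpin loop
+        * below releases it.
+        */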
+       for (addr = s_addr + offset; addr < s_addr + pos;
+                                       addr += class->size) {
+               head = obj_to_head(page, addr);
+               if (head & OBJ_ALLOCATED_TAG) {
+                       handle = head & ~OBJ_ALLOCATED_TAG;
+                       if (!testpin_tag(handle))
+                               BUG();
+
+                       old_obj = handle_to_obj(handle);
+                       obj_to_location(old_obj, &dummy, &obj_idx);
+                       new_obj = (unsigned long)location_to_obj(newpage,
+                                                               obj_idx);
+                       new_obj |= BIT(HANDLE_PIN_BIT);
+                       record_obj(handle, new_obj);
+               }
+       }
+
+       replace_sub_page(class, zspage, newpage, page);
+       get_page(newpage);
+
+       dec_zspage_isolation(zspage);
+
+       /*
+        * Page migration is done, so put the zspage back on its fullness
+        * list if @page was the last isolated subpage of the zspage.
+        */
+       if (!is_zspage_isolated(zspage))
+               putback_zspage(class, zspage);
+
+       reset_page(page);
+       put_page(page);
+       page = newpage;
+
+       ret = MIGRATEPAGE_SUCCESS;
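+       /*
+        * Both the success and the failure path end up here: unpin every
+        * object that was pinned above. On failure, @pos marks how far the
+        * pinning loop got.
+        */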
+unpin_objects:
+       for (addr = s_addr + offset; addr < s_addr + pos;
+                                               addr += class->size) {
+               head = obj_to_head(page, addr);
+               if (head & OBJ_ALLOCATED_TAG) {
+                       handle = head & ~OBJ_ALLOCATED_TAG;
+                       if (!testpin_tag(handle))
+                               BUG();
+                       unpin_tag(handle);
+               }
+       }
+       kunmap_atomic(s_addr);
+unlock_class:
+       spin_unlock(&class->lock);
+       migrate_write_unlock(zspage);
+
+       return ret;
 }
 
-static struct page *isolate_source_page(struct size_class *class)
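+/*
+ * address_space_operations->putback_page callback: migration gave up on
+ * @page, so drop its isolation count and, once no subpage is isolated
+ * any more, return the zspage to its fullness list (or defer freeing it
+ * if it became empty).
+ */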
+void zs_page_putback(struct page *page)
+{
+       struct zs_pool *pool;
+       struct size_class *class;
+       int class_idx;
+       enum fullness_group fg;
+       struct address_space *mapping;
+       struct zspage *zspage;
+
+       VM_BUG_ON_PAGE(!PageMovable(page), page);
+       VM_BUG_ON_PAGE(!PageIsolated(page), page);
+
+       zspage = get_zspage(page);
+       get_zspage_mapping(zspage, &class_idx, &fg);
+       mapping = page_mapping(page);
+       pool = mapping->private_data;
+       class = pool->size_class[class_idx];
+
+       spin_lock(&class->lock);
+       dec_zspage_isolation(zspage);
+       if (!is_zspage_isolated(zspage)) {
+               fg = putback_zspage(class, zspage);
+               /*
+                * The page lock is still held, so the zspage cannot be freed
+                * immediately; defer it to the pool's free_work.
+                */
+               if (fg == ZS_EMPTY)
+                       schedule_work(&pool->free_work);
+       }
+       spin_unlock(&class->lock);
+}
+
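+/*
+ * These callbacks hook zsmalloc pages into the generic page migration
+ * path; they are installed on each pool's anonymous inode mapping by
+ * zs_register_migration().
+ */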
+const struct address_space_operations zsmalloc_aops = {
+       .isolate_page = zs_page_isolate,
+       .migratepage = zs_page_migrate,
+       .putback_page = zs_page_putback,
+};
+
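+/*
+ * Give the pool an anonymous inode on the zsmalloc mount. Its mapping
+ * carries the migration a_ops and a back-pointer to the pool so the
+ * callbacks above can get from a struct page to its zs_pool.
+ */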
+static int zs_register_migration(struct zs_pool *pool)
+{
+       pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
+       if (IS_ERR(pool->inode)) {
+               pool->inode = NULL;
+               return 1;
+       }
+
+       pool->inode->i_mapping->private_data = pool;
+       pool->inode->i_mapping->a_ops = &zsmalloc_aops;
+       return 0;
+}
+
+static void zs_unregister_migration(struct zs_pool *pool)
+{
+       flush_work(&pool->free_work);
+       iput(pool->inode);
+}
+
+/*
+ * Caller should hold the page lock of all pages in the zspage.
+ * We cannot use zspage metadata here.
+ */
+static void async_free_zspage(struct work_struct *work)
 {
        int i;
-       struct page *page = NULL;
+       struct size_class *class;
+       unsigned int class_idx;
+       enum fullness_group fullness;
+       struct zspage *zspage, *tmp;
+       LIST_HEAD(free_pages);
+       struct zs_pool *pool = container_of(work, struct zs_pool,
+                                       free_work);
 
-       for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
-               page = class->fullness_list[i];
-               if (!page)
+       for (i = 0; i < zs_size_classes; i++) {
+               class = pool->size_class[i];
+               if (class->index != i)
                        continue;
 
-               remove_zspage(class, i, page);
-               break;
+               spin_lock(&class->lock);
+               list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
+               spin_unlock(&class->lock);
+       }
+
+       list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
+               list_del(&zspage->list);
+               lock_zspage(zspage);
+
+               get_zspage_mapping(zspage, &class_idx, &fullness);
+               VM_BUG_ON(fullness != ZS_EMPTY);
+               class = pool->size_class[class_idx];
+               spin_lock(&class->lock);
+               __free_zspage(pool, pool->size_class[class_idx], zspage);
+               spin_unlock(&class->lock);
        }
+}
+
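+/*
+ * An empty zspage cannot always be freed on the spot (its pages may be
+ * locked, e.g. for migration); callers can hand it off to the pool's
+ * free_work and let async_free_zspage() reap it later.
+ */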
+static void kick_deferred_free(struct zs_pool *pool)
+{
+       schedule_work(&pool->free_work);
+}
+
+static void init_deferred_free(struct zs_pool *pool)
+{
+       INIT_WORK(&pool->free_work, async_free_zspage);
+}
+
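+/*
+ * Mark every subpage of a zspage as movable so the migration core will
+ * consider it. __SetPageMovable() requires the page lock; the trylock is
+ * expected to succeed (hence only a WARN_ON), since callers use this on
+ * zspages that are not yet visible to migration.
+ */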
+static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
+{
+       struct page *page = get_first_page(zspage);
 
-       return page;
+       do {
+               WARN_ON(!trylock_page(page));
+               __SetPageMovable(page, pool->inode->i_mapping);
+               unlock_page(page);
+       } while ((page = get_next_page(page)) != NULL);
 }
+#endif
 
 /*
  *
@@ -1736,8 +2253,7 @@ static unsigned long zs_can_compact(struct size_class *class)
                return 0;
 
        obj_wasted = obj_allocated - obj_used;
-       obj_wasted /= get_maxobj_per_zspage(class->size,
-                       class->pages_per_zspage);
+       obj_wasted /= class->objs_per_zspage;
 
        return obj_wasted * class->pages_per_zspage;
 }
@@ -1745,20 +2261,20 @@ static unsigned long zs_can_compact(struct size_class *class)
 static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 {
        struct zs_compact_control cc;
-       struct page *src_page;
-       struct page *dst_page = NULL;
+       struct zspage *src_zspage;
+       struct zspage *dst_zspage = NULL;
 
        spin_lock(&class->lock);
-       while ((src_page = isolate_source_page(class))) {
+       while ((src_zspage = isolate_zspage(class, true))) {
 
                if (!zs_can_compact(class))
                        break;
 
-               cc.index = 0;
-               cc.s_page = src_page;
+               cc.obj_idx = 0;
+               cc.s_page = get_first_page(src_zspage);
 
-               while ((dst_page = isolate_target_page(class))) {
-                       cc.d_page = dst_page;
+               while ((dst_zspage = isolate_zspage(class, false))) {
+                       cc.d_page = get_first_page(dst_zspage);
                        /*
                         * If there is no more space in dst_page, resched
                         * and see if anyone had allocated another zspage.
@@ -1766,23 +2282,25 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
                        if (!migrate_zspage(pool, class, &cc))
                                break;
 
-                       putback_zspage(pool, class, dst_page);
+                       putback_zspage(class, dst_zspage);
                }
 
                /* Stop if we couldn't find slot */
-               if (dst_page == NULL)
+               if (dst_zspage == NULL)
                        break;
 
-               putback_zspage(pool, class, dst_page);
-               if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+               putback_zspage(class, dst_zspage);
+               if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
+                       free_zspage(pool, class, src_zspage);
                        pool->stats.pages_compacted += class->pages_per_zspage;
+               }
                spin_unlock(&class->lock);
                cond_resched();
                spin_lock(&class->lock);
        }
 
-       if (src_page)
-               putback_zspage(pool, class, src_page);
+       if (src_zspage)
+               putback_zspage(class, src_zspage);
 
        spin_unlock(&class->lock);
 }
@@ -1871,7 +2389,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
 
 /**
  * zs_create_pool - Creates an allocation pool to work from.
- * @flags: allocation flags used to allocate pool metadata
+ * @name: name of the pool to be created
  *
  * This function must be called before anything when using
  * the zsmalloc allocator.
@@ -1889,6 +2407,7 @@ struct zs_pool *zs_create_pool(const char *name)
        if (!pool)
                return NULL;
 
+       init_deferred_free(pool);
        pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
                        GFP_KERNEL);
        if (!pool->size_class) {
@@ -1900,7 +2419,7 @@ struct zs_pool *zs_create_pool(const char *name)
        if (!pool->name)
                goto err;
 
-       if (create_handle_cache(pool))
+       if (create_cache(pool))
                goto err;
 
        /*
@@ -1910,12 +2429,15 @@ struct zs_pool *zs_create_pool(const char *name)
        for (i = zs_size_classes - 1; i >= 0; i--) {
                int size;
                int pages_per_zspage;
+               int objs_per_zspage;
                struct size_class *class;
+               int fullness = 0;
 
                size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
                if (size > ZS_MAX_ALLOC_SIZE)
                        size = ZS_MAX_ALLOC_SIZE;
                pages_per_zspage = get_pages_per_zspage(size);
+               objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
 
                /*
                 * size_class is used for normal zsmalloc operation such
@@ -1927,7 +2449,7 @@ struct zs_pool *zs_create_pool(const char *name)
                 * previous size_class if possible.
                 */
                if (prev_class) {
-                       if (can_merge(prev_class, size, pages_per_zspage)) {
+                       if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) {
                                pool->size_class[i] = prev_class;
                                continue;
                        }
@@ -1940,12 +2462,12 @@ struct zs_pool *zs_create_pool(const char *name)
                class->size = size;
                class->index = i;
                class->pages_per_zspage = pages_per_zspage;
-               class->objs_per_zspage = class->pages_per_zspage *
-                                               PAGE_SIZE / class->size;
-               if (pages_per_zspage == 1 && class->objs_per_zspage == 1)
-                       class->huge = true;
+               class->objs_per_zspage = objs_per_zspage;
                spin_lock_init(&class->lock);
                pool->size_class[i] = class;
+               for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
+                                                       fullness++)
+                       INIT_LIST_HEAD(&class->fullness_list[fullness]);
 
                prev_class = class;
        }
@@ -1953,6 +2475,9 @@ struct zs_pool *zs_create_pool(const char *name)
        /* debug only, don't abort if it fails */
        zs_pool_stat_create(pool, name);
 
+       if (zs_register_migration(pool))
+               goto err;
+
        /*
         * Not critical, we still can use the pool
         * and user can trigger compaction manually.
@@ -1972,6 +2497,7 @@ void zs_destroy_pool(struct zs_pool *pool)
        int i;
 
        zs_unregister_shrinker(pool);
+       zs_unregister_migration(pool);
        zs_pool_stat_destroy(pool);
 
        for (i = 0; i < zs_size_classes; i++) {
@@ -1984,8 +2510,8 @@ void zs_destroy_pool(struct zs_pool *pool)
                if (class->index != i)
                        continue;
 
-               for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
-                       if (class->fullness_list[fg]) {
+               for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
+                       if (!list_empty(&class->fullness_list[fg])) {
                                pr_info("Freeing non-empty class with size %db, fullness group %d\n",
                                        class->size, fg);
                        }
@@ -1993,7 +2519,7 @@ void zs_destroy_pool(struct zs_pool *pool)
                kfree(class);
        }
 
-       destroy_handle_cache(pool);
+       destroy_cache(pool);
        kfree(pool->size_class);
        kfree(pool->name);
        kfree(pool);
@@ -2002,7 +2528,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
 
 static int __init zs_init(void)
 {
-       int ret = zs_register_cpu_notifier();
+       int ret;
+
+       ret = zsmalloc_mount();
+       if (ret)
+               goto out;
+
+       ret = zs_register_cpu_notifier();
 
        if (ret)
                goto notifier_fail;
@@ -2019,7 +2551,8 @@ static int __init zs_init(void)
 
 notifier_fail:
        zs_unregister_cpu_notifier();
-
+       zsmalloc_unmount();
+out:
        return ret;
 }
 
@@ -2028,6 +2561,7 @@ static void __exit zs_exit(void)
 #ifdef CONFIG_ZPOOL
        zpool_unregister_driver(&zs_zpool_driver);
 #endif
+       zsmalloc_unmount();
        zs_unregister_cpu_notifier();
 
        zs_stat_exit();