tile: fix typos in comment in arch/tile/kernel/unaligned.c
[cascardo/linux.git] / mm / swapfile.c
index 6cf2e60..3963fc2 100644 (file)
@@ -175,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si,
        }
 }
 
-static int wait_for_discard(void *word)
+#define SWAPFILE_CLUSTER       256
+#define LATENCY_LIMIT          256
+
+static inline void cluster_set_flag(struct swap_cluster_info *info,
+       unsigned int flag)
 {
-       schedule();
-       return 0;
+       info->flags = flag;
 }
 
-#define SWAPFILE_CLUSTER       256
-#define LATENCY_LIMIT          256
+static inline unsigned int cluster_count(struct swap_cluster_info *info)
+{
+       return info->data;
+}
+
+static inline void cluster_set_count(struct swap_cluster_info *info,
+                                    unsigned int c)
+{
+       info->data = c;
+}
+
+static inline void cluster_set_count_flag(struct swap_cluster_info *info,
+                                        unsigned int c, unsigned int f)
+{
+       info->flags = f;
+       info->data = c;
+}
+
+static inline unsigned int cluster_next(struct swap_cluster_info *info)
+{
+       return info->data;
+}
+
+static inline void cluster_set_next(struct swap_cluster_info *info,
+                                   unsigned int n)
+{
+       info->data = n;
+}
+
+static inline void cluster_set_next_flag(struct swap_cluster_info *info,
+                                        unsigned int n, unsigned int f)
+{
+       info->flags = f;
+       info->data = n;
+}
+
+static inline bool cluster_is_free(struct swap_cluster_info *info)
+{
+       return info->flags & CLUSTER_FLAG_FREE;
+}
+
+static inline bool cluster_is_null(struct swap_cluster_info *info)
+{
+       return info->flags & CLUSTER_FLAG_NEXT_NULL;
+}
+
+static inline void cluster_set_null(struct swap_cluster_info *info)
+{
+       info->flags = CLUSTER_FLAG_NEXT_NULL;
+       info->data = 0;
+}
+
+/* Add a cluster to the discard list and schedule the discard work */
+static void swap_cluster_schedule_discard(struct swap_info_struct *si,
+               unsigned int idx)
+{
+       /*
+        * If scan_swap_map() can't find a free cluster, it will check
+        * si->swap_map directly. To make sure the cluster being discarded
+        * isn't taken by scan_swap_map(), mark its swap entries bad
+        * (occupied). They will be cleared after the discard.
+        */
+       memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+                       SWAP_MAP_BAD, SWAPFILE_CLUSTER);
+
+       if (cluster_is_null(&si->discard_cluster_head)) {
+               cluster_set_next_flag(&si->discard_cluster_head,
+                                               idx, 0);
+               cluster_set_next_flag(&si->discard_cluster_tail,
+                                               idx, 0);
+       } else {
+               unsigned int tail = cluster_next(&si->discard_cluster_tail);
+               cluster_set_next(&si->cluster_info[tail], idx);
+               cluster_set_next_flag(&si->discard_cluster_tail,
+                                               idx, 0);
+       }
+
+       schedule_work(&si->discard_work);
+}
+
+/*
+ * Actually perform the discard. After a cluster's discard has finished, the
+ * cluster is added to the free cluster list. The caller must hold si->lock.
+ */
+static void swap_do_scheduled_discard(struct swap_info_struct *si)
+{
+       struct swap_cluster_info *info;
+       unsigned int idx;
+
+       info = si->cluster_info;
+
+       while (!cluster_is_null(&si->discard_cluster_head)) {
+               idx = cluster_next(&si->discard_cluster_head);
+
+               cluster_set_next_flag(&si->discard_cluster_head,
+                                               cluster_next(&info[idx]), 0);
+               if (cluster_next(&si->discard_cluster_tail) == idx) {
+                       cluster_set_null(&si->discard_cluster_head);
+                       cluster_set_null(&si->discard_cluster_tail);
+               }
+               spin_unlock(&si->lock);
+
+               discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
+                               SWAPFILE_CLUSTER);
+
+               spin_lock(&si->lock);
+               cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
+               if (cluster_is_null(&si->free_cluster_head)) {
+                       cluster_set_next_flag(&si->free_cluster_head,
+                                               idx, 0);
+                       cluster_set_next_flag(&si->free_cluster_tail,
+                                               idx, 0);
+               } else {
+                       unsigned int tail;
+
+                       tail = cluster_next(&si->free_cluster_tail);
+                       cluster_set_next(&info[tail], idx);
+                       cluster_set_next_flag(&si->free_cluster_tail,
+                                               idx, 0);
+               }
+               memset(si->swap_map + idx * SWAPFILE_CLUSTER,
+                               0, SWAPFILE_CLUSTER);
+       }
+}
+
+static void swap_discard_work(struct work_struct *work)
+{
+       struct swap_info_struct *si;
+
+       si = container_of(work, struct swap_info_struct, discard_work);
+
+       spin_lock(&si->lock);
+       swap_do_scheduled_discard(si);
+       spin_unlock(&si->lock);
+}
+
+/*
+ * The cluster corresponding to page_nr will be used. If it is free, it is
+ * removed from the free cluster list; its usage counter is incremented.
+ */
+static void inc_cluster_info_page(struct swap_info_struct *p,
+       struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+       unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+       if (!cluster_info)
+               return;
+       if (cluster_is_free(&cluster_info[idx])) {
+               VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
+               cluster_set_next_flag(&p->free_cluster_head,
+                       cluster_next(&cluster_info[idx]), 0);
+               if (cluster_next(&p->free_cluster_tail) == idx) {
+                       cluster_set_null(&p->free_cluster_tail);
+                       cluster_set_null(&p->free_cluster_head);
+               }
+               cluster_set_count_flag(&cluster_info[idx], 0, 0);
+       }
+
+       VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
+       cluster_set_count(&cluster_info[idx],
+               cluster_count(&cluster_info[idx]) + 1);
+}
+
+/*
+ * Drop one usage from the cluster corresponding to page_nr. If the usage
+ * counter becomes 0, meaning no page in the cluster is in use, we can
+ * optionally discard the cluster and add it to the free cluster list.
+ */
+static void dec_cluster_info_page(struct swap_info_struct *p,
+       struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+       unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+
+       if (!cluster_info)
+               return;
+
+       VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
+       cluster_set_count(&cluster_info[idx],
+               cluster_count(&cluster_info[idx]) - 1);
+
+       if (cluster_count(&cluster_info[idx]) == 0) {
+               /*
+                * If the swap is discardable, schedule a discard of the
+                * cluster instead of freeing it immediately. The cluster will
+                * be freed after the discard.
+                */
+               if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
+                                (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
+                       swap_cluster_schedule_discard(p, idx);
+                       return;
+               }
+
+               cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+               if (cluster_is_null(&p->free_cluster_head)) {
+                       cluster_set_next_flag(&p->free_cluster_head, idx, 0);
+                       cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
+               } else {
+                       unsigned int tail = cluster_next(&p->free_cluster_tail);
+                       cluster_set_next(&cluster_info[tail], idx);
+                       cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
+               }
+       }
+}
+
+/*
+ * It's possible for scan_swap_map() to pick a free cluster from the middle
+ * of the free cluster list. Reject such use to avoid list corruption.
+ */
+static bool
+scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
+       unsigned long offset)
+{
+       struct percpu_cluster *percpu_cluster;
+       bool conflict;
+
+       offset /= SWAPFILE_CLUSTER;
+       conflict = !cluster_is_null(&si->free_cluster_head) &&
+               offset != cluster_next(&si->free_cluster_head) &&
+               cluster_is_free(&si->cluster_info[offset]);
+
+       if (!conflict)
+               return false;
+
+       percpu_cluster = this_cpu_ptr(si->percpu_cluster);
+       cluster_set_null(&percpu_cluster->index);
+       return true;
+}
+
+/*
+ * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
+ * might involve allocating a new cluster for current CPU too.
+ */
+static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
+       unsigned long *offset, unsigned long *scan_base)
+{
+       struct percpu_cluster *cluster;
+       bool found_free;
+       unsigned long tmp;
+
+new_cluster:
+       cluster = this_cpu_ptr(si->percpu_cluster);
+       if (cluster_is_null(&cluster->index)) {
+               if (!cluster_is_null(&si->free_cluster_head)) {
+                       cluster->index = si->free_cluster_head;
+                       cluster->next = cluster_next(&cluster->index) *
+                                       SWAPFILE_CLUSTER;
+               } else if (!cluster_is_null(&si->discard_cluster_head)) {
+                       /*
+                        * We have no free cluster, but some clusters are queued
+                        * for discard; do the discard now and reclaim them.
+                        */
+                       swap_do_scheduled_discard(si);
+                       *scan_base = *offset = si->cluster_next;
+                       goto new_cluster;
+               } else
+                       return;
+       }
+
+       found_free = false;
+
+       /*
+        * Other CPUs can use our cluster if they can't find a free cluster;
+        * check whether there is still a free entry in the cluster.
+        */
+       tmp = cluster->next;
+       while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
+              SWAPFILE_CLUSTER) {
+               if (!si->swap_map[tmp]) {
+                       found_free = true;
+                       break;
+               }
+               tmp++;
+       }
+       if (!found_free) {
+               cluster_set_null(&cluster->index);
+               goto new_cluster;
+       }
+       cluster->next = tmp + 1;
+       *offset = tmp;
+       *scan_base = tmp;
+}
 
 static unsigned long scan_swap_map(struct swap_info_struct *si,
                                   unsigned char usage)
@@ -191,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
        unsigned long scan_base;
        unsigned long last_in_cluster = 0;
        int latency_ration = LATENCY_LIMIT;
-       int found_free_cluster = 0;
 
        /*
         * We try to cluster swap pages by allocating them sequentially
@@ -207,24 +488,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
        si->flags += SWP_SCANNING;
        scan_base = offset = si->cluster_next;
 
+       /* SSD algorithm */
+       if (si->cluster_info) {
+               scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+               goto checks;
+       }
+
        if (unlikely(!si->cluster_nr--)) {
                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
                        goto checks;
                }
-               if (si->flags & SWP_PAGE_DISCARD) {
-                       /*
-                        * Start range check on racing allocations, in case
-                        * they overlap the cluster we eventually decide on
-                        * (we scan without swap_lock to allow preemption).
-                        * It's hardly conceivable that cluster_nr could be
-                        * wrapped during our scan, but don't depend on it.
-                        */
-                       if (si->lowest_alloc)
-                               goto checks;
-                       si->lowest_alloc = si->max;
-                       si->highest_alloc = 0;
-               }
+
                spin_unlock(&si->lock);
 
                /*
@@ -248,7 +523,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
-                               found_free_cluster = 1;
                                goto checks;
                        }
                        if (unlikely(--latency_ration < 0)) {
@@ -269,7 +543,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                                offset -= SWAPFILE_CLUSTER - 1;
                                si->cluster_next = offset;
                                si->cluster_nr = SWAPFILE_CLUSTER - 1;
-                               found_free_cluster = 1;
                                goto checks;
                        }
                        if (unlikely(--latency_ration < 0)) {
@@ -281,10 +554,13 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
                offset = scan_base;
                spin_lock(&si->lock);
                si->cluster_nr = SWAPFILE_CLUSTER - 1;
-               si->lowest_alloc = 0;
        }
 
 checks:
+       if (si->cluster_info) {
+               while (scan_swap_map_ssd_cluster_conflict(si, offset))
+                       scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
+       }
        if (!(si->flags & SWP_WRITEOK))
                goto no_page;
        if (!si->highest_bit)
@@ -317,62 +593,10 @@ checks:
                si->highest_bit = 0;
        }
        si->swap_map[offset] = usage;
+       inc_cluster_info_page(si, si->cluster_info, offset);
        si->cluster_next = offset + 1;
        si->flags -= SWP_SCANNING;
 
-       if (si->lowest_alloc) {
-               /*
-                * Only set when SWP_PAGE_DISCARD, and there's a scan
-                * for a free cluster in progress or just completed.
-                */
-               if (found_free_cluster) {
-                       /*
-                        * To optimize wear-levelling, discard the
-                        * old data of the cluster, taking care not to
-                        * discard any of its pages that have already
-                        * been allocated by racing tasks (offset has
-                        * already stepped over any at the beginning).
-                        */
-                       if (offset < si->highest_alloc &&
-                           si->lowest_alloc <= last_in_cluster)
-                               last_in_cluster = si->lowest_alloc - 1;
-                       si->flags |= SWP_DISCARDING;
-                       spin_unlock(&si->lock);
-
-                       if (offset < last_in_cluster)
-                               discard_swap_cluster(si, offset,
-                                       last_in_cluster - offset + 1);
-
-                       spin_lock(&si->lock);
-                       si->lowest_alloc = 0;
-                       si->flags &= ~SWP_DISCARDING;
-
-                       smp_mb();       /* wake_up_bit advises this */
-                       wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
-
-               } else if (si->flags & SWP_DISCARDING) {
-                       /*
-                        * Delay using pages allocated by racing tasks
-                        * until the whole discard has been issued. We
-                        * could defer that delay until swap_writepage,
-                        * but it's easier to keep this self-contained.
-                        */
-                       spin_unlock(&si->lock);
-                       wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
-                               wait_for_discard, TASK_UNINTERRUPTIBLE);
-                       spin_lock(&si->lock);
-               } else {
-                       /*
-                        * Note pages allocated by racing tasks while
-                        * scan for a free cluster is in progress, so
-                        * that its final discard can exclude them.
-                        */
-                       if (offset < si->lowest_alloc)
-                               si->lowest_alloc = offset;
-                       if (offset > si->highest_alloc)
-                               si->highest_alloc = offset;
-               }
-       }
        return offset;
 
 scan:
@@ -527,16 +751,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
        return p;
 
 bad_free:
-       printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
+       pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
        goto out;
 bad_offset:
-       printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
+       pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
        goto out;
 bad_device:
-       printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
+       pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
        goto out;
 bad_nofile:
-       printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
+       pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
 out:
        return NULL;
 }
@@ -600,6 +824,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 
        /* free if no reference */
        if (!usage) {
+               dec_cluster_info_page(p, p->cluster_info, offset);
                if (offset < p->lowest_bit)
                        p->lowest_bit = offset;
                if (offset > p->highest_bit)
@@ -1107,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                        else
                                continue;
                }
-               count = si->swap_map[i];
+               count = ACCESS_ONCE(si->swap_map[i]);
                if (count && swap_count(count) != SWAP_MAP_BAD)
                        break;
        }
@@ -1127,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
 {
        struct swap_info_struct *si = swap_info[type];
        struct mm_struct *start_mm;
-       unsigned char *swap_map;
+       volatile unsigned char *swap_map; /* swap_map is accessed without
+                                          * locking. Mark it as volatile
+                                          * to prevent the compiler from
+                                          * doing something odd.
+                                          */
        unsigned char swcount;
        struct page *page;
        swp_entry_t entry;
@@ -1178,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap,
                         * reused since sys_swapoff() already disabled
                         * allocation from here, or alloc_page() failed.
                         */
-                       if (!*swap_map)
+                       swcount = *swap_map;
+                       /*
+                        * We don't hold the lock here, so the swap entry
+                        * could be SWAP_MAP_BAD (when the cluster is being
+                        * discarded). Instead of failing, we can just skip
+                        * the swap entry because swapoff will wait for the
+                        * discard to finish anyway.
+                        */
+                       if (!swcount || swcount == SWAP_MAP_BAD)
                                continue;
                        retval = -ENOMEM;
                        break;
@@ -1524,7 +1761,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 }
 
 static void _enable_swap_info(struct swap_info_struct *p, int prio,
-                               unsigned char *swap_map)
+                               unsigned char *swap_map,
+                               struct swap_cluster_info *cluster_info)
 {
        int i, prev;
 
@@ -1533,6 +1771,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
        else
                p->prio = --least_priority;
        p->swap_map = swap_map;
+       p->cluster_info = cluster_info;
        p->flags |= SWP_WRITEOK;
        atomic_long_add(p->pages, &nr_swap_pages);
        total_swap_pages += p->pages;
@@ -1553,12 +1792,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
                                unsigned char *swap_map,
+                               struct swap_cluster_info *cluster_info,
                                unsigned long *frontswap_map)
 {
        frontswap_init(p->type, frontswap_map);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
-        _enable_swap_info(p, prio, swap_map);
+        _enable_swap_info(p, prio, swap_map, cluster_info);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
 }
@@ -1567,7 +1807,7 @@ static void reinsert_swap_info(struct swap_info_struct *p)
 {
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
-       _enable_swap_info(p, p->prio, p->swap_map);
+       _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
 }
@@ -1576,6 +1816,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
        struct swap_info_struct *p = NULL;
        unsigned char *swap_map;
+       struct swap_cluster_info *cluster_info;
        unsigned long *frontswap_map;
        struct file *swap_file, *victim;
        struct address_space *mapping;
@@ -1651,6 +1892,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                goto out_dput;
        }
 
+       flush_work(&p->discard_work);
+
        destroy_swap_extents(p);
        if (p->flags & SWP_CONTINUED)
                free_swap_count_continuations(p);
@@ -1675,6 +1918,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        p->max = 0;
        swap_map = p->swap_map;
        p->swap_map = NULL;
+       cluster_info = p->cluster_info;
+       p->cluster_info = NULL;
        p->flags = 0;
        frontswap_map = frontswap_map_get(p);
        frontswap_map_set(p, NULL);
@@ -1682,7 +1927,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        spin_unlock(&swap_lock);
        frontswap_invalidate_area(type);
        mutex_unlock(&swapon_mutex);
+       free_percpu(p->percpu_cluster);
+       p->percpu_cluster = NULL;
        vfree(swap_map);
+       vfree(cluster_info);
        vfree(frontswap_map);
        /* Destroy swap account informatin */
        swap_cgroup_swapoff(type);
@@ -1926,9 +2174,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
        int i;
        unsigned long maxpages;
        unsigned long swapfilepages;
+       unsigned long last_page;
 
        if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
-               printk(KERN_ERR "Unable to find swap-space signature\n");
+               pr_err("Unable to find swap-space signature\n");
                return 0;
        }
 
@@ -1942,9 +2191,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
        }
        /* Check the swap header's sub-version */
        if (swap_header->info.version != 1) {
-               printk(KERN_WARNING
-                      "Unable to handle swap header version %d\n",
-                      swap_header->info.version);
+               pr_warn("Unable to handle swap header version %d\n",
+                       swap_header->info.version);
                return 0;
        }
 
@@ -1968,8 +2216,14 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
         */
        maxpages = swp_offset(pte_to_swp_entry(
                        swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
-       if (maxpages > swap_header->info.last_page) {
-               maxpages = swap_header->info.last_page + 1;
+       last_page = swap_header->info.last_page;
+       if (last_page > maxpages) {
+               pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
+                       maxpages << (PAGE_SHIFT - 10),
+                       last_page << (PAGE_SHIFT - 10));
+       }
+       if (maxpages > last_page) {
+               maxpages = last_page + 1;
                /* p->max is an unsigned int: don't overflow it */
                if ((unsigned int)maxpages == 0)
                        maxpages = UINT_MAX;
@@ -1980,8 +2234,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
                return 0;
        swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
        if (swapfilepages && maxpages > swapfilepages) {
-               printk(KERN_WARNING
-                      "Swap area shorter than signature indicates\n");
+               pr_warn("Swap area shorter than signature indicates\n");
                return 0;
        }
        if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
@@ -1995,15 +2248,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
 static int setup_swap_map_and_extents(struct swap_info_struct *p,
                                        union swap_header *swap_header,
                                        unsigned char *swap_map,
+                                       struct swap_cluster_info *cluster_info,
                                        unsigned long maxpages,
                                        sector_t *span)
 {
        int i;
        unsigned int nr_good_pages;
        int nr_extents;
+       unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+       unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
 
        nr_good_pages = maxpages - 1;   /* omit header page */
 
+       cluster_set_null(&p->free_cluster_head);
+       cluster_set_null(&p->free_cluster_tail);
+       cluster_set_null(&p->discard_cluster_head);
+       cluster_set_null(&p->discard_cluster_tail);
+
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
                unsigned int page_nr = swap_header->info.badpages[i];
                if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -2011,11 +2272,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
                if (page_nr < maxpages) {
                        swap_map[page_nr] = SWAP_MAP_BAD;
                        nr_good_pages--;
+                       /*
+                        * Haven't marked the cluster free yet, no list
+                        * operation involved
+                        */
+                       inc_cluster_info_page(p, cluster_info, page_nr);
                }
        }
 
+       /* Haven't marked the cluster free yet, no list operation involved */
+       for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+               inc_cluster_info_page(p, cluster_info, i);
+
        if (nr_good_pages) {
                swap_map[0] = SWAP_MAP_BAD;
+               /*
+                * Haven't marked the cluster free yet, no list
+                * operation involved
+                */
+               inc_cluster_info_page(p, cluster_info, 0);
                p->max = maxpages;
                p->pages = nr_good_pages;
                nr_extents = setup_swap_extents(p, span);
@@ -2024,10 +2299,34 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
                nr_good_pages = p->pages;
        }
        if (!nr_good_pages) {
-               printk(KERN_WARNING "Empty swap-file\n");
+               pr_warn("Empty swap-file\n");
                return -EINVAL;
        }
 
+       if (!cluster_info)
+               return nr_extents;
+
+       for (i = 0; i < nr_clusters; i++) {
+               if (!cluster_count(&cluster_info[idx])) {
+                       cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
+                       if (cluster_is_null(&p->free_cluster_head)) {
+                               cluster_set_next_flag(&p->free_cluster_head,
+                                                               idx, 0);
+                               cluster_set_next_flag(&p->free_cluster_tail,
+                                                               idx, 0);
+                       } else {
+                               unsigned int tail;
+
+                               tail = cluster_next(&p->free_cluster_tail);
+                               cluster_set_next(&cluster_info[tail], idx);
+                               cluster_set_next_flag(&p->free_cluster_tail,
+                                                               idx, 0);
+                       }
+               }
+               idx++;
+               if (idx == nr_clusters)
+                       idx = 0;
+       }
        return nr_extents;
 }
 
@@ -2059,6 +2358,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        sector_t span;
        unsigned long maxpages;
        unsigned char *swap_map = NULL;
+       struct swap_cluster_info *cluster_info = NULL;
        unsigned long *frontswap_map = NULL;
        struct page *page = NULL;
        struct inode *inode = NULL;
@@ -2073,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        if (IS_ERR(p))
                return PTR_ERR(p);
 
+       INIT_WORK(&p->discard_work, swap_discard_work);
+
        name = getname(specialfile);
        if (IS_ERR(name)) {
                error = PTR_ERR(name);
@@ -2132,13 +2434,38 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                error = -ENOMEM;
                goto bad_swap;
        }
+       if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+               p->flags |= SWP_SOLIDSTATE;
+               /*
+                * Select a random position to start from, to help SSD wear
+                * leveling.
+                */
+               p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+
+               cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
+                       SWAPFILE_CLUSTER) * sizeof(*cluster_info));
+               if (!cluster_info) {
+                       error = -ENOMEM;
+                       goto bad_swap;
+               }
+               p->percpu_cluster = alloc_percpu(struct percpu_cluster);
+               if (!p->percpu_cluster) {
+                       error = -ENOMEM;
+                       goto bad_swap;
+               }
+               for_each_possible_cpu(i) {
+                       struct percpu_cluster *cluster;
+                       cluster = per_cpu_ptr(p->percpu_cluster, i);
+                       cluster_set_null(&cluster->index);
+               }
+       }
 
        error = swap_cgroup_swapon(p->type, maxpages);
        if (error)
                goto bad_swap;
 
        nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
-               maxpages, &span);
+               cluster_info, maxpages, &span);
        if (unlikely(nr_extents < 0)) {
                error = nr_extents;
                goto bad_swap;
@@ -2147,41 +2474,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        if (frontswap_enabled)
                frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
 
-       if (p->bdev) {
-               if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
-                       p->flags |= SWP_SOLIDSTATE;
-                       p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
-               }
-
-               if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
-                       /*
-                        * When discard is enabled for swap with no particular
-                        * policy flagged, we set all swap discard flags here in
-                        * order to sustain backward compatibility with older
-                        * swapon(8) releases.
-                        */
-                       p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
-                                    SWP_PAGE_DISCARD);
+       if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+               /*
+                * When discard is enabled for swap with no particular
+                * policy flagged, we set all swap discard flags here in
+                * order to sustain backward compatibility with older
+                * swapon(8) releases.
+                */
+               p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
+                            SWP_PAGE_DISCARD);
 
-                       /*
-                        * By flagging sys_swapon, a sysadmin can tell us to
-                        * either do single-time area discards only, or to just
-                        * perform discards for released swap page-clusters.
-                        * Now it's time to adjust the p->flags accordingly.
-                        */
-                       if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
-                               p->flags &= ~SWP_PAGE_DISCARD;
-                       else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
-                               p->flags &= ~SWP_AREA_DISCARD;
-
-                       /* issue a swapon-time discard if it's still required */
-                       if (p->flags & SWP_AREA_DISCARD) {
-                               int err = discard_swap(p);
-                               if (unlikely(err))
-                                       printk(KERN_ERR
-                                              "swapon: discard_swap(%p): %d\n",
-                                               p, err);
-                       }
+               /*
+                * By flagging sys_swapon, a sysadmin can tell us to
+                * either do single-time area discards only, or to just
+                * perform discards for released swap page-clusters.
+                * Now it's time to adjust the p->flags accordingly.
+                */
+               if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
+                       p->flags &= ~SWP_PAGE_DISCARD;
+               else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
+                       p->flags &= ~SWP_AREA_DISCARD;
+
+               /* issue a swapon-time discard if it's still required */
+               if (p->flags & SWP_AREA_DISCARD) {
+                       int err = discard_swap(p);
+                       if (unlikely(err))
+                               pr_err("swapon: discard_swap(%p): %d\n",
+                                       p, err);
                }
        }
 
@@ -2190,9 +2509,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        if (swap_flags & SWAP_FLAG_PREFER)
                prio =
                  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
-       enable_swap_info(p, prio, swap_map, frontswap_map);
+       enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
 
-       printk(KERN_INFO "Adding %uk swap on %s.  "
+       pr_info("Adding %uk swap on %s.  "
                        "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
                p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
                nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
@@ -2211,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        error = 0;
        goto out;
 bad_swap:
+       free_percpu(p->percpu_cluster);
+       p->percpu_cluster = NULL;
        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
                set_blocksize(p->bdev, p->old_block_size);
                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -2222,6 +2543,7 @@ bad_swap:
        p->flags = 0;
        spin_unlock(&swap_lock);
        vfree(swap_map);
+       vfree(cluster_info);
        if (swap_file) {
                if (inode && S_ISREG(inode->i_mode)) {
                        mutex_unlock(&inode->i_mutex);
@@ -2291,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
                goto unlock_out;
 
        count = p->swap_map[offset];
+
+       /*
+        * swapin_readahead() doesn't check if a swap entry is valid, so the
+        * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+        */
+       if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+               err = -ENOENT;
+               goto unlock_out;
+       }
+
        has_cache = count & SWAP_HAS_CACHE;
        count &= ~SWAP_HAS_CACHE;
        err = 0;
@@ -2326,7 +2658,7 @@ out:
        return err;
 
 bad_file:
-       printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
+       pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
        goto out;
 }