Merge branch 'for-3.19/drivers' of git://git.kernel.dk/linux-block
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b1cdf69..4c06585 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,6 +19,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/delay.h>
+#include <linux/wait.h>
 
 #include <trace/events/block.h>
 
@@ -117,6 +118,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_NOFLUSH_SUSPENDING 5
 #define DMF_MERGE_IS_OPTIONAL 6
 #define DMF_DEFERRED_REMOVE 7
+#define DMF_SUSPENDED_INTERNALLY 8
 
 /*
  * A dummy definition to make RCU happy.
@@ -140,7 +142,7 @@ struct mapped_device {
         * Use dm_get_live_table{_fast} or take suspend_lock for
         * dereference.
         */
-       struct dm_table *map;
+       struct dm_table __rcu *map;
 
        struct list_head table_devices;
        struct mutex table_devices_lock;
@@ -525,14 +527,15 @@ retry:
                goto out;
 
        tgt = dm_table_get_target(map, 0);
+       if (!tgt->type->ioctl)
+               goto out;
 
        if (dm_suspended_md(md)) {
                r = -EAGAIN;
                goto out;
        }
 
-       if (tgt->type->ioctl)
-               r = tgt->type->ioctl(tgt, cmd, arg);
+       r = tgt->type->ioctl(tgt, cmd, arg);
 
 out:
        dm_put_live_table(md, srcu_idx);
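
The reordering in the hunk above makes dm_blk_ioctl() bail out for a target with no .ioctl method before the "device is suspended" test, so such devices no longer report -EAGAIN (and get retried) for an ioctl the target will never support. A minimal user-space sketch of the resulting control flow; the names and return values are illustrative, not the kernel API:

```c
#include <errno.h>
#include <stddef.h>

struct model_target {
	int (*ioctl)(struct model_target *tgt, unsigned int cmd, unsigned long arg);
};

/* Mirrors the check order after the patch: unsupported ioctl first,
 * then the suspended test, then the call into the target. */
static int model_blk_ioctl(struct model_target *tgt, int suspended,
			   unsigned int cmd, unsigned long arg)
{
	if (!tgt->ioctl)
		return -ENOTTY;	/* no .ioctl: fail fast, suspended or not */

	if (suspended)
		return -EAGAIN;	/* supported, but the device is quiesced */

	return tgt->ioctl(tgt, cmd, arg);
}

int main(void)
{
	struct model_target no_ioctl = { .ioctl = NULL };

	/* before the reorder this path reported "busy" while suspended */
	return model_blk_ioctl(&no_ioctl, /* suspended */ 1, 0, 0) == -ENOTTY ? 0 : 1;
}
```
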
@@ -1604,9 +1607,9 @@ static int dm_merge_bvec(struct request_queue *q,
         * Find maximum amount of I/O that won't need splitting
         */
        max_sectors = min(max_io_len(bvm->bi_sector, ti),
-                         (sector_t) BIO_MAX_SECTORS);
+                         (sector_t) queue_max_sectors(q));
        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
-       if (max_size < 0)
+       if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
                max_size = 0;
 
        /*
@@ -1618,10 +1621,10 @@ static int dm_merge_bvec(struct request_queue *q,
                max_size = ti->type->merge(ti, bvm, biovec, max_size);
        /*
         * If the target doesn't support merge method and some of the devices
-        * provided their merge_bvec method (we know this by looking at
-        * queue_max_hw_sectors), then we can't allow bios with multiple vector
-        * entries.  So always set max_size to 0, and the code below allows
-        * just one page.
+        * provided their merge_bvec method (we know this by looking for the
+        * max_hw_sectors that dm_set_device_limits may set), then we can't
+        * allow bios with multiple vector entries.  So always set max_size
+        * to 0, and the code below allows just one page.
         */
        else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
                max_size = 0;
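
The dm_merge_bvec() hunks above change two things: the cap now comes from the queue's own max_sectors limit rather than the old BIO_MAX_SECTORS constant, and the comment spells out that a small max_hw_sectors (which dm_set_device_limits() may set when an underlying device provides a merge_bvec_fn) acts as a sentinel restricting bios to a single page. A simplified, self-contained model of that size decision, with made-up parameter names:

```c
#include <stdio.h>

#define SECTOR_SHIFT	9
#define PAGE_SIZE_BYTES	4096

/*
 * How many bytes may still be added to a bio already holding bio_size
 * bytes?  max_io_sectors stands in for max_io_len(); the queue_* values
 * stand in for the limits dm_merge_bvec() reads from the request queue.
 */
static long merge_limit(long max_io_sectors, long queue_max_sectors,
			long queue_max_hw_sectors, long bio_size)
{
	long max_sectors = max_io_sectors < queue_max_sectors ?
			   max_io_sectors : queue_max_sectors;
	long max_size = (max_sectors << SECTOR_SHIFT) - bio_size;

	if (max_size < 0)	/* shouldn't ever happen */
		max_size = 0;

	/*
	 * A max_hw_sectors of one page or less means dm_set_device_limits()
	 * saw a merge_bvec_fn on an underlying device: allow only one page.
	 */
	if (queue_max_hw_sectors <= PAGE_SIZE_BYTES >> SECTOR_SHIFT)
		max_size = 0;

	/* the real code always lets a new bio take its first page;
	 * modelled loosely here */
	if (max_size == 0 && bio_size == 0)
		max_size = PAGE_SIZE_BYTES;

	return max_size;
}

int main(void)
{
	/* sentinel hit: an underlying merge_bvec_fn limits us to one page */
	printf("%ld\n", merge_limit(2048, 1024, 8, 0));
	/* normal case: limited by queue_max_sectors */
	printf("%ld\n", merge_limit(2048, 1024, 255, 512));
	return 0;
}
```
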
@@ -2325,7 +2328,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 
        merge_is_optional = dm_table_merge_is_optional(t);
 
-       old_map = md->map;
+       old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
        rcu_assign_pointer(md->map, t);
        md->immutable_target_type = dm_table_get_immutable_target_type(t);
 
@@ -2334,7 +2337,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
                set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
        else
                clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
-       dm_sync_table(md);
+       if (old_map)
+               dm_sync_table(md);
 
        return old_map;
 }
@@ -2344,7 +2348,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
  */
 static struct dm_table *__unbind(struct mapped_device *md)
 {
-       struct dm_table *map = md->map;
+       struct dm_table *map = rcu_dereference_protected(md->map, 1);
 
        if (!map)
                return NULL;
@@ -2709,36 +2713,18 @@ static void unlock_fs(struct mapped_device *md)
 }
 
 /*
- * We need to be able to change a mapping table under a mounted
- * filesystem.  For example we might want to move some data in
- * the background.  Before the table can be swapped with
- * dm_bind_table, dm_suspend must be called to flush any in
- * flight bios and ensure that any further io gets deferred.
- */
-/*
- * Suspend mechanism in request-based dm.
+ * If __dm_suspend returns 0, the device is completely quiescent
+ * now. There is no request-processing activity. All new requests
+ * are being added to md->deferred list.
  *
- * 1. Flush all I/Os by lock_fs() if needed.
- * 2. Stop dispatching any I/O by stopping the request_queue.
- * 3. Wait for all in-flight I/Os to be completed or requeued.
- *
- * To abort suspend, start the request_queue.
+ * Caller must hold md->suspend_lock
  */
-int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
+                       unsigned suspend_flags, int interruptible)
 {
-       struct dm_table *map = NULL;
-       int r = 0;
-       int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
-       int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
-
-       mutex_lock(&md->suspend_lock);
-
-       if (dm_suspended_md(md)) {
-               r = -EINVAL;
-               goto out_unlock;
-       }
-
-       map = md->map;
+       bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
+       bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
+       int r;
 
        /*
         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
@@ -2747,7 +2733,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        if (noflush)
                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 
-       /* This does not get reverted if there's an error later. */
+       /*
+        * This gets reverted if there's an error later and the targets
+        * provide the .presuspend_undo hook.
+        */
        dm_table_presuspend_targets(map);
 
        /*
@@ -2758,8 +2747,10 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
         */
        if (!noflush && do_lockfs) {
                r = lock_fs(md);
-               if (r)
-                       goto out_unlock;
+               if (r) {
+                       dm_table_presuspend_undo_targets(map);
+                       return r;
+               }
        }
 
        /*
@@ -2775,7 +2766,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
         * flush_workqueue(md->wq).
         */
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-       synchronize_srcu(&md->io_barrier);
+       if (map)
+               synchronize_srcu(&md->io_barrier);
 
        /*
         * Stop md->queue before flushing md->wq in case request-based
@@ -2791,11 +2783,12 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
         * We call dm_wait_for_completion to wait for all existing requests
         * to finish.
         */
-       r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
+       r = dm_wait_for_completion(md, interruptible);
 
        if (noflush)
                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
-       synchronize_srcu(&md->io_barrier);
+       if (map)
+               synchronize_srcu(&md->io_barrier);
 
        /* were we interrupted ? */
        if (r < 0) {
@@ -2805,14 +2798,56 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
                        start_queue(md->queue);
 
                unlock_fs(md);
-               goto out_unlock; /* pushback list is already flushed, so skip flush */
+               dm_table_presuspend_undo_targets(map);
+               /* pushback list is already flushed, so skip flush */
        }
 
-       /*
-        * If dm_wait_for_completion returned 0, the device is completely
-        * quiescent now. There is no request-processing activity. All new
-        * requests are being added to md->deferred list.
-        */
+       return r;
+}
+
+/*
+ * We need to be able to change a mapping table under a mounted
+ * filesystem.  For example we might want to move some data in
+ * the background.  Before the table can be swapped with
+ * dm_bind_table, dm_suspend must be called to flush any in
+ * flight bios and ensure that any further io gets deferred.
+ */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * 1. Flush all I/Os by lock_fs() if needed.
+ * 2. Stop dispatching any I/O by stopping the request_queue.
+ * 3. Wait for all in-flight I/Os to be completed or requeued.
+ *
+ * To abort suspend, start the request_queue.
+ */
+int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
+{
+       struct dm_table *map = NULL;
+       int r = 0;
+
+retry:
+       mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+       if (dm_suspended_md(md)) {
+               r = -EINVAL;
+               goto out_unlock;
+       }
+
+       if (dm_suspended_internally_md(md)) {
+               /* already internally suspended, wait for internal resume */
+               mutex_unlock(&md->suspend_lock);
+               r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+               if (r)
+                       return r;
+               goto retry;
+       }
+
+       map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+       r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
+       if (r)
+               goto out_unlock;
 
        set_bit(DMF_SUSPENDED, &md->flags);
 
@@ -2823,22 +2858,13 @@ out_unlock:
        return r;
 }
 
-int dm_resume(struct mapped_device *md)
+static int __dm_resume(struct mapped_device *md, struct dm_table *map)
 {
-       int r = -EINVAL;
-       struct dm_table *map = NULL;
-
-       mutex_lock(&md->suspend_lock);
-       if (!dm_suspended_md(md))
-               goto out;
-
-       map = md->map;
-       if (!map || !dm_table_get_size(map))
-               goto out;
-
-       r = dm_table_resume_targets(map);
-       if (r)
-               goto out;
+       if (map) {
+               int r = dm_table_resume_targets(map);
+               if (r)
+                       return r;
+       }
 
        dm_queue_flush(md);
 
@@ -2852,6 +2878,37 @@ int dm_resume(struct mapped_device *md)
 
        unlock_fs(md);
 
+       return 0;
+}
+
+int dm_resume(struct mapped_device *md)
+{
+       int r = -EINVAL;
+       struct dm_table *map = NULL;
+
+retry:
+       mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
+
+       if (!dm_suspended_md(md))
+               goto out;
+
+       if (dm_suspended_internally_md(md)) {
+               /* already internally suspended, wait for internal resume */
+               mutex_unlock(&md->suspend_lock);
+               r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
+               if (r)
+                       return r;
+               goto retry;
+       }
+
+       map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+       if (!map || !dm_table_get_size(map))
+               goto out;
+
+       r = __dm_resume(md, map);
+       if (r)
+               goto out;
+
        clear_bit(DMF_SUSPENDED, &md->flags);
 
        r = 0;
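
Both dm_suspend() and dm_resume() above now handle a device that is internally suspended by dropping suspend_lock, sleeping in wait_on_bit() until DMF_SUSPENDED_INTERNALLY clears, and retrying from the top; __dm_internal_resume() (added further down in this patch) clears the bit and issues the matching wake_up_bit(). A user-space sketch of that handshake, with the bit-wait modelled by a condition variable; every identifier here is illustrative, not kernel API:

```c
#include <pthread.h>
#include <stdbool.h>

struct model_md {
	pthread_mutex_t suspend_lock;
	pthread_cond_t  internal_resumed;	/* stands in for wake_up_bit() */
	bool suspended;				/* DMF_SUSPENDED */
	bool suspended_internally;		/* DMF_SUSPENDED_INTERNALLY */
};

/* Userspace-driven suspend, shaped like dm_suspend() after this patch. */
static int model_suspend(struct model_md *md)
{
retry:
	pthread_mutex_lock(&md->suspend_lock);

	if (md->suspended) {
		pthread_mutex_unlock(&md->suspend_lock);
		return -1;			/* -EINVAL in the kernel */
	}

	if (md->suspended_internally) {
		/* already internally suspended: wait for internal resume */
		while (md->suspended_internally)
			pthread_cond_wait(&md->internal_resumed,
					  &md->suspend_lock);
		pthread_mutex_unlock(&md->suspend_lock);
		goto retry;			/* re-check everything */
	}

	/* ... the real __dm_suspend() work would happen here ... */
	md->suspended = true;

	pthread_mutex_unlock(&md->suspend_lock);
	return 0;
}

/* Internal resume, shaped like __dm_internal_resume(): clear the flag,
 * then wake anyone parked in the retry loop above. */
static void model_internal_resume(struct model_md *md)
{
	pthread_mutex_lock(&md->suspend_lock);
	md->suspended_internally = false;
	pthread_cond_broadcast(&md->internal_resumed);
	pthread_mutex_unlock(&md->suspend_lock);
}

int main(void)
{
	static struct model_md md = {
		.suspend_lock = PTHREAD_MUTEX_INITIALIZER,
		.internal_resumed = PTHREAD_COND_INITIALIZER,
	};

	model_internal_resume(&md);	/* nothing is waiting yet; harmless */
	return model_suspend(&md);	/* quiesces the model device, returns 0 */
}
```
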
@@ -2865,15 +2922,80 @@ out:
  * Internal suspend/resume works like userspace-driven suspend. It waits
  * until all bios finish and prevents issuing new bios to the target drivers.
  * It may be used only from the kernel.
- *
- * Internal suspend holds md->suspend_lock, which prevents interaction with
- * userspace-driven suspend.
  */
 
-void dm_internal_suspend(struct mapped_device *md)
+static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
-       mutex_lock(&md->suspend_lock);
+       struct dm_table *map = NULL;
+
+       if (dm_suspended_internally_md(md))
+               return; /* nested internal suspend */
+
+       if (dm_suspended_md(md)) {
+               set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+               return; /* nest suspend */
+       }
+
+       map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+
+       /*
+        * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
+        * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
+        * would require changing .presuspend to return an error -- avoid this
+        * until there is a need for more elaborate variants of internal suspend.
+        */
+       (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
+
+       set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+
+       dm_table_postsuspend_targets(map);
+}
+
+static void __dm_internal_resume(struct mapped_device *md)
+{
+       if (!dm_suspended_internally_md(md))
+               return; /* resume from nested internal suspend */
+
        if (dm_suspended_md(md))
+               goto done; /* resume from nested suspend */
+
+       /*
+        * NOTE: existing callers don't need to call dm_table_resume_targets
+        * (which may fail -- so best to avoid it for now by passing NULL map)
+        */
+       (void) __dm_resume(md, NULL);
+
+done:
+       clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+       smp_mb__after_atomic();
+       wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
+}
+
+void dm_internal_suspend_noflush(struct mapped_device *md)
+{
+       mutex_lock(&md->suspend_lock);
+       __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
+       mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
+
+void dm_internal_resume(struct mapped_device *md)
+{
+       mutex_lock(&md->suspend_lock);
+       __dm_internal_resume(md);
+       mutex_unlock(&md->suspend_lock);
+}
+EXPORT_SYMBOL_GPL(dm_internal_resume);
+
+/*
+ * Fast variants of internal suspend/resume hold md->suspend_lock,
+ * which prevents interaction with userspace-driven suspend.
+ */
+
+void dm_internal_suspend_fast(struct mapped_device *md)
+{
+       mutex_lock(&md->suspend_lock);
+       if (dm_suspended_md(md) || dm_suspended_internally_md(md))
                return;
 
        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
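
The __dm_internal_suspend()/__dm_internal_resume() pair added above encodes a small nesting protocol: a second internal suspend is a no-op, and an internal suspend on top of a userspace suspend only records DMF_SUSPENDED_INTERNALLY without touching the device again; resume mirrors this before clearing the bit and waking waiters. A compact model of just that decision table, with the real quiescing work stubbed out and all names illustrative:

```c
#include <stdbool.h>

struct model_md {
	bool suspended;			/* DMF_SUSPENDED (userspace-driven) */
	bool suspended_internally;	/* DMF_SUSPENDED_INTERNALLY */
};

static void model_internal_suspend(struct model_md *md)
{
	if (md->suspended_internally)
		return;				/* nested internal suspend: no-op */

	if (md->suspended) {
		md->suspended_internally = true;/* already quiet: just nest */
		return;
	}

	/* ... __dm_suspend() + dm_table_postsuspend_targets() here ... */
	md->suspended_internally = true;
}

static void model_internal_resume(struct model_md *md)
{
	if (!md->suspended_internally)
		return;				/* not internally suspended */

	if (!md->suspended) {
		/* ... __dm_resume() here; skipped while userspace still
		 * holds its own suspend ... */
	}

	md->suspended_internally = false;
	/* the kernel follows this with smp_mb__after_atomic() and
	 * wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY) */
}

int main(void)
{
	struct model_md md = { .suspended = true };

	model_internal_suspend(&md);	/* nests on top of the user suspend */
	model_internal_suspend(&md);	/* nested again: no-op */
	model_internal_resume(&md);	/* clears only the internal bit */
	return md.suspended_internally ? 1 : 0;
}
```
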
@@ -2882,9 +3004,9 @@ void dm_internal_suspend(struct mapped_device *md)
        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
 }
 
-void dm_internal_resume(struct mapped_device *md)
+void dm_internal_resume_fast(struct mapped_device *md)
 {
-       if (dm_suspended_md(md))
+       if (dm_suspended_md(md) || dm_suspended_internally_md(md))
                goto done;
 
        dm_queue_flush(md);
@@ -2970,6 +3092,11 @@ int dm_suspended_md(struct mapped_device *md)
        return test_bit(DMF_SUSPENDED, &md->flags);
 }
 
+int dm_suspended_internally_md(struct mapped_device *md)
+{
+       return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
+}
+
 int dm_test_deferred_remove_flag(struct mapped_device *md)
 {
        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);