Merge tag 'dm-4.9-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 10 Oct 2016 00:16:18 +0000 (17:16 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 10 Oct 2016 00:16:18 +0000 (17:16 -0700)
Pull device mapper updates from Mike Snitzer:

 - various fixes and cleanups for request-based DM core

 - add support for delaying the requeue of requests, used by DM
   multipath when all paths have failed and 'queue_if_no_path' is
   enabled

 - DM cache improvements to speed up the loading of metadata and the
   writing of the hint array

 - fix potential for a dm-crypt crash on device teardown

 - remove dm_bufio_cond_resched() and just use cond_resched()

 - change DM multipath to return a reservation conflict error
   immediately, rather than failing the path and retrying (potentially
   indefinitely)

* tag 'dm-4.9-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (24 commits)
  dm mpath: always return reservation conflict without failing over
  dm bufio: remove dm_bufio_cond_resched()
  dm crypt: fix crash on exit
  dm cache metadata: switch to using the new cursor api for loading metadata
  dm array: introduce cursor api
  dm btree: introduce cursor api
  dm cache policy smq: distribute entries to random levels when switching to smq
  dm cache: speed up writing of the hint array
  dm array: add dm_array_new()
  dm mpath: delay the requeue of blk-mq requests while all paths down
  dm mpath: use dm_mq_kick_requeue_list()
  dm rq: introduce dm_mq_kick_requeue_list()
  dm rq: reduce arguments passed to map_request() and dm_requeue_original_request()
  dm rq: add DM_MAPIO_DELAY_REQUEUE to delay requeue of blk-mq requests
  dm: convert wait loops to use autoremove_wake_function()
  dm: use signal_pending_state() in dm_wait_for_completion()
  dm: rename task state function arguments
  dm: add two lockdep_assert_held() statements
  dm rq: simplify dm_old_stop_queue()
  dm mpath: check if path's request_queue is dying in activate_path()
  ...

16 files changed:
drivers/md/dm-bufio.c
drivers/md/dm-cache-metadata.c
drivers/md/dm-cache-policy-cleaner.c
drivers/md/dm-cache-policy-internal.h
drivers/md/dm-cache-policy-smq.c
drivers/md/dm-cache-policy.h
drivers/md/dm-crypt.c
drivers/md/dm-mpath.c
drivers/md/dm-rq.c
drivers/md/dm-rq.h
drivers/md/dm.c
drivers/md/persistent-data/dm-array.c
drivers/md/persistent-data/dm-array.h
drivers/md/persistent-data/dm-btree.c
drivers/md/persistent-data/dm-btree.h
include/linux/device-mapper.h

drivers/md/dm-bufio.c
index 8625040..125aedc 100644
@@ -191,19 +191,6 @@ static void dm_bufio_unlock(struct dm_bufio_client *c)
        mutex_unlock(&c->lock);
 }
 
-/*
- * FIXME Move to sched.h?
- */
-#ifdef CONFIG_PREEMPT_VOLUNTARY
-#  define dm_bufio_cond_resched()              \
-do {                                           \
-       if (unlikely(need_resched()))           \
-               _cond_resched();                \
-} while (0)
-#else
-#  define dm_bufio_cond_resched()                do { } while (0)
-#endif
-
 /*----------------------------------------------------------------*/
 
 /*
@@ -741,7 +728,7 @@ static void __flush_write_list(struct list_head *write_list)
                        list_entry(write_list->next, struct dm_buffer, write_list);
                list_del(&b->write_list);
                submit_io(b, WRITE, b->block, write_endio);
-               dm_bufio_cond_resched();
+               cond_resched();
        }
        blk_finish_plug(&plug);
 }
@@ -780,7 +767,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
                        __unlink_buffer(b);
                        return b;
                }
-               dm_bufio_cond_resched();
+               cond_resched();
        }
 
        list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
@@ -791,7 +778,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
                        __unlink_buffer(b);
                        return b;
                }
-               dm_bufio_cond_resched();
+               cond_resched();
        }
 
        return NULL;
@@ -923,7 +910,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
                        return;
 
                __write_dirty_buffer(b, write_list);
-               dm_bufio_cond_resched();
+               cond_resched();
        }
 }
 
@@ -973,7 +960,7 @@ static void __check_watermark(struct dm_bufio_client *c,
                        return;
 
                __free_buffer_wake(b);
-               dm_bufio_cond_resched();
+               cond_resched();
        }
 
        if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
@@ -1170,7 +1157,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c,
                                submit_io(b, READ, b->block, read_endio);
                        dm_bufio_release(b);
 
-                       dm_bufio_cond_resched();
+                       cond_resched();
 
                        if (!n_blocks)
                                goto flush_plug;
@@ -1291,7 +1278,7 @@ again:
                    !test_bit(B_WRITING, &b->state))
                        __relink_lru(b, LIST_CLEAN);
 
-               dm_bufio_cond_resched();
+               cond_resched();
 
                /*
                 * If we dropped the lock, the list is no longer consistent,
@@ -1574,7 +1561,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
                                freed++;
                        if (!--nr_to_scan || ((count - freed) <= retain_target))
                                return freed;
-                       dm_bufio_cond_resched();
+                       cond_resched();
                }
        }
        return freed;
@@ -1808,7 +1795,7 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
                if (__try_evict_buffer(b, 0))
                        count--;
 
-               dm_bufio_cond_resched();
+               cond_resched();
        }
 
        dm_bufio_unlock(c);
drivers/md/dm-cache-metadata.c
index 3970cda..6955778 100644
@@ -140,6 +140,13 @@ struct dm_cache_metadata {
         * the device.
         */
        bool fail_io:1;
+
+       /*
+        * These structures are used when loading metadata.  They're too
+        * big to put on the stack.
+        */
+       struct dm_array_cursor mapping_cursor;
+       struct dm_array_cursor hint_cursor;
 };
 
 /*-------------------------------------------------------------------
@@ -1171,31 +1178,37 @@ static bool hints_array_available(struct dm_cache_metadata *cmd,
                hints_array_initialized(cmd);
 }
 
-static int __load_mapping(void *context, uint64_t cblock, void *leaf)
+static int __load_mapping(struct dm_cache_metadata *cmd,
+                         uint64_t cb, bool hints_valid,
+                         struct dm_array_cursor *mapping_cursor,
+                         struct dm_array_cursor *hint_cursor,
+                         load_mapping_fn fn, void *context)
 {
        int r = 0;
-       bool dirty;
-       __le64 value;
-       __le32 hint_value = 0;
+
+       __le64 mapping;
+       __le32 hint = 0;
+
+       __le64 *mapping_value_le;
+       __le32 *hint_value_le;
+
        dm_oblock_t oblock;
        unsigned flags;
-       struct thunk *thunk = context;
-       struct dm_cache_metadata *cmd = thunk->cmd;
 
-       memcpy(&value, leaf, sizeof(value));
-       unpack_value(value, &oblock, &flags);
+       dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le);
+       memcpy(&mapping, mapping_value_le, sizeof(mapping));
+       unpack_value(mapping, &oblock, &flags);
 
        if (flags & M_VALID) {
-               if (thunk->hints_valid) {
-                       r = dm_array_get_value(&cmd->hint_info, cmd->hint_root,
-                                              cblock, &hint_value);
-                       if (r && r != -ENODATA)
-                               return r;
+               if (hints_valid) {
+                       dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le);
+                       memcpy(&hint, hint_value_le, sizeof(hint));
                }
 
-               dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true;
-               r = thunk->fn(thunk->context, oblock, to_cblock(cblock),
-                             dirty, le32_to_cpu(hint_value), thunk->hints_valid);
+               r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY,
+                      le32_to_cpu(hint), hints_valid);
+               if (r)
+                       DMERR("policy couldn't load cblock");
        }
 
        return r;
@@ -1205,16 +1218,60 @@ static int __load_mappings(struct dm_cache_metadata *cmd,
                           struct dm_cache_policy *policy,
                           load_mapping_fn fn, void *context)
 {
-       struct thunk thunk;
+       int r;
+       uint64_t cb;
+
+       bool hints_valid = hints_array_available(cmd, policy);
+
+       if (from_cblock(cmd->cache_blocks) == 0)
+               /* Nothing to do */
+               return 0;
+
+       r = dm_array_cursor_begin(&cmd->info, cmd->root, &cmd->mapping_cursor);
+       if (r)
+               return r;
 
-       thunk.fn = fn;
-       thunk.context = context;
+       if (hints_valid) {
+               r = dm_array_cursor_begin(&cmd->hint_info, cmd->hint_root, &cmd->hint_cursor);
+               if (r) {
+                       dm_array_cursor_end(&cmd->mapping_cursor);
+                       return r;
+               }
+       }
+
+       for (cb = 0; ; cb++) {
+               r = __load_mapping(cmd, cb, hints_valid,
+                                  &cmd->mapping_cursor, &cmd->hint_cursor,
+                                  fn, context);
+               if (r)
+                       goto out;
+
+               /*
+                * We need to break out before we move the cursors.
+                */
+               if (cb >= (from_cblock(cmd->cache_blocks) - 1))
+                       break;
 
-       thunk.cmd = cmd;
-       thunk.respect_dirty_flags = cmd->clean_when_opened;
-       thunk.hints_valid = hints_array_available(cmd, policy);
+               r = dm_array_cursor_next(&cmd->mapping_cursor);
+               if (r) {
+                       DMERR("dm_array_cursor_next for mapping failed");
+                       goto out;
+               }
 
-       return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk);
+               if (hints_valid) {
+                       r = dm_array_cursor_next(&cmd->hint_cursor);
+                       if (r) {
+                               DMERR("dm_array_cursor_next for hint failed");
+                               goto out;
+                       }
+               }
+       }
+out:
+       dm_array_cursor_end(&cmd->mapping_cursor);
+       if (hints_valid)
+               dm_array_cursor_end(&cmd->hint_cursor);
+
+       return r;
 }
 
 int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
@@ -1368,10 +1425,24 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd,
 
 /*----------------------------------------------------------------*/
 
-static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+static int get_hint(uint32_t index, void *value_le, void *context)
+{
+       uint32_t value;
+       struct dm_cache_policy *policy = context;
+
+       value = policy_get_hint(policy, to_cblock(index));
+       *((__le32 *) value_le) = cpu_to_le32(value);
+
+       return 0;
+}
+
+/*
+ * It's quicker to always delete the hint array, and recreate with
+ * dm_array_new().
+ */
+static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
 {
        int r;
-       __le32 value;
        size_t hint_size;
        const char *policy_name = dm_cache_policy_get_name(policy);
        const unsigned *policy_version = dm_cache_policy_get_version(policy);
@@ -1380,63 +1451,23 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
            (strlen(policy_name) > sizeof(cmd->policy_name) - 1))
                return -EINVAL;
 
-       if (!policy_unchanged(cmd, policy)) {
-               strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
-               memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
-
-               hint_size = dm_cache_policy_get_hint_size(policy);
-               if (!hint_size)
-                       return 0; /* short-circuit hints initialization */
-               cmd->policy_hint_size = hint_size;
+       strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name));
+       memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version));
 
-               if (cmd->hint_root) {
-                       r = dm_array_del(&cmd->hint_info, cmd->hint_root);
-                       if (r)
-                               return r;
-               }
+       hint_size = dm_cache_policy_get_hint_size(policy);
+       if (!hint_size)
+               return 0; /* short-circuit hints initialization */
+       cmd->policy_hint_size = hint_size;
 
-               r = dm_array_empty(&cmd->hint_info, &cmd->hint_root);
+       if (cmd->hint_root) {
+               r = dm_array_del(&cmd->hint_info, cmd->hint_root);
                if (r)
                        return r;
-
-               value = cpu_to_le32(0);
-               __dm_bless_for_disk(&value);
-               r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0,
-                                   from_cblock(cmd->cache_blocks),
-                                   &value, &cmd->hint_root);
-               if (r)
-                       return r;
-       }
-
-       return 0;
-}
-
-static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint)
-{
-       struct dm_cache_metadata *cmd = context;
-       __le32 value = cpu_to_le32(hint);
-       int r;
-
-       __dm_bless_for_disk(&value);
-
-       r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
-                              from_cblock(cblock), &value, &cmd->hint_root);
-       cmd->changed = true;
-
-       return r;
-}
-
-static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
-{
-       int r;
-
-       r = begin_hints(cmd, policy);
-       if (r) {
-               DMERR("begin_hints failed");
-               return r;
        }
 
-       return policy_walk_mappings(policy, save_hint, cmd);
+       return dm_array_new(&cmd->hint_info, &cmd->hint_root,
+                           from_cblock(cmd->cache_blocks),
+                           get_hint, policy);
 }
 
 int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
drivers/md/dm-cache-policy-cleaner.c
index 14aaaf0..2e8a8f1 100644
@@ -395,7 +395,7 @@ static void init_policy_functions(struct policy *p)
        p->policy.set_dirty = wb_set_dirty;
        p->policy.clear_dirty = wb_clear_dirty;
        p->policy.load_mapping = wb_load_mapping;
-       p->policy.walk_mappings = NULL;
+       p->policy.get_hint = NULL;
        p->policy.remove_mapping = wb_remove_mapping;
        p->policy.writeback_work = wb_writeback_work;
        p->policy.force_mapping = wb_force_mapping;
drivers/md/dm-cache-policy-internal.h
index 2816018..808ee0e 100644
@@ -48,10 +48,10 @@ static inline int policy_load_mapping(struct dm_cache_policy *p,
        return p->load_mapping(p, oblock, cblock, hint, hint_valid);
 }
 
-static inline int policy_walk_mappings(struct dm_cache_policy *p,
-                                     policy_walk_fn fn, void *context)
+static inline uint32_t policy_get_hint(struct dm_cache_policy *p,
+                                      dm_cblock_t cblock)
 {
-       return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0;
+       return p->get_hint ? p->get_hint(p, cblock) : 0;
 }
 
 static inline int policy_writeback_work(struct dm_cache_policy *p,
drivers/md/dm-cache-policy-smq.c
index cf48a61..c33f4a6 100644
@@ -1359,6 +1359,11 @@ static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
        spin_unlock_irqrestore(&mq->lock, flags);
 }
 
+static unsigned random_level(dm_cblock_t cblock)
+{
+       return hash_32_generic(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
+}
+
 static int smq_load_mapping(struct dm_cache_policy *p,
                            dm_oblock_t oblock, dm_cblock_t cblock,
                            uint32_t hint, bool hint_valid)
@@ -1369,47 +1374,21 @@ static int smq_load_mapping(struct dm_cache_policy *p,
        e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
        e->oblock = oblock;
        e->dirty = false;       /* this gets corrected in a minute */
-       e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
+       e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
        push(mq, e);
 
        return 0;
 }
 
-static int smq_save_hints(struct smq_policy *mq, struct queue *q,
-                         policy_walk_fn fn, void *context)
-{
-       int r;
-       unsigned level;
-       struct entry *e;
-
-       for (level = 0; level < q->nr_levels; level++)
-               for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
-                       if (!e->sentinel) {
-                               r = fn(context, infer_cblock(mq, e),
-                                      e->oblock, e->level);
-                               if (r)
-                                       return r;
-                       }
-               }
-
-       return 0;
-}
-
-static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
-                            void *context)
+static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
 {
        struct smq_policy *mq = to_smq_policy(p);
-       int r = 0;
+       struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
 
-       /*
-        * We don't need to lock here since this method is only called once
-        * the IO has stopped.
-        */
-       r = smq_save_hints(mq, &mq->clean, fn, context);
-       if (!r)
-               r = smq_save_hints(mq, &mq->dirty, fn, context);
+       if (!e->allocated)
+               return 0;
 
-       return r;
+       return e->level;
 }
 
 static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
@@ -1616,7 +1595,7 @@ static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
        mq->policy.set_dirty = smq_set_dirty;
        mq->policy.clear_dirty = smq_clear_dirty;
        mq->policy.load_mapping = smq_load_mapping;
-       mq->policy.walk_mappings = smq_walk_mappings;
+       mq->policy.get_hint = smq_get_hint;
        mq->policy.remove_mapping = smq_remove_mapping;
        mq->policy.remove_cblock = smq_remove_cblock;
        mq->policy.writeback_work = smq_writeback_work;
drivers/md/dm-cache-policy.h
index 05db56e..aa10b14 100644
@@ -90,9 +90,6 @@ struct policy_result {
        dm_cblock_t cblock;     /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */
 };
 
-typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock,
-                             dm_oblock_t oblock, uint32_t hint);
-
 /*
  * The cache policy object.  Just a bunch of methods.  It is envisaged that
  * this structure will be embedded in a bigger, policy specific structure
@@ -158,8 +155,11 @@ struct dm_cache_policy {
        int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock,
                            dm_cblock_t cblock, uint32_t hint, bool hint_valid);
 
-       int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn,
-                            void *context);
+       /*
+        * Gets the hint for a given cblock.  Called in a single threaded
+        * context.  So no locking required.
+        */
+       uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock);
 
        /*
         * Override functions used on the error paths of the core target.
drivers/md/dm-crypt.c
index 0448e7e..a276883 100644
@@ -113,8 +113,7 @@ struct iv_tcw_private {
  * and encrypts / decrypts at the same time.
  */
 enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID,
-            DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD,
-            DM_CRYPT_EXIT_THREAD};
+            DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD };
 
 /*
  * The fields in here must be read only after initialization.
@@ -1207,18 +1206,20 @@ continue_locked:
                if (!RB_EMPTY_ROOT(&cc->write_tree))
                        goto pop_from_list;
 
-               if (unlikely(test_bit(DM_CRYPT_EXIT_THREAD, &cc->flags))) {
-                       spin_unlock_irq(&cc->write_thread_wait.lock);
-                       break;
-               }
-
-               __set_current_state(TASK_INTERRUPTIBLE);
+               set_current_state(TASK_INTERRUPTIBLE);
                __add_wait_queue(&cc->write_thread_wait, &wait);
 
                spin_unlock_irq(&cc->write_thread_wait.lock);
 
+               if (unlikely(kthread_should_stop())) {
+                       set_task_state(current, TASK_RUNNING);
+                       remove_wait_queue(&cc->write_thread_wait, &wait);
+                       break;
+               }
+
                schedule();
 
+               set_task_state(current, TASK_RUNNING);
                spin_lock_irq(&cc->write_thread_wait.lock);
                __remove_wait_queue(&cc->write_thread_wait, &wait);
                goto continue_locked;
@@ -1533,13 +1534,8 @@ static void crypt_dtr(struct dm_target *ti)
        if (!cc)
                return;
 
-       if (cc->write_thread) {
-               spin_lock_irq(&cc->write_thread_wait.lock);
-               set_bit(DM_CRYPT_EXIT_THREAD, &cc->flags);
-               wake_up_locked(&cc->write_thread_wait);
-               spin_unlock_irq(&cc->write_thread_wait.lock);
+       if (cc->write_thread)
                kthread_stop(cc->write_thread);
-       }
 
        if (cc->io_queue)
                destroy_workqueue(cc->io_queue);
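
The write-thread loop patched above is what makes a plain kthread_stop() safe here: the thread marks itself TASK_INTERRUPTIBLE and joins the wait queue before testing kthread_should_stop(), and kthread_stop() sets the should-stop flag before waking the task, so a stop request landing between the check and schedule() still produces a wakeup. A minimal sketch of that ordering in isolation (the worker function below is hypothetical, not part of this patch set):

#include <linux/kthread.h>
#include <linux/sched.h>

static int example_worker(void *data)
{
	while (1) {
		/* Publish the sleeping state first... */
		set_current_state(TASK_INTERRUPTIBLE);

		/*
		 * ...then test the stop condition.  kthread_stop() sets
		 * the flag and then wakes the task, so a stop request
		 * that lands between these two steps cannot be lost.
		 */
		if (kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			break;
		}

		schedule();
		__set_current_state(TASK_RUNNING);

		/* ...drain whatever work queued up... */
	}
	return 0;
}
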
drivers/md/dm-mpath.c
index ac734e5..e477af8 100644
@@ -550,9 +550,9 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
                pgpath = choose_pgpath(m, nr_bytes);
 
        if (!pgpath) {
-               if (!must_push_back_rq(m))
-                       r = -EIO;       /* Failed */
-               return r;
+               if (must_push_back_rq(m))
+                       return DM_MAPIO_DELAY_REQUEUE;
+               return -EIO;    /* Failed */
        } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
                   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
                pg_init_all_paths(m);
@@ -680,9 +680,11 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio)
        return __multipath_map_bio(m, bio, mpio);
 }
 
-static void process_queued_bios_list(struct multipath *m)
+static void process_queued_io_list(struct multipath *m)
 {
-       if (m->queue_mode == DM_TYPE_BIO_BASED)
+       if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED)
+               dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table));
+       else if (m->queue_mode == DM_TYPE_BIO_BASED)
                queue_work(kmultipathd, &m->process_queued_bios);
 }
 
@@ -752,7 +754,7 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path,
 
        if (!queue_if_no_path) {
                dm_table_run_md_queue_async(m->ti->table);
-               process_queued_bios_list(m);
+               process_queued_io_list(m);
        }
 
        return 0;
@@ -1193,21 +1195,17 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
 static void multipath_wait_for_pg_init_completion(struct multipath *m)
 {
-       DECLARE_WAITQUEUE(wait, current);
-
-       add_wait_queue(&m->pg_init_wait, &wait);
+       DEFINE_WAIT(wait);
 
        while (1) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
+               prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE);
 
                if (!atomic_read(&m->pg_init_in_progress))
                        break;
 
                io_schedule();
        }
-       set_current_state(TASK_RUNNING);
-
-       remove_wait_queue(&m->pg_init_wait, &wait);
+       finish_wait(&m->pg_init_wait, &wait);
 }
 
 static void flush_multipath_work(struct multipath *m)
@@ -1308,7 +1306,7 @@ out:
        spin_unlock_irqrestore(&m->lock, flags);
        if (run_queue) {
                dm_table_run_md_queue_async(m->ti->table);
-               process_queued_bios_list(m);
+               process_queued_io_list(m);
        }
 
        return r;
@@ -1506,7 +1504,7 @@ static void pg_init_done(void *data, int errors)
        }
        clear_bit(MPATHF_QUEUE_IO, &m->flags);
 
-       process_queued_bios_list(m);
+       process_queued_io_list(m);
 
        /*
         * Wake up any thread waiting to suspend.
@@ -1521,10 +1519,10 @@ static void activate_path(struct work_struct *work)
 {
        struct pgpath *pgpath =
                container_of(work, struct pgpath, activate_path.work);
+       struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
 
-       if (pgpath->is_active)
-               scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
-                                pg_init_done, pgpath);
+       if (pgpath->is_active && !blk_queue_dying(q))
+               scsi_dh_activate(q, pg_init_done, pgpath);
        else
                pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED);
 }
@@ -1532,6 +1530,14 @@ static void activate_path(struct work_struct *work)
 static int noretry_error(int error)
 {
        switch (error) {
+       case -EBADE:
+               /*
+                * EBADE signals a reservation conflict.
+                * We shouldn't fail the path here as we can communicate with
+                * the target.  We could fail over to the next path, but in
+                * doing so we might cause a ping-pong between paths.
+                * So just return the reservation conflict error.
+                */
        case -EOPNOTSUPP:
        case -EREMOTEIO:
        case -EILSEQ:
@@ -1576,9 +1582,6 @@ static int do_end_io(struct multipath *m, struct request *clone,
                if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
                        if (!must_push_back_rq(m))
                                r = -EIO;
-               } else {
-                       if (error == -EBADE)
-                               r = error;
                }
        }
 
@@ -1627,9 +1630,6 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone,
                        if (!must_push_back_bio(m))
                                return -EIO;
                        return DM_ENDIO_REQUEUE;
-               } else {
-                       if (error == -EBADE)
-                               return error;
                }
        }
 
@@ -1941,7 +1941,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
                if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
                        pg_init_all_paths(m);
                dm_table_run_md_queue_async(m->ti->table);
-               process_queued_bios_list(m);
+               process_queued_io_list(m);
        }
 
        /*
@@ -1994,11 +1994,14 @@ static int multipath_busy(struct dm_target *ti)
        struct priority_group *pg, *next_pg;
        struct pgpath *pgpath;
 
-       /* pg_init in progress or no paths available */
-       if (atomic_read(&m->pg_init_in_progress) ||
-           (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
+       /* pg_init in progress */
+       if (atomic_read(&m->pg_init_in_progress))
                return true;
 
+       /* no paths available, for blk-mq: rely on IO mapping to delay requeue */
+       if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
+               return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED);
+
        /* Guess which priority_group will be used at next mapping time */
        pg = lockless_dereference(m->current_pg);
        next_pg = lockless_dereference(m->next_pg);
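
The multipath_wait_for_pg_init_completion() change above (and the matching conversion of dm_wait_for_completion() in dm.c below) replaces open-coded waitqueue handling with the standard prepare_to_wait()/finish_wait() idiom; DEFINE_WAIT() sets up an entry that wakes via autoremove_wake_function(), so the waker unhooks it from the queue automatically. A condensed sketch of the general pattern, with a hypothetical condition callback:

#include <linux/sched.h>
#include <linux/wait.h>

static void wait_until(wait_queue_head_t *wq, bool (*done)(void *), void *arg)
{
	DEFINE_WAIT(wait);	/* wakes via autoremove_wake_function() */

	while (1) {
		/* Queue ourselves and set the task state before checking
		 * the condition, so a wake-up is never missed. */
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);

		if (done(arg))
			break;

		io_schedule();
	}
	finish_wait(wq, &wait);	/* sets TASK_RUNNING and dequeues */
}
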
drivers/md/dm-rq.c
index ee48230..182b679 100644
@@ -73,43 +73,60 @@ static void dm_old_start_queue(struct request_queue *q)
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void dm_mq_start_queue(struct request_queue *q)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       queue_flag_clear(QUEUE_FLAG_STOPPED, q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+
+       blk_mq_start_stopped_hw_queues(q, true);
+       blk_mq_kick_requeue_list(q);
+}
+
 void dm_start_queue(struct request_queue *q)
 {
        if (!q->mq_ops)
                dm_old_start_queue(q);
-       else {
-               queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, q);
-               blk_mq_start_stopped_hw_queues(q, true);
-               blk_mq_kick_requeue_list(q);
-       }
+       else
+               dm_mq_start_queue(q);
 }
 
 static void dm_old_stop_queue(struct request_queue *q)
 {
        unsigned long flags;
 
+       spin_lock_irqsave(q->queue_lock, flags);
+       if (!blk_queue_stopped(q))
+               blk_stop_queue(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_mq_stop_queue(struct request_queue *q)
+{
+       unsigned long flags;
+
        spin_lock_irqsave(q->queue_lock, flags);
        if (blk_queue_stopped(q)) {
                spin_unlock_irqrestore(q->queue_lock, flags);
                return;
        }
 
-       blk_stop_queue(q);
+       queue_flag_set(QUEUE_FLAG_STOPPED, q);
        spin_unlock_irqrestore(q->queue_lock, flags);
+
+       /* Prevent requeuing from restarting the queue. */
+       blk_mq_cancel_requeue_work(q);
+       blk_mq_stop_hw_queues(q);
 }
 
 void dm_stop_queue(struct request_queue *q)
 {
        if (!q->mq_ops)
                dm_old_stop_queue(q);
-       else {
-               spin_lock_irq(q->queue_lock);
-               queue_flag_set(QUEUE_FLAG_STOPPED, q);
-               spin_unlock_irq(q->queue_lock);
-
-               blk_mq_cancel_requeue_work(q);
-               blk_mq_stop_hw_queues(q);
-       }
+       else
+               dm_mq_stop_queue(q);
 }
 
 static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
@@ -319,21 +336,32 @@ static void dm_old_requeue_request(struct request *rq)
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_mq_requeue_request(struct request *rq)
+static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs)
 {
-       struct request_queue *q = rq->q;
        unsigned long flags;
 
-       blk_mq_requeue_request(rq);
        spin_lock_irqsave(q->queue_lock, flags);
        if (!blk_queue_stopped(q))
-               blk_mq_kick_requeue_list(q);
+               blk_mq_delay_kick_requeue_list(q, msecs);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_requeue_original_request(struct mapped_device *md,
-                                       struct request *rq)
+void dm_mq_kick_requeue_list(struct mapped_device *md)
+{
+       __dm_mq_kick_requeue_list(dm_get_md_queue(md), 0);
+}
+EXPORT_SYMBOL(dm_mq_kick_requeue_list);
+
+static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs)
+{
+       blk_mq_requeue_request(rq);
+       __dm_mq_kick_requeue_list(rq->q, msecs);
+}
+
+static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_requeue)
 {
+       struct mapped_device *md = tio->md;
+       struct request *rq = tio->orig;
        int rw = rq_data_dir(rq);
 
        rq_end_stats(md, rq);
@@ -342,7 +370,7 @@ static void dm_requeue_original_request(struct mapped_device *md,
        if (!rq->q->mq_ops)
                dm_old_requeue_request(rq);
        else
-               dm_mq_requeue_request(rq);
+               dm_mq_delay_requeue_request(rq, delay_requeue ? 5000 : 0);
 
        rq_completed(md, rw, false);
 }
@@ -372,7 +400,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
                return;
        else if (r == DM_ENDIO_REQUEUE)
                /* The target wants to requeue the I/O */
-               dm_requeue_original_request(tio->md, tio->orig);
+               dm_requeue_original_request(tio, false);
        else {
                DMWARN("unimplemented target endio return value: %d", r);
                BUG();
@@ -612,20 +640,23 @@ static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
 
 /*
  * Returns:
- * 0                : the request has been processed
- * DM_MAPIO_REQUEUE : the original request needs to be requeued
+ * DM_MAPIO_*       : the request has been processed as indicated
+ * DM_MAPIO_REQUEUE : the original request needs to be immediately requeued
  * < 0              : the request was completed due to failure
  */
-static int map_request(struct dm_rq_target_io *tio, struct request *rq,
-                      struct mapped_device *md)
+static int map_request(struct dm_rq_target_io *tio)
 {
        int r;
        struct dm_target *ti = tio->ti;
+       struct mapped_device *md = tio->md;
+       struct request *rq = tio->orig;
        struct request *clone = NULL;
 
        if (tio->clone) {
                clone = tio->clone;
                r = ti->type->map_rq(ti, clone, &tio->info);
+               if (r == DM_MAPIO_DELAY_REQUEUE)
+                       return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */
        } else {
                r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
                if (r < 0) {
@@ -633,9 +664,8 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
                        dm_kill_unmapped_request(rq, r);
                        return r;
                }
-               if (r != DM_MAPIO_REMAPPED)
-                       return r;
-               if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+               if (r == DM_MAPIO_REMAPPED &&
+                   setup_clone(clone, rq, tio, GFP_ATOMIC)) {
                        /* -ENOMEM */
                        ti->type->release_clone_rq(clone);
                        return DM_MAPIO_REQUEUE;
@@ -654,7 +684,10 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
                break;
        case DM_MAPIO_REQUEUE:
                /* The target wants to requeue the I/O */
-               dm_requeue_original_request(md, tio->orig);
+               break;
+       case DM_MAPIO_DELAY_REQUEUE:
+               /* The target wants to requeue the I/O after a delay */
+               dm_requeue_original_request(tio, true);
                break;
        default:
                if (r > 0) {
@@ -664,10 +697,9 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 
                /* The target wants to complete the I/O */
                dm_kill_unmapped_request(rq, r);
-               return r;
        }
 
-       return 0;
+       return r;
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
@@ -706,11 +738,9 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 static void map_tio_request(struct kthread_work *work)
 {
        struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
-       struct request *rq = tio->orig;
-       struct mapped_device *md = tio->md;
 
-       if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-               dm_requeue_original_request(md, rq);
+       if (map_request(tio) == DM_MAPIO_REQUEUE)
+               dm_requeue_original_request(tio, false);
 }
 
 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
@@ -896,7 +926,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
        tio->ti = ti;
 
        /* Direct call is fine since .queue_rq allows allocations */
-       if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+       if (map_request(tio) == DM_MAPIO_REQUEUE) {
                /* Undo dm_start_request() before requeuing */
                rq_end_stats(md, rq);
                rq_completed(md, rq_data_dir(rq), false);
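
With DM_MAPIO_DELAY_REQUEUE wired through map_request(), a request-based target can now ask the core to requeue after a delay (5 seconds on the blk-mq path, per dm_mq_delay_requeue_request() above) instead of immediately. A hedged sketch of how a target's clone-and-map hook might use the new return value; the helper functions are invented for illustration:

static int example_clone_and_map_rq(struct dm_target *ti, struct request *rq,
				    union map_info *map_context,
				    struct request **clone)
{
	/*
	 * Stand-in for the target's "retry later" condition; for
	 * multipath this is "no usable path and queue_if_no_path set".
	 */
	if (example_should_delay_retry(ti))
		return DM_MAPIO_DELAY_REQUEUE;	/* requeue after a delay */

	if (example_prepare_clone(ti, rq, clone))	/* hypothetical helper */
		return DM_MAPIO_REQUEUE;	/* immediate requeue */

	return DM_MAPIO_REMAPPED;
}
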
drivers/md/dm-rq.h
index 9e6f0a3..4da06ca 100644
@@ -55,6 +55,8 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md);
 void dm_start_queue(struct request_queue *q);
 void dm_stop_queue(struct request_queue *q);
 
+void dm_mq_kick_requeue_list(struct mapped_device *md);
+
 unsigned dm_get_reserved_rq_based_ios(void);
 
 ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
drivers/md/dm.c
index fa9b1cb..be35258 100644
@@ -1648,6 +1648,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
        struct request_queue *q = md->queue;
        sector_t size;
 
+       lockdep_assert_held(&md->suspend_lock);
+
        size = dm_table_get_size(t);
 
        /*
@@ -1873,6 +1875,7 @@ EXPORT_SYMBOL_GPL(dm_device_name);
 
 static void __dm_destroy(struct mapped_device *md, bool wait)
 {
+       struct request_queue *q = dm_get_md_queue(md);
        struct dm_table *map;
        int srcu_idx;
 
@@ -1883,6 +1886,10 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
        set_bit(DMF_FREEING, &md->flags);
        spin_unlock(&_minor_lock);
 
+       spin_lock_irq(q->queue_lock);
+       queue_flag_set(QUEUE_FLAG_DYING, q);
+       spin_unlock_irq(q->queue_lock);
+
        if (dm_request_based(md) && md->kworker_task)
                flush_kthread_worker(&md->kworker);
 
@@ -1934,30 +1941,25 @@ void dm_put(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_put);
 
-static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
+static int dm_wait_for_completion(struct mapped_device *md, long task_state)
 {
        int r = 0;
-       DECLARE_WAITQUEUE(wait, current);
-
-       add_wait_queue(&md->wait, &wait);
+       DEFINE_WAIT(wait);
 
        while (1) {
-               set_current_state(interruptible);
+               prepare_to_wait(&md->wait, &wait, task_state);
 
                if (!md_in_flight(md))
                        break;
 
-               if (interruptible == TASK_INTERRUPTIBLE &&
-                   signal_pending(current)) {
+               if (signal_pending_state(task_state, current)) {
                        r = -EINTR;
                        break;
                }
 
                io_schedule();
        }
-       set_current_state(TASK_RUNNING);
-
-       remove_wait_queue(&md->wait, &wait);
+       finish_wait(&md->wait, &wait);
 
        return r;
 }
@@ -2075,6 +2077,10 @@ static void unlock_fs(struct mapped_device *md)
 }
 
 /*
+ * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
+ * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
+ * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
+ *
  * If __dm_suspend returns 0, the device is completely quiescent
  * now. There is no request-processing activity. All new requests
  * are being added to md->deferred list.
@@ -2082,13 +2088,15 @@ static void unlock_fs(struct mapped_device *md)
  * Caller must hold md->suspend_lock
  */
 static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
-                       unsigned suspend_flags, int interruptible,
+                       unsigned suspend_flags, long task_state,
                        int dmf_suspended_flag)
 {
        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
        int r;
 
+       lockdep_assert_held(&md->suspend_lock);
+
        /*
         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
         * This flag is cleared before dm_suspend returns.
@@ -2149,7 +2157,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
         * We call dm_wait_for_completion to wait for all existing requests
         * to finish.
         */
-       r = dm_wait_for_completion(md, interruptible);
+       r = dm_wait_for_completion(md, task_state);
        if (!r)
                set_bit(dmf_suspended_flag, &md->flags);
 
@@ -2249,10 +2257,11 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map)
 
 int dm_resume(struct mapped_device *md)
 {
-       int r = -EINVAL;
+       int r;
        struct dm_table *map = NULL;
 
 retry:
+       r = -EINVAL;
        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
 
        if (!dm_suspended_md(md))
@@ -2276,8 +2285,6 @@ retry:
                goto out;
 
        clear_bit(DMF_SUSPENDED, &md->flags);
-
-       r = 0;
 out:
        mutex_unlock(&md->suspend_lock);
 
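
Passing the task state into dm_wait_for_completion() works because signal_pending_state() collapses the old "interruptible && signal_pending()" test into one call that is a no-op for TASK_UNINTERRUPTIBLE sleeps. For reference, its definition in include/linux/sched.h around this time is roughly:

static inline int signal_pending_state(long state, struct task_struct *p)
{
	if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
		return 0;
	if (!signal_pending(p))
		return 0;

	return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}
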
drivers/md/persistent-data/dm-array.c
index 431a030..e83047c 100644
@@ -277,6 +277,48 @@ static int insert_ablock(struct dm_array_info *info, uint64_t index,
        return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
 }
 
+/*----------------------------------------------------------------*/
+
+static int __shadow_ablock(struct dm_array_info *info, dm_block_t b,
+                          struct dm_block **block, struct array_block **ab)
+{
+       int inc;
+       int r = dm_tm_shadow_block(info->btree_info.tm, b,
+                                  &array_validator, block, &inc);
+       if (r)
+               return r;
+
+       *ab = dm_block_data(*block);
+       if (inc)
+               inc_ablock_entries(info, *ab);
+
+       return 0;
+}
+
+/*
+ * The shadow op will often be a noop.  Only insert if it really
+ * copied data.
+ */
+static int __reinsert_ablock(struct dm_array_info *info, unsigned index,
+                            struct dm_block *block, dm_block_t b,
+                            dm_block_t *root)
+{
+       int r = 0;
+
+       if (dm_block_location(block) != b) {
+               /*
+                * dm_tm_shadow_block will have already decremented the old
+                * block, but it is still referenced by the btree.  We
+                * increment to stop the insert decrementing it below zero
+                * when overwriting the old value.
+                */
+               dm_tm_inc(info->btree_info.tm, b);
+               r = insert_ablock(info, index, block, root);
+       }
+
+       return r;
+}
+
 /*
  * Looks up an array block in the btree.  Then shadows it, and updates the
  * btree to point to this new shadow.  'root' is an input/output parameter
@@ -286,49 +328,21 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
                         unsigned index, struct dm_block **block,
                         struct array_block **ab)
 {
-       int r, inc;
+       int r;
        uint64_t key = index;
        dm_block_t b;
        __le64 block_le;
 
-       /*
-        * lookup
-        */
        r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
        if (r)
                return r;
        b = le64_to_cpu(block_le);
 
-       /*
-        * shadow
-        */
-       r = dm_tm_shadow_block(info->btree_info.tm, b,
-                              &array_validator, block, &inc);
+       r = __shadow_ablock(info, b, block, ab);
        if (r)
                return r;
 
-       *ab = dm_block_data(*block);
-       if (inc)
-               inc_ablock_entries(info, *ab);
-
-       /*
-        * Reinsert.
-        *
-        * The shadow op will often be a noop.  Only insert if it really
-        * copied data.
-        */
-       if (dm_block_location(*block) != b) {
-               /*
-                * dm_tm_shadow_block will have already decremented the old
-                * block, but it is still referenced by the btree.  We
-                * increment to stop the insert decrementing it below zero
-                * when overwriting the old value.
-                */
-               dm_tm_inc(info->btree_info.tm, b);
-               r = insert_ablock(info, index, *block, root);
-       }
-
-       return r;
+       return __reinsert_ablock(info, index, *block, b, root);
 }
 
 /*
@@ -681,6 +695,72 @@ int dm_array_resize(struct dm_array_info *info, dm_block_t root,
 }
 EXPORT_SYMBOL_GPL(dm_array_resize);
 
+static int populate_ablock_with_values(struct dm_array_info *info, struct array_block *ab,
+                                      value_fn fn, void *context, unsigned base, unsigned new_nr)
+{
+       int r;
+       unsigned i;
+       uint32_t nr_entries;
+       struct dm_btree_value_type *vt = &info->value_type;
+
+       BUG_ON(le32_to_cpu(ab->nr_entries));
+       BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
+
+       nr_entries = le32_to_cpu(ab->nr_entries);
+       for (i = 0; i < new_nr; i++) {
+               r = fn(base + i, element_at(info, ab, i), context);
+               if (r)
+                       return r;
+
+               if (vt->inc)
+                       vt->inc(vt->context, element_at(info, ab, i));
+       }
+
+       ab->nr_entries = cpu_to_le32(new_nr);
+       return 0;
+}
+
+int dm_array_new(struct dm_array_info *info, dm_block_t *root,
+                uint32_t size, value_fn fn, void *context)
+{
+       int r;
+       struct dm_block *block;
+       struct array_block *ab;
+       unsigned block_index, end_block, size_of_block, max_entries;
+
+       r = dm_array_empty(info, root);
+       if (r)
+               return r;
+
+       size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
+       max_entries = calc_max_entries(info->value_type.size, size_of_block);
+       end_block = dm_div_up(size, max_entries);
+
+       for (block_index = 0; block_index != end_block; block_index++) {
+               r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
+               if (r)
+                       break;
+
+               r = populate_ablock_with_values(info, ab, fn, context,
+                                               block_index * max_entries,
+                                               min(max_entries, size));
+               if (r) {
+                       unlock_ablock(info, block);
+                       break;
+               }
+
+               r = insert_ablock(info, block_index, block, root);
+               unlock_ablock(info, block);
+               if (r)
+                       break;
+
+               size -= max_entries;
+       }
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(dm_array_new);
+
 int dm_array_del(struct dm_array_info *info, dm_block_t root)
 {
        return dm_btree_del(&info->btree_info, root);
@@ -819,3 +899,89 @@ int dm_array_walk(struct dm_array_info *info, dm_block_t root,
 EXPORT_SYMBOL_GPL(dm_array_walk);
 
 /*----------------------------------------------------------------*/
+
+static int load_ablock(struct dm_array_cursor *c)
+{
+       int r;
+       __le64 value_le;
+       uint64_t key;
+
+       if (c->block)
+               unlock_ablock(c->info, c->block);
+
+       c->block = NULL;
+       c->ab = NULL;
+       c->index = 0;
+
+       r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le);
+       if (r) {
+               DMERR("dm_btree_cursor_get_value failed");
+               dm_btree_cursor_end(&c->cursor);
+
+       } else {
+               r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab);
+               if (r) {
+                       DMERR("get_ablock failed");
+                       dm_btree_cursor_end(&c->cursor);
+               }
+       }
+
+       return r;
+}
+
+int dm_array_cursor_begin(struct dm_array_info *info, dm_block_t root,
+                         struct dm_array_cursor *c)
+{
+       int r;
+
+       memset(c, 0, sizeof(*c));
+       c->info = info;
+       r = dm_btree_cursor_begin(&info->btree_info, root, true, &c->cursor);
+       if (r) {
+               DMERR("couldn't create btree cursor");
+               return r;
+       }
+
+       return load_ablock(c);
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_begin);
+
+void dm_array_cursor_end(struct dm_array_cursor *c)
+{
+       if (c->block) {
+               unlock_ablock(c->info, c->block);
+               dm_btree_cursor_end(&c->cursor);
+       }
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_end);
+
+int dm_array_cursor_next(struct dm_array_cursor *c)
+{
+       int r;
+
+       if (!c->block)
+               return -ENODATA;
+
+       c->index++;
+
+       if (c->index >= le32_to_cpu(c->ab->nr_entries)) {
+               r = dm_btree_cursor_next(&c->cursor);
+               if (r)
+                       return r;
+
+               r = load_ablock(c);
+               if (r)
+                       return r;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_next);
+
+void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le)
+{
+       *value_le = element_at(c->info, c->ab, c->index);
+}
+EXPORT_SYMBOL_GPL(dm_array_cursor_get_value);
+
+/*----------------------------------------------------------------*/
drivers/md/persistent-data/dm-array.h
index ea177d6..27ee49a 100644
@@ -111,6 +111,25 @@ int dm_array_resize(struct dm_array_info *info, dm_block_t root,
                    const void *value, dm_block_t *new_root)
        __dm_written_to_disk(value);
 
+/*
+ * Creates a new array populated with values provided by a callback
+ * function.  This is more efficient than creating an empty array,
+ * resizing, and then setting values since that process incurs a lot of
+ * copying.
+ *
+ * Assumes 32-bit values for now since it's only used by the cache hint
+ * array.
+ *
+ * info - describes the array
+ * root - the root block of the array on disk
+ * size - the number of entries in the array
+ * fn - the callback
+ * context - passed to the callback
+ */
+typedef int (*value_fn)(uint32_t index, void *value_le, void *context);
+int dm_array_new(struct dm_array_info *info, dm_block_t *root,
+                uint32_t size, value_fn fn, void *context);
+
 /*
  * Frees a whole array.  The value_type's decrement operation will be called
  * for all values in the array
@@ -163,4 +182,37 @@ int dm_array_walk(struct dm_array_info *info, dm_block_t root,
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Cursor API.
+ *
+ * This lets you iterate through all the entries in an array efficiently
+ * (it will preload metadata).
+ *
+ * I'm using a cursor, rather than a walk function with a callback because
+ * the cache target needs to iterate both the mapping and hint arrays in
+ * unison.
+ */
+struct dm_array_cursor {
+       struct dm_array_info *info;
+       struct dm_btree_cursor cursor;
+
+       struct dm_block *block;
+       struct array_block *ab;
+       unsigned index;
+};
+
+int dm_array_cursor_begin(struct dm_array_info *info,
+                         dm_block_t root, struct dm_array_cursor *c);
+void dm_array_cursor_end(struct dm_array_cursor *c);
+
+uint32_t dm_array_cursor_index(struct dm_array_cursor *c);
+int dm_array_cursor_next(struct dm_array_cursor *c);
+
+/*
+ * value_le is only valid while the cursor points at the current value.
+ */
+void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le);
+
+/*----------------------------------------------------------------*/
+
 #endif /* _LINUX_DM_ARRAY_H */
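
Taken together, dm_array_new() and the cursor let a client build an on-disk array in one pass and then stream through it without a btree lookup per element. A minimal usage sketch, assuming an already-configured struct dm_array_info for __le32 values (all names below are invented for illustration):

/* value_fn callback for dm_array_new(): each slot just stores its index. */
static int fill_with_index(uint32_t index, void *value_le, void *context)
{
	*(__le32 *) value_le = cpu_to_le32(index);
	return 0;
}

static int build_and_scan(struct dm_array_info *info, dm_block_t *root,
			  uint32_t size)
{
	int r;
	uint32_t i;
	__le32 *value_le;
	struct dm_array_cursor c;	/* big; real users embed it, as dm-cache does */

	if (!size)
		return 0;

	r = dm_array_new(info, root, size, fill_with_index, NULL);
	if (r)
		return r;

	r = dm_array_cursor_begin(info, *root, &c);
	if (r)
		return r;

	for (i = 0; i < size; i++) {
		dm_array_cursor_get_value(&c, (void **) &value_le);
		/* ...consume le32_to_cpu(*value_le)... */

		if (i == size - 1)
			break;	/* stop before stepping past the last entry */

		r = dm_array_cursor_next(&c);
		if (r)
			break;
	}

	dm_array_cursor_end(&c);
	return r;
}
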
drivers/md/persistent-data/dm-btree.c
index 2cc1877..20a4032 100644
@@ -994,3 +994,165 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
        return walk_node(info, root, fn, context);
 }
 EXPORT_SYMBOL_GPL(dm_btree_walk);
+
+/*----------------------------------------------------------------*/
+
+static void prefetch_values(struct dm_btree_cursor *c)
+{
+       unsigned i, nr;
+       __le64 value_le;
+       struct cursor_node *n = c->nodes + c->depth - 1;
+       struct btree_node *bn = dm_block_data(n->b);
+       struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);
+
+       BUG_ON(c->info->value_type.size != sizeof(value_le));
+
+       nr = le32_to_cpu(bn->header.nr_entries);
+       for (i = 0; i < nr; i++) {
+               memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
+               dm_bm_prefetch(bm, le64_to_cpu(value_le));
+       }
+}
+
+static bool leaf_node(struct dm_btree_cursor *c)
+{
+       struct cursor_node *n = c->nodes + c->depth - 1;
+       struct btree_node *bn = dm_block_data(n->b);
+
+       return le32_to_cpu(bn->header.flags) & LEAF_NODE;
+}
+
+static int push_node(struct dm_btree_cursor *c, dm_block_t b)
+{
+       int r;
+       struct cursor_node *n = c->nodes + c->depth;
+
+       if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
+               DMERR("couldn't push cursor node, stack depth too high");
+               return -EINVAL;
+       }
+
+       r = bn_read_lock(c->info, b, &n->b);
+       if (r)
+               return r;
+
+       n->index = 0;
+       c->depth++;
+
+       if (c->prefetch_leaves || !leaf_node(c))
+               prefetch_values(c);
+
+       return 0;
+}
+
+static void pop_node(struct dm_btree_cursor *c)
+{
+       c->depth--;
+       unlock_block(c->info, c->nodes[c->depth].b);
+}
+
+static int inc_or_backtrack(struct dm_btree_cursor *c)
+{
+       struct cursor_node *n;
+       struct btree_node *bn;
+
+       for (;;) {
+               if (!c->depth)
+                       return -ENODATA;
+
+               n = c->nodes + c->depth - 1;
+               bn = dm_block_data(n->b);
+
+               n->index++;
+               if (n->index < le32_to_cpu(bn->header.nr_entries))
+                       break;
+
+               pop_node(c);
+       }
+
+       return 0;
+}
+
+static int find_leaf(struct dm_btree_cursor *c)
+{
+       int r = 0;
+       struct cursor_node *n;
+       struct btree_node *bn;
+       __le64 value_le;
+
+       for (;;) {
+               n = c->nodes + c->depth - 1;
+               bn = dm_block_data(n->b);
+
+               if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
+                       break;
+
+               memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
+               r = push_node(c, le64_to_cpu(value_le));
+               if (r) {
+                       DMERR("push_node failed");
+                       break;
+               }
+       }
+
+       if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
+               return -ENODATA;
+
+       return r;
+}
+
+int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
+                         bool prefetch_leaves, struct dm_btree_cursor *c)
+{
+       int r;
+
+       c->info = info;
+       c->root = root;
+       c->depth = 0;
+       c->prefetch_leaves = prefetch_leaves;
+
+       r = push_node(c, root);
+       if (r)
+               return r;
+
+       return find_leaf(c);
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);
+
+void dm_btree_cursor_end(struct dm_btree_cursor *c)
+{
+       while (c->depth)
+               pop_node(c);
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_end);
+
+int dm_btree_cursor_next(struct dm_btree_cursor *c)
+{
+       int r = inc_or_backtrack(c);
+       if (!r) {
+               r = find_leaf(c);
+               if (r)
+                       DMERR("find_leaf failed");
+       }
+
+       return r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
+
+int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
+{
+       if (c->depth) {
+               struct cursor_node *n = c->nodes + c->depth - 1;
+               struct btree_node *bn = dm_block_data(n->b);
+
+               if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
+                       return -EINVAL;
+
+               *key = le64_to_cpu(*key_ptr(bn, n->index));
+               memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
+               return 0;
+
+       } else
+               return -ENODATA;
+}
+EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
drivers/md/persistent-data/dm-btree.h
index c74301f..db9bd26 100644
@@ -176,4 +176,39 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
                  int (*fn)(void *context, uint64_t *keys, void *leaf),
                  void *context);
 
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Cursor API.  This does not follow the rolling lock convention.  Since we
+ * know the order in which values are required, we can issue prefetches to
+ * speed up iteration.  Use on a single-level btree only.
+ */
+#define DM_BTREE_CURSOR_MAX_DEPTH 16
+
+struct cursor_node {
+       struct dm_block *b;
+       unsigned index;
+};
+
+struct dm_btree_cursor {
+       struct dm_btree_info *info;
+       dm_block_t root;
+
+       bool prefetch_leaves;
+       unsigned depth;
+       struct cursor_node nodes[DM_BTREE_CURSOR_MAX_DEPTH];
+};
+
+/*
+ * Creates a fresh cursor.  If prefetch_leaves is set then it is assumed
+ * the btree contains block indexes that will be prefetched.  The cursor is
+ * quite large, so you probably don't want to put it on the stack.
+ */
+int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
+                         bool prefetch_leaves, struct dm_btree_cursor *c);
+void dm_btree_cursor_end(struct dm_btree_cursor *c);
+int dm_btree_cursor_next(struct dm_btree_cursor *c);
+int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le);
+
 #endif /* _LINUX_DM_BTREE_H */
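
The prefetch_leaves flag exists because leaf values are sometimes themselves block addresses worth reading ahead, which is exactly how dm_array_cursor_begin() uses it (the array's btree leaves hold array-block numbers). A short sketch of driving the btree cursor directly over a single-level btree of 64-bit values (function and variable names are illustrative only):

static int sum_values(struct dm_btree_info *info, dm_block_t root,
		      uint64_t *total)
{
	int r;
	uint64_t key;
	__le64 value_le;
	struct dm_btree_cursor c;	/* large; avoid the stack in real code */

	/* Leaves hold plain values, not block numbers: no leaf prefetch. */
	r = dm_btree_cursor_begin(info, root, false, &c);
	if (r)
		return r;

	*total = 0;
	do {
		r = dm_btree_cursor_get_value(&c, &key, &value_le);
		if (r)
			break;
		*total += le64_to_cpu(value_le);
	} while (!(r = dm_btree_cursor_next(&c)));

	dm_btree_cursor_end(&c);
	return r == -ENODATA ? 0 : r;	/* -ENODATA just means "end of tree" */
}
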
include/linux/device-mapper.h
index 91acfce..ef7962e 100644
@@ -590,6 +590,7 @@ extern struct ratelimit_state dm_ratelimit_state;
 #define DM_MAPIO_SUBMITTED     0
 #define DM_MAPIO_REMAPPED      1
 #define DM_MAPIO_REQUEUE       DM_ENDIO_REQUEUE
+#define DM_MAPIO_DELAY_REQUEUE 3
 
 #define dm_sector_div64(x, y)( \
 { \