Merge branch 'stable/for-jens-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Jens Axboe <axboe@fb.com>

Wed, 13 Jan 2016 15:20:36 +0000 (08:20 -0700)

committer Jens Axboe <axboe@fb.com>

Wed, 13 Jan 2016 15:20:36 +0000 (08:20 -0700)
author Jens Axboe <axboe@fb.com>
Wed, 13 Jan 2016 15:20:36 +0000 (08:20 -0700)
committer Jens Axboe <axboe@fb.com>
Wed, 13 Jan 2016 15:20:36 +0000 (08:20 -0700)
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c

index f909994..148930c 100644 (file)
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -83,6 +83,16 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
  MODULE_PARM_DESC(max_persistent_grants,
                   "Maximum number of grants to map persistently");
  
+/*
+ * Maximum number of rings/queues blkback supports per virtual disk. If the
+ * user has not specified a value, allow as many queues as there are CPUs.
+ */
+unsigned int xenblk_max_queues;
+module_param_named(max_queues, xenblk_max_queues, uint, 0644);
+MODULE_PARM_DESC(max_queues,
+                "Maximum number of hardware queues per virtual disk. " \
+                "By default it is the number of online CPUs.");
+
  /*
   * Maximum order of pages to be used for the shared ring between front and
   * backend, 4KB page granularity is used.
@@ -113,71 +123,71 @@ module_param(log_stats, int, 0644);
  /* Number of free pages to remove on each call to gnttab_free_pages */
  #define NUM_BATCH_FREE_PAGES 10
  
-static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
+static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page)
  {
         unsigned long flags;
  
-       spin_lock_irqsave(&blkif->free_pages_lock, flags);
-       if (list_empty(&blkif->free_pages)) {
-               BUG_ON(blkif->free_pages_num != 0);
-               spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+       spin_lock_irqsave(&ring->free_pages_lock, flags);
+       if (list_empty(&ring->free_pages)) {
+               BUG_ON(ring->free_pages_num != 0);
+               spin_unlock_irqrestore(&ring->free_pages_lock, flags);
                 return gnttab_alloc_pages(1, page);
         }
-       BUG_ON(blkif->free_pages_num == 0);
-       page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
+       BUG_ON(ring->free_pages_num == 0);
+       page[0] = list_first_entry(&ring->free_pages, struct page, lru);
         list_del(&page[0]->lru);
-       blkif->free_pages_num--;
-       spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+       ring->free_pages_num--;
+       spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  
         return 0;
  }
  
-static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
+static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page,
                                    int num)
  {
         unsigned long flags;
         int i;
  
-       spin_lock_irqsave(&blkif->free_pages_lock, flags);
+       spin_lock_irqsave(&ring->free_pages_lock, flags);
         for (i = 0; i < num; i++)
-               list_add(&page[i]->lru, &blkif->free_pages);
-       blkif->free_pages_num += num;
-       spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+               list_add(&page[i]->lru, &ring->free_pages);
+       ring->free_pages_num += num;
+       spin_unlock_irqrestore(&ring->free_pages_lock, flags);
  }
  
-static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
+static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num)
  {
         /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
         struct page *page[NUM_BATCH_FREE_PAGES];
         unsigned int num_pages = 0;
         unsigned long flags;
  
-       spin_lock_irqsave(&blkif->free_pages_lock, flags);
-       while (blkif->free_pages_num > num) {
-               BUG_ON(list_empty(&blkif->free_pages));
-               page[num_pages] = list_first_entry(&blkif->free_pages,
+       spin_lock_irqsave(&ring->free_pages_lock, flags);
+       while (ring->free_pages_num > num) {
+               BUG_ON(list_empty(&ring->free_pages));
+               page[num_pages] = list_first_entry(&ring->free_pages,
                                                    struct page, lru);
                 list_del(&page[num_pages]->lru);
-               blkif->free_pages_num--;
+               ring->free_pages_num--;
                 if (++num_pages == NUM_BATCH_FREE_PAGES) {
-                       spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+                       spin_unlock_irqrestore(&ring->free_pages_lock, flags);
                         gnttab_free_pages(num_pages, page);
-                       spin_lock_irqsave(&blkif->free_pages_lock, flags);
+                       spin_lock_irqsave(&ring->free_pages_lock, flags);
                         num_pages = 0;
                 }
         }
-       spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+       spin_unlock_irqrestore(&ring->free_pages_lock, flags);
         if (num_pages != 0)
                 gnttab_free_pages(num_pages, page);
  }
  
  #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
  
-static int do_block_io_op(struct xen_blkif *blkif);
-static int dispatch_rw_block_io(struct xen_blkif *blkif,
+static int do_block_io_op(struct xen_blkif_ring *ring);
+static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
                                 struct blkif_request *req,
                                 struct pending_req *pending_req);
-static void make_response(struct xen_blkif *blkif, u64 id,
+static void make_response(struct xen_blkif_ring *ring, u64 id,
                           unsigned short op, int st);
  
  #define foreach_grant_safe(pos, n, rbtree, node) \
@@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id,
  
  /*
   * We don't need locking around the persistent grant helpers
- * because blkback uses a single-thread for each backed, so we
+ * because blkback uses a single-thread for each backend, so we
   * can be sure that this functions will never be called recursively.
   *
   * The only exception to that is put_persistent_grant, that can be called
@@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
   * bit operations to modify the flags of a persistent grant and to count
   * the number of used grants.
   */
-static int add_persistent_gnt(struct xen_blkif *blkif,
+static int add_persistent_gnt(struct xen_blkif_ring *ring,
                                struct persistent_gnt *persistent_gnt)
  {
         struct rb_node **new = NULL, *parent = NULL;
         struct persistent_gnt *this;
+       struct xen_blkif *blkif = ring->blkif;
  
-       if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
+       if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) {
                 if (!blkif->vbd.overflow_max_grants)
                         blkif->vbd.overflow_max_grants = 1;
                 return -EBUSY;
         }
         /* Figure out where to put new node */
-       new = &blkif->persistent_gnts.rb_node;
+       new = &ring->persistent_gnts.rb_node;
         while (*new) {
                 this = container_of(*new, struct persistent_gnt, node);
  
@@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif,
         set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
         /* Add new node and rebalance tree. */
         rb_link_node(&(persistent_gnt->node), parent, new);
-       rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
-       blkif->persistent_gnt_c++;
-       atomic_inc(&blkif->persistent_gnt_in_use);
+       rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
+       ring->persistent_gnt_c++;
+       atomic_inc(&ring->persistent_gnt_in_use);
         return 0;
  }
  
-static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
                                                  grant_ref_t gref)
  {
         struct persistent_gnt *data;
         struct rb_node *node = NULL;
  
-       node = blkif->persistent_gnts.rb_node;
+       node = ring->persistent_gnts.rb_node;
         while (node) {
                 data = container_of(node, struct persistent_gnt, node);
  
@@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
                                 return NULL;
                         }
                         set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
-                       atomic_inc(&blkif->persistent_gnt_in_use);
+                       atomic_inc(&ring->persistent_gnt_in_use);
                         return data;
                 }
         }
         return NULL;
  }
  
-static void put_persistent_gnt(struct xen_blkif *blkif,
+static void put_persistent_gnt(struct xen_blkif_ring *ring,
                                 struct persistent_gnt *persistent_gnt)
  {
         if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
                 pr_alert_ratelimited("freeing a grant already unused\n");
         set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
         clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
-       atomic_dec(&blkif->persistent_gnt_in_use);
+       atomic_dec(&ring->persistent_gnt_in_use);
  }
  
-static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
+static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
                                   unsigned int num)
  {
         struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
                         unmap_data.count = segs_to_unmap;
                         BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
  
-                       put_free_pages(blkif, pages, segs_to_unmap);
+                       put_free_pages(ring, pages, segs_to_unmap);
                         segs_to_unmap = 0;
                 }
  
@@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
         struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
         struct persistent_gnt *persistent_gnt;
         int segs_to_unmap = 0;
-       struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
+       struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
         struct gntab_unmap_queue_data unmap_data;
  
         unmap_data.pages = pages;
         unmap_data.unmap_ops = unmap;
         unmap_data.kunmap_ops = NULL;
  
-       while(!list_empty(&blkif->persistent_purge_list)) {
-               persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
+       while(!list_empty(&ring->persistent_purge_list)) {
+               persistent_gnt = list_first_entry(&ring->persistent_purge_list,
                                                   struct persistent_gnt,
                                                   remove_node);
                 list_del(&persistent_gnt->remove_node);
@@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
                 if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
                         unmap_data.count = segs_to_unmap;
                         BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
-                       put_free_pages(blkif, pages, segs_to_unmap);
+                       put_free_pages(ring, pages, segs_to_unmap);
                         segs_to_unmap = 0;
                 }
                 kfree(persistent_gnt);
@@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work)
         if (segs_to_unmap > 0) {
                 unmap_data.count = segs_to_unmap;
                 BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
-               put_free_pages(blkif, pages, segs_to_unmap);
+               put_free_pages(ring, pages, segs_to_unmap);
         }
  }
  
-static void purge_persistent_gnt(struct xen_blkif *blkif)
+static void purge_persistent_gnt(struct xen_blkif_ring *ring)
  {
         struct persistent_gnt *persistent_gnt;
         struct rb_node *n;
@@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
         bool scan_used = false, clean_used = false;
         struct rb_root *root;
  
-       if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
-           (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
-           !blkif->vbd.overflow_max_grants)) {
-               return;
+       if (ring->persistent_gnt_c < xen_blkif_max_pgrants ||
+           (ring->persistent_gnt_c == xen_blkif_max_pgrants &&
+           !ring->blkif->vbd.overflow_max_grants)) {
+               goto out;
         }
  
-       if (work_busy(&blkif->persistent_purge_work)) {
+       if (work_busy(&ring->persistent_purge_work)) {
                 pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
-               return;
+               goto out;
         }
  
         num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
-       num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
-       num_clean = min(blkif->persistent_gnt_c, num_clean);
+       num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
+       num_clean = min(ring->persistent_gnt_c, num_clean);
         if ((num_clean == 0) ||
-           (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
-               return;
+           (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use))))
+               goto out;
  
         /*
          * At this point, we can assure that there will be no calls
@@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif)
  
         pr_debug("Going to purge %u persistent grants\n", num_clean);
  
-       BUG_ON(!list_empty(&blkif->persistent_purge_list));
-       root = &blkif->persistent_gnts;
+       BUG_ON(!list_empty(&ring->persistent_purge_list));
+       root = &ring->persistent_gnts;
  purge_list:
         foreach_grant_safe(persistent_gnt, n, root, node) {
                 BUG_ON(persistent_gnt->handle ==
@@ -414,7 +425,7 @@ purge_list:
  
                 rb_erase(&persistent_gnt->node, root);
                 list_add(&persistent_gnt->remove_node,
-                        &blkif->persistent_purge_list);
+                        &ring->persistent_purge_list);
                 if (--num_clean == 0)
                         goto finished;
         }
@@ -435,30 +446,32 @@ finished:
                 goto purge_list;
         }
  
-       blkif->persistent_gnt_c -= (total - num_clean);
-       blkif->vbd.overflow_max_grants = 0;
+       ring->persistent_gnt_c -= (total - num_clean);
+       ring->blkif->vbd.overflow_max_grants = 0;
  
         /* We can defer this work */
-       schedule_work(&blkif->persistent_purge_work);
+       schedule_work(&ring->persistent_purge_work);
         pr_debug("Purged %u/%u\n", (total - num_clean), total);
+
+out:
         return;
  }
  
  /*
   * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
   */
-static struct pending_req *alloc_req(struct xen_blkif *blkif)
+static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
  {
         struct pending_req *req = NULL;
         unsigned long flags;
  
-       spin_lock_irqsave(&blkif->pending_free_lock, flags);
-       if (!list_empty(&blkif->pending_free)) {
-               req = list_entry(blkif->pending_free.next, struct pending_req,
+       spin_lock_irqsave(&ring->pending_free_lock, flags);
+       if (!list_empty(&ring->pending_free)) {
+               req = list_entry(ring->pending_free.next, struct pending_req,
                                  free_list);
                 list_del(&req->free_list);
         }
-       spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
+       spin_unlock_irqrestore(&ring->pending_free_lock, flags);
         return req;
  }
  
@@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif)
   * Return the 'pending_req' structure back to the freepool. We also
   * wake up the thread if it was waiting for a free page.
   */
-static void free_req(struct xen_blkif *blkif, struct pending_req *req)
+static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
  {
         unsigned long flags;
         int was_empty;
  
-       spin_lock_irqsave(&blkif->pending_free_lock, flags);
-       was_empty = list_empty(&blkif->pending_free);
-       list_add(&req->free_list, &blkif->pending_free);
-       spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
+       spin_lock_irqsave(&ring->pending_free_lock, flags);
+       was_empty = list_empty(&ring->pending_free);
+       list_add(&req->free_list, &ring->pending_free);
+       spin_unlock_irqrestore(&ring->pending_free_lock, flags);
         if (was_empty)
-               wake_up(&blkif->pending_free_wq);
+               wake_up(&ring->pending_free_wq);
  }
  
  /*
@@ -556,10 +569,10 @@ abort:
  /*
   * Notification from the guest OS.
   */
-static void blkif_notify_work(struct xen_blkif *blkif)
+static void blkif_notify_work(struct xen_blkif_ring *ring)
  {
-       blkif->waiting_reqs = 1;
-       wake_up(&blkif->wq);
+       ring->waiting_reqs = 1;
+       wake_up(&ring->wq);
  }
  
  irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
@@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
   * SCHEDULER FUNCTIONS
   */
  
-static void print_stats(struct xen_blkif *blkif)
+static void print_stats(struct xen_blkif_ring *ring)
  {
         pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
                  "  |  ds %4llu | pg: %4u/%4d\n",
-                current->comm, blkif->st_oo_req,
-                blkif->st_rd_req, blkif->st_wr_req,
-                blkif->st_f_req, blkif->st_ds_req,
-                blkif->persistent_gnt_c,
+                current->comm, ring->st_oo_req,
+                ring->st_rd_req, ring->st_wr_req,
+                ring->st_f_req, ring->st_ds_req,
+                ring->persistent_gnt_c,
                  xen_blkif_max_pgrants);
-       blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
-       blkif->st_rd_req = 0;
-       blkif->st_wr_req = 0;
-       blkif->st_oo_req = 0;
-       blkif->st_ds_req = 0;
+       ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+       ring->st_rd_req = 0;
+       ring->st_wr_req = 0;
+       ring->st_oo_req = 0;
+       ring->st_ds_req = 0;
  }
  
  int xen_blkif_schedule(void *arg)
  {
-       struct xen_blkif *blkif = arg;
+       struct xen_blkif_ring *ring = arg;
+       struct xen_blkif *blkif = ring->blkif;
         struct xen_vbd *vbd = &blkif->vbd;
         unsigned long timeout;
         int ret;
  
         xen_blkif_get(blkif);
  
+       set_freezable();
         while (!kthread_should_stop()) {
                 if (try_to_freeze())
                         continue;
@@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg)
                 timeout = msecs_to_jiffies(LRU_INTERVAL);
  
                 timeout = wait_event_interruptible_timeout(
-                       blkif->wq,
-                       blkif->waiting_reqs || kthread_should_stop(),
+                       ring->wq,
+                       ring->waiting_reqs || kthread_should_stop(),
                         timeout);
                 if (timeout == 0)
                         goto purge_gnt_list;
                 timeout = wait_event_interruptible_timeout(
-                       blkif->pending_free_wq,
-                       !list_empty(&blkif->pending_free) ||
+                       ring->pending_free_wq,
+                       !list_empty(&ring->pending_free) ||
                         kthread_should_stop(),
                         timeout);
                 if (timeout == 0)
                         goto purge_gnt_list;
  
-               blkif->waiting_reqs = 0;
+               ring->waiting_reqs = 0;
                 smp_mb(); /* clear flag *before* checking for work */
  
-               ret = do_block_io_op(blkif);
+               ret = do_block_io_op(ring);
                 if (ret > 0)
-                       blkif->waiting_reqs = 1;
+                       ring->waiting_reqs = 1;
                 if (ret == -EACCES)
-                       wait_event_interruptible(blkif->shutdown_wq,
+                       wait_event_interruptible(ring->shutdown_wq,
                                                  kthread_should_stop());
  
  purge_gnt_list:
                 if (blkif->vbd.feature_gnt_persistent &&
-                   time_after(jiffies, blkif->next_lru)) {
-                       purge_persistent_gnt(blkif);
-                       blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+                   time_after(jiffies, ring->next_lru)) {
+                       purge_persistent_gnt(ring);
+                       ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
                 }
  
                 /* Shrink if we have more than xen_blkif_max_buffer_pages */
-               shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
+               shrink_free_pagepool(ring, xen_blkif_max_buffer_pages);
  
-               if (log_stats && time_after(jiffies, blkif->st_print))
-                       print_stats(blkif);
+               if (log_stats && time_after(jiffies, ring->st_print))
+                       print_stats(ring);
         }
  
         /* Drain pending purge work */
-       flush_work(&blkif->persistent_purge_work);
+       flush_work(&ring->persistent_purge_work);
  
         if (log_stats)
-               print_stats(blkif);
+               print_stats(ring);
  
-       blkif->xenblkd = NULL;
+       ring->xenblkd = NULL;
         xen_blkif_put(blkif);
  
         return 0;
@@ -658,22 +673,22 @@ purge_gnt_list:
  /*
   * Remove persistent grants and empty the pool of free pages
   */
-void xen_blkbk_free_caches(struct xen_blkif *blkif)
+void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
  {
         /* Free all persistent grant pages */
-       if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
-               free_persistent_gnts(blkif, &blkif->persistent_gnts,
-                       blkif->persistent_gnt_c);
+       if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
+               free_persistent_gnts(ring, &ring->persistent_gnts,
+                       ring->persistent_gnt_c);
  
-       BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
-       blkif->persistent_gnt_c = 0;
+       BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
+       ring->persistent_gnt_c = 0;
  
         /* Since we are shutting down remove all pages from the buffer */
-       shrink_free_pagepool(blkif, 0 /* All */);
+       shrink_free_pagepool(ring, 0 /* All */);
  }
  
  static unsigned int xen_blkbk_unmap_prepare(
-       struct xen_blkif *blkif,
+       struct xen_blkif_ring *ring,
         struct grant_page **pages,
         unsigned int num,
         struct gnttab_unmap_grant_ref *unmap_ops,
@@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare(
  
         for (i = 0; i < num; i++) {
                 if (pages[i]->persistent_gnt != NULL) {
-                       put_persistent_gnt(blkif, pages[i]->persistent_gnt);
+                       put_persistent_gnt(ring, pages[i]->persistent_gnt);
                         continue;
                 }
                 if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
@@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare(
  
  static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
  {
-       struct pending_req* pending_req = (struct pending_req*) (data->data);
-       struct xen_blkif *blkif = pending_req->blkif;
+       struct pending_req *pending_req = (struct pending_req *)(data->data);
+       struct xen_blkif_ring *ring = pending_req->ring;
+       struct xen_blkif *blkif = ring->blkif;
  
         /* BUG_ON used to reproduce existing behaviour,
            but is this the best way to deal with this? */
         BUG_ON(result);
  
-       put_free_pages(blkif, data->pages, data->count);
-       make_response(blkif, pending_req->id,
+       put_free_pages(ring, data->pages, data->count);
+       make_response(ring, pending_req->id,
                       pending_req->operation, pending_req->status);
-       free_req(blkif, pending_req);
+       free_req(ring, pending_req);
         /*
          * Make sure the request is freed before releasing blkif,
          * or there could be a race between free_req and the
@@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
          * pending_free_wq if there's a drain going on, but it has
          * to be taken into account if the current model is changed.
          */
-       if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
+       if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
                 complete(&blkif->drain_complete);
         }
         xen_blkif_put(blkif);
@@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_
  static void xen_blkbk_unmap_and_respond(struct pending_req *req)
  {
         struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
-       struct xen_blkif *blkif = req->blkif;
+       struct xen_blkif_ring *ring = req->ring;
         struct grant_page **pages = req->segments;
         unsigned int invcount;
  
-       invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs,
+       invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
                                            req->unmap, req->unmap_pages);
  
         work->data = req;
@@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req)
   * of hypercalls, but since this is only used in error paths there's
   * no real need.
   */
-static void xen_blkbk_unmap(struct xen_blkif *blkif,
+static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
                              struct grant_page *pages[],
                              int num)
  {
@@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif,
  
         while (num) {
                 unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-               
-               invcount = xen_blkbk_unmap_prepare(blkif, pages, batch,
+
+               invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
                                                    unmap, unmap_pages);
                 if (invcount) {
                         ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
                         BUG_ON(ret);
-                       put_free_pages(blkif, unmap_pages, invcount);
+                       put_free_pages(ring, unmap_pages, invcount);
                 }
                 pages += batch;
                 num -= batch;
         }
  }
  
-static int xen_blkbk_map(struct xen_blkif *blkif,
+static int xen_blkbk_map(struct xen_blkif_ring *ring,
                          struct grant_page *pages[],
                          int num, bool ro)
  {
@@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif,
         int ret = 0;
         int last_map = 0, map_until = 0;
         int use_persistent_gnts;
+       struct xen_blkif *blkif = ring->blkif;
  
         use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
  
@@ -806,10 +823,11 @@ again:
         for (i = map_until; i < num; i++) {
                 uint32_t flags;
  
-               if (use_persistent_gnts)
+               if (use_persistent_gnts) {
                         persistent_gnt = get_persistent_gnt(
-                               blkif,
+                               ring,
                                 pages[i]->gref);
+               }
  
                 if (persistent_gnt) {
                         /*
@@ -819,7 +837,7 @@ again:
                         pages[i]->page = persistent_gnt->page;
                         pages[i]->persistent_gnt = persistent_gnt;
                 } else {
-                       if (get_free_page(blkif, &pages[i]->page))
+                       if (get_free_page(ring, &pages[i]->page))
                                 goto out_of_memory;
                         addr = vaddr(pages[i]->page);
                         pages_to_gnt[segs_to_map] = pages[i]->page;
@@ -852,7 +870,7 @@ again:
                         BUG_ON(new_map_idx >= segs_to_map);
                         if (unlikely(map[new_map_idx].status != 0)) {
                                 pr_debug("invalid buffer -- could not remap it\n");
-                               put_free_pages(blkif, &pages[seg_idx]->page, 1);
+                               put_free_pages(ring, &pages[seg_idx]->page, 1);
                                 pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
                                 ret |= 1;
                                 goto next;
@@ -862,7 +880,7 @@ again:
                         continue;
                 }
                 if (use_persistent_gnts &&
-                   blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
+                   ring->persistent_gnt_c < xen_blkif_max_pgrants) {
                         /*
                          * We are using persistent grants, the grant is
                          * not mapped but we might have room for it.
@@ -880,7 +898,7 @@ again:
                         persistent_gnt->gnt = map[new_map_idx].ref;
                         persistent_gnt->handle = map[new_map_idx].handle;
                         persistent_gnt->page = pages[seg_idx]->page;
-                       if (add_persistent_gnt(blkif,
+                       if (add_persistent_gnt(ring,
                                                persistent_gnt)) {
                                 kfree(persistent_gnt);
                                 persistent_gnt = NULL;
@@ -888,7 +906,7 @@ again:
                         }
                         pages[seg_idx]->persistent_gnt = persistent_gnt;
                         pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
-                                persistent_gnt->gnt, blkif->persistent_gnt_c,
+                                persistent_gnt->gnt, ring->persistent_gnt_c,
                                  xen_blkif_max_pgrants);
                         goto next;
                 }
@@ -913,7 +931,7 @@ next:
  
  out_of_memory:
         pr_alert("%s: out of memory\n", __func__);
-       put_free_pages(blkif, pages_to_gnt, segs_to_map);
+       put_free_pages(ring, pages_to_gnt, segs_to_map);
         return -ENOMEM;
  }
  
@@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req)
  {
         int rc;
  
-       rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
+       rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
                            pending_req->nr_segs,
                            (pending_req->operation != BLKIF_OP_READ));
  
@@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
                                     struct phys_req *preq)
  {
         struct grant_page **pages = pending_req->indirect_pages;
-       struct xen_blkif *blkif = pending_req->blkif;
+       struct xen_blkif_ring *ring = pending_req->ring;
         int indirect_grefs, rc, n, nseg, i;
         struct blkif_request_segment *segments = NULL;
  
@@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
         for (i = 0; i < indirect_grefs; i++)
                 pages[i]->gref = req->u.indirect.indirect_grefs[i];
  
-       rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
+       rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
         if (rc)
                 goto unmap;
  
@@ -972,15 +990,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
  unmap:
         if (segments)
                 kunmap_atomic(segments);
-       xen_blkbk_unmap(blkif, pages, indirect_grefs);
+       xen_blkbk_unmap(ring, pages, indirect_grefs);
         return rc;
  }
  
-static int dispatch_discard_io(struct xen_blkif *blkif,
+static int dispatch_discard_io(struct xen_blkif_ring *ring,
                                 struct blkif_request *req)
  {
         int err = 0;
         int status = BLKIF_RSP_OKAY;
+       struct xen_blkif *blkif = ring->blkif;
         struct block_device *bdev = blkif->vbd.bdev;
         unsigned long secure;
         struct phys_req preq;
@@ -997,7 +1016,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
                         preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
                 goto fail_response;
         }
-       blkif->st_ds_req++;
+       ring->st_ds_req++;
  
         secure = (blkif->vbd.discard_secure &&
                  (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
@@ -1013,26 +1032,28 @@ fail_response:
         } else if (err)
                 status = BLKIF_RSP_ERROR;
  
-       make_response(blkif, req->u.discard.id, req->operation, status);
+       make_response(ring, req->u.discard.id, req->operation, status);
         xen_blkif_put(blkif);
         return err;
  }
  
-static int dispatch_other_io(struct xen_blkif *blkif,
+static int dispatch_other_io(struct xen_blkif_ring *ring,
                              struct blkif_request *req,
                              struct pending_req *pending_req)
  {
-       free_req(blkif, pending_req);
-       make_response(blkif, req->u.other.id, req->operation,
+       free_req(ring, pending_req);
+       make_response(ring, req->u.other.id, req->operation,
                       BLKIF_RSP_EOPNOTSUPP);
         return -EIO;
  }
  
-static void xen_blk_drain_io(struct xen_blkif *blkif)
+static void xen_blk_drain_io(struct xen_blkif_ring *ring)
  {
+       struct xen_blkif *blkif = ring->blkif;
+
         atomic_set(&blkif->drain, 1);
         do {
-               if (atomic_read(&blkif->inflight) == 0)
+               if (atomic_read(&ring->inflight) == 0)
                         break;
                 wait_for_completion_interruptible_timeout(
                                 &blkif->drain_complete, HZ);
@@ -1053,12 +1074,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
         if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
             (error == -EOPNOTSUPP)) {
                 pr_debug("flush diskcache op failed, not supported\n");
-               xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
+               xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
                 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
         } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
                     (error == -EOPNOTSUPP)) {
                 pr_debug("write barrier op failed, not supported\n");
-               xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
+               xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
                 pending_req->status = BLKIF_RSP_EOPNOTSUPP;
         } else if (error) {
                 pr_debug("Buffer not up-to-date at end of operation,"
@@ -1092,9 +1113,9 @@ static void end_block_io_op(struct bio *bio)
   * and transmute  it to the block API to hand it over to the proper block disk.
   */
  static int
-__do_block_io_op(struct xen_blkif *blkif)
+__do_block_io_op(struct xen_blkif_ring *ring)
  {
-       union blkif_back_rings *blk_rings = &blkif->blk_rings;
+       union blkif_back_rings *blk_rings = &ring->blk_rings;
         struct blkif_request req;
         struct pending_req *pending_req;
         RING_IDX rc, rp;
@@ -1107,7 +1128,7 @@ __do_block_io_op(struct xen_blkif *blkif)
         if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
                 rc = blk_rings->common.rsp_prod_pvt;
                 pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
-                       rp, rc, rp - rc, blkif->vbd.pdevice);
+                       rp, rc, rp - rc, ring->blkif->vbd.pdevice);
                 return -EACCES;
         }
         while (rc != rp) {
@@ -1120,14 +1141,14 @@ __do_block_io_op(struct xen_blkif *blkif)
                         break;
                 }
  
-               pending_req = alloc_req(blkif);
+               pending_req = alloc_req(ring);
                 if (NULL == pending_req) {
-                       blkif->st_oo_req++;
+                       ring->st_oo_req++;
                         more_to_do = 1;
                         break;
                 }
  
-               switch (blkif->blk_protocol) {
+               switch (ring->blkif->blk_protocol) {
                 case BLKIF_PROTOCOL_NATIVE:
                         memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
                         break;
@@ -1151,16 +1172,16 @@ __do_block_io_op(struct xen_blkif *blkif)
                 case BLKIF_OP_WRITE_BARRIER:
                 case BLKIF_OP_FLUSH_DISKCACHE:
                 case BLKIF_OP_INDIRECT:
-                       if (dispatch_rw_block_io(blkif, &req, pending_req))
+                       if (dispatch_rw_block_io(ring, &req, pending_req))
                                 goto done;
                         break;
                 case BLKIF_OP_DISCARD:
-                       free_req(blkif, pending_req);
-                       if (dispatch_discard_io(blkif, &req))
+                       free_req(ring, pending_req);
+                       if (dispatch_discard_io(ring, &req))
                                 goto done;
                         break;
                 default:
-                       if (dispatch_other_io(blkif, &req, pending_req))
+                       if (dispatch_other_io(ring, &req, pending_req))
                                 goto done;
                         break;
                 }
@@ -1173,13 +1194,13 @@ done:
  }
  
  static int
-do_block_io_op(struct xen_blkif *blkif)
+do_block_io_op(struct xen_blkif_ring *ring)
  {
-       union blkif_back_rings *blk_rings = &blkif->blk_rings;
+       union blkif_back_rings *blk_rings = &ring->blk_rings;
         int more_to_do;
  
         do {
-               more_to_do = __do_block_io_op(blkif);
+               more_to_do = __do_block_io_op(ring);
                 if (more_to_do)
                         break;
  
@@ -1192,7 +1213,7 @@ do_block_io_op(struct xen_blkif *blkif)
   * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
   * and call the 'submit_bio' to pass it to the underlying storage.
   */
-static int dispatch_rw_block_io(struct xen_blkif *blkif,
+static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
                                 struct blkif_request *req,
                                 struct pending_req *pending_req)
  {
@@ -1220,17 +1241,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
  
         switch (req_operation) {
         case BLKIF_OP_READ:
-               blkif->st_rd_req++;
+               ring->st_rd_req++;
                 operation = READ;
                 break;
         case BLKIF_OP_WRITE:
-               blkif->st_wr_req++;
+               ring->st_wr_req++;
                 operation = WRITE_ODIRECT;
                 break;
         case BLKIF_OP_WRITE_BARRIER:
                 drain = true;
         case BLKIF_OP_FLUSH_DISKCACHE:
-               blkif->st_f_req++;
+               ring->st_f_req++;
                 operation = WRITE_FLUSH;
                 break;
         default:
@@ -1255,7 +1276,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
  
         preq.nr_sects      = 0;
  
-       pending_req->blkif     = blkif;
+       pending_req->ring      = ring;
         pending_req->id        = req->u.rw.id;
         pending_req->operation = req_operation;
         pending_req->status    = BLKIF_RSP_OKAY;
@@ -1282,12 +1303,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
                         goto fail_response;
         }
  
-       if (xen_vbd_translate(&preq, blkif, operation) != 0) {
+       if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
                 pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
                          operation == READ ? "read" : "write",
                          preq.sector_number,
                          preq.sector_number + preq.nr_sects,
-                        blkif->vbd.pdevice);
+                        ring->blkif->vbd.pdevice);
                 goto fail_response;
         }
  
@@ -1299,7 +1320,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
                 if (((int)preq.sector_number|(int)seg[i].nsec) &
                     ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
                         pr_debug("Misaligned I/O request from domain %d\n",
-                                blkif->domid);
+                                ring->blkif->domid);
                         goto fail_response;
                 }
         }
@@ -1308,7 +1329,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
          * issue the WRITE_FLUSH.
          */
         if (drain)
-               xen_blk_drain_io(pending_req->blkif);
+               xen_blk_drain_io(pending_req->ring);
  
         /*
          * If we have failed at this point, we need to undo the M2P override,
@@ -1323,8 +1344,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
          * This corresponding xen_blkif_put is done in __end_block_io_op, or
          * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
          */
-       xen_blkif_get(blkif);
-       atomic_inc(&blkif->inflight);
+       xen_blkif_get(ring->blkif);
+       atomic_inc(&ring->inflight);
  
         for (i = 0; i < nseg; i++) {
                 while ((bio == NULL) ||
@@ -1372,19 +1393,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
         blk_finish_plug(&plug);
  
         if (operation == READ)
-               blkif->st_rd_sect += preq.nr_sects;
+               ring->st_rd_sect += preq.nr_sects;
         else if (operation & WRITE)
-               blkif->st_wr_sect += preq.nr_sects;
+               ring->st_wr_sect += preq.nr_sects;
  
         return 0;
  
   fail_flush:
-       xen_blkbk_unmap(blkif, pending_req->segments,
+       xen_blkbk_unmap(ring, pending_req->segments,
                         pending_req->nr_segs);
   fail_response:
         /* Haven't submitted any bio's yet. */
-       make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
-       free_req(blkif, pending_req);
+       make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+       free_req(ring, pending_req);
         msleep(1); /* back off a bit */
         return -EIO;
  
@@ -1402,21 +1423,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
  /*
   * Put a response on the ring on how the operation fared.
   */
-static void make_response(struct xen_blkif *blkif, u64 id,
+static void make_response(struct xen_blkif_ring *ring, u64 id,
                           unsigned short op, int st)
  {
         struct blkif_response  resp;
         unsigned long     flags;
-       union blkif_back_rings *blk_rings = &blkif->blk_rings;
+       union blkif_back_rings *blk_rings;
         int notify;
  
         resp.id        = id;
         resp.operation = op;
         resp.status    = st;
  
-       spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+       spin_lock_irqsave(&ring->blk_ring_lock, flags);
+       blk_rings = &ring->blk_rings;
         /* Place on the response ring for the relevant domain. */
-       switch (blkif->blk_protocol) {
+       switch (ring->blkif->blk_protocol) {
         case BLKIF_PROTOCOL_NATIVE:
                 memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
                        &resp, sizeof(resp));
@@ -1434,9 +1456,9 @@ static void make_response(struct xen_blkif *blkif, u64 id,
         }
         blk_rings->common.rsp_prod_pvt++;
         RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
-       spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+       spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
         if (notify)
-               notify_remote_via_irq(blkif->irq);
+               notify_remote_via_irq(ring->irq);
  }
  
  static int __init xen_blkif_init(void)
@@ -1452,6 +1474,9 @@ static int __init xen_blkif_init(void)
                 xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
         }
  
+       if (xenblk_max_queues == 0)
+               xenblk_max_queues = num_online_cpus();
+
         rc = xen_blkif_interface_init();
         if (rc)
                 goto failed_init;
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h

index 68e87a0..b27c5ba 100644 (file)
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -46,6 +46,7 @@
  #include <xen/interface/io/protocols.h>
  
  extern unsigned int xen_blkif_max_ring_order;
+extern unsigned int xenblk_max_queues;
  /*
   * This is the maximum number of segments that would be allowed in indirect
   * requests. This value will also be passed to the frontend.
@@ -269,68 +270,79 @@ struct persistent_gnt {
         struct list_head remove_node;
  };
  
-struct xen_blkif {
-       /* Unique identifier for this interface. */
-       domid_t                 domid;
-       unsigned int            handle;
+/* Per-ring information. */
+struct xen_blkif_ring {
         /* Physical parameters of the comms window. */
         unsigned int            irq;
-       /* Comms information. */
-       enum blkif_protocol     blk_protocol;
         union blkif_back_rings  blk_rings;
         void                    *blk_ring;
-       /* The VBD attached to this interface. */
-       struct xen_vbd          vbd;
-       /* Back pointer to the backend_info. */
-       struct backend_info     *be;
         /* Private fields. */
         spinlock_t              blk_ring_lock;
-       atomic_t                refcnt;
  
         wait_queue_head_t       wq;
-       /* for barrier (drain) requests */
-       struct completion       drain_complete;
-       atomic_t                drain;
         atomic_t                inflight;
-       /* One thread per one blkif. */
+       /* One thread per blkif ring. */
         struct task_struct      *xenblkd;
         unsigned int            waiting_reqs;
  
-       /* tree to store persistent grants */
+       /* List of all 'pending_req' available */
+       struct list_head        pending_free;
+       /* And its spinlock. */
+       spinlock_t              pending_free_lock;
+       wait_queue_head_t       pending_free_wq;
+
+       /* Tree to store persistent grants. */
+       spinlock_t              pers_gnts_lock;
         struct rb_root          persistent_gnts;
         unsigned int            persistent_gnt_c;
         atomic_t                persistent_gnt_in_use;
         unsigned long           next_lru;
  
-       /* used by the kworker that offload work from the persistent purge */
+       /* Statistics. */
+       unsigned long           st_print;
+       unsigned long long      st_rd_req;
+       unsigned long long      st_wr_req;
+       unsigned long long      st_oo_req;
+       unsigned long long      st_f_req;
+       unsigned long long      st_ds_req;
+       unsigned long long      st_rd_sect;
+       unsigned long long      st_wr_sect;
+
+       /* Used by the kworker that offload work from the persistent purge. */
         struct list_head        persistent_purge_list;
         struct work_struct      persistent_purge_work;
  
-       /* buffer of free pages to map grant refs */
+       /* Buffer of free pages to map grant refs. */
         spinlock_t              free_pages_lock;
         int                     free_pages_num;
         struct list_head        free_pages;
  
-       /* List of all 'pending_req' available */
-       struct list_head        pending_free;
-       /* And its spinlock. */
-       spinlock_t              pending_free_lock;
-       wait_queue_head_t       pending_free_wq;
-
-       /* statistics */
-       unsigned long           st_print;
-       unsigned long long                      st_rd_req;
-       unsigned long long                      st_wr_req;
-       unsigned long long                      st_oo_req;
-       unsigned long long                      st_f_req;
-       unsigned long long                      st_ds_req;
-       unsigned long long                      st_rd_sect;
-       unsigned long long                      st_wr_sect;
-
         struct work_struct      free_work;
         /* Thread shutdown wait queue. */
         wait_queue_head_t       shutdown_wq;
-       unsigned int nr_ring_pages;
+       struct xen_blkif        *blkif;
+};
+
+struct xen_blkif {
+       /* Unique identifier for this interface. */
+       domid_t                 domid;
+       unsigned int            handle;
+       /* Comms information. */
+       enum blkif_protocol     blk_protocol;
+       /* The VBD attached to this interface. */
+       struct xen_vbd          vbd;
+       /* Back pointer to the backend_info. */
+       struct backend_info     *be;
+       atomic_t                refcnt;
+       /* for barrier (drain) requests */
+       struct completion       drain_complete;
+       atomic_t                drain;
+
+       struct work_struct      free_work;
+       unsigned int            nr_ring_pages;
+       /* All rings for this device. */
+       struct xen_blkif_ring   *rings;
+       unsigned int            nr_rings;
  };
  
  struct seg_buf {
@@ -352,7 +364,7 @@ struct grant_page {
   * response queued for it, with the saved 'id' passed back.
   */
  struct pending_req {
-       struct xen_blkif        *blkif;
+       struct xen_blkif_ring   *ring;
         u64                     id;
         int                     nr_segs;
         atomic_t                pendcnt;
@@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void);
  irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
  int xen_blkif_schedule(void *arg);
  int xen_blkif_purge_persistent(void *arg);
-void xen_blkbk_free_caches(struct xen_blkif *blkif);
+void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
  
  int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
                               struct backend_info *be, int state);
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c

index f53cff4..876763f 100644 (file)
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
  {
         int err;
         char name[BLKBACK_NAME_LEN];
+       struct xen_blkif_ring *ring;
+       int i;
  
         /* Not ready to connect? */
-       if (!blkif->irq || !blkif->vbd.bdev)
+       if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
                 return;
  
         /* Already connected? */
@@ -113,13 +115,68 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
         }
         invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
  
-       blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name);
-       if (IS_ERR(blkif->xenblkd)) {
-               err = PTR_ERR(blkif->xenblkd);
-               blkif->xenblkd = NULL;
-               xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
-               return;
+       /* Start one xenblkd kthread per ring. */
+       for (i = 0; i < blkif->nr_rings; i++) {
+               ring = &blkif->rings[i];
+               ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
+               if (IS_ERR(ring->xenblkd)) {
+                       err = PTR_ERR(ring->xenblkd);
+                       ring->xenblkd = NULL;
+                       xenbus_dev_fatal(blkif->be->dev, err,
+                                       "start %s-%d xenblkd", name, i);
+                       goto out;
+               }
+       }
+       return;
+
+out:
+       /*
+        * Unwind: stop the threads started so far and clear each pointer so
+        * that xen_blkif_disconnect() does not call kthread_stop() on them
+        * a second time.
+        */
+       while (--i >= 0) {
+               ring = &blkif->rings[i];
+               kthread_stop(ring->xenblkd);
+               ring->xenblkd = NULL;
+       }
+}
+
+/*
+ * Allocate and initialise blkif->nr_rings ring structures.
+ *
+ * Each ring takes its own reference on @blkif.  Returns 0 on success or
+ * -ENOMEM if the ring array cannot be allocated.
+ */
+static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
+{
+       unsigned int r;
+
+       /* kcalloc() zeroes the array and fails cleanly if count * size overflows. */
+       blkif->rings = kcalloc(blkif->nr_rings, sizeof(struct xen_blkif_ring), GFP_KERNEL);
+       if (!blkif->rings)
+               return -ENOMEM;
+
+       for (r = 0; r < blkif->nr_rings; r++) {
+               struct xen_blkif_ring *ring = &blkif->rings[r];
+
+               spin_lock_init(&ring->blk_ring_lock);
+               init_waitqueue_head(&ring->wq);
+               INIT_LIST_HEAD(&ring->pending_free);
+               INIT_LIST_HEAD(&ring->persistent_purge_list);
+               INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants);
+               spin_lock_init(&ring->free_pages_lock);
+               INIT_LIST_HEAD(&ring->free_pages);
+
+               spin_lock_init(&ring->pending_free_lock);
+               init_waitqueue_head(&ring->pending_free_wq);
+               init_waitqueue_head(&ring->shutdown_wq);
+               ring->blkif = blkif;
+               ring->st_print = jiffies;
+               xen_blkif_get(blkif);
         }
+
+       return 0;
  }
  
  static struct xen_blkif *xen_blkif_alloc(domid_t domid)
@@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
                 return ERR_PTR(-ENOMEM);
  
         blkif->domid = domid;
-       spin_lock_init(&blkif->blk_ring_lock);
         atomic_set(&blkif->refcnt, 1);
-       init_waitqueue_head(&blkif->wq);
         init_completion(&blkif->drain_complete);
-       atomic_set(&blkif->drain, 0);
-       blkif->st_print = jiffies;
-       blkif->persistent_gnts.rb_node = NULL;
-       spin_lock_init(&blkif->free_pages_lock);
-       INIT_LIST_HEAD(&blkif->free_pages);
-       INIT_LIST_HEAD(&blkif->persistent_purge_list);
-       blkif->free_pages_num = 0;
-       atomic_set(&blkif->persistent_gnt_in_use, 0);
-       atomic_set(&blkif->inflight, 0);
-       INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
-
-       INIT_LIST_HEAD(&blkif->pending_free);
         INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
-       spin_lock_init(&blkif->pending_free_lock);
-       init_waitqueue_head(&blkif->pending_free_wq);
-       init_waitqueue_head(&blkif->shutdown_wq);
  
         return blkif;
  }
  
-static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
                          unsigned int nr_grefs, unsigned int evtchn)
  {
         int err;
+       struct xen_blkif *blkif = ring->blkif;
  
         /* Already connected through? */
-       if (blkif->irq)
+       if (ring->irq)
                 return 0;
  
         err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
-                                    &blkif->blk_ring);
+                                    &ring->blk_ring);
         if (err < 0)
                 return err;
  
@@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
         case BLKIF_PROTOCOL_NATIVE:
         {
                 struct blkif_sring *sring;
-               sring = (struct blkif_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.native, sring,
+               sring = (struct blkif_sring *)ring->blk_ring;
+               BACK_RING_INIT(&ring->blk_rings.native, sring,
                                XEN_PAGE_SIZE * nr_grefs);
                 break;
         }
         case BLKIF_PROTOCOL_X86_32:
         {
                 struct blkif_x86_32_sring *sring_x86_32;
-               sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32,
+               sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring;
+               BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32,
                                XEN_PAGE_SIZE * nr_grefs);
                 break;
         }
         case BLKIF_PROTOCOL_X86_64:
         {
                 struct blkif_x86_64_sring *sring_x86_64;
-               sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64,
+               sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring;
+               BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64,
                                XEN_PAGE_SIZE * nr_grefs);
                 break;
         }
@@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
  
         err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
                                                     xen_blkif_be_int, 0,
-                                                   "blkif-backend", blkif);
+                                                   "blkif-backend", ring);
         if (err < 0) {
-               xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
-               blkif->blk_rings.common.sring = NULL;
+               xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+               ring->blk_rings.common.sring = NULL;
                 return err;
         }
-       blkif->irq = err;
+       ring->irq = err;
  
         return 0;
  }
@@ -216,50 +244,84 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
+/*
+ * Stop every ring's kthread and release all per-ring resources of @blkif.
+ *
+ * Returns 0 on success, or -EBUSY if any ring still has I/O in flight.  The
+ * busy check is done for every ring before anything is torn down, so a
+ * later call after -EBUSY does not repeat the per-ring xen_blkif_put()
+ * for a ring that was already released.
+ */
  static int xen_blkif_disconnect(struct xen_blkif *blkif)
  {
         struct pending_req *req, *n;
-       int i = 0, j;
+       unsigned int j, r;
  
-       if (blkif->xenblkd) {
-               kthread_stop(blkif->xenblkd);
-               wake_up(&blkif->shutdown_wq);
-               blkif->xenblkd = NULL;
-       }
+       /* Pass 1: quiesce all rings; bail out before any teardown if busy. */
+       for (r = 0; r < blkif->nr_rings; r++) {
+               struct xen_blkif_ring *ring = &blkif->rings[r];
  
-       /* The above kthread_stop() guarantees that at this point we
-        * don't have any discard_io or other_io requests. So, checking
-        * for inflight IO is enough.
-        */
-       if (atomic_read(&blkif->inflight) > 0)
-               return -EBUSY;
+               if (ring->xenblkd) {
+                       kthread_stop(ring->xenblkd);
+                       wake_up(&ring->shutdown_wq);
+                       ring->xenblkd = NULL;
+               }
  
-       if (blkif->irq) {
-               unbind_from_irqhandler(blkif->irq, blkif);
-               blkif->irq = 0;
-       }
+               /* The above kthread_stop() guarantees that at this point we
+                * don't have any discard_io or other_io requests. So, checking
+                * for inflight IO is enough.
+                */
+               if (atomic_read(&ring->inflight) > 0)
+                       return -EBUSY;
+       }
+
+       /* Pass 2: threads are stopped and nothing is in flight; free each ring once. */
+       for (r = 0; r < blkif->nr_rings; r++) {
+               struct xen_blkif_ring *ring = &blkif->rings[r];
+               unsigned int i = 0;
  
-       if (blkif->blk_rings.common.sring) {
-               xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
-               blkif->blk_rings.common.sring = NULL;
-       }
+               if (ring->irq) {
+                       unbind_from_irqhandler(ring->irq, ring);
+                       ring->irq = 0;
+               }
  
-       /* Remove all persistent grants and the cache of ballooned pages. */
-       xen_blkbk_free_caches(blkif);
+               if (ring->blk_rings.common.sring) {
+                       xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+                       ring->blk_rings.common.sring = NULL;
+               }
  
-       /* Check that there is no request in use */
-       list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
-               list_del(&req->free_list);
+               /* Remove all persistent grants and the cache of ballooned pages. */
+               xen_blkbk_free_caches(ring);
  
-               for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
-                       kfree(req->segments[j]);
+               /* Check that there is no request in use */
+               list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
+                       list_del(&req->free_list);
  
-               for (j = 0; j < MAX_INDIRECT_PAGES; j++)
-                       kfree(req->indirect_pages[j]);
+                       for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+                               kfree(req->segments[j]);
  
-               kfree(req);
-               i++;
-       }
+                       for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+                               kfree(req->indirect_pages[j]);
+
+                       kfree(req);
+                       i++;
+               }
  
-       WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+               BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
+               BUG_ON(!list_empty(&ring->persistent_purge_list));
+               BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
+               BUG_ON(!list_empty(&ring->free_pages));
+               BUG_ON(ring->free_pages_num != 0);
+               BUG_ON(ring->persistent_gnt_c != 0);
+               WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+               /* Drop the reference taken per ring in xen_blkif_alloc_rings(). */
+               xen_blkif_put(blkif);
+       }
         blkif->nr_ring_pages = 0;
+       /*
+        * blkif->rings was allocated in connect_ring, so we should free it
+        * here.
+        */
+       kfree(blkif->rings);
+       blkif->rings = NULL;
+       blkif->nr_rings = 0;
  
         return 0;
  }
@@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif)
         xen_vbd_free(&blkif->vbd);
  
         /* Make sure everything is drained before shutting down */
-       BUG_ON(blkif->persistent_gnt_c != 0);
-       BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
-       BUG_ON(blkif->free_pages_num != 0);
-       BUG_ON(!list_empty(&blkif->persistent_purge_list));
-       BUG_ON(!list_empty(&blkif->free_pages));
-       BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
-
         kmem_cache_free(xen_blkif_cachep, blkif);
  }
  
@@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void)
   *  sysfs interface for VBD I/O requests
   */
  
-#define VBD_SHOW(name, format, args...)                                        \
+#define VBD_SHOW_ALLRING(name, format)                                 \
         static ssize_t show_##name(struct device *_dev,                 \
                                    struct device_attribute *attr,       \
                                    char *buf)                           \
         {                                                               \
                 struct xenbus_device *dev = to_xenbus_device(_dev);     \
                 struct backend_info *be = dev_get_drvdata(&dev->dev);   \
+               struct xen_blkif *blkif = be->blkif;                    \
+               unsigned int i;                                         \
+               unsigned long long result = 0;                          \
                                                                         \
-               return sprintf(buf, format, ##args);                    \
+               if (!blkif->rings)                              \
+                       goto out;                                       \
+                                                                       \
+               for (i = 0; i < blkif->nr_rings; i++) {         \
+                       struct xen_blkif_ring *ring = &blkif->rings[i]; \
+                                                                       \
+                       result += ring->st_##name;                      \
+               }                                                       \
+                                                                       \
+out:                                                                   \
+               return sprintf(buf, format, result);                    \
         }                                                               \
         static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
  
-VBD_SHOW(oo_req,  "%llu\n", be->blkif->st_oo_req);
-VBD_SHOW(rd_req,  "%llu\n", be->blkif->st_rd_req);
-VBD_SHOW(wr_req,  "%llu\n", be->blkif->st_wr_req);
-VBD_SHOW(f_req,  "%llu\n", be->blkif->st_f_req);
-VBD_SHOW(ds_req,  "%llu\n", be->blkif->st_ds_req);
-VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect);
-VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect);
+VBD_SHOW_ALLRING(oo_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_req,  "%llu\n");
+VBD_SHOW_ALLRING(wr_req,  "%llu\n");
+VBD_SHOW_ALLRING(f_req,  "%llu\n");
+VBD_SHOW_ALLRING(ds_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_sect, "%llu\n");
+VBD_SHOW_ALLRING(wr_sect, "%llu\n");
  
  static struct attribute *xen_vbdstat_attrs[] = {
         &dev_attr_oo_req.attr,
@@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = {
         .attrs = xen_vbdstat_attrs,
  };
  
+#define VBD_SHOW(name, format, args...)                                        \
+       static ssize_t show_##name(struct device *_dev,                 \
+                                  struct device_attribute *attr,       \
+                                  char *buf)                           \
+       {                                                               \
+               struct xenbus_device *dev = to_xenbus_device(_dev);     \
+               struct backend_info *be = dev_get_drvdata(&dev->dev);   \
+                                                                       \
+               return sprintf(buf, format, ##args);                    \
+       }                                                               \
+       static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
  VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
  VBD_SHOW(mode, "%s\n", be->mode);
  
@@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev)
  
         dev_set_drvdata(&dev->dev, NULL);
  
-       if (be->blkif) {
+       if (be->blkif)
                 xen_blkif_disconnect(be->blkif);
-               xen_blkif_put(be->blkif);
-       }
  
+       /* Put the reference we set in xen_blkif_alloc(). */
+       xen_blkif_put(be->blkif);
         kfree(be->mode);
         kfree(be);
         return 0;
@@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
                 goto fail;
         }
  
+       /* Multi-queue: advertise how many queues we support. */
+       err = xenbus_printf(XBT_NIL, dev->nodename,
+                           "multi-queue-max-queues", "%u", xenblk_max_queues);
+       if (err)
+               pr_warn("Error writing multi-queue-max-queues\n");
+
         /* setup back pointer */
         be->blkif->be = be;
  
@@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev,
                 }
  
                 err = connect_ring(be);
-               if (err)
+               if (err) {
+                       /*
+                        * Clean up so that memory resources can be used by
+                        * other devices. connect_ring already reported the error.
+                        */
+                       xen_blkif_disconnect(be->blkif);
                         break;
+               }
                 xen_update_blkif_status(be->blkif);
                 break;
  
@@ -825,50 +902,43 @@ again:
         xenbus_transaction_end(xbt, 1);
  }
  
-
-static int connect_ring(struct backend_info *be)
+/*
+ * Each ring may have multiple pages, depending on "ring-page-order".
+ */
+static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
  {
-       struct xenbus_device *dev = be->dev;
         unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
-       unsigned int evtchn, nr_grefs, ring_page_order;
-       unsigned int pers_grants;
-       char protocol[64] = "";
         struct pending_req *req, *n;
         int err, i, j;
+       struct xen_blkif *blkif = ring->blkif;
+       struct xenbus_device *dev = blkif->be->dev;
+       unsigned int ring_page_order, nr_grefs, evtchn;
  
-       pr_debug("%s %s\n", __func__, dev->otherend);
-
-       err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+       err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
                           &evtchn);
         if (err != 1) {
                 err = -EINVAL;
-               xenbus_dev_fatal(dev, err, "reading %s/event-channel",
-                                dev->otherend);
+               xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
                 return err;
         }
-       pr_info("event-channel %u\n", evtchn);
  
         err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
                           &ring_page_order);
         if (err != 1) {
-               err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
-                                 "%u", &ring_ref[0]);
+               err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]);
                 if (err != 1) {
                         err = -EINVAL;
-                       xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
-                                        dev->otherend);
+                       xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
                         return err;
                 }
                 nr_grefs = 1;
-               pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
-                       ring_ref[0]);
         } else {
                 unsigned int i;
  
                 if (ring_page_order > xen_blkif_max_ring_order) {
                         err = -EINVAL;
                         xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
-                                        dev->otherend, ring_page_order,
+                                        dir, ring_page_order,
                                          xen_blkif_max_ring_order);
                         return err;
                 }
@@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be)
                         char ring_ref_name[RINGREF_NAME_LEN];
  
                         snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-                       err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name,
+                       err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
                                            "%u", &ring_ref[i]);
                         if (err != 1) {
                                 err = -EINVAL;
                                 xenbus_dev_fatal(dev, err, "reading %s/%s",
-                                                dev->otherend, ring_ref_name);
+                                                dir, ring_ref_name);
                                 return err;
                         }
-                       pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
                 }
         }
-
-       be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
-       err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
-                           "%63s", protocol, NULL);
-       if (err)
-               strcpy(protocol, "unspecified, assuming default");
-       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
-               be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
-       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
-       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
-               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
-       else {
-               xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
-               return -1;
-       }
-       err = xenbus_gather(XBT_NIL, dev->otherend,
-                           "feature-persistent", "%u",
-                           &pers_grants, NULL);
-       if (err)
-               pers_grants = 0;
-
-       be->blkif->vbd.feature_gnt_persistent = pers_grants;
-       be->blkif->vbd.overflow_max_grants = 0;
-       be->blkif->nr_ring_pages = nr_grefs;
-
-       pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
-               nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
-               pers_grants ? "persistent grants" : "");
+       blkif->nr_ring_pages = nr_grefs;
  
         for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
                 req = kzalloc(sizeof(*req), GFP_KERNEL);
                 if (!req)
                         goto fail;
-               list_add_tail(&req->free_list, &be->blkif->pending_free);
+               list_add_tail(&req->free_list, &ring->pending_free);
                 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
                         req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
                         if (!req->segments[j])
@@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be)
         }
  
         /* Map the shared frame, irq etc. */
-       err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
+       err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
         if (err) {
                 xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
                 return err;
@@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be)
         return 0;
  
  fail:
-       list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) {
+       list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
                 list_del(&req->free_list);
                 for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
                         if (!req->segments[j])
@@ -962,6 +1003,93 @@ fail:
                 kfree(req);
         }
         return -ENOMEM;
+
+}
+
+/*
+ * Negotiate the ABI protocol, persistent-grant support and the number of
+ * queues with the frontend, allocate the rings and read each ring's
+ * references via read_per_ring_refs().  Returns 0 or a negative errno.
+ */
+static int connect_ring(struct backend_info *be)
+{
+       struct xenbus_device *dev = be->dev;
+       unsigned int pers_grants;
+       char protocol[64] = "";
+       int err, i;
+       char *xspath;
+       size_t xspathsize;
+       const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
+       unsigned int requested_num_queues = 0;
+
+       pr_debug("%s %s\n", __func__, dev->otherend);
+
+       be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
+       err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+                           "%63s", protocol, NULL);
+       if (err)
+               strcpy(protocol, "unspecified, assuming default");
+       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+       else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+               be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+       else {
+               /* err is 0 on this path, so report the errno we return. */
+               xenbus_dev_fatal(dev, -ENOSYS, "unknown fe protocol %s", protocol);
+               return -ENOSYS;
+       }
+       err = xenbus_gather(XBT_NIL, dev->otherend,
+                           "feature-persistent", "%u",
+                           &pers_grants, NULL);
+       if (err)
+               pers_grants = 0;
+
+       be->blkif->vbd.feature_gnt_persistent = pers_grants;
+       be->blkif->vbd.overflow_max_grants = 0;
+
+       /* Number of hardware queues the frontend asks for; 1 if unreadable. */
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues",
+                          "%u", &requested_num_queues);
+       if (err < 0) {
+               requested_num_queues = 1;
+       } else if (requested_num_queues > xenblk_max_queues ||
+                  requested_num_queues == 0) {
+               /* Buggy or malicious guest. err is non-negative, not an errno. */
+               xenbus_dev_fatal(dev, -ENOSYS,
+                                "guest requested %u queues, exceeding the maximum of %u.",
+                                requested_num_queues, xenblk_max_queues);
+               return -ENOSYS;
+       }
+       be->blkif->nr_rings = requested_num_queues;
+       if (xen_blkif_alloc_rings(be->blkif))
+               return -ENOMEM;
+
+       pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
+                be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
+                pers_grants ? "persistent grants" : "");
+
+       if (be->blkif->nr_rings == 1)
+               return read_per_ring_refs(&be->blkif->rings[0], dev->otherend);
+
+       xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
+       xspath = kmalloc(xspathsize, GFP_KERNEL);
+       if (!xspath) {
+               xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < be->blkif->nr_rings; i++) {
+               snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
+               err = read_per_ring_refs(&be->blkif->rings[i], xspath);
+               if (err) {
+                       kfree(xspath);
+                       return err;
+               }
+       }
+       kfree(xspath);
+       return 0;
+}
  
  static const struct xenbus_device_id xen_blkbk_ids[] = {
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c

index 2fee2ee..8a8dc91 100644 (file)
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -60,6 +60,20 @@
  
  #include <asm/xen/hypervisor.h>
  
+/*
+ * The minimal size of segment supported by the block framework is PAGE_SIZE.
+ * When Linux is using a different page size than Xen, it may not be possible
+ * to put all the data in a single segment.
+ * This can happen when the backend doesn't support indirect descriptor and
+ * therefore the maximum amount of data that a request can carry is
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
+ *
+ * Note that we only support one extra request. So the Linux page size
+ * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
+ * 88KB.
+ */
+#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
+
  enum blkif_state {
         BLKIF_STATE_DISCONNECTED,
         BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
         struct list_head node;
  };
  
+enum blk_req_status {
+       REQ_WAITING,
+       REQ_DONE,
+       REQ_ERROR,
+       REQ_EOPNOTSUPP,
+};
+
  struct blk_shadow {
         struct blkif_request req;
         struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
         struct grant **indirect_grants;
         struct scatterlist *sg;
         unsigned int num_sg;
+       enum blk_req_status status;
+
+       #define NO_ASSOCIATED_ID ~0UL
+       /*
+        * Id of the sibling if we ever need 2 requests when handling a
+        * block I/O request
+        */
+       unsigned long associated_id;
  };
  
  struct split_bio {
@@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32;
  module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
  MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
  
+static unsigned int xen_blkif_max_queues = 4;
+module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO);
+MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
+
  /*
   * Maximum order of pages to be used for the shared ring between front and
   * backend, 4KB page granularity is used.
@@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
         __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
  
  /*
- * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
- * characters are enough. Define to 20 to keep consist with backend.
+ * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
+ * characters are enough. Define to 20 to keep consistent with backend.
   */
  #define RINGREF_NAME_LEN (20)
+/*
+ * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
+ */
+#define QUEUE_NAME_LEN (17)
+
+/*
+ *  Per-ring info.
+ *  Every blkfront device can associate with one or more blkfront_ring_info,
+ *  depending on how many hardware queues/rings are to be used.
+ */
+struct blkfront_ring_info {
+       /* Lock to protect data in every ring buffer. */
+       spinlock_t ring_lock;
+       struct blkif_front_ring ring;
+       unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
+       unsigned int evtchn, irq;
+       struct work_struct work;
+       struct gnttab_free_callback callback;
+       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
+       struct list_head indirect_pages;
+       struct list_head grants;
+       unsigned int persistent_gnts_c;
+       unsigned long shadow_free;
+       struct blkfront_info *dev_info;
+};
  
  /*
   * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the
   */
  struct blkfront_info
  {
-       spinlock_t io_lock;
         struct mutex mutex;
         struct xenbus_device *xbdev;
         struct gendisk *gd;
         int vdevice;
         blkif_vdev_t handle;
         enum blkif_state connected;
-       int ring_ref[XENBUS_MAX_RING_GRANTS];
+       /* Number of pages per ring buffer. */
         unsigned int nr_ring_pages;
-       struct blkif_front_ring ring;
-       unsigned int evtchn, irq;
         struct request_queue *rq;
-       struct work_struct work;
-       struct gnttab_free_callback callback;
-       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
-       struct list_head grants;
-       struct list_head indirect_pages;
-       unsigned int persistent_gnts_c;
-       unsigned long shadow_free;
         unsigned int feature_flush;
         unsigned int feature_discard:1;
         unsigned int feature_secdiscard:1;
@@ -155,6 +203,8 @@ struct blkfront_info
         unsigned int max_indirect_segments;
         int is_ready;
         struct blk_mq_tag_set tag_set;
+       struct blkfront_ring_info *rinfo;
+       unsigned int nr_rings;
  };
  
  static unsigned int nr_minors;
@@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock);
  
  #define GREFS(_psegs)  ((_psegs) * GRANTS_PER_PSEG)
  
-static int blkfront_setup_indirect(struct blkfront_info *info);
-static int blkfront_gather_backend_features(struct blkfront_info *info);
+static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
+static void blkfront_gather_backend_features(struct blkfront_info *info);
  
-static int get_id_from_freelist(struct blkfront_info *info)
+static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
  {
-       unsigned long free = info->shadow_free;
-       BUG_ON(free >= BLK_RING_SIZE(info));
-       info->shadow_free = info->shadow[free].req.u.rw.id;
-       info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
+       unsigned long free = rinfo->shadow_free;
+
+       BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
+       rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
+       rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
         return free;
  }
  
-static int add_id_to_freelist(struct blkfront_info *info,
-                              unsigned long id)
+static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
+                             unsigned long id)
  {
-       if (info->shadow[id].req.u.rw.id != id)
+       if (rinfo->shadow[id].req.u.rw.id != id)
                 return -EINVAL;
-       if (info->shadow[id].request == NULL)
+       if (rinfo->shadow[id].request == NULL)
                 return -EINVAL;
-       info->shadow[id].req.u.rw.id  = info->shadow_free;
-       info->shadow[id].request = NULL;
-       info->shadow_free = id;
+       rinfo->shadow[id].req.u.rw.id  = rinfo->shadow_free;
+       rinfo->shadow[id].request = NULL;
+       rinfo->shadow_free = id;
         return 0;
  }
  
-static int fill_grant_buffer(struct blkfront_info *info, int num)
+static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
  {
+       struct blkfront_info *info = rinfo->dev_info;
         struct page *granted_page;
         struct grant *gnt_list_entry, *n;
         int i = 0;
  
-       while(i < num) {
+       while (i < num) {
                 gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
                 if (!gnt_list_entry)
                         goto out_of_memory;
@@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
                 }
  
                 gnt_list_entry->gref = GRANT_INVALID_REF;
-               list_add(&gnt_list_entry->node, &info->grants);
+               list_add(&gnt_list_entry->node, &rinfo->grants);
                 i++;
         }
  
@@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
  
  out_of_memory:
         list_for_each_entry_safe(gnt_list_entry, n,
-                                &info->grants, node) {
+                                &rinfo->grants, node) {
                 list_del(&gnt_list_entry->node);
                 if (info->feature_persistent)
                         __free_page(gnt_list_entry->page);
@@ -263,17 +315,17 @@ out_of_memory:
         return -ENOMEM;
  }
  
-static struct grant *get_free_grant(struct blkfront_info *info)
+static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
  {
         struct grant *gnt_list_entry;
  
-       BUG_ON(list_empty(&info->grants));
-       gnt_list_entry = list_first_entry(&info->grants, struct grant,
+       BUG_ON(list_empty(&rinfo->grants));
+       gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
                                           node);
         list_del(&gnt_list_entry->node);
  
         if (gnt_list_entry->gref != GRANT_INVALID_REF)
-               info->persistent_gnts_c--;
+               rinfo->persistent_gnts_c--;
  
         return gnt_list_entry;
  }
@@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry,
  
  static struct grant *get_grant(grant_ref_t *gref_head,
                                unsigned long gfn,
-                              struct blkfront_info *info)
+                              struct blkfront_ring_info *rinfo)
  {
-       struct grant *gnt_list_entry = get_free_grant(info);
+       struct grant *gnt_list_entry = get_free_grant(rinfo);
+       struct blkfront_info *info = rinfo->dev_info;
  
         if (gnt_list_entry->gref != GRANT_INVALID_REF)
                 return gnt_list_entry;
@@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head,
  }
  
  static struct grant *get_indirect_grant(grant_ref_t *gref_head,
-                                       struct blkfront_info *info)
+                                       struct blkfront_ring_info *rinfo)
  {
-       struct grant *gnt_list_entry = get_free_grant(info);
+       struct grant *gnt_list_entry = get_free_grant(rinfo);
+       struct blkfront_info *info = rinfo->dev_info;
  
         if (gnt_list_entry->gref != GRANT_INVALID_REF)
                 return gnt_list_entry;
@@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head,
                 struct page *indirect_page;
  
                 /* Fetch a pre-allocated page to use for indirect grefs */
-               BUG_ON(list_empty(&info->indirect_pages));
-               indirect_page = list_first_entry(&info->indirect_pages,
+               BUG_ON(list_empty(&rinfo->indirect_pages));
+               indirect_page = list_first_entry(&rinfo->indirect_pages,
                                                  struct page, lru);
                 list_del(&indirect_page->lru);
                 gnt_list_entry->page = indirect_page;
@@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
  
  static void blkif_restart_queue_callback(void *arg)
  {
-       struct blkfront_info *info = (struct blkfront_info *)arg;
-       schedule_work(&info->work);
+       struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
+       schedule_work(&rinfo->work);
  }
  
  static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
@@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
         return 0;
  }
  
-static int blkif_queue_discard_req(struct request *req)
+static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
+                                           struct request *req,
+                                           struct blkif_request **ring_req)
  {
-       struct blkfront_info *info = req->rq_disk->private_data;
+       unsigned long id;
+
+       *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
+       rinfo->ring.req_prod_pvt++;
+
+       id = get_id_from_freelist(rinfo);
+       rinfo->shadow[id].request = req;
+       rinfo->shadow[id].status = REQ_WAITING;
+       rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
+
+       (*ring_req)->u.rw.id = id;
+
+       return id;
+}
+
+static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
+{
+       struct blkfront_info *info = rinfo->dev_info;
         struct blkif_request *ring_req;
         unsigned long id;
  
         /* Fill out a communications ring structure. */
-       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-       id = get_id_from_freelist(info);
-       info->shadow[id].request = req;
+       id = blkif_ring_get_request(rinfo, req, &ring_req);
  
         ring_req->operation = BLKIF_OP_DISCARD;
         ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
@@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req)
         else
                 ring_req->u.discard.flag = 0;
  
-       info->ring.req_prod_pvt++;
-
         /* Keep a private copy so we can reissue requests when recovering. */
-       info->shadow[id].req = *ring_req;
+       rinfo->shadow[id].req = *ring_req;
  
         return 0;
  }
@@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req)
  struct setup_rw_req {
         unsigned int grant_idx;
         struct blkif_request_segment *segments;
-       struct blkfront_info *info;
+       struct blkfront_ring_info *rinfo;
         struct blkif_request *ring_req;
         grant_ref_t gref_head;
         unsigned int id;
@@ -495,6 +564,9 @@ struct setup_rw_req {
         bool need_copy;
         unsigned int bvec_off;
         char *bvec_data;
+
+       bool require_extra_req;
+       struct blkif_request *extra_ring_req;
  };
  
  static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
         /* Convenient aliases */
         unsigned int grant_idx = setup->grant_idx;
         struct blkif_request *ring_req = setup->ring_req;
-       struct blkfront_info *info = setup->info;
-       struct blk_shadow *shadow = &info->shadow[setup->id];
+       struct blkfront_ring_info *rinfo = setup->rinfo;
+       /*
+        * We always use the shadow of the first request to store the list
+        * of grants associated with the block I/O request. This makes
+        * completion easier to handle even if the block I/O request is
+        * split.
+        */
+       struct blk_shadow *shadow = &rinfo->shadow[setup->id];
+
+       if (unlikely(setup->require_extra_req &&
+                    grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+               /*
+                * We are using the second request, setup grant_idx
+                * to be the index of the segment array.
+                */
+               grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
+               ring_req = setup->extra_ring_req;
+       }
  
         if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
             (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
@@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
                         kunmap_atomic(setup->segments);
  
                 n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
-               gnt_list_entry = get_indirect_grant(&setup->gref_head, info);
+               gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
                 shadow->indirect_grants[n] = gnt_list_entry;
                 setup->segments = kmap_atomic(gnt_list_entry->page);
                 ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
         }
  
-       gnt_list_entry = get_grant(&setup->gref_head, gfn, info);
+       gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
         ref = gnt_list_entry->gref;
-       shadow->grants_used[grant_idx] = gnt_list_entry;
+       /*
+        * All the grants are stored in the shadow of the first
+        * request. Therefore we have to use the global index.
+        */
+       shadow->grants_used[setup->grant_idx] = gnt_list_entry;
  
         if (setup->need_copy) {
                 void *shared_data;
@@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
         (setup->grant_idx)++;
  }
  
-static int blkif_queue_rw_req(struct request *req)
+static void blkif_setup_extra_req(struct blkif_request *first,
+                                 struct blkif_request *second)
  {
-       struct blkfront_info *info = req->rq_disk->private_data;
-       struct blkif_request *ring_req;
-       unsigned long id;
+       uint16_t nr_segments = first->u.rw.nr_segments;
+
+       /*
+        * The second request is only present when the first request uses
+        * all its segments. It is always the continuation of the first one.
+        */
+       first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+       second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       second->u.rw.sector_number = first->u.rw.sector_number +
+               (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+       second->u.rw.handle = first->u.rw.handle;
+       second->operation = first->operation;
+}
+
+static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
+{
+       struct blkfront_info *info = rinfo->dev_info;
+       struct blkif_request *ring_req, *extra_ring_req = NULL;
+       unsigned long id, extra_id = NO_ASSOCIATED_ID;
+       bool require_extra_req = false;
         int i;
         struct setup_rw_req setup = {
                 .grant_idx = 0,
                 .segments = NULL,
-               .info = info,
+               .rinfo = rinfo,
                 .need_copy = rq_data_dir(req) && info->feature_persistent,
         };
  
@@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req)
          * existing persistent grants, or if we have to get new grants,
          * as there are not sufficiently many free.
          */
-       bool new_persistent_gnts;
         struct scatterlist *sg;
         int num_sg, max_grefs, num_grant;
  
@@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req)
                  */
                 max_grefs += INDIRECT_GREFS(max_grefs);
  
-       /* Check if we have enough grants to allocate a requests */
-       if (info->persistent_gnts_c < max_grefs) {
-               new_persistent_gnts = 1;
-               if (gnttab_alloc_grant_references(
-                   max_grefs - info->persistent_gnts_c,
-                   &setup.gref_head) < 0) {
+       /*
+        * We have to reserve 'max_grefs' grants because persistent
+        * grants are shared by all rings.
+        */
+       if (max_grefs > 0)
+               if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) {
                         gnttab_request_free_callback(
-                               &info->callback,
+                               &rinfo->callback,
                                 blkif_restart_queue_callback,
-                               info,
+                               rinfo,
                                 max_grefs);
                         return 1;
                 }
-       } else
-               new_persistent_gnts = 0;
  
         /* Fill out a communications ring structure. */
-       ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-       id = get_id_from_freelist(info);
-       info->shadow[id].request = req;
-
-       BUG_ON(info->max_indirect_segments == 0 &&
-              GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-       BUG_ON(info->max_indirect_segments &&
-              GREFS(req->nr_phys_segments) > info->max_indirect_segments);
+       id = blkif_ring_get_request(rinfo, req, &ring_req);
  
-       num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+       num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
         num_grant = 0;
         /* Calculate the number of grant used */
-       for_each_sg(info->shadow[id].sg, sg, num_sg, i)
+       for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
                num_grant += gnttab_count_grant(sg->offset, sg->length);
  
-       ring_req->u.rw.id = id;
-       info->shadow[id].num_sg = num_sg;
-       if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+       require_extra_req = info->max_indirect_segments == 0 &&
+               num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
+
+       rinfo->shadow[id].num_sg = num_sg;
+       if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+           likely(!require_extra_req)) {
                 /*
                  * The indirect operation can only be a BLKIF_OP_READ or
                  * BLKIF_OP_WRITE
@@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req)
                         }
                 }
                 ring_req->u.rw.nr_segments = num_grant;
+               if (unlikely(require_extra_req)) {
+                       extra_id = blkif_ring_get_request(rinfo, req,
+                                                         &extra_ring_req);
+                       /*
+                        * Only the first request contains the scatter-gather
+                        * list.
+                        */
+                       rinfo->shadow[extra_id].num_sg = 0;
+
+                       blkif_setup_extra_req(ring_req, extra_ring_req);
+
+                       /* Link the 2 requests together */
+                       rinfo->shadow[extra_id].associated_id = id;
+                       rinfo->shadow[id].associated_id = extra_id;
+               }
         }
  
         setup.ring_req = ring_req;
         setup.id = id;
-       for_each_sg(info->shadow[id].sg, sg, num_sg, i) {
+
+       setup.require_extra_req = require_extra_req;
+       if (unlikely(require_extra_req))
+               setup.extra_ring_req = extra_ring_req;
+
+       for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
                 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
  
                 if (setup.need_copy) {
@@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req)
         if (setup.segments)
                 kunmap_atomic(setup.segments);
  
-       info->ring.req_prod_pvt++;
-
         /* Keep a private copy so we can reissue requests when recovering. */
-       info->shadow[id].req = *ring_req;
+       rinfo->shadow[id].req = *ring_req;
+       if (unlikely(require_extra_req))
+               rinfo->shadow[extra_id].req = *extra_ring_req;
  
-       if (new_persistent_gnts)
+       if (max_grefs > 0)
                 gnttab_free_grant_references(setup.gref_head);
  
         return 0;
@@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req)
   *
   * @req: a request struct
   */
-static int blkif_queue_request(struct request *req)
+static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
  {
-       struct blkfront_info *info = req->rq_disk->private_data;
-
-       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+       if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
                 return 1;
  
         if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
-               return blkif_queue_discard_req(req);
+               return blkif_queue_discard_req(req, rinfo);
         else
-               return blkif_queue_rw_req(req);
+               return blkif_queue_rw_req(req, rinfo);
  }
  
-static inline void flush_requests(struct blkfront_info *info)
+static inline void flush_requests(struct blkfront_ring_info *rinfo)
  {
         int notify;
  
-       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
+       RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
  
         if (notify)
-               notify_remote_via_irq(info->irq);
+               notify_remote_via_irq(rinfo->irq);
  }
  
  static inline bool blkif_request_flush_invalid(struct request *req,
@@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req,
  }
  
  static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
-                          const struct blk_mq_queue_data *qd)
+                         const struct blk_mq_queue_data *qd)
  {
-       struct blkfront_info *info = qd->rq->rq_disk->private_data;
+       unsigned long flags;
+       struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data;
  
         blk_mq_start_request(qd->rq);
-       spin_lock_irq(&info->io_lock);
-       if (RING_FULL(&info->ring))
+       spin_lock_irqsave(&rinfo->ring_lock, flags);
+       if (RING_FULL(&rinfo->ring))
                 goto out_busy;
  
-       if (blkif_request_flush_invalid(qd->rq, info))
+       if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
                 goto out_err;
  
-       if (blkif_queue_request(qd->rq))
+       if (blkif_queue_request(qd->rq, rinfo))
                 goto out_busy;
  
-       flush_requests(info);
-       spin_unlock_irq(&info->io_lock);
+       flush_requests(rinfo);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
         return BLK_MQ_RQ_QUEUE_OK;
  
  out_err:
-       spin_unlock_irq(&info->io_lock);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
         return BLK_MQ_RQ_QUEUE_ERROR;
  
  out_busy:
-       spin_unlock_irq(&info->io_lock);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
         blk_mq_stop_hw_queue(hctx);
         return BLK_MQ_RQ_QUEUE_BUSY;
  }
  
+static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+                           unsigned int index)
+{
+       struct blkfront_info *info = (struct blkfront_info *)data;
+
+       BUG_ON(info->nr_rings <= index);
+       hctx->driver_data = &info->rinfo[index];
+       return 0;
+}
+
  static struct blk_mq_ops blkfront_mq_ops = {
         .queue_rq = blkif_queue_rq,
         .map_queue = blk_mq_map_queue,
+       .init_hctx = blk_mq_init_hctx,
  };
  
  static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
@@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
  
         memset(&info->tag_set, 0, sizeof(info->tag_set));
         info->tag_set.ops = &blkfront_mq_ops;
-       info->tag_set.nr_hw_queues = 1;
-       info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+       info->tag_set.nr_hw_queues = info->nr_rings;
+       if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+               /*
+                * When indirect descriptors are not supported, the I/O request
+                * will be split between multiple requests in the ring.
+                * To avoid problems when sending the request, halve the
+                * depth of the queue.
+                */
+               info->tag_set.queue_depth =  BLK_RING_SIZE(info) / 2;
+       } else
+               info->tag_set.queue_depth = BLK_RING_SIZE(info);
         info->tag_set.numa_node = NUMA_NO_NODE;
         info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
         info->tag_set.cmd_size = 0;
         info->tag_set.driver_data = info;
  
         if (blk_mq_alloc_tag_set(&info->tag_set))
-               return -1;
+               return -EINVAL;
         rq = blk_mq_init_queue(&info->tag_set);
         if (IS_ERR(rq)) {
                 blk_mq_free_tag_set(&info->tag_set);
-               return -1;
+               return PTR_ERR(rq);
         }
  
         queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
@@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
  
  static void xlvbd_release_gendisk(struct blkfront_info *info)
  {
-       unsigned int minor, nr_minors;
+       unsigned int minor, nr_minors, i;
  
         if (info->rq == NULL)
                 return;
@@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
         /* No more blkif_request(). */
         blk_mq_stop_hw_queues(info->rq);
  
-       /* No more gnttab callback work. */
-       gnttab_cancel_free_callback(&info->callback);
+       for (i = 0; i < info->nr_rings; i++) {
+               struct blkfront_ring_info *rinfo = &info->rinfo[i];
  
-       /* Flush gnttab callback work. Must be done with no locks held. */
-       flush_work(&info->work);
+               /* No more gnttab callback work. */
+               gnttab_cancel_free_callback(&rinfo->callback);
+
+               /* Flush gnttab callback work. Must be done with no locks held. */
+               flush_work(&rinfo->work);
+       }
  
         del_gendisk(info->gd);
  
@@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
         info->gd = NULL;
  }
  
-/* Must be called with io_lock holded */
-static void kick_pending_request_queues(struct blkfront_info *info)
+/* Caller must already hold rinfo->ring_lock. */
+static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
  {
-       if (!RING_FULL(&info->ring))
-               blk_mq_start_stopped_hw_queues(info->rq, true);
+       if (!RING_FULL(&rinfo->ring))
+               blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
  }
  
-static void blkif_restart_queue(struct work_struct *work)
+static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
  {
-       struct blkfront_info *info = container_of(work, struct blkfront_info, work);
+       unsigned long flags;
  
-       spin_lock_irq(&info->io_lock);
-       if (info->connected == BLKIF_STATE_CONNECTED)
-               kick_pending_request_queues(info);
-       spin_unlock_irq(&info->io_lock);
+       spin_lock_irqsave(&rinfo->ring_lock, flags);
+       kick_pending_request_queues_locked(rinfo);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
  }
  
-static void blkif_free(struct blkfront_info *info, int suspend)
+static void blkif_restart_queue(struct work_struct *work)
  {
-       struct grant *persistent_gnt;
-       struct grant *n;
-       int i, j, segs;
+       struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
  
-       /* Prevent new requests being issued until we fix things up. */
-       spin_lock_irq(&info->io_lock);
-       info->connected = suspend ?
-               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
-       /* No more blkif_request(). */
-       if (info->rq)
-               blk_mq_stop_hw_queues(info->rq);
+       if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
+               kick_pending_request_queues(rinfo);
+}
  
-       /* Remove all persistent grants */
-       if (!list_empty(&info->grants)) {
-               list_for_each_entry_safe(persistent_gnt, n,
-                                        &info->grants, node) {
-                       list_del(&persistent_gnt->node);
-                       if (persistent_gnt->gref != GRANT_INVALID_REF) {
-                               gnttab_end_foreign_access(persistent_gnt->gref,
-                                                         0, 0UL);
-                               info->persistent_gnts_c--;
-                       }
-                       if (info->feature_persistent)
-                               __free_page(persistent_gnt->page);
-                       kfree(persistent_gnt);
-               }
-       }
-       BUG_ON(info->persistent_gnts_c != 0);
+static void blkif_free_ring(struct blkfront_ring_info *rinfo)
+{
+       struct grant *persistent_gnt, *n;
+       struct blkfront_info *info = rinfo->dev_info;
+       int i, j, segs;
  
         /*
          * Remove indirect pages, this only happens when using indirect
          * descriptors but not persistent grants
          */
-       if (!list_empty(&info->indirect_pages)) {
+       if (!list_empty(&rinfo->indirect_pages)) {
                 struct page *indirect_page, *n;
  
                 BUG_ON(info->feature_persistent);
-               list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+               list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
                         list_del(&indirect_page->lru);
                         __free_page(indirect_page);
                 }
         }
  
+       /* Remove all persistent grants. */
+       if (!list_empty(&rinfo->grants)) {
+               list_for_each_entry_safe(persistent_gnt, n,
+                                        &rinfo->grants, node) {
+                       list_del(&persistent_gnt->node);
+                       if (persistent_gnt->gref != GRANT_INVALID_REF) {
+                               gnttab_end_foreign_access(persistent_gnt->gref,
+                                                         0, 0UL);
+                               rinfo->persistent_gnts_c--;
+                       }
+                       if (info->feature_persistent)
+                               __free_page(persistent_gnt->page);
+                       kfree(persistent_gnt);
+               }
+       }
+       BUG_ON(rinfo->persistent_gnts_c != 0);
+
         for (i = 0; i < BLK_RING_SIZE(info); i++) {
                 /*
                  * Clear persistent grants present in requests already
                  * on the shared ring
                  */
-               if (!info->shadow[i].request)
+               if (!rinfo->shadow[i].request)
                         goto free_shadow;
  
-               segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
-                      info->shadow[i].req.u.indirect.nr_segments :
-                      info->shadow[i].req.u.rw.nr_segments;
+               segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+                      rinfo->shadow[i].req.u.indirect.nr_segments :
+                      rinfo->shadow[i].req.u.rw.nr_segments;
                 for (j = 0; j < segs; j++) {
-                       persistent_gnt = info->shadow[i].grants_used[j];
+                       persistent_gnt = rinfo->shadow[i].grants_used[j];
                         gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
                         if (info->feature_persistent)
                                 __free_page(persistent_gnt->page);
                         kfree(persistent_gnt);
                 }
  
-               if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+               if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
                         /*
                          * If this is not an indirect operation don't try to
                          * free indirect segments
@@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend)
                         goto free_shadow;
  
                 for (j = 0; j < INDIRECT_GREFS(segs); j++) {
-                       persistent_gnt = info->shadow[i].indirect_grants[j];
+                       persistent_gnt = rinfo->shadow[i].indirect_grants[j];
                         gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
                         __free_page(persistent_gnt->page);
                         kfree(persistent_gnt);
                 }
  
  free_shadow:
-               kfree(info->shadow[i].grants_used);
-               info->shadow[i].grants_used = NULL;
-               kfree(info->shadow[i].indirect_grants);
-               info->shadow[i].indirect_grants = NULL;
-               kfree(info->shadow[i].sg);
-               info->shadow[i].sg = NULL;
+               kfree(rinfo->shadow[i].grants_used);
+               rinfo->shadow[i].grants_used = NULL;
+               kfree(rinfo->shadow[i].indirect_grants);
+               rinfo->shadow[i].indirect_grants = NULL;
+               kfree(rinfo->shadow[i].sg);
+               rinfo->shadow[i].sg = NULL;
         }
  
         /* No more gnttab callback work. */
-       gnttab_cancel_free_callback(&info->callback);
-       spin_unlock_irq(&info->io_lock);
+       gnttab_cancel_free_callback(&rinfo->callback);
  
         /* Flush gnttab callback work. Must be done with no locks held. */
-       flush_work(&info->work);
+       flush_work(&rinfo->work);
  
         /* Free resources associated with old device channel. */
         for (i = 0; i < info->nr_ring_pages; i++) {
-               if (info->ring_ref[i] != GRANT_INVALID_REF) {
-                       gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
-                       info->ring_ref[i] = GRANT_INVALID_REF;
+               if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
+                       gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
+                       rinfo->ring_ref[i] = GRANT_INVALID_REF;
                 }
         }
-       free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
-       info->ring.sring = NULL;
+       free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+       rinfo->ring.sring = NULL;
  
-       if (info->irq)
-               unbind_from_irqhandler(info->irq, info);
-       info->evtchn = info->irq = 0;
+       if (rinfo->irq)
+               unbind_from_irqhandler(rinfo->irq, rinfo);
+       rinfo->evtchn = rinfo->irq = 0;
+}
  
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+       unsigned int i;
+
+       /* Prevent new requests being issued until we fix things up. */
+       info->connected = suspend ?
+               BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+       /* No more blkif_request(). */
+       if (info->rq)
+               blk_mq_stop_hw_queues(info->rq);
+
+       for (i = 0; i < info->nr_rings; i++)
+               blkif_free_ring(&info->rinfo[i]);
+
+       kfree(info->rinfo);
+       info->rinfo = NULL;
+       info->nr_rings = 0;
  }
  
  struct copy_from_grant {
@@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
         kunmap_atomic(shared_data);
  }
  
-static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
+static enum blk_req_status blkif_rsp_to_req_status(int rsp)
+{
+       switch (rsp)
+       {
+       case BLKIF_RSP_OKAY:
+               return REQ_DONE;
+       case BLKIF_RSP_EOPNOTSUPP:
+               return REQ_EOPNOTSUPP;
+       case BLKIF_RSP_ERROR:
+               /* Fallthrough. */
+       default:
+               return REQ_ERROR;
+       }
+}
+
+/*
+ * Get the final status of the block request based on two ring responses.
+ */
+static int blkif_get_final_status(enum blk_req_status s1,
+                                 enum blk_req_status s2)
+{
+       BUG_ON(s1 == REQ_WAITING);
+       BUG_ON(s2 == REQ_WAITING);
+
+       if (s1 == REQ_ERROR || s2 == REQ_ERROR)
+               return BLKIF_RSP_ERROR;
+       else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
+               return BLKIF_RSP_EOPNOTSUPP;
+       return BLKIF_RSP_OKAY;
+}
+
+static bool blkif_completion(unsigned long *id,
+                            struct blkfront_ring_info *rinfo,
                              struct blkif_response *bret)
  {
         int i = 0;
         struct scatterlist *sg;
         int num_sg, num_grant;
+       struct blkfront_info *info = rinfo->dev_info;
+       struct blk_shadow *s = &rinfo->shadow[*id];
         struct copy_from_grant data = {
-               .s = s,
                 .grant_idx = 0,
         };
  
         num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
                 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+       /* The I/O request may be split in two. */
+       if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
+               struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
+
+               /* Keep the status of the current response in shadow. */
+               s->status = blkif_rsp_to_req_status(bret->status);
+
+               /* Wait for the second response if it has not arrived yet. */
+               if (s2->status == REQ_WAITING)
+                       return 0;
+
+               bret->status = blkif_get_final_status(s->status,
+                                                     s2->status);
+
+               /*
+                * All the grants are stored in the first shadow in order
+                * to make the completion code simpler.
+                */
+               num_grant += s2->req.u.rw.nr_segments;
+
+               /*
+                * The two responses may not come in order. Only the
+                * first request will store the scatter-gather list.
+                */
+               if (s2->num_sg != 0) {
+                       /* Update "id" with the ID of the first response. */
+                       *id = s->associated_id;
+                       s = s2;
+               }
+
+               /*
+                * We no longer need the second request, so recycle
+                * it now.
+                */
+               if (add_id_to_freelist(rinfo, s->associated_id))
+                       WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
+                            info->gd->disk_name, s->associated_id);
+       }
+
+       data.s = s;
         num_sg = s->num_sg;
  
         if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                         if (!info->feature_persistent)
                                 pr_alert_ratelimited("backed has not unmapped grant: %u\n",
                                                      s->grants_used[i]->gref);
-                       list_add(&s->grants_used[i]->node, &info->grants);
-                       info->persistent_gnts_c++;
+                       list_add(&s->grants_used[i]->node, &rinfo->grants);
+                       rinfo->persistent_gnts_c++;
                 } else {
                         /*
                          * If the grant is not mapped by the backend we end the
@@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                          */
                         gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
                         s->grants_used[i]->gref = GRANT_INVALID_REF;
-                       list_add_tail(&s->grants_used[i]->node, &info->grants);
+                       list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
                 }
         }
         if (s->req.operation == BLKIF_OP_INDIRECT) {
@@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                                 if (!info->feature_persistent)
                                         pr_alert_ratelimited("backed has not unmapped grant: %u\n",
                                                              s->indirect_grants[i]->gref);
-                               list_add(&s->indirect_grants[i]->node, &info->grants);
-                               info->persistent_gnts_c++;
+                               list_add(&s->indirect_grants[i]->node, &rinfo->grants);
+                               rinfo->persistent_gnts_c++;
                         } else {
                                 struct page *indirect_page;
  
@@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
                                  */
                                 if (!info->feature_persistent) {
                                         indirect_page = s->indirect_grants[i]->page;
-                                       list_add(&indirect_page->lru, &info->indirect_pages);
+                                       list_add(&indirect_page->lru, &rinfo->indirect_pages);
                                 }
                                 s->indirect_grants[i]->gref = GRANT_INVALID_REF;
-                               list_add_tail(&s->indirect_grants[i]->node, &info->grants);
+                               list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
                         }
                 }
         }
+
+       return 1;
  }
  
  static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
         struct blkif_response *bret;
         RING_IDX i, rp;
         unsigned long flags;
-       struct blkfront_info *info = (struct blkfront_info *)dev_id;
+       struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
+       struct blkfront_info *info = rinfo->dev_info;
         int error;
  
-       spin_lock_irqsave(&info->io_lock, flags);
-
-       if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
-               spin_unlock_irqrestore(&info->io_lock, flags);
+       if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
                 return IRQ_HANDLED;
-       }
  
+       spin_lock_irqsave(&rinfo->ring_lock, flags);
   again:
-       rp = info->ring.sring->rsp_prod;
+       rp = rinfo->ring.sring->rsp_prod;
         rmb(); /* Ensure we see queued responses up to 'rp'. */
  
-       for (i = info->ring.rsp_cons; i != rp; i++) {
+       for (i = rinfo->ring.rsp_cons; i != rp; i++) {
                 unsigned long id;
  
-               bret = RING_GET_RESPONSE(&info->ring, i);
+               bret = RING_GET_RESPONSE(&rinfo->ring, i);
                 id   = bret->id;
                 /*
                  * The backend has messed up and given us an id that we would
@@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                          * the id is busted. */
                         continue;
                 }
-               req  = info->shadow[id].request;
+               req  = rinfo->shadow[id].request;
  
-               if (bret->operation != BLKIF_OP_DISCARD)
-                       blkif_completion(&info->shadow[id], info, bret);
+               if (bret->operation != BLKIF_OP_DISCARD) {
+                       /*
+                        * We may need to wait for an extra response if the
+                        * I/O request is split in 2
+                        */
+                       if (!blkif_completion(&id, rinfo, bret))
+                               continue;
+               }
  
-               if (add_id_to_freelist(info, id)) {
+               if (add_id_to_freelist(rinfo, id)) {
                         WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
                              info->gd->disk_name, op_name(bret->operation), id);
                         continue;
@@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                                 error = -EOPNOTSUPP;
                         }
                         if (unlikely(bret->status == BLKIF_RSP_ERROR &&
-                                    info->shadow[id].req.u.rw.nr_segments == 0)) {
+                                    rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
                                 printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
                                        info->gd->disk_name, op_name(bret->operation));
                                 error = -EOPNOTSUPP;
@@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                 }
         }
  
-       info->ring.rsp_cons = i;
+       rinfo->ring.rsp_cons = i;
  
-       if (i != info->ring.req_prod_pvt) {
+       if (i != rinfo->ring.req_prod_pvt) {
                 int more_to_do;
-               RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+               RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
                 if (more_to_do)
                         goto again;
         } else
-               info->ring.sring->rsp_event = i + 1;
+               rinfo->ring.sring->rsp_event = i + 1;
  
-       kick_pending_request_queues(info);
+       kick_pending_request_queues_locked(rinfo);
  
-       spin_unlock_irqrestore(&info->io_lock, flags);
+       spin_unlock_irqrestore(&rinfo->ring_lock, flags);
  
         return IRQ_HANDLED;
  }
  
  
  static int setup_blkring(struct xenbus_device *dev,
-                        struct blkfront_info *info)
+                        struct blkfront_ring_info *rinfo)
  {
         struct blkif_sring *sring;
         int err, i;
+       struct blkfront_info *info = rinfo->dev_info;
         unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
         grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
  
         for (i = 0; i < info->nr_ring_pages; i++)
-               info->ring_ref[i] = GRANT_INVALID_REF;
+               rinfo->ring_ref[i] = GRANT_INVALID_REF;
  
         sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
                                                        get_order(ring_size));
@@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev,
                 return -ENOMEM;
         }
         SHARED_RING_INIT(sring);
-       FRONT_RING_INIT(&info->ring, sring, ring_size);
+       FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
  
-       err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
+       err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref);
         if (err < 0) {
                 free_pages((unsigned long)sring, get_order(ring_size));
-               info->ring.sring = NULL;
+               rinfo->ring.sring = NULL;
                 goto fail;
         }
         for (i = 0; i < info->nr_ring_pages; i++)
-               info->ring_ref[i] = gref[i];
+               rinfo->ring_ref[i] = gref[i];
  
-       err = xenbus_alloc_evtchn(dev, &info->evtchn);
+       err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
         if (err)
                 goto fail;
  
-       err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
-                                       "blkif", info);
+       err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0,
+                                       "blkif", rinfo);
         if (err <= 0) {
                 xenbus_dev_fatal(dev, err,
                                  "bind_evtchn_to_irqhandler failed");
                 goto fail;
         }
-       info->irq = err;
+       rinfo->irq = err;
  
         return 0;
  fail:
@@ -1455,6 +1701,53 @@ fail:
         return err;
  }
  
+/*
+ * Write the xenstore nodes describing a single ring/queue under @dir, as part
+ * of transaction @xbt.
+ *
+ * The ring reference(s) are written first: a single "ring-ref" key holding
+ * ring_ref[0] when ->nr_ring_pages is 1, otherwise one "ring-ref<i>" key for
+ * each of the ->nr_ring_pages entries of ring_ref[]. The ring's event channel
+ * is then written as "event-channel".
+ *
+ * Returns 0 on success. If any xenbus_printf() fails, the transaction is ended
+ * here via xenbus_transaction_end(xbt, 1), the failure is reported with
+ * xenbus_dev_fatal() and the error is returned. Callers must therefore not end
+ * @xbt again on failure (talk_to_blkback() jumps straight to destroy_blkring).
+ */
+static int write_per_ring_nodes(struct xenbus_transaction xbt,
+                               struct blkfront_ring_info *rinfo, const char *dir)
+{
+       int err;
+       unsigned int i;
+       const char *message = NULL;
+       struct blkfront_info *info = rinfo->dev_info;
+
+       if (info->nr_ring_pages == 1) { /* single-page ring: plain "ring-ref" key */
+               err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
+               if (err) {
+                       message = "writing ring-ref";
+                       goto abort_transaction;
+               }
+       } else { /* multi-page ring: one "ring-ref<i>" key per page */
+               for (i = 0; i < info->nr_ring_pages; i++) {
+                       char ring_ref_name[RINGREF_NAME_LEN];
+
+                       snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+                       err = xenbus_printf(xbt, dir, ring_ref_name,
+                                           "%u", rinfo->ring_ref[i]);
+                       if (err) {
+                               message = "writing ring-ref";
+                               goto abort_transaction;
+                       }
+               }
+       }
+
+       err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
+       if (err) {
+               message = "writing event-channel";
+               goto abort_transaction;
+       }
+
+       return 0;
+
+abort_transaction:
+       xenbus_transaction_end(xbt, 1); /* the transaction is ended here, not by the caller */
+       if (message) /* always set on the paths that reach this label */
+               xenbus_dev_fatal(info->xbdev, err, "%s", message);
+
+       return err;
+}
  
  /* Common code used when first setting up, and when resuming. */
  static int talk_to_blkback(struct xenbus_device *dev,
@@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev,
  {
         const char *message = NULL;
         struct xenbus_transaction xbt;
-       int err, i;
-       unsigned int max_page_order = 0;
+       int err;
+       unsigned int i, max_page_order = 0;
         unsigned int ring_page_order = 0;
  
         err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
@@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev,
                 info->nr_ring_pages = 1 << ring_page_order;
         }
  
-       /* Create shared ring, alloc event channel. */
-       err = setup_blkring(dev, info);
-       if (err)
-               goto out;
+       for (i = 0; i < info->nr_rings; i++) {
+               struct blkfront_ring_info *rinfo = &info->rinfo[i];
+
+               /* Create shared ring, alloc event channel. */
+               err = setup_blkring(dev, rinfo);
+               if (err)
+                       goto destroy_blkring;
+       }
  
  again:
         err = xenbus_transaction_start(&xbt);
@@ -1487,38 +1784,49 @@ again:
                 goto destroy_blkring;
         }
  
-       if (info->nr_ring_pages == 1) {
-               err = xenbus_printf(xbt, dev->nodename,
-                                   "ring-ref", "%u", info->ring_ref[0]);
+       if (info->nr_ring_pages > 1) {
+               err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
+                                   ring_page_order);
                 if (err) {
-                       message = "writing ring-ref";
+                       message = "writing ring-page-order";
                         goto abort_transaction;
                 }
+       }
+
+       /* We already got the number of queues/rings in _probe */
+       if (info->nr_rings == 1) {
+               err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename);
+               if (err)
+                       goto destroy_blkring;
         } else {
-               err = xenbus_printf(xbt, dev->nodename,
-                                   "ring-page-order", "%u", ring_page_order);
+               char *path;
+               size_t pathsize;
+
+               err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
+                                   info->nr_rings);
                 if (err) {
-                       message = "writing ring-page-order";
+                       message = "writing multi-queue-num-queues";
                         goto abort_transaction;
                 }
  
-               for (i = 0; i < info->nr_ring_pages; i++) {
-                       char ring_ref_name[RINGREF_NAME_LEN];
+               pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
+               path = kmalloc(pathsize, GFP_KERNEL);
+               if (!path) {
+                       err = -ENOMEM;
+                       message = "ENOMEM while writing ring references";
+                       goto abort_transaction;
+               }
  
-                       snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
-                       err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
-                                           "%u", info->ring_ref[i]);
+               for (i = 0; i < info->nr_rings; i++) {
+                       memset(path, 0, pathsize);
+                       snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
+                       err = write_per_ring_nodes(xbt, &info->rinfo[i], path);
                         if (err) {
-                               message = "writing ring-ref";
-                               goto abort_transaction;
+                               kfree(path);
+                               goto destroy_blkring;
                         }
                 }
-       }
-       err = xenbus_printf(xbt, dev->nodename,
-                           "event-channel", "%u", info->evtchn);
-       if (err) {
-               message = "writing event-channel";
-               goto abort_transaction;
+               kfree(path);
         }
         err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
                             XEN_IO_PROTO_ABI_NATIVE);
@@ -1540,9 +1848,14 @@ again:
                 goto destroy_blkring;
         }
  
-       for (i = 0; i < BLK_RING_SIZE(info); i++)
-               info->shadow[i].req.u.rw.id = i+1;
-       info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+       for (i = 0; i < info->nr_rings; i++) {
+               unsigned int j;
+               struct blkfront_ring_info *rinfo = &info->rinfo[i];
+
+               for (j = 0; j < BLK_RING_SIZE(info); j++)
+                       rinfo->shadow[j].req.u.rw.id = j + 1;
+               rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+       }
         xenbus_switch_state(dev, XenbusStateInitialised);
  
         return 0;
@@ -1553,7 +1866,10 @@ again:
                 xenbus_dev_fatal(dev, err, "%s", message);
   destroy_blkring:
         blkif_free(info, 0);
- out:
+
+       kfree(info);
+       dev_set_drvdata(&dev->dev, NULL);
+
         return err;
  }
  
@@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev,
                           const struct xenbus_device_id *id)
  {
         int err, vdevice;
+       unsigned int r_index;
         struct blkfront_info *info;
+       unsigned int backend_max_queues = 0;
  
         /* FIXME: Use dynamic device id if this is not set. */
         err = xenbus_scanf(XBT_NIL, dev->nodename,
@@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev,
                 return -ENOMEM;
         }
  
-       mutex_init(&info->mutex);
-       spin_lock_init(&info->io_lock);
         info->xbdev = dev;
+       /* Check if backend supports multiple queues. */
+       err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+                          "multi-queue-max-queues", "%u", &backend_max_queues);
+       if (err < 0)
+               backend_max_queues = 1;
+
+       info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
+       /* We need at least one ring. */
+       if (!info->nr_rings)
+               info->nr_rings = 1;
+
+       info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL);
+       if (!info->rinfo) {
+               xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure");
+               kfree(info);
+               return -ENOMEM;
+       }
+
+       for (r_index = 0; r_index < info->nr_rings; r_index++) {
+               struct blkfront_ring_info *rinfo;
+
+               rinfo = &info->rinfo[r_index];
+               INIT_LIST_HEAD(&rinfo->indirect_pages);
+               INIT_LIST_HEAD(&rinfo->grants);
+               rinfo->dev_info = info;
+               INIT_WORK(&rinfo->work, blkif_restart_queue);
+               spin_lock_init(&rinfo->ring_lock);
+       }
+
+       mutex_init(&info->mutex);
         info->vdevice = vdevice;
-       INIT_LIST_HEAD(&info->grants);
-       INIT_LIST_HEAD(&info->indirect_pages);
-       info->persistent_gnts_c = 0;
         info->connected = BLKIF_STATE_DISCONNECTED;
-       INIT_WORK(&info->work, blkif_restart_queue);
  
         /* Front end dir is a number, which is used as the id. */
         info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
@@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio)
  
  static int blkif_recover(struct blkfront_info *info)
  {
-       int i;
+       unsigned int i, r_index;
         struct request *req, *n;
         struct blk_shadow *copy;
         int rc;
@@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info)
         struct split_bio *split_bio;
         struct list_head requests;
  
-       /* Stage 1: Make a safe copy of the shadow state. */
-       copy = kmemdup(info->shadow, sizeof(info->shadow),
-                      GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
-       if (!copy)
-               return -ENOMEM;
-
-       /* Stage 2: Set up free list. */
-       memset(&info->shadow, 0, sizeof(info->shadow));
-       for (i = 0; i < BLK_RING_SIZE(info); i++)
-               info->shadow[i].req.u.rw.id = i+1;
-       info->shadow_free = info->ring.req_prod_pvt;
-       info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
-
-       rc = blkfront_gather_backend_features(info);
-       if (rc) {
-               kfree(copy);
-               return rc;
-       }
-
+       blkfront_gather_backend_features(info);
         segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
         blk_queue_max_segments(info->rq, segs);
         bio_list_init(&bio_list);
         INIT_LIST_HEAD(&requests);
-       for (i = 0; i < BLK_RING_SIZE(info); i++) {
-               /* Not in use? */
-               if (!copy[i].request)
-                       continue;
  
-               /*
-                * Get the bios in the request so we can re-queue them.
-                */
-               if (copy[i].request->cmd_flags &
-                   (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+       for (r_index = 0; r_index < info->nr_rings; r_index++) {
+               struct blkfront_ring_info *rinfo;
+
+               rinfo = &info->rinfo[r_index];
+               /* Stage 1: Make a safe copy of the shadow state. */
+               copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow),
+                              GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
+               if (!copy)
+                       return -ENOMEM;
+
+               /* Stage 2: Set up free list. */
+               memset(&rinfo->shadow, 0, sizeof(rinfo->shadow));
+               for (i = 0; i < BLK_RING_SIZE(info); i++)
+                       rinfo->shadow[i].req.u.rw.id = i+1;
+               rinfo->shadow_free = rinfo->ring.req_prod_pvt;
+               rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+
+               rc = blkfront_setup_indirect(rinfo);
+               if (rc) {
+                       kfree(copy);
+                       return rc;
+               }
+
+               for (i = 0; i < BLK_RING_SIZE(info); i++) {
+                       /* Not in use? */
+                       if (!copy[i].request)
+                               continue;
+
                         /*
-                        * Flush operations don't contain bios, so
-                        * we need to requeue the whole request
+                        * Get the bios in the request so we can re-queue them.
                          */
-                       list_add(&copy[i].request->queuelist, &requests);
-                       continue;
+                       if (copy[i].request->cmd_flags &
+                           (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+                               /*
+                                * Flush operations don't contain bios, so
+                                * we need to requeue the whole request
+                                */
+                               list_add(&copy[i].request->queuelist, &requests);
+                               continue;
+                       }
+                       merge_bio.head = copy[i].request->bio;
+                       merge_bio.tail = copy[i].request->biotail;
+                       bio_list_merge(&bio_list, &merge_bio);
+                       copy[i].request->bio = NULL;
+                       blk_end_request_all(copy[i].request, 0);
                 }
-               merge_bio.head = copy[i].request->bio;
-               merge_bio.tail = copy[i].request->biotail;
-               bio_list_merge(&bio_list, &merge_bio);
-               copy[i].request->bio = NULL;
-               blk_end_request_all(copy[i].request, 0);
-       }
-
-       kfree(copy);
  
+               kfree(copy);
+       }
         xenbus_switch_state(info->xbdev, XenbusStateConnected);
  
-       spin_lock_irq(&info->io_lock);
-
         /* Now safe for us to use the shared ring */
         info->connected = BLKIF_STATE_CONNECTED;
  
-       /* Kick any other new requests queued since we resumed */
-       kick_pending_request_queues(info);
+       for (r_index = 0; r_index < info->nr_rings; r_index++) {
+               struct blkfront_ring_info *rinfo;
+
+               rinfo = &info->rinfo[r_index];
+               /* Kick any other new requests queued since we resumed */
+               kick_pending_request_queues(rinfo);
+       }
  
         list_for_each_entry_safe(req, n, &requests, queuelist) {
                 /* Requeue pending requests (flush or discard) */
@@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info)
                 BUG_ON(req->nr_phys_segments > segs);
                 blk_mq_requeue_request(req);
         }
-       spin_unlock_irq(&info->io_lock);
         blk_mq_kick_requeue_list(info->rq);
  
         while ((bio = bio_list_pop(&bio_list)) != NULL) {
@@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev)
         return err;
  }
  
-static void
-blkfront_closing(struct blkfront_info *info)
+static void blkfront_closing(struct blkfront_info *info)
  {
         struct xenbus_device *xbdev = info->xbdev;
         struct block_device *bdev = NULL;
@@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info)
                 info->feature_secdiscard = !!discard_secure;
  }
  
-static int blkfront_setup_indirect(struct blkfront_info *info)
+static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
  {
         unsigned int psegs, grants;
         int err, i;
+       struct blkfront_info *info = rinfo->dev_info;
  
-       if (info->max_indirect_segments == 0)
-               grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       if (info->max_indirect_segments == 0) {
+               if (!HAS_EXTRA_REQ)
+                       grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+               else {
+                       /*
+                        * When an extra req is required, the maximum
+                        * grants supported is related to the size of the
+                        * Linux block segment.
+                        */
+                       grants = GRANTS_PER_PSEG;
+               }
+       }
         else
                 grants = info->max_indirect_segments;
         psegs = grants / GRANTS_PER_PSEG;
  
-       err = fill_grant_buffer(info,
+       err = fill_grant_buffer(rinfo,
                                 (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
         if (err)
                 goto out_of_memory;
@@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
                  */
                 int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
  
-               BUG_ON(!list_empty(&info->indirect_pages));
+               BUG_ON(!list_empty(&rinfo->indirect_pages));
                 for (i = 0; i < num; i++) {
                         struct page *indirect_page = alloc_page(GFP_NOIO);
                         if (!indirect_page)
                                 goto out_of_memory;
-                       list_add(&indirect_page->lru, &info->indirect_pages);
+                       list_add(&indirect_page->lru, &rinfo->indirect_pages);
                 }
         }
  
         for (i = 0; i < BLK_RING_SIZE(info); i++) {
-               info->shadow[i].grants_used = kzalloc(
-                       sizeof(info->shadow[i].grants_used[0]) * grants,
+               rinfo->shadow[i].grants_used = kzalloc(
+                       sizeof(rinfo->shadow[i].grants_used[0]) * grants,
                         GFP_NOIO);
-               info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO);
+               rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO);
                 if (info->max_indirect_segments)
-                       info->shadow[i].indirect_grants = kzalloc(
-                               sizeof(info->shadow[i].indirect_grants[0]) *
+                       rinfo->shadow[i].indirect_grants = kzalloc(
+                               sizeof(rinfo->shadow[i].indirect_grants[0]) *
                                 INDIRECT_GREFS(grants),
                                 GFP_NOIO);
-               if ((info->shadow[i].grants_used == NULL) ||
-                       (info->shadow[i].sg == NULL) ||
+               if ((rinfo->shadow[i].grants_used == NULL) ||
+                       (rinfo->shadow[i].sg == NULL) ||
                      (info->max_indirect_segments &&
-                    (info->shadow[i].indirect_grants == NULL)))
+                    (rinfo->shadow[i].indirect_grants == NULL)))
                         goto out_of_memory;
-               sg_init_table(info->shadow[i].sg, psegs);
+               sg_init_table(rinfo->shadow[i].sg, psegs);
         }
  
  
@@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
  
  out_of_memory:
         for (i = 0; i < BLK_RING_SIZE(info); i++) {
-               kfree(info->shadow[i].grants_used);
-               info->shadow[i].grants_used = NULL;
-               kfree(info->shadow[i].sg);
-               info->shadow[i].sg = NULL;
-               kfree(info->shadow[i].indirect_grants);
-               info->shadow[i].indirect_grants = NULL;
-       }
-       if (!list_empty(&info->indirect_pages)) {
+               kfree(rinfo->shadow[i].grants_used);
+               rinfo->shadow[i].grants_used = NULL;
+               kfree(rinfo->shadow[i].sg);
+               rinfo->shadow[i].sg = NULL;
+               kfree(rinfo->shadow[i].indirect_grants);
+               rinfo->shadow[i].indirect_grants = NULL;
+       }
+       if (!list_empty(&rinfo->indirect_pages)) {
                 struct page *indirect_page, *n;
-               list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+               list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
                         list_del(&indirect_page->lru);
                         __free_page(indirect_page);
                 }
@@ -1927,7 +2287,7 @@ out_of_memory:
  /*
   * Gather all backend feature-*
   */
-static int blkfront_gather_backend_features(struct blkfront_info *info)
+static void blkfront_gather_backend_features(struct blkfront_info *info)
  {
         int err;
         int barrier, flush, discard, persistent;
@@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info)
         else
                 info->max_indirect_segments = min(indirect_segments,
                                                   xen_blkif_max_segments);
-
-       return blkfront_setup_indirect(info);
  }
  
  /*
@@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info)
         unsigned long sector_size;
         unsigned int physical_sector_size;
         unsigned int binfo;
-       int err;
+       int err, i;
  
         switch (info->connected) {
         case BLKIF_STATE_CONNECTED:
@@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info)
         if (err != 1)
                 physical_sector_size = sector_size;
  
-       err = blkfront_gather_backend_features(info);
-       if (err) {
-               xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
-                                info->xbdev->otherend);
-               return;
+       blkfront_gather_backend_features(info);
+       for (i = 0; i < info->nr_rings; i++) {
+               err = blkfront_setup_indirect(&info->rinfo[i]);
+               if (err) {
+                       xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+                                        info->xbdev->otherend);
+                       blkif_free(info, 0);
+                       break;
+               }
         }
  
         err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
@@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info)
         xenbus_switch_state(info->xbdev, XenbusStateConnected);
  
         /* Kick pending requests. */
-       spin_lock_irq(&info->io_lock);
         info->connected = BLKIF_STATE_CONNECTED;
-       kick_pending_request_queues(info);
-       spin_unlock_irq(&info->io_lock);
+       for (i = 0; i < info->nr_rings; i++)
+               kick_pending_request_queues(&info->rinfo[i]);
  
         add_disk(info->gd);
  
@@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev,
         case XenbusStateInitWait:
                 if (dev->state != XenbusStateInitialising)
                         break;
-               if (talk_to_blkback(dev, info)) {
-                       kfree(info);
-                       dev_set_drvdata(&dev->dev, NULL);
+               if (talk_to_blkback(dev, info))
                         break;
-               }
         case XenbusStateInitialising:
         case XenbusStateInitialised:
         case XenbusStateReconfiguring:
@@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev,
                 break;
  
         case XenbusStateConnected:
+               if (dev->state != XenbusStateInitialised) {
+                       if (talk_to_blkback(dev, info))
+                               break;
+               }
                 blkfront_connect(info);
                 break;
  
@@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = {
  static int __init xlblk_init(void)
  {
         int ret;
+       int nr_cpus = num_online_cpus();
  
         if (!xen_domain())
                 return -ENODEV;
@@ -2288,7 +2651,13 @@ static int __init xlblk_init(void)
         if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
                 pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
                         xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
-               xen_blkif_max_ring_order = 0;
+               xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
+       }
+
+       if (xen_blkif_max_queues > nr_cpus) {
+               pr_info("Invalid max_queues (%d), will use default max: %d.\n",
+                       xen_blkif_max_queues, nr_cpus);
+               xen_blkif_max_queues = nr_cpus;
         }
  
         if (!xen_has_pv_disk_devices())
diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h

index c33e1c4..8b8cfad 100644 (file)
--- a/include/xen/interface/io/blkif.h
+++ b/include/xen/interface/io/blkif.h
@@ -27,6 +27,54 @@
  typedef uint16_t blkif_vdev_t;
  typedef uint64_t blkif_sector_t;
  
+/*
+ * Multiple hardware queues/rings:
+ * If supported, the backend will write the key "multi-queue-max-queues" to
+ * the directory for that vbd, and set its value to the maximum supported
+ * number of queues.
+ * Frontends that are aware of this feature and wish to use it can write the
+ * key "multi-queue-num-queues" with the number they wish to use, which must be
+ * greater than zero, and no more than the value reported by the backend in
+ * "multi-queue-max-queues".
+ *
+ * For frontends requesting just one queue, the usual event-channel and
+ * ring-ref keys are written as before, simplifying the backend processing
+ * to avoid distinguishing between a frontend that doesn't understand the
+ * multi-queue feature, and one that does, but requested only one queue.
+ *
+ * Frontends requesting two or more queues must not write the toplevel
+ * event-channel and ring-ref keys, instead writing those keys under sub-keys
+ * having the name "queue-N" where N is the integer ID of the queue/ring to
+ * which those keys belong. Queues are indexed from zero.
+ * For example, a frontend with two queues must write the following set of
+ * queue-related keys:
+ *
+ * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
+ * /local/domain/1/device/vbd/0/queue-0 = ""
+ * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>"
+ * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
+ * /local/domain/1/device/vbd/0/queue-1 = ""
+ * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>"
+ * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
+ *
+ * It is also possible to use multiple queues/rings together with
+ * the multi-page ring buffer feature.
+ * For example, a frontend that requests two queues/rings, each with a ring
+ * buffer of two pages, must write the following set of related keys:
+ *
+ * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2"
+ * /local/domain/1/device/vbd/0/ring-page-order = "1"
+ * /local/domain/1/device/vbd/0/queue-0 = ""
+ * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>"
+ * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>"
+ * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>"
+ * /local/domain/1/device/vbd/0/queue-1 = ""
+ * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>"
+ * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>"
+ * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>"
+ *
+ */
+
  /*
   * REQUEST CODES.
   */
author	Jens Axboe <axboe@fb.com>
	Wed, 13 Jan 2016 15:20:36 +0000 (08:20 -0700)
committer	Jens Axboe <axboe@fb.com>
	Wed, 13 Jan 2016 15:20:36 +0000 (08:20 -0700)
drivers/block/xen-blkback/blkback.c		patch \| blob \| history
drivers/block/xen-blkback/common.h		patch \| blob \| history
drivers/block/xen-blkback/xenbus.c		patch \| blob \| history
drivers/block/xen-blkfront.c		patch \| blob \| history
include/xen/interface/io/blkif.h		patch \| blob \| history