Merge branch 'for-3.19/core' of git://git.kernel.dk/linux-block
author     Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 13 Dec 2014 22:14:23 +0000 (14:14 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Sat, 13 Dec 2014 22:14:23 +0000 (14:14 -0800)
Pull block driver core update from Jens Axboe:
 "This is the pull request for the core block IO changes for 3.19.  Not
  a huge round this time, mostly lots of little good fixes:

   - Fix a bug in the sysfs blktrace interface that caused a NULL
     pointer dereference when enabled/disabled through that API.  From
     Arianna Avanzini.

   - Various updates/fixes/improvements for blk-mq:

        - A set of updates from Bart, mostly fixing bugs in the tag
          handling.

        - Cleanup/code consolidation from Christoph.

        - Extend the queue_rq API to handle batched issuing of IO
          requests; NVMe will utilize this shortly. From me. A
          driver-side sketch of the new interface appears right after
          this summary.

        - A few tag and request handling updates from me.

        - Cleanup of the preempt handling for running queues from Paolo.

        - Prevent running of unmapped hardware queues from Ming Lei.

        - Move the kdump memory limiting check to be in the correct
          location, from Shaohua.

        - Initialize all software queues at init time from Takashi. This
          prevents a kobject warning when CPUs are brought online that
          weren't online when a queue was registered.

   - Single writeback fix for I_DIRTY clearing from Tejun.  Queued with
     the core IO changes, since it's just a single fix.

   - Version X of the __bio_add_page() segment addition retry from
     Maurizio.  Hope the Xth time is the charm.

   - Documentation fixup for IO scheduler merging from Jan.

   - Introduce (and use) generic IO stat accounting helpers for non-rq
     drivers, from Gu Zheng.  A usage sketch appears after the commit
     list below.

   - Kill off artificial limiting of max sectors in a request from
     Christoph"

* 'for-3.19/core' of git://git.kernel.dk/linux-block: (26 commits)
  bio: modify __bio_add_page() to accept pages that don't start a new segment
  blk-mq: Fix uninitialized kobject at CPU hotplugging
  blktrace: don't let the sysfs interface remove trace from running list
  blk-mq: Use all available hardware queues
  blk-mq: Micro-optimize bt_get()
  blk-mq: Fix a race between bt_clear_tag() and bt_get()
  blk-mq: Avoid that __bt_get_word() wraps multiple times
  blk-mq: Fix a use-after-free
  blk-mq: prevent unmapped hw queue from being scheduled
  blk-mq: re-check for available tags after running the hardware queue
  blk-mq: fix hang in bt_get()
  blk-mq: move the kdump check to blk_mq_alloc_tag_set
  blk-mq: cleanup tag free handling
  blk-mq: use 'nr_cpu_ids' as highest CPU ID count for hwq <-> cpu map
  blk: introduce generic io stat accounting help function
  blk-mq: handle the single queue case in blk_mq_hctx_next_cpu
  genhd: check for int overflow in disk_expand_part_tbl()
  blk-mq: add blk_mq_free_hctx_request()
  blk-mq: export blk_mq_free_request()
  blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable
  ...
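
For the generic IO stat accounting helpers from Gu Zheng, a minimal usage
sketch for a bio-based (non-request-queue) driver, assuming the helper
signatures added in block/bio.c and include/linux/bio.h below; the my_*
names are hypothetical:

    #include <linux/bio.h>
    #include <linux/genhd.h>
    #include <linux/jiffies.h>

    static void my_make_request(struct my_dev *dev, struct bio *bio)
    {
            int rw = bio_data_dir(bio);
            unsigned long start = jiffies;

            /* account the bio against the whole-disk partition */
            generic_start_io_acct(rw, bio_sectors(bio), &dev->disk->part0);

            my_do_transfer(dev, bio);       /* hypothetical: perform the I/O */

            /* drop the in-flight count and add the elapsed ticks */
            generic_end_io_acct(rw, &dev->disk->part0, start);
            bio_endio(bio, 0);
    }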

21 files changed:
Documentation/block/biodoc.txt
block/bio.c
block/blk-core.c
block/blk-mq-cpumap.c
block/blk-mq-sysfs.c
block/blk-mq-tag.c
block/blk-mq.c
block/blk-mq.h
block/blk-settings.c
block/blk-sysfs.c
block/genhd.c
drivers/block/aoe/aoeblk.c
drivers/block/mtip32xx/mtip32xx.c
drivers/block/null_blk.c
drivers/block/virtio_blk.c
drivers/scsi/scsi_lib.c
fs/fs-writeback.c
include/linux/bio.h
include/linux/blk-mq.h
include/linux/blkdev.h
kernel/trace/blktrace.c

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 6b972b2..5aabc08 100644
@@ -942,7 +942,11 @@ elevator_allow_merge_fn            called whenever the block layer determines
                                request safely. The io scheduler may still
                                want to stop a merge at this point if it
                                results in some sort of conflict internally,
-                               this hook allows it to do that.
+                               this hook allows it to do that. Note however
+                               that two *requests* can still be merged at a
+                               later time. Currently the io scheduler has no
+                               way to prevent that; it only learns about the
+                               merge from the elevator_merge_req_fn callback.
 
 elevator_dispatch_fn*          fills the dispatch queue with ready requests.
                                I/O schedulers are free to postpone requests by
diff --git a/block/bio.c b/block/bio.c
index 3e6e198..471d738 100644
@@ -748,6 +748,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                                }
                        }
 
+                       bio->bi_iter.bi_size += len;
                        goto done;
                }
 
@@ -764,28 +765,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                return 0;
 
        /*
-        * we might lose a segment or two here, but rather that than
-        * make this too complex.
+        * setup the new entry, we might clear it again later if we
+        * cannot add the page
+        */
+       bvec = &bio->bi_io_vec[bio->bi_vcnt];
+       bvec->bv_page = page;
+       bvec->bv_len = len;
+       bvec->bv_offset = offset;
+       bio->bi_vcnt++;
+       bio->bi_phys_segments++;
+       bio->bi_iter.bi_size += len;
+
+       /*
+        * Perform a recount if the number of segments is greater
+        * than queue_max_segments(q).
         */
 
-       while (bio->bi_phys_segments >= queue_max_segments(q)) {
+       while (bio->bi_phys_segments > queue_max_segments(q)) {
 
                if (retried_segments)
-                       return 0;
+                       goto failed;
 
                retried_segments = 1;
                blk_recount_segments(q, bio);
        }
 
-       /*
-        * setup the new entry, we might clear it again later if we
-        * cannot add the page
-        */
-       bvec = &bio->bi_io_vec[bio->bi_vcnt];
-       bvec->bv_page = page;
-       bvec->bv_len = len;
-       bvec->bv_offset = offset;
-
        /*
         * if queue has other restrictions (eg varying max sector size
         * depending on offset), it can specify a merge_bvec_fn in the
@@ -795,7 +799,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                struct bvec_merge_data bvm = {
                        .bi_bdev = bio->bi_bdev,
                        .bi_sector = bio->bi_iter.bi_sector,
-                       .bi_size = bio->bi_iter.bi_size,
+                       .bi_size = bio->bi_iter.bi_size - len,
                        .bi_rw = bio->bi_rw,
                };
 
@@ -803,23 +807,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                 * merge_bvec_fn() returns number of bytes it can accept
                 * at this offset
                 */
-               if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
-                       bvec->bv_page = NULL;
-                       bvec->bv_len = 0;
-                       bvec->bv_offset = 0;
-                       return 0;
-               }
+               if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
+                       goto failed;
        }
 
        /* If we may be able to merge these biovecs, force a recount */
-       if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
+       if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
                bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
-       bio->bi_vcnt++;
-       bio->bi_phys_segments++;
  done:
-       bio->bi_iter.bi_size += len;
        return len;
+
+ failed:
+       bvec->bv_page = NULL;
+       bvec->bv_len = 0;
+       bvec->bv_offset = 0;
+       bio->bi_vcnt--;
+       bio->bi_iter.bi_size -= len;
+       blk_recount_segments(q, bio);
+       return 0;
 }
 
 /**
@@ -1739,6 +1745,34 @@ void bio_check_pages_dirty(struct bio *bio)
        }
 }
 
+void generic_start_io_acct(int rw, unsigned long sectors,
+                          struct hd_struct *part)
+{
+       int cpu = part_stat_lock();
+
+       part_round_stats(cpu, part);
+       part_stat_inc(cpu, part, ios[rw]);
+       part_stat_add(cpu, part, sectors[rw], sectors);
+       part_inc_in_flight(part, rw);
+
+       part_stat_unlock();
+}
+EXPORT_SYMBOL(generic_start_io_acct);
+
+void generic_end_io_acct(int rw, struct hd_struct *part,
+                        unsigned long start_time)
+{
+       unsigned long duration = jiffies - start_time;
+       int cpu = part_stat_lock();
+
+       part_stat_add(cpu, part, ticks[rw], duration);
+       part_round_stats(cpu, part);
+       part_dec_in_flight(part, rw);
+
+       part_stat_unlock();
+}
+EXPORT_SYMBOL(generic_end_io_acct);
+
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 void bio_flush_dcache_pages(struct bio *bi)
 {
diff --git a/block/blk-core.c b/block/blk-core.c
index ea1c4d0..30f6153 100644
@@ -525,6 +525,9 @@ void blk_cleanup_queue(struct request_queue *q)
        del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
        blk_sync_queue(q);
 
+       if (q->mq_ops)
+               blk_mq_free_queue(q);
+
        spin_lock_irq(lock);
        if (q->queue_lock != &q->__queue_lock)
                q->queue_lock = &q->__queue_lock;
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 1065d7c..5f13f4d 100644
@@ -17,7 +17,7 @@
 static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
                              const int cpu)
 {
-       return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
+       return cpu * nr_queues / nr_cpus;
 }
 
 static int get_first_sibling(unsigned int cpu)
@@ -90,7 +90,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
        unsigned int *map;
 
        /* If cpus are offline, map them to first hctx */
-       map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
+       map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
                                set->numa_node);
        if (!map)
                return NULL;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 371d880..1630a20 100644
@@ -390,16 +390,15 @@ static void blk_mq_sysfs_init(struct request_queue *q)
 {
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
-       int i, j;
+       int i;
 
        kobject_init(&q->mq_kobj, &blk_mq_ktype);
 
-       queue_for_each_hw_ctx(q, hctx, i) {
+       queue_for_each_hw_ctx(q, hctx, i)
                kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
 
-               hctx_for_each_ctx(hctx, ctx, j)
-                       kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
-       }
+       queue_for_each_ctx(q, ctx, i)
+               kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
 }
 
 /* see blk_register_queue() */
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 728b9a4..e3d4e40 100644
@@ -137,6 +137,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
 {
        int tag, org_last_tag, end;
+       bool wrap = last_tag != 0;
 
        org_last_tag = last_tag;
        end = bm->depth;
@@ -148,15 +149,16 @@ restart:
                         * We started with an offset, start from 0 to
                         * exhaust the map.
                         */
-                       if (org_last_tag && last_tag) {
-                               end = last_tag;
+                       if (wrap) {
+                               wrap = false;
+                               end = org_last_tag;
                                last_tag = 0;
                                goto restart;
                        }
                        return -1;
                }
                last_tag = tag + 1;
-       } while (test_and_set_bit_lock(tag, &bm->word));
+       } while (test_and_set_bit(tag, &bm->word));
 
        return tag;
 }
@@ -246,14 +248,29 @@ static int bt_get(struct blk_mq_alloc_data *data,
        if (!(data->gfp & __GFP_WAIT))
                return -1;
 
-       bs = bt_wait_ptr(bt, hctx);
        do {
+               bs = bt_wait_ptr(bt, hctx);
                prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
 
                tag = __bt_get(hctx, bt, last_tag);
                if (tag != -1)
                        break;
 
+               /*
+                * We're out of tags on this hardware queue, kick any
+                * pending IO submits before going to sleep waiting for
+                * some to complete.
+                */
+               blk_mq_run_hw_queue(hctx, false);
+
+               /*
+                * Retry tag allocation after running the hardware queue,
+                * as running the queue may also have found completions.
+                */
+               tag = __bt_get(hctx, bt, last_tag);
+               if (tag != -1)
+                       break;
+
                blk_mq_put_ctx(data->ctx);
 
                io_schedule();
@@ -268,8 +285,6 @@ static int bt_get(struct blk_mq_alloc_data *data,
                        hctx = data->hctx;
                        bt = &hctx->tags->bitmap_tags;
                }
-               finish_wait(&bs->wait, &wait);
-               bs = bt_wait_ptr(bt, hctx);
        } while (1);
 
        finish_wait(&bs->wait, &wait);
@@ -340,11 +355,10 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
        struct bt_wait_state *bs;
        int wait_cnt;
 
-       /*
-        * The unlock memory barrier need to order access to req in free
-        * path and clearing tag bit
-        */
-       clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+       clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+
+       /* Ensure that the wait list checks occur after clear_bit(). */
+       smp_mb();
 
        bs = bt_wake_ptr(bt);
        if (!bs)
@@ -360,21 +374,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
        }
 }
 
-static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
-{
-       BUG_ON(tag >= tags->nr_tags);
-
-       bt_clear_tag(&tags->bitmap_tags, tag);
-}
-
-static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
-                                     unsigned int tag)
-{
-       BUG_ON(tag >= tags->nr_reserved_tags);
-
-       bt_clear_tag(&tags->breserved_tags, tag);
-}
-
 void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
                    unsigned int *last_tag)
 {
@@ -383,10 +382,13 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
        if (tag >= tags->nr_reserved_tags) {
                const int real_tag = tag - tags->nr_reserved_tags;
 
-               __blk_mq_put_tag(tags, real_tag);
+               BUG_ON(real_tag >= tags->nr_tags);
+               bt_clear_tag(&tags->bitmap_tags, real_tag);
                *last_tag = real_tag;
-       } else
-               __blk_mq_put_reserved_tag(tags, tag);
+       } else {
+               BUG_ON(tag >= tags->nr_reserved_tags);
+               bt_clear_tag(&tags->breserved_tags, tag);
+       }
 }
 
 static void bt_for_each(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 92ceef0..da1ab56 100644
@@ -279,17 +279,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
        blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct blk_mq_hw_ctx *hctx;
-       struct request_queue *q = rq->q;
 
        ctx->rq_completed[rq_is_sync(rq)]++;
-
-       hctx = q->mq_ops->map_queue(q, ctx->cpu);
        __blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
+       struct blk_mq_hw_ctx *hctx;
+       struct request_queue *q = rq->q;
+
+       hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+       blk_mq_free_hctx_request(hctx, rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, int error)
 {
@@ -591,7 +599,7 @@ static void blk_mq_rq_timer(unsigned long priv)
                 * If not software queues are currently mapped to this
                 * hardware queue, there's nothing to check
                 */
-               if (!hctx->nr_ctx || !hctx->tags)
+               if (!blk_mq_hw_queue_mapped(hctx))
                        continue;
 
                blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
@@ -690,6 +698,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
        struct request_queue *q = hctx->queue;
        struct request *rq;
        LIST_HEAD(rq_list);
+       LIST_HEAD(driver_list);
+       struct list_head *dptr;
        int queued;
 
        WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
@@ -715,17 +725,28 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
                spin_unlock(&hctx->lock);
        }
 
+       /*
+        * Start off with dptr being NULL, so we start the first request
+        * immediately, even if we have more pending.
+        */
+       dptr = NULL;
+
        /*
         * Now process all the entries, sending them to the driver.
         */
        queued = 0;
        while (!list_empty(&rq_list)) {
+               struct blk_mq_queue_data bd;
                int ret;
 
                rq = list_first_entry(&rq_list, struct request, queuelist);
                list_del_init(&rq->queuelist);
 
-               ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
+               bd.rq = rq;
+               bd.list = dptr;
+               bd.last = list_empty(&rq_list);
+
+               ret = q->mq_ops->queue_rq(hctx, &bd);
                switch (ret) {
                case BLK_MQ_RQ_QUEUE_OK:
                        queued++;
@@ -744,6 +765,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
                if (ret == BLK_MQ_RQ_QUEUE_BUSY)
                        break;
+
+               /*
+                * We've done the first request. If we have more than 1
+                * left in the list, set dptr to defer issue.
+                */
+               if (!dptr && rq_list.next != rq_list.prev)
+                       dptr = &driver_list;
        }
 
        if (!queued)
@@ -770,10 +798,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
-       int cpu = hctx->next_cpu;
+       if (hctx->queue->nr_hw_queues == 1)
+               return WORK_CPU_UNBOUND;
 
        if (--hctx->next_cpu_batch <= 0) {
-               int next_cpu;
+               int cpu = hctx->next_cpu, next_cpu;
 
                next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
                if (next_cpu >= nr_cpu_ids)
@@ -781,26 +810,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+
+               return cpu;
        }
 
-       return cpu;
+       return hctx->next_cpu;
 }
 
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
-       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
+       if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
+           !blk_mq_hw_queue_mapped(hctx)))
                return;
 
-       if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
-               __blk_mq_run_hw_queue(hctx);
-       else if (hctx->queue->nr_hw_queues == 1)
-               kblockd_schedule_delayed_work(&hctx->run_work, 0);
-       else {
-               unsigned int cpu;
+       if (!async) {
+               int cpu = get_cpu();
+               if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+                       __blk_mq_run_hw_queue(hctx);
+                       put_cpu();
+                       return;
+               }
 
-               cpu = blk_mq_hctx_next_cpu(hctx);
-               kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
+               put_cpu();
        }
+
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                       &hctx->run_work, 0);
 }
 
 void blk_mq_run_queues(struct request_queue *q, bool async)
@@ -814,9 +849,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
                    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
                        continue;
 
-               preempt_disable();
                blk_mq_run_hw_queue(hctx, async);
-               preempt_enable();
        }
 }
 EXPORT_SYMBOL(blk_mq_run_queues);
@@ -843,9 +876,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-       preempt_disable();
        blk_mq_run_hw_queue(hctx, false);
-       preempt_enable();
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
 
@@ -870,9 +901,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
                        continue;
 
                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-               preempt_disable();
                blk_mq_run_hw_queue(hctx, async);
-               preempt_enable();
        }
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -898,16 +927,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
 
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
-       unsigned long tmo = msecs_to_jiffies(msecs);
-
-       if (hctx->queue->nr_hw_queues == 1)
-               kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
-       else {
-               unsigned int cpu;
+       if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
+               return;
 
-               cpu = blk_mq_hctx_next_cpu(hctx);
-               kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
-       }
+       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+                       &hctx->delay_work, msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1162,7 +1186,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                goto run_queue;
        }
 
-       if (is_sync) {
+       /*
+        * If the driver supports defer issued based on 'last', then
+        * queue it up like normal since we can potentially save some
+        * CPU this way.
+        */
+       if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+               struct blk_mq_queue_data bd = {
+                       .rq = rq,
+                       .list = NULL,
+                       .last = 1
+               };
                int ret;
 
                blk_mq_bio_to_request(rq, bio);
@@ -1172,7 +1206,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
                 * error (busy), just add it to our list as we previously
                 * would have done
                 */
-               ret = q->mq_ops->queue_rq(data.hctx, rq, true);
+               ret = q->mq_ops->queue_rq(data.hctx, &bd);
                if (ret == BLK_MQ_RQ_QUEUE_OK)
                        goto done;
                else {
@@ -1784,16 +1818,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
        if (!ctx)
                return ERR_PTR(-ENOMEM);
 
-       /*
-        * If a crashdump is active, then we are potentially in a very
-        * memory constrained environment. Limit us to 1 queue and
-        * 64 tags to prevent using too much memory.
-        */
-       if (is_kdump_kernel()) {
-               set->nr_hw_queues = 1;
-               set->queue_depth = min(64U, set->queue_depth);
-       }
-
        hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
                        set->numa_node);
 
@@ -2067,6 +2091,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }
 
+       /*
+        * If a crashdump is active, then we are potentially in a very
+        * memory constrained environment. Limit us to 1 queue and
+        * 64 tags to prevent using too much memory.
+        */
+       if (is_kdump_kernel()) {
+               set->nr_hw_queues = 1;
+               set->queue_depth = min(64U, set->queue_depth);
+       }
+
        set->tags = kmalloc_node(set->nr_hw_queues *
                                 sizeof(struct blk_mq_tags *),
                                 GFP_KERNEL, set->numa_node);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d567d52..206230e 100644
@@ -115,4 +115,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
        data->hctx = hctx;
 }
 
+static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
+{
+       return hctx->nr_ctx && hctx->tags;
+}
+
 #endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index aa02247..6ed2cbe 100644
@@ -257,9 +257,7 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_
                       __func__, max_hw_sectors);
        }
 
-       limits->max_hw_sectors = max_hw_sectors;
-       limits->max_sectors = min_t(unsigned int, max_hw_sectors,
-                                   BLK_DEF_MAX_SECTORS);
+       limits->max_sectors = limits->max_hw_sectors = max_hw_sectors;
 }
 EXPORT_SYMBOL(blk_limits_max_hw_sectors);
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1fac434..935ea2a 100644
@@ -492,17 +492,15 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
  *     Currently, its primary task it to free all the &struct request
  *     structures that were allocated to the queue and the queue itself.
  *
- * Caveat:
- *     Hopefully the low level driver will have finished any
- *     outstanding requests first...
+ * Note:
+ *     The low level driver must have finished any outstanding requests first
+ *     via blk_cleanup_queue().
  **/
 static void blk_release_queue(struct kobject *kobj)
 {
        struct request_queue *q =
                container_of(kobj, struct request_queue, kobj);
 
-       blk_sync_queue(q);
-
        blkcg_exit_queue(q);
 
        if (q->elevator) {
@@ -517,9 +515,7 @@ static void blk_release_queue(struct kobject *kobj)
        if (q->queue_tags)
                __blk_queue_free_tags(q);
 
-       if (q->mq_ops)
-               blk_mq_free_queue(q);
-       else
+       if (!q->mq_ops)
                blk_free_flush_queue(q->fq);
 
        blk_trace_shutdown(q);
diff --git a/block/genhd.c b/block/genhd.c
index bd30606..0a536dc 100644
@@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
        struct disk_part_tbl *old_ptbl = disk->part_tbl;
        struct disk_part_tbl *new_ptbl;
        int len = old_ptbl ? old_ptbl->len : 0;
-       int target = partno + 1;
+       int i, target;
        size_t size;
-       int i;
+
+       /*
+        * check for int overflow, since we can get here from blkpg_ioctl()
+        * with a user passed 'partno'.
+        */
+       target = partno + 1;
+       if (target < 0)
+               return -EINVAL;
 
        /* disk_max_parts() is zero during initialization, ignore if so */
        if (disk_max_parts(disk) && target > disk_max_parts(disk))
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index dd73e1f..46c282f 100644
@@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp)
        WARN_ON(d->flags & DEVFL_TKILL);
        WARN_ON(d->gd);
        WARN_ON(d->flags & DEVFL_UP);
-       blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
+       blk_queue_max_hw_sectors(q, 1024);
        q->backing_dev_info.name = "aoe";
        q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
        d->bufpool = mp;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 1bd5f52..3bd7ca9 100644
@@ -3775,9 +3775,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
        return false;
 }
 
-static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
-               bool last)
+static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
+                        const struct blk_mq_queue_data *bd)
 {
+       struct request *rq = bd->rq;
        int ret;
 
        if (unlikely(mtip_check_unal_depth(hctx, rq)))
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 8001e81..caa6121 100644
@@ -313,15 +313,15 @@ static void null_request_fn(struct request_queue *q)
        }
 }
 
-static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq,
-               bool last)
+static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
+                        const struct blk_mq_queue_data *bd)
 {
-       struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
+       struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
 
-       cmd->rq = rq;
+       cmd->rq = bd->rq;
        cmd->nq = hctx->driver_data;
 
-       blk_mq_start_request(rq);
+       blk_mq_start_request(bd->rq);
 
        null_handle_cmd(cmd);
        return BLK_MQ_RQ_QUEUE_OK;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1fb9e09..7ef7c09 100644
@@ -159,10 +159,11 @@ static void virtblk_done(struct virtqueue *vq)
        spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
-static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
-               bool last)
+static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
+                          const struct blk_mq_queue_data *bd)
 {
        struct virtio_blk *vblk = hctx->queue->queuedata;
+       struct request *req = bd->rq;
        struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
        unsigned long flags;
        unsigned int num;
@@ -223,7 +224,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
                return BLK_MQ_RQ_QUEUE_ERROR;
        }
 
-       if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
+       if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
                notify = true;
        spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7e3d954..43318d5 100644
@@ -1947,9 +1947,10 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
        blk_mq_complete_request(cmd->request);
 }
 
-static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
-               bool last)
+static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
+                        const struct blk_mq_queue_data *bd)
 {
+       struct request *req = bd->rq;
        struct request_queue *q = req->q;
        struct scsi_device *sdev = q->queuedata;
        struct Scsi_Host *shost = sdev->host;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ef9bef1..2d609a5 100644
@@ -479,12 +479,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
         * write_inode()
         */
        spin_lock(&inode->i_lock);
-       /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
-       if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-               inode->i_state &= ~I_DIRTY_PAGES;
+
        dirty = inode->i_state & I_DIRTY;
-       inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+       inode->i_state &= ~I_DIRTY;
+
+       /*
+        * Paired with smp_mb() in __mark_inode_dirty().  This allows
+        * __mark_inode_dirty() to test i_state without grabbing i_lock -
+        * either they see the I_DIRTY bits cleared or we see the dirtied
+        * inode.
+        *
+        * I_DIRTY_PAGES is always cleared together above even if @mapping
+        * still has dirty pages.  The flag is reinstated after smp_mb() if
+        * necessary.  This guarantees that either __mark_inode_dirty()
+        * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
+        */
+       smp_mb();
+
+       if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+               inode->i_state |= I_DIRTY_PAGES;
+
        spin_unlock(&inode->i_lock);
+
        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                int err = write_inode(inode, wbc);
@@ -1148,12 +1164,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
        }
 
        /*
-        * make sure that changes are seen by all cpus before we test i_state
-        * -- mikulas
+        * Paired with smp_mb() in __writeback_single_inode() for the
+        * following lockless i_state test.  See there for details.
         */
        smp_mb();
 
-       /* avoid the locking if we can */
        if ((inode->i_state & flags) == flags)
                return;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7347f48..efead0b 100644
@@ -443,6 +443,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
 
+void generic_start_io_acct(int rw, unsigned long sectors,
+                          struct hd_struct *part);
+void generic_end_io_acct(int rw, struct hd_struct *part,
+                        unsigned long start_time);
+
 #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 # error        "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
 #endif
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 15f7034..8aded9a 100644
@@ -79,7 +79,13 @@ struct blk_mq_tag_set {
        struct list_head        tag_list;
 };
 
-typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool);
+struct blk_mq_queue_data {
+       struct request *rq;
+       struct list_head *list;
+       bool last;
+};
+
+typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
@@ -140,6 +146,7 @@ enum {
        BLK_MQ_F_TAG_SHARED     = 1 << 1,
        BLK_MQ_F_SG_MERGE       = 1 << 2,
        BLK_MQ_F_SYSFS_UP       = 1 << 3,
+       BLK_MQ_F_DEFER_ISSUE    = 1 << 4,
 
        BLK_MQ_S_STOPPED        = 0,
        BLK_MQ_S_TAG_ACTIVE     = 1,
@@ -162,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
                gfp_t gfp, bool reserved);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0495e38..92f4b4b 100644
@@ -1184,7 +1184,6 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 enum blk_default_limits {
        BLK_MAX_SEGMENTS        = 128,
        BLK_SAFE_MAX_SECTORS    = 255,
-       BLK_DEF_MAX_SECTORS     = 1024,
        BLK_MAX_SEGMENT_SIZE    = 65536,
        BLK_SEG_BOUNDARY_MASK   = 0xFFFFFFFFUL,
 };
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 11b9cb3..483cecf 100644
@@ -1477,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
        if (atomic_dec_and_test(&blk_probes_ref))
                blk_unregister_tracepoints();
 
-       spin_lock_irq(&running_trace_lock);
-       list_del(&bt->running_list);
-       spin_unlock_irq(&running_trace_lock);
        blk_trace_free(bt);
        return 0;
 }