X-Git-Url: http://git.cascardo.info/?a=blobdiff_plain;f=block%2Fblk-mq.c;h=ddc2eed6477146320073b061a61e15ebe4d587eb;hb=fed41f7d039bad02f94cad9059e4b14cd81d13f2;hp=c207fa9870ebcaaed6bfb02efa70b5f7333954f8;hpb=5419e783829127dba712be769bce8c6a1ec0057e;p=cascardo%2Flinux.git diff --git a/block/blk-mq.c b/block/blk-mq.c index c207fa9870eb..ddc2eed64771 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -33,49 +34,28 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); - /* * Check if any of the ctx's have pending work in this hardware queue */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - unsigned int i; - - for (i = 0; i < hctx->ctx_map.size; i++) - if (hctx->ctx_map.map[i].word) - return true; - - return false; + return sbitmap_any_bit_set(&hctx->ctx_map); } -static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx) -{ - return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; -} - -#define CTX_TO_BIT(hctx, ctx) \ - ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) - /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) - set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) + sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } void blk_mq_freeze_queue_start(struct request_queue *q) @@ -244,21 +224,11 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - ctx = alloc_data.ctx; - } blk_mq_put_ctx(ctx); + if (!rq) { blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); @@ -333,7 +303,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx, tag, &ctx->last_tag); + blk_mq_put_tag(hctx, ctx, tag); blk_queue_exit(q); } @@ -349,11 +319,7 @@ EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); void blk_mq_free_request(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; - - hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); - blk_mq_free_hctx_request(hctx, rq); + blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -513,7 +479,7 @@ EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = - container_of(work, struct request_queue, requeue_work); + container_of(work, struct request_queue, requeue_work.work); 
LIST_HEAD(rq_list); struct request *rq, *next; unsigned long flags; @@ -568,16 +534,24 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_cancel_requeue_work(struct request_queue *q) { - cancel_work_sync(&q->requeue_work); + cancel_delayed_work_sync(&q->requeue_work); } EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_work(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); +void blk_mq_delay_kick_requeue_list(struct request_queue *q, + unsigned long msecs) +{ + kblockd_schedule_delayed_work(&q->requeue_work, + msecs_to_jiffies(msecs)); +} +EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); + void blk_mq_abort_requeue_list(struct request_queue *q) { unsigned long flags; @@ -600,8 +574,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - if (tag < tags->nr_tags) + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); return tags->rqs[tag]; + } return NULL; } @@ -756,38 +732,44 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } +struct flush_busy_ctx_data { + struct blk_mq_hw_ctx *hctx; + struct list_head *list; +}; + +static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_busy_ctx_data *flush_data = data; + struct blk_mq_hw_ctx *hctx = flush_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + sbitmap_clear_bit(sb, bitnr); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, flush_data->list); + spin_unlock(&ctx->lock); + return true; +} + /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - struct blk_mq_ctx *ctx; - int i; - - for (i = 0; i < hctx->ctx_map.size; i++) { - struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; - unsigned int off, bit; - - if (!bm->word) - continue; + struct flush_busy_ctx_data data = { + .hctx = hctx, + .list = list, + }; - bit = 0; - off = i * hctx->ctx_map.bits_per_word; - do { - bit = find_next_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; + sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); +} - ctx = hctx->ctxs[bit + off]; - clear_bit(bit, &bm->word); - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, list); - spin_unlock(&ctx->lock); +static inline unsigned int queued_to_index(unsigned int queued) +{ + if (!queued) + return 0; - bit++; - } while (1); - } + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } /* @@ -878,10 +860,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) dptr = &driver_list; } - if (!queued) - hctx->dispatched[0]++; - else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) - hctx->dispatched[ilog2(queued) + 1]++; + hctx->dispatched[queued_to_index(queued)]++; /* * Any items that need requeuing? 
Stuff them into hctx->dispatch, @@ -937,7 +916,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) !blk_mq_hw_queue_mapped(hctx))) return; - if (!async) { + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); @@ -948,8 +927,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, 0); + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -970,7 +948,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->run_work); + cancel_work(&hctx->run_work); cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -1023,7 +1001,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work); __blk_mq_run_hw_queue(hctx); } @@ -1076,9 +1054,7 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, { struct blk_mq_ctx *ctx = rq->mq_ctx; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); spin_lock(&ctx->lock); __blk_mq_insert_request(hctx, rq, at_head); @@ -1095,12 +1071,10 @@ static void blk_mq_insert_requests(struct request_queue *q, bool from_schedule) { - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); trace_block_unplug(q, depth, !from_schedule); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now @@ -1234,26 +1208,14 @@ static struct request *blk_mq_map_request(struct request_queue *q, blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); + hctx = blk_mq_map_queue(q, ctx->cpu); if (rw_is_sync(bio_op(bio), bio->bi_opf)) op_flags |= REQ_SYNC; trace_block_getrq(q, bio, op); - blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - if (unlikely(!rq)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, op); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - ctx = alloc_data.ctx; - hctx = alloc_data.hctx; - } hctx->queued++; data->hctx = hctx; @@ -1265,8 +1227,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) { int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, - rq->mq_ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, @@ -1470,15 +1431,6 @@ run_queue: return cookie; } -/* - * Default mapping to a software queue, since we use one per CPU. 
- */ -struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) -{ - return q->queue_hw_ctx[q->mq_map[cpu]]; -} -EXPORT_SYMBOL(blk_mq_map_queue); - static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -1606,42 +1558,18 @@ fail: return NULL; } -static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) -{ - kfree(bitmap->map); -} - -static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) -{ - unsigned int bpw = 8, total, num_maps, i; - - bitmap->bits_per_word = bpw; - - num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; - bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bitmap->map) - return -ENOMEM; - - total = nr_cpu_ids; - for (i = 0; i < num_maps; i++) { - bitmap->map[i].depth = min(total, bitmap->bits_per_word); - total -= bitmap->map[i].depth; - } - - return 0; -} - /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it * gets run. */ -static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) +static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { + struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); ctx = __blk_mq_get_ctx(hctx->queue, cpu); spin_lock(&ctx->lock); @@ -1652,30 +1580,20 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) spin_unlock(&ctx->lock); if (list_empty(&tmp)) - return NOTIFY_OK; + return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); - return NOTIFY_OK; + return 0; } -static int blk_mq_hctx_notify(void *data, unsigned long action, - unsigned int cpu) +static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { - struct blk_mq_hw_ctx *hctx = data; - - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - return blk_mq_hctx_cpu_offline(hctx, cpu); - - /* - * In case of CPU online, tags may be reallocated - * in blk_mq_map_swqueue() after mapping is updated. 
- */ - - return NOTIFY_OK; + cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); } /* hctx->ctxs will be freed in queue's release handler */ @@ -1695,9 +1613,9 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); + blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -1734,7 +1652,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (node == NUMA_NO_NODE) node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); @@ -1742,9 +1660,7 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->queue_num = hctx_idx; hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; - blk_mq_init_cpu_notifier(&hctx->cpu_notifier, - blk_mq_hctx_notify, hctx); - blk_mq_register_cpu_notifier(&hctx->cpu_notifier); + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); hctx->tags = set->tags[hctx_idx]; @@ -1757,7 +1673,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->ctxs) goto unregister_cpu_notifier; - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, + node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -1784,12 +1701,11 @@ static int blk_mq_init_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); unregister_cpu_notifier: - blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); - + blk_mq_remove_cpuhp(hctx); return -1; } @@ -1812,7 +1728,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, if (!cpu_online(i)) continue; - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); /* * Set local node, IFF we have more than one hw queue. If @@ -1850,7 +1766,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, continue; ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = q->mq_ops->map_queue(q, i); + hctx = blk_mq_map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; @@ -1860,8 +1776,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, mutex_unlock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_ctxmap *map = &hctx->ctx_map; - /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -1881,13 +1795,12 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); - cpumask_copy(hctx->tags->cpumask, hctx->cpumask); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. 
*/ - map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Initialize batch roundrobin counts @@ -1975,7 +1888,6 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } - kfree(q->mq_map); q->mq_map = NULL; kfree(q->queue_hw_ctx); @@ -2074,9 +1986,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->queue_hw_ctx) goto err_percpu; - q->mq_map = blk_mq_make_queue_map(set); - if (!q->mq_map) - goto err_map; + q->mq_map = set->mq_map; blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2094,7 +2004,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->sg_reserved_size = INT_MAX; - INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); @@ -2126,8 +2036,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return q; err_hctxs: - kfree(q->mq_map); -err_map: kfree(q->queue_hw_ctx); err_percpu: free_percpu(q->queue_ctx); @@ -2159,8 +2067,6 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); - /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe * we should change hctx numa_node according to new topology (this @@ -2172,50 +2078,18 @@ static void blk_mq_queue_reinit(struct request_queue *q, blk_mq_sysfs_register(q); } -static int blk_mq_queue_reinit_notify(struct notifier_block *nb, - unsigned long action, void *hcpu) +/* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ +static struct cpumask cpuhp_online_new; + +static void blk_mq_queue_reinit_work(void) { struct request_queue *q; - int cpu = (unsigned long)hcpu; - /* - * New online cpumask which is going to be set in this hotplug event. - * Declare this cpumasks as global as cpu-hotplug operation is invoked - * one-by-one and dynamically allocating this could result in a failure. - */ - static struct cpumask online_new; - - /* - * Before hotadded cpu starts handling requests, new mappings must - * be established. Otherwise, these requests in hw queue might - * never be dispatched. - * - * For example, there is a single hw queue (hctx) and two CPU queues - * (ctx0 for CPU0, and ctx1 for CPU1). - * - * Now CPU1 is just onlined and a request is inserted into - * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is - * still zero. - * - * And then while running hw queue, flush_busy_ctxs() finds bit0 is - * set in pending bitmap and tries to retrieve requests in - * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, - * so the request in ctx1->rq_list is ignored. - */ - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - case CPU_UP_CANCELED: - cpumask_copy(&online_new, cpu_online_mask); - break; - case CPU_UP_PREPARE: - cpumask_copy(&online_new, cpu_online_mask); - cpumask_set_cpu(cpu, &online_new); - break; - default: - return NOTIFY_OK; - } mutex_lock(&all_q_mutex); - /* * We need to freeze and reinit all existing queues. 
Freezing * involves synchronous wait for an RCU grace period and doing it @@ -2236,13 +2110,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, } list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q, &online_new); + blk_mq_queue_reinit(q, &cpuhp_online_new); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_unfreeze_queue(q); mutex_unlock(&all_q_mutex); - return NOTIFY_OK; +} + +static int blk_mq_queue_reinit_dead(unsigned int cpu) +{ + cpumask_copy(&cpuhp_online_new, cpu_online_mask); + blk_mq_queue_reinit_work(); + return 0; +} + +/* + * Before hotadded cpu starts handling requests, new mappings must be + * established. Otherwise, these requests in hw queue might never be + * dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues (ctx0 + * for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list + * and set bit0 in pending bitmap as ctx1->index_hw is still zero. + * + * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in + * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. + * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list + * is ignored. + */ +static int blk_mq_queue_reinit_prepare(unsigned int cpu) +{ + cpumask_copy(&cpuhp_online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &cpuhp_online_new); + blk_mq_queue_reinit_work(); + return 0; } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) @@ -2299,12 +2203,6 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } -struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) -{ - return tags->cpumask; -} -EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); - /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. 
May adjust the @@ -2313,6 +2211,8 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { + int ret; + BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) @@ -2322,7 +2222,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; - if (!set->ops->queue_rq || !set->ops->map_queue) + if (!set->ops->queue_rq) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { @@ -2351,17 +2251,35 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (!set->tags) return -ENOMEM; - if (blk_mq_alloc_rq_maps(set)) - goto enomem; + ret = -ENOMEM; + set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids, + GFP_KERNEL, set->numa_node); + if (!set->mq_map) + goto out_free_tags; + + if (set->ops->map_queues) + ret = set->ops->map_queues(set); + else + ret = blk_mq_map_queues(set); + if (ret) + goto out_free_mq_map; + + ret = blk_mq_alloc_rq_maps(set); + if (ret) + goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; -enomem: + +out_free_mq_map: + kfree(set->mq_map); + set->mq_map = NULL; +out_free_tags: kfree(set->tags); set->tags = NULL; - return -ENOMEM; + return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); @@ -2374,6 +2292,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) blk_mq_free_rq_map(set, set->tags[i], i); } + kfree(set->mq_map); + set->mq_map = NULL; + kfree(set->tags); set->tags = NULL; } @@ -2444,10 +2365,12 @@ void blk_mq_enable_hotplug(void) static int __init blk_mq_init(void) { - blk_mq_cpu_init(); - - hotcpu_notifier(blk_mq_queue_reinit_notify, 0); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, + blk_mq_hctx_notify_dead); + cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare", + blk_mq_queue_reinit_prepare, + blk_mq_queue_reinit_dead); return 0; } subsys_initcall(blk_mq_init);
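
Driver-facing effect of this diff, as a hedged sketch (not part of the patch): with ->map_queue removed from struct blk_mq_ops, ->queue_rq is the only mandatory callback that blk_mq_alloc_tag_set() still checks for, and the core now builds set->mq_map itself, using the driver's optional ->map_queues callback or the blk_mq_map_queues() fallback. The "example_*" names below are placeholders invented for illustration; everything else follows the interfaces visible in the diff above.

#include <linux/blk-mq.h>

/*
 * Illustrative sketch only, assuming the post-series API: no .map_queue
 * initializer, mq_map allocated inside blk_mq_alloc_tag_set().
 */
static int example_queue_rq(struct blk_mq_hw_ctx *hctx,
			    const struct blk_mq_queue_data *bd)
{
	/* Hand bd->rq to the hardware here. */
	blk_mq_start_request(bd->rq);
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops example_mq_ops = {
	.queue_rq	= example_queue_rq,
	/*
	 * No .map_queue callback any more; blk_mq_map_queue() resolves
	 * the hardware queue internally from q->mq_map.  A driver with
	 * special topology needs could instead supply .map_queues.
	 */
};

static int example_init_tag_set(struct blk_mq_tag_set *set)
{
	memset(set, 0, sizeof(*set));
	set->ops		= &example_mq_ops;
	set->nr_hw_queues	= 1;
	set->queue_depth	= 64;
	set->numa_node		= NUMA_NO_NODE;
	set->flags		= BLK_MQ_F_SHOULD_MERGE;

	/* Allocates set->tags, the rq maps and set->mq_map. */
	return blk_mq_alloc_tag_set(set);
}

On the requeue side, the switch of q->requeue_work to a delayed work item also adds blk_mq_delay_kick_requeue_list(); a driver that previously kicked the requeue list immediately can, for example, call blk_mq_delay_kick_requeue_list(q, 10) to defer the kick by roughly 10 ms (converted with msecs_to_jiffies() as shown in the diff).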