diff options
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 228 |
1 files changed, 141 insertions, 87 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index ade8a2d1b0aa..f53779692c77 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) return -EBUSY; ret = wait_event_interruptible(q->mq_freeze_wq, - !q->mq_freeze_depth || blk_queue_dying(q)); + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; if (ret) @@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) void blk_mq_freeze_queue_start(struct request_queue *q) { - bool freeze; + int freeze_depth; - spin_lock_irq(q->queue_lock); - freeze = !q->mq_freeze_depth++; - spin_unlock_irq(q->queue_lock); - - if (freeze) { + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_hw_queues(q, false); } @@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake; + int freeze_depth; - spin_lock_irq(q->queue_lock); - wake = !--q->mq_freeze_depth; - WARN_ON_ONCE(q->mq_freeze_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) { + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { percpu_ref_reinit(&q->mq_usage_counter); wake_up_all(&q->mq_freeze_wq); } @@ -677,8 +673,11 @@ static void blk_mq_rq_timer(unsigned long priv) data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_tag_idle(hctx); + queue_for_each_hw_ctx(q, hctx, i) { + /* the hctx may be unmapped, so check it here */ + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_idle(hctx); + } } } @@ -855,6 +854,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) spin_lock(&hctx->lock); list_splice(&rq_list, &hctx->dispatch); spin_unlock(&hctx->lock); + /* + * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but + * it's possible the queue is stopped and restarted again + * before this. Queue restart will dispatch requests. And since + * requests in rq_list aren't added into hctx->dispatch yet, + * the requests in rq_list might get lost. + * + * blk_mq_run_hw_queue() already checks the STOPPED bit + **/ + blk_mq_run_hw_queue(hctx, true); } } @@ -1224,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static int blk_mq_direct_issue_request(struct request *rq) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) + return 0; + else { + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + return -1; + } +} + /* * Multiple hardware queue variant. This will not use per-process plugs, * but will attempt to bypass the hctx queueing if we can go straight to @@ -1235,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; + struct request *same_queue_rq = NULL; blk_queue_bounce(q, &bio); @@ -1243,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) return; } + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + return; + rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) return; @@ -1253,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } + plug = current->plug; /* * If the driver supports defer issued based on 'last', then * queue it up like normal since we can potentially save some * CPU this way. */ - if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct blk_mq_queue_data bd = { - .rq = rq, - .list = NULL, - .last = 1 - }; - int ret; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done + * we do limited pluging. If bio can be merged, do merge. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most */ - ret = q->mq_ops->queue_rq(data.hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) - goto done; - else { - __blk_mq_requeue_request(rq); - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - goto done; + if (plug) { + /* + * The plug list might get flushed before this. If that + * happens, same_queue_rq is invalid and plug list is empty + **/ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; + list_del_init(&old_rq->queuelist); } - } + list_add_tail(&rq->queuelist, &plug->mq_list); + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) + return; + if (!blk_mq_direct_issue_request(old_rq)) + return; + blk_mq_insert_request(old_rq, false, true, true); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1297,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } -done: blk_mq_put_ctx(data.ctx); } @@ -1309,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - unsigned int use_plug, request_count = 0; + struct blk_plug *plug; + unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && !is_sync; - blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { @@ -1326,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) return; } - if (use_plug && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, NULL)) return; rq = blk_mq_map_request(q, bio, &data); @@ -1345,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) * utilize that to temporarily store requests until the task is * either done or scheduled away. */ - if (use_plug) { - struct blk_plug *plug = current->plug; - - if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (list_empty(&plug->mq_list)) + trace_block_plug(q); + else if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); } + list_add_tail(&rq->queuelist, &plug->mq_list); + blk_mq_put_ctx(data.ctx); + return; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1495,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; fail: @@ -1571,22 +1613,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) return NOTIFY_OK; } -static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) -{ - struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set; - - if (set->tags[hctx->queue_num]) - return NOTIFY_OK; - - set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); - if (!set->tags[hctx->queue_num]) - return NOTIFY_STOP; - - hctx->tags = set->tags[hctx->queue_num]; - return NOTIFY_OK; -} - static int blk_mq_hctx_notify(void *data, unsigned long action, unsigned int cpu) { @@ -1594,12 +1620,16 @@ static int blk_mq_hctx_notify(void *data, unsigned long action, if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) return blk_mq_hctx_cpu_offline(hctx, cpu); - else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - return blk_mq_hctx_cpu_online(hctx, cpu); + + /* + * In case of CPU online, tags may be reallocated + * in blk_mq_map_swqueue() after mapping is updated. + */ return NOTIFY_OK; } +/* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) @@ -1618,7 +1648,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); blk_free_flush_queue(hctx->fq); - kfree(hctx->ctxs); blk_mq_free_bitmap(&hctx->ctx_map); } @@ -1775,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) unsigned int i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; + struct blk_mq_tag_set *set = q->tag_set; queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); @@ -1791,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx = q->mq_ops->map_queue(q, i); cpumask_set_cpu(i, hctx->cpumask); + cpumask_set_cpu(i, hctx->tags->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; } @@ -1803,16 +1834,20 @@ static void blk_mq_map_swqueue(struct request_queue *q) * disable it and free the request entries. */ if (!hctx->nr_ctx) { - struct blk_mq_tag_set *set = q->tag_set; - if (set->tags[i]) { blk_mq_free_rq_map(set, set->tags[i], i); set->tags[i] = NULL; - hctx->tags = NULL; } + hctx->tags = NULL; continue; } + /* unmapped hw queue can be remapped after CPU topo changed */ + if (!set->tags[i]) + set->tags[i] = blk_mq_init_rq_map(set, i); + hctx->tags = set->tags[i]; + WARN_ON(!hctx->tags); + /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping @@ -1886,8 +1921,12 @@ void blk_mq_release(struct request_queue *q) unsigned int i; /* hctx kobj stays in hctx */ - queue_for_each_hw_ctx(q, hctx, i) + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx) + continue; + kfree(hctx->ctxs); kfree(hctx); + } kfree(q->queue_hw_ctx); @@ -2047,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q) /* Basically redo blk_mq_init_queue with queue frozen */ static void blk_mq_queue_reinit(struct request_queue *q) { - WARN_ON_ONCE(!q->mq_freeze_depth); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); @@ -2090,9 +2129,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, */ list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_freeze_queue_start(q); - list_for_each_entry(q, &all_q_list, all_q_node) + list_for_each_entry(q, &all_q_list, all_q_node) { blk_mq_freeze_queue_wait(q); + /* + * timeout handler can't touch hw queue during the + * reinitialization + */ + del_timer_sync(&q->timeout); + } + list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_queue_reinit(q); @@ -2157,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } +struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) +{ + return tags->cpumask; +} +EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -2218,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) int i; for (i = 0; i < set->nr_hw_queues; i++) { - if (set->tags[i]) + if (set->tags[i]) { blk_mq_free_rq_map(set, set->tags[i], i); + free_cpumask_var(set->tags[i]->cpumask); + } } kfree(set->tags); |