summaryrefslogtreecommitdiffstats
path: root/block/blk-mq.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--block/blk-mq.c228
1 files changed, 141 insertions, 87 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa..f53779692c77 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -89,7 +89,8 @@ static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp)
return -EBUSY;
ret = wait_event_interruptible(q->mq_freeze_wq,
- !q->mq_freeze_depth || blk_queue_dying(q));
+ !atomic_read(&q->mq_freeze_depth) ||
+ blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
if (ret)
@@ -112,13 +113,10 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref)
void blk_mq_freeze_queue_start(struct request_queue *q)
{
- bool freeze;
+ int freeze_depth;
- spin_lock_irq(q->queue_lock);
- freeze = !q->mq_freeze_depth++;
- spin_unlock_irq(q->queue_lock);
-
- if (freeze) {
+ freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
+ if (freeze_depth == 1) {
percpu_ref_kill(&q->mq_usage_counter);
blk_mq_run_hw_queues(q, false);
}
@@ -143,13 +141,11 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
void blk_mq_unfreeze_queue(struct request_queue *q)
{
- bool wake;
+ int freeze_depth;
- spin_lock_irq(q->queue_lock);
- wake = !--q->mq_freeze_depth;
- WARN_ON_ONCE(q->mq_freeze_depth < 0);
- spin_unlock_irq(q->queue_lock);
- if (wake) {
+ freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
+ WARN_ON_ONCE(freeze_depth < 0);
+ if (!freeze_depth) {
percpu_ref_reinit(&q->mq_usage_counter);
wake_up_all(&q->mq_freeze_wq);
}
@@ -677,8 +673,11 @@ static void blk_mq_rq_timer(unsigned long priv)
data.next = blk_rq_timeout(round_jiffies_up(data.next));
mod_timer(&q->timeout, data.next);
} else {
- queue_for_each_hw_ctx(q, hctx, i)
- blk_mq_tag_idle(hctx);
+ queue_for_each_hw_ctx(q, hctx, i) {
+ /* the hctx may be unmapped, so check it here */
+ if (blk_mq_hw_queue_mapped(hctx))
+ blk_mq_tag_idle(hctx);
+ }
}
}
@@ -855,6 +854,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
spin_lock(&hctx->lock);
list_splice(&rq_list, &hctx->dispatch);
spin_unlock(&hctx->lock);
+ /*
+ * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
+ * it's possible the queue is stopped and restarted again
+ * before this. Queue restart will dispatch requests. And since
+ * requests in rq_list aren't added into hctx->dispatch yet,
+ * the requests in rq_list might get lost.
+ *
+ * blk_mq_run_hw_queue() already checks the STOPPED bit
+ **/
+ blk_mq_run_hw_queue(hctx, true);
}
}
@@ -1224,6 +1233,38 @@ static struct request *blk_mq_map_request(struct request_queue *q,
return rq;
}
+static int blk_mq_direct_issue_request(struct request *rq)
+{
+ int ret;
+ struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q,
+ rq->mq_ctx->cpu);
+ struct blk_mq_queue_data bd = {
+ .rq = rq,
+ .list = NULL,
+ .last = 1
+ };
+
+ /*
+ * For OK queue, we are done. For error, kill it. Any other
+ * error (busy), just add it to our list as we previously
+ * would have done
+ */
+ ret = q->mq_ops->queue_rq(hctx, &bd);
+ if (ret == BLK_MQ_RQ_QUEUE_OK)
+ return 0;
+ else {
+ __blk_mq_requeue_request(rq);
+
+ if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
+ rq->errors = -EIO;
+ blk_mq_end_request(rq, rq->errors);
+ return 0;
+ }
+ return -1;
+ }
+}
+
/*
* Multiple hardware queue variant. This will not use per-process plugs,
* but will attempt to bypass the hctx queueing if we can go straight to
@@ -1235,6 +1276,9 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
struct blk_map_ctx data;
struct request *rq;
+ unsigned int request_count = 0;
+ struct blk_plug *plug;
+ struct request *same_queue_rq = NULL;
blk_queue_bounce(q, &bio);
@@ -1243,6 +1287,10 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
return;
}
+ if (!is_flush_fua && !blk_queue_nomerges(q) &&
+ blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+ return;
+
rq = blk_mq_map_request(q, bio, &data);
if (unlikely(!rq))
return;
@@ -1253,38 +1301,42 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
goto run_queue;
}
+ plug = current->plug;
/*
* If the driver supports defer issued based on 'last', then
* queue it up like normal since we can potentially save some
* CPU this way.
*/
- if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
- struct blk_mq_queue_data bd = {
- .rq = rq,
- .list = NULL,
- .last = 1
- };
- int ret;
+ if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
+ !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+ struct request *old_rq = NULL;
blk_mq_bio_to_request(rq, bio);
/*
- * For OK queue, we are done. For error, kill it. Any other
- * error (busy), just add it to our list as we previously
- * would have done
+ * we do limited pluging. If bio can be merged, do merge.
+ * Otherwise the existing request in the plug list will be
+ * issued. So the plug list will have one request at most
*/
- ret = q->mq_ops->queue_rq(data.hctx, &bd);
- if (ret == BLK_MQ_RQ_QUEUE_OK)
- goto done;
- else {
- __blk_mq_requeue_request(rq);
-
- if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
- goto done;
+ if (plug) {
+ /*
+ * The plug list might get flushed before this. If that
+ * happens, same_queue_rq is invalid and plug list is empty
+ **/
+ if (same_queue_rq && !list_empty(&plug->mq_list)) {
+ old_rq = same_queue_rq;
+ list_del_init(&old_rq->queuelist);
}
- }
+ list_add_tail(&rq->queuelist, &plug->mq_list);
+ } else /* is_sync */
+ old_rq = rq;
+ blk_mq_put_ctx(data.ctx);
+ if (!old_rq)
+ return;
+ if (!blk_mq_direct_issue_request(old_rq))
+ return;
+ blk_mq_insert_request(old_rq, false, true, true);
+ return;
}
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1297,7 +1349,6 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
run_queue:
blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
}
-done:
blk_mq_put_ctx(data.ctx);
}
@@ -1309,16 +1360,11 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = rw_is_sync(bio->bi_rw);
const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
- unsigned int use_plug, request_count = 0;
+ struct blk_plug *plug;
+ unsigned int request_count = 0;
struct blk_map_ctx data;
struct request *rq;
- /*
- * If we have multiple hardware queues, just go directly to
- * one of those for sync IO.
- */
- use_plug = !is_flush_fua && !is_sync;
-
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
@@ -1326,8 +1372,8 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
return;
}
- if (use_plug && !blk_queue_nomerges(q) &&
- blk_attempt_plug_merge(q, bio, &request_count))
+ if (!is_flush_fua && !blk_queue_nomerges(q) &&
+ blk_attempt_plug_merge(q, bio, &request_count, NULL))
return;
rq = blk_mq_map_request(q, bio, &data);
@@ -1345,21 +1391,18 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
* utilize that to temporarily store requests until the task is
* either done or scheduled away.
*/
- if (use_plug) {
- struct blk_plug *plug = current->plug;
-
- if (plug) {
- blk_mq_bio_to_request(rq, bio);
- if (list_empty(&plug->mq_list))
- trace_block_plug(q);
- else if (request_count >= BLK_MAX_REQUEST_COUNT) {
- blk_flush_plug_list(plug, false);
- trace_block_plug(q);
- }
- list_add_tail(&rq->queuelist, &plug->mq_list);
- blk_mq_put_ctx(data.ctx);
- return;
+ plug = current->plug;
+ if (plug) {
+ blk_mq_bio_to_request(rq, bio);
+ if (list_empty(&plug->mq_list))
+ trace_block_plug(q);
+ else if (request_count >= BLK_MAX_REQUEST_COUNT) {
+ blk_flush_plug_list(plug, false);
+ trace_block_plug(q);
}
+ list_add_tail(&rq->queuelist, &plug->mq_list);
+ blk_mq_put_ctx(data.ctx);
+ return;
}
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
@@ -1495,7 +1538,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
i++;
}
}
-
return tags;
fail:
@@ -1571,22 +1613,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
return NOTIFY_OK;
}
-static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
-{
- struct request_queue *q = hctx->queue;
- struct blk_mq_tag_set *set = q->tag_set;
-
- if (set->tags[hctx->queue_num])
- return NOTIFY_OK;
-
- set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
- if (!set->tags[hctx->queue_num])
- return NOTIFY_STOP;
-
- hctx->tags = set->tags[hctx->queue_num];
- return NOTIFY_OK;
-}
-
static int blk_mq_hctx_notify(void *data, unsigned long action,
unsigned int cpu)
{
@@ -1594,12 +1620,16 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
return blk_mq_hctx_cpu_offline(hctx, cpu);
- else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
- return blk_mq_hctx_cpu_online(hctx, cpu);
+
+ /*
+ * In case of CPU online, tags may be reallocated
+ * in blk_mq_map_swqueue() after mapping is updated.
+ */
return NOTIFY_OK;
}
+/* hctx->ctxs will be freed in queue's release handler */
static void blk_mq_exit_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
@@ -1618,7 +1648,6 @@ static void blk_mq_exit_hctx(struct request_queue *q,
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
blk_free_flush_queue(hctx->fq);
- kfree(hctx->ctxs);
blk_mq_free_bitmap(&hctx->ctx_map);
}
@@ -1775,6 +1804,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
unsigned int i;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
+ struct blk_mq_tag_set *set = q->tag_set;
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
@@ -1791,6 +1821,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx = q->mq_ops->map_queue(q, i);
cpumask_set_cpu(i, hctx->cpumask);
+ cpumask_set_cpu(i, hctx->tags->cpumask);
ctx->index_hw = hctx->nr_ctx;
hctx->ctxs[hctx->nr_ctx++] = ctx;
}
@@ -1803,16 +1834,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* disable it and free the request entries.
*/
if (!hctx->nr_ctx) {
- struct blk_mq_tag_set *set = q->tag_set;
-
if (set->tags[i]) {
blk_mq_free_rq_map(set, set->tags[i], i);
set->tags[i] = NULL;
- hctx->tags = NULL;
}
+ hctx->tags = NULL;
continue;
}
+ /* unmapped hw queue can be remapped after CPU topo changed */
+ if (!set->tags[i])
+ set->tags[i] = blk_mq_init_rq_map(set, i);
+ hctx->tags = set->tags[i];
+ WARN_ON(!hctx->tags);
+
/*
* Set the map size to the number of mapped software queues.
* This is more accurate and more efficient than looping
@@ -1886,8 +1921,12 @@ void blk_mq_release(struct request_queue *q)
unsigned int i;
/* hctx kobj stays in hctx */
- queue_for_each_hw_ctx(q, hctx, i)
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx)
+ continue;
+ kfree(hctx->ctxs);
kfree(hctx);
+ }
kfree(q->queue_hw_ctx);
@@ -2047,7 +2086,7 @@ void blk_mq_free_queue(struct request_queue *q)
/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
- WARN_ON_ONCE(!q->mq_freeze_depth);
+ WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
blk_mq_sysfs_unregister(q);
@@ -2090,9 +2129,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
*/
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_start(q);
- list_for_each_entry(q, &all_q_list, all_q_node)
+ list_for_each_entry(q, &all_q_list, all_q_node) {
blk_mq_freeze_queue_wait(q);
+ /*
+ * timeout handler can't touch hw queue during the
+ * reinitialization
+ */
+ del_timer_sync(&q->timeout);
+ }
+
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_queue_reinit(q);
@@ -2157,6 +2203,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
return 0;
}
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+ return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@@ -2218,8 +2270,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
- if (set->tags[i])
+ if (set->tags[i]) {
blk_mq_free_rq_map(set, set->tags[i], i);
+ free_cpumask_var(set->tags[i]->cpumask);
+ }
}
kfree(set->tags);