Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 742
1 file changed, 499 insertions, 243 deletions
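
The diff below wires the new blk-mq I/O scheduler hooks into request allocation, dispatch, and tag management. The recurring idea is a split between a scheduler tag (rq->internal_tag, allocated from hctx->sched_tags) and a driver tag (rq->tag, allocated from hctx->tags) that is only acquired at dispatch time. As orientation, here is a minimal sketch of that acquisition step, paraphrasing the blk_mq_get_driver_tag() hunk further down; it restates the patch's own logic rather than adding anything new.

/*
 * Sketch of dispatch-time driver tag acquisition (paraphrased from the
 * blk_mq_get_driver_tag() added by this patch; not a verbatim copy).
 * A request handed to the scheduler carries only an internal tag; the
 * device-visible tag is assigned just before ->queue_rq() is invoked.
 */
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
			   bool wait)
{
	struct blk_mq_alloc_data data = {
		.q = rq->q,
		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
	};

	if (rq->tag != -1)
		goto done;			/* already holds a driver tag */

	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
		data.flags |= BLK_MQ_REQ_RESERVED;

	rq->tag = blk_mq_get_tag(&data);	/* may fail when wait == false */
	if (rq->tag < 0)
		return false;

	if (blk_mq_tag_busy(data.hctx)) {
		rq->rq_flags |= RQF_MQ_INFLIGHT;
		atomic_inc(&data.hctx->nr_active);
	}
	data.hctx->tags->rqs[rq->tag] = rq;

done:
	if (hctx)
		*hctx = data.hctx;
	return true;
}
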
diff --git a/block/blk-mq.c b/block/blk-mq.c index c3400b5444a7..b2fd175e84d7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -20,6 +20,8 @@ #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/sysctl.h> +#include <linux/sched/topology.h> +#include <linux/sched/signal.h> #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> @@ -32,6 +34,7 @@ #include "blk-mq-tag.h" #include "blk-stat.h" #include "blk-wbt.h" +#include "blk-mq-sched.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -39,9 +42,11 @@ static LIST_HEAD(all_q_list); /* * Check if any of the ctx's have pending work in this hardware queue */ -static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - return sbitmap_any_bit_set(&hctx->ctx_map); + return sbitmap_any_bit_set(&hctx->ctx_map) || + !list_empty_careful(&hctx->dispatch) || + blk_mq_sched_has_work(hctx); } /* @@ -72,10 +77,20 @@ void blk_mq_freeze_queue_start(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); -static void blk_mq_freeze_queue_wait(struct request_queue *q) +void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); + +int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, + unsigned long timeout) +{ + return wait_event_timeout(q->mq_freeze_wq, + percpu_ref_is_zero(&q->q_usage_counter), + timeout); +} +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); /* * Guarantee no request is in use, so we can change any data structure of @@ -167,8 +182,8 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); -static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, unsigned int op) +void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, + struct request *rq, unsigned int op) { INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ @@ -196,13 +211,7 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->special = NULL; /* tag was already set */ rq->errors = 0; - - rq->cmd = rq->__cmd; - rq->extra_len = 0; - rq->sense_len = 0; - rq->resid_len = 0; - rq->sense = NULL; INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; @@ -213,53 +222,59 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, ctx->rq_dispatched[op_is_sync(op)]++; } +EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init); -static struct request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op) +struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, + unsigned int op) { struct request *rq; unsigned int tag; tag = blk_mq_get_tag(data); if (tag != BLK_MQ_TAG_FAIL) { - rq = data->hctx->tags->rqs[tag]; + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); - if (blk_mq_tag_busy(data->hctx)) { - rq->rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); + rq = tags->static_rqs[tag]; + + if (data->flags & BLK_MQ_REQ_INTERNAL) { + rq->tag = -1; + rq->internal_tag = tag; + } else { + if (blk_mq_tag_busy(data->hctx)) { + rq->rq_flags = RQF_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } + rq->tag = tag; + rq->internal_tag = -1; + data->hctx->tags->rqs[rq->tag] = rq; } - rq->tag = tag; blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } return NULL; } +EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); struct 
request *blk_mq_alloc_request(struct request_queue *q, int rw, unsigned int flags) { - struct blk_mq_ctx *ctx; - struct blk_mq_hw_ctx *hctx; + struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - struct blk_mq_alloc_data alloc_data; int ret; ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); if (ret) return ERR_PTR(ret); - ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw); - blk_mq_put_ctx(ctx); + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); - if (!rq) { - blk_queue_exit(q); + blk_mq_put_ctx(alloc_data.ctx); + blk_queue_exit(q); + + if (!rq) return ERR_PTR(-EWOULDBLOCK); - } rq->__data_len = 0; rq->__sector = (sector_t) -1; @@ -271,10 +286,9 @@ EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, unsigned int flags, unsigned int hctx_idx) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; + struct blk_mq_alloc_data alloc_data = { .flags = flags }; struct request *rq; - struct blk_mq_alloc_data alloc_data; + unsigned int cpu; int ret; /* @@ -297,32 +311,30 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ - hctx = q->queue_hw_ctx[hctx_idx]; - if (!blk_mq_hw_queue_mapped(hctx)) { - ret = -EXDEV; - goto out_queue_exit; - } - ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); - - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq) { - ret = -EWOULDBLOCK; - goto out_queue_exit; + alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; + if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { + blk_queue_exit(q); + return ERR_PTR(-EXDEV); } + cpu = cpumask_first(alloc_data.hctx->cpumask); + alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - return rq; + rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); -out_queue_exit: + blk_mq_put_ctx(alloc_data.ctx); blk_queue_exit(q); - return ERR_PTR(ret); + + if (!rq) + return ERR_PTR(-EWOULDBLOCK); + + return rq; } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); -static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct request *rq) +void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct request *rq) { - const int tag = rq->tag; + const int sched_tag = rq->internal_tag; struct request_queue *q = rq->q; if (rq->rq_flags & RQF_MQ_INFLIGHT) @@ -333,23 +345,31 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); - blk_mq_put_tag(hctx, ctx, tag); + if (rq->tag != -1) + blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); + if (sched_tag != -1) + blk_mq_sched_completed_request(hctx, rq); + blk_mq_sched_restart_queues(hctx); blk_queue_exit(q); } -void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx, + struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; ctx->rq_completed[rq_is_sync(rq)]++; - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); +} +void blk_mq_finish_request(struct request *rq) +{ + blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); } -EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); void 
blk_mq_free_request(struct request *rq) { - blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); + blk_mq_sched_put_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); @@ -467,11 +487,9 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - trace_block_rq_issue(q, rq); + blk_mq_sched_started_request(rq); - rq->resid_len = blk_rq_bytes(rq); - if (unlikely(blk_bidi_rq(rq))) - rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { blk_stat_set_issue_time(&rq->issue_stat); @@ -515,6 +533,7 @@ static void __blk_mq_requeue_request(struct request *rq) trace_block_rq_requeue(q, rq); wbt_requeue(q->rq_wb, &rq->issue_stat); + blk_mq_sched_requeue_request(rq); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -549,13 +568,13 @@ static void blk_mq_requeue_work(struct work_struct *work) rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, true, false, false); + blk_mq_sched_insert_request(rq, true, false, false, true); } while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_insert_request(rq, false, false, false); + blk_mq_sched_insert_request(rq, false, false, false, true); } blk_mq_run_hw_queues(q, false); @@ -639,7 +658,7 @@ struct blk_mq_timeout_data { void blk_mq_rq_timed_out(struct request *req, bool reserved) { - struct blk_mq_ops *ops = req->q->mq_ops; + const struct blk_mq_ops *ops = req->q->mq_ops; enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; /* @@ -754,7 +773,7 @@ static bool blk_mq_attempt_merge(struct request_queue *q, int checked = 8; list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { - int el_ret; + bool merged = false; if (!checked--) break; @@ -762,20 +781,25 @@ static bool blk_mq_attempt_merge(struct request_queue *q, if (!blk_rq_merge_ok(rq, bio)) continue; - el_ret = blk_try_merge(rq, bio); - if (el_ret == ELEVATOR_BACK_MERGE) { - if (bio_attempt_back_merge(q, rq, bio)) { - ctx->rq_merged++; - return true; - } + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_back_merge(q, rq, bio); break; - } else if (el_ret == ELEVATOR_FRONT_MERGE) { - if (bio_attempt_front_merge(q, rq, bio)) { - ctx->rq_merged++; - return true; - } + case ELEVATOR_FRONT_MERGE: + if (blk_mq_sched_allow_merge(q, rq, bio)) + merged = bio_attempt_front_merge(q, rq, bio); break; + case ELEVATOR_DISCARD_MERGE: + merged = bio_attempt_discard_merge(q, rq, bio); + break; + default: + continue; } + + if (merged) + ctx->rq_merged++; + return merged; } return false; @@ -803,7 +827,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) * Process software queues that have been marked busy, splicing them * to the for-dispatch */ -static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, @@ -812,6 +836,7 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } +EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); static inline unsigned int queued_to_index(unsigned int queued) { @@ -821,6 +846,132 @@ static inline unsigned int queued_to_index(unsigned int queued) return 
min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } +bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, + bool wait) +{ + struct blk_mq_alloc_data data = { + .q = rq->q, + .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), + .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, + }; + + if (rq->tag != -1) { +done: + if (hctx) + *hctx = data.hctx; + return true; + } + + if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) + data.flags |= BLK_MQ_REQ_RESERVED; + + rq->tag = blk_mq_get_tag(&data); + if (rq->tag >= 0) { + if (blk_mq_tag_busy(data.hctx)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&data.hctx->nr_active); + } + data.hctx->tags->rqs[rq->tag] = rq; + goto done; + } + + return false; +} + +static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = -1; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + __blk_mq_put_driver_tag(hctx, rq); +} + +static void blk_mq_put_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + + if (rq->tag == -1 || rq->internal_tag == -1) + return; + + hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + __blk_mq_put_driver_tag(hctx, rq); +} + +/* + * If we fail getting a driver tag because all the driver tags are already + * assigned and on the dispatch list, BUT the first entry does not have a + * tag, then we could deadlock. For that case, move entries with assigned + * driver tags to the front, leaving the set of tagged requests in the + * same order, and the untagged set in the same order. + */ +static bool reorder_tags_to_front(struct list_head *list) +{ + struct request *rq, *tmp, *first = NULL; + + list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { + if (rq == first) + break; + if (rq->tag != -1) { + list_move(&rq->queuelist, list); + if (!first) + first = rq; + } + } + + return first != NULL; +} + +static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags, + void *key) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); + + list_del(&wait->task_list); + clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); + blk_mq_run_hw_queue(hctx, true); + return 1; +} + +static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) +{ + struct sbq_wait_state *ws; + + /* + * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. + * The thread which wins the race to grab this bit adds the hardware + * queue to the wait queue. + */ + if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || + test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) + return false; + + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); + + /* + * As soon as this returns, it's no longer safe to fiddle with + * hctx->dispatch_wait, since a completion can wake up the wait queue + * and unlock the bit. 
+ */ + add_wait_queue(&ws->wait, &hctx->dispatch_wait); + return true; +} + bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct request_queue *q = hctx->queue; @@ -843,11 +994,44 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); + if (!blk_mq_get_driver_tag(rq, &hctx, false)) { + if (!queued && reorder_tags_to_front(list)) + continue; + + /* + * The initial allocation attempt failed, so we need to + * rerun the hardware queue when a tag is freed. + */ + if (blk_mq_dispatch_wait_add(hctx)) { + /* + * It's possible that a tag was freed in the + * window between the allocation failure and + * adding the hardware queue to the wait queue. + */ + if (!blk_mq_get_driver_tag(rq, &hctx, false)) + break; + } else { + break; + } + } + list_del_init(&rq->queuelist); bd.rq = rq; bd.list = dptr; - bd.last = list_empty(list); + + /* + * Flag last if we have no more requests, or if we have more + * but can't assign a driver tag to it. + */ + if (list_empty(list)) + bd.last = true; + else { + struct request *nxt; + + nxt = list_first_entry(list, struct request, queuelist); + bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); + } ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { @@ -855,6 +1039,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) queued++; break; case BLK_MQ_RQ_QUEUE_BUSY: + blk_mq_put_driver_tag_hctx(hctx, rq); list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); break; @@ -884,8 +1069,15 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) * that is where we will continue on next queue run. */ if (!list_empty(list)) { + /* + * If we got a driver tag for the next request already, + * free it again. + */ + rq = list_first_entry(list, struct request, queuelist); + blk_mq_put_driver_tag(rq); + spin_lock(&hctx->lock); - list_splice(list, &hctx->dispatch); + list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* @@ -896,45 +1088,16 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) * the requests in rq_list might get lost. * * blk_mq_run_hw_queue() already checks the STOPPED bit - **/ - blk_mq_run_hw_queue(hctx, true); - } - - return ret != BLK_MQ_RQ_QUEUE_BUSY; -} - -/* - * Run this hardware queue, pulling any software queues mapped to it in. - * Note that this function currently has various problems around ordering - * of IO. In particular, we'd like FIFO behaviour on handling existing - * items on the hctx->dispatch list. Ignore that for now. - */ -static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) -{ - LIST_HEAD(rq_list); - - if (unlikely(blk_mq_hctx_stopped(hctx))) - return; - - hctx->run++; - - /* - * Touch any software queue that has pending entries. - */ - flush_busy_ctxs(hctx, &rq_list); - - /* - * If we have previous entries on our dispatch list, grab them - * and stuff them at the front for more fair dispatch. - */ - if (!list_empty_careful(&hctx->dispatch)) { - spin_lock(&hctx->lock); - if (!list_empty(&hctx->dispatch)) - list_splice_init(&hctx->dispatch, &rq_list); - spin_unlock(&hctx->lock); + * + * If RESTART or TAG_WAITING is set, then let completion restart + * the queue instead of potentially looping here. 
+ */ + if (!blk_mq_sched_needs_restart(hctx) && + !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) + blk_mq_run_hw_queue(hctx, true); } - blk_mq_dispatch_rq_list(hctx, &rq_list); + return queued != 0; } static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) @@ -946,11 +1109,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { rcu_read_lock(); - blk_mq_process_rq_list(hctx); + blk_mq_sched_dispatch_requests(hctx); rcu_read_unlock(); } else { srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); - blk_mq_process_rq_list(hctx); + blk_mq_sched_dispatch_requests(hctx); srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); } } @@ -1006,8 +1169,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) int i; queue_for_each_hw_ctx(q, hctx, i) { - if ((!blk_mq_hctx_has_pending(hctx) && - list_empty_careful(&hctx->dispatch)) || + if (!blk_mq_hctx_has_pending(hctx) || blk_mq_hctx_stopped(hctx)) continue; @@ -1116,6 +1278,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) if (unlikely(!blk_mq_hw_queue_mapped(hctx))) return; + blk_mq_stop_hw_queue(hctx); kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->delay_work, msecs_to_jiffies(msecs)); } @@ -1135,8 +1298,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, list_add_tail(&rq->queuelist, &ctx->rq_list); } -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; @@ -1144,32 +1307,10 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, blk_mq_hctx_mark_pending(hctx, ctx); } -void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, - bool async) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - - if (run_queue) - blk_mq_run_hw_queue(hctx, async); -} - -static void blk_mq_insert_requests(struct request_queue *q, - struct blk_mq_ctx *ctx, - struct list_head *list, - int depth, - bool from_schedule) +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list) { - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - - trace_block_unplug(q, depth, !from_schedule); - /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now @@ -1185,8 +1326,6 @@ static void blk_mq_insert_requests(struct request_queue *q, } blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); - - blk_mq_run_hw_queue(hctx, from_schedule); } static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -1222,9 +1361,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(!rq->q); if (rq->mq_ctx != this_ctx) { if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, - &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, + &ctx_list, + from_schedule); } this_ctx = rq->mq_ctx; @@ -1241,8 +1381,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * on 'ctx_list'. Do those. 
*/ if (this_ctx) { - blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, - from_schedule); + trace_block_unplug(this_q, depth, from_schedule); + blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + from_schedule); } } @@ -1280,46 +1421,39 @@ insert_rq: } spin_unlock(&ctx->lock); - __blk_mq_free_request(hctx, ctx, rq); + __blk_mq_finish_request(hctx, ctx, rq); return true; } } -static struct request *blk_mq_map_request(struct request_queue *q, - struct bio *bio, - struct blk_mq_alloc_data *data) +static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct blk_mq_ctx *ctx; - struct request *rq; - - blk_queue_enter_live(q); - ctx = blk_mq_get_ctx(q); - hctx = blk_mq_map_queue(q, ctx->cpu); - - trace_block_getrq(q, bio, bio->bi_opf); - blk_mq_set_alloc_data(data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(data, bio->bi_opf); + if (rq->tag != -1) + return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); - data->hctx->queued++; - return rq; + return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); } static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) { - int ret; struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); struct blk_mq_queue_data bd = { .rq = rq, .list = NULL, .last = 1 }; - blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + struct blk_mq_hw_ctx *hctx; + blk_qc_t new_cookie; + int ret; + + if (q->elevator) + goto insert; - if (blk_mq_hctx_stopped(hctx)) + if (!blk_mq_get_driver_tag(rq, &hctx, false)) goto insert; + new_cookie = request_to_qc_t(hctx, rq); + /* * For OK queue, we are done. For error, kill it. Any other * error (busy), just add it to our list as we previously @@ -1341,7 +1475,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie) } insert: - blk_mq_insert_request(rq, false, true, true); + blk_mq_sched_insert_request(rq, false, true, true, false); } /* @@ -1352,8 +1486,8 @@ insert: static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); - struct blk_mq_alloc_data data; + const int is_flush_fua = op_is_flush(bio->bi_opf); + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; unsigned int request_count = 0, srcu_idx; struct blk_plug *plug; @@ -1374,9 +1508,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return BLK_QC_T_NONE; + if (blk_mq_sched_bio_merge(q, bio)) + return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio, NULL); - rq = blk_mq_map_request(q, bio, &data); + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; @@ -1384,9 +1523,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) wbt_track(&rq->issue_stat, wb_acct); - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + if (q->elevator) + goto elv_insert; blk_mq_bio_to_request(rq, bio); blk_insert_flush(rq); goto run_queue; @@ -1438,6 +1579,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) goto done; } + if (q->elevator) { +elv_insert: + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + 
blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. For @@ -1460,10 +1609,10 @@ done: static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); + const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_plug *plug; unsigned int request_count = 0; - struct blk_mq_alloc_data data; + struct blk_mq_alloc_data data = { .flags = 0 }; struct request *rq; blk_qc_t cookie; unsigned int wb_acct; @@ -1483,9 +1632,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + if (blk_mq_sched_bio_merge(q, bio)) + return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio, NULL); - rq = blk_mq_map_request(q, bio, &data); + trace_block_getrq(q, bio, bio->bi_opf); + + rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); if (unlikely(!rq)) { __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; @@ -1493,9 +1647,11 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) wbt_track(&rq->issue_stat, wb_acct); - cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); + cookie = request_to_qc_t(data.hctx, rq); if (unlikely(is_flush_fua)) { + if (q->elevator) + goto elv_insert; blk_mq_bio_to_request(rq, bio); blk_insert_flush(rq); goto run_queue; @@ -1535,6 +1691,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) return cookie; } + if (q->elevator) { +elv_insert: + blk_mq_put_ctx(data.ctx); + blk_mq_bio_to_request(rq, bio); + blk_mq_sched_insert_request(rq, false, true, + !is_sync || is_flush_fua, true); + goto done; + } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { /* * For a SYNC request, send it to the hardware immediately. 
For @@ -1547,11 +1711,12 @@ run_queue: } blk_mq_put_ctx(data.ctx); +done: return cookie; } -static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx) { struct page *page; @@ -1559,11 +1724,13 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, int i; for (i = 0; i < tags->nr_tags; i++) { - if (!tags->rqs[i]) + struct request *rq = tags->static_rqs[i]; + + if (!rq) continue; - set->ops->exit_request(set->driver_data, tags->rqs[i], + set->ops->exit_request(set->driver_data, rq, hctx_idx, i); - tags->rqs[i] = NULL; + tags->static_rqs[i] = NULL; } } @@ -1577,49 +1744,82 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, kmemleak_free(page_address(page)); __free_pages(page, page->private); } +} +void blk_mq_free_rq_map(struct blk_mq_tags *tags) +{ kfree(tags->rqs); + tags->rqs = NULL; + kfree(tags->static_rqs); + tags->static_rqs = NULL; blk_mq_free_tags(tags); } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - -static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) { struct blk_mq_tags *tags; - unsigned int i, j, entries_per_page, max_order = 4; - size_t rq_size, left; + int node; - tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, - set->numa_node, + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; + + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; - INIT_LIST_HEAD(&tags->page_list); - - tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), + tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, - set->numa_node); + node); if (!tags->rqs) { blk_mq_free_tags(tags); return NULL; } + tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + node); + if (!tags->static_rqs) { + kfree(tags->rqs); + blk_mq_free_tags(tags); + return NULL; + } + + return tags; +} + +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) +{ + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + int node; + + node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; + + INIT_LIST_HEAD(&tags->page_list); + /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); - left = rq_size * set->queue_depth; + left = rq_size * depth; - for (i = 0; i < set->queue_depth; ) { + for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; @@ -1629,7 +1829,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, this_order--; do { - page = alloc_pages_node(set->numa_node, + page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) @@ -1653,15 +1853,17 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, */ 
kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; - to_do = min(entries_per_page, set->queue_depth - i); + to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { - tags->rqs[i] = p; + struct request *rq = p; + + tags->static_rqs[i] = rq; if (set->ops->init_request) { if (set->ops->init_request(set->driver_data, - tags->rqs[i], hctx_idx, i, - set->numa_node)) { - tags->rqs[i] = NULL; + rq, hctx_idx, i, + node)) { + tags->static_rqs[i] = NULL; goto fail; } } @@ -1670,11 +1872,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; + return 0; fail: - blk_mq_free_rq_map(set, tags, hctx_idx); - return NULL; + blk_mq_free_rqs(set, tags, hctx_idx); + return -ENOMEM; } /* @@ -1866,6 +2068,35 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } +static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) +{ + int ret = 0; + + set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, + set->queue_depth, set->reserved_tags); + if (!set->tags[hctx_idx]) + return false; + + ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, + set->queue_depth); + if (!ret) + return true; + + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + return false; +} + +static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (set->tags[hctx_idx]) { + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); + blk_mq_free_rq_map(set->tags[hctx_idx]); + set->tags[hctx_idx] = NULL; + } +} + static void blk_mq_map_swqueue(struct request_queue *q, const struct cpumask *online_mask) { @@ -1894,17 +2125,15 @@ static void blk_mq_map_swqueue(struct request_queue *q, hctx_idx = q->mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ - if (!set->tags[hctx_idx]) { - set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx); - + if (!set->tags[hctx_idx] && + !__blk_mq_alloc_rq_map(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. 
In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ - if (!set->tags[hctx_idx]) - q->mq_map[i] = 0; + q->mq_map[i] = 0; } ctx = per_cpu_ptr(q->queue_ctx, i); @@ -1927,10 +2156,9 @@ static void blk_mq_map_swqueue(struct request_queue *q, * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) { - blk_mq_free_rq_map(set, set->tags[i], i); - set->tags[i] = NULL; - } + if (i && set->tags[i]) + blk_mq_free_map_and_requests(set, i); + hctx->tags = NULL; continue; } @@ -2023,6 +2251,8 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned int i; + blk_mq_sched_teardown(q); + /* hctx kobj stays in hctx */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx) @@ -2097,10 +2327,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) { - blk_mq_free_rq_map(set, hctx->tags, j); - set->tags[j] = NULL; - } + if (hctx->tags) + blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); free_cpumask_var(hctx->cpumask); kobject_put(&hctx->kobj); @@ -2181,6 +2409,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, mutex_unlock(&all_q_mutex); put_online_cpus(); + if (!(set->flags & BLK_MQ_F_NO_SCHED)) { + int ret; + + ret = blk_mq_sched_init(q); + if (ret) + return ERR_PTR(ret); + } + return q; err_hctxs: @@ -2279,10 +2515,10 @@ static int blk_mq_queue_reinit_dead(unsigned int cpu) * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list * and set bit0 in pending bitmap as ctx1->index_hw is still zero. * - * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in - * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. - * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list - * is ignored. + * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set + * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list. + * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is + * ignored. */ static int blk_mq_queue_reinit_prepare(unsigned int cpu) { @@ -2296,17 +2532,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { - set->tags[i] = blk_mq_init_rq_map(set, i); - if (!set->tags[i]) + for (i = 0; i < set->nr_hw_queues; i++) + if (!__blk_mq_alloc_rq_map(set, i)) goto out_unwind; - } return 0; out_unwind: while (--i >= 0) - blk_mq_free_rq_map(set, set->tags[i], i); + blk_mq_free_rq_map(set->tags[i]); return -ENOMEM; } @@ -2430,10 +2664,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < nr_cpu_ids; i++) { - if (set->tags[i]) - blk_mq_free_rq_map(set, set->tags[i], i); - } + for (i = 0; i < nr_cpu_ids; i++) + blk_mq_free_map_and_requests(set, i); kfree(set->mq_map); set->mq_map = NULL; @@ -2449,14 +2681,28 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) struct blk_mq_hw_ctx *hctx; int i, ret; - if (!set || nr > set->queue_depth) + if (!set) return -EINVAL; + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; - ret = blk_mq_tag_update_depth(hctx->tags, nr); + /* + * If we're using an MQ scheduler, just update the scheduler + * queue depth. This is similar to what the old code would do. 
+ */ + if (!hctx->sched_tags) { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, + min(nr, set->queue_depth), + false); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, + nr, true); + } if (ret) break; } @@ -2464,6 +2710,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) q->nr_requests = nr; + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + return ret; } @@ -2483,10 +2732,14 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + /* + * Manually set the make_request_fn as blk_queue_make_request + * resets a lot of the queue settings. + */ if (q->nr_hw_queues > 1) - blk_queue_make_request(q, blk_mq_make_request); + q->make_request_fn = blk_mq_make_request; else - blk_queue_make_request(q, blk_sq_make_request); + q->make_request_fn = blk_sq_make_request; blk_mq_queue_reinit(q, cpu_online_mask); } @@ -2649,7 +2902,10 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) blk_flush_plug_list(plug, false); hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + if (!blk_qc_t_is_internal(cookie)) + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + else + rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); return __blk_mq_poll(hctx, rq); } |
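
Besides the scheduler plumbing, the patch exports blk_mq_freeze_queue_wait() and adds blk_mq_freeze_queue_wait_timeout() so that drivers can bound how long they wait for a queue to drain. A hypothetical driver-side caller (the function name and the 5-second timeout are illustrative, not part of this patch) could combine them as follows:

/*
 * Hypothetical user of the freeze helpers exported above.
 * blk_mq_freeze_queue_wait_timeout() follows wait_event_timeout()
 * semantics and returns 0 if the timeout expired before
 * q_usage_counter dropped to zero.
 */
static int example_drain_queue(struct request_queue *q)
{
	blk_mq_freeze_queue_start(q);

	if (!blk_mq_freeze_queue_wait_timeout(q, msecs_to_jiffies(5000))) {
		/* Still busy after 5 seconds; restore the queue and bail. */
		blk_mq_unfreeze_queue(q);
		return -ETIMEDOUT;
	}

	/* No requests are in flight here; safe to reconfigure the queue. */

	blk_mq_unfreeze_queue(q);
	return 0;
}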