From 868f2f0b72068a097508b6e8870a8950fd8eb7ef Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 17 Dec 2015 17:08:14 -0700 Subject: blk-mq: dynamic h/w context count The hardware's provided queue count may change at runtime with resource provisioning. This patch allows a block driver to alter the number of h/w queues available when its resource count changes. The main part is a new blk-mq API to request a new number of h/w queues for a given live tag set. The new API freezes all queues using that set, then adjusts the allocated count prior to remapping these to CPUs. The bulk of the rest just shifts where h/w contexts and all their artifacts are allocated and freed. The number of max h/w contexts is capped to the number of possible cpus since there is no use for more than that. As such, all pre-allocated memory for pointers need to account for the max possible rather than the initial number of queues. A side effect of this is that the blk-mq will proceed successfully as long as it can allocate at least one h/w context. Previously it would fail request queue initialization if less than the requested number was allocated. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Tested-by: Jon Derrick Signed-off-by: Jens Axboe --- block/blk-mq.c | 173 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 104 insertions(+), 69 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4c0622fae413..645eb9e716d0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1742,31 +1742,6 @@ static int blk_mq_init_hctx(struct request_queue *q, return -1; } -static int blk_mq_init_hw_queues(struct request_queue *q, - struct blk_mq_tag_set *set) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int i; - - /* - * Initialize hardware queues - */ - queue_for_each_hw_ctx(q, hctx, i) { - if (blk_mq_init_hctx(q, set, hctx, i)) - break; - } - - if (i == q->nr_hw_queues) - return 0; - - /* - * Init failed - */ - blk_mq_exit_hw_queues(q, set, i); - - return 1; -} - static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { @@ -1824,6 +1799,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, continue; hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->cpumask); ctx->index_hw = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; @@ -1972,54 +1948,89 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q) +static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) { - struct blk_mq_hw_ctx **hctxs; - struct blk_mq_ctx __percpu *ctx; - unsigned int *map; - int i; - - ctx = alloc_percpu(struct blk_mq_ctx); - if (!ctx) - return ERR_PTR(-ENOMEM); - - hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, - set->numa_node); - - if (!hctxs) - goto err_percpu; - - map = blk_mq_make_queue_map(set); - if (!map) - goto err_map; + int i, j; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + blk_mq_sysfs_unregister(q); for (i = 0; i < set->nr_hw_queues; i++) { - int node = blk_mq_hw_queue_to_node(map, i); + int node; + if (hctxs[i]) + continue; + + node = blk_mq_hw_queue_to_node(q->mq_map, i); hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, node); if (!hctxs[i]) - goto err_hctxs; + break; if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, - node)) - goto err_hctxs; + node)) { + kfree(hctxs[i]); + hctxs[i] = NULL; + break; + } atomic_set(&hctxs[i]->nr_active, 0); hctxs[i]->numa_node = node; hctxs[i]->queue_num = i; + + if (blk_mq_init_hctx(q, set, hctxs[i], i)) { + free_cpumask_var(hctxs[i]->cpumask); + kfree(hctxs[i]); + hctxs[i] = NULL; + break; + } + blk_mq_hctx_kobj_init(hctxs[i]); } + for (j = i; j < q->nr_hw_queues; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + if (hctx->tags) { + blk_mq_free_rq_map(set, hctx->tags, j); + set->tags[j] = NULL; + } + blk_mq_exit_hctx(q, set, hctx, j); + free_cpumask_var(hctx->cpumask); + kobject_put(&hctx->kobj); + kfree(hctx->ctxs); + kfree(hctx); + hctxs[j] = NULL; + + } + } + q->nr_hw_queues = i; + blk_mq_sysfs_register(q); +} + +struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!q->queue_ctx) + return ERR_PTR(-ENOMEM); + + q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)), + GFP_KERNEL, set->numa_node); + if (!q->queue_hw_ctx) + goto err_percpu; + + q->mq_map = blk_mq_make_queue_map(set); + if (!q->mq_map) + goto err_map; + + blk_mq_realloc_hw_ctxs(set, q); + if (!q->nr_hw_queues) + goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; - q->nr_hw_queues = set->nr_hw_queues; - q->mq_map = map; - - q->queue_ctx = ctx; - q->queue_hw_ctx = hctxs; q->mq_ops = set->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; @@ -2048,9 +2059,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_init_cpu_queues(q, set->nr_hw_queues); - if (blk_mq_init_hw_queues(q, set)) - goto err_hctxs; - get_online_cpus(); mutex_lock(&all_q_mutex); @@ -2064,17 +2072,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return q; err_hctxs: - kfree(map); - for (i = 0; i < set->nr_hw_queues; i++) { - if (!hctxs[i]) - break; - free_cpumask_var(hctxs[i]->cpumask); - kfree(hctxs[i]); - } + kfree(q->mq_map); err_map: - kfree(hctxs); + kfree(q->queue_hw_ctx); err_percpu: - free_percpu(ctx); + free_percpu(q->queue_ctx); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(blk_mq_init_allocated_queue); @@ -2282,9 +2284,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->nr_hw_queues = 1; set->queue_depth = min(64U, set->queue_depth); } + /* + * There is no use for more h/w queues than cpus. + */ + if (set->nr_hw_queues > nr_cpu_ids) + set->nr_hw_queues = nr_cpu_ids; - set->tags = kmalloc_node(set->nr_hw_queues * - sizeof(struct blk_mq_tags *), + set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) return -ENOMEM; @@ -2307,7 +2313,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) { + for (i = 0; i < nr_cpu_ids; i++) { if (set->tags[i]) blk_mq_free_rq_map(set, set->tags[i], i); } @@ -2339,6 +2345,35 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) +{ + struct request_queue *q; + + if (nr_hw_queues > nr_cpu_ids) + nr_hw_queues = nr_cpu_ids; + if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) + return; + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue(q); + + set->nr_hw_queues = nr_hw_queues; + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_realloc_hw_ctxs(set, q); + + if (q->nr_hw_queues > 1) + blk_queue_make_request(q, blk_mq_make_request); + else + blk_queue_make_request(q, blk_sq_make_request); + + blk_mq_queue_reinit(q, cpu_online_mask); + } + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_unfreeze_queue(q); +} +EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); -- cgit v1.2.3-55-g7522 From 66841672161efb9e3be4a1dbd9755020bb1d86b7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 12 Feb 2016 15:27:00 +0800 Subject: blk-mq: mark request queue as mq asap Currently q->mq_ops is used widely to decide if the queue is mq or not, so we should set the 'flag' asap so that both block core and drivers can get the correct mq info. For example, commit 868f2f0b720(blk-mq: dynamic h/w context count) moves the hctx's initialization before setting q->mq_ops in blk_mq_init_allocated_queue(), then cause blk_alloc_flush_queue() to think the queue is non-mq and don't allocate command size for the per-hctx flush rq. This patches should fix the problem reported by Sasha. Cc: Keith Busch Reported-by: Sasha Levin Signed-off-by: Ming Lei Fixes: 868f2f0b720 ("blk-mq: dynamic h/w context count") Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 645eb9e716d0..f539a53d16c3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2010,6 +2010,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { + /* mark the queue as mq asap */ + q->mq_ops = set->ops; + q->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!q->queue_ctx) return ERR_PTR(-ENOMEM); @@ -2032,7 +2035,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->nr_queues = nr_cpu_ids; - q->mq_ops = set->ops; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; if (!(set->flags & BLK_MQ_F_SG_MERGE)) -- cgit v1.2.3-55-g7522 From e9137d4b93078b6a9965acfb18a2a2ad91cf8405 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 18 Feb 2016 14:56:35 -0700 Subject: blk-mq: Fix NULL pointer updating nr_requests A h/w context's tags are freed if it was not assigned a CPU. Check if the context has tags before updating the depth. Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index f539a53d16c3..5667f59c277c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2336,6 +2336,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) ret = 0; queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; ret = blk_mq_tag_update_depth(hctx->tags, nr); if (ret) break; -- cgit v1.2.3-55-g7522 From 4ee86babe09f0682a60b1c56be99819bbe4ba62c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 15 Mar 2016 12:03:28 -0700 Subject: blk-mq: add bounds check on tag-to-rq conversion We need to check for a valid index before accessing the array element to avoid accessing invalid memory regions. Reviewed-by: Christoph Hellwig Reviewed-by: Jeff Moyer Modified by Jens to drop the unlikely(), and make the fall through path be having a valid tag. Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5667f59c277c..261b6feddae6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -544,7 +544,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - return tags->rqs[tag]; + if (tag < tags->nr_tags) + return tags->rqs[tag]; + + return NULL; } EXPORT_SYMBOL(blk_mq_tag_to_rq); -- cgit v1.2.3-55-g7522