From 07b0fdecb2477396bcb69609019aade2b22124a1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Jul 2019 07:58:31 -0700 Subject: blkcg: allow blkcg_policy->pd_stat() to print non-debug info too Currently, ->pd_stat() is called only when moduleparam blkcg_debug_stats is set which prevents it from printing non-debug policy-specific statistics. Let's move debug testing down so that ->pd_stat() can print non-debug stat too. This patch doesn't cause any visible behavior change. Signed-off-by: Tejun Heo Cc: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++------ block/blk-iolatency.c | 3 +++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 24ed26957367..55a7dc227dfb 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -54,7 +54,7 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ -static bool blkcg_debug_stats = false; +bool blkcg_debug_stats = false; static struct workqueue_struct *blkcg_punt_bio_wq; static bool blkcg_policy_enabled(struct request_queue *q, @@ -944,10 +944,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) dbytes, dios); } - if (!blkcg_debug_stats) - goto next; - - if (atomic_read(&blkg->use_delay)) { + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { has_stats = true; off += scnprintf(buf+off, size-off, " use_delay=%d delay_nsec=%llu", @@ -967,7 +964,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) has_stats = true; off += written; } -next: + if (has_stats) { if (off < size - 1) { off += scnprintf(buf+off, size-off, "\n"); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index d973c38ee4fd..0fff7b56df0e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -917,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, unsigned long long avg_lat; unsigned long long cur_win; + if (!blkcg_debug_stats) + return 0; + if (iolat->ssd) return iolatency_ssd_stat(iolat, buf, size); -- cgit v1.2.3-55-g7522 From 1624b0b200399bd6cd2b46ab3494738d1aef6b75 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 16 Jul 2019 21:59:35 +0900 Subject: block: fix sysfs module parameters directory path in comment The runtime configurable module parameter files are located under /sys/module/MODULENAME/parameters, not /sys/module/MODULENAME. Cc: Jens Axboe Signed-off-by: Akinobu Mita Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 97887e59f3b2..54f1f0d381f4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1969,7 +1969,7 @@ static const struct attribute *disk_events_attrs[] = { * The default polling interval can be specified by the kernel * parameter block.events_dfl_poll_msecs which defaults to 0 * (disable). This can also be modified runtime by writing to - * /sys/module/block/events_dfl_poll_msecs. + * /sys/module/block/parameters/events_dfl_poll_msecs. 
*/ static int disk_events_set_dfl_poll_msecs(const char *val, const struct kernel_param *kp) -- cgit v1.2.3-55-g7522 From b5e02b484d6f12112d49326bff2aecfccd2f518d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 18 Jul 2019 09:08:52 +0200 Subject: block, bfq: check also in-flight I/O in dispatch plugging Consider a sync bfq_queue Q that remains empty while in service, and suppose that, when this happens, there is a fair amount of already in-flight I/O not belonging to Q. In such a situation, I/O dispatching may need to be plugged (until new I/O arrives for Q), for the following reason. The drive may decide to serve in-flight non-Q's I/O requests before Q's ones, thereby delaying the arrival of new I/O requests for Q (recall that Q is sync). If I/O-dispatching is not plugged, then, while Q remains empty, a basically uncontrolled amount of I/O from other queues may be dispatched too, possibly causing the service of Q's I/O to be delayed even longer in the drive. This problem gets more and more serious as the speed and the queue depth of the drive grow, because, as these two quantities grow, the probability to find no queue busy but many requests in flight grows too. If Q has the same weight and priority as the other queues, then the above delay is unlikely to cause any issue, because all queues tend to undergo the same treatment. So, since not plugging I/O dispatching is convenient for throughput, it is better not to plug. Things change in case Q has a higher weight or priority than some other queue, because Q's service guarantees may simply be violated. For this reason, commit 1de0c4cd9ea6 ("block, bfq: reduce idling only in symmetric scenarios") does plug I/O in such an asymmetric scenario. Plugging minimizes the delay induced by already in-flight I/O, and enables Q to recover the bandwidth it may lose because of this delay. Yet the above commit does not cover the case of weight-raised queues, for efficiency concerns. For weight-raised queues, I/O-dispatch plugging is activated simply if not all bfq_queues are weight-raised. But this check does not handle the case of in-flight requests, because a bfq_queue may become non busy *before* all its in-flight requests are completed. This commit performs I/O-dispatch plugging for weight-raised queues if there are some in-flight requests. As a practical example of the resulting recover of control, under write load on a Samsung SSD 970 PRO, gnome-terminal starts in 1.5 seconds after this fix, against 15 seconds before the fix (as a reference, gnome-terminal takes about 35 seconds to start with any of the other I/O schedulers). Fixes: 1de0c4cd9ea6 ("block, bfq: reduce idling only in symmetric scenarios") Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 67 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50c9d2598500..b627e3fc6d53 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -3354,38 +3354,57 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) * there is no active group, then the primary expectation for * this device is probably a high throughput. * - * We are now left only with explaining the additional - * compound condition that is checked below for deciding - * whether the scenario is asymmetric. 
To explain this - * compound condition, we need to add that the function + * We are now left only with explaining the two sub-conditions in the + * additional compound condition that is checked below for deciding + * whether the scenario is asymmetric. To explain the first + * sub-condition, we need to add that the function * bfq_asymmetric_scenario checks the weights of only - * non-weight-raised queues, for efficiency reasons (see - * comments on bfq_weights_tree_add()). Then the fact that - * bfqq is weight-raised is checked explicitly here. More - * precisely, the compound condition below takes into account - * also the fact that, even if bfqq is being weight-raised, - * the scenario is still symmetric if all queues with requests - * waiting for completion happen to be - * weight-raised. Actually, we should be even more precise - * here, and differentiate between interactive weight raising - * and soft real-time weight raising. + * non-weight-raised queues, for efficiency reasons (see comments on + * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised + * is checked explicitly here. More precisely, the compound condition + * below takes into account also the fact that, even if bfqq is being + * weight-raised, the scenario is still symmetric if all queues with + * requests waiting for completion happen to be + * weight-raised. Actually, we should be even more precise here, and + * differentiate between interactive weight raising and soft real-time + * weight raising. + * + * The second sub-condition checked in the compound condition is + * whether there is a fair amount of already in-flight I/O not + * belonging to bfqq. If so, I/O dispatching is to be plugged, for the + * following reason. The drive may decide to serve in-flight + * non-bfqq's I/O requests before bfqq's ones, thereby delaying the + * arrival of new I/O requests for bfqq (recall that bfqq is sync). If + * I/O-dispatching is not plugged, then, while bfqq remains empty, a + * basically uncontrolled amount of I/O from other queues may be + * dispatched too, possibly causing the service of bfqq's I/O to be + * delayed even longer in the drive. This problem gets more and more + * serious as the speed and the queue depth of the drive grow, + * because, as these two quantities grow, the probability to find no + * queue busy but many requests in flight grows too. By contrast, + * plugging I/O dispatching minimizes the delay induced by already + * in-flight I/O, and enables bfqq to recover the bandwidth it may + * lose because of this delay. * * As a side note, it is worth considering that the above - * device-idling countermeasures may however fail in the - * following unlucky scenario: if idling is (correctly) - * disabled in a time period during which all symmetry - * sub-conditions hold, and hence the device is allowed to - * enqueue many requests, but at some later point in time some - * sub-condition stops to hold, then it may become impossible - * to let requests be served in the desired order until all - * the requests already queued in the device have been served. 
+ * device-idling countermeasures may however fail in the following + * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled + * in a time period during which all symmetry sub-conditions hold, and + * therefore the device is allowed to enqueue many requests, but at + * some later point in time some sub-condition stops to hold, then it + * may become impossible to make requests be served in the desired + * order until all the requests already queued in the device have been + * served. The last sub-condition commented above somewhat mitigates + * this problem for weight-raised queues. */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { return (bfqq->wr_coeff > 1 && - bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd)) || + (bfqd->wr_busy_queues < + bfq_tot_busy_queues(bfqd) || + bfqd->rq_in_driver >= + bfqq->dispatched + 4)) || bfq_asymmetric_scenario(bfqd, bfqq); } -- cgit v1.2.3-55-g7522 From 545fbd0775bafcefc8f7bc844291bd13c44b7fdc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 16 Jul 2019 16:19:26 -0400 Subject: rq-qos: fix missed wake-ups in rq_qos_throttle We saw a hang in production with WBT where there was only one waiter in the throttle path and no outstanding IO. This is because of the has_sleepers optimization that is used to make sure we don't steal an inflight counter for new submitters when there are people already on the list. We can race with our check to see if the waitqueue has any waiters (this is done locklessly) and the time we actually add ourselves to the waitqueue. If this happens we'll go to sleep and never be woken up because nobody is doing IO to wake us up. Fix this by checking if the waitqueue has a single sleeper on the list after we add ourselves, that way we have an uptodate view of the list. Reviewed-by: Oleg Nesterov Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 659ccb8b693f..67a0a4c07060 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -244,6 +244,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, return; prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); + has_sleeper = !wq_has_single_sleeper(&rqw->wait); do { if (data.got_token) break; -- cgit v1.2.3-55-g7522 From 64e7ea875ef63b2801be7954cf7257d1bfccc266 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 16 Jul 2019 16:19:27 -0400 Subject: rq-qos: don't reset has_sleepers on spurious wakeups If we raced with somebody else getting an inflight counter we could fail to get an inflight counter with no sleepers on the list, and thus need to go to sleep. In this case has_sleepers should be true because we are now relying on the waker to get our inflight counter for us. And in the case of spurious wakeups we'd still want this to be the case. So set has_sleepers to true if we went to sleep to make sure we're woken up the proper way. 
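With this fix and the previous one applied, the wait loop in rq_qos_wait() follows roughly the sketch below (a simplified sketch for illustration, not the exact upstream code; the rq_qos_wait_data initialization and the double-token cleanup path are omitted):

	has_sleeper = wq_has_sleeper(&rqw->wait);
	if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
		return;		/* got an inflight counter without sleeping */

	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
	/* recheck now that we are on the list, so the view is up to date */
	has_sleeper = !wq_has_single_sleeper(&rqw->wait);
	do {
		if (data.got_token)	/* the waker handed us a counter */
			break;
		if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
			finish_wait(&rqw->wait, &data.wq);
			break;
		}
		io_schedule();
		/* we slept, so rely on the waker for our counter from now on */
		has_sleeper = true;
	} while (1);
	finish_wait(&rqw->wait, &data.wq);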
Reviewed-by: Oleg Nesterov Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 67a0a4c07060..69a0f0b77795 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -261,7 +261,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, break; } io_schedule(); - has_sleeper = false; + has_sleeper = true; } while (1); finish_wait(&rqw->wait, &data.wq); } -- cgit v1.2.3-55-g7522 From d14a9b389a86a5154b704bc88ce8dd37c701456a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 16 Jul 2019 16:19:28 -0400 Subject: rq-qos: set ourself TASK_UNINTERRUPTIBLE after we schedule In case we get a spurious wakeup we need to make sure to re-set ourselves to TASK_UNINTERRUPTIBLE so we don't busy wait. Reviewed-by: Oleg Nesterov Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 69a0f0b77795..c450b8952eae 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -262,6 +262,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, } io_schedule(); has_sleeper = true; + set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(&rqw->wait, &data.wq); } -- cgit v1.2.3-55-g7522 From ac38297f7038cd5b80d66f8809c7bbf5b70031f3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 16 Jul 2019 16:19:29 -0400 Subject: rq-qos: use a mb for got_token Oleg noticed that our checking of data.got_token is unsafe in the cleanup case, and should really use a memory barrier. Use a wmb on the write side, and a rmb() on the read side. We don't need one in the main loop since we're saved by set_current_state(). Reviewed-by: Oleg Nesterov Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index c450b8952eae..3954c0dc1443 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -202,6 +202,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr, return -1; data->got_token = true; + smp_wmb(); list_del_init(&curr->entry); wake_up_process(data->task); return 1; @@ -246,6 +247,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); has_sleeper = !wq_has_single_sleeper(&rqw->wait); do { + /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { @@ -256,6 +258,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. */ + smp_rmb(); if (data.got_token) cleanup_cb(rqw, private_data); break; -- cgit v1.2.3-55-g7522 From 893a1c97205a3ece0cbb3f571a3b972080f3b4c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jul 2019 13:55:23 -0600 Subject: blk-mq: allow REQ_NOWAIT to return an error inline By default, if a caller sets REQ_NOWAIT and we need to block, we'll return -EAGAIN through the bio->bi_end_io() callback. For some use cases, this makes it hard to use. Allow a caller to ask for inline return of errors related to blocking by also setting REQ_NOWAIT_INLINE. 
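A submitter that wants the error back inline instead of through ->bi_end_io() would then do something along the lines of the sketch below (illustrative only, not taken from an in-tree caller; the retry policy is made up for the example):

	blk_qc_t cookie;

	bio->bi_opf |= REQ_NOWAIT | REQ_NOWAIT_INLINE;
	cookie = submit_bio(bio);
	if (cookie == BLK_QC_T_EAGAIN) {
		/*
		 * The bio was not issued and no ->bi_end_io() completion is
		 * coming: clear the flags and resubmit later from a context
		 * that is allowed to block, or hand -EAGAIN to the caller.
		 */
		bio->bi_opf &= ~(REQ_NOWAIT | REQ_NOWAIT_INLINE);
		return -EAGAIN;
	}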
Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++-- include/linux/blk_types.h | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'block')
diff --git a/block/blk-mq.c b/block/blk-mq.c index b038ec680e84..2bc2c0705660 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1960,9 +1960,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) + + cookie = BLK_QC_T_NONE; + if (bio->bi_opf & REQ_NOWAIT_INLINE) + cookie = BLK_QC_T_EAGAIN; + else if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); - return BLK_QC_T_NONE; + return cookie; } trace_block_getrq(q, bio, bio->bi_opf);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index feff3fe4467e..1b1fa1557e68 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,6 +311,7 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ + __REQ_NOWAIT_INLINE, /* Return would-block error inline */ /* * When a shared kthread needs to issue a bio for a cgroup, doing * so synchronously can lead to priority inversions as the kthread @@ -345,6 +346,7 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) +#define REQ_NOWAIT_INLINE (1ULL << __REQ_NOWAIT_INLINE) #define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) @@ -418,12 +420,13 @@ static inline int op_stat_group(unsigned int op) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U +#define BLK_QC_T_EAGAIN -2U #define BLK_QC_T_SHIFT 16 #define BLK_QC_T_INTERNAL (1U << 31) static inline bool blk_qc_t_valid(blk_qc_t cookie) { - return cookie != BLK_QC_T_NONE; + return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN; } static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie) -- cgit v1.2.3-55-g7522
From 327fe1d42b83f8a06b33ba30159582b49af5fc8e Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Tue, 23 Jul 2019 00:27:41 -0300 Subject: block: blk-mq: Remove blk_mq_sched_started_request and started_request blk_mq_sched_started_request is a function that checks if the elevator related to the request has started_request implemented, but currently, none of the available IO schedulers implement started_request, so remove both.
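For reference, a scheduler that wanted this hook would have had to wire it up through its elevator_mq_ops, roughly as in the hypothetical sketch below (the foo_* names are made up; no in-tree elevator does this, which is why the helper is dead code):

	static void foo_started_request(struct request *rq)
	{
		/* would run from blk_mq_start_request() for every request */
	}

	static struct elevator_type foo_sched = {
		.ops = {
			/* a real scheduler also fills in insert/dispatch hooks */
			.started_request = foo_started_request,
		},
		.elevator_name = "foo-sched",
	};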
Signed-off-by: Marcos Paulo de Souza Signed-off-by: Jens Axboe --- block/blk-mq-sched.h | 9 --------- block/blk-mq.c | 2 -- include/linux/elevator.h | 1 - 3 files changed, 12 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index cf22ab00fefb..126021fc3a11 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -61,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) e->type->ops.completed_request(rq, now); } -static inline void blk_mq_sched_started_request(struct request *rq) -{ - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.started_request) - e->type->ops.started_request(rq); -} - static inline void blk_mq_sched_requeue_request(struct request *rq) { struct request_queue *q = rq->q; diff --git a/block/blk-mq.c b/block/blk-mq.c index 2bc2c0705660..f78d3287dd82 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -669,8 +669,6 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - blk_mq_sched_started_request(rq); - trace_block_rq_issue(q, rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 17cd0078377c..1dd014c9c87b 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -45,7 +45,6 @@ struct elevator_mq_ops { struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); void (*completed_request)(struct request *, u64); - void (*started_request)(struct request *); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); struct request *(*next_request)(struct request_queue *, struct request *); -- cgit v1.2.3-55-g7522