diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 2 | ||||
-rw-r--r-- | block/bfq-iosched.c | 2 | ||||
-rw-r--r-- | block/blk-cgroup.c | 66 | ||||
-rw-r--r-- | block/blk-core.c | 6 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 7 | ||||
-rw-r--r-- | block/blk-mq.c | 2 | ||||
-rw-r--r-- | block/blk-mq.h | 32 | ||||
-rw-r--r-- | block/blk-throttle.c | 9 | ||||
-rw-r--r-- | block/blk-zoned.c | 2 |
9 files changed, 112 insertions, 16 deletions
diff --git a/block/Kconfig b/block/Kconfig index 2466dcc3ef1d..56cb1695cd87 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -89,7 +89,7 @@ config BLK_DEV_THROTTLING one needs to mount and use blkio cgroup controller for creating cgroups and specifying per device IO rate policies. - See Documentation/cgroup-v1/blkio-controller.txt for more information. + See Documentation/cgroup-v1/blkio-controller.rst for more information. config BLK_DEV_THROTTLING_LOW bool "Block throttling .low limit interface support (EXPERIMENTAL)" diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 06c9b00507b6..50c9d2598500 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -241,7 +241,7 @@ static struct kmem_cache *bfq_pool; * containing only random (seeky) I/O are prevented from being tagged * as soft real-time. */ -#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history & -1) +#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history == -1) /* Min number of samples required to perform peak-rate update */ #define BFQ_RATE_MIN_SAMPLES 32 diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 53b7bd4c7000..24ed26957367 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -29,6 +29,7 @@ #include <linux/ctype.h> #include <linux/blk-cgroup.h> #include <linux/tracehook.h> +#include <linux/psi.h> #include "blk.h" #define MAX_KEY_LEN 100 @@ -47,12 +48,14 @@ struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; +EXPORT_SYMBOL_GPL(blkcg_root_css); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ static bool blkcg_debug_stats = false; +static struct workqueue_struct *blkcg_punt_bio_wq; static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) @@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + WARN_ON(!bio_list_empty(&blkg->async_bios)); + /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); if (blkg->parent) @@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref) call_rcu(&blkg->rcu_head, __blkg_release); } +static void blkg_async_bio_workfn(struct work_struct *work) +{ + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, + async_bio_work); + struct bio_list bios = BIO_EMPTY_LIST; + struct bio *bio; + + /* as long as there are pending bios, @blkg can't go away */ + spin_lock_bh(&blkg->async_bio_lock); + bio_list_merge(&bios, &blkg->async_bios); + bio_list_init(&blkg->async_bios); + spin_unlock_bh(&blkg->async_bio_lock); + + while ((bio = bio_list_pop(&bios))) + submit_bio(bio); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); + spin_lock_init(&blkg->async_bio_lock); + bio_list_init(&blkg->async_bios); + INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); blkg->blkcg = blkcg; for (i = 0; i < BLKCG_MAX_POLS; i++) { @@ -1526,6 +1551,25 @@ out_unlock: } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); +bool __blkcg_punt_bio_submit(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + /* consume the flag first */ + bio->bi_opf &= ~REQ_CGROUP_PUNT; + + /* never bounce for the root cgroup */ + if (!blkg->parent) + return false; + + spin_lock_bh(&blkg->async_bio_lock); + bio_list_add(&blkg->async_bios, bio); + spin_unlock_bh(&blkg->async_bio_lock); + + queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); + return true; +} + /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a @@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { + unsigned long pflags; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; @@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) */ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); - /* - * TODO: the use_memdelay flag is going to be for the upcoming psi stuff - * that hasn't landed upstream yet. Once that stuff is in place we need - * to do a psi_memstall_enter/leave if memdelay is set. - */ + if (use_memdelay) + psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); @@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); + + if (use_memdelay) + psi_memstall_leave(&pflags); } /** @@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) atomic64_add(delta, &blkg->delay_nsec); } +static int __init blkcg_init(void) +{ + blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", + WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!blkcg_punt_bio_wq) + return -ENOMEM; + return 0; +} +subsys_initcall(blkcg_init); + module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/block/blk-core.c b/block/blk-core.c index 5d1fc8e17dd1..d0cc6e14d2f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->internal_tag = -1; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; + refcount_set(&rq->ref, 1); } EXPORT_SYMBOL(blk_rq_init); @@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, struct request *rq; struct list_head *plug_list; - plug = current->plug; + plug = blk_mq_plug(q, bio); if (!plug) return false; @@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request); */ blk_qc_t submit_bio(struct bio *bio) { + if (blkcg_punt_bio_submit(bio)) + return BLK_QC_T_NONE; + /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 748164f4e8b1..b3f2ba483992 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -918,6 +918,13 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; + /* + * If the parent directory has not been created yet, return, we will be + * called again later on and the directory/files will be created then. + */ + if (!q->debugfs_dir) + return; + if (!e->queue_debugfs_attrs) return; diff --git a/block/blk-mq.c b/block/blk-mq.c index e5ef40c603ca..b038ec680e84 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); - plug = current->plug; + plug = blk_mq_plug(q, bio); if (unlikely(is_flush_fua)) { /* bypass scheduler for flush rq */ blk_insert_flush(rq); diff --git a/block/blk-mq.h b/block/blk-mq.h index f4bf5161333e..32c62c64e6c2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) qmap->mq_map[cpu] = 0; } +/* + * blk_mq_plug() - Get caller context plug + * @q: request queue + * @bio : the bio being submitted by the caller context + * + * Plugging, by design, may delay the insertion of BIOs into the elevator in + * order to increase BIO merging opportunities. This however can cause BIO + * insertion order to change from the order in which submit_bio() is being + * executed in the case of multiple contexts concurrently issuing BIOs to a + * device, even if these context are synchronized to tightly control BIO issuing + * order. While this is not a problem with regular block devices, this ordering + * change can cause write BIO failures with zoned block devices as these + * require sequential write patterns to zones. Prevent this from happening by + * ignoring the plug state of a BIO issuing context if the target request queue + * is for a zoned block device and the BIO to plug is a write operation. + * + * Return current->plug if the bio can be plugged and NULL otherwise + */ +static inline struct blk_plug *blk_mq_plug(struct request_queue *q, + struct bio *bio) +{ + /* + * For regular block devices or read operations, use the context plug + * which may be NULL if blk_start_plug() was not executed. + */ + if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio))) + return current->plug; + + /* Zoned block device write operation case: do not plug the BIO */ + return NULL; +} + #endif diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9ea7c0ecad10..8ab6c8153223 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; - jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; - - /* Slice has just started. Consider one slice interval */ - if (!jiffy_elapsed) - jiffy_elapsed_rnd = tg->td->throtl_slice; + jiffy_elapsed = jiffies - tg->slice_start[rw]; - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); + /* Round up to the next throttle slice, wait time must be nonzero */ + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); /* * jiffy_elapsed_rnd should not be a big value as minimum iops can be diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ae7e91bd0618..3249738242b4 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); static inline unsigned int __blkdev_nr_zones(struct request_queue *q, sector_t nr_sectors) { - unsigned long zone_sectors = blk_queue_zone_sectors(q); + sector_t zone_sectors = blk_queue_zone_sectors(q); return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors); } |