summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig2
-rw-r--r--block/bfq-iosched.c2
-rw-r--r--block/blk-cgroup.c66
-rw-r--r--block/blk-core.c6
-rw-r--r--block/blk-mq-debugfs.c7
-rw-r--r--block/blk-mq.c2
-rw-r--r--block/blk-mq.h32
-rw-r--r--block/blk-throttle.c9
-rw-r--r--block/blk-zoned.c2
9 files changed, 112 insertions, 16 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 2466dcc3ef1d..56cb1695cd87 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -89,7 +89,7 @@ config BLK_DEV_THROTTLING
one needs to mount and use blkio cgroup controller for creating
cgroups and specifying per device IO rate policies.
- See Documentation/cgroup-v1/blkio-controller.txt for more information.
+ See Documentation/cgroup-v1/blkio-controller.rst for more information.
config BLK_DEV_THROTTLING_LOW
bool "Block throttling .low limit interface support (EXPERIMENTAL)"
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 06c9b00507b6..50c9d2598500 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -241,7 +241,7 @@ static struct kmem_cache *bfq_pool;
* containing only random (seeky) I/O are prevented from being tagged
* as soft real-time.
*/
-#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history & -1)
+#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history == -1)
/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES 32
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 53b7bd4c7000..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,7 @@
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -47,12 +48,14 @@ struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
@@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+ WARN_ON(!bio_list_empty(&blkg->async_bios));
+
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
if (blkg->parent)
@@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release);
}
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+ async_bio_work);
+ struct bio_list bios = BIO_EMPTY_LIST;
+ struct bio *bio;
+
+ /* as long as there are pending bios, @blkg can't go away */
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_merge(&bios, &blkg->async_bios);
+ bio_list_init(&blkg->async_bios);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
+ while ((bio = bio_list_pop(&bios)))
+ submit_bio(bio);
+}
+
/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
@@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
+ spin_lock_init(&blkg->async_bio_lock);
+ bio_list_init(&blkg->async_bios);
+ INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;
for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1526,6 +1551,25 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+
+ /* consume the flag first */
+ bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+ /* never bounce for the root cgroup */
+ if (!blkg->parent)
+ return false;
+
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_add(&blkg->async_bios, bio);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
+ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+ return true;
+}
+
/*
* Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a
@@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
*/
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
+ unsigned long pflags;
u64 now = ktime_to_ns(ktime_get());
u64 exp;
u64 delay_nsec = 0;
@@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
*/
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
- /*
- * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
- * that hasn't landed upstream yet. Once that stuff is in place we need
- * to do a psi_memstall_enter/leave if memdelay is set.
- */
+ if (use_memdelay)
+ psi_memstall_enter(&pflags);
exp = ktime_add_ns(now, delay_nsec);
tok = io_schedule_prepare();
@@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
break;
} while (!fatal_signal_pending(current));
io_schedule_finish(tok);
+
+ if (use_memdelay)
+ psi_memstall_leave(&pflags);
}
/**
@@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}
+static int __init blkcg_init(void)
+{
+ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!blkcg_punt_bio_wq)
+ return -ENOMEM;
+ return 0;
+}
+subsys_initcall(blkcg_init);
+
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index 5d1fc8e17dd1..d0cc6e14d2f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->internal_tag = -1;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
+ refcount_set(&rq->ref, 1);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
struct request *rq;
struct list_head *plug_list;
- plug = current->plug;
+ plug = blk_mq_plug(q, bio);
if (!plug)
return false;
@@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
*/
blk_qc_t submit_bio(struct bio *bio)
{
+ if (blkcg_punt_bio_submit(bio))
+ return BLK_QC_T_NONE;
+
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 748164f4e8b1..b3f2ba483992 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -918,6 +918,13 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
{
struct elevator_type *e = q->elevator->type;
+ /*
+ * If the parent directory has not been created yet, return, we will be
+ * called again later on and the directory/files will be created then.
+ */
+ if (!q->debugfs_dir)
+ return;
+
if (!e->queue_debugfs_attrs)
return;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e5ef40c603ca..b038ec680e84 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_bio_to_request(rq, bio, nr_segs);
- plug = current->plug;
+ plug = blk_mq_plug(q, bio);
if (unlikely(is_flush_fua)) {
/* bypass scheduler for flush rq */
blk_insert_flush(rq);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4bf5161333e..32c62c64e6c2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
+/*
+ * blk_mq_plug() - Get caller context plug
+ * @q: request queue
+ * @bio : the bio being submitted by the caller context
+ *
+ * Plugging, by design, may delay the insertion of BIOs into the elevator in
+ * order to increase BIO merging opportunities. This however can cause BIO
+ * insertion order to change from the order in which submit_bio() is being
+ * executed in the case of multiple contexts concurrently issuing BIOs to a
+ * device, even if these context are synchronized to tightly control BIO issuing
+ * order. While this is not a problem with regular block devices, this ordering
+ * change can cause write BIO failures with zoned block devices as these
+ * require sequential write patterns to zones. Prevent this from happening by
+ * ignoring the plug state of a BIO issuing context if the target request queue
+ * is for a zoned block device and the BIO to plug is a write operation.
+ *
+ * Return current->plug if the bio can be plugged and NULL otherwise
+ */
+static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
+ struct bio *bio)
+{
+ /*
+ * For regular block devices or read operations, use the context plug
+ * which may be NULL if blk_start_plug() was not executed.
+ */
+ if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
+ return current->plug;
+
+ /* Zoned block device write operation case: do not plug the BIO */
+ return NULL;
+}
+
#endif
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9ea7c0ecad10..8ab6c8153223 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
- jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
-
- /* Slice has just started. Consider one slice interval */
- if (!jiffy_elapsed)
- jiffy_elapsed_rnd = tg->td->throtl_slice;
+ jiffy_elapsed = jiffies - tg->slice_start[rw];
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+ /* Round up to the next throttle slice, wait time must be nonzero */
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..3249738242b4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
sector_t nr_sectors)
{
- unsigned long zone_sectors = blk_queue_zone_sectors(q);
+ sector_t zone_sectors = blk_queue_zone_sectors(q);
return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
}