From 46cd1e8a4752379b1b9d24d43d7be7d5aba03e76 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Mon, 26 Oct 2020 17:58:27 +0100 Subject: qcow2: Skip copy-on-write when allocating a zero cluster Since commit c8bb23cbdbe32f5c326365e0a82e1b0e68cdcd8a when a write request results in a new allocation QEMU first tries to see if the rest of the cluster outside the written area contains only zeroes. In that case, instead of doing a normal copy-on-write operation and writing explicit zero buffers to disk, the code zeroes the whole cluster efficiently using pwrite_zeroes() with BDRV_REQ_NO_FALLBACK. This improves performance very significantly but it only happens when we are writing to an area that was completely unallocated before. Zero clusters (QCOW2_CLUSTER_ZERO_*) are treated like normal clusters and are therefore slower to allocate. This happens because the code uses bdrv_is_allocated_above() rather bdrv_block_status_above(). The former is not as accurate for this purpose but it is faster. However in the case of qcow2 the underlying call does already report zero clusters just fine so there is no reason why we cannot use that information. After testing 4KB writes on an image that only contains zero clusters this patch results in almost five times more IOPS. Signed-off-by: Alberto Garcia Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <6d77cab968c501c44d6e1089b9bc91b04170b49e.1603731354.git.berto@igalia.com> Signed-off-by: Kevin Wolf --- include/block/block.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/block/block.h b/include/block/block.h index d16c401cb4..c9d7c58765 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -508,6 +508,8 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, bool include_base, int64_t offset, int64_t bytes, int64_t *pnum); +int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, + int64_t bytes); bool bdrv_is_read_only(BlockDriverState *bs); int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only, -- cgit v1.2.3-55-g7522 From 1a6d3bd229d429879a85a9105fb84cae049d083c Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Fri, 23 Oct 2020 17:01:10 +0200 Subject: block: End quiescent sections when a BDS is deleted If a BDS gets deleted during blk_drain_all(), it might miss a call to bdrv_do_drained_end(). This means missing a call to aio_enable_external() and the AIO context remains disabled for ever. This can cause a device to become irresponsive and to disrupt the guest execution, ie. hang, loop forever or worse. This scenario is quite easy to encounter with virtio-scsi on POWER when punching multiple blockdev-create QMP commands while the guest is booting and it is still running the SLOF firmware. This happens because SLOF disables/re-enables PCI devices multiple times via IO/MEM/MASTER bits of PCI_COMMAND register after the initial probe/feature negotiation, as it tends to work with a single device at a time at various stages like probing and running block/network bootloaders without doing a full reset in-between. This naturally generates many dataplane stops and starts, and thus many drain sections that can race with blockdev_create_run(). In the end, SLOF bails out. It is somehow reproducible on x86 but it requires to generate articial dataplane start/stop activity with stop/cont QMP commands. In this case, seabios ends up looping for ever, waiting for the virtio-scsi device to send a response to a command it never received. Add a helper that pairs all previously called bdrv_do_drained_begin() with a bdrv_do_drained_end() and call it from bdrv_close(). While at it, update the "/bdrv-drain/graph-change/drain_all" test in test-bdrv-drain so that it can catch the issue. BugId: https://bugzilla.redhat.com/show_bug.cgi?id=1874441 Signed-off-by: Greg Kurz Message-Id: <160346526998.272601.9045392804399803158.stgit@bahia.lan> Signed-off-by: Kevin Wolf --- block.c | 9 +++++++++ block/io.c | 13 +++++++++++++ include/block/block.h | 6 ++++++ tests/test-bdrv-drain.c | 1 + 4 files changed, 29 insertions(+) (limited to 'include') diff --git a/block.c b/block.c index 430edf79bb..ee5b28a979 100644 --- a/block.c +++ b/block.c @@ -4458,6 +4458,15 @@ static void bdrv_close(BlockDriverState *bs) } QLIST_INIT(&bs->aio_notifiers); bdrv_drained_end(bs); + + /* + * If we're still inside some bdrv_drain_all_begin()/end() sections, end + * them now since this BDS won't exist anymore when bdrv_drain_all_end() + * gets called. + */ + if (bs->quiesce_counter) { + bdrv_drain_all_end_quiesce(bs); + } } void bdrv_close_all(void) diff --git a/block/io.c b/block/io.c index c33cecd58d..9918f2499c 100644 --- a/block/io.c +++ b/block/io.c @@ -633,6 +633,19 @@ void bdrv_drain_all_begin(void) } } +void bdrv_drain_all_end_quiesce(BlockDriverState *bs) +{ + int drained_end_counter = 0; + + g_assert(bs->quiesce_counter > 0); + g_assert(!bs->refcnt); + + while (bs->quiesce_counter) { + bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter); + } + BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0); +} + void bdrv_drain_all_end(void) { BlockDriverState *bs = NULL; diff --git a/include/block/block.h b/include/block/block.h index c9d7c58765..4bfe3b546b 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -781,6 +781,12 @@ void bdrv_drained_end(BlockDriverState *bs); */ void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter); +/** + * End all quiescent sections started by bdrv_drain_all_begin(). This is + * only needed when deleting a BDS before bdrv_drain_all_end() is called. + */ +void bdrv_drain_all_end_quiesce(BlockDriverState *bs); + /** * End a quiescent section started by bdrv_subtree_drained_begin(). */ diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c index 1595bbc92e..8a29e33e00 100644 --- a/tests/test-bdrv-drain.c +++ b/tests/test-bdrv-drain.c @@ -594,6 +594,7 @@ static void test_graph_change_drain_all(void) g_assert_cmpint(bs_b->quiesce_counter, ==, 0); g_assert_cmpint(b_s->drain_count, ==, 0); + g_assert_cmpint(qemu_get_aio_context()->external_disable_cnt, ==, 0); bdrv_unref(bs_b); blk_unref(blk_b); -- cgit v1.2.3-55-g7522