summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorStefan Hajnoczi2017-05-12 15:29:49 +0200
committerStefan Hajnoczi2017-05-12 16:39:08 +0200
commit3753e255da8f6a654c3e7b650a2d27734bec15f9 (patch)
treee2cb28dba28a48ecb3c9912900f74c861620d03a /block
parentmaintainers: Add myself as linux-user reviewer (diff)
parentMerge remote-tracking branch 'mreitz/tags/pull-block-2017-05-11' into queue-b... (diff)
downloadqemu-3753e255da8f6a654c3e7b650a2d27734bec15f9.tar.gz
qemu-3753e255da8f6a654c3e7b650a2d27734bec15f9.tar.xz
qemu-3753e255da8f6a654c3e7b650a2d27734bec15f9.zip
Merge remote-tracking branch 'kwolf/tags/for-upstream' into staging
Block layer patches # gpg: Signature made Thu 11 May 2017 10:31:37 AM EDT # gpg: using RSA key 0x7F09B272C88F2FD6 # gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" # Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6 * kwolf/tags/for-upstream: (58 commits) MAINTAINERS: Add qemu-progress to the block layer qcow2: Discard/zero clusters by byte count qcow2: Assert that cluster operations are aligned qcow2: Optimize write zero of unaligned tail cluster iotests: Add test 179 to cover write zeroes with unmap iotests: Improve _filter_qemu_img_map qcow2: Optimize zero_single_l2() to minimize L2 churn qcow2: Make distinction between zero cluster types obvious qcow2: Name typedef for cluster type qcow2: Correctly report status of preallocated zero clusters block: Update comments on BDRV_BLOCK_* meanings qcow2: Use consistent switch indentation qcow2: Nicer variable names in qcow2_update_snapshot_refcount() tests: Add coverage for recent block geometry fixes blkdebug: Add ability to override unmap geometries blkdebug: Simplify override logic blkdebug: Add pass-through write_zero and discard support blkdebug: Refactor error injection blkdebug: Sanity check block layer guarantees qemu-io: Switch 'map' output to byte-based reporting ... Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Diffstat (limited to 'block')
-rw-r--r--block/blkdebug.c264
-rw-r--r--block/block-backend.c81
-rw-r--r--block/file-posix.c248
-rw-r--r--block/file-win32.c5
-rw-r--r--block/qcow2-cluster.c252
-rw-r--r--block/qcow2-refcount.c148
-rw-r--r--block/qcow2-snapshot.c7
-rw-r--r--block/qcow2.c47
-rw-r--r--block/qcow2.h26
9 files changed, 796 insertions, 282 deletions
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 3c088934db..a5196e889d 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -1,6 +1,7 @@
/*
* Block protocol for I/O error injection
*
+ * Copyright (C) 2016-2017 Red Hat, Inc.
* Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -37,7 +38,12 @@
typedef struct BDRVBlkdebugState {
int state;
int new_state;
- int align;
+ uint64_t align;
+ uint64_t max_transfer;
+ uint64_t opt_write_zero;
+ uint64_t max_write_zero;
+ uint64_t opt_discard;
+ uint64_t max_discard;
/* For blkdebug_refresh_filename() */
char *config_file;
@@ -342,6 +348,31 @@ static QemuOptsList runtime_opts = {
.type = QEMU_OPT_SIZE,
.help = "Required alignment in bytes",
},
+ {
+ .name = "max-transfer",
+ .type = QEMU_OPT_SIZE,
+ .help = "Maximum transfer size in bytes",
+ },
+ {
+ .name = "opt-write-zero",
+ .type = QEMU_OPT_SIZE,
+ .help = "Optimum write zero alignment in bytes",
+ },
+ {
+ .name = "max-write-zero",
+ .type = QEMU_OPT_SIZE,
+ .help = "Maximum write zero size in bytes",
+ },
+ {
+ .name = "opt-discard",
+ .type = QEMU_OPT_SIZE,
+ .help = "Optimum discard alignment in bytes",
+ },
+ {
+ .name = "max-discard",
+ .type = QEMU_OPT_SIZE,
+ .help = "Maximum discard size in bytes",
+ },
{ /* end of list */ }
},
};
@@ -352,8 +383,8 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
BDRVBlkdebugState *s = bs->opaque;
QemuOpts *opts;
Error *local_err = NULL;
- uint64_t align;
int ret;
+ uint64_t align;
opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -382,19 +413,69 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags,
goto out;
}
- /* Set request alignment */
- align = qemu_opt_get_size(opts, "align", 0);
- if (align < INT_MAX && is_power_of_2(align)) {
- s->align = align;
- } else if (align) {
- error_setg(errp, "Invalid alignment");
- ret = -EINVAL;
+ bs->supported_write_flags = BDRV_REQ_FUA &
+ bs->file->bs->supported_write_flags;
+ bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+ bs->file->bs->supported_zero_flags;
+ ret = -EINVAL;
+
+ /* Set alignment overrides */
+ s->align = qemu_opt_get_size(opts, "align", 0);
+ if (s->align && (s->align >= INT_MAX || !is_power_of_2(s->align))) {
+ error_setg(errp, "Cannot meet constraints with align %" PRIu64,
+ s->align);
goto out;
}
+ align = MAX(s->align, bs->file->bs->bl.request_alignment);
- ret = 0;
- goto out;
+ s->max_transfer = qemu_opt_get_size(opts, "max-transfer", 0);
+ if (s->max_transfer &&
+ (s->max_transfer >= INT_MAX ||
+ !QEMU_IS_ALIGNED(s->max_transfer, align))) {
+ error_setg(errp, "Cannot meet constraints with max-transfer %" PRIu64,
+ s->max_transfer);
+ goto out;
+ }
+
+ s->opt_write_zero = qemu_opt_get_size(opts, "opt-write-zero", 0);
+ if (s->opt_write_zero &&
+ (s->opt_write_zero >= INT_MAX ||
+ !QEMU_IS_ALIGNED(s->opt_write_zero, align))) {
+ error_setg(errp, "Cannot meet constraints with opt-write-zero %" PRIu64,
+ s->opt_write_zero);
+ goto out;
+ }
+
+ s->max_write_zero = qemu_opt_get_size(opts, "max-write-zero", 0);
+ if (s->max_write_zero &&
+ (s->max_write_zero >= INT_MAX ||
+ !QEMU_IS_ALIGNED(s->max_write_zero,
+ MAX(s->opt_write_zero, align)))) {
+ error_setg(errp, "Cannot meet constraints with max-write-zero %" PRIu64,
+ s->max_write_zero);
+ goto out;
+ }
+
+ s->opt_discard = qemu_opt_get_size(opts, "opt-discard", 0);
+ if (s->opt_discard &&
+ (s->opt_discard >= INT_MAX ||
+ !QEMU_IS_ALIGNED(s->opt_discard, align))) {
+ error_setg(errp, "Cannot meet constraints with opt-discard %" PRIu64,
+ s->opt_discard);
+ goto out;
+ }
+
+ s->max_discard = qemu_opt_get_size(opts, "max-discard", 0);
+ if (s->max_discard &&
+ (s->max_discard >= INT_MAX ||
+ !QEMU_IS_ALIGNED(s->max_discard,
+ MAX(s->opt_discard, align)))) {
+ error_setg(errp, "Cannot meet constraints with max-discard %" PRIu64,
+ s->max_discard);
+ goto out;
+ }
+ ret = 0;
out:
if (ret < 0) {
g_free(s->config_file);
@@ -403,11 +484,30 @@ out:
return ret;
}
-static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
+static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes)
{
BDRVBlkdebugState *s = bs->opaque;
- int error = rule->options.inject.error;
- bool immediately = rule->options.inject.immediately;
+ BlkdebugRule *rule = NULL;
+ int error;
+ bool immediately;
+
+ QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+ uint64_t inject_offset = rule->options.inject.offset;
+
+ if (inject_offset == -1 ||
+ (bytes && inject_offset >= offset &&
+ inject_offset < offset + bytes))
+ {
+ break;
+ }
+ }
+
+ if (!rule || !rule->options.inject.error) {
+ return 0;
+ }
+
+ immediately = rule->options.inject.immediately;
+ error = rule->options.inject.error;
if (rule->options.inject.once) {
QSIMPLEQ_REMOVE(&s->active_rules, rule, BlkdebugRule, active_next);
@@ -426,21 +526,18 @@ static int coroutine_fn
blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
- BDRVBlkdebugState *s = bs->opaque;
- BlkdebugRule *rule = NULL;
-
- QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
- uint64_t inject_offset = rule->options.inject.offset;
+ int err;
- if (inject_offset == -1 ||
- (inject_offset >= offset && inject_offset < offset + bytes))
- {
- break;
- }
+ /* Sanity check block layer guarantees */
+ assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+ assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment));
+ if (bs->bl.max_transfer) {
+ assert(bytes <= bs->bl.max_transfer);
}
- if (rule && rule->options.inject.error) {
- return inject_error(bs, rule);
+ err = rule_check(bs, offset, bytes);
+ if (err) {
+ return err;
}
return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
@@ -450,21 +547,18 @@ static int coroutine_fn
blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
{
- BDRVBlkdebugState *s = bs->opaque;
- BlkdebugRule *rule = NULL;
+ int err;
- QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
- uint64_t inject_offset = rule->options.inject.offset;
-
- if (inject_offset == -1 ||
- (inject_offset >= offset && inject_offset < offset + bytes))
- {
- break;
- }
+ /* Sanity check block layer guarantees */
+ assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+ assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment));
+ if (bs->bl.max_transfer) {
+ assert(bytes <= bs->bl.max_transfer);
}
- if (rule && rule->options.inject.error) {
- return inject_error(bs, rule);
+ err = rule_check(bs, offset, bytes);
+ if (err) {
+ return err;
}
return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
@@ -472,22 +566,81 @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
static int blkdebug_co_flush(BlockDriverState *bs)
{
- BDRVBlkdebugState *s = bs->opaque;
- BlkdebugRule *rule = NULL;
+ int err = rule_check(bs, 0, 0);
- QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
- if (rule->options.inject.offset == -1) {
- break;
- }
+ if (err) {
+ return err;
}
- if (rule && rule->options.inject.error) {
- return inject_error(bs, rule);
+ return bdrv_co_flush(bs->file->bs);
+}
+
+static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
+ int64_t offset, int count,
+ BdrvRequestFlags flags)
+{
+ uint32_t align = MAX(bs->bl.request_alignment,
+ bs->bl.pwrite_zeroes_alignment);
+ int err;
+
+ /* Only pass through requests that are larger than requested
+ * preferred alignment (so that we test the fallback to writes on
+ * unaligned portions), and check that the block layer never hands
+ * us anything unaligned that crosses an alignment boundary. */
+ if (count < align) {
+ assert(QEMU_IS_ALIGNED(offset, align) ||
+ QEMU_IS_ALIGNED(offset + count, align) ||
+ DIV_ROUND_UP(offset, align) ==
+ DIV_ROUND_UP(offset + count, align));
+ return -ENOTSUP;
+ }
+ assert(QEMU_IS_ALIGNED(offset, align));
+ assert(QEMU_IS_ALIGNED(count, align));
+ if (bs->bl.max_pwrite_zeroes) {
+ assert(count <= bs->bl.max_pwrite_zeroes);
}
- return bdrv_co_flush(bs->file->bs);
+ err = rule_check(bs, offset, count);
+ if (err) {
+ return err;
+ }
+
+ return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
}
+static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
+ int64_t offset, int count)
+{
+ uint32_t align = bs->bl.pdiscard_alignment;
+ int err;
+
+ /* Only pass through requests that are larger than requested
+ * minimum alignment, and ensure that unaligned requests do not
+ * cross optimum discard boundaries. */
+ if (count < bs->bl.request_alignment) {
+ assert(QEMU_IS_ALIGNED(offset, align) ||
+ QEMU_IS_ALIGNED(offset + count, align) ||
+ DIV_ROUND_UP(offset, align) ==
+ DIV_ROUND_UP(offset + count, align));
+ return -ENOTSUP;
+ }
+ assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+ assert(QEMU_IS_ALIGNED(count, bs->bl.request_alignment));
+ if (align && count >= align) {
+ assert(QEMU_IS_ALIGNED(offset, align));
+ assert(QEMU_IS_ALIGNED(count, align));
+ }
+ if (bs->bl.max_pdiscard) {
+ assert(count <= bs->bl.max_pdiscard);
+ }
+
+ err = rule_check(bs, offset, count);
+ if (err) {
+ return err;
+ }
+
+ return bdrv_co_pdiscard(bs->file->bs, offset, count);
+}
static void blkdebug_close(BlockDriverState *bs)
{
@@ -715,6 +868,21 @@ static void blkdebug_refresh_limits(BlockDriverState *bs, Error **errp)
if (s->align) {
bs->bl.request_alignment = s->align;
}
+ if (s->max_transfer) {
+ bs->bl.max_transfer = s->max_transfer;
+ }
+ if (s->opt_write_zero) {
+ bs->bl.pwrite_zeroes_alignment = s->opt_write_zero;
+ }
+ if (s->max_write_zero) {
+ bs->bl.max_pwrite_zeroes = s->max_write_zero;
+ }
+ if (s->opt_discard) {
+ bs->bl.pdiscard_alignment = s->opt_discard;
+ }
+ if (s->max_discard) {
+ bs->bl.max_pdiscard = s->max_discard;
+ }
}
static int blkdebug_reopen_prepare(BDRVReopenState *reopen_state,
@@ -742,6 +910,8 @@ static BlockDriver bdrv_blkdebug = {
.bdrv_co_preadv = blkdebug_co_preadv,
.bdrv_co_pwritev = blkdebug_co_pwritev,
.bdrv_co_flush_to_disk = blkdebug_co_flush,
+ .bdrv_co_pwrite_zeroes = blkdebug_co_pwrite_zeroes,
+ .bdrv_co_pdiscard = blkdebug_co_pdiscard,
.bdrv_debug_event = blkdebug_debug_event,
.bdrv_debug_breakpoint = blkdebug_debug_breakpoint,
diff --git a/block/block-backend.c b/block/block-backend.c
index f5bf13eec9..f3a60081a7 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -130,6 +130,56 @@ static const char *blk_root_get_name(BdrvChild *child)
return blk_name(child->opaque);
}
+/*
+ * Notifies the user of the BlockBackend that migration has completed. qdev
+ * devices can tighten their permissions in response (specifically revoke
+ * shared write permissions that we needed for storage migration).
+ *
+ * If an error is returned, the VM cannot be allowed to be resumed.
+ */
+static void blk_root_activate(BdrvChild *child, Error **errp)
+{
+ BlockBackend *blk = child->opaque;
+ Error *local_err = NULL;
+
+ if (!blk->disable_perm) {
+ return;
+ }
+
+ blk->disable_perm = false;
+
+ blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ blk->disable_perm = true;
+ return;
+ }
+}
+
+static int blk_root_inactivate(BdrvChild *child)
+{
+ BlockBackend *blk = child->opaque;
+
+ if (blk->disable_perm) {
+ return 0;
+ }
+
+ /* Only inactivate BlockBackends for guest devices (which are inactive at
+ * this point because the VM is stopped) and unattached monitor-owned
+ * BlockBackends. If there is still any other user like a block job, then
+ * we simply can't inactivate the image. */
+ if (!blk->dev && !blk->name[0]) {
+ return -EPERM;
+ }
+
+ blk->disable_perm = true;
+ if (blk->root) {
+ bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
+ }
+
+ return 0;
+}
+
static const BdrvChildRole child_root = {
.inherit_options = blk_root_inherit_options,
@@ -140,6 +190,9 @@ static const BdrvChildRole child_root = {
.drained_begin = blk_root_drained_begin,
.drained_end = blk_root_drained_end,
+
+ .activate = blk_root_activate,
+ .inactivate = blk_root_inactivate,
};
/*
@@ -601,34 +654,6 @@ void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
*shared_perm = blk->shared_perm;
}
-/*
- * Notifies the user of all BlockBackends that migration has completed. qdev
- * devices can tighten their permissions in response (specifically revoke
- * shared write permissions that we needed for storage migration).
- *
- * If an error is returned, the VM cannot be allowed to be resumed.
- */
-void blk_resume_after_migration(Error **errp)
-{
- BlockBackend *blk;
- Error *local_err = NULL;
-
- for (blk = blk_all_next(NULL); blk; blk = blk_all_next(blk)) {
- if (!blk->disable_perm) {
- continue;
- }
-
- blk->disable_perm = false;
-
- blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
- blk->disable_perm = true;
- return;
- }
- }
-}
-
static int blk_do_attach_dev(BlockBackend *blk, void *dev)
{
if (blk->dev) {
diff --git a/block/file-posix.c b/block/file-posix.c
index 19c48a043e..4354d49642 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -129,12 +129,23 @@ do { \
#define MAX_BLOCKSIZE 4096
+/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes,
+ * leaving a few more bytes for its future use. */
+#define RAW_LOCK_PERM_BASE 100
+#define RAW_LOCK_SHARED_BASE 200
+
typedef struct BDRVRawState {
int fd;
+ int lock_fd;
+ bool use_lock;
int type;
int open_flags;
size_t buf_align;
+ /* The current permissions. */
+ uint64_t perm;
+ uint64_t shared_perm;
+
#ifdef CONFIG_XFS
bool is_xfs:1;
#endif
@@ -392,6 +403,11 @@ static QemuOptsList raw_runtime_opts = {
.type = QEMU_OPT_STRING,
.help = "host AIO implementation (threads, native)",
},
+ {
+ .name = "locking",
+ .type = QEMU_OPT_STRING,
+ .help = "file locking mode (on/off/auto, default: auto)",
+ },
{ /* end of list */ }
},
};
@@ -406,6 +422,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
BlockdevAioOptions aio, aio_default;
int fd, ret;
struct stat st;
+ OnOffAuto locking;
opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort);
qemu_opts_absorb_qdict(opts, options, &local_err);
@@ -435,6 +452,37 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE);
+ locking = qapi_enum_parse(OnOffAuto_lookup, qemu_opt_get(opts, "locking"),
+ ON_OFF_AUTO__MAX, ON_OFF_AUTO_AUTO, &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ ret = -EINVAL;
+ goto fail;
+ }
+ switch (locking) {
+ case ON_OFF_AUTO_ON:
+ s->use_lock = true;
+#ifndef F_OFD_SETLK
+ fprintf(stderr,
+ "File lock requested but OFD locking syscall is unavailable, "
+ "falling back to POSIX file locks.\n"
+ "Due to the implementation, locks can be lost unexpectedly.\n");
+#endif
+ break;
+ case ON_OFF_AUTO_OFF:
+ s->use_lock = false;
+ break;
+ case ON_OFF_AUTO_AUTO:
+#ifdef F_OFD_SETLK
+ s->use_lock = true;
+#else
+ s->use_lock = false;
+#endif
+ break;
+ default:
+ abort();
+ }
+
s->open_flags = open_flags;
raw_parse_flags(bdrv_flags, &s->open_flags);
@@ -450,6 +498,21 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
}
s->fd = fd;
+ s->lock_fd = -1;
+ if (s->use_lock) {
+ fd = qemu_open(filename, s->open_flags);
+ if (fd < 0) {
+ ret = -errno;
+ error_setg_errno(errp, errno, "Could not open '%s' for locking",
+ filename);
+ qemu_close(s->fd);
+ goto fail;
+ }
+ s->lock_fd = fd;
+ }
+ s->perm = 0;
+ s->shared_perm = BLK_PERM_ALL;
+
#ifdef CONFIG_LINUX_AIO
/* Currently Linux does AIO only for files opened with O_DIRECT */
if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) {
@@ -537,6 +600,161 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
return raw_open_common(bs, options, flags, 0, errp);
}
+typedef enum {
+ RAW_PL_PREPARE,
+ RAW_PL_COMMIT,
+ RAW_PL_ABORT,
+} RawPermLockOp;
+
+#define PERM_FOREACH(i) \
+ for ((i) = 0; (1ULL << (i)) <= BLK_PERM_ALL; i++)
+
+/* Lock bytes indicated by @perm_lock_bits and @shared_perm_lock_bits in the
+ * file; if @unlock == true, also unlock the unneeded bytes.
+ * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
+ */
+static int raw_apply_lock_bytes(BDRVRawState *s,
+ uint64_t perm_lock_bits,
+ uint64_t shared_perm_lock_bits,
+ bool unlock, Error **errp)
+{
+ int ret;
+ int i;
+
+ PERM_FOREACH(i) {
+ int off = RAW_LOCK_PERM_BASE + i;
+ if (perm_lock_bits & (1ULL << i)) {
+ ret = qemu_lock_fd(s->lock_fd, off, 1, false);
+ if (ret) {
+ error_setg(errp, "Failed to lock byte %d", off);
+ return ret;
+ }
+ } else if (unlock) {
+ ret = qemu_unlock_fd(s->lock_fd, off, 1);
+ if (ret) {
+ error_setg(errp, "Failed to unlock byte %d", off);
+ return ret;
+ }
+ }
+ }
+ PERM_FOREACH(i) {
+ int off = RAW_LOCK_SHARED_BASE + i;
+ if (shared_perm_lock_bits & (1ULL << i)) {
+ ret = qemu_lock_fd(s->lock_fd, off, 1, false);
+ if (ret) {
+ error_setg(errp, "Failed to lock byte %d", off);
+ return ret;
+ }
+ } else if (unlock) {
+ ret = qemu_unlock_fd(s->lock_fd, off, 1);
+ if (ret) {
+ error_setg(errp, "Failed to unlock byte %d", off);
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */
+static int raw_check_lock_bytes(BDRVRawState *s,
+ uint64_t perm, uint64_t shared_perm,
+ Error **errp)
+{
+ int ret;
+ int i;
+
+ PERM_FOREACH(i) {
+ int off = RAW_LOCK_SHARED_BASE + i;
+ uint64_t p = 1ULL << i;
+ if (perm & p) {
+ ret = qemu_lock_fd_test(s->lock_fd, off, 1, true);
+ if (ret) {
+ char *perm_name = bdrv_perm_names(p);
+ error_setg(errp,
+ "Failed to get \"%s\" lock",
+ perm_name);
+ g_free(perm_name);
+ error_append_hint(errp,
+ "Is another process using the image?\n");
+ return ret;
+ }
+ }
+ }
+ PERM_FOREACH(i) {
+ int off = RAW_LOCK_PERM_BASE + i;
+ uint64_t p = 1ULL << i;
+ if (!(shared_perm & p)) {
+ ret = qemu_lock_fd_test(s->lock_fd, off, 1, true);
+ if (ret) {
+ char *perm_name = bdrv_perm_names(p);
+ error_setg(errp,
+ "Failed to get shared \"%s\" lock",
+ perm_name);
+ g_free(perm_name);
+ error_append_hint(errp,
+ "Is another process using the image?\n");
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+static int raw_handle_perm_lock(BlockDriverState *bs,
+ RawPermLockOp op,
+ uint64_t new_perm, uint64_t new_shared,
+ Error **errp)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret = 0;
+ Error *local_err = NULL;
+
+ if (!s->use_lock) {
+ return 0;
+ }
+
+ if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) {
+ return 0;
+ }
+
+ assert(s->lock_fd > 0);
+
+ switch (op) {
+ case RAW_PL_PREPARE:
+ ret = raw_apply_lock_bytes(s, s->perm | new_perm,
+ ~s->shared_perm | ~new_shared,
+ false, errp);
+ if (!ret) {
+ ret = raw_check_lock_bytes(s, new_perm, new_shared, errp);
+ if (!ret) {
+ return 0;
+ }
+ }
+ op = RAW_PL_ABORT;
+ /* fall through to unlock bytes. */
+ case RAW_PL_ABORT:
+ raw_apply_lock_bytes(s, s->perm, ~s->shared_perm, true, &local_err);
+ if (local_err) {
+ /* Theoretically the above call only unlocks bytes and it cannot
+ * fail. Something weird happened, report it.
+ */
+ error_report_err(local_err);
+ }
+ break;
+ case RAW_PL_COMMIT:
+ raw_apply_lock_bytes(s, new_perm, ~new_shared, true, &local_err);
+ if (local_err) {
+ /* Theoretically the above call only unlocks bytes and it cannot
+ * fail. Something weird happened, report it.
+ */
+ error_report_err(local_err);
+ }
+ break;
+ }
+ return ret;
+}
+
static int raw_reopen_prepare(BDRVReopenState *state,
BlockReopenQueue *queue, Error **errp)
{
@@ -1405,6 +1623,10 @@ static void raw_close(BlockDriverState *bs)
qemu_close(s->fd);
s->fd = -1;
}
+ if (s->lock_fd >= 0) {
+ qemu_close(s->lock_fd);
+ s->lock_fd = -1;
+ }
}
static int raw_truncate(BlockDriverState *bs, int64_t offset, Error **errp)
@@ -1949,6 +2171,25 @@ static QemuOptsList raw_create_opts = {
}
};
+static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared,
+ Error **errp)
+{
+ return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp);
+}
+
+static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
+{
+ BDRVRawState *s = bs->opaque;
+ raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL);
+ s->perm = perm;
+ s->shared_perm = shared;
+}
+
+static void raw_abort_perm_update(BlockDriverState *bs)
+{
+ raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
+}
+
BlockDriver bdrv_file = {
.format_name = "file",
.protocol_name = "file",
@@ -1979,7 +2220,9 @@ BlockDriver bdrv_file = {
.bdrv_get_info = raw_get_info,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
-
+ .bdrv_check_perm = raw_check_perm,
+ .bdrv_set_perm = raw_set_perm,
+ .bdrv_abort_perm_update = raw_abort_perm_update,
.create_opts = &raw_create_opts,
};
@@ -2438,6 +2681,9 @@ static BlockDriver bdrv_host_device = {
.bdrv_get_info = raw_get_info,
.bdrv_get_allocated_file_size
= raw_get_allocated_file_size,
+ .bdrv_check_perm = raw_check_perm,
+ .bdrv_set_perm = raw_set_perm,
+ .bdrv_abort_perm_update = raw_abort_perm_update,
.bdrv_probe_blocksizes = hdev_probe_blocksizes,
.bdrv_probe_geometry = hdev_probe_geometry,
diff --git a/block/file-win32.c b/block/file-win32.c
index d1eb0a14b2..1a35dbabf2 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -344,6 +344,11 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
goto fail;
}
+ if (qdict_get_try_bool(options, "locking", false)) {
+ error_setg(errp, "locking=on is not supported on Windows");
+ goto fail;
+ }
+
filename = qemu_opt_get(opts, "filename");
use_aio = get_aio_option(opts, flags, &local_err);
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 100398c565..347d94b0d2 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -309,14 +309,19 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
uint64_t *l2_table, uint64_t stop_flags)
{
int i;
+ QCow2ClusterType first_cluster_type;
uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
uint64_t first_entry = be64_to_cpu(l2_table[0]);
uint64_t offset = first_entry & mask;
- if (!offset)
+ if (!offset) {
return 0;
+ }
- assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);
+ /* must be allocated */
+ first_cluster_type = qcow2_get_cluster_type(first_entry);
+ assert(first_cluster_type == QCOW2_CLUSTER_NORMAL ||
+ first_cluster_type == QCOW2_CLUSTER_ZERO_ALLOC);
for (i = 0; i < nb_clusters; i++) {
uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
@@ -328,14 +333,21 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
return i;
}
-static int count_contiguous_clusters_by_type(int nb_clusters,
- uint64_t *l2_table,
- int wanted_type)
+/*
+ * Checks how many consecutive unallocated clusters in a given L2
+ * table have the same cluster type.
+ */
+static int count_contiguous_clusters_unallocated(int nb_clusters,
+ uint64_t *l2_table,
+ QCow2ClusterType wanted_type)
{
int i;
+ assert(wanted_type == QCOW2_CLUSTER_ZERO_PLAIN ||
+ wanted_type == QCOW2_CLUSTER_UNALLOCATED);
for (i = 0; i < nb_clusters; i++) {
- int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
+ uint64_t entry = be64_to_cpu(l2_table[i]);
+ QCow2ClusterType type = qcow2_get_cluster_type(entry);
if (type != wanted_type) {
break;
@@ -487,6 +499,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
int l1_bits, c;
unsigned int offset_in_cluster;
uint64_t bytes_available, bytes_needed, nb_clusters;
+ QCow2ClusterType type;
int ret;
offset_in_cluster = offset_into_cluster(s, offset);
@@ -509,13 +522,13 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
l1_index = offset >> l1_bits;
if (l1_index >= s->l1_size) {
- ret = QCOW2_CLUSTER_UNALLOCATED;
+ type = QCOW2_CLUSTER_UNALLOCATED;
goto out;
}
l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK;
if (!l2_offset) {
- ret = QCOW2_CLUSTER_UNALLOCATED;
+ type = QCOW2_CLUSTER_UNALLOCATED;
goto out;
}
@@ -544,38 +557,37 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
* true */
assert(nb_clusters <= INT_MAX);
- ret = qcow2_get_cluster_type(*cluster_offset);
- switch (ret) {
+ type = qcow2_get_cluster_type(*cluster_offset);
+ if (s->qcow_version < 3 && (type == QCOW2_CLUSTER_ZERO_PLAIN ||
+ type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+ qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
+ " in pre-v3 image (L2 offset: %#" PRIx64
+ ", L2 index: %#x)", l2_offset, l2_index);
+ ret = -EIO;
+ goto fail;
+ }
+ switch (type) {
case QCOW2_CLUSTER_COMPRESSED:
/* Compressed clusters can only be processed one by one */
c = 1;
*cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK;
break;
- case QCOW2_CLUSTER_ZERO:
- if (s->qcow_version < 3) {
- qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found"
- " in pre-v3 image (L2 offset: %#" PRIx64
- ", L2 index: %#x)", l2_offset, l2_index);
- ret = -EIO;
- goto fail;
- }
- c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
- QCOW2_CLUSTER_ZERO);
- *cluster_offset = 0;
- break;
+ case QCOW2_CLUSTER_ZERO_PLAIN:
case QCOW2_CLUSTER_UNALLOCATED:
/* how many empty clusters ? */
- c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
- QCOW2_CLUSTER_UNALLOCATED);
+ c = count_contiguous_clusters_unallocated(nb_clusters,
+ &l2_table[l2_index], type);
*cluster_offset = 0;
break;
+ case QCOW2_CLUSTER_ZERO_ALLOC:
case QCOW2_CLUSTER_NORMAL:
/* how many allocated clusters ? */
c = count_contiguous_clusters(nb_clusters, s->cluster_size,
- &l2_table[l2_index], QCOW_OFLAG_ZERO);
+ &l2_table[l2_index], QCOW_OFLAG_ZERO);
*cluster_offset &= L2E_OFFSET_MASK;
if (offset_into_cluster(s, *cluster_offset)) {
- qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#"
+ qcow2_signal_corruption(bs, true, -1, -1,
+ "Cluster allocation offset %#"
PRIx64 " unaligned (L2 offset: %#" PRIx64
", L2 index: %#x)", *cluster_offset,
l2_offset, l2_index);
@@ -602,7 +614,7 @@ out:
assert(bytes_available - offset_in_cluster <= UINT_MAX);
*bytes = bytes_available - offset_in_cluster;
- return ret;
+ return type;
fail:
qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table);
@@ -835,7 +847,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
* Don't discard clusters that reach a refcount of 0 (e.g. compressed
* clusters), the next write will reuse them anyway.
*/
- if (j != 0) {
+ if (!m->keep_old_clusters && j != 0) {
for (i = 0; i < j; i++) {
qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
QCOW2_DISCARD_NEVER);
@@ -860,7 +872,7 @@ static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
for (i = 0; i < nb_clusters; i++) {
uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]);
- int cluster_type = qcow2_get_cluster_type(l2_entry);
+ QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
switch(cluster_type) {
case QCOW2_CLUSTER_NORMAL:
@@ -870,7 +882,8 @@ static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters,
break;
case QCOW2_CLUSTER_UNALLOCATED:
case QCOW2_CLUSTER_COMPRESSED:
- case QCOW2_CLUSTER_ZERO:
+ case QCOW2_CLUSTER_ZERO_PLAIN:
+ case QCOW2_CLUSTER_ZERO_ALLOC:
break;
default:
abort();
@@ -1132,8 +1145,9 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
uint64_t entry;
uint64_t nb_clusters;
int ret;
+ bool keep_old_clusters = false;
- uint64_t alloc_cluster_offset;
+ uint64_t alloc_cluster_offset = 0;
trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
*bytes);
@@ -1170,31 +1184,54 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
* wrong with our code. */
assert(nb_clusters > 0);
- qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+ if (qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO_ALLOC &&
+ (entry & QCOW_OFLAG_COPIED) &&
+ (!*host_offset ||
+ start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK)))
+ {
+ /* Try to reuse preallocated zero clusters; contiguous normal clusters
+ * would be fine, too, but count_cow_clusters() above has limited
+ * nb_clusters already to a range of COW clusters */
+ int preallocated_nb_clusters =
+ count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], QCOW_OFLAG_COPIED);
+ assert(preallocated_nb_clusters > 0);
- /* Allocate, if necessary at a given offset in the image file */
- alloc_cluster_offset = start_of_cluster(s, *host_offset);
- ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
- &nb_clusters);
- if (ret < 0) {
- goto fail;
- }
+ nb_clusters = preallocated_nb_clusters;
+ alloc_cluster_offset = entry & L2E_OFFSET_MASK;
- /* Can't extend contiguous allocation */
- if (nb_clusters == 0) {
- *bytes = 0;
- return 0;
+ /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2()
+ * should not free them. */
+ keep_old_clusters = true;
}
- /* !*host_offset would overwrite the image header and is reserved for "no
- * host offset preferred". If 0 was a valid host offset, it'd trigger the
- * following overlap check; do that now to avoid having an invalid value in
- * *host_offset. */
+ qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);
+
if (!alloc_cluster_offset) {
- ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
- nb_clusters * s->cluster_size);
- assert(ret < 0);
- goto fail;
+ /* Allocate, if necessary at a given offset in the image file */
+ alloc_cluster_offset = start_of_cluster(s, *host_offset);
+ ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
+ &nb_clusters);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ /* Can't extend contiguous allocation */
+ if (nb_clusters == 0) {
+ *bytes = 0;
+ return 0;
+ }
+
+ /* !*host_offset would overwrite the image header and is reserved for
+ * "no host offset preferred". If 0 was a valid host offset, it'd
+ * trigger the following overlap check; do that now to avoid having an
+ * invalid value in *host_offset. */
+ if (!alloc_cluster_offset) {
+ ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset,
+ nb_clusters * s->cluster_size);
+ assert(ret < 0);
+ goto fail;
+ }
}
/*
@@ -1225,6 +1262,8 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
.offset = start_of_cluster(s, guest_offset),
.nb_clusters = nb_clusters,
+ .keep_old_clusters = keep_old_clusters,
+
.cow_start = {
.offset = 0,
.nb_bytes = offset_into_cluster(s, guest_offset),
@@ -1472,24 +1511,25 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
* but rather fall through to the backing file.
*/
switch (qcow2_get_cluster_type(old_l2_entry)) {
- case QCOW2_CLUSTER_UNALLOCATED:
- if (full_discard || !bs->backing) {
- continue;
- }
- break;
+ case QCOW2_CLUSTER_UNALLOCATED:
+ if (full_discard || !bs->backing) {
+ continue;
+ }
+ break;
- case QCOW2_CLUSTER_ZERO:
- if (!full_discard) {
- continue;
- }
- break;
+ case QCOW2_CLUSTER_ZERO_PLAIN:
+ if (!full_discard) {
+ continue;
+ }
+ break;
- case QCOW2_CLUSTER_NORMAL:
- case QCOW2_CLUSTER_COMPRESSED:
- break;
+ case QCOW2_CLUSTER_ZERO_ALLOC:
+ case QCOW2_CLUSTER_NORMAL:
+ case QCOW2_CLUSTER_COMPRESSED:
+ break;
- default:
- abort();
+ default:
+ abort();
}
/* First remove L2 entries */
@@ -1509,35 +1549,36 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
return nb_clusters;
}
-int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
- int nb_sectors, enum qcow2_discard_type type, bool full_discard)
+int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, enum qcow2_discard_type type,
+ bool full_discard)
{
BDRVQcow2State *s = bs->opaque;
- uint64_t end_offset;
+ uint64_t end_offset = offset + bytes;
uint64_t nb_clusters;
+ int64_t cleared;
int ret;
- end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
-
- /* The caller must cluster-align start; round end down except at EOF */
+ /* Caller must pass aligned values, except at image end */
assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
- if (end_offset != bs->total_sectors * BDRV_SECTOR_SIZE) {
- end_offset = start_of_cluster(s, end_offset);
- }
+ assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+ end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
- nb_clusters = size_to_clusters(s, end_offset - offset);
+ nb_clusters = size_to_clusters(s, bytes);
s->cache_discards = true;
/* Each L2 table is handled by its own loop iteration */
while (nb_clusters > 0) {
- ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
- if (ret < 0) {
+ cleared = discard_single_l2(bs, offset, nb_clusters, type,
+ full_discard);
+ if (cleared < 0) {
+ ret = cleared;
goto fail;
}
- nb_clusters -= ret;
- offset += (ret * s->cluster_size);
+ nb_clusters -= cleared;
+ offset += (cleared * s->cluster_size);
}
ret = 0;
@@ -1561,6 +1602,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
int l2_index;
int ret;
int i;
+ bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);
ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
if (ret < 0) {
@@ -1573,12 +1615,22 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
for (i = 0; i < nb_clusters; i++) {
uint64_t old_offset;
+ QCow2ClusterType cluster_type;
old_offset = be64_to_cpu(l2_table[l2_index + i]);
- /* Update L2 entries */
+ /*
+ * Minimize L2 changes if the cluster already reads back as
+ * zeroes with correct allocation.
+ */
+ cluster_type = qcow2_get_cluster_type(old_offset);
+ if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN ||
+ (cluster_type == QCOW2_CLUSTER_ZERO_ALLOC && !unmap)) {
+ continue;
+ }
+
qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
- if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
+ if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) {
l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
} else {
@@ -1591,31 +1643,39 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
return nb_clusters;
}
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
- int flags)
+int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, int flags)
{
BDRVQcow2State *s = bs->opaque;
+ uint64_t end_offset = offset + bytes;
uint64_t nb_clusters;
+ int64_t cleared;
int ret;
+ /* Caller must pass aligned values, except at image end */
+ assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
+ assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+ end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
+
/* The zero flag is only supported by version 3 and newer */
if (s->qcow_version < 3) {
return -ENOTSUP;
}
/* Each L2 table is handled by its own loop iteration */
- nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS);
+ nb_clusters = size_to_clusters(s, bytes);
s->cache_discards = true;
while (nb_clusters > 0) {
- ret = zero_single_l2(bs, offset, nb_clusters, flags);
- if (ret < 0) {
+ cleared = zero_single_l2(bs, offset, nb_clusters, flags);
+ if (cleared < 0) {
+ ret = cleared;
goto fail;
}
- nb_clusters -= ret;
- offset += (ret * s->cluster_size);
+ nb_clusters -= cleared;
+ offset += (cleared * s->cluster_size);
}
ret = 0;
@@ -1699,14 +1759,14 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
for (j = 0; j < s->l2_size; j++) {
uint64_t l2_entry = be64_to_cpu(l2_table[j]);
int64_t offset = l2_entry & L2E_OFFSET_MASK;
- int cluster_type = qcow2_get_cluster_type(l2_entry);
- bool preallocated = offset != 0;
+ QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
- if (cluster_type != QCOW2_CLUSTER_ZERO) {
+ if (cluster_type != QCOW2_CLUSTER_ZERO_PLAIN &&
+ cluster_type != QCOW2_CLUSTER_ZERO_ALLOC) {
continue;
}
- if (!preallocated) {
+ if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
if (!bs->backing) {
/* not backed; therefore we can simply deallocate the
* cluster */
@@ -1741,7 +1801,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
"%#" PRIx64 " unaligned (L2 offset: %#"
PRIx64 ", L2 index: %#x)", offset,
l2_offset, j);
- if (!preallocated) {
+ if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
qcow2_free_clusters(bs, offset, s->cluster_size,
QCOW2_DISCARD_ALWAYS);
}
@@ -1751,7 +1811,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
if (ret < 0) {
- if (!preallocated) {
+ if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
qcow2_free_clusters(bs, offset, s->cluster_size,
QCOW2_DISCARD_ALWAYS);
}
@@ -1760,7 +1820,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table,
ret = bdrv_pwrite_zeroes(bs->file, offset, s->cluster_size, 0);
if (ret < 0) {
- if (!preallocated) {
+ if (cluster_type == QCOW2_CLUSTER_ZERO_PLAIN) {
qcow2_free_clusters(bs, offset, s->cluster_size,
QCOW2_DISCARD_ALWAYS);
}
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 4efca7ebdb..7c06061aae 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1028,18 +1028,17 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
}
break;
case QCOW2_CLUSTER_NORMAL:
- case QCOW2_CLUSTER_ZERO:
- if (l2_entry & L2E_OFFSET_MASK) {
- if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
- qcow2_signal_corruption(bs, false, -1, -1,
- "Cannot free unaligned cluster %#llx",
- l2_entry & L2E_OFFSET_MASK);
- } else {
- qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
- nb_clusters << s->cluster_bits, type);
- }
+ case QCOW2_CLUSTER_ZERO_ALLOC:
+ if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
+ qcow2_signal_corruption(bs, false, -1, -1,
+ "Cannot free unaligned cluster %#llx",
+ l2_entry & L2E_OFFSET_MASK);
+ } else {
+ qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
+ nb_clusters << s->cluster_bits, type);
}
break;
+ case QCOW2_CLUSTER_ZERO_PLAIN:
case QCOW2_CLUSTER_UNALLOCATED:
break;
default:
@@ -1059,9 +1058,9 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
int64_t l1_table_offset, int l1_size, int addend)
{
BDRVQcow2State *s = bs->opaque;
- uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
+ uint64_t *l1_table, *l2_table, l2_offset, entry, l1_size2, refcount;
bool l1_allocated = false;
- int64_t old_offset, old_l2_offset;
+ int64_t old_entry, old_l2_offset;
int i, j, l1_modified = 0, nb_csectors;
int ret;
@@ -1089,15 +1088,16 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
goto fail;
}
- for(i = 0;i < l1_size; i++)
+ for (i = 0; i < l1_size; i++) {
be64_to_cpus(&l1_table[i]);
+ }
} else {
assert(l1_size == s->l1_size);
l1_table = s->l1_table;
l1_allocated = false;
}
- for(i = 0; i < l1_size; i++) {
+ for (i = 0; i < l1_size; i++) {
l2_offset = l1_table[i];
if (l2_offset) {
old_l2_offset = l2_offset;
@@ -1117,81 +1117,79 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
goto fail;
}
- for(j = 0; j < s->l2_size; j++) {
+ for (j = 0; j < s->l2_size; j++) {
uint64_t cluster_index;
-
- offset = be64_to_cpu(l2_table[j]);
- old_offset = offset;
- offset &= ~QCOW_OFLAG_COPIED;
-
- switch (qcow2_get_cluster_type(offset)) {
- case QCOW2_CLUSTER_COMPRESSED:
- nb_csectors = ((offset >> s->csize_shift) &
- s->csize_mask) + 1;
- if (addend != 0) {
- ret = update_refcount(bs,
- (offset & s->cluster_offset_mask) & ~511,
+ uint64_t offset;
+
+ entry = be64_to_cpu(l2_table[j]);
+ old_entry = entry;
+ entry &= ~QCOW_OFLAG_COPIED;
+ offset = entry & L2E_OFFSET_MASK;
+
+ switch (qcow2_get_cluster_type(entry)) {
+ case QCOW2_CLUSTER_COMPRESSED:
+ nb_csectors = ((entry >> s->csize_shift) &
+ s->csize_mask) + 1;
+ if (addend != 0) {
+ ret = update_refcount(bs,
+ (entry & s->cluster_offset_mask) & ~511,
nb_csectors * 512, abs(addend), addend < 0,
QCOW2_DISCARD_SNAPSHOT);
- if (ret < 0) {
- goto fail;
- }
- }
- /* compressed clusters are never modified */
- refcount = 2;
- break;
-
- case QCOW2_CLUSTER_NORMAL:
- case QCOW2_CLUSTER_ZERO:
- if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
- qcow2_signal_corruption(bs, true, -1, -1, "Data "
- "cluster offset %#llx "
- "unaligned (L2 offset: %#"
- PRIx64 ", L2 index: %#x)",
- offset & L2E_OFFSET_MASK,
- l2_offset, j);
- ret = -EIO;
+ if (ret < 0) {
goto fail;
}
+ }
+ /* compressed clusters are never modified */
+ refcount = 2;
+ break;
+
+ case QCOW2_CLUSTER_NORMAL:
+ case QCOW2_CLUSTER_ZERO_ALLOC:
+ if (offset_into_cluster(s, offset)) {
+ qcow2_signal_corruption(bs, true, -1, -1, "Cluster "
+ "allocation offset %#" PRIx64
+ " unaligned (L2 offset: %#"
+ PRIx64 ", L2 index: %#x)",
+ offset, l2_offset, j);
+ ret = -EIO;
+ goto fail;
+ }
- cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
- if (!cluster_index) {
- /* unallocated */
- refcount = 0;
- break;
- }
- if (addend != 0) {
- ret = qcow2_update_cluster_refcount(bs,
+ cluster_index = offset >> s->cluster_bits;
+ assert(cluster_index);
+ if (addend != 0) {
+ ret = qcow2_update_cluster_refcount(bs,
cluster_index, abs(addend), addend < 0,
QCOW2_DISCARD_SNAPSHOT);
- if (ret < 0) {
- goto fail;
- }
- }
-
- ret = qcow2_get_refcount(bs, cluster_index, &refcount);
if (ret < 0) {
goto fail;
}
- break;
+ }
- case QCOW2_CLUSTER_UNALLOCATED:
- refcount = 0;
- break;
+ ret = qcow2_get_refcount(bs, cluster_index, &refcount);
+ if (ret < 0) {
+ goto fail;
+ }
+ break;
+
+ case QCOW2_CLUSTER_ZERO_PLAIN:
+ case QCOW2_CLUSTER_UNALLOCATED:
+ refcount = 0;
+ break;
- default:
- abort();
+ default:
+ abort();
}
if (refcount == 1) {
- offset |= QCOW_OFLAG_COPIED;
+ entry |= QCOW_OFLAG_COPIED;
}
- if (offset != old_offset) {
+ if (entry != old_entry) {
if (addend > 0) {
qcow2_cache_set_dependency(bs, s->l2_table_cache,
s->refcount_block_cache);
}
- l2_table[j] = cpu_to_be64(offset);
+ l2_table[j] = cpu_to_be64(entry);
qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
l2_table);
}
@@ -1441,12 +1439,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
}
break;
- case QCOW2_CLUSTER_ZERO:
- if ((l2_entry & L2E_OFFSET_MASK) == 0) {
- break;
- }
- /* fall through */
-
+ case QCOW2_CLUSTER_ZERO_ALLOC:
case QCOW2_CLUSTER_NORMAL:
{
uint64_t offset = l2_entry & L2E_OFFSET_MASK;
@@ -1476,6 +1469,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
break;
}
+ case QCOW2_CLUSTER_ZERO_PLAIN:
case QCOW2_CLUSTER_UNALLOCATED:
break;
@@ -1638,10 +1632,10 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
for (j = 0; j < s->l2_size; j++) {
uint64_t l2_entry = be64_to_cpu(l2_table[j]);
uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
- int cluster_type = qcow2_get_cluster_type(l2_entry);
+ QCow2ClusterType cluster_type = qcow2_get_cluster_type(l2_entry);
- if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
- ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
+ if (cluster_type == QCOW2_CLUSTER_NORMAL ||
+ cluster_type == QCOW2_CLUSTER_ZERO_ALLOC) {
ret = qcow2_get_refcount(bs,
data_offset >> s->cluster_bits,
&refcount);
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 032424322a..44243e0e95 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -440,10 +440,9 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
/* The VM state isn't needed any more in the active L1 table; in fact, it
* hurts by causing expensive COW for the next snapshot. */
- qcow2_discard_clusters(bs, qcow2_vm_state_offset(s),
- align_offset(sn->vm_state_size, s->cluster_size)
- >> BDRV_SECTOR_BITS,
- QCOW2_DISCARD_NEVER, false);
+ qcow2_cluster_discard(bs, qcow2_vm_state_offset(s),
+ align_offset(sn->vm_state_size, s->cluster_size),
+ QCOW2_DISCARD_NEVER, false);
#ifdef DEBUG_ALLOC
{
diff --git a/block/qcow2.c b/block/qcow2.c
index 1c2697732b..a8d61f0981 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1385,7 +1385,7 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
*file = bs->file->bs;
status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
}
- if (ret == QCOW2_CLUSTER_ZERO) {
+ if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) {
status |= BDRV_BLOCK_ZERO;
} else if (ret != QCOW2_CLUSTER_UNALLOCATED) {
status |= BDRV_BLOCK_DATA;
@@ -1482,7 +1482,8 @@ static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
}
break;
- case QCOW2_CLUSTER_ZERO:
+ case QCOW2_CLUSTER_ZERO_PLAIN:
+ case QCOW2_CLUSTER_ZERO_ALLOC:
qemu_iovec_memset(&hd_qiov, 0, 0, cur_bytes);
break;
@@ -2139,7 +2140,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
* too, as long as the bulk is allocated here). Therefore, using
* floating point arithmetic is fine. */
int64_t meta_size = 0;
- uint64_t nreftablee, nrefblocke, nl1e, nl2e;
+ uint64_t nreftablee, nrefblocke, nl1e, nl2e, refblock_count;
int64_t aligned_total_size = align_offset(total_size, cluster_size);
int refblock_bits, refblock_size;
/* refcount entry size in bytes */
@@ -2182,11 +2183,12 @@ static int qcow2_create2(const char *filename, int64_t total_size,
nrefblocke = (aligned_total_size + meta_size + cluster_size)
/ (cluster_size - rces - rces * sizeof(uint64_t)
/ cluster_size);
- meta_size += DIV_ROUND_UP(nrefblocke, refblock_size) * cluster_size;
+ refblock_count = DIV_ROUND_UP(nrefblocke, refblock_size);
+ meta_size += refblock_count * cluster_size;
/* total size of refcount tables */
- nreftablee = nrefblocke / refblock_size;
- nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t));
+ nreftablee = align_offset(refblock_count,
+ cluster_size / sizeof(uint64_t));
meta_size += nreftablee * sizeof(uint64_t);
qemu_opt_set_number(opts, BLOCK_OPT_SIZE,
@@ -2449,6 +2451,10 @@ static bool is_zero_sectors(BlockDriverState *bs, int64_t start,
BlockDriverState *file;
int64_t res;
+ if (start + count > bs->total_sectors) {
+ count = bs->total_sectors - start;
+ }
+
if (!count) {
return true;
}
@@ -2467,6 +2473,9 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
uint32_t tail = (offset + count) % s->cluster_size;
trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count);
+ if (offset + count == bs->total_sectors * BDRV_SECTOR_SIZE) {
+ tail = 0;
+ }
if (head || tail) {
int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS;
@@ -2490,7 +2499,9 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
count = s->cluster_size;
nr = s->cluster_size;
ret = qcow2_get_cluster_offset(bs, offset, &nr, &off);
- if (ret != QCOW2_CLUSTER_UNALLOCATED && ret != QCOW2_CLUSTER_ZERO) {
+ if (ret != QCOW2_CLUSTER_UNALLOCATED &&
+ ret != QCOW2_CLUSTER_ZERO_PLAIN &&
+ ret != QCOW2_CLUSTER_ZERO_ALLOC) {
qemu_co_mutex_unlock(&s->lock);
return -ENOTSUP;
}
@@ -2501,7 +2512,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);
/* Whatever is left can use real zero clusters */
- ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS, flags);
+ ret = qcow2_cluster_zeroize(bs, offset, count, flags);
qemu_co_mutex_unlock(&s->lock);
return ret;
@@ -2524,8 +2535,8 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
}
qemu_co_mutex_lock(&s->lock);
- ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS,
- QCOW2_DISCARD_REQUEST, false);
+ ret = qcow2_cluster_discard(bs, offset, count, QCOW2_DISCARD_REQUEST,
+ false);
qemu_co_mutex_unlock(&s->lock);
return ret;
}
@@ -2832,9 +2843,8 @@ fail:
static int qcow2_make_empty(BlockDriverState *bs)
{
BDRVQcow2State *s = bs->opaque;
- uint64_t start_sector;
- int sector_step = (QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size) /
- BDRV_SECTOR_SIZE);
+ uint64_t offset, end_offset;
+ int step = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
int l1_clusters, ret = 0;
l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t));
@@ -2851,18 +2861,15 @@ static int qcow2_make_empty(BlockDriverState *bs)
/* This fallback code simply discards every active cluster; this is slow,
* but works in all cases */
- for (start_sector = 0; start_sector < bs->total_sectors;
- start_sector += sector_step)
- {
+ end_offset = bs->total_sectors * BDRV_SECTOR_SIZE;
+ for (offset = 0; offset < end_offset; offset += step) {
/* As this function is generally used after committing an external
* snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
* default action for this kind of discard is to pass the discard,
* which will ideally result in an actually smaller image file, as
* is probably desired. */
- ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE,
- MIN(sector_step,
- bs->total_sectors - start_sector),
- QCOW2_DISCARD_SNAPSHOT, true);
+ ret = qcow2_cluster_discard(bs, offset, MIN(step, end_offset - offset),
+ QCOW2_DISCARD_SNAPSHOT, true);
if (ret < 0) {
break;
}
diff --git a/block/qcow2.h b/block/qcow2.h
index f8aeb08794..1801dc30dc 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -322,6 +322,9 @@ typedef struct QCowL2Meta
/** Number of newly allocated clusters */
int nb_clusters;
+ /** Do not free the old clusters */
+ bool keep_old_clusters;
+
/**
* Requests that overlap with this allocation and wait to be restarted
* when the allocating request has completed.
@@ -346,12 +349,13 @@ typedef struct QCowL2Meta
QLIST_ENTRY(QCowL2Meta) next_in_flight;
} QCowL2Meta;
-enum {
+typedef enum QCow2ClusterType {
QCOW2_CLUSTER_UNALLOCATED,
+ QCOW2_CLUSTER_ZERO_PLAIN,
+ QCOW2_CLUSTER_ZERO_ALLOC,
QCOW2_CLUSTER_NORMAL,
QCOW2_CLUSTER_COMPRESSED,
- QCOW2_CLUSTER_ZERO
-};
+} QCow2ClusterType;
typedef enum QCow2MetadataOverlap {
QCOW2_OL_MAIN_HEADER_BITNR = 0,
@@ -440,12 +444,15 @@ static inline uint64_t qcow2_max_refcount_clusters(BDRVQcow2State *s)
return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits;
}
-static inline int qcow2_get_cluster_type(uint64_t l2_entry)
+static inline QCow2ClusterType qcow2_get_cluster_type(uint64_t l2_entry)
{
if (l2_entry & QCOW_OFLAG_COMPRESSED) {
return QCOW2_CLUSTER_COMPRESSED;
} else if (l2_entry & QCOW_OFLAG_ZERO) {
- return QCOW2_CLUSTER_ZERO;
+ if (l2_entry & L2E_OFFSET_MASK) {
+ return QCOW2_CLUSTER_ZERO_ALLOC;
+ }
+ return QCOW2_CLUSTER_ZERO_PLAIN;
} else if (!(l2_entry & L2E_OFFSET_MASK)) {
return QCOW2_CLUSTER_UNALLOCATED;
} else {
@@ -544,10 +551,11 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
int compressed_size);
int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
-int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
- int nb_sectors, enum qcow2_discard_type type, bool full_discard);
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
- int flags);
+int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, enum qcow2_discard_type type,
+ bool full_discard);
+int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, int flags);
int qcow2_expand_zero_clusters(BlockDriverState *bs,
BlockDriverAmendStatusCB *status_cb,