From 54fa16ee532705985e6c946da455856f18f63ee1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 2 Jul 2019 15:50:08 -0400 Subject: dm thin metadata: check if in fail_io mode when setting needs_check Check if in fail_io mode at start of dm_pool_metadata_set_needs_check(). Otherwise dm_pool_metadata_set_needs_check()'s superblock_lock() can crash in dm_bm_write_lock() while accessing the block manager object that was previously destroyed as part of a failed dm_pool_abort_metadata() that ultimately set fail_io to begin with. Also, update DMERR() message to more accurately describe superblock_lock() failure. Cc: stable@vger.kernel.org Reported-by: Zdenek Kabelac Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 7f0840601737..4c68a7b93d5e 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -2046,16 +2046,19 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) { - int r; + int r = -EINVAL; struct dm_block *sblock; struct thin_disk_superblock *disk_super; pmd_write_lock(pmd); + if (pmd->fail_io) + goto out; + pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; r = superblock_lock(pmd, &sblock); if (r) { - DMERR("couldn't read superblock"); + DMERR("couldn't lock superblock"); goto out; } -- cgit v1.2.3-55-g7522 From 5f1c56b34e81e2d63f345f7ad6c5c7971c3c314d Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 22 May 2019 13:29:44 +0200 Subject: dm integrity: always set version on superblock update The new integrity bitmap mode uses the dirty flag. The dirty flag should not be set in older superblock versions. The current code sets it unconditionally, even if the superblock was already formatted without bitmap in older system. Fix this by moving the version check to one common place and check version on every superblock write. Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 44e76cda087a..a2ab6a32b174 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -476,6 +476,9 @@ static int sync_rw_sb(struct dm_integrity_c *ic, int op, int op_flags) io_loc.sector = ic->start; io_loc.count = SB_SECTORS; + if (op == REQ_OP_WRITE) + sb_set_version(ic); + return dm_io(&io_req, 1, &io_loc, NULL); } @@ -2317,7 +2320,6 @@ static void recalc_write_super(struct dm_integrity_c *ic) if (dm_integrity_failed(ic)) return; - sb_set_version(ic); r = sync_rw_sb(ic, REQ_OP_WRITE, 0); if (unlikely(r)) dm_integrity_io_error(ic, "writing superblock", r); -- cgit v1.2.3-55-g7522 From 9c81c99b242f1241e18573d1ce29f7479c168e38 Mon Sep 17 00:00:00 2001 From: Zhengyuan Liu Date: Wed, 12 Jun 2019 14:14:45 +0800 Subject: dm crypt: use struct_size() when allocating encryption context Use struct_size() to avoid open-coded equivalent that is prone to a type mistake. Signed-off-by: Zhengyuan Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 1b16d34bb785..2587e94b0511 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2699,7 +2699,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL); + cc = kzalloc(struct_size(cc, key, key_size), GFP_KERNEL); if (!cc) { ti->error = "Cannot allocate encryption context"; return -ENOMEM; -- cgit v1.2.3-55-g7522 From d4e6e83651777224c0259e01dde5eb078f9e6b1d Mon Sep 17 00:00:00 2001 From: Zhengyuan Liu Date: Wed, 12 Jun 2019 14:14:46 +0800 Subject: dm log writes: use struct_size() to calculate size of pending_block Use struct_size() to avoid open-coded equivalent that is prone to a type mistake. Signed-off-by: Zhengyuan Liu Signed-off-by: Mike Snitzer --- drivers/md/dm-log-writes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index e549392e0ea5..0837b17e1798 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -699,7 +699,7 @@ static int log_writes_map(struct dm_target *ti, struct bio *bio) if (discard_bio) alloc_size = sizeof(struct pending_block); else - alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio); + alloc_size = struct_size(block, vecs, bio_segments(bio)); block = kzalloc(alloc_size, GFP_NOIO); if (!block) { -- cgit v1.2.3-55-g7522 From 7537dad791cdbaf856c05f9dd977556ccd830ef0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 18 Jun 2019 13:39:38 +0800 Subject: dm log writes: fix incorrect comment about the logged sequence example dm-log-writes records the sequence of completion, not submission, thus for the following sequence (W=write, C=complete): Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd The logged results in log device should be: c,a,b,flush,fua Fix the comment to give a better example. Signed-off-by: Qu Wenruo Signed-off-by: Mike Snitzer --- drivers/md/dm-log-writes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 0837b17e1798..99721c76225d 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -40,7 +40,7 @@ * * Would result in the log looking like this: * - * c,a,flush,fuad,b,, + * c,a,b,flush,fuad,, * * This is meant to help expose problems where file systems do not properly wait * on data being written before invoking a FLUSH. FUA bypasses cache so once it -- cgit v1.2.3-55-g7522 From d370ad23a5553f9128da24e029993f4091bc04d7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 20 Jun 2019 20:50:50 +0300 Subject: dm: update stale comment in end_clone_bio() Since commit a1ce35fa49852db60fc6e268 ("block: remove dead elevator code") blk_end_request() has been replaced with blk_mq_end_request(). So update comment to reference blk_mq_end_request() accordingly. Signed-off-by: Pavel Begunkov Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 5f7063f05ae0..c9e44ac1f9a6 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -115,7 +115,7 @@ static void end_clone_bio(struct bio *clone) /* * Update the original request. - * Do not use blk_end_request() here, because it may complete + * Do not use blk_mq_end_request() here, because it may complete * the original request before the clone, and break the ordering. */ if (is_last) -- cgit v1.2.3-55-g7522 From 131670c2625307551c23970ebcc0f8bc0fc9b4ef Mon Sep 17 00:00:00 2001 From: Fuqian Huang Date: Fri, 28 Jun 2019 10:47:34 +0800 Subject: dm integrity: use kzalloc() instead of kmalloc() + memset() Signed-off-by: Fuqian Huang Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index a2ab6a32b174..b1b0de402dfc 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3360,7 +3360,7 @@ static int create_journal(struct dm_integrity_c *ic, char **error) goto bad; } - crypt_iv = kmalloc(ivsize, GFP_KERNEL); + crypt_iv = kzalloc(ivsize, GFP_KERNEL); if (!crypt_iv) { *error = "Could not allocate iv"; r = -ENOMEM; @@ -3389,7 +3389,6 @@ static int create_journal(struct dm_integrity_c *ic, char **error) sg_set_buf(&sg[i], va, PAGE_SIZE); } sg_set_buf(&sg[i], &ic->commit_ids, sizeof ic->commit_ids); - memset(crypt_iv, 0x00, ivsize); skcipher_request_set_crypt(req, sg, sg, PAGE_SIZE * ic->journal_pages + sizeof ic->commit_ids, crypt_iv); -- cgit v1.2.3-55-g7522 From 4a52ffc7ca6f03005ce10c67412752dd068f79a3 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Tue, 9 Jul 2019 15:22:12 +0200 Subject: dm crypt: wipe private IV struct after key invalid flag is set If a private IV wipe function fails, the code does not set the key invalid flag. To fix this, move code to after the flag is set to prevent the device from resuming in an inconsistent state. Also, this allows using of a randomized key in private wipe function (to be used in a following commit). Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2587e94b0511..cd349e925649 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2158,6 +2158,14 @@ static int crypt_wipe_key(struct crypt_config *cc) clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); get_random_bytes(&cc->key, cc->key_size); + + /* Wipe IV private keys */ + if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { + r = cc->iv_gen_ops->wipe(cc); + if (r) + return r; + } + kzfree(cc->key_string); cc->key_string = NULL; r = crypt_setkey(cc); @@ -3050,14 +3058,8 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv, memset(cc->key, 0, cc->key_size * sizeof(u8)); return ret; } - if (argc == 2 && !strcasecmp(argv[1], "wipe")) { - if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) { - ret = cc->iv_gen_ops->wipe(cc); - if (ret) - return ret; - } + if (argc == 2 && !strcasecmp(argv[1], "wipe")) return crypt_wipe_key(cc); - } } error: -- cgit v1.2.3-55-g7522 From 6028a7a5a3d604855728fd8da8f4708892b13764 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Tue, 9 Jul 2019 15:22:13 +0200 Subject: dm crypt: remove obsolete comment about plumb IV The URL is no longer valid and the comment is obsolete anyway (the plumb IV was never used). Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index cd349e925649..e14730f720ce 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -290,9 +290,6 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) * is calculated from initial key, sector number and mixed using CRC32. * Note that this encryption scheme is vulnerable to watermarking attacks * and should be used for old compatible containers access only. - * - * plumb: unimplemented, see: - * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454 */ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, -- cgit v1.2.3-55-g7522 From b9411d73bd3eb5773d0ce02b8008ec21b7c447ce Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Tue, 9 Jul 2019 15:22:14 +0200 Subject: dm crypt: implement eboiv - encrypted byte-offset initialization vector This IV is used in some BitLocker devices with CBC encryption mode. IV is encrypted little-endian byte-offset (with the same key and cipher as the volume). Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e14730f720ce..d5216bcc4649 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -120,6 +120,10 @@ struct iv_tcw_private { u8 *whitening; }; +struct iv_eboiv_private { + struct crypto_cipher *tfm; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. @@ -159,6 +163,7 @@ struct crypt_config { struct iv_benbi_private benbi; struct iv_lmk_private lmk; struct iv_tcw_private tcw; + struct iv_eboiv_private eboiv; } iv_gen_private; u64 iv_offset; unsigned int iv_size; @@ -290,6 +295,10 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) * is calculated from initial key, sector number and mixed using CRC32. * Note that this encryption scheme is vulnerable to watermarking attacks * and should be used for old compatible containers access only. + * + * eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode) + * The IV is encrypted little-endian byte-offset (with the same key + * and cipher as the volume). */ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, @@ -838,6 +847,67 @@ static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, return 0; } +static void crypt_iv_eboiv_dtr(struct crypt_config *cc) +{ + struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; + + crypto_free_cipher(eboiv->tfm); + eboiv->tfm = NULL; +} + +static int crypt_iv_eboiv_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; + struct crypto_cipher *tfm; + + tfm = crypto_alloc_cipher(cc->cipher, 0, 0); + if (IS_ERR(tfm)) { + ti->error = "Error allocating crypto tfm for EBOIV"; + return PTR_ERR(tfm); + } + + if (crypto_cipher_blocksize(tfm) != cc->iv_size) { + ti->error = "Block size of EBOIV cipher does " + "not match IV size of block cipher"; + crypto_free_cipher(tfm); + return -EINVAL; + } + + eboiv->tfm = tfm; + return 0; +} + +static int crypt_iv_eboiv_init(struct crypt_config *cc) +{ + struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; + int err; + + err = crypto_cipher_setkey(eboiv->tfm, cc->key, cc->key_size); + if (err) + return err; + + return 0; +} + +static int crypt_iv_eboiv_wipe(struct crypt_config *cc) +{ + /* Called after cc->key is set to random key in crypt_wipe() */ + return crypt_iv_eboiv_init(cc); +} + +static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + struct iv_eboiv_private *eboiv = &cc->iv_gen_private.eboiv; + + memset(iv, 0, cc->iv_size); + *(__le64 *)iv = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + crypto_cipher_encrypt_one(eboiv->tfm, iv, iv); + + return 0; +} + static const struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -890,6 +960,14 @@ static struct crypt_iv_operations crypt_iv_random_ops = { .generator = crypt_iv_random_gen }; +static struct crypt_iv_operations crypt_iv_eboiv_ops = { + .ctr = crypt_iv_eboiv_ctr, + .dtr = crypt_iv_eboiv_dtr, + .init = crypt_iv_eboiv_init, + .wipe = crypt_iv_eboiv_wipe, + .generator = crypt_iv_eboiv_gen +}; + /* * Integrity extensions */ @@ -2293,6 +2371,8 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) cc->iv_gen_ops = &crypt_iv_benbi_ops; else if (strcmp(ivmode, "null") == 0) cc->iv_gen_ops = &crypt_iv_null_ops; + else if (strcmp(ivmode, "eboiv") == 0) + cc->iv_gen_ops = &crypt_iv_eboiv_ops; else if (strcmp(ivmode, "lmk") == 0) { cc->iv_gen_ops = &crypt_iv_lmk_ops; /* @@ -3093,7 +3173,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 18, 1}, + .version = {1, 19, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -- cgit v1.2.3-55-g7522 From 2e6023850e177dbaca21498ada04c5a5ac93f812 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 19 Jun 2019 17:05:54 -0400 Subject: dm snapshot: add optional discard support features discard_zeroes_cow - a discard issued to the snapshot device that maps to entire chunks to will zero the corresponding exception(s) in the snapshot's exception store. discard_passdown_origin - a discard to the snapshot device is passed down to the snapshot-origin's underlying device. This doesn't cause copy-out to the snapshot exception store because the snapshot-origin target is bypassed. The discard_passdown_origin feature depends on the discard_zeroes_cow feature being enabled. When these 2 features are enabled they allow a temporarily read-only device that has completely exhausted its free space to recover space. To do so dm-snapshot provides temporary buffer to accommodate writes that the temporarily read-only device cannot handle yet. Once the upper layer frees space (e.g. fstrim to XFS) the discards issued to the dm-snapshot target will be issued to underlying read-only device whose free space was exhausted. In addition those discards will also cause zeroes to be written to the snapshot exception store if corresponding exceptions exist. If the underlying origin device provides deduplication for zero blocks then if/when the snapshot is merged backed to the origin those blocks will become unused. Once the origin has gained adequate space, merging the snapshot back to the thinly provisioned device will permit continued use of that device without the temporary space provided by the snapshot. Requested-by: John Dorminy Signed-off-by: Mike Snitzer --- Documentation/device-mapper/snapshot.txt | 16 +++ drivers/md/dm-snap.c | 186 +++++++++++++++++++++++++++---- 2 files changed, 181 insertions(+), 21 deletions(-) (limited to 'drivers/md') diff --git a/Documentation/device-mapper/snapshot.txt b/Documentation/device-mapper/snapshot.txt index b8bbb516f989..1810833f6dc6 100644 --- a/Documentation/device-mapper/snapshot.txt +++ b/Documentation/device-mapper/snapshot.txt @@ -31,6 +31,7 @@ its visible content unchanged, at least until the fills up. *) snapshot + [<# feature args> []*] A snapshot of the block device is created. Changed chunks of sectors will be stored on the . Writes will @@ -53,8 +54,23 @@ When loading or unloading the snapshot target, the corresponding snapshot-origin or snapshot-merge target must be suspended. A failure to suspend the origin target could result in data corruption. +Optional features: + + discard_zeroes_cow - a discard issued to the snapshot device that + maps to entire chunks to will zero the corresponding exception(s) in + the snapshot's exception store. + + discard_passdown_origin - a discard to the snapshot device is passed + down to the snapshot-origin's underlying device. This doesn't cause + copy-out to the snapshot exception store because the snapshot-origin + target is bypassed. + + The discard_passdown_origin feature depends on the discard_zeroes_cow + feature being enabled. + * snapshot-merge + [<# feature args> []*] takes the same table arguments as the snapshot target except it only works with persistent snapshots. This target assumes the role of the diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3107f2b1988b..63916e1dc569 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1,6 +1,4 @@ /* - * dm-snapshot.c - * * Copyright (C) 2001-2002 Sistina Software (UK) Limited. * * This file is released under the GPL. @@ -134,7 +132,10 @@ struct dm_snapshot { * - I/O error while merging * => stop merging; set merge_failed; process I/O normally. */ - int merge_failed; + bool merge_failed:1; + + bool discard_zeroes_cow:1; + bool discard_passdown_origin:1; /* * Incoming bios that overlap with chunks being merged must wait @@ -1173,12 +1174,64 @@ static void stop_merge(struct dm_snapshot *s) clear_bit(SHUTDOWN_MERGE, &s->state_bits); } +static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s, + struct dm_target *ti) +{ + int r; + unsigned argc; + const char *arg_name; + + static const struct dm_arg _args[] = { + {0, 2, "Invalid number of feature arguments"}, + }; + + /* + * No feature arguments supplied. + */ + if (!as->argc) + return 0; + + r = dm_read_arg_group(_args, as, &argc, &ti->error); + if (r) + return -EINVAL; + + while (argc && !r) { + arg_name = dm_shift_arg(as); + argc--; + + if (!strcasecmp(arg_name, "discard_zeroes_cow")) + s->discard_zeroes_cow = true; + + else if (!strcasecmp(arg_name, "discard_passdown_origin")) + s->discard_passdown_origin = true; + + else { + ti->error = "Unrecognised feature requested"; + r = -EINVAL; + break; + } + } + + if (!s->discard_zeroes_cow && s->discard_passdown_origin) { + /* + * TODO: really these are disjoint.. but ti->num_discard_bios + * and dm_bio_get_target_bio_nr() require rigid constraints. + */ + ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow"; + r = -EINVAL; + } + + return r; +} + /* - * Construct a snapshot mapping: + * Construct a snapshot mapping: + * [<# feature args> []*] */ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct dm_snapshot *s; + struct dm_arg_set as; int i; int r = -EINVAL; char *origin_path, *cow_path; @@ -1186,8 +1239,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned args_used, num_flush_bios = 1; fmode_t origin_mode = FMODE_READ; - if (argc != 4) { - ti->error = "requires exactly 4 arguments"; + if (argc < 4) { + ti->error = "requires 4 or more arguments"; r = -EINVAL; goto bad; } @@ -1204,6 +1257,13 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } + as.argc = argc; + as.argv = argv; + dm_consume_args(&as, 4); + r = parse_snapshot_features(&as, s, ti); + if (r) + goto bad_features; + origin_path = argv[0]; argv++; argc--; @@ -1289,6 +1349,8 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = s; ti->num_flush_bios = num_flush_bios; + if (s->discard_zeroes_cow) + ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1); ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk); /* Add snapshot to the list of snapshots for this origin */ @@ -1336,29 +1398,22 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) bad_read_metadata: unregister_snapshot(s); - bad_load_and_register: mempool_exit(&s->pending_pool); - bad_pending_pool: dm_kcopyd_client_destroy(s->kcopyd_client); - bad_kcopyd: dm_exception_table_exit(&s->pending, pending_cache); dm_exception_table_exit(&s->complete, exception_cache); - bad_hash_tables: dm_exception_store_destroy(s->store); - bad_store: dm_put_device(ti, s->cow); - bad_cow: dm_put_device(ti, s->origin); - bad_origin: +bad_features: kfree(s); - bad: return r; } @@ -1806,6 +1861,37 @@ static void remap_exception(struct dm_snapshot *s, struct dm_exception *e, (bio->bi_iter.bi_sector & s->store->chunk_mask); } +static void zero_callback(int read_err, unsigned long write_err, void *context) +{ + struct bio *bio = context; + struct dm_snapshot *s = bio->bi_private; + + up(&s->cow_count); + bio->bi_status = write_err ? BLK_STS_IOERR : 0; + bio_endio(bio); +} + +static void zero_exception(struct dm_snapshot *s, struct dm_exception *e, + struct bio *bio, chunk_t chunk) +{ + struct dm_io_region dest; + + dest.bdev = s->cow->bdev; + dest.sector = bio->bi_iter.bi_sector; + dest.count = s->store->chunk_size; + + down(&s->cow_count); + WARN_ON_ONCE(bio->bi_private); + bio->bi_private = s; + dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio); +} + +static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio) +{ + return bio->bi_iter.bi_size == + (s->store->chunk_size << SECTOR_SHIFT); +} + static int snapshot_map(struct dm_target *ti, struct bio *bio) { struct dm_exception *e; @@ -1839,10 +1925,43 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) goto out_unlock; } + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { + if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) { + /* + * passdown discard to origin (without triggering + * snapshot exceptions via do_origin; doing so would + * defeat the goal of freeing space in origin that is + * implied by the "discard_passdown_origin" feature) + */ + bio_set_dev(bio, s->origin->bdev); + track_chunk(s, bio, chunk); + goto out_unlock; + } + /* discard to snapshot (target_bio_nr == 0) zeroes exceptions */ + } + /* If the block is already remapped - use that, else remap it */ e = dm_lookup_exception(&s->complete, chunk); if (e) { remap_exception(s, e, bio, chunk); + if (unlikely(bio_op(bio) == REQ_OP_DISCARD) && + io_overlaps_chunk(s, bio)) { + dm_exception_table_unlock(&lock); + up_read(&s->lock); + zero_exception(s, e, bio, chunk); + r = DM_MAPIO_SUBMITTED; /* discard is not issued */ + goto out; + } + goto out_unlock; + } + + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { + /* + * If no exception exists, complete discard immediately + * otherwise it'll trigger copy-out. + */ + bio_endio(bio); + r = DM_MAPIO_SUBMITTED; goto out_unlock; } @@ -1890,9 +2009,7 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio) r = DM_MAPIO_SUBMITTED; - if (!pe->started && - bio->bi_iter.bi_size == - (s->store->chunk_size << SECTOR_SHIFT)) { + if (!pe->started && io_overlaps_chunk(s, bio)) { pe->started = 1; dm_exception_table_unlock(&lock); @@ -2138,6 +2255,7 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, { unsigned sz = 0; struct dm_snapshot *snap = ti->private; + unsigned num_features; switch (type) { case STATUSTYPE_INFO: @@ -2178,8 +2296,16 @@ static void snapshot_status(struct dm_target *ti, status_type_t type, * make sense. */ DMEMIT("%s %s", snap->origin->name, snap->cow->name); - snap->store->type->status(snap->store, type, result + sz, - maxlen - sz); + sz += snap->store->type->status(snap->store, type, result + sz, + maxlen - sz); + num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin; + if (num_features) { + DMEMIT(" %u", num_features); + if (snap->discard_zeroes_cow) + DMEMIT(" discard_zeroes_cow"); + if (snap->discard_passdown_origin) + DMEMIT(" discard_passdown_origin"); + } break; } } @@ -2198,6 +2324,22 @@ static int snapshot_iterate_devices(struct dm_target *ti, return r; } +static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct dm_snapshot *snap = ti->private; + + if (snap->discard_zeroes_cow) { + struct dm_snapshot *snap_src = NULL, *snap_dest = NULL; + + (void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL); + if (snap_src && snap_dest) + snap = snap_src; + + /* All discards are split on chunk_size boundary */ + limits->discard_granularity = snap->store->chunk_size; + limits->max_discard_sectors = snap->store->chunk_size; + } +} /*----------------------------------------------------------------- * Origin methods @@ -2522,7 +2664,7 @@ static struct target_type origin_target = { static struct target_type snapshot_target = { .name = "snapshot", - .version = {1, 15, 0}, + .version = {1, 16, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, @@ -2532,11 +2674,12 @@ static struct target_type snapshot_target = { .resume = snapshot_resume, .status = snapshot_status, .iterate_devices = snapshot_iterate_devices, + .io_hints = snapshot_io_hints, }; static struct target_type merge_target = { .name = dm_snapshot_merge_target_name, - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = snapshot_ctr, .dtr = snapshot_dtr, @@ -2547,6 +2690,7 @@ static struct target_type merge_target = { .resume = snapshot_merge_resume, .status = snapshot_status, .iterate_devices = snapshot_iterate_devices, + .io_hints = snapshot_io_hints, }; static int __init dm_snapshot_init(void) -- cgit v1.2.3-55-g7522 From bd293d071ffe65e645b4d8104f9d8fe15ea13862 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Tue, 9 Jul 2019 17:17:19 -0700 Subject: dm bufio: fix deadlock with loop device When thin-volume is built on loop device, if available memory is low, the following deadlock can be triggered: One process P1 allocates memory with GFP_FS flag, direct alloc fails, memory reclaim invokes memory shrinker in dm_bufio, dm_bufio_shrink_scan() runs, mutex dm_bufio_client->lock is acquired, then P1 waits for dm_buffer IO to complete in __try_evict_buffer(). But this IO may never complete if issued to an underlying loop device that forwards it using direct-IO, which allocates memory using GFP_KERNEL (see: do_blockdev_direct_IO()). If allocation fails, memory reclaim will invoke memory shrinker in dm_bufio, dm_bufio_shrink_scan() will be invoked, and since the mutex is already held by P1 the loop thread will hang, and IO will never complete. Resulting in ABBA deadlock. Cc: stable@vger.kernel.org Signed-off-by: Junxiao Bi Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2a48ea3f1b30..b6b5acc92ca2 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1599,9 +1599,7 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long freed; c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_FS) - dm_bufio_lock(c); - else if (!dm_bufio_trylock(c)) + if (!dm_bufio_trylock(c)) return SHRINK_STOP; freed = __scan(c, sc->nr_to_scan, sc->gfp_mask); -- cgit v1.2.3-55-g7522