From 751f188dd5ab95b3f2b5f2f467c38aae5a2877eb Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Fri, 20 Jul 2012 14:25:03 +0100
Subject: dm raid1: fix crash with mirror recovery and discard

This patch fixes a crash when a discard request is sent during mirror recovery.

Firstly, some background. Generally, the following sequence happens during mirror synchronization:
- function do_recovery is called
- do_recovery calls dm_rh_recovery_prepare
- dm_rh_recovery_prepare uses a semaphore to limit the number of simultaneously recovered regions (by default the semaphore value is 1, so only one region at a time is recovered)
- dm_rh_recovery_prepare calls __rh_recovery_prepare, which asks the log driver for the next region to recover. Then, it sets the region state to DM_RH_RECOVERING. If there are no pending I/Os on this region, the region is added to the quiesced_regions list. If there are pending I/Os, the region is not added to any list; it is added to the quiesced_regions list later (by the dm_rh_dec function) when all I/Os finish.
- when the region is on the quiesced_regions list, there are no I/Os in flight on this region. The region is popped from the list in the dm_rh_recovery_start function. Then, a kcopyd job is started in the recover function.
- when the kcopyd job finishes, recovery_complete is called. It calls dm_rh_recovery_end, which adds the region to the recovered_regions or failed_recovered_regions list (depending on whether the copy operation was successful or not).

The above mechanism assumes that no new I/Os are started on a region while it is in the DM_RH_RECOVERING state. When I/O is started, dm_rh_inc_pending is called, which increases the reg->pending count. When I/O is finished, dm_rh_dec is called; it decreases the reg->pending count. If the count reaches zero and the region was in the DM_RH_RECOVERING state, dm_rh_dec adds it to the quiesced_regions list.

Consequently, if we call dm_rh_inc_pending/dm_rh_dec while the region is in the DM_RH_RECOVERING state, it could be added to the quiesced_regions list multiple times, or it could be added to this list while kcopyd is copying data (it is assumed that the region is not on any list while kcopyd does its job). This results in memory corruption and a crash.

There already exist bypasses for REQ_FLUSH requests: REQ_FLUSH requests do not belong to any region, so they are always added to the sync list in do_writes. dm_rh_inc_pending does not increase the count for REQ_FLUSH requests. In mirror_end_io, dm_rh_dec is never called for REQ_FLUSH requests. These bypasses avoid the crash possibility described above.

These bypasses were improperly implemented for REQ_DISCARD when the mirror target gained discard support in commit 5fc2ffeabb9ee0fc0e71ff16b49f34f0ed3d05b4 (dm raid1: support discard). In do_writes, REQ_DISCARD requests are always added to the sync queue and immediately dispatched (even if the region is in DM_RH_RECOVERING). However, dm_rh_inc and dm_rh_dec are called for REQ_DISCARD requests, so the rule that no I/Os are started on DM_RH_RECOVERING regions is violated, causing the list corruption described above.

This patch changes it so that REQ_DISCARD requests follow the same path as REQ_FLUSH. This avoids the crash.
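The rule the fix restores can be sketched in a few lines of user-space C (placeholder flag bits and simplified types, not the kernel's own definitions): any bio carrying REQ_FLUSH or REQ_DISCARD bypasses per-region accounting entirely, so it can never re-add a DM_RH_RECOVERING region to the quiesced list.

/*
 * User-space sketch only: flag bits and types are placeholders, not the
 * kernel definitions.  Flush and discard bios never touch the per-region
 * pending count, so they cannot re-queue a DM_RH_RECOVERING region.
 */
#include <stdbool.h>
#include <stdint.h>

#define REQ_FLUSH   (1u << 0)   /* placeholder bit values */
#define REQ_DISCARD (1u << 1)

struct region {
	int pending;		/* accounted in-flight I/Os */
	bool recovering;	/* DM_RH_RECOVERING */
};

static bool bypasses_region_accounting(uint32_t bi_rw)
{
	/* same test the patch applies in dm_rh_inc_pending and mirror_end_io */
	return bi_rw & (REQ_FLUSH | REQ_DISCARD);
}

static void rh_inc_pending(struct region *reg, uint32_t bi_rw)
{
	if (bypasses_region_accounting(bi_rw))
		return;
	reg->pending++;
}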
Reference: https://bugzilla.redhat.com/837607 Signed-off-by: Mikulas Patocka Cc: stable@kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-region-hash.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index d039de8322f0..ea16984c0f08 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1214,7 +1214,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, * We need to dec pending if this was a write. */ if (rw == WRITE) { - if (!(bio->bi_rw & REQ_FLUSH)) + if (!(bio->bi_rw & (REQ_FLUSH | REQ_DISCARD))) dm_rh_dec(ms->rh, map_context->ll); return error; } diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c index 7771ed212182..69732e03eb34 100644 --- a/drivers/md/dm-region-hash.c +++ b/drivers/md/dm-region-hash.c @@ -404,6 +404,9 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio) return; } + if (bio->bi_rw & REQ_DISCARD) + return; + /* We must inform the log that the sync count has changed. */ log->type->set_region_sync(log, region, 0); @@ -524,7 +527,7 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) struct bio *bio; for (bio = bios->head; bio; bio = bio->bi_next) { - if (bio->bi_rw & REQ_FLUSH) + if (bio->bi_rw & (REQ_FLUSH | REQ_DISCARD)) continue; rh_inc(rh, dm_rh_bio_to_region(rh, bio)); } -- cgit v1.2.3-55-g7522 From 650d2a06b4fe1cc1d218c20e256650f68bf0ca31 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 20 Jul 2012 14:25:05 +0100 Subject: dm thin: do not send discards to shared blocks When process_discard receives a partial discard that doesn't cover a full block, it sends this discard down to that block. Unfortunately, the block can be shared and the discard would corrupt the other snapshots sharing this block. This patch detects block sharing and ends the discard with success when sending it to the shared block. The above change means that if the device supports discard it can't be guaranteed that a discard request zeroes data. Therefore, we set ti->discard_zeroes_data_unsupported. Thin target discard support with this bug arrived in commit 104655fd4dcebd50068ef30253a001da72e3a081 (dm thin: support discards). 
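The decision the patch introduces can be summarised with a small illustrative C sketch (simplified stand-in types, not the real thin-pool structures): only an unshared block with passdown enabled sees the discard; every other case completes the bio successfully without issuing I/O.

/*
 * Illustrative sketch with simplified stand-in types (not the thin-pool
 * structures): what process_discard() does with a block-aligned discard.
 */
#include <stdbool.h>

struct lookup_result { unsigned long long block; bool shared; };

enum discard_action { PASS_DISCARD_DOWN, COMPLETE_WITHOUT_IO };

static enum discard_action choose_action(struct lookup_result lr,
					 bool discard_passdown)
{
	if (!lr.shared && discard_passdown)
		return PASS_DISCARD_DOWN;	/* remap_and_issue() */
	return COMPLETE_WITHOUT_IO;		/* bio_endio(bio, 0) */
}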
Signed-off-by: Mikulas Patocka Cc: stable@kernel.org Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index ce59824fb414..68694da0d21d 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1245,7 +1245,10 @@ static void process_discard(struct thin_c *tc, struct bio *bio) cell_release_singleton(cell, bio); cell_release_singleton(cell2, bio); - remap_and_issue(tc, bio, lookup_result.block); + if ((!lookup_result.shared) && pool->pf.discard_passdown) + remap_and_issue(tc, bio, lookup_result.block); + else + bio_endio(bio, 0); } break; @@ -2628,6 +2631,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (tc->pool->pf.discard_enabled) { ti->discards_supported = 1; ti->num_discard_requests = 1; + ti->discard_zeroes_data_unsupported = 1; } dm_put(pool_md); -- cgit v1.2.3-55-g7522 From 7c8d3a42fe1c58a7e8fd3f6a013e7d7b474ff931 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 20 Jul 2012 14:25:07 +0100 Subject: dm raid1: set discard_zeroes_data_unsupported We can't guarantee that REQ_DISCARD on dm-mirror zeroes the data even if the underlying disks support zero on discard. So this patch sets ti->discard_zeroes_data_unsupported. For example, if the mirror is in the process of resynchronizing, it may happen that kcopyd reads a piece of data, then discard is sent on the same area and then kcopyd writes the piece of data to another leg. Consequently, the data is not zeroed. The flag was made available by commit 983c7db347db8ce2d8453fd1d89b7a4bb6920d56 (dm crypt: always disable discard_zeroes_data). Signed-off-by: Mikulas Patocka Cc: stable@kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid1.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index ea16984c0f08..b58b7a33914a 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1084,6 +1084,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->split_io = dm_rh_get_region_size(ms->rh); ti->num_flush_requests = 1; ti->num_discard_requests = 1; + ti->discard_zeroes_data_unsupported = 1; ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); -- cgit v1.2.3-55-g7522 From 7768ed33ccdc02801c4483fc5682dc66ace14aea Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Fri, 27 Jul 2012 15:07:57 +0100 Subject: dm thin: reduce endio_hook pool size Reduce the slab size used for the dm_thin_endio_hook mempool. Allocation has been seen to fail on machines with smaller amounts of memory due to fragmentation. lvm: page allocation failure. 
order:5, mode:0xd0 device-mapper: table: 253:38: thin-pool: Error creating pool's endio_hook mempool Cc: stable@vger.kernel.org Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 68694da0d21d..18f87b0def12 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -19,7 +19,7 @@ /* * Tunable constants */ -#define ENDIO_HOOK_POOL_SIZE 10240 +#define ENDIO_HOOK_POOL_SIZE 1024 #define DEFERRED_SET_SIZE 64 #define MAPPING_POOL_SIZE 1024 #define PRISON_CELLS 1024 -- cgit v1.2.3-55-g7522 From 17b7d63f7ed10376e762fdfadbc65da6687d569a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 27 Jul 2012 15:07:57 +0100 Subject: dm thin: clean up compiler warning Clean up "warning: dubious: !x & y". Also make it clear that __snapshotted_since() returns a bool and that dm_thin_lookup_result's 'shared' member is a flag. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin-metadata.c | 2 +- drivers/md/dm-thin-metadata.h | 2 +- drivers/md/dm-thin.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 3e2907f0bc46..c858931d2dcb 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1262,7 +1262,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) return td->id; } -static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) +static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time) { return td->snapshotted_time > time; } diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index b88918ccdaf6..7b47c0a9a8e3 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -119,7 +119,7 @@ dm_thin_id dm_thin_dev_id(struct dm_thin_device *td); struct dm_thin_lookup_result { dm_block_t block; - int shared; + unsigned shared:1; }; /* diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 18f87b0def12..0bb9e646e215 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1218,7 +1218,7 @@ static void process_discard(struct thin_c *tc, struct bio *bio) */ m = get_next_mapping(pool); m->tc = tc; - m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; + m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown; m->virt_block = block; m->data_block = lookup_result.block; m->cell = cell; -- cgit v1.2.3-55-g7522 From 3caf6d73d4dc163b2d135e0b52b052a2b63e5216 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 27 Jul 2012 15:07:58 +0100 Subject: dm persistent data: remove debug space map checker Remove debug space map checker from dm persistent data. The space map checker is a wrapper for other space maps that double checks the reference counts are correct. It holds all these reference counts in memory rather than on disk, so uses a lot of memory and is thus restricted to small pools. As yet, this checker hasn't found any issues, but has caused a few of its own due to people turning it on by default with larger pools. Removing. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/Kconfig | 9 - drivers/md/persistent-data/Makefile | 1 - drivers/md/persistent-data/dm-space-map-checker.c | 446 --------------------- drivers/md/persistent-data/dm-space-map-checker.h | 26 -- drivers/md/persistent-data/dm-space-map-disk.c | 34 +- .../md/persistent-data/dm-transaction-manager.c | 29 +- 6 files changed, 11 insertions(+), 534 deletions(-) delete mode 100644 drivers/md/persistent-data/dm-space-map-checker.c delete mode 100644 drivers/md/persistent-data/dm-space-map-checker.h (limited to 'drivers/md') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 10f122a3a856..1eee45b69b71 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -260,15 +260,6 @@ config DM_DEBUG_BLOCK_STACK_TRACING If unsure, say N. -config DM_DEBUG_SPACE_MAPS - boolean "Extra validation for thin provisioning space maps" - depends on DM_THIN_PROVISIONING - ---help--- - Enable this for messages that may help debug problems with the - space maps used by thin provisioning. - - If unsure, say N. - config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM diff --git a/drivers/md/persistent-data/Makefile b/drivers/md/persistent-data/Makefile index cfa95f662230..d8e7cb767c1e 100644 --- a/drivers/md/persistent-data/Makefile +++ b/drivers/md/persistent-data/Makefile @@ -1,7 +1,6 @@ obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o dm-persistent-data-objs := \ dm-block-manager.o \ - dm-space-map-checker.o \ dm-space-map-common.o \ dm-space-map-disk.o \ dm-space-map-metadata.o \ diff --git a/drivers/md/persistent-data/dm-space-map-checker.c b/drivers/md/persistent-data/dm-space-map-checker.c deleted file mode 100644 index fc90c11620ad..000000000000 --- a/drivers/md/persistent-data/dm-space-map-checker.c +++ /dev/null @@ -1,446 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. 
- */ - -#include "dm-space-map-checker.h" - -#include -#include -#include - -#ifdef CONFIG_DM_DEBUG_SPACE_MAPS - -#define DM_MSG_PREFIX "space map checker" - -/*----------------------------------------------------------------*/ - -struct count_array { - dm_block_t nr; - dm_block_t nr_free; - - uint32_t *counts; -}; - -static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count) -{ - if (b >= ca->nr) - return -EINVAL; - - *count = ca->counts[b]; - return 0; -} - -static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r) -{ - if (b >= ca->nr) - return -EINVAL; - - *r = ca->counts[b] > 1; - return 0; -} - -static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count) -{ - uint32_t old_count; - - if (b >= ca->nr) - return -EINVAL; - - old_count = ca->counts[b]; - - if (!count && old_count) - ca->nr_free++; - - else if (count && !old_count) - ca->nr_free--; - - ca->counts[b] = count; - return 0; -} - -static int ca_inc_block(struct count_array *ca, dm_block_t b) -{ - if (b >= ca->nr) - return -EINVAL; - - ca_set_count(ca, b, ca->counts[b] + 1); - return 0; -} - -static int ca_dec_block(struct count_array *ca, dm_block_t b) -{ - if (b >= ca->nr) - return -EINVAL; - - BUG_ON(ca->counts[b] == 0); - ca_set_count(ca, b, ca->counts[b] - 1); - return 0; -} - -static int ca_create(struct count_array *ca, struct dm_space_map *sm) -{ - int r; - dm_block_t nr_blocks; - - r = dm_sm_get_nr_blocks(sm, &nr_blocks); - if (r) - return r; - - ca->nr = nr_blocks; - ca->nr_free = nr_blocks; - - if (!nr_blocks) - ca->counts = NULL; - else { - ca->counts = vzalloc(sizeof(*ca->counts) * nr_blocks); - if (!ca->counts) - return -ENOMEM; - } - - return 0; -} - -static void ca_destroy(struct count_array *ca) -{ - vfree(ca->counts); -} - -static int ca_load(struct count_array *ca, struct dm_space_map *sm) -{ - int r; - uint32_t count; - dm_block_t nr_blocks, i; - - r = dm_sm_get_nr_blocks(sm, &nr_blocks); - if (r) - return r; - - BUG_ON(ca->nr != nr_blocks); - - DMWARN("Loading debug space map from disk. 
This may take some time"); - for (i = 0; i < nr_blocks; i++) { - r = dm_sm_get_count(sm, i, &count); - if (r) { - DMERR("load failed"); - return r; - } - - ca_set_count(ca, i, count); - } - DMWARN("Load complete"); - - return 0; -} - -static int ca_extend(struct count_array *ca, dm_block_t extra_blocks) -{ - dm_block_t nr_blocks = ca->nr + extra_blocks; - uint32_t *counts = vzalloc(sizeof(*counts) * nr_blocks); - if (!counts) - return -ENOMEM; - - if (ca->counts) { - memcpy(counts, ca->counts, sizeof(*counts) * ca->nr); - ca_destroy(ca); - } - ca->nr = nr_blocks; - ca->nr_free += extra_blocks; - ca->counts = counts; - return 0; -} - -static int ca_commit(struct count_array *old, struct count_array *new) -{ - if (old->nr != new->nr) { - BUG_ON(old->nr > new->nr); - ca_extend(old, new->nr - old->nr); - } - - BUG_ON(old->nr != new->nr); - old->nr_free = new->nr_free; - memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr); - return 0; -} - -/*----------------------------------------------------------------*/ - -struct sm_checker { - struct dm_space_map sm; - - struct count_array old_counts; - struct count_array counts; - - struct dm_space_map *real_sm; -}; - -static void sm_checker_destroy(struct dm_space_map *sm) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - - dm_sm_destroy(smc->real_sm); - ca_destroy(&smc->old_counts); - ca_destroy(&smc->counts); - kfree(smc); -} - -static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_get_nr_blocks(smc->real_sm, count); - if (!r) - BUG_ON(smc->old_counts.nr != *count); - return r; -} - -static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_get_nr_free(smc->real_sm, count); - if (!r) { - /* - * Slow, but we know it's correct. 
- */ - dm_block_t b, n = 0; - for (b = 0; b < smc->old_counts.nr; b++) - if (smc->old_counts.counts[b] == 0 && - smc->counts.counts[b] == 0) - n++; - - if (n != *count) - DMERR("free block counts differ, checker %u, sm-disk:%u", - (unsigned) n, (unsigned) *count); - } - return r; -} - -static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_new_block(smc->real_sm, b); - - if (!r) { - BUG_ON(*b >= smc->old_counts.nr); - BUG_ON(smc->old_counts.counts[*b] != 0); - BUG_ON(*b >= smc->counts.nr); - BUG_ON(smc->counts.counts[*b] != 0); - ca_set_count(&smc->counts, *b, 1); - } - - return r; -} - -static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_inc_block(smc->real_sm, b); - int r2 = ca_inc_block(&smc->counts, b); - BUG_ON(r != r2); - return r; -} - -static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_dec_block(smc->real_sm, b); - int r2 = ca_dec_block(&smc->counts, b); - BUG_ON(r != r2); - return r; -} - -static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - uint32_t result2 = 0; - int r = dm_sm_get_count(smc->real_sm, b, result); - int r2 = ca_get_count(&smc->counts, b, &result2); - - BUG_ON(r != r2); - if (!r) - BUG_ON(*result != result2); - return r; -} - -static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int result2 = 0; - int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result); - int r2 = ca_count_more_than_one(&smc->counts, b, &result2); - - BUG_ON(r != r2); - if (!r) - BUG_ON(!(*result) && result2); - return r; -} - -static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - uint32_t old_rc; - int r = dm_sm_set_count(smc->real_sm, b, count); - int r2; - - BUG_ON(b >= smc->counts.nr); - old_rc = smc->counts.counts[b]; - r2 = ca_set_count(&smc->counts, b, count); - BUG_ON(r != r2); - - return r; -} - -static int sm_checker_commit(struct dm_space_map *sm) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r; - - r = dm_sm_commit(smc->real_sm); - if (r) - return r; - - r = ca_commit(&smc->old_counts, &smc->counts); - if (r) - return r; - - return 0; -} - -static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - int r = dm_sm_extend(smc->real_sm, extra_blocks); - if (r) - return r; - - return ca_extend(&smc->counts, extra_blocks); -} - -static int sm_checker_root_size(struct dm_space_map *sm, size_t *result) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - return dm_sm_root_size(smc->real_sm, result); -} - -static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len) -{ - struct sm_checker *smc = container_of(sm, struct sm_checker, sm); - return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len); -} - -/*----------------------------------------------------------------*/ - -static struct dm_space_map ops_ = { - .destroy = sm_checker_destroy, - .get_nr_blocks = sm_checker_get_nr_blocks, 
- .get_nr_free = sm_checker_get_nr_free, - .inc_block = sm_checker_inc_block, - .dec_block = sm_checker_dec_block, - .new_block = sm_checker_new_block, - .get_count = sm_checker_get_count, - .count_is_more_than_one = sm_checker_count_more_than_one, - .set_count = sm_checker_set_count, - .commit = sm_checker_commit, - .extend = sm_checker_extend, - .root_size = sm_checker_root_size, - .copy_root = sm_checker_copy_root -}; - -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) -{ - int r; - struct sm_checker *smc; - - if (IS_ERR_OR_NULL(sm)) - return ERR_PTR(-EINVAL); - - smc = kmalloc(sizeof(*smc), GFP_KERNEL); - if (!smc) - return ERR_PTR(-ENOMEM); - - memcpy(&smc->sm, &ops_, sizeof(smc->sm)); - r = ca_create(&smc->old_counts, sm); - if (r) { - kfree(smc); - return ERR_PTR(r); - } - - r = ca_create(&smc->counts, sm); - if (r) { - ca_destroy(&smc->old_counts); - kfree(smc); - return ERR_PTR(r); - } - - smc->real_sm = sm; - - r = ca_load(&smc->counts, sm); - if (r) { - ca_destroy(&smc->counts); - ca_destroy(&smc->old_counts); - kfree(smc); - return ERR_PTR(r); - } - - r = ca_commit(&smc->old_counts, &smc->counts); - if (r) { - ca_destroy(&smc->counts); - ca_destroy(&smc->old_counts); - kfree(smc); - return ERR_PTR(r); - } - - return &smc->sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create); - -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) -{ - int r; - struct sm_checker *smc; - - if (IS_ERR_OR_NULL(sm)) - return ERR_PTR(-EINVAL); - - smc = kmalloc(sizeof(*smc), GFP_KERNEL); - if (!smc) - return ERR_PTR(-ENOMEM); - - memcpy(&smc->sm, &ops_, sizeof(smc->sm)); - r = ca_create(&smc->old_counts, sm); - if (r) { - kfree(smc); - return ERR_PTR(r); - } - - r = ca_create(&smc->counts, sm); - if (r) { - ca_destroy(&smc->old_counts); - kfree(smc); - return ERR_PTR(r); - } - - smc->real_sm = sm; - return &smc->sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); - -/*----------------------------------------------------------------*/ - -#else - -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm) -{ - return sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create); - -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm) -{ - return sm; -} -EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/drivers/md/persistent-data/dm-space-map-checker.h b/drivers/md/persistent-data/dm-space-map-checker.h deleted file mode 100644 index 444dccf6688c..000000000000 --- a/drivers/md/persistent-data/dm-space-map-checker.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (C) 2011 Red Hat, Inc. - * - * This file is released under the GPL. - */ - -#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H -#define SNAPSHOTS_SPACE_MAP_CHECKER_H - -#include "dm-space-map.h" - -/*----------------------------------------------------------------*/ - -/* - * This space map wraps a real on-disk space map, and verifies all of its - * operations. It uses a lot of memory, so only use if you have a specific - * problem that you're debugging. - * - * Ownership of @sm passes. 
- */ -struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm); -struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm); - -/*----------------------------------------------------------------*/ - -#endif diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index 3d0ed5332883..f6d29e614ab7 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -4,7 +4,6 @@ * This file is released under the GPL. */ -#include "dm-space-map-checker.h" #include "dm-space-map-common.h" #include "dm-space-map-disk.h" #include "dm-space-map.h" @@ -252,9 +251,8 @@ static struct dm_space_map ops = { .copy_root = sm_disk_copy_root }; -static struct dm_space_map *dm_sm_disk_create_real( - struct dm_transaction_manager *tm, - dm_block_t nr_blocks) +struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, + dm_block_t nr_blocks) { int r; struct sm_disk *smd; @@ -285,27 +283,10 @@ bad: kfree(smd); return ERR_PTR(r); } - -struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm, - dm_block_t nr_blocks) -{ - struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks); - struct dm_space_map *smc; - - if (IS_ERR_OR_NULL(sm)) - return sm; - - smc = dm_sm_checker_create_fresh(sm); - if (IS_ERR(smc)) - dm_sm_destroy(sm); - - return smc; -} EXPORT_SYMBOL_GPL(dm_sm_disk_create); -static struct dm_space_map *dm_sm_disk_open_real( - struct dm_transaction_manager *tm, - void *root_le, size_t len) +struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, + void *root_le, size_t len) { int r; struct sm_disk *smd; @@ -332,13 +313,6 @@ bad: kfree(smd); return ERR_PTR(r); } - -struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm, - void *root_le, size_t len) -{ - return dm_sm_checker_create( - dm_sm_disk_open_real(tm, root_le, len)); -} EXPORT_SYMBOL_GPL(dm_sm_disk_open); /*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index e5604b32d91f..86c3705052a4 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -5,7 +5,6 @@ */ #include "dm-transaction-manager.h" #include "dm-space-map.h" -#include "dm-space-map-checker.h" #include "dm-space-map-disk.h" #include "dm-space-map-metadata.h" #include "dm-persistent-data-internal.h" @@ -319,15 +318,14 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, int create) { int r; - struct dm_space_map *inner; - inner = dm_sm_metadata_init(); - if (IS_ERR(inner)) - return PTR_ERR(inner); + *sm = dm_sm_metadata_init(); + if (IS_ERR(*sm)) + return PTR_ERR(*sm); - *tm = dm_tm_create(bm, inner); + *tm = dm_tm_create(bm, *sm); if (IS_ERR(*tm)) { - dm_sm_destroy(inner); + dm_sm_destroy(*sm); return PTR_ERR(*tm); } @@ -339,19 +337,13 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, goto bad1; } - r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm), + r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm), sb_location); if (r) { DMERR("couldn't create metadata space map"); goto bad2; } - *sm = dm_sm_checker_create(inner); - if (IS_ERR(*sm)) { - r = PTR_ERR(*sm); - goto bad2; - } - } else { r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location, sb_validator, sblock); @@ -360,19 +352,13 @@ static int dm_tm_create_internal(struct dm_block_manager *bm, goto bad1; } - r 
= dm_sm_metadata_open(inner, *tm, + r = dm_sm_metadata_open(*sm, *tm, dm_block_data(*sblock) + root_offset, root_max_len); if (r) { DMERR("couldn't open metadata space map"); goto bad2; } - - *sm = dm_sm_checker_create(inner); - if (IS_ERR(*sm)) { - r = PTR_ERR(*sm); - goto bad2; - } } return 0; @@ -381,7 +367,6 @@ bad2: dm_tm_unlock(*tm, *sblock); bad1: dm_tm_destroy(*tm); - dm_sm_destroy(inner); return r; } -- cgit v1.2.3-55-g7522 From d973ac196b7668c198f3c1338d8b07c13a3e7713 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 27 Jul 2012 15:07:58 +0100 Subject: dm thin metadata: remove pointless label from __commit_transaction Remove the pointless label 'out' from __commit_transaction in dm-thin-metadata.c Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin-metadata.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index c858931d2dcb..acd89ec825d5 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -597,31 +597,31 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) r = __write_changed_details(pmd); if (r < 0) - goto out; + return r; if (!pmd->need_commit) - goto out; + return r; r = dm_sm_commit(pmd->data_sm); if (r < 0) - goto out; + return r; r = dm_tm_pre_commit(pmd->tm); if (r < 0) - goto out; + return r; r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); if (r < 0) - goto out; + return r; r = dm_sm_root_size(pmd->data_sm, &data_len); if (r < 0) - goto out; + return r; r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, &sb_validator, &sblock); if (r) - goto out; + return r; disk_super = dm_block_data(sblock); disk_super->time = cpu_to_le32(pmd->time); @@ -644,7 +644,6 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) if (!r) pmd->need_commit = 0; -out: return r; out_locked: -- cgit v1.2.3-55-g7522 From 8c971178a788c70e8d6db5c3a813de1a1f54e5b7 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 27 Jul 2012 15:07:58 +0100 Subject: dm thin metadata: introduce THIN_MAX_CONCURRENT_LOCKS Introduce THIN_MAX_CONCURRENT_LOCKS into dm-thin-metadata to give a name to an otherwise "magic" number. 
Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin-metadata.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index acd89ec825d5..31f9827dfb56 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -80,6 +80,12 @@ #define THIN_METADATA_CACHE_SIZE 64 #define SECTOR_TO_BLOCK_SHIFT 3 +/* + * 3 for btree insert + + * 2 for btree lookup used within space map + */ +#define THIN_MAX_CONCURRENT_LOCKS 5 + /* This should be plenty */ #define SPACE_MAP_ROOT_SIZE 128 @@ -668,13 +674,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, return ERR_PTR(-ENOMEM); } - /* - * Max hex locks: - * 3 for btree insert + - * 2 for btree lookup used within space map - */ bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, - THIN_METADATA_CACHE_SIZE, 5); + THIN_METADATA_CACHE_SIZE, + THIN_MAX_CONCURRENT_LOCKS); if (!bm) { DMERR("could not create block manager"); kfree(pmd); -- cgit v1.2.3-55-g7522 From 70c48611024791ccf83aca6195b58a5db9325485 Mon Sep 17 00:00:00 2001 From: Alasdair G Kergon Date: Fri, 27 Jul 2012 15:07:59 +0100 Subject: dm snapshot: remove redundant assignment in merge fn Remove redundant bvm->bi_sector self-assignment in dm snapshot's origin_merge(). Signed-off-by: Alasdair G Kergon --- drivers/md/dm-snap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 6f758870fc19..a228c20e40b3 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2176,7 +2176,6 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, return max_size; bvm->bi_bdev = dev->bdev; - bvm->bi_sector = bvm->bi_sector; return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } -- cgit v1.2.3-55-g7522 From 1a66a08ae82b16eb40705ad112ff28873981af92 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Fri, 27 Jul 2012 15:07:59 +0100 Subject: dm: replace simple_strtoul Replace obsolete simple_strtoul() with kstrtou8/kstrtouint. 
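As a rough illustration of the pattern the series adopts (parse_stripe_count is a hypothetical helper, not code from the patch), kstrtouint() returns 0 on success and a negative errno otherwise, so the end-pointer bookkeeping needed with simple_strtoul() disappears:

/*
 * Sketch of the parsing pattern: kstrtouint() does the range and
 * trailing-garbage checking that previously required an end pointer.
 */
#include <linux/kernel.h>
#include <linux/errno.h>

static int parse_stripe_count(const char *arg, unsigned int *stripes)
{
	if (kstrtouint(arg, 10, stripes) || !*stripes)
		return -EINVAL;	/* not a valid non-zero decimal number */
	return 0;
}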
Signed-off-by: majianpeng Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 5 +---- drivers/md/dm-exception-store.c | 13 ++++--------- drivers/md/dm-stripe.c | 7 ++----- 3 files changed, 7 insertions(+), 18 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3f06df59fd82..e2b32401ecc7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1241,7 +1241,6 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) static int crypt_decode_key(u8 *key, char *hex, unsigned int size) { char buffer[3]; - char *endp; unsigned int i; buffer[2] = '\0'; @@ -1250,9 +1249,7 @@ static int crypt_decode_key(u8 *key, char *hex, unsigned int size) buffer[0] = *hex++; buffer[1] = *hex++; - key[i] = (u8)simple_strtoul(buffer, &endp, 16); - - if (endp != &buffer[2]) + if (kstrtou8(buffer, 16, &key[i])) return -EINVAL; } diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index aa70f7d43a1a..ebaa4f803eec 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -142,24 +142,19 @@ EXPORT_SYMBOL(dm_exception_store_type_unregister); static int set_chunk_size(struct dm_exception_store *store, const char *chunk_size_arg, char **error) { - unsigned long chunk_size_ulong; - char *value; + unsigned chunk_size; - chunk_size_ulong = simple_strtoul(chunk_size_arg, &value, 10); - if (*chunk_size_arg == '\0' || *value != '\0' || - chunk_size_ulong > UINT_MAX) { + if (kstrtouint(chunk_size_arg, 10, &chunk_size)) { *error = "Invalid chunk size"; return -EINVAL; } - if (!chunk_size_ulong) { + if (!chunk_size) { store->chunk_size = store->chunk_mask = store->chunk_shift = 0; return 0; } - return dm_exception_store_set_chunk_size(store, - (unsigned) chunk_size_ulong, - error); + return dm_exception_store_set_chunk_size(store, chunk_size, error); } int dm_exception_store_set_chunk_size(struct dm_exception_store *store, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 35c94ff24ad5..183db5d3e48e 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -99,7 +99,6 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) sector_t width; uint32_t stripes; uint32_t chunk_size; - char *end; int r; unsigned int i; @@ -108,14 +107,12 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - stripes = simple_strtoul(argv[0], &end, 10); - if (!stripes || *end) { + if (kstrtouint(argv[0], 10, &stripes) || !stripes) { ti->error = "Invalid stripe count"; return -EINVAL; } - chunk_size = simple_strtoul(argv[1], &end, 10); - if (*end) { + if (kstrtouint(argv[1], 10, &chunk_size)) { ti->error = "Invalid chunk_size"; return -EINVAL; } -- cgit v1.2.3-55-g7522 From f09996c993e256fce4b920588959866998d51250 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 27 Jul 2012 15:07:59 +0100 Subject: dm thin: provide specific errors for two table load failure cases Provide specific error message strings for two pool_ctr() failure cases that currently give just "Unknown error". Reference: test_two_pools_pointing_to_the_same_metadata_fails and test_different_pool_cant_replace_pool in thinp-test-suite. 
Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 0bb9e646e215..e89f8e7d8a33 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1828,15 +1828,19 @@ static struct pool *__pool_find(struct mapped_device *pool_md, struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); if (pool) { - if (pool->pool_md != pool_md) + if (pool->pool_md != pool_md) { + *error = "metadata device already in use by a pool"; return ERR_PTR(-EBUSY); + } __pool_inc(pool); } else { pool = __pool_table_lookup(pool_md); if (pool) { - if (pool->md_dev != metadata_dev) + if (pool->md_dev != metadata_dev) { + *error = "different pool cannot replace a pool"; return ERR_PTR(-EINVAL); + } __pool_inc(pool); } else { -- cgit v1.2.3-55-g7522 From f14fa693c93078444b5e95d7cad78ead0383ad50 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Jul 2012 15:08:00 +0100 Subject: dm stripe: fix size test dm-stripe is supposed to ensure that all the space allocated to the stripes is fully used and that all stripes are the same size. This patch fixes the test. It checks that device length is divisible by the chunk size and checks that the resulting quotient is divisible by the number of stripes (which is equivalent to testing if device length is divisible by chunk_size * stripes). Previously, the code only tested that the number of sectors in the target was divisible by each of the chunk size and the number of stripes separately, which could leave entire stripes unused. (A setup that genuinely needs some stripes to be shorter than others can be created by concatenating striped targets.) Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/striped.txt | 3 +-- drivers/md/dm-stripe.c | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/Documentation/device-mapper/striped.txt b/Documentation/device-mapper/striped.txt index f34d3236b9da..953fe1d68068 100644 --- a/Documentation/device-mapper/striped.txt +++ b/Documentation/device-mapper/striped.txt @@ -16,8 +16,7 @@ Parameters: [ ]+ : Starting sector within the device. One or more underlying devices can be specified. The striped device size must -be a multiple of the chunk size and a multiple of the number of underlying -devices. +be a multiple of the chunk size multiplied by the number of underlying devices. Example scripts diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 183db5d3e48e..1f6316ed2b31 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -132,7 +132,6 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - width = ti->len; if (sector_div(width, stripes)) { ti->error = "Target length not divisible by " "number of stripes"; -- cgit v1.2.3-55-g7522 From 1df05483d758ea43abc375869fbe06be506ba827 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Jul 2012 15:08:00 +0100 Subject: dm stripe: remove stripes_mask The structure stripe_c contains a stripes_mask field. This field is useless because it can be trivially calculated by subtracting one from stripes. It is used only at one place. This patch removes it. The patch also changes ffs(stripes) - 1 to __ffs(stripes). 
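The two identities this relies on when the stripe count is a power of two can be checked with a small user-space program (__builtin_ctzl stands in for the kernel's __ffs): ffs(x) - 1 equals __ffs(x), and chunk & (x - 1) equals the division remainder.

/*
 * User-space check of the identities; assumes a power-of-two stripe count.
 */
#include <assert.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	unsigned long stripes = 8;		/* power of two */
	unsigned long long chunk = 123456789ULL;

	assert((stripes & (stripes - 1)) == 0);		/* power-of-two test */
	assert(ffs((int)stripes) - 1 == __builtin_ctzl(stripes));
	assert(chunk % stripes == (chunk & (stripes - 1)));
	return 0;
}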
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 1f6316ed2b31..6931bd18b615 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -26,7 +26,6 @@ struct stripe { struct stripe_c { uint32_t stripes; int stripes_shift; - sector_t stripes_mask; /* The size of this target / num. stripes */ sector_t stripe_width; @@ -163,10 +162,8 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (stripes & (stripes - 1)) sc->stripes_shift = -1; - else { - sc->stripes_shift = ffs(stripes) - 1; - sc->stripes_mask = ((sector_t) stripes) - 1; - } + else + sc->stripes_shift = __ffs(stripes); ti->split_io = chunk_size; ti->num_flush_requests = stripes; @@ -218,7 +215,7 @@ static void stripe_map_sector(struct stripe_c *sc, sector_t sector, if (sc->stripes_shift < 0) *stripe = sector_div(chunk, sc->stripes); else { - *stripe = chunk & sc->stripes_mask; + *stripe = chunk & (sc->stripes - 1); chunk >>= sc->stripes_shift; } -- cgit v1.2.3-55-g7522 From 542f90381422676544382d4071ba44a2de90a0c1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 27 Jul 2012 15:08:00 +0100 Subject: dm: support non power of two target max_io_len Remove the restriction that limits a target's specified maximum incoming I/O size to be a power of 2. Rename this setting from 'split_io' to the less-ambiguous 'max_io_len'. Change it from sector_t to uint32_t, which is plenty big enough, and introduce a wrapper function dm_set_target_max_io_len() to set it. Use sector_div() to process it now that it is not necessarily a power of 2. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid.c | 11 +++++------ drivers/md/dm-raid1.c | 6 +++++- drivers/md/dm-snap.c | 27 +++++++++++++++------------ drivers/md/dm-stripe.c | 5 ++++- drivers/md/dm-thin.c | 5 ++++- drivers/md/dm.c | 35 +++++++++++++++++++++++++++-------- include/linux/device-mapper.h | 9 +++++++-- include/linux/dm-ioctl.h | 4 ++-- 8 files changed, 69 insertions(+), 33 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 017c34d78d61..858a8b70811c 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -353,6 +353,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, { unsigned i, rebuild_cnt = 0; unsigned long value, region_size = 0; + sector_t max_io_len; char *key; /* @@ -522,14 +523,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; if (rs->md.chunk_sectors) - rs->ti->split_io = rs->md.chunk_sectors; + max_io_len = rs->md.chunk_sectors; else - rs->ti->split_io = region_size; + max_io_len = region_size; - if (rs->md.chunk_sectors) - rs->ti->split_io = rs->md.chunk_sectors; - else - rs->ti->split_io = region_size; + if (dm_set_target_max_io_len(rs->ti, max_io_len)) + return -EINVAL; /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index b58b7a33914a..819ccba65912 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1081,7 +1081,11 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ti->private = ms; - ti->split_io = dm_rh_get_region_size(ms->rh); + + r = dm_set_target_max_io_len(ti, dm_rh_get_region_size(ms->rh)); + if (r) + goto err_free_context; + ti->num_flush_requests = 1; 
ti->num_discard_requests = 1; ti->discard_zeroes_data_unsupported = 1; diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index a228c20e40b3..6c0f3e33923a 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -691,7 +691,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) * Return a minimum chunk size of all snapshots that have the specified origin. * Return zero if the origin has no snapshots. */ -static sector_t __minimum_chunk_size(struct origin *o) +static uint32_t __minimum_chunk_size(struct origin *o) { struct dm_snapshot *snap; unsigned chunk_size = 0; @@ -701,7 +701,7 @@ static sector_t __minimum_chunk_size(struct origin *o) chunk_size = min_not_zero(chunk_size, snap->store->chunk_size); - return chunk_size; + return (uint32_t) chunk_size; } /* @@ -1172,7 +1172,10 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Chunk size not set"; goto bad_read_metadata; } - ti->split_io = s->store->chunk_size; + + r = dm_set_target_max_io_len(ti, s->store->chunk_size); + if (r) + goto bad_read_metadata; return 0; @@ -1239,7 +1242,7 @@ static void __handover_exceptions(struct dm_snapshot *snap_src, snap_dest->store->snap = snap_dest; snap_src->store->snap = snap_src; - snap_dest->ti->split_io = snap_dest->store->chunk_size; + snap_dest->ti->max_io_len = snap_dest->store->chunk_size; snap_dest->valid = snap_src->valid; /* @@ -1817,9 +1820,9 @@ static void snapshot_resume(struct dm_target *ti) up_write(&s->lock); } -static sector_t get_origin_minimum_chunksize(struct block_device *bdev) +static uint32_t get_origin_minimum_chunksize(struct block_device *bdev) { - sector_t min_chunksize; + uint32_t min_chunksize; down_read(&_origins_lock); min_chunksize = __minimum_chunk_size(__lookup_origin(bdev)); @@ -1838,9 +1841,9 @@ static void snapshot_merge_resume(struct dm_target *ti) snapshot_resume(ti); /* - * snapshot-merge acts as an origin, so set ti->split_io + * snapshot-merge acts as an origin, so set ti->max_io_len */ - ti->split_io = get_origin_minimum_chunksize(s->origin->bdev); + ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev); start_merge(s); } @@ -2073,12 +2076,12 @@ static int origin_write_extent(struct dm_snapshot *merging_snap, struct origin *o; /* - * The origin's __minimum_chunk_size() got stored in split_io + * The origin's __minimum_chunk_size() got stored in max_io_len * by snapshot_merge_resume(). */ down_read(&_origins_lock); o = __lookup_origin(merging_snap->origin->bdev); - for (n = 0; n < size; n += merging_snap->ti->split_io) + for (n = 0; n < size; n += merging_snap->ti->max_io_len) if (__origin_write(&o->snapshots, sector + n, NULL) == DM_MAPIO_SUBMITTED) must_wait = 1; @@ -2138,14 +2141,14 @@ static int origin_map(struct dm_target *ti, struct bio *bio, } /* - * Set the target "split_io" field to the minimum of all the snapshots' + * Set the target "max_io_len" field to the minimum of all the snapshots' * chunk sizes. 
*/ static void origin_resume(struct dm_target *ti) { struct dm_dev *dev = ti->private; - ti->split_io = get_origin_minimum_chunksize(dev->bdev); + ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); } static int origin_status(struct dm_target *ti, status_type_t type, char *result, diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 6931bd18b615..992c9d4c3bd9 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -165,7 +165,10 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) else sc->stripes_shift = __ffs(stripes); - ti->split_io = chunk_size; + r = dm_set_target_max_io_len(ti, chunk_size); + if (r) + return r; + ti->num_flush_requests = stripes; ti->num_discard_requests = stripes; diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e89f8e7d8a33..350bcf40485e 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2628,7 +2628,10 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad_thin_open; } - ti->split_io = tc->pool->sectors_per_block; + r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); + if (r) + goto bad_thin_open; + ti->num_flush_requests = 1; /* In case the pool supports discards, pass them on. */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e24143cc2040..415c2803c0c9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -968,22 +968,41 @@ static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti static sector_t max_io_len(sector_t sector, struct dm_target *ti) { sector_t len = max_io_len_target_boundary(sector, ti); + sector_t offset, max_len; /* - * Does the target need to split even further ? + * Does the target need to split even further? */ - if (ti->split_io) { - sector_t boundary; - sector_t offset = dm_target_offset(ti, sector); - boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) - - offset; - if (len > boundary) - len = boundary; + if (ti->max_io_len) { + offset = dm_target_offset(ti, sector); + if (unlikely(ti->max_io_len & (ti->max_io_len - 1))) + max_len = sector_div(offset, ti->max_io_len); + else + max_len = offset & (ti->max_io_len - 1); + max_len = ti->max_io_len - max_len; + + if (len > max_len) + len = max_len; } return len; } +int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) +{ + if (len > UINT_MAX) { + DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)", + (unsigned long long)len, UINT_MAX); + ti->error = "Maximum size of target IO is too large"; + return -EINVAL; + } + + ti->max_io_len = (uint32_t) len; + + return 0; +} +EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); + static void __map_bio(struct dm_target *ti, struct bio *clone, struct dm_target_io *tio) { diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index d70cbb2ada25..b19c1e189a68 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -186,8 +186,8 @@ struct dm_target { sector_t begin; sector_t len; - /* Always a power of 2 */ - sector_t split_io; + /* If non-zero, maximum size of I/O submitted to a target. */ + uint32_t max_io_len; /* * A number of zero-length barrier requests that will be submitted @@ -357,6 +357,11 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback */ int dm_table_complete(struct dm_table *t); +/* + * Target may require that it is never sent I/O larger than len. + */ +int __must_check dm_set_target_max_io_len(struct dm_target *ti, sector_t len); + /* * Table reference counting. 
*/ diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h index 75fd5573516e..3ece4eee84cb 100644 --- a/include/linux/dm-ioctl.h +++ b/include/linux/dm-ioctl.h @@ -268,8 +268,8 @@ enum { #define DM_VERSION_MAJOR 4 #define DM_VERSION_MINOR 22 -#define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2011-10-19)" +#define DM_VERSION_PATCHLEVEL 1 +#define DM_VERSION_EXTRA "-ioctl (2012-06-01)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3-55-g7522 From eb850de608cc22e0199b3797cd5c0076bae6cda0 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 27 Jul 2012 15:08:01 +0100 Subject: dm stripe: support for non power of 2 chunksize Support non-power-of-2 chunk sizes with dm striping for proper alignment of stripe IO on storage that has non-power-of-2 optimal IO sizes (e.g. RAID6 10+2). Signed-off-by: Mike Snitzer Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- Documentation/device-mapper/striped.txt | 4 +-- drivers/md/dm-stripe.c | 44 ++++++++++++++------------------- 2 files changed, 21 insertions(+), 27 deletions(-) (limited to 'drivers/md') diff --git a/Documentation/device-mapper/striped.txt b/Documentation/device-mapper/striped.txt index 953fe1d68068..45f3b91ea4c3 100644 --- a/Documentation/device-mapper/striped.txt +++ b/Documentation/device-mapper/striped.txt @@ -9,8 +9,8 @@ devices in parallel. Parameters: [ ]+ : Number of underlying devices. - : Size of each chunk of data. Must be a power-of-2 and at - least as large as the system's PAGE_SIZE. + : Size of each chunk of data. Must be at least as + large as the system's PAGE_SIZE. : Full pathname to the underlying block-device, or a "major:minor" device-number. : Starting sector within the device. diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 992c9d4c3bd9..9690daa11c83 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -30,9 +30,7 @@ struct stripe_c { /* The size of this target / num. stripes */ sector_t stripe_width; - /* stripe chunk size */ - uint32_t chunk_shift; - sector_t chunk_mask; + uint32_t chunk_size; /* Needed for handling events */ struct dm_target *ti; @@ -90,7 +88,7 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, /* * Construct a striped mapping. - * [ ]+ + * [ ]+ */ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) { @@ -111,21 +109,14 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - if (kstrtouint(argv[1], 10, &chunk_size)) { - ti->error = "Invalid chunk_size"; - return -EINVAL; - } - - /* - * chunk_size is a power of two - */ - if (!is_power_of_2(chunk_size) || + if (kstrtouint(argv[1], 10, &chunk_size) || (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { - ti->error = "Invalid chunk size"; + ti->error = "Invalid chunk_size"; return -EINVAL; } - if (ti->len & (chunk_size - 1)) { + width = ti->len; + if (sector_div(width, chunk_size)) { ti->error = "Target length not divisible by " "chunk size"; return -EINVAL; @@ -172,8 +163,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_requests = stripes; ti->num_discard_requests = stripes; - sc->chunk_shift = ffs(chunk_size) - 1; - sc->chunk_mask = ((sector_t) chunk_size) - 1; + sc->chunk_size = chunk_size; /* * Get the stripe destinations. 
@@ -212,8 +202,8 @@ static void stripe_dtr(struct dm_target *ti) static void stripe_map_sector(struct stripe_c *sc, sector_t sector, uint32_t *stripe, sector_t *result) { - sector_t offset = dm_target_offset(sc->ti, sector); - sector_t chunk = offset >> sc->chunk_shift; + sector_t chunk = dm_target_offset(sc->ti, sector); + sector_t chunk_offset = sector_div(chunk, sc->chunk_size); if (sc->stripes_shift < 0) *stripe = sector_div(chunk, sc->stripes); @@ -222,7 +212,7 @@ static void stripe_map_sector(struct stripe_c *sc, sector_t sector, chunk >>= sc->stripes_shift; } - *result = (chunk << sc->chunk_shift) | (offset & sc->chunk_mask); + *result = (chunk * sc->chunk_size) + chunk_offset; } static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, @@ -233,9 +223,13 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, stripe_map_sector(sc, sector, &stripe, result); if (stripe == target_stripe) return; - *result &= ~sc->chunk_mask; /* round down */ + + /* round down */ + sector = *result; + *result -= sector_div(sector, sc->chunk_size); + if (target_stripe < stripe) - *result += sc->chunk_mask + 1; /* next chunk */ + *result += sc->chunk_size; /* next chunk */ } static int stripe_map_discard(struct stripe_c *sc, struct bio *bio, @@ -320,7 +314,7 @@ static int stripe_status(struct dm_target *ti, case STATUSTYPE_TABLE: DMEMIT("%d %llu", sc->stripes, - (unsigned long long)sc->chunk_mask + 1); + (unsigned long long)sc->chunk_size); for (i = 0; i < sc->stripes; i++) DMEMIT(" %s %llu", sc->stripe[i].dev->name, (unsigned long long)sc->stripe[i].physical_start); @@ -387,7 +381,7 @@ static void stripe_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct stripe_c *sc = ti->private; - unsigned chunk_size = (sc->chunk_mask + 1) << 9; + unsigned chunk_size = sc->chunk_size << SECTOR_SHIFT; blk_limits_io_min(limits, chunk_size); blk_limits_io_opt(limits, chunk_size * sc->stripes); @@ -415,7 +409,7 @@ static int stripe_merge(struct dm_target *ti, struct bvec_merge_data *bvm, static struct target_type stripe_target = { .name = "striped", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, -- cgit v1.2.3-55-g7522 From 8f069b41bce79b0c4b6076acd0f3d15df0a232ed Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Jul 2012 15:08:01 +0100 Subject: dm stripe: remove minimum stripe size There is no technical limitation in device mapper that would prevent the dm-stripe target from using a stripe size smaller than page size. This patch removes the limit and makes stripe volumes portable across architectures with different page size. 
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 9690daa11c83..6a20e9ea0bfb 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -109,8 +109,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - if (kstrtouint(argv[1], 10, &chunk_size) || - (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) { + if (kstrtouint(argv[1], 10, &chunk_size) || !chunk_size) { ti->error = "Invalid chunk_size"; return -EINVAL; } -- cgit v1.2.3-55-g7522 From 33d07c0dfab902a7c5420587984497dc05ab5c9c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Jul 2012 15:08:02 +0100 Subject: dm stripe: optimize chunk_size calculations dm-stripe is usually used with a chunk size that is a power of two. Use faster shifts and bit masks in such cases. stripe_width is already optimized in a similar way. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 6a20e9ea0bfb..9e8f4cc63d6c 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -31,6 +31,7 @@ struct stripe_c { sector_t stripe_width; uint32_t chunk_size; + int chunk_size_shift; /* Needed for handling events */ struct dm_target *ti; @@ -163,6 +164,10 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_discard_requests = stripes; sc->chunk_size = chunk_size; + if (chunk_size & (chunk_size - 1)) + sc->chunk_size_shift = -1; + else + sc->chunk_size_shift = __ffs(chunk_size); /* * Get the stripe destinations. @@ -202,7 +207,14 @@ static void stripe_map_sector(struct stripe_c *sc, sector_t sector, uint32_t *stripe, sector_t *result) { sector_t chunk = dm_target_offset(sc->ti, sector); - sector_t chunk_offset = sector_div(chunk, sc->chunk_size); + sector_t chunk_offset; + + if (sc->chunk_size_shift < 0) + chunk_offset = sector_div(chunk, sc->chunk_size); + else { + chunk_offset = chunk & (sc->chunk_size - 1); + chunk >>= sc->chunk_size_shift; + } if (sc->stripes_shift < 0) *stripe = sector_div(chunk, sc->stripes); @@ -211,7 +223,12 @@ static void stripe_map_sector(struct stripe_c *sc, sector_t sector, chunk >>= sc->stripes_shift; } - *result = (chunk * sc->chunk_size) + chunk_offset; + if (sc->chunk_size_shift < 0) + chunk *= sc->chunk_size; + else + chunk <<= sc->chunk_size_shift; + + *result = chunk + chunk_offset; } static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, @@ -225,7 +242,10 @@ static void stripe_map_range_sector(struct stripe_c *sc, sector_t sector, /* round down */ sector = *result; - *result -= sector_div(sector, sc->chunk_size); + if (sc->chunk_size_shift < 0) + *result -= sector_div(sector, sc->chunk_size); + else + *result = sector & ~(sector_t)(sc->chunk_size - 1); if (target_stripe < stripe) *result += sc->chunk_size; /* next chunk */ -- cgit v1.2.3-55-g7522 From 55f2b8bdb0c7387eb2dc645b9ecbe5d0faa6b54e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 27 Jul 2012 15:08:02 +0100 Subject: dm thin: support for non power of 2 pool blocksize Non power of 2 blocksize support is needed to properly align thinp IO on storage that has non power of 2 optimal IO sizes (e.g. RAID6 10+2). 
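The arithmetic this requires is plain division and remainder; the user-space sketch below (made-up example values, with / and % standing in for the kernel's sector_div()) shows how a bio's starting sector maps to a block number and an offset within that block.

/*
 * User-space sketch with made-up values; plain / and % stand in for the
 * kernel's sector_div().
 */
#include <stdio.h>

int main(void)
{
	unsigned long long sector = 1000000;	/* bio->bi_sector */
	unsigned int sectors_per_block = 1536;	/* 768 KiB: not a power of 2 */

	unsigned long long block  = sector / sectors_per_block;	/* cf. get_bio_block() */
	unsigned long long offset = sector % sectors_per_block;	/* offset used by remap() */

	printf("block %llu, offset %llu\n", block, offset);
	return 0;
}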
Use sector_div to support non power of 2 blocksize for the pool's data device. This provides comparable performance to the power of 2 math that was performed until now (as tested on modern x86_64 hardware). The kernel currently assumes that limits->discard_granularity is a power of two so the thin target only enables discard support if the block size is a power of two. Eliminate pool structure's 'block_shift', 'offset_mask' and remaining 4 byte holes. Signed-off-by: Mike Snitzer Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 59 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 22 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 350bcf40485e..f21d318d98f0 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -510,10 +510,8 @@ struct pool { struct block_device *md_dev; struct dm_pool_metadata *pmd; - uint32_t sectors_per_block; - unsigned block_shift; - dm_block_t offset_mask; dm_block_t low_water_blocks; + uint32_t sectors_per_block; struct pool_features pf; unsigned low_water_triggered:1; /* A dm event has been sent */ @@ -526,8 +524,8 @@ struct pool { struct work_struct worker; struct delayed_work waker; - unsigned ref_count; unsigned long last_commit_jiffies; + unsigned ref_count; spinlock_t lock; struct bio_list deferred_bios; @@ -679,16 +677,21 @@ static void requeue_io(struct thin_c *tc) static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) { - return bio->bi_sector >> tc->pool->block_shift; + sector_t block_nr = bio->bi_sector; + + (void) sector_div(block_nr, tc->pool->sectors_per_block); + + return block_nr; } static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) { struct pool *pool = tc->pool; + sector_t bi_sector = bio->bi_sector; bio->bi_bdev = tc->pool_dev->bdev; - bio->bi_sector = (block << pool->block_shift) + - (bio->bi_sector & pool->offset_mask); + bio->bi_sector = (block * pool->sectors_per_block) + + sector_div(bi_sector, pool->sectors_per_block); } static void remap_to_origin(struct thin_c *tc, struct bio *bio) @@ -933,9 +936,10 @@ static void process_prepared(struct pool *pool, struct list_head *head, */ static int io_overlaps_block(struct pool *pool, struct bio *bio) { - return !(bio->bi_sector & pool->offset_mask) && - (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); + sector_t bi_sector = bio->bi_sector; + return !sector_div(bi_sector, pool->sectors_per_block) && + (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); } static int io_overwrites_block(struct pool *pool, struct bio *bio) @@ -1239,8 +1243,8 @@ static void process_discard(struct thin_c *tc, struct bio *bio) * part of the discard that is in a subsequent * block. 
*/ - sector_t offset = bio->bi_sector - (block << pool->block_shift); - unsigned remaining = (pool->sectors_per_block - offset) << 9; + sector_t offset = bio->bi_sector - (block * pool->sectors_per_block); + unsigned remaining = (pool->sectors_per_block - offset) << SECTOR_SHIFT; bio->bi_size = min(bio->bi_size, remaining); cell_release_singleton(cell, bio); @@ -1722,8 +1726,6 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->pmd = pmd; pool->sectors_per_block = block_size; - pool->block_shift = ffs(block_size) - 1; - pool->offset_mask = block_size - 1; pool->low_water_blocks = 0; pool_features_init(&pool->pf); pool->prison = prison_create(PRISON_CELLS); @@ -1971,7 +1973,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) if (kstrtoul(argv[2], 10, &block_size) || !block_size || block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || - !is_power_of_2(block_size)) { + block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { ti->error = "Invalid block size"; r = -EINVAL; goto out; @@ -2018,6 +2020,15 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out_flags_changed; } + /* + * The block layer requires discard_granularity to be a power of 2. + */ + if (pf.discard_enabled && !is_power_of_2(block_size)) { + ti->error = "Discard support must be disabled when the block size is not a power of 2"; + r = -EINVAL; + goto out_flags_changed; + } + pt->pool = pool; pt->ti = ti; pt->metadata_dev = metadata_dev; @@ -2097,7 +2108,8 @@ static int pool_preresume(struct dm_target *ti) int r; struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - dm_block_t data_size, sb_data_size; + sector_t data_size = ti->len; + dm_block_t sb_data_size; /* * Take control of the pool object. @@ -2106,7 +2118,8 @@ static int pool_preresume(struct dm_target *ti) if (r) return r; - data_size = ti->len >> pool->block_shift; + (void) sector_div(data_size, pool->sectors_per_block); + r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); if (r) { DMERR("failed to retrieve data device size"); @@ -2115,7 +2128,7 @@ static int pool_preresume(struct dm_target *ti) if (data_size < sb_data_size) { DMERR("pool target too small, is %llu blocks (expected %llu)", - data_size, sb_data_size); + (unsigned long long)data_size, sb_data_size); return -EINVAL; } else if (data_size > sb_data_size) { @@ -2764,19 +2777,21 @@ static int thin_status(struct dm_target *ti, status_type_t type, static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { - dm_block_t blocks; + sector_t blocks; struct thin_c *tc = ti->private; + struct pool *pool = tc->pool; /* * We can't call dm_pool_get_data_dev_size() since that blocks. So * we follow a more convoluted path through to the pool's target. 
*/ - if (!tc->pool->ti) + if (!pool->ti) return 0; /* nothing is bound */ - blocks = tc->pool->ti->len >> tc->pool->block_shift; + blocks = pool->ti->len; + (void) sector_div(blocks, pool->sectors_per_block); if (blocks) - return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data); + return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data); return 0; } @@ -2793,7 +2808,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 1, 0}, + .version = {1, 2, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, -- cgit v1.2.3-55-g7522 From 7acf0277cea0f2da89ffffcc9892bea23f618e63 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Jul 2012 15:08:03 +0100 Subject: dm: introduce split_discard_requests This patch introduces a new variable split_discard_requests. It can be set by targets so that discard requests are split on max_io_len boundaries. When split_discard_requests is not set, discard requests are only split on boundaries between targets, as was the case before this patch. Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm.c | 5 ++++- include/linux/device-mapper.h | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 415c2803c0c9..4e09b6ff5b49 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1215,7 +1215,10 @@ static int __clone_and_map_discard(struct clone_info *ci) if (!ti->num_discard_requests) return -EOPNOTSUPP; - len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + if (!ti->split_discard_requests) + len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + else + len = min(ci->sector_count, max_io_len(ci->sector, ti)); __issue_target_requests(ci, ti, ti->num_discard_requests, len); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index b19c1e189a68..8bdbbfce759a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -217,6 +217,12 @@ struct dm_target { */ unsigned discards_supported:1; + /* + * Set if the target required discard request to be split + * on max_io_len boundary. + */ + unsigned split_discard_requests:1; + /* * Set if this target does not return zeroes on discarded blocks. */ -- cgit v1.2.3-55-g7522 From 4929630901100fdbfa19186ecf5ea2706f57719b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Jul 2012 15:08:03 +0100 Subject: dm thin: split discards on block boundary This patch sets the variable "ti->split_discard_requests" for the dm thin target so that device mapper core splits discard requests on a block boundary. Consequently, a discard request that spans multiple blocks is never sent to dm-thin. The patch also removes some code in process_discard that deals with discards that span multiple blocks. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index f21d318d98f0..828649256902 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1238,15 +1238,10 @@ static void process_discard(struct thin_c *tc, struct bio *bio) } } else { /* - * This path is hit if people are ignoring - * limits->discard_granularity. It ignores any - * part of the discard that is in a subsequent - * block. 
+ * The DM core makes sure that the discard doesn't span + * a block boundary. So we submit the discard of a + * partial block appropriately. */ - sector_t offset = bio->bi_sector - (block * pool->sectors_per_block); - unsigned remaining = (pool->sectors_per_block - offset) << SECTOR_SHIFT; - bio->bi_size = min(bio->bi_size, remaining); - cell_release_singleton(cell, bio); cell_release_singleton(cell2, bio); if ((!lookup_result.shared) && pool->pf.discard_passdown) @@ -2509,7 +2504,8 @@ static void set_discard_limits(struct pool *pool, struct queue_limits *limits) /* * This is just a hint, and not enforced. We have to cope with - * bios that overlap 2 blocks. + * bios that cover a block partially. A discard that spans a block + * boundary is not sent to this target. */ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; limits->discard_zeroes_data = pool->pf.zero_new_blocks; @@ -2652,6 +2648,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->discards_supported = 1; ti->num_discard_requests = 1; ti->discard_zeroes_data_unsupported = 1; + /* Discard requests must be split on a block boundary */ + ti->split_discard_requests = 1; } dm_put(pool_md); -- cgit v1.2.3-55-g7522 From f9a8e0cd261fc05820539b0ea613686a32795406 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 27 Jul 2012 15:08:03 +0100 Subject: dm thin: optimize power of two block size dm-thin will be most likely used with a block size that is a power of two. So it should be optimized for this case. This patch changes division and modulo operations to shifts and bit masks if block size is a power of two. A test that bi_sector is divisible by a block size is removed from io_overlaps_block. Device mapper never sends bios that span a block boundary. Consequently, if we tested that bi_size is equivalent to block size, bi_sector must already be on a block boundary. 
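A minimal userspace sketch of the idea, assuming a 64KiB (128-sector) block size and modelling sector_div with plain division; it only checks that the shift/mask fast path and the division path agree, it is not the kernel code itself:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    /* Userspace model of the two paths in the patch below: generic division
     * versus shift/mask when sectors_per_block is a power of two. */
    struct pool_model {
        uint32_t sectors_per_block;
        int sectors_per_block_shift;   /* -1 if not a power of two */
    };

    static sector_t block_nr(sector_t sector, const struct pool_model *p)
    {
        if (p->sectors_per_block_shift < 0)
            return sector / p->sectors_per_block;      /* sector_div path */
        return sector >> p->sectors_per_block_shift;   /* fast path */
    }

    static sector_t block_offset(sector_t sector, const struct pool_model *p)
    {
        if (p->sectors_per_block_shift < 0)
            return sector % p->sectors_per_block;
        return sector & (p->sectors_per_block - 1);
    }

    int main(void)
    {
        /* Assumed block size of 128 sectors (64KiB), a power of two. */
        struct pool_model fast = { .sectors_per_block = 128, .sectors_per_block_shift = 7 };
        struct pool_model slow = { .sectors_per_block = 128, .sectors_per_block_shift = -1 };
        sector_t s = 1000003;

        /* Both paths must map a bio sector to the same block and offset. */
        assert(block_nr(s, &fast) == block_nr(s, &slow));
        assert(block_offset(s, &fast) == block_offset(s, &slow));
        printf("sector %llu -> block %llu, offset %llu\n",
               (unsigned long long)s,
               (unsigned long long)block_nr(s, &fast),
               (unsigned long long)block_offset(s, &fast));
        return 0;
    }
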
Signed-off-by: Mikulas Patocka Signed-off-by: Alasdair G Kergon --- drivers/md/dm-thin.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 828649256902..93e3e542cff9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -512,6 +512,7 @@ struct pool { dm_block_t low_water_blocks; uint32_t sectors_per_block; + int sectors_per_block_shift; struct pool_features pf; unsigned low_water_triggered:1; /* A dm event has been sent */ @@ -679,7 +680,10 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) { sector_t block_nr = bio->bi_sector; - (void) sector_div(block_nr, tc->pool->sectors_per_block); + if (tc->pool->sectors_per_block_shift < 0) + (void) sector_div(block_nr, tc->pool->sectors_per_block); + else + block_nr >>= tc->pool->sectors_per_block_shift; return block_nr; } @@ -690,8 +694,12 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) sector_t bi_sector = bio->bi_sector; bio->bi_bdev = tc->pool_dev->bdev; - bio->bi_sector = (block * pool->sectors_per_block) + - sector_div(bi_sector, pool->sectors_per_block); + if (tc->pool->sectors_per_block_shift < 0) + bio->bi_sector = (block * pool->sectors_per_block) + + sector_div(bi_sector, pool->sectors_per_block); + else + bio->bi_sector = (block << pool->sectors_per_block_shift) | + (bi_sector & (pool->sectors_per_block - 1)); } static void remap_to_origin(struct thin_c *tc, struct bio *bio) @@ -936,10 +944,7 @@ static void process_prepared(struct pool *pool, struct list_head *head, */ static int io_overlaps_block(struct pool *pool, struct bio *bio) { - sector_t bi_sector = bio->bi_sector; - - return !sector_div(bi_sector, pool->sectors_per_block) && - (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); + return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT); } static int io_overwrites_block(struct pool *pool, struct bio *bio) @@ -1721,6 +1726,10 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->pmd = pmd; pool->sectors_per_block = block_size; + if (block_size & (block_size - 1)) + pool->sectors_per_block_shift = -1; + else + pool->sectors_per_block_shift = __ffs(block_size); pool->low_water_blocks = 0; pool_features_init(&pool->pf); pool->prison = prison_create(PRISON_CELLS); -- cgit v1.2.3-55-g7522 From a58a935d5a1b2ad267017a68c3a1bca26226cc76 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 27 Jul 2012 15:08:04 +0100 Subject: dm mpath: add retain_attached_hw_handler feature A SCSI device handler might get attached to a device during the initial device scan. We do not necessarily want to override this when loading a multipath table, so this patch adds a new multipath feature argument "retain_attached_hw_handler". During SCSI device scan all loaded SCSI device handlers will be consulted for a match (via scsi_dh's provided .match). If a match is found that device handler will be attached. We need a way to have userspace multipathd's provided 'hw_handler' not override the already attached hardware handler. When specifying the new feature 'retain_attached_hw_handler' multipath will use the currently attached hardware handler instead of trying to attach the one specified during table load. If no hardware handler is attached the specified hardware handler will still be used. 
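A toy model of the selection policy just described, with made-up function names; "alua" and "emc" stand in for whatever scsi_dh reports, and the real logic lives in parse_path() in the diff below:

    #include <stdio.h>

    /* Illustrative only: picks which hw_handler name wins, mirroring the
     * priority described above. */
    static const char *choose_hw_handler(const char *table_handler,
                                         const char *attached_handler,
                                         int retain_attached_hw_handler)
    {
        /* With the feature set, an already-attached handler wins ... */
        if (retain_attached_hw_handler && attached_handler)
            return attached_handler;

        /* ... otherwise use whatever the table specified (possibly none). */
        return table_handler;
    }

    int main(void)
    {
        printf("retain + attached:     %s\n", choose_hw_handler("emc", "alua", 1)); /* alua */
        printf("retain, none attached: %s\n", choose_hw_handler("emc", NULL, 1));   /* emc  */
        printf("feature off:           %s\n", choose_hw_handler("emc", "alua", 0)); /* emc  */
        return 0;
    }
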
Leverages scsi_dh_attach's ability to increment the scsi_dh's reference count if the same scsi_dh name is provided when attaching - currently attached scsi_dh name is determined with scsi_dh_attached_handler_name. Depends upon commit 7e8a74b177f17d100916b6ad415450f7c9508691 ("[SCSI] scsi_dh: add scsi_dh_attached_handler_name"). Signed-off-by: Mike Snitzer Tested-by: Babu Moger Reviewed-by: Chandra Seetharaman Acked-by: Hannes Reinecke Signed-off-by: Alasdair G Kergon --- drivers/md/dm-mpath.c | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 638dae048b4f..8a3b2d53f81b 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -85,6 +85,7 @@ struct multipath { unsigned queue_io:1; /* Must we queue all I/O? */ unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ + unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ @@ -568,6 +569,8 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps int r; struct pgpath *p; struct multipath *m = ti->private; + struct request_queue *q = NULL; + const char *attached_handler_name; /* we need at least a path arg */ if (as->argc < 1) { @@ -586,13 +589,37 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps goto bad; } - if (m->hw_handler_name) { - struct request_queue *q = bdev_get_queue(p->path.dev->bdev); + if (m->retain_attached_hw_handler || m->hw_handler_name) + q = bdev_get_queue(p->path.dev->bdev); + + if (m->retain_attached_hw_handler) { + attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL); + if (attached_handler_name) { + /* + * Reset hw_handler_name to match the attached handler + * and clear any hw_handler_params associated with the + * ignored handler. + * + * NB. This modifies the table line to show the actual + * handler instead of the original table passed in. + */ + kfree(m->hw_handler_name); + m->hw_handler_name = attached_handler_name; + + kfree(m->hw_handler_params); + m->hw_handler_params = NULL; + } + } + if (m->hw_handler_name) { + /* + * Increments scsi_dh reference, even when using an + * already-attached handler. + */ r = scsi_dh_attach(q, m->hw_handler_name); if (r == -EBUSY) { /* - * Already attached to different hw_handler, + * Already attached to different hw_handler: * try to reattach with correct one. 
*/ scsi_dh_detach(q); @@ -760,7 +787,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) const char *arg_name; static struct dm_arg _args[] = { - {0, 5, "invalid number of feature args"}, + {0, 6, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; @@ -781,6 +808,11 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) continue; } + if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { + m->retain_attached_hw_handler = 1; + continue; + } + if (!strcasecmp(arg_name, "pg_init_retries") && (argc >= 1)) { r = dm_read_arg(_args + 1, as, &m->pg_init_retries, &ti->error); @@ -1364,13 +1396,16 @@ static int multipath_status(struct dm_target *ti, status_type_t type, else { DMEMIT("%u ", m->queue_if_no_path + (m->pg_init_retries > 0) * 2 + - (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2); + (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) * 2 + + m->retain_attached_hw_handler); if (m->queue_if_no_path) DMEMIT("queue_if_no_path "); if (m->pg_init_retries) DMEMIT("pg_init_retries %u ", m->pg_init_retries); if (m->pg_init_delay_msecs != DM_PG_INIT_DELAY_DEFAULT) DMEMIT("pg_init_delay_msecs %u ", m->pg_init_delay_msecs); + if (m->retain_attached_hw_handler) + DMEMIT("retain_attached_hw_handler "); } if (!m->hw_handler_name || type == STATUSTYPE_INFO) @@ -1656,7 +1691,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, -- cgit v1.2.3-55-g7522 From f999e8fe70bd0b8faa27ccdac14b5942999c6e78 Mon Sep 17 00:00:00 2001 From: Jonathan E Brassow Date: Fri, 27 Jul 2012 15:08:04 +0100 Subject: dm raid: restructure parse_raid_params In preparation for RAID10 addition to dm-raid, we change an 'if' conditional to a 'switch' conditional to make it easier to see what is being checked for each RAID type. 
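A sketch of how the switch is expected to grow; the raid10 case and its limit here are purely hypothetical and not part of this patch, they only illustrate why a switch reads more clearly than a compound if:

    #include <stdio.h>

    /* Toy model of the per-level rebuild limit made explicit by the switch
     * above; the raid10 rule is hypothetical, shown only to demonstrate how
     * a new level would get its own case. */
    static const char *check_rebuild_count(int level, unsigned rebuild_cnt,
                                           unsigned raid_disks, unsigned parity_devs)
    {
        switch (level) {
        case 1:
            if (rebuild_cnt >= raid_disks)
                return "Too many rebuild devices specified";
            break;
        case 4:
        case 5:
        case 6:
            if (rebuild_cnt > parity_devs)
                return "Too many rebuild devices specified for given RAID type";
            break;
        case 10: /* hypothetical limit, not from these patches */
            if (rebuild_cnt > raid_disks / 2)
                return "Too many rebuild devices specified";
            break;
        default:
            return "Rebuild not supported for this RAID type";
        }
        return NULL; /* ok */
    }

    int main(void)
    {
        const char *err = check_rebuild_count(6, 3, 8, 2);
        printf("raid6, 3 rebuilds: %s\n", err ? err : "ok");
        return 0;
    }
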
Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 858a8b70811c..1717ed33dd7f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -430,13 +430,28 @@ static int parse_raid_params(struct raid_set *rs, char **argv, if (!strcasecmp(key, "rebuild")) { rebuild_cnt++; - if (((rs->raid_type->level != 1) && - (rebuild_cnt > rs->raid_type->parity_devs)) || - ((rs->raid_type->level == 1) && - (rebuild_cnt > (rs->md.raid_disks - 1)))) { - rs->ti->error = "Too many rebuild devices specified for given RAID type"; + + switch (rs->raid_type->level) { + case 1: + if (rebuild_cnt >= rs->md.raid_disks) { + rs->ti->error = "Too many rebuild devices specified"; + return -EINVAL; + } + break; + case 4: + case 5: + case 6: + if (rebuild_cnt > rs->raid_type->parity_devs) { + rs->ti->error = "Too many rebuild devices specified for given RAID type"; + return -EINVAL; + } + break; + default: + DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); + rs->ti->error = "Rebuild not supported for this RAID type"; return -EINVAL; } + if (value > rs->md.raid_disks) { rs->ti->error = "Invalid rebuild index given"; return -EINVAL; -- cgit v1.2.3-55-g7522 From c039c332f23e794deb6d6f37b9f07ff3b27fb2cf Mon Sep 17 00:00:00 2001 From: Jonathan E Brassow Date: Fri, 27 Jul 2012 15:08:04 +0100 Subject: dm raid: move sectors_per_dev calculation In preparation for RAID10 inclusion in dm-raid, we move the sectors_per_dev calculation later in the device creation process. This is because we won't know up-front how many stripes vs how many mirrors there are which will change the calculation. 
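A userspace sketch of the calculation being moved, with an assumed 6-device raid6 layout and a 1 GiB target; sector_div is modelled with ordinary division and modulo:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t sector_t;

    /* Models the check in the diff below: ti->len must split evenly across
     * the data (non-parity) devices. */
    static int set_dev_sectors(sector_t ti_len, unsigned raid_disks,
                               unsigned parity_devs, sector_t *dev_sectors)
    {
        unsigned data_devs = raid_disks - parity_devs;
        sector_t per_dev = ti_len;

        if (per_dev % data_devs) /* sector_div() returns this remainder in the kernel */
            return -1;           /* "Target length not divisible by number of data devices" */

        *dev_sectors = per_dev / data_devs;
        return 0;
    }

    int main(void)
    {
        sector_t dev_sectors;

        /* Assumed example: 6-device raid6 (4 data + 2 parity), 1 GiB target. */
        sector_t ti_len = (sector_t)1024 * 1024 * 1024 / 512;  /* 2097152 sectors */

        if (!set_dev_sectors(ti_len, 6, 2, &dev_sectors))
            printf("each data device carries %llu sectors\n",
                   (unsigned long long)dev_sectors);  /* 524288 */
        return 0;
    }

Once raid10 lands, the number of data devices will depend on the stripe/mirror split, which is exactly why the calculation has to wait until the parameters are parsed.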
Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-raid.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1717ed33dd7f..f4275a8e860c 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -101,20 +101,12 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra { unsigned i; struct raid_set *rs; - sector_t sectors_per_dev; if (raid_devs <= raid_type->parity_devs) { ti->error = "Insufficient number of devices"; return ERR_PTR(-EINVAL); } - sectors_per_dev = ti->len; - if ((raid_type->level > 1) && - sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { - ti->error = "Target length not divisible by number of data devices"; - return ERR_PTR(-EINVAL); - } - rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); if (!rs) { ti->error = "Cannot allocate raid context"; @@ -128,7 +120,6 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra rs->md.raid_disks = raid_devs; rs->md.level = raid_type->level; rs->md.new_level = rs->md.level; - rs->md.dev_sectors = sectors_per_dev; rs->md.layout = raid_type->algorithm; rs->md.new_layout = rs->md.layout; rs->md.delta_disks = 0; @@ -143,6 +134,7 @@ static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *ra * rs->md.external * rs->md.chunk_sectors * rs->md.new_chunk_sectors + * rs->md.dev_sectors */ return rs; @@ -353,6 +345,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, { unsigned i, rebuild_cnt = 0; unsigned long value, region_size = 0; + sector_t sectors_per_dev = rs->ti->len; sector_t max_io_len; char *key; @@ -545,6 +538,13 @@ static int parse_raid_params(struct raid_set *rs, char **argv, if (dm_set_target_max_io_len(rs->ti, max_io_len)) return -EINVAL; + if ((rs->raid_type->level > 1) && + sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) { + rs->ti->error = "Target length not divisible by number of data devices"; + return -EINVAL; + } + rs->md.dev_sectors = sectors_per_dev; + /* Assume there are no metadata devices until the drives are parsed */ rs->md.persistent = 0; rs->md.external = 1; -- cgit v1.2.3-55-g7522