diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/btree.c | 3 | ||||
-rw-r--r-- | drivers/md/bcache/extents.c | 13 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/stats.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 30 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 81 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.h | 23 | ||||
-rw-r--r-- | drivers/md/bcache/util.c | 6 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.h | 3 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 30 | ||||
-rw-r--r-- | drivers/md/dm-rq.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 13 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-thin-metadata.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 65 | ||||
-rw-r--r-- | drivers/md/dm.c | 41 | ||||
-rw-r--r-- | drivers/md/md-linear.c | 3 | ||||
-rw-r--r-- | drivers/md/raid1.c | 37 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 33 | ||||
-rw-r--r-- | drivers/md/raid5.c | 8 |
20 files changed, 279 insertions, 129 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 23cb1dc7296b..64def336f053 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -432,8 +432,9 @@ static void do_btree_node_write(struct btree *b) int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, b->bio, j) + bio_for_each_segment_all(bv, b->bio, j, iter_all) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 956004366699..886710043025 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -538,6 +538,7 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); unsigned int i, stale; + char buf[80]; if (!KEY_PTRS(k) || bch_extent_invalid(bk, k)) @@ -547,19 +548,19 @@ static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) if (!ptr_available(b->c, k, i)) return true; - if (!expensive_debug_checks(b->c) && KEY_DIRTY(k)) - return false; - for (i = 0; i < KEY_PTRS(k); i++) { stale = ptr_stale(b->c, k, i); + if (stale && KEY_DIRTY(k)) { + bch_extent_to_text(buf, sizeof(buf), k); + pr_info("stale dirty pointer, stale %u, key: %s", + stale, buf); + } + btree_bug_on(stale > BUCKET_GC_GEN_MAX, b, "key too stale: %i, need_gc %u", stale, b->c->need_gc); - btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k), - b, "stale dirty pointer"); - if (stale) return true; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 15070412a32e..f101bfe8657a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -392,10 +392,11 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) /* * Flag for bypass if the IO is for read-ahead or background, - * unless the read-ahead request is for metadata (eg, for gfs2). + * unless the read-ahead request is for metadata + * (eg, for gfs2 or xfs). */ if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && - !(bio->bi_opf & REQ_PRIO)) + !(bio->bi_opf & (REQ_META|REQ_PRIO))) goto skip; if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || @@ -877,7 +878,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, } if (!(bio->bi_opf & REQ_RAHEAD) && - !(bio->bi_opf & REQ_PRIO) && + !(bio->bi_opf & (REQ_META|REQ_PRIO)) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, get_capacity(bio->bi_disk) - bio_end_sector(bio)); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 894410f3f829..ba1c93791d8d 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -111,7 +111,7 @@ void bch_cache_accounting_clear(struct cache_accounting *acc) { memset(&acc->total.cache_hits, 0, - sizeof(unsigned long) * 7); + sizeof(struct cache_stats)); } void bch_cache_accounting_destroy(struct cache_accounting *acc) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4dee119c3664..a697a3a923cd 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1615,21 +1615,21 @@ static void conditional_stop_bcache_device(struct cache_set *c, */ pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.", d->disk->disk_name); - /* - * There might be a small time gap that cache set is - * released but bcache device is not. Inside this time - * gap, regular I/O requests will directly go into - * backing device as no cache set attached to. This - * behavior may also introduce potential inconsistence - * data in writeback mode while cache is dirty. - * Therefore before calling bcache_device_stop() due - * to a broken cache device, dc->io_disable should be - * explicitly set to true. - */ - dc->io_disable = true; - /* make others know io_disable is true earlier */ - smp_mb(); - bcache_device_stop(d); + /* + * There might be a small time gap that cache set is + * released but bcache device is not. Inside this time + * gap, regular I/O requests will directly go into + * backing device as no cache set attached to. This + * behavior may also introduce potential inconsistence + * data in writeback mode while cache is dirty. + * Therefore before calling bcache_device_stop() due + * to a broken cache device, dc->io_disable should be + * explicitly set to true. + */ + dc->io_disable = true; + /* make others know io_disable is true earlier */ + smp_mb(); + bcache_device_stop(d); } else { /* * dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 557a8a3270a1..17bae9c14ca0 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -67,6 +67,8 @@ read_attribute(written); read_attribute(btree_written); read_attribute(metadata_written); read_attribute(active_journal_entries); +read_attribute(backing_dev_name); +read_attribute(backing_dev_uuid); sysfs_time_stats_attribute(btree_gc, sec, ms); sysfs_time_stats_attribute(btree_split, sec, us); @@ -243,6 +245,19 @@ SHOW(__bch_cached_dev) return strlen(buf); } + if (attr == &sysfs_backing_dev_name) { + snprintf(buf, BDEVNAME_SIZE + 1, "%s", dc->backing_dev_name); + strcat(buf, "\n"); + return strlen(buf); + } + + if (attr == &sysfs_backing_dev_uuid) { + /* convert binary uuid into 36-byte string plus '\0' */ + snprintf(buf, 36+1, "%pU", dc->sb.uuid); + strcat(buf, "\n"); + return strlen(buf); + } + #undef var return 0; } @@ -262,10 +277,10 @@ STORE(__cached_dev) sysfs_strtoul(data_csum, dc->disk.data_csum); d_strtoul(verify); - d_strtoul(bypass_torture_test); - d_strtoul(writeback_metadata); - d_strtoul(writeback_running); - d_strtoul(writeback_delay); + sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test); + sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata); + sysfs_strtoul_bool(writeback_running, dc->writeback_running); + sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX); sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, bch_cutoff_writeback); @@ -287,9 +302,15 @@ STORE(__cached_dev) sysfs_strtoul_clamp(writeback_rate_update_seconds, dc->writeback_rate_update_seconds, 1, WRITEBACK_RATE_UPDATE_SECS_MAX); - d_strtoul(writeback_rate_i_term_inverse); - d_strtoul_nonzero(writeback_rate_p_term_inverse); - d_strtoul_nonzero(writeback_rate_minimum); + sysfs_strtoul_clamp(writeback_rate_i_term_inverse, + dc->writeback_rate_i_term_inverse, + 1, UINT_MAX); + sysfs_strtoul_clamp(writeback_rate_p_term_inverse, + dc->writeback_rate_p_term_inverse, + 1, UINT_MAX); + sysfs_strtoul_clamp(writeback_rate_minimum, + dc->writeback_rate_minimum, + 1, UINT_MAX); sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); @@ -299,7 +320,9 @@ STORE(__cached_dev) dc->io_disable = v ? 1 : 0; } - d_strtoi_h(sequential_cutoff); + sysfs_strtoul_clamp(sequential_cutoff, + dc->sequential_cutoff, + 0, UINT_MAX); d_strtoi_h(readahead); if (attr == &sysfs_clear_stats) @@ -452,6 +475,8 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_verify, &sysfs_bypass_torture_test, #endif + &sysfs_backing_dev_name, + &sysfs_backing_dev_uuid, NULL }; KTYPE(bch_cached_dev); @@ -761,10 +786,12 @@ STORE(__bch_cache_set) c->shrink.scan_objects(&c->shrink, &sc); } - sysfs_strtoul(congested_read_threshold_us, - c->congested_read_threshold_us); - sysfs_strtoul(congested_write_threshold_us, - c->congested_write_threshold_us); + sysfs_strtoul_clamp(congested_read_threshold_us, + c->congested_read_threshold_us, + 0, UINT_MAX); + sysfs_strtoul_clamp(congested_write_threshold_us, + c->congested_write_threshold_us, + 0, UINT_MAX); if (attr == &sysfs_errors) { v = __sysfs_match_string(error_actions, -1, buf); @@ -774,12 +801,20 @@ STORE(__bch_cache_set) c->on_error = v; } - if (attr == &sysfs_io_error_limit) - c->error_limit = strtoul_or_return(buf); + sysfs_strtoul_clamp(io_error_limit, c->error_limit, 0, UINT_MAX); /* See count_io_errors() for why 88 */ - if (attr == &sysfs_io_error_halflife) - c->error_decay = strtoul_or_return(buf) / 88; + if (attr == &sysfs_io_error_halflife) { + unsigned long v = 0; + ssize_t ret; + + ret = strtoul_safe_clamp(buf, v, 0, UINT_MAX); + if (!ret) { + c->error_decay = v / 88; + return size; + } + return ret; + } if (attr == &sysfs_io_disable) { v = strtoul_or_return(buf); @@ -794,13 +829,15 @@ STORE(__bch_cache_set) } } - sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); - sysfs_strtoul(verify, c->verify); - sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul_clamp(journal_delay_ms, + c->journal_delay_ms, + 0, USHRT_MAX); + sysfs_strtoul_bool(verify, c->verify); + sysfs_strtoul_bool(key_merging_disabled, c->key_merging_disabled); sysfs_strtoul(expensive_debug_checks, c->expensive_debug_checks); - sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); - sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); - sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + sysfs_strtoul_bool(gc_always_rewrite, c->gc_always_rewrite); + sysfs_strtoul_bool(btree_shrinker_disabled, c->shrinker_disabled); + sysfs_strtoul_bool(copy_gc_enabled, c->copy_gc_enabled); /* * write gc_after_writeback here may overwrite an already set * BCH_DO_AUTO_GC, it doesn't matter because this flag will be diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index 3fe82425859c..215df32f567b 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -79,11 +79,28 @@ do { \ return strtoul_safe(buf, var) ?: (ssize_t) size; \ } while (0) +#define sysfs_strtoul_bool(file, var) \ +do { \ + if (attr == &sysfs_ ## file) { \ + unsigned long v = strtoul_or_return(buf); \ + \ + var = v ? 1 : 0; \ + return size; \ + } \ +} while (0) + #define sysfs_strtoul_clamp(file, var, min, max) \ do { \ - if (attr == &sysfs_ ## file) \ - return strtoul_safe_clamp(buf, var, min, max) \ - ?: (ssize_t) size; \ + if (attr == &sysfs_ ## file) { \ + unsigned long v = 0; \ + ssize_t ret; \ + ret = strtoul_safe_clamp(buf, v, min, max); \ + if (!ret) { \ + var = v; \ + return size; \ + } \ + return ret; \ + } \ } while (0) #define strtoul_or_return(cp) \ diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 20eddeac1531..62fb917f7a4f 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) int i; struct bio_vec *bv; - bio_for_each_segment_all(bv, bio, i) { + /* + * This is called on freshly new bio, so it is safe to access the + * bvec table directly. + */ + for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) { bv->bv_page = alloc_page(gfp_mask); if (!bv->bv_page) { while (--bv >= bio->bi_io_vec) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 6a743d3bb338..4e4c6810dc3c 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -71,6 +71,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, in_use > bch_cutoff_writeback_sync) return false; + if (bio_op(bio) == REQ_OP_DISCARD) + return false; + if (dc->partial_stripes_expensive && bcache_dev_stripe_dirty(dc, bio->bi_iter.bi_sector, bio_sectors(bio))) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0ff22159a0ca..dd6565798778 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -932,7 +932,7 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio) if (IS_ERR(bip)) return PTR_ERR(bip); - tag_len = io->cc->on_disk_tag_size * bio_sectors(bio); + tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift); bip->bip_iter.bi_size = tag_len; bip->bip_iter.bi_sector = io->cc->start + io->sector; @@ -1447,8 +1447,9 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone) { unsigned int i; struct bio_vec *bv; + struct bvec_iter_all iter_all; - bio_for_each_segment_all(bv, clone, i) { + bio_for_each_segment_all(bv, clone, i, iter_all) { BUG_ON(!bv->bv_page); mempool_free(bv->bv_page, &cc->page_pool); } @@ -2414,9 +2415,21 @@ static int crypt_ctr_cipher_new(struct dm_target *ti, char *cipher_in, char *key * capi:cipher_api_spec-iv:ivopts */ tmp = &cipher_in[strlen("capi:")]; - cipher_api = strsep(&tmp, "-"); - *ivmode = strsep(&tmp, ":"); - *ivopts = tmp; + + /* Separate IV options if present, it can contain another '-' in hash name */ + *ivopts = strrchr(tmp, ':'); + if (*ivopts) { + **ivopts = '\0'; + (*ivopts)++; + } + /* Parse IV mode */ + *ivmode = strrchr(tmp, '-'); + if (*ivmode) { + **ivmode = '\0'; + (*ivmode)++; + } + /* The rest is crypto API spec */ + cipher_api = tmp; if (*ivmode && !strcmp(*ivmode, "lmk")) cc->tfms_count = 64; @@ -2486,11 +2499,8 @@ static int crypt_ctr_cipher_old(struct dm_target *ti, char *cipher_in, char *key goto bad_mem; chainmode = strsep(&tmp, "-"); - *ivopts = strsep(&tmp, "-"); - *ivmode = strsep(&*ivopts, ":"); - - if (tmp) - DMWARN("Ignoring unexpected additional cipher options"); + *ivmode = strsep(&tmp, ":"); + *ivopts = tmp; /* * For compatibility with the original dm-crypt mapping format, if diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 4eb5f8c56535..b2c03c079a8d 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -131,7 +131,7 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) static void rq_completed(struct mapped_device *md) { /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) + if (unlikely(wq_has_sleeper(&md->wait))) wake_up(&md->wait); /* @@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) md->tag_set->ops = &dm_mq_ops; md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); md->tag_set->numa_node = md->numa_node_id; - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); md->tag_set->driver_data = md; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 4b1be754cc41..ba9481f1bf3c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return q && !blk_queue_add_random(q); } -static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) -{ - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); -} - static bool dm_table_all_devices_attribute(struct dm_table *t, iterate_devices_callout_fn func) { @@ -1902,11 +1894,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_write_zeroes(t)) q->limits.max_write_zeroes_sectors = 0; - if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) - blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q); - else - blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); - dm_table_verify_integrity(t); /* diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 20b0776e39ef..ed3caceaed07 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1678,7 +1678,7 @@ int dm_thin_remove_range(struct dm_thin_device *td, return r; } -int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) +int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result) { int r; uint32_t ref_count; @@ -1686,7 +1686,7 @@ int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *resu down_read(&pmd->root_lock); r = dm_sm_get_count(pmd->data_sm, b, &ref_count); if (!r) - *result = (ref_count != 0); + *result = (ref_count > 1); up_read(&pmd->root_lock); return r; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 35e954ea20a9..f6be0d733c20 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -195,7 +195,7 @@ int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result); -int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); +int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result); int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e); int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e); diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index dadd9696340c..e83b63608262 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -257,6 +257,7 @@ struct pool { spinlock_t lock; struct bio_list deferred_flush_bios; + struct bio_list deferred_flush_completions; struct list_head prepared_mappings; struct list_head prepared_discards; struct list_head prepared_discards_pt2; @@ -956,6 +957,39 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) mempool_free(m, &m->tc->pool->mapping_pool); } +static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) +{ + struct pool *pool = tc->pool; + unsigned long flags; + + /* + * If the bio has the REQ_FUA flag set we must commit the metadata + * before signaling its completion. + */ + if (!bio_triggers_commit(tc, bio)) { + bio_endio(bio); + return; + } + + /* + * Complete bio with an error if earlier I/O caused changes to the + * metadata that can't be committed, e.g, due to I/O errors on the + * metadata device. + */ + if (dm_thin_aborted_changes(tc->td)) { + bio_io_error(bio); + return; + } + + /* + * Batch together any bios that trigger commits and then issue a + * single commit for them in process_deferred_bios(). + */ + spin_lock_irqsave(&pool->lock, flags); + bio_list_add(&pool->deferred_flush_completions, bio); + spin_unlock_irqrestore(&pool->lock, flags); +} + static void process_prepared_mapping(struct dm_thin_new_mapping *m) { struct thin_c *tc = m->tc; @@ -988,7 +1022,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) */ if (bio) { inc_remap_and_issue_cell(tc, m->cell, m->data_block); - bio_endio(bio); + complete_overwrite_bio(tc, bio); } else { inc_all_io_entry(tc->pool, m->cell->holder); remap_and_issue(tc, m->cell->holder, m->data_block); @@ -1048,7 +1082,7 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m * passdown we have to check that these blocks are now unused. */ int r = 0; - bool used = true; + bool shared = true; struct thin_c *tc = m->tc; struct pool *pool = tc->pool; dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin; @@ -1058,11 +1092,11 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m while (b != end) { /* find start of unmapped run */ for (; b < end; b++) { - r = dm_pool_block_is_used(pool->pmd, b, &used); + r = dm_pool_block_is_shared(pool->pmd, b, &shared); if (r) goto out; - if (!used) + if (!shared) break; } @@ -1071,11 +1105,11 @@ static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m /* find end of run */ for (e = b + 1; e != end; e++) { - r = dm_pool_block_is_used(pool->pmd, e, &used); + r = dm_pool_block_is_shared(pool->pmd, e, &shared); if (r) goto out; - if (used) + if (shared) break; } @@ -2317,7 +2351,7 @@ static void process_deferred_bios(struct pool *pool) { unsigned long flags; struct bio *bio; - struct bio_list bios; + struct bio_list bios, bio_completions; struct thin_c *tc; tc = get_first_thin(pool); @@ -2328,26 +2362,36 @@ static void process_deferred_bios(struct pool *pool) } /* - * If there are any deferred flush bios, we must commit - * the metadata before issuing them. + * If there are any deferred flush bios, we must commit the metadata + * before issuing them or signaling their completion. */ bio_list_init(&bios); + bio_list_init(&bio_completions); + spin_lock_irqsave(&pool->lock, flags); bio_list_merge(&bios, &pool->deferred_flush_bios); bio_list_init(&pool->deferred_flush_bios); + + bio_list_merge(&bio_completions, &pool->deferred_flush_completions); + bio_list_init(&pool->deferred_flush_completions); spin_unlock_irqrestore(&pool->lock, flags); - if (bio_list_empty(&bios) && + if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) return; if (commit(pool)) { + bio_list_merge(&bios, &bio_completions); + while ((bio = bio_list_pop(&bios))) bio_io_error(bio); return; } pool->last_commit_jiffies = jiffies; + while ((bio = bio_list_pop(&bio_completions))) + bio_endio(bio); + while ((bio = bio_list_pop(&bios))) generic_make_request(bio); } @@ -2954,6 +2998,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); spin_lock_init(&pool->lock); bio_list_init(&pool->deferred_flush_bios); + bio_list_init(&pool->deferred_flush_completions); INIT_LIST_HEAD(&pool->prepared_mappings); INIT_LIST_HEAD(&pool->prepared_discards); INIT_LIST_HEAD(&pool->prepared_discards_pt2); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d67c95ef8d7e..515e6af9bed2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -699,7 +699,7 @@ static void end_io_acct(struct dm_io *io) true, duration, &io->stats_aux); /* nudge anyone waiting on suspend queue */ - if (unlikely(waitqueue_active(&md->wait))) + if (unlikely(wq_has_sleeper(&md->wait))) wake_up(&md->wait); } @@ -1320,7 +1320,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, __bio_clone_fast(clone, bio); - if (unlikely(bio_integrity(bio) != NULL)) { + if (bio_integrity(bio)) { int r; if (unlikely(!dm_target_has_integrity(tio->ti->type) && @@ -1339,7 +1339,7 @@ static int clone_bio(struct dm_target_io *tio, struct bio *bio, bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(len); - if (unlikely(bio_integrity(bio) != NULL)) + if (bio_integrity(bio)) bio_integrity_trim(clone); return 0; @@ -1588,6 +1588,9 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md, ci->sector = bio->bi_iter.bi_sector; } +#define __dm_part_stat_sub(part, field, subnd) \ + (part_stat_get(part, field) -= (subnd)) + /* * Entry point to split a bio into clones and submit them to the targets. */ @@ -1642,7 +1645,21 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count, GFP_NOIO, &md->queue->bio_split); ci.io->orig_bio = b; + + /* + * Adjust IO stats for each split, otherwise upon queue + * reentry there will be redundant IO accounting. + * NOTE: this is a stop-gap fix, a proper fix involves + * significant refactoring of DM core's bio splitting + * (by eliminating DM's splitting and just using bio_split) + */ + part_stat_lock(); + __dm_part_stat_sub(&dm_disk(md)->part0, + sectors[op_stat_group(bio_op(bio))], ci.sector_count); + part_stat_unlock(); + bio_chain(b, bio); + trace_block_split(md->queue, b, bio->bi_iter.bi_sector); ret = generic_make_request(bio); break; } @@ -1713,6 +1730,15 @@ out: return ret; } +static blk_qc_t dm_process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) +{ + if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED) + return __process_bio(md, map, bio); + else + return __split_and_process_bio(md, map, bio); +} + static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) { struct mapped_device *md = q->queuedata; @@ -1733,10 +1759,7 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) return ret; } - if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED) - ret = __process_bio(md, map, bio); - else - ret = __split_and_process_bio(md, map, bio); + ret = dm_process_bio(md, map, bio); dm_put_live_table(md, srcu_idx); return ret; @@ -2415,9 +2438,9 @@ static void dm_wq_work(struct work_struct *work) break; if (dm_request_based(md)) - generic_make_request(c); + (void) generic_make_request(c); else - __split_and_process_bio(md, map, c); + (void) dm_process_bio(md, map, c); } dm_put_live_table(md, srcu_idx); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index d45c697c0ebe..5998d78aa189 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -96,8 +96,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) int i, cnt; bool discard_supported = false; - conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), - GFP_KERNEL); + conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL); if (!conf) return NULL; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1d54109071cc..fdf451aac369 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1603,11 +1603,9 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev) return; } set_bit(Blocked, &rdev->flags); - if (test_and_clear_bit(In_sync, &rdev->flags)) { + if (test_and_clear_bit(In_sync, &rdev->flags)) mddev->degraded++; - set_bit(Faulty, &rdev->flags); - } else - set_bit(Faulty, &rdev->flags); + set_bit(Faulty, &rdev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); /* * if recovery is running, make sure it aborts. @@ -1863,6 +1861,20 @@ static void end_sync_read(struct bio *bio) reschedule_retry(r1_bio); } +static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) +{ + sector_t sync_blocks = 0; + sector_t s = r1_bio->sector; + long sectors_to_go = r1_bio->sectors; + + /* make sure these bits don't get cleared. */ + do { + md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); + s += sync_blocks; + sectors_to_go -= sync_blocks; + } while (sectors_to_go > 0); +} + static void end_sync_write(struct bio *bio) { int uptodate = !bio->bi_status; @@ -1874,15 +1886,7 @@ static void end_sync_write(struct bio *bio) struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; if (!uptodate) { - sector_t sync_blocks = 0; - sector_t s = r1_bio->sector; - long sectors_to_go = r1_bio->sectors; - /* make sure these bits doesn't get cleared. */ - do { - md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); - s += sync_blocks; - sectors_to_go -= sync_blocks; - } while (sectors_to_go > 0); + abort_sync_write(mddev, r1_bio); set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & @@ -2114,13 +2118,14 @@ static void process_checks(struct r1bio *r1_bio) struct page **spages = get_resync_pages(sbio)->pages; struct bio_vec *bi; int page_len[RESYNC_PAGES] = { 0 }; + struct bvec_iter_all iter_all; if (sbio->bi_end_io != end_sync_read) continue; /* Now we can 'fixup' the error value */ sbio->bi_status = 0; - bio_for_each_segment_all(bi, sbio, j) + bio_for_each_segment_all(bi, sbio, j, iter_all) page_len[j] = bi->bv_len; if (!status) { @@ -2172,8 +2177,10 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) (i == r1_bio->read_disk || !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) continue; - if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) + if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) { + abort_sync_write(mddev, r1_bio); continue; + } bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index ec3a5ef7fee0..cbbe6b6535be 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1935,12 +1935,14 @@ out: } static struct stripe_head * -r5c_recovery_alloc_stripe(struct r5conf *conf, - sector_t stripe_sect) +r5c_recovery_alloc_stripe( + struct r5conf *conf, + sector_t stripe_sect, + int noblock) { struct stripe_head *sh; - sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0); + sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0); if (!sh) return NULL; /* no more stripe available */ @@ -2150,7 +2152,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, stripe_sect); if (!sh) { - sh = r5c_recovery_alloc_stripe(conf, stripe_sect); + sh = r5c_recovery_alloc_stripe(conf, stripe_sect, 1); /* * cannot get stripe from raid5_get_active_stripe * try replay some stripes @@ -2159,20 +2161,29 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, r5c_recovery_replay_stripes( cached_stripe_list, ctx); sh = r5c_recovery_alloc_stripe( - conf, stripe_sect); + conf, stripe_sect, 1); } if (!sh) { + int new_size = conf->min_nr_stripes * 2; pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n", mdname(mddev), - conf->min_nr_stripes * 2); - raid5_set_cache_size(mddev, - conf->min_nr_stripes * 2); - sh = r5c_recovery_alloc_stripe(conf, - stripe_sect); + new_size); + ret = raid5_set_cache_size(mddev, new_size); + if (conf->min_nr_stripes <= new_size / 2) { + pr_err("md/raid:%s: Cannot increase cache size, ret=%d, new_size=%d, min_nr_stripes=%d, max_nr_stripes=%d\n", + mdname(mddev), + ret, + new_size, + conf->min_nr_stripes, + conf->max_nr_stripes); + return -ENOMEM; + } + sh = r5c_recovery_alloc_stripe( + conf, stripe_sect, 0); } if (!sh) { pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n", - mdname(mddev)); + mdname(mddev)); return -ENOMEM; } list_add_tail(&sh->lru, cached_stripe_list); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4990f0319f6c..cecea901ab8c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6369,6 +6369,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page) int raid5_set_cache_size(struct mddev *mddev, int size) { + int result = 0; struct r5conf *conf = mddev->private; if (size <= 16 || size > 32768) @@ -6385,11 +6386,14 @@ raid5_set_cache_size(struct mddev *mddev, int size) mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) - if (!grow_one_stripe(conf, GFP_KERNEL)) + if (!grow_one_stripe(conf, GFP_KERNEL)) { + conf->min_nr_stripes = conf->max_nr_stripes; + result = -ENOMEM; break; + } mutex_unlock(&conf->cache_size_mutex); - return 0; + return result; } EXPORT_SYMBOL(raid5_set_cache_size); |