Diffstat (limited to 'drivers/md/raid5-cache.c')
-rw-r--r--  drivers/md/raid5-cache.c  365
1 files changed, 296 insertions, 69 deletions
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index d7bfb6fc8aef..3f307be01b10 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -20,6 +20,7 @@
#include <linux/crc32c.h>
#include <linux/random.h>
#include <linux/kthread.h>
+#include <linux/types.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
@@ -162,9 +163,62 @@ struct r5l_log {
/* to submit async io_units, to fulfill ordering of flush */
struct work_struct deferred_io_work;
+ /* to disable write back when the array is in degraded mode */
+ struct work_struct disable_writeback_work;
+
+ /* to support chunk_aligned_read in writeback mode, details below */
+ spinlock_t tree_lock;
+ struct radix_tree_root big_stripe_tree;
};
/*
+ * Enable chunk_aligned_read() with write back cache.
+ *
+ * Each chunk may contain more than one stripe (for example, a 256kB
+ * chunk contains 64 4kB pages, so this chunk contains 64 stripes). For
+ * chunk_aligned_read, these stripes are grouped into one "big_stripe".
+ * For each big_stripe, we count how many stripes of this big_stripe
+ * are in the write back cache. These data are tracked in a radix tree
+ * (big_stripe_tree). We use radix_tree item pointer as the counter.
+ * r5c_tree_index() is used to calculate keys for the radix tree.
+ *
+ * chunk_aligned_read() calls r5c_big_stripe_cached() to look up
+ * big_stripe of each chunk in the tree. If this big_stripe is in the
+ * tree, chunk_aligned_read() aborts. This lookup is protected by
+ * rcu_read_lock().
+ *
+ * It is necessary to remember whether a stripe is counted in
+ * big_stripe_tree. Instead of adding a new flag, we reuse existing flags:
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these
+ * two flags is set, the stripe is counted in big_stripe_tree. This
+ * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to
+ * r5c_try_caching_write(), and moving the clear_bit of
+ * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to
+ * r5c_finish_stripe_write_out().
+ */
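/*
 * Illustrative sketch (not part of this patch): how a caller such as
 * chunk_aligned_read() in raid5.c is expected to use the lookup described
 * above. The helper name and the align_bi parameter are assumptions made
 * for illustration only; r5c_big_stripe_cached() is defined later in this
 * file.
 */
static bool r5c_chunk_has_cached_writes(struct r5conf *conf,
					struct bio *align_bi)
{
	bool cached;

	rcu_read_lock();
	/* is any stripe of this chunk's big_stripe in the write back cache? */
	cached = r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector);
	rcu_read_unlock();

	/* if so, the caller aborts the aligned read and uses the stripe cache */
	return cached;
}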
+
+/*
+ * The radix tree requires the lowest 2 bits of the data pointer to be 2b'00,
+ * so it is necessary to left shift the counter by 2 bits before using it
+ * as the data pointer of the tree.
+ */
+#define R5C_RADIX_COUNT_SHIFT 2
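/*
 * A minimal sketch of the encoding implied by the comment above (the helper
 * names are illustrative, not part of this patch): the counter is shifted
 * left so the stored "pointer" always has its low 2 bits clear. The counter
 * is manipulated as a uintptr_t, which is presumably why this patch adds
 * the <linux/types.h> include.
 */
static inline void *r5c_count_to_item(uintptr_t count)
{
	return (void *)(count << R5C_RADIX_COUNT_SHIFT);
}

static inline uintptr_t r5c_item_to_count(void *item)
{
	return (uintptr_t)item >> R5C_RADIX_COUNT_SHIFT;
}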
+
+/*
+ * calculate key for big_stripe_tree
+ *
+ * sect: align_bi->bi_iter.bi_sector or sh->sector
+ */
+static inline sector_t r5c_tree_index(struct r5conf *conf,
+ sector_t sect)
+{
+ sector_t offset;
+
+ offset = sector_div(sect, conf->chunk_sectors);
+ return sect;
+}
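/*
 * Worked example (illustrative numbers): with conf->chunk_sectors == 512
 * (a 256kB chunk), sectors 0..511 map to key 0, sectors 512..1023 to key 1,
 * and so on. sector_div() divides sect in place and returns the remainder,
 * so the value returned above is simply the chunk number of the sector.
 */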
+
+/*
* an IO range starts from a meta data block and end at the next meta data
* block. The io unit's the meta data block tracks data/parity followed it. io
* unit is written to log disk with normal write, as we always flush log disk
@@ -335,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
/*
* Total log space (in sectors) needed to flush all data in cache
*
- * Currently, writing-out phase automatically includes all pending writes
- * to the same sector. So the reclaim of each stripe takes up to
- * (conf->raid_disks + 1) pages of log space.
+ * To avoid deadlock due to log space, it is necessary to reserve log
+ * space to flush critical stripes (stripes that occupy log space near
+ * last_checkpoint). This function helps check how much log space is
+ * required to flush all cached stripes.
*
- * To totally avoid deadlock due to log space, the code reserves
- * (conf->raid_disks + 1) pages for each stripe in cache, which is not
- * necessary in most cases.
+ * To reduce log space requirements, two mechanisms are used to give cache
+ * flushes higher priority:
+ * 1. In handle_stripe_dirtying() and schedule_reconstruction(),
+ * stripes ALREADY in journal can be flushed w/o pending writes;
+ * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
+ * can be delayed (r5l_add_no_space_stripe).
*
- * To improve this, we will need writing-out phase to be able to NOT include
- * pending writes, which will reduce the requirement to
- * (conf->max_degraded + 1) pages per stripe in cache.
+ * In a cache flush, the stripe goes through 1 and then 2. For a stripe that
+ * has already passed 1, flushing it requires at most (conf->max_degraded + 1)
+ * pages of journal space. For a stripe that has not passed 1, flushing it
+ * requires (conf->raid_disks + 1) pages of journal space. There are at
+ * most (conf->group_cnt + 1) stripes that have passed 1. So total journal space
+ * required to flush all cached stripes (in pages) is:
+ *
+ * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
+ * (group_cnt + 1) * (raid_disks + 1)
+ * or
+ * (stripe_in_journal_count) * (max_degraded + 1) +
+ * (group_cnt + 1) * (raid_disks - max_degraded)
*/
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
@@ -354,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
if (!r5c_is_writeback(log))
return 0;
- return BLOCK_SECTORS * (conf->raid_disks + 1) *
- atomic_read(&log->stripe_in_journal_count);
+ return BLOCK_SECTORS *
+ ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
+ (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
}
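/*
 * Worked example for the formula above (illustrative numbers, not from this
 * patch): a RAID6 array with raid_disks = 6, max_degraded = 2, group_cnt = 0
 * and stripe_in_journal_count = 100 needs
 *
 *     100 * (2 + 1) + (0 + 1) * (6 - 2) = 304
 *
 * pages, i.e. 304 * BLOCK_SECTORS sectors of reserved journal space.
 */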
/*
@@ -410,16 +478,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh)
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes);
-
- if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
- BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
- atomic_dec(&conf->r5c_cached_partial_stripes);
- }
-
- if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
- BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
- atomic_dec(&conf->r5c_cached_full_stripes);
- }
}
static void r5c_handle_data_cached(struct stripe_head *sh)
@@ -611,6 +669,21 @@ static void r5l_submit_io_async(struct work_struct *work)
r5l_do_submit_io(log, io);
}
+static void r5c_disable_writeback_async(struct work_struct *work)
+{
+ struct r5l_log *log = container_of(work, struct r5l_log,
+ disable_writeback_work);
+ struct mddev *mddev = log->rdev->mddev;
+
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+ return;
+ pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
+ mdname(mddev));
+ mddev_suspend(mddev);
+ log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+ mddev_resume(mddev);
+}
+
static void r5l_submit_current_io(struct r5l_log *log)
{
struct r5l_io_unit *io = log->current_io;
@@ -1254,6 +1327,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
atomic_inc(&conf->active_stripes);
r5c_make_stripe_write_out(sh);
+ if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+ atomic_inc(&conf->r5c_flushing_partial_stripes);
+ else
+ atomic_inc(&conf->r5c_flushing_full_stripes);
raid5_release_stripe(sh);
}
@@ -1296,12 +1373,16 @@ static void r5c_do_reclaim(struct r5conf *conf)
unsigned long flags;
int total_cached;
int stripes_to_flush;
+ int flushing_partial, flushing_full;
if (!r5c_is_writeback(log))
return;
+ flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes);
+ flushing_full = atomic_read(&conf->r5c_flushing_full_stripes);
total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
- atomic_read(&conf->r5c_cached_full_stripes);
+ atomic_read(&conf->r5c_cached_full_stripes) -
+ flushing_full - flushing_partial;
if (total_cached > conf->min_nr_stripes * 3 / 4 ||
atomic_read(&conf->empty_inactive_list_nr) > 0)
@@ -1311,7 +1392,7 @@ static void r5c_do_reclaim(struct r5conf *conf)
*/
stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
- atomic_read(&conf->r5c_cached_full_stripes) >
+ atomic_read(&conf->r5c_cached_full_stripes) - flushing_full >
R5C_FULL_STRIPE_FLUSH_BATCH)
/*
* if stripe cache pressure moderate, or if there is many full
@@ -1345,9 +1426,9 @@ static void r5c_do_reclaim(struct r5conf *conf)
!test_bit(STRIPE_HANDLE, &sh->state) &&
atomic_read(&sh->count) == 0) {
r5c_flush_stripe(conf, sh);
+ if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
+ break;
}
- if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
- break;
}
spin_unlock(&conf->device_lock);
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -1393,8 +1474,6 @@ static void r5l_do_reclaim(struct r5l_log *log)
next_checkpoint = r5c_calculate_new_cp(conf);
spin_unlock_irq(&log->io_list_lock);
- BUG_ON(reclaimable < 0);
-
if (reclaimable == 0 || !write_super)
return;
@@ -1682,8 +1761,7 @@ out:
static struct stripe_head *
r5c_recovery_alloc_stripe(struct r5conf *conf,
- sector_t stripe_sect,
- sector_t log_start)
+ sector_t stripe_sect)
{
struct stripe_head *sh;
@@ -1692,7 +1770,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf,
return NULL; /* no more stripe available */
r5l_recovery_reset_stripe(sh);
- sh->log_start = log_start;
return sh;
}
@@ -1862,7 +1939,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
stripe_sect);
if (!sh) {
- sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
+ sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
/*
* cannot get stripe from raid5_get_active_stripe
* try replay some stripes
@@ -1871,7 +1948,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
r5c_recovery_replay_stripes(
cached_stripe_list, ctx);
sh = r5c_recovery_alloc_stripe(
- conf, stripe_sect, ctx->pos);
+ conf, stripe_sect);
}
if (!sh) {
pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
@@ -1879,8 +1956,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
conf->min_nr_stripes * 2);
raid5_set_cache_size(mddev,
conf->min_nr_stripes * 2);
- sh = r5c_recovery_alloc_stripe(
- conf, stripe_sect, ctx->pos);
+ sh = r5c_recovery_alloc_stripe(conf,
+ stripe_sect);
}
if (!sh) {
pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
@@ -1894,7 +1971,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
r5l_recovery_replay_one_stripe(conf, sh, ctx);
- sh->log_start = ctx->pos;
list_move_tail(&sh->lru, cached_stripe_list);
}
r5l_recovery_load_data(log, sh, ctx, payload,
@@ -1933,8 +2009,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
set_bit(R5_UPTODATE, &dev->flags);
}
}
- list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
- atomic_inc(&log->stripe_in_journal_count);
}
/*
@@ -2067,9 +2141,10 @@ static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
- struct stripe_head *sh, *next;
+ struct stripe_head *sh;
struct mddev *mddev = log->rdev->mddev;
struct page *page;
+ sector_t next_checkpoint = MaxSector;
page = alloc_page(GFP_KERNEL);
if (!page) {
@@ -2078,7 +2153,9 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
return -ENOMEM;
}
- list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+ WARN_ON(list_empty(&ctx->cached_list));
+
+ list_for_each_entry(sh, &ctx->cached_list, lru) {
struct r5l_meta_block *mb;
int i;
int offset;
@@ -2123,14 +2200,42 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
REQ_OP_WRITE, REQ_FUA, false);
sh->log_start = ctx->pos;
+ list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
+ atomic_inc(&log->stripe_in_journal_count);
ctx->pos = write_pos;
ctx->seq += 1;
+ next_checkpoint = sh->log_start;
+ }
+ log->next_checkpoint = next_checkpoint;
+ __free_page(page);
+ return 0;
+}
+
+static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
+{
+ struct mddev *mddev = log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+ struct stripe_head *sh, *next;
+
+ if (ctx->data_only_stripes == 0)
+ return;
+ log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
+
+ list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+ r5c_make_stripe_write_out(sh);
+ set_bit(STRIPE_HANDLE, &sh->state);
list_del_init(&sh->lru);
raid5_release_stripe(sh);
}
- __free_page(page);
- return 0;
+
+ md_wakeup_thread(conf->mddev->thread);
+ /* reuse conf->wait_for_quiescent in recovery */
+ wait_event(conf->wait_for_quiescent,
+ atomic_read(&conf->active_stripes) == 0);
+
+ log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
}
static int r5l_recovery_log(struct r5l_log *log)
@@ -2139,7 +2244,6 @@ static int r5l_recovery_log(struct r5l_log *log)
struct r5l_recovery_ctx ctx;
int ret;
sector_t pos;
- struct stripe_head *sh;
ctx.pos = log->last_checkpoint;
ctx.seq = log->last_cp_seq;
@@ -2160,35 +2264,31 @@ static int r5l_recovery_log(struct r5l_log *log)
pos = ctx.pos;
ctx.seq += 10000;
- if (ctx.data_only_stripes == 0) {
- log->next_checkpoint = ctx.pos;
- r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
- ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
- } else {
- sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru);
- log->next_checkpoint = sh->log_start;
- }
if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
pr_debug("md/raid:%s: starting from clean shutdown\n",
mdname(mddev));
- else {
- pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n",
+ else
+ pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
mdname(mddev), ctx.data_only_stripes,
ctx.data_parity_stripes);
- if (ctx.data_only_stripes > 0)
- if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
- pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
- mdname(mddev));
- return -EIO;
- }
+ if (ctx.data_only_stripes == 0) {
+ log->next_checkpoint = ctx.pos;
+ r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+ ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+ } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+ pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+ mdname(mddev));
+ return -EIO;
}
log->log_start = ctx.pos;
log->seq = ctx.seq;
log->last_checkpoint = pos;
r5l_write_super(log, pos);
+
+ r5c_recovery_flush_data_only_stripes(log, &ctx);
return 0;
}
@@ -2250,6 +2350,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
val > R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
+ if (raid5_calc_degraded(conf) > 0 &&
+ val == R5C_JOURNAL_MODE_WRITE_BACK)
+ return -EINVAL;
+
mddev_suspend(mddev);
conf->log->r5c_journal_mode = val;
mddev_resume(mddev);
@@ -2280,6 +2384,10 @@ int r5c_try_caching_write(struct r5conf *conf,
int i;
struct r5dev *dev;
int to_cache = 0;
+ void **pslot;
+ sector_t tree_index;
+ int ret;
+ uintptr_t refcount;
BUG_ON(!r5c_is_writeback(log));
@@ -2304,6 +2412,16 @@ int r5c_try_caching_write(struct r5conf *conf,
set_bit(STRIPE_R5C_CACHING, &sh->state);
}
+ /*
+ * When run in degraded mode, the array is set to write-through mode.
+ * This check helps drain pending writes safely during the transition to
+ * write-through mode.
+ */
+ if (s->failed) {
+ r5c_make_stripe_write_out(sh);
+ return -EAGAIN;
+ }
+
for (i = disks; i--; ) {
dev = &sh->dev[i];
/* if non-overwrite, use writing-out phase */
@@ -2314,6 +2432,44 @@ int r5c_try_caching_write(struct r5conf *conf,
}
}
+ /* if the stripe is not counted in big_stripe_tree, add it now */
+ if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+ !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+ tree_index = r5c_tree_index(conf, sh->sector);
+ spin_lock(&log->tree_lock);
+ pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+ tree_index);
+ if (pslot) {
+ refcount = (uintptr_t)radix_tree_deref_slot_protected(
+ pslot, &log->tree_lock) >>
+ R5C_RADIX_COUNT_SHIFT;
+ radix_tree_replace_slot(
+ &log->big_stripe_tree, pslot,
+ (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT));
+ } else {
+ /*
+ * this radix_tree_insert can fail safely, so no
+ * need to call radix_tree_preload()
+ */
+ ret = radix_tree_insert(
+ &log->big_stripe_tree, tree_index,
+ (void *)(1 << R5C_RADIX_COUNT_SHIFT));
+ if (ret) {
+ spin_unlock(&log->tree_lock);
+ r5c_make_stripe_write_out(sh);
+ return -EAGAIN;
+ }
+ }
+ spin_unlock(&log->tree_lock);
+
+ /*
+ * set STRIPE_R5C_PARTIAL_STRIPE; this shows the stripe is
+ * counted in the radix tree
+ */
+ set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
+ atomic_inc(&conf->r5c_cached_partial_stripes);
+ }
+
for (i = disks; i--; ) {
dev = &sh->dev[i];
if (dev->towrite) {
@@ -2354,6 +2510,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
struct page *p = sh->dev[i].orig_page;
sh->dev[i].orig_page = sh->dev[i].page;
+ clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
if (!using_disk_info_extra_page)
put_page(p);
}
@@ -2386,17 +2544,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s)
{
+ struct r5l_log *log = conf->log;
int i;
int do_wakeup = 0;
+ sector_t tree_index;
+ void **pslot;
+ uintptr_t refcount;
- if (!conf->log ||
- !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
+ if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
return;
WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
- if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+ if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
for (i = sh->disks; i--; ) {
@@ -2418,15 +2579,45 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
if (do_wakeup)
wake_up(&conf->wait_for_overlap);
- if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
- return;
-
- spin_lock_irq(&conf->log->stripe_in_journal_lock);
+ spin_lock_irq(&log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
- spin_unlock_irq(&conf->log->stripe_in_journal_lock);
+ spin_unlock_irq(&log->stripe_in_journal_lock);
sh->log_start = MaxSector;
- atomic_dec(&conf->log->stripe_in_journal_count);
- r5c_update_log_state(conf->log);
+
+ atomic_dec(&log->stripe_in_journal_count);
+ r5c_update_log_state(log);
+
+ /* stop counting this stripe in big_stripe_tree */
+ if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
+ test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+ tree_index = r5c_tree_index(conf, sh->sector);
+ spin_lock(&log->tree_lock);
+ pslot = radix_tree_lookup_slot(&log->big_stripe_tree,
+ tree_index);
+ BUG_ON(pslot == NULL);
+ refcount = (uintptr_t)radix_tree_deref_slot_protected(
+ pslot, &log->tree_lock) >>
+ R5C_RADIX_COUNT_SHIFT;
+ if (refcount == 1)
+ radix_tree_delete(&log->big_stripe_tree, tree_index);
+ else
+ radix_tree_replace_slot(
+ &log->big_stripe_tree, pslot,
+ (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT));
+ spin_unlock(&log->tree_lock);
+ }
+
+ if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+ BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+ atomic_dec(&conf->r5c_flushing_partial_stripes);
+ atomic_dec(&conf->r5c_cached_partial_stripes);
+ }
+
+ if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+ BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+ atomic_dec(&conf->r5c_flushing_full_stripes);
+ atomic_dec(&conf->r5c_cached_full_stripes);
+ }
}
int
@@ -2486,6 +2677,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
return 0;
}
+/* check whether this big stripe is in write back cache. */
+bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
+{
+ struct r5l_log *log = conf->log;
+ sector_t tree_index;
+ void *slot;
+
+ if (!log)
+ return false;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ tree_index = r5c_tree_index(conf, sect);
+ slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
+ return slot != NULL;
+}
+
static int r5l_load_log(struct r5l_log *log)
{
struct md_rdev *rdev = log->rdev;
@@ -2561,6 +2768,19 @@ ioerr:
return ret;
}
+void r5c_update_on_rdev_error(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+ struct r5l_log *log = conf->log;
+
+ if (!log)
+ return;
+
+ if (raid5_calc_degraded(conf) > 0 &&
+ conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+ schedule_work(&log->disable_writeback_work);
+}
+
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -2619,6 +2839,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
if (!log->meta_pool)
goto out_mempool;
+ spin_lock_init(&log->tree_lock);
+ INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN);
+
log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
log->rdev->mddev, "reclaim");
if (!log->reclaim_thread)
@@ -2633,20 +2856,23 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->no_space_stripes_lock);
INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+ INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
INIT_LIST_HEAD(&log->stripe_in_journal_list);
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
+ rcu_assign_pointer(conf->log, log);
+
if (r5l_load_log(log))
goto error;
- rcu_assign_pointer(conf->log, log);
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
error:
+ rcu_assign_pointer(conf->log, NULL);
md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
mempool_destroy(log->meta_pool);
@@ -2663,6 +2889,7 @@ io_kc:
void r5l_exit_log(struct r5l_log *log)
{
+ flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
mempool_destroy(log->meta_pool);
bioset_free(log->bs);