diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 621 |
1 files changed, 354 insertions, 267 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index cb20d0b0555a..4e957f3140a8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -36,6 +36,7 @@ #include <linux/blkdev.h> #include <linux/sysctl.h> #include <linux/seq_file.h> +#include <linux/mutex.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ #include <linux/poll.h> #include <linux/ctype.h> @@ -56,7 +57,6 @@ #define DEBUG 0 #define dprintk(x...) ((void)(DEBUG && printk(x))) - #ifndef MODULE static void autostart_arrays(int part); #endif @@ -67,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock); static void md_print_devices(void); static DECLARE_WAIT_QUEUE_HEAD(resync_wait); +static struct workqueue_struct *md_wq; +static struct workqueue_struct *md_misc_wq; #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } @@ -147,6 +149,72 @@ static const struct block_device_operations md_fops; static int start_readonly; +/* bio_clone_mddev + * like bio_clone, but with a local bio set + */ + +static void mddev_bio_destructor(struct bio *bio) +{ + mddev_t *mddev, **mddevp; + + mddevp = (void*)bio; + mddev = mddevp[-1]; + + bio_free(bio, mddev->bio_set); +} + +struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, + mddev_t *mddev) +{ + struct bio *b; + mddev_t **mddevp; + + if (!mddev || !mddev->bio_set) + return bio_alloc(gfp_mask, nr_iovecs); + + b = bio_alloc_bioset(gfp_mask, nr_iovecs, + mddev->bio_set); + if (!b) + return NULL; + mddevp = (void*)b; + mddevp[-1] = mddev; + b->bi_destructor = mddev_bio_destructor; + return b; +} +EXPORT_SYMBOL_GPL(bio_alloc_mddev); + +struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, + mddev_t *mddev) +{ + struct bio *b; + mddev_t **mddevp; + + if (!mddev || !mddev->bio_set) + return bio_clone(bio, gfp_mask); + + b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, + mddev->bio_set); + if (!b) + return NULL; + mddevp = (void*)b; + mddevp[-1] = mddev; + b->bi_destructor = mddev_bio_destructor; + __bio_clone(b, bio); + if (bio_integrity(bio)) { + int ret; + + ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set); + + if (ret < 0) { + bio_put(b); + return NULL; + } + } + + return b; +} +EXPORT_SYMBOL_GPL(bio_clone_mddev); + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -225,12 +293,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) return 0; } rcu_read_lock(); - if (mddev->suspended || mddev->barrier) { + if (mddev->suspended) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended && !mddev->barrier) + if (!mddev->suspended) break; rcu_read_unlock(); schedule(); @@ -261,7 +329,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio) * Once ->stop is called and completes, the module will be completely * unused. */ -static void mddev_suspend(mddev_t *mddev) +void mddev_suspend(mddev_t *mddev) { BUG_ON(mddev->suspended); mddev->suspended = 1; @@ -269,50 +337,41 @@ static void mddev_suspend(mddev_t *mddev) wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); } +EXPORT_SYMBOL_GPL(mddev_suspend); -static void mddev_resume(mddev_t *mddev) +void mddev_resume(mddev_t *mddev) { mddev->suspended = 0; wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 0); } +EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(mddev_t *mddev, int bits) { - if (mddev->barrier) - return 1; return mddev->suspended; } EXPORT_SYMBOL(mddev_congested); /* - * Generic barrier handling for md + * Generic flush handling for md */ -#define POST_REQUEST_BARRIER ((void*)1) - -static void md_end_barrier(struct bio *bio, int err) +static void md_end_flush(struct bio *bio, int err) { mdk_rdev_t *rdev = bio->bi_private; mddev_t *mddev = rdev->mddev; - if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) - set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { - if (mddev->barrier == POST_REQUEST_BARRIER) { - /* This was a post-request barrier */ - mddev->barrier = NULL; - wake_up(&mddev->sb_wait); - } else - /* The pre-request barrier has finished */ - schedule_work(&mddev->barrier_work); + /* The pre-request flush has finished */ + queue_work(md_wq, &mddev->flush_work); } bio_put(bio); } -static void submit_barriers(mddev_t *mddev) +static void submit_flushes(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -328,61 +387,102 @@ static void submit_barriers(mddev_t *mddev) atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending); rcu_read_unlock(); - bi = bio_alloc(GFP_KERNEL, 0); - bi->bi_end_io = md_end_barrier; + bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev); + bi->bi_end_io = md_end_flush; bi->bi_private = rdev; bi->bi_bdev = rdev->bdev; atomic_inc(&mddev->flush_pending); - submit_bio(WRITE_BARRIER, bi); + submit_bio(WRITE_FLUSH, bi); rcu_read_lock(); rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); } -static void md_submit_barrier(struct work_struct *ws) +static void md_submit_flush_data(struct work_struct *ws) { - mddev_t *mddev = container_of(ws, mddev_t, barrier_work); - struct bio *bio = mddev->barrier; + mddev_t *mddev = container_of(ws, mddev_t, flush_work); + struct bio *bio = mddev->flush_bio; atomic_set(&mddev->flush_pending, 1); - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - bio_endio(bio, -EOPNOTSUPP); - else if (bio->bi_size == 0) + if (bio->bi_size == 0) /* an empty barrier - all done */ bio_endio(bio, 0); else { - bio->bi_rw &= ~(1<<BIO_RW_BARRIER); + bio->bi_rw &= ~REQ_FLUSH; if (mddev->pers->make_request(mddev, bio)) generic_make_request(bio); - mddev->barrier = POST_REQUEST_BARRIER; - submit_barriers(mddev); } if (atomic_dec_and_test(&mddev->flush_pending)) { - mddev->barrier = NULL; + mddev->flush_bio = NULL; wake_up(&mddev->sb_wait); } } -void md_barrier_request(mddev_t *mddev, struct bio *bio) +void md_flush_request(mddev_t *mddev, struct bio *bio) { spin_lock_irq(&mddev->write_lock); wait_event_lock_irq(mddev->sb_wait, - !mddev->barrier, + !mddev->flush_bio, mddev->write_lock, /*nothing*/); - mddev->barrier = bio; + mddev->flush_bio = bio; spin_unlock_irq(&mddev->write_lock); atomic_set(&mddev->flush_pending, 1); - INIT_WORK(&mddev->barrier_work, md_submit_barrier); + INIT_WORK(&mddev->flush_work, md_submit_flush_data); - submit_barriers(mddev); + submit_flushes(mddev); if (atomic_dec_and_test(&mddev->flush_pending)) - schedule_work(&mddev->barrier_work); + queue_work(md_wq, &mddev->flush_work); } -EXPORT_SYMBOL(md_barrier_request); +EXPORT_SYMBOL(md_flush_request); + +/* Support for plugging. + * This mirrors the plugging support in request_queue, but does not + * require having a whole queue + */ +static void plugger_work(struct work_struct *work) +{ + struct plug_handle *plug = + container_of(work, struct plug_handle, unplug_work); + plug->unplug_fn(plug); +} +static void plugger_timeout(unsigned long data) +{ + struct plug_handle *plug = (void *)data; + kblockd_schedule_work(NULL, &plug->unplug_work); +} +void plugger_init(struct plug_handle *plug, + void (*unplug_fn)(struct plug_handle *)) +{ + plug->unplug_flag = 0; + plug->unplug_fn = unplug_fn; + init_timer(&plug->unplug_timer); + plug->unplug_timer.function = plugger_timeout; + plug->unplug_timer.data = (unsigned long)plug; + INIT_WORK(&plug->unplug_work, plugger_work); +} +EXPORT_SYMBOL_GPL(plugger_init); + +void plugger_set_plug(struct plug_handle *plug) +{ + if (!test_and_set_bit(PLUGGED_FLAG, &plug->unplug_flag)) + mod_timer(&plug->unplug_timer, jiffies + msecs_to_jiffies(3)+1); +} +EXPORT_SYMBOL_GPL(plugger_set_plug); + +int plugger_remove_plug(struct plug_handle *plug) +{ + if (test_and_clear_bit(PLUGGED_FLAG, &plug->unplug_flag)) { + del_timer(&plug->unplug_timer); + return 1; + } else + return 0; +} +EXPORT_SYMBOL_GPL(plugger_remove_plug); + static inline mddev_t *mddev_get(mddev_t *mddev) { @@ -394,6 +494,8 @@ static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(mddev_t *mddev) { + struct bio_set *bs = NULL; + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; if (!mddev->raid_disks && list_empty(&mddev->disks) && @@ -401,22 +503,25 @@ static void mddev_put(mddev_t *mddev) /* Array is not configured at all, and not held active, * so destroy it */ list_del(&mddev->all_mddevs); + bs = mddev->bio_set; + mddev->bio_set = NULL; if (mddev->gendisk) { - /* we did a probe so need to clean up. - * Call schedule_work inside the spinlock - * so that flush_scheduled_work() after - * mddev_find will succeed in waiting for the - * work to be done. + /* We did a probe so need to clean up. Call + * queue_work inside the spinlock so that + * flush_workqueue() after mddev_find will + * succeed in waiting for the work to be done. */ INIT_WORK(&mddev->del_work, mddev_delayed_delete); - schedule_work(&mddev->del_work); + queue_work(md_misc_wq, &mddev->del_work); } else kfree(mddev); } spin_unlock(&all_mddevs_lock); + if (bs) + bioset_free(bs); } -static void mddev_init(mddev_t *mddev) +void mddev_init(mddev_t *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); @@ -436,6 +541,7 @@ static void mddev_init(mddev_t *mddev) mddev->resync_max = MaxSector; mddev->level = LEVEL_NONE; } +EXPORT_SYMBOL_GPL(mddev_init); static mddev_t * mddev_find(dev_t unit) { @@ -532,25 +638,31 @@ static void mddev_unlock(mddev_t * mddev) * an access to the files will try to take reconfig_mutex * while holding the file unremovable, which leads to * a deadlock. - * So hold open_mutex instead - we are allowed to take - * it while holding reconfig_mutex, and md_run can - * use it to wait for the remove to complete. + * So hold set sysfs_active while the remove in happeing, + * and anything else which might set ->to_remove or my + * otherwise change the sysfs namespace will fail with + * -EBUSY if sysfs_active is still set. + * We set sysfs_active under reconfig_mutex and elsewhere + * test it under the same mutex to ensure its correct value + * is seen. */ struct attribute_group *to_remove = mddev->to_remove; mddev->to_remove = NULL; - mutex_lock(&mddev->open_mutex); + mddev->sysfs_active = 1; mutex_unlock(&mddev->reconfig_mutex); - if (to_remove != &md_redundancy_group) - sysfs_remove_group(&mddev->kobj, to_remove); - if (mddev->pers == NULL || - mddev->pers->sync_request == NULL) { - sysfs_remove_group(&mddev->kobj, &md_redundancy_group); - if (mddev->sysfs_action) - sysfs_put(mddev->sysfs_action); - mddev->sysfs_action = NULL; + if (mddev->kobj.sd) { + if (to_remove != &md_redundancy_group) + sysfs_remove_group(&mddev->kobj, to_remove); + if (mddev->pers == NULL || + mddev->pers->sync_request == NULL) { + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + if (mddev->sysfs_action) + sysfs_put(mddev->sysfs_action); + mddev->sysfs_action = NULL; + } } - mutex_unlock(&mddev->open_mutex); + mddev->sysfs_active = 0; } else mutex_unlock(&mddev->reconfig_mutex); @@ -641,31 +753,6 @@ static void super_written(struct bio *bio, int error) bio_put(bio); } -static void super_written_barrier(struct bio *bio, int error) -{ - struct bio *bio2 = bio->bi_private; - mdk_rdev_t *rdev = bio2->bi_private; - mddev_t *mddev = rdev->mddev; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && - error == -EOPNOTSUPP) { - unsigned long flags; - /* barriers don't appear to be supported :-( */ - set_bit(BarriersNotsupp, &rdev->flags); - mddev->barriers_work = 0; - spin_lock_irqsave(&mddev->write_lock, flags); - bio2->bi_next = mddev->biolist; - mddev->biolist = bio2; - spin_unlock_irqrestore(&mddev->write_lock, flags); - wake_up(&mddev->sb_wait); - bio_put(bio); - } else { - bio_put(bio2); - bio->bi_private = rdev; - super_written(bio, error); - } -} - void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page) { @@ -674,51 +761,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error - * - * As we might need to resubmit the request if BIO_RW_BARRIER - * causes ENOTSUPP, we allocate a spare bio... */ - struct bio *bio = bio_alloc(GFP_NOIO, 1); - int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); bio->bi_bdev = rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio->bi_rw = rw; atomic_inc(&mddev->pending_writes); - if (!test_bit(BarriersNotsupp, &rdev->flags)) { - struct bio *rbio; - rw |= (1<<BIO_RW_BARRIER); - rbio = bio_clone(bio, GFP_NOIO); - rbio->bi_private = bio; - rbio->bi_end_io = super_written_barrier; - submit_bio(rw, rbio); - } else - submit_bio(rw, bio); + submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA, + bio); } void md_super_wait(mddev_t *mddev) { - /* wait for all superblock writes that were scheduled to complete. - * if any had to be retried (due to BARRIER problems), retry them - */ + /* wait for all superblock writes that were scheduled to complete */ DEFINE_WAIT(wq); for(;;) { prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); if (atomic_read(&mddev->pending_writes)==0) break; - while (mddev->biolist) { - struct bio *bio; - spin_lock_irq(&mddev->write_lock); - bio = mddev->biolist; - mddev->biolist = bio->bi_next ; - bio->bi_next = NULL; - spin_unlock_irq(&mddev->write_lock); - submit_bio(bio->bi_rw, bio); - } schedule(); } finish_wait(&mddev->sb_wait, &wq); @@ -729,16 +793,16 @@ static void bi_complete(struct bio *bio, int error) complete((struct completion*)bio->bi_private); } -int sync_page_io(struct block_device *bdev, sector_t sector, int size, - struct page *page, int rw) +int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size, + struct page *page, int rw) { - struct bio *bio = bio_alloc(GFP_NOIO, 1); + struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); struct completion event; int ret; - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; - bio->bi_bdev = bdev; + bio->bi_bdev = rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); init_completion(&event); @@ -764,7 +828,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size) return 0; - if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) + if (!sync_page_io(rdev, rdev->sb_start, size, rdev->sb_page, READ)) goto fail; rdev->sb_loaded = 1; return 0; @@ -1015,7 +1079,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 0; @@ -1430,7 +1493,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) clear_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); - clear_bit(BarriersNotsupp, &rdev->flags); if (mddev->raid_disks == 0) { mddev->major_version = 1; @@ -1588,7 +1650,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; - } + } else + max_dev = le32_to_cpu(sb->max_dev); + for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1811,11 +1875,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) goto fail; ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { - kobject_del(&rdev->kobj); - goto fail; - } - rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, NULL, "state"); + if (sysfs_create_link(&rdev->kobj, ko, "block")) + /* failure here is OK */; + rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); list_add_rcu(&rdev->same_set, &mddev->disks); bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); @@ -1859,7 +1921,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) synchronize_rcu(); INIT_WORK(&rdev->del_work, md_delayed_delete); kobject_get(&rdev->kobj); - schedule_work(&rdev->del_work); + queue_work(md_misc_wq, &rdev->del_work); } /* @@ -2083,16 +2145,6 @@ static void sync_sbs(mddev_t * mddev, int nospares) * with the rest of the array) */ mdk_rdev_t *rdev; - - /* First make sure individual recovery_offsets are correct */ - list_for_each_entry(rdev, &mddev->disks, same_set) { - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(In_sync, &rdev->flags) && - mddev->curr_resync_completed > rdev->recovery_offset) - rdev->recovery_offset = mddev->curr_resync_completed; - - } list_for_each_entry(rdev, &mddev->disks, same_set) { if (rdev->sb_events == mddev->events || (nospares && @@ -2114,13 +2166,29 @@ static void md_update_sb(mddev_t * mddev, int force_change) int sync_req; int nospares = 0; - mddev->utime = get_seconds(); - if (mddev->external) - return; repeat: + /* First make sure individual recovery_offsets are correct */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(In_sync, &rdev->flags) && + mddev->curr_resync_completed > rdev->recovery_offset) + rdev->recovery_offset = mddev->curr_resync_completed; + + } + if (!mddev->persistent) { + clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_DEVS, &mddev->flags); + if (!mddev->external) + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + wake_up(&mddev->sb_wait); + return; + } + spin_lock_irq(&mddev->write_lock); - set_bit(MD_CHANGE_PENDING, &mddev->flags); + mddev->utime = get_seconds(); + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) force_change = 1; if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) @@ -2168,19 +2236,6 @@ repeat: MD_BUG(); mddev->events --; } - - /* - * do not write anything to disk if using - * nonpersistent superblocks - */ - if (!mddev->persistent) { - if (!mddev->external) - clear_bit(MD_CHANGE_PENDING, &mddev->flags); - - spin_unlock_irq(&mddev->write_lock); - wake_up(&mddev->sb_wait); - return; - } sync_sbs(mddev, nospares); spin_unlock_irq(&mddev->write_lock); @@ -2334,8 +2389,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) set_bit(In_sync, &rdev->flags); err = 0; } - if (!err && rdev->sysfs_state) - sysfs_notify_dirent(rdev->sysfs_state); + if (!err) + sysfs_notify_dirent_safe(rdev->sysfs_state); return err ? err : len; } static struct rdev_sysfs_entry rdev_state = @@ -2430,14 +2485,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) rdev->raid_disk = -1; return err; } else - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(rdev->mddev)); - + /* failure here is OK */; /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks) @@ -2447,7 +2498,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) clear_bit(Faulty, &rdev->flags); clear_bit(WriteMostly, &rdev->flags); set_bit(In_sync, &rdev->flags); - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } return len; } @@ -2695,6 +2746,24 @@ static struct kobj_type rdev_ktype = { .default_attrs = rdev_default_attrs, }; +void md_rdev_init(mdk_rdev_t *rdev) +{ + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; + rdev->last_read_error.tv_sec = 0; + rdev->last_read_error.tv_nsec = 0; + atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); + atomic_set(&rdev->corrected_errors, 0); + + INIT_LIST_HEAD(&rdev->same_set); + init_waitqueue_head(&rdev->blocked_wait); +} +EXPORT_SYMBOL_GPL(md_rdev_init); /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * @@ -2718,6 +2787,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi return ERR_PTR(-ENOMEM); } + md_rdev_init(rdev); if ((err = alloc_disk_sb(rdev))) goto abort_free; @@ -2727,18 +2797,6 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj, &rdev_ktype); - rdev->desc_nr = -1; - rdev->saved_raid_disk = -1; - rdev->raid_disk = -1; - rdev->flags = 0; - rdev->data_offset = 0; - rdev->sb_events = 0; - rdev->last_read_error.tv_sec = 0; - rdev->last_read_error.tv_nsec = 0; - atomic_set(&rdev->nr_pending, 0); - atomic_set(&rdev->read_errors, 0); - atomic_set(&rdev->corrected_errors, 0); - size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; if (!size) { printk(KERN_WARNING @@ -2767,9 +2825,6 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi } } - INIT_LIST_HEAD(&rdev->same_set); - init_waitqueue_head(&rdev->blocked_wait); - return rdev; abort_free: @@ -2960,7 +3015,9 @@ level_store(mddev_t *mddev, const char *buf, size_t len) * - new personality will access other array. */ - if (mddev->sync_thread || mddev->reshape_position != MaxSector) + if (mddev->sync_thread || + mddev->reshape_position != MaxSector || + mddev->sysfs_active) return -EBUSY; if (!mddev->pers->quiesce) { @@ -3324,7 +3381,7 @@ array_state_show(mddev_t *mddev, char *page) case 0: if (mddev->in_sync) st = clean; - else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) st = write_pending; else if (mddev->safemode) st = active_idle; @@ -3405,9 +3462,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) mddev->in_sync = 1; if (mddev->safemode == 1) mddev->safemode = 0; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, - &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } err = 0; } else @@ -3419,8 +3474,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) case active: if (mddev->pers) { restart_array(mddev); - if (mddev->external) - clear_bit(MD_CHANGE_CLEAN, &mddev->flags); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; } else { @@ -3437,7 +3491,7 @@ array_state_store(mddev_t *mddev, const char *buf, size_t len) if (err) return err; else { - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); return len; } } @@ -3735,7 +3789,7 @@ action_store(mddev_t *mddev, const char *page, size_t len) } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); return len; } @@ -4211,10 +4265,10 @@ static int md_alloc(dev_t dev, char *name) shift = partitioned ? MdpMinorShift : 0; unit = MINOR(mddev->unit) >> shift; - /* wait for any previous instance if this device - * to be completed removed (mddev_delayed_delete). + /* wait for any previous instance of this device to be + * completely removed (mddev_delayed_delete). */ - flush_scheduled_work(); + flush_workqueue(md_misc_wq); mutex_lock(&disks_mutex); error = -EEXIST; @@ -4281,13 +4335,14 @@ static int md_alloc(dev_t dev, char *name) disk->disk_name); error = 0; } - if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_bitmap_group)) printk(KERN_DEBUG "pointless warning\n"); abort: mutex_unlock(&disks_mutex); - if (!error) { + if (!error && mddev->kobj.sd) { kobject_uevent(&mddev->kobj, KOBJ_ADD); - mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, NULL, "array_state"); + mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); } mddev_put(mddev); return error; @@ -4325,14 +4380,14 @@ static void md_safemode_timeout(unsigned long data) if (!atomic_read(&mddev->writes_pending)) { mddev->safemode = 1; if (mddev->external) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } md_wakeup_thread(mddev->thread); } static int start_dirty_degraded; -static int md_run(mddev_t *mddev) +int md_run(mddev_t *mddev) { int err; mdk_rdev_t *rdev; @@ -4344,13 +4399,9 @@ static int md_run(mddev_t *mddev) if (mddev->pers) return -EBUSY; - - /* These two calls synchronise us with the - * sysfs_remove_group calls in mddev_unlock, - * so they must have completed. - */ - mutex_lock(&mddev->open_mutex); - mutex_unlock(&mddev->open_mutex); + /* Cannot run until previous stop completes properly */ + if (mddev->sysfs_active) + return -EBUSY; /* * Analyze all RAID superblock(s) @@ -4397,9 +4448,12 @@ static int md_run(mddev_t *mddev) return -EINVAL; } } - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); } + if (mddev->bio_set == NULL) + mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); + spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); if (!pers || !try_module_get(pers->owner)) { @@ -4462,7 +4516,6 @@ static int md_run(mddev_t *mddev) /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; - mddev->barriers_work = 1; mddev->ok_start_degraded = start_dirty_degraded; if (start_readonly && mddev->ro == 0) @@ -4496,11 +4549,12 @@ static int md_run(mddev_t *mddev) return err; } if (mddev->pers->sync_request) { - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + if (mddev->kobj.sd && + sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING "md: cannot register extra attributes for %s\n", mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); + mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; @@ -4518,8 +4572,7 @@ static int md_run(mddev_t *mddev) char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk("md: cannot register %s for %s\n", - nm, mdname(mddev)); + /* failure here is OK */; } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -4531,12 +4584,12 @@ static int md_run(mddev_t *mddev) md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ md_new_event(mddev); - sysfs_notify_dirent(mddev->sysfs_state); - if (mddev->sysfs_action) - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_action); sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; } +EXPORT_SYMBOL_GPL(md_run); static int do_md_run(mddev_t *mddev) { @@ -4545,7 +4598,11 @@ static int do_md_run(mddev_t *mddev) err = md_run(mddev); if (err) goto out; - + err = bitmap_load(mddev); + if (err) { + bitmap_destroy(mddev); + goto out; + } set_capacity(mddev->gendisk, mddev->array_sectors); revalidate_disk(mddev->gendisk); kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); @@ -4573,7 +4630,7 @@ static int restart_array(mddev_t *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; } @@ -4637,16 +4694,16 @@ static void md_clean(mddev_t *mddev) mddev->recovery = 0; mddev->in_sync = 0; mddev->degraded = 0; - mddev->barriers_work = 0; mddev->safemode = 0; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; + mddev->plug = NULL; } -static void md_stop_writes(mddev_t *mddev) +void md_stop_writes(mddev_t *mddev) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4666,11 +4723,10 @@ static void md_stop_writes(mddev_t *mddev) md_update_sb(mddev, 1); } } +EXPORT_SYMBOL_GPL(md_stop_writes); -static void md_stop(mddev_t *mddev) +void md_stop(mddev_t *mddev) { - md_stop_writes(mddev); - mddev->pers->stop(mddev); if (mddev->pers->sync_request && mddev->to_remove == NULL) mddev->to_remove = &md_redundancy_group; @@ -4678,6 +4734,7 @@ static void md_stop(mddev_t *mddev) mddev->pers = NULL; clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } +EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(mddev_t *mddev, int is_open) { @@ -4697,7 +4754,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open) mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); err = 0; } out: @@ -4711,26 +4768,29 @@ out: */ static int do_md_stop(mddev_t * mddev, int mode, int is_open) { - int err = 0; struct gendisk *disk = mddev->gendisk; mdk_rdev_t *rdev; mutex_lock(&mddev->open_mutex); - if (atomic_read(&mddev->openers) > is_open) { + if (atomic_read(&mddev->openers) > is_open || + mddev->sysfs_active) { printk("md: %s still in use.\n",mdname(mddev)); - err = -EBUSY; - } else if (mddev->pers) { + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } + if (mddev->pers) { if (mddev->ro) set_disk_ro(disk, 0); + md_stop_writes(mddev); md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->unplug_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; /* tell userspace to handle 'inactive' */ - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); list_for_each_entry(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0) { @@ -4740,21 +4800,17 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) } set_capacity(disk, 0); + mutex_unlock(&mddev->open_mutex); revalidate_disk(disk); if (mddev->ro) mddev->ro = 0; - - err = 0; - } - mutex_unlock(&mddev->open_mutex); - if (err) - return err; + } else + mutex_unlock(&mddev->open_mutex); /* * Free resources if final stop */ if (mode == 0) { - printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); @@ -4771,13 +4827,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; - } - err = 0; blk_integrity_unregister(disk); md_new_event(mddev); - sysfs_notify_dirent(mddev->sysfs_state); - return err; + sysfs_notify_dirent_safe(mddev->sysfs_state); + return 0; } #ifndef MODULE @@ -5138,7 +5192,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (err) export_rdev(rdev); else - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); md_update_sb(mddev, 1); if (mddev->degraded) @@ -5331,8 +5385,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd) err = 0; if (mddev->pers) { mddev->pers->quiesce(mddev, 1); - if (fd >= 0) + if (fd >= 0) { err = bitmap_create(mddev); + if (!err) + err = bitmap_load(mddev); + } if (fd < 0 || err) { bitmap_destroy(mddev); fd = -1; /* make sure to put the file */ @@ -5581,6 +5638,8 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); + if (!rv) + rv = bitmap_load(mddev); if (rv) bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); @@ -5813,7 +5872,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { if (mddev->ro == 2) { mddev->ro = 0; - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); } else { @@ -5908,7 +5967,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) */ mddev_put(mddev); /* Wait until bdev->bd_disk is definitely gone */ - flush_scheduled_work(); + flush_workqueue(md_misc_wq); /* Then retry the open from the top */ return -ERESTARTSYS; } @@ -6059,10 +6118,12 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) mddev->pers->error_handler(mddev,rdev); if (mddev->degraded) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); + if (mddev->event_work.func) + queue_work(md_misc_wq, &mddev->event_work); md_new_event_inintr(mddev); } @@ -6514,15 +6575,15 @@ void md_write_start(mddev_t *mddev, struct bio *bi) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); md_wakeup_thread(mddev->thread); did_change = 1; } spin_unlock_irq(&mddev->write_lock); } if (did_change) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && !test_bit(MD_CHANGE_PENDING, &mddev->flags)); } @@ -6558,22 +6619,31 @@ int md_allow_write(mddev_t *mddev) if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; spin_unlock_irq(&mddev->write_lock); md_update_sb(mddev, 0); - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock_irq(&mddev->write_lock); - if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) + if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; else return 0; } EXPORT_SYMBOL_GPL(md_allow_write); +void md_unplug(mddev_t *mddev) +{ + if (mddev->queue) + blk_unplug(mddev->queue); + if (mddev->plug) + mddev->plug->unplug_fn(mddev->plug); +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) void md_do_sync(mddev_t *mddev) @@ -6752,7 +6822,7 @@ void md_do_sync(mddev_t *mddev) >= mddev->resync_max - mddev->curr_resync_completed )) { /* time to update curr_resync_completed */ - blk_unplug(mddev->queue); + md_unplug(mddev); wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = @@ -6829,7 +6899,7 @@ void md_do_sync(mddev_t *mddev) * about not overloading the IO subsystem. (things like an * e2fsck being done on the RAID array should execute fast) */ - blk_unplug(mddev->queue); + md_unplug(mddev); cond_resched(); currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 @@ -6848,7 +6918,7 @@ void md_do_sync(mddev_t *mddev) * this also signals 'finished resyncing' to md_stop */ out: - blk_unplug(mddev->queue); + md_unplug(mddev); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); @@ -6950,10 +7020,7 @@ static int remove_and_add_spares(mddev_t *mddev) sprintf(nm, "rd%d", rdev->raid_disk); if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) - printk(KERN_WARNING - "md: cannot register " - "%s for %s\n", - nm, mdname(mddev)); + /* failure here is OK */; spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); @@ -7009,7 +7076,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return; if ( ! ( - (mddev->flags && !mddev->external) || + (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || @@ -7039,14 +7106,13 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; did_change = 1; - if (mddev->persistent) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); + set_bit(MD_CHANGE_CLEAN, &mddev->flags); } if (mddev->safemode == 1) mddev->safemode = 0; spin_unlock_irq(&mddev->write_lock); if (did_change) - sysfs_notify_dirent(mddev->sysfs_state); + sysfs_notify_dirent_safe(mddev->sysfs_state); } if (mddev->flags) @@ -7085,7 +7151,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); goto unlock; } @@ -7147,7 +7213,7 @@ void md_check_recovery(mddev_t *mddev) mddev->recovery = 0; } else md_wakeup_thread(mddev->sync_thread); - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(mddev); } unlock: @@ -7156,7 +7222,7 @@ void md_check_recovery(mddev_t *mddev) if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) if (mddev->sysfs_action) - sysfs_notify_dirent(mddev->sysfs_action); + sysfs_notify_dirent_safe(mddev->sysfs_action); } mddev_unlock(mddev); } @@ -7164,7 +7230,7 @@ void md_check_recovery(mddev_t *mddev) void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { - sysfs_notify_dirent(rdev->sysfs_state); + sysfs_notify_dirent_safe(rdev->sysfs_state); wait_event_timeout(rdev->blocked_wait, !test_bit(Blocked, &rdev->flags), msecs_to_jiffies(5000)); @@ -7217,12 +7283,23 @@ static void md_geninit(void) static int __init md_init(void) { - if (register_blkdev(MD_MAJOR, "md")) - return -1; - if ((mdp_major=register_blkdev(0, "mdp"))<=0) { - unregister_blkdev(MD_MAJOR, "md"); - return -1; - } + int ret = -ENOMEM; + + md_wq = alloc_workqueue("md", WQ_RESCUER, 0); + if (!md_wq) + goto err_wq; + + md_misc_wq = alloc_workqueue("md_misc", 0, 0); + if (!md_misc_wq) + goto err_misc_wq; + + if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) + goto err_md; + + if ((ret = register_blkdev(0, "mdp")) < 0) + goto err_mdp; + mdp_major = ret; + blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, md_probe, NULL, NULL); blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, @@ -7233,8 +7310,16 @@ static int __init md_init(void) md_geninit(); return 0; -} +err_mdp: + unregister_blkdev(MD_MAJOR, "md"); +err_md: + destroy_workqueue(md_misc_wq); +err_misc_wq: + destroy_workqueue(md_wq); +err_wq: + return ret; +} #ifndef MODULE @@ -7321,6 +7406,8 @@ static __exit void md_exit(void) export_array(mddev); mddev->hold_active = 0; } + destroy_workqueue(md_misc_wq); + destroy_workqueue(md_wq); } subsys_initcall(md_init); |