diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-raid.c | 4 | ||||
-rw-r--r-- | drivers/md/md-bitmap.c | 93 | ||||
-rw-r--r-- | drivers/md/md-bitmap.h | 8 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 17 | ||||
-rw-r--r-- | drivers/md/md-multipath.c | 4 | ||||
-rw-r--r-- | drivers/md/md.c | 226 | ||||
-rw-r--r-- | drivers/md/md.h | 37 | ||||
-rw-r--r-- | drivers/md/raid1-10.c | 63 | ||||
-rw-r--r-- | drivers/md/raid1.c | 36 | ||||
-rw-r--r-- | drivers/md/raid1.h | 2 | ||||
-rw-r--r-- | drivers/md/raid10.c | 179 | ||||
-rw-r--r-- | drivers/md/raid10.h | 2 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 22 | ||||
-rw-r--r-- | drivers/md/raid5.c | 68 | ||||
-rw-r--r-- | drivers/md/raid5.h | 2 |
15 files changed, 483 insertions, 280 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c8821fcb8299..8846bf510a35 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3750,11 +3750,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, * canceling read-auto mode */ mddev->ro = 0; - if (!mddev->suspended && mddev->sync_thread) + if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!mddev->suspended && mddev->thread) + if (!mddev->suspended) md_wakeup_thread(mddev->thread); return 0; diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bc8d7565171d..1ff712889a3b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -54,14 +54,7 @@ __acquires(bitmap->lock) { unsigned char *mappage; - if (page >= bitmap->pages) { - /* This can happen if bitmap_start_sync goes beyond - * End-of-device while looking for a whole page. - * It is harmless. - */ - return -EINVAL; - } - + WARN_ON_ONCE(page >= bitmap->pages); if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -1023,7 +1016,6 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) return set; } - /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ @@ -1033,8 +1025,7 @@ void md_bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!bitmap || !bitmap->storage.filemap || - test_bit(BITMAP_STALE, &bitmap->flags)) + if (!md_bitmap_enabled(bitmap)) return; /* look at each page to see if there are any set bits that need to be @@ -1063,6 +1054,35 @@ void md_bitmap_unplug(struct bitmap *bitmap) } EXPORT_SYMBOL(md_bitmap_unplug); +struct bitmap_unplug_work { + struct work_struct work; + struct bitmap *bitmap; + struct completion *done; +}; + +static void md_bitmap_unplug_fn(struct work_struct *work) +{ + struct bitmap_unplug_work *unplug_work = + container_of(work, struct bitmap_unplug_work, work); + + md_bitmap_unplug(unplug_work->bitmap); + complete(unplug_work->done); +} + +void md_bitmap_unplug_async(struct bitmap *bitmap) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct bitmap_unplug_work unplug_work; + + INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn); + unplug_work.bitmap = bitmap; + unplug_work.done = &done; + + queue_work(md_bitmap_wq, &unplug_work.work); + wait_for_completion(&done); +} +EXPORT_SYMBOL(md_bitmap_unplug_async); + static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize * the in-memory bitmap from the on-disk bitmap -- also, sets up the @@ -1241,11 +1261,28 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create); +static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout, + bool force) +{ + struct md_thread *thread; + + rcu_read_lock(); + thread = rcu_dereference(mddev->thread); + + if (!thread) + goto out; + + if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT) + thread->timeout = timeout; + +out: + rcu_read_unlock(); +} + /* * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ - void md_bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; @@ -1269,7 +1306,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); goto done; } bitmap->allclean = 1; @@ -1366,8 +1403,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) done: if (bitmap->allclean == 0) - mddev->thread->timeout = - mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1387,6 +1423,14 @@ __acquires(bitmap->lock) sector_t csize; int err; + if (page >= bitmap->pages) { + /* + * This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page or + * user set a huge number to sysfs bitmap_set_bits. + */ + return NULL; + } err = md_bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || @@ -1820,8 +1864,7 @@ void md_bitmap_destroy(struct mddev *mddev) mddev->bitmap = NULL; /* disconnect from the md device */ spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); - if (mddev->thread) - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); md_bitmap_free(bitmap); } @@ -1964,7 +2007,7 @@ int md_bitmap_load(struct mddev *mddev) /* Kick recovery in case any bits were set */ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); md_bitmap_update_sb(bitmap); @@ -2469,17 +2512,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len) timeout = MAX_SCHEDULE_TIMEOUT-1; if (timeout < 1) timeout = 1; + mddev->bitmap_info.daemon_sleep = timeout; - if (mddev->thread) { - /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then - * the bitmap is all clean and we don't need to - * adjust the timeout right now - */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { - mddev->thread->timeout = timeout; - md_wakeup_thread(mddev->thread); - } - } + mddev_set_timeout(mddev, timeout, false); + md_wakeup_thread(mddev->thread); + return len; } diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index cfd7395de8fd..8a3788c9bfef 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -264,6 +264,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev, sector_t new_lo, sector_t new_hi); void md_bitmap_unplug(struct bitmap *bitmap); +void md_bitmap_unplug_async(struct bitmap *bitmap); void md_bitmap_daemon_work(struct mddev *mddev); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, @@ -273,6 +274,13 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); void md_bitmap_free(struct bitmap *bitmap); void md_bitmap_wait_behind_writes(struct mddev *mddev); + +static inline bool md_bitmap_enabled(struct bitmap *bitmap) +{ + return bitmap && bitmap->storage.filemap && + !test_bit(BITMAP_STALE, &bitmap->flags); +} + #endif #endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 10e0c5381d01..3d9fd74233df 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -75,14 +75,14 @@ struct md_cluster_info { sector_t suspend_hi; int suspend_from; /* the slot which broadcast suspend_lo/hi */ - struct md_thread *recovery_thread; + struct md_thread __rcu *recovery_thread; unsigned long recovery_map; /* communication loc resources */ struct dlm_lock_resource *ack_lockres; struct dlm_lock_resource *message_lockres; struct dlm_lock_resource *token_lockres; struct dlm_lock_resource *no_new_dev_lockres; - struct md_thread *recv_thread; + struct md_thread __rcu *recv_thread; struct completion newdisk_completion; wait_queue_head_t wait; unsigned long state; @@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot) set_bit(slot, &cinfo->recovery_map); if (!cinfo->recovery_thread) { - cinfo->recovery_thread = md_register_thread(recover_bitmaps, - mddev, "recover"); + rcu_assign_pointer(cinfo->recovery_thread, + md_register_thread(recover_bitmaps, mddev, "recover")); if (!cinfo->recovery_thread) { pr_warn("md-cluster: Could not create recovery thread\n"); return; @@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { int got_lock = 0; + struct md_thread *thread; struct md_cluster_info *cinfo = mddev->cluster_info; mddev->good_device_nr = le32_to_cpu(msg->raid_slot); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); - wait_event(mddev->thread->wqueue, + + /* daemaon thread must exist */ + thread = rcu_dereference_protected(mddev->thread, true); + wait_event(thread->wqueue, (got_lock = mddev_trylock(mddev)) || test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); md_reload_sb(mddev, mddev->good_device_nr); @@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes) } /* Initiate the communication resources */ ret = -ENOMEM; - cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); + rcu_assign_pointer(cinfo->recv_thread, + md_register_thread(recv_daemon, mddev, "cluster_recv")); if (!cinfo->recv_thread) { pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); goto err; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 66edf5e72bd6..92c45be203d7 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev) if (ret) goto out_free_conf; - mddev->thread = md_register_thread(multipathd, mddev, - "multipath"); + rcu_assign_pointer(mddev->thread, + md_register_thread(multipathd, mddev, "multipath")); if (!mddev->thread) goto out_free_conf; diff --git a/drivers/md/md.c b/drivers/md/md.c index ca0de7ddd943..cf3733c90c47 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -70,11 +70,7 @@ #include "md-bitmap.h" #include "md-cluster.h" -/* pers_list is a list of registered personalities protected - * by pers_lock. - * pers_lock does extra service to protect accesses to - * mddev->thread when the mutex cannot be held. - */ +/* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); @@ -87,23 +83,13 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; -static struct workqueue_struct *md_rdev_misc_wq; +struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); - -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* * Default number of read corrections we'll attempt on an rdev @@ -360,10 +346,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -static bool is_md_suspended(struct mddev *mddev) -{ - return percpu_ref_is_dying(&mddev->active_io); -} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -457,13 +439,19 @@ static void md_submit_bio(struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { - WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); - lockdep_assert_held(&mddev->reconfig_mutex); + struct md_thread *thread = rcu_dereference_protected(mddev->thread, + lockdep_is_held(&mddev->reconfig_mutex)); + + WARN_ON_ONCE(thread && current == thread->tsk); if (mddev->suspended++) return; wake_up(&mddev->sb_wait); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); percpu_ref_kill(&mddev->active_io); + + if (mddev->pers->prepare_suspend) + mddev->pers->prepare_suspend(mddev); + wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); @@ -655,9 +643,11 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->delete_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); + INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); @@ -759,6 +749,24 @@ static void mddev_free(struct mddev *mddev) static const struct attribute_group md_redundancy_group; +static void md_free_rdev(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct md_rdev *tmp; + + mutex_lock(&mddev->delete_mutex); + if (list_empty(&mddev->deleting)) + goto out; + + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + export_rdev(rdev, mddev); + } +out: + mutex_unlock(&mddev->delete_mutex); +} + void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { @@ -800,13 +808,10 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); - /* As we've dropped the mutex we need a spinlock to - * make sure the thread doesn't disappear - */ - spin_lock(&pers_lock); + md_free_rdev(mddev); + md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); - spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -2440,13 +2445,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return err; } -static void rdev_delayed_delete(struct work_struct *ws) -{ - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); - kobject_del(&rdev->kobj); - kobject_put(&rdev->kobj); -} - void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ @@ -2467,6 +2465,8 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) static void md_kick_rdev_from_array(struct md_rdev *rdev) { + struct mddev *mddev = rdev->mddev; + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); @@ -2480,15 +2480,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; - /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state". We also need - * to delay it due to rcu usage. - */ + synchronize_rcu(); - INIT_WORK(&rdev->del_work, rdev_delayed_delete); - kobject_get(&rdev->kobj); - queue_work(md_rdev_misc_wq, &rdev->del_work); - export_rdev(rdev, rdev->mddev); + + /* + * kobject_del() will wait for all in progress writers to be done, where + * reconfig_mutex is held, hence it can't be called under + * reconfig_mutex and it's delayed to mddev_unlock(). + */ + mutex_lock(&mddev->delete_mutex); + list_add(&rdev->same_set, &mddev->deleting); + mutex_unlock(&mddev->delete_mutex); } static void export_array(struct mddev *mddev) @@ -3556,6 +3558,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct kernfs_node *kn = NULL; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3563,6 +3566,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == state_store && cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + rv = mddev ? mddev_lock(mddev) : -ENODEV; if (!rv) { if (rdev->mddev == NULL) @@ -3571,6 +3578,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, rv = entry->store(rdev, page, length); mddev_unlock(mddev); } + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -3796,8 +3807,9 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) static ssize_t safe_delay_show(struct mddev *mddev, char *page) { - int msec = (mddev->safemode_delay*1000)/HZ; - return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); + unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; + + return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) @@ -3809,7 +3821,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) return -EINVAL; } - if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; @@ -4479,6 +4491,8 @@ max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; + if (n > INT_MAX) + return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } @@ -4493,20 +4507,6 @@ null_show(struct mddev *mddev, char *page) return -EINVAL; } -/* need to ensure rdev_delayed_delete() has completed */ -static void flush_rdev_wq(struct mddev *mddev) -{ - struct md_rdev *rdev; - - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (work_pending(&rdev->del_work)) { - flush_workqueue(md_rdev_misc_wq); - break; - } - rcu_read_unlock(); -} - static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { @@ -4534,7 +4534,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - flush_rdev_wq(mddev); err = mddev_lock(mddev); if (err) return err; @@ -4806,11 +4805,21 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; err = mddev_lock(mddev); if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { err = -EBUSY; - else { + } else if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); + } else { + /* + * If reshape is still in progress, and + * md_check_recovery() can continue to reshape, + * don't restart reshape because data can be + * corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } mddev_unlock(mddev); } @@ -5594,7 +5603,6 @@ struct mddev *md_alloc(dev_t dev, char *name) * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); - flush_workqueue(md_rdev_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); @@ -6271,10 +6279,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EBUSY; @@ -6335,10 +6345,12 @@ static int do_md_stop(struct mddev *mddev, int mode, } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); wait_event(resync_wait, (mddev->sync_thread == NULL && @@ -6735,7 +6747,6 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (info->state & (1<<MD_DISK_SYNC) && info->raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - set_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; @@ -7557,9 +7568,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, } - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) - flush_rdev_wq(mddev); - if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, @@ -7888,13 +7896,29 @@ static int md_thread(void *arg) return 0; } -void md_wakeup_thread(struct md_thread *thread) +static void md_wakeup_thread_directly(struct md_thread __rcu *thread) { - if (thread) { - pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) + wake_up_process(t->tsk); + rcu_read_unlock(); +} + +void md_wakeup_thread(struct md_thread __rcu *thread) +{ + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) { + pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); + set_bit(THREAD_WAKEUP, &t->flags); + wake_up(&t->wqueue); } + rcu_read_unlock(); } EXPORT_SYMBOL(md_wakeup_thread); @@ -7924,22 +7948,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread **threadp) +void md_unregister_thread(struct md_thread __rcu **threadp) { - struct md_thread *thread; + struct md_thread *thread = rcu_dereference_protected(*threadp, true); - /* - * Locking ensures that mddev_unlock does not wake_up a - * non-existent thread - */ - spin_lock(&pers_lock); - thread = *threadp; - if (!thread) { - spin_unlock(&pers_lock); + if (!thread) return; - } - *threadp = NULL; - spin_unlock(&pers_lock); + + rcu_assign_pointer(*threadp, NULL); + synchronize_rcu(); pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); @@ -9102,6 +9119,7 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); + wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } @@ -9204,9 +9222,8 @@ static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "resync")); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); @@ -9621,9 +9638,10 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); - if (!md_rdev_misc_wq) - goto err_rdev_misc_wq; + md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, + 0); + if (!md_bitmap_wq) + goto err_bitmap_wq; ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) @@ -9643,8 +9661,8 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_rdev_misc_wq); -err_rdev_misc_wq: + destroy_workqueue(md_bitmap_wq); +err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); @@ -9940,8 +9958,8 @@ static __exit void md_exit(void) } spin_unlock(&all_mddevs_lock); - destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); + destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); } diff --git a/drivers/md/md.h b/drivers/md/md.h index fd8f260ed5f8..bfd2306bc750 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -122,8 +122,6 @@ struct md_rdev { struct serial_in_rdev *serial; /* used for raid1 io serialization */ - struct work_struct del_work; /* used for delayed sysfs removal */ - struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ @@ -367,8 +365,8 @@ struct mddev { int new_chunk_sectors; int reshape_backwards; - struct md_thread *thread; /* management thread */ - struct md_thread *sync_thread; /* doing resync or reconstruct */ + struct md_thread __rcu *thread; /* management thread */ + struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ /* 'last_sync_action' is initialized to "none". It is set when a * sync operation (i.e "data-check", "requested-resync", "resync", @@ -531,6 +529,14 @@ struct mddev { unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ + /* + * Temporarily store rdev that will be finally removed when + * reconfig_mutex is unlocked. + */ + struct list_head deleting; + /* Protect the deleting list */ + struct mutex delete_mutex; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; @@ -555,6 +561,23 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static inline bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +static inline bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -614,6 +637,7 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); + void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour @@ -734,8 +758,8 @@ extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); -extern void md_unregister_thread(struct md_thread **threadp); -extern void md_wakeup_thread(struct md_thread *thread); +extern void md_unregister_thread(struct md_thread __rcu **threadp); +extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern int mddev_init_writes_pending(struct mddev *mddev); @@ -828,6 +852,7 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; +extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index cd349e69ed77..169ebe296f2d 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -21,6 +21,7 @@ #define IO_MADE_GOOD ((struct bio *)2) #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) +#define MAX_PLUG_BIO 32 /* for managing resync I/O pages */ struct resync_pages { @@ -31,6 +32,7 @@ struct resync_pages { struct raid1_plug_cb { struct blk_plug_cb cb; struct bio_list pending; + unsigned int count; }; static void rbio_pool_free(void *rbio, void *data) @@ -110,3 +112,64 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, size -= len; } while (idx++ < RESYNC_PAGES && size > 0); } + + +static inline void raid1_submit_write(struct bio *bio) +{ + struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; + + bio->bi_next = NULL; + bio_set_dev(bio, rdev->bdev); + if (test_bit(Faulty, &rdev->flags)) + bio_io_error(bio); + else if (unlikely(bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev))) + /* Just ignore it */ + bio_endio(bio); + else + submit_bio_noacct(bio); +} + +static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, + blk_plug_cb_fn unplug, int copies) +{ + struct raid1_plug_cb *plug = NULL; + struct blk_plug_cb *cb; + + /* + * If bitmap is not enabled, it's safe to submit the io directly, and + * this can get optimal performance. + */ + if (!md_bitmap_enabled(mddev->bitmap)) { + raid1_submit_write(bio); + return true; + } + + cb = blk_check_plugged(unplug, mddev, sizeof(*plug)); + if (!cb) + return false; + + plug = container_of(cb, struct raid1_plug_cb, cb); + bio_list_add(&plug->pending, bio); + if (++plug->count / MAX_PLUG_BIO >= copies) { + list_del(&cb->list); + cb->callback(cb, false); + } + + + return true; +} + +/* + * current->bio_list will be set under submit_bio() context, in this case bitmap + * io will be added to the list and wait for current io submission to finish, + * while current io submission must wait for bitmap io to be done. In order to + * avoid such deadlock, submit bitmap io asynchronously. + */ +static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) +{ + if (current->bio_list) + md_bitmap_unplug_async(bitmap); + else + md_bitmap_unplug(bitmap); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3570da63969b..dd25832eb045 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -794,22 +794,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - md_bitmap_unplug(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void *)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; cond_resched(); } @@ -1178,7 +1169,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r1conf *conf = mddev->private; struct bio *bio; - if (from_schedule || current->bio_list) { + if (from_schedule) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); @@ -1346,8 +1337,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct md_rdev *blocked_rdev; - struct blk_plug_cb *cb; - struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; bool write_behind = false; @@ -1576,15 +1565,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; - - cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid1_plug_cb, cb); - else - plug = NULL; - if (plug) { - bio_list_add(&plug->pending, mbio); - } else { + if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -3087,7 +3068,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, "raid1"); + rcu_assign_pointer(conf->thread, + md_register_thread(raid1d, mddev, "raid1")); if (!conf->thread) goto abort; @@ -3180,8 +3162,8 @@ static int raid1_run(struct mddev *mddev) /* * Ok, everything is just fine now */ - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); mddev->private = conf; set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index ebb6788820e7..468f189da7a0 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -130,7 +130,7 @@ struct r1conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; /* Keep track of cluster resync window to send to other * nodes. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 381c21f7fb06..d0de8c9fb3cf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -779,8 +779,16 @@ static struct md_rdev *read_balance(struct r10conf *conf, disk = r10_bio->devs[slot].devnum; rdev = rcu_dereference(conf->mirrors[disk].replacement); if (rdev == NULL || test_bit(Faulty, &rdev->flags) || - r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + r10_bio->devs[slot].addr + sectors > + rdev->recovery_offset) { + /* + * Read replacement first to prevent reading both rdev + * and replacement as NULL during replacement replace + * rdev. + */ + smp_mb(); rdev = rcu_dereference(conf->mirrors[disk].rdev); + } if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; @@ -902,25 +910,15 @@ static void flush_pending_writes(struct r10conf *conf) __set_current_state(TASK_RUNNING); blk_start_plug(&plug); - /* flush any pending bitmap writes to disk - * before proceeding w/ I/O */ - md_bitmap_unplug(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; + cond_resched(); } blk_finish_plug(&plug); } else @@ -982,6 +980,7 @@ static void lower_barrier(struct r10conf *conf) static bool stop_waiting_barrier(struct r10conf *conf) { struct bio_list *bio_list = current->bio_list; + struct md_thread *thread; /* barrier is dropped */ if (!conf->barrier) @@ -997,12 +996,14 @@ static bool stop_waiting_barrier(struct r10conf *conf) (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) return true; + /* daemon thread must exist while handling io */ + thread = rcu_dereference_protected(conf->mddev->thread, true); /* * move on if io is issued from raid10d(), nr_pending is not released * from original io(see handle_read_error()). All raise barrier is * blocked until this io is done. */ - if (conf->mddev->thread->tsk == current) { + if (thread->tsk == current) { WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0); return true; } @@ -1113,7 +1114,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r10conf *conf = mddev->private; struct bio *bio; - if (from_schedule || current->bio_list) { + if (from_schedule) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); @@ -1125,23 +1126,15 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); - md_bitmap_unplug(mddev->bitmap); + raid1_prepare_flush_writes(mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; + cond_resched(); } kfree(plug); } @@ -1282,8 +1275,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; unsigned long flags; - struct blk_plug_cb *cb; - struct raid1_plug_cb *plug = NULL; struct r10conf *conf = mddev->private; struct md_rdev *rdev; int devnum = r10_bio->devs[n_copy].devnum; @@ -1323,14 +1314,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, atomic_inc(&r10_bio->remaining); - cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid1_plug_cb, cb); - else - plug = NULL; - if (plug) { - bio_list_add(&plug->pending, mbio); - } else { + if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); @@ -1479,9 +1463,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[d].replacement); + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(conf->mirrors[d].replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev == rrdev) rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) @@ -2148,9 +2138,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r10conf *conf = mddev->private; int err = -EEXIST; - int mirror; + int mirror, repl_slot = -1; int first = 0; int last = conf->geo.raid_disks - 1; + struct raid10_info *p; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is @@ -2173,23 +2164,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) else mirror = first; for ( ; mirror <= last ; mirror++) { - struct raid10_info *p = &conf->mirrors[mirror]; + p = &conf->mirrors[mirror]; if (p->recovery_disabled == mddev->recovery_disabled) continue; if (p->rdev) { - if (!test_bit(WantReplacement, &p->rdev->flags) || - p->replacement != NULL) - continue; - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); - break; + if (test_bit(WantReplacement, &p->rdev->flags) && + p->replacement == NULL && repl_slot < 0) + repl_slot = mirror; + continue; } if (mddev->gendisk) @@ -2206,6 +2188,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } + if (err && repl_slot >= 0) { + p = &conf->mirrors[repl_slot]; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = repl_slot; + err = 0; + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + } + print_conf(conf); return err; } @@ -3303,6 +3298,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int chunks_skipped = 0; sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; + int error_disk = -1; /* * Allow skipping a full rebuild for incremental assembly @@ -3386,8 +3382,21 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, return reshape_request(mddev, sector_nr, skipped); if (chunks_skipped >= conf->geo.raid_disks) { - /* if there has been nothing to do on any drive, - * then there is nothing to do at all.. + pr_err("md/raid10:%s: %s fails\n", mdname(mddev), + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"); + if (error_disk >= 0 && + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* + * recovery fails, set mirrors.recovery_disabled, + * device shouldn't be added to there. + */ + conf->mirrors[error_disk].recovery_disabled = + mddev->recovery_disabled; + return 0; + } + /* + * if there has been nothing to do on any drive, + * then there is nothing to do at all. */ *skipped = 1; return (max_sector - sector_nr) + sectors_skipped; @@ -3437,8 +3446,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sect; int must_sync; int any_working; - int need_recover = 0; - int need_replace = 0; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3446,15 +3453,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mrdev = rcu_dereference(mirror->rdev); mreplace = rcu_dereference(mirror->replacement); - if (mrdev != NULL && - !test_bit(Faulty, &mrdev->flags) && - !test_bit(In_sync, &mrdev->flags)) - need_recover = 1; - if (mreplace != NULL && - !test_bit(Faulty, &mreplace->flags)) - need_replace = 1; + if (mrdev && (test_bit(Faulty, &mrdev->flags) || + test_bit(In_sync, &mrdev->flags))) + mrdev = NULL; + if (mreplace && test_bit(Faulty, &mreplace->flags)) + mreplace = NULL; - if (!need_recover && !need_replace) { + if (!mrdev && !mreplace) { rcu_read_unlock(); continue; } @@ -3470,8 +3475,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rcu_read_unlock(); continue; } - if (mreplace && test_bit(Faulty, &mreplace->flags)) - mreplace = NULL; /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap @@ -3490,7 +3493,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rcu_read_unlock(); continue; } - atomic_inc(&mrdev->nr_pending); + if (mrdev) + atomic_inc(&mrdev->nr_pending); if (mreplace) atomic_inc(&mreplace->nr_pending); rcu_read_unlock(); @@ -3577,7 +3581,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; - if (need_recover) { + if (mrdev) { bio = r10_bio->devs[1].bio; bio->bi_next = biolist; biolist = bio; @@ -3594,11 +3598,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[1].repl_bio; if (bio) bio->bi_end_io = NULL; - /* Note: if need_replace, then bio + /* Note: if replace is not NULL, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it. */ - if (!need_replace) + if (!mreplace) break; bio->bi_next = biolist; biolist = bio; @@ -3622,7 +3626,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (k = 0; k < conf->copies; k++) if (r10_bio->devs[k].devnum == i) break; - if (!test_bit(In_sync, + if (mrdev && !test_bit(In_sync, &mrdev->flags) && !rdev_set_badblocks( mrdev, @@ -3643,17 +3647,21 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mdname(mddev)); mirror->recovery_disabled = mddev->recovery_disabled; + } else { + error_disk = i; } put_buf(r10_bio); if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - rdev_dec_pending(mrdev, mddev); + if (mrdev) + rdev_dec_pending(mrdev, mddev); if (mreplace) rdev_dec_pending(mreplace, mddev); break; } - rdev_dec_pending(mrdev, mddev); + if (mrdev) + rdev_dec_pending(mrdev, mddev); if (mreplace) rdev_dec_pending(mreplace, mddev); if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { @@ -4107,7 +4115,8 @@ static struct r10conf *setup_conf(struct mddev *mddev) atomic_set(&conf->nr_pending, 0); err = -ENOMEM; - conf->thread = md_register_thread(raid10d, mddev, "raid10"); + rcu_assign_pointer(conf->thread, + md_register_thread(raid10d, mddev, "raid10")); if (!conf->thread) goto out; @@ -4152,8 +4161,8 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); if (mddev_is_clustered(conf->mddev)) { int fc, fo; @@ -4296,8 +4305,8 @@ static int raid10_run(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) goto out_free_conf; } @@ -4698,8 +4707,8 @@ out: set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) { ret = -EAGAIN; goto abort; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 8c072ce0bc54..63e48b11b552 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -100,7 +100,7 @@ struct r10conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; /* * Keep track of cluster resync window to send to other nodes. diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 852b265c5db4..47ba7d9e81e1 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -120,7 +120,7 @@ struct r5l_log { struct bio_set bs; mempool_t meta_pool; - struct md_thread *reclaim_thread; + struct md_thread __rcu *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be * reclaimed. if it's 0, reclaim spaces * used by io_units which are in @@ -1576,17 +1576,18 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) void r5l_quiesce(struct r5l_log *log, int quiesce) { - struct mddev *mddev; + struct mddev *mddev = log->rdev->mddev; + struct md_thread *thread = rcu_dereference_protected( + log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex)); if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ - mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); - kthread_park(log->reclaim_thread->tsk); + kthread_park(thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); } else - kthread_unpark(log->reclaim_thread->tsk); + kthread_unpark(thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) @@ -3063,6 +3064,7 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { struct r5l_log *log; + struct md_thread *thread; int ret; pr_debug("md/raid:%s: using device %pg as journal\n", @@ -3121,11 +3123,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->tree_lock); INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); - log->reclaim_thread = md_register_thread(r5l_reclaim_thread, - log->rdev->mddev, "reclaim"); - if (!log->reclaim_thread) + thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, + "reclaim"); + if (!thread) goto reclaim_thread; - log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + + thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + rcu_assign_pointer(log->reclaim_thread, thread); init_waitqueue_head(&log->iounit_wait); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4739ed891e75..f8bc74e16811 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5966,6 +5966,19 @@ out: return ret; } +static bool reshape_inprogress(struct mddev *mddev) +{ + return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery); +} + +static bool reshape_disabled(struct mddev *mddev) +{ + return is_md_suspended(mddev) || !md_is_rdwr(mddev); +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5997,7 +6010,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - return STRIPE_SCHEDULE_AND_RETRY; + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } } spin_unlock_irq(&conf->device_lock); @@ -6076,6 +6090,15 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); +out: + if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) && + reshape_disabled(mddev)) { + bi->bi_status = BLK_STS_IOERR; + ret = STRIPE_FAIL; + pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n", + mdname(mddev)); + } + return ret; } @@ -7708,7 +7731,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) } sprintf(pers_name, "raid%d", mddev->new_level); - conf->thread = md_register_thread(raid5d, mddev, pers_name); + rcu_assign_pointer(conf->thread, + md_register_thread(raid5d, mddev, pers_name)); if (!conf->thread) { pr_warn("md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); @@ -7931,8 +7955,8 @@ static int raid5_run(struct mddev *mddev) } conf->min_offset_diff = min_offset_diff; - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); mddev->private = conf; for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; @@ -8029,8 +8053,8 @@ static int raid5_run(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) goto abort; } @@ -8377,6 +8401,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) p = conf->disks + disk; tmp = rdev_mdlock_deref(mddev, p->rdev); if (test_bit(WantReplacement, &tmp->flags) && + mddev->reshape_position == MaxSector && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); @@ -8500,6 +8525,7 @@ static int raid5_start_reshape(struct mddev *mddev) struct r5conf *conf = mddev->private; struct md_rdev *rdev; int spares = 0; + int i; unsigned long flags; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -8511,6 +8537,13 @@ static int raid5_start_reshape(struct mddev *mddev) if (has_failed(conf)) return -EINVAL; + /* raid5 can't handle concurrent reshape and recovery */ + if (mddev->recovery_cp < MaxSector) + return -EBUSY; + for (i = 0; i < conf->raid_disks; i++) + if (rdev_mdlock_deref(mddev, conf->disks[i].replacement)) + return -EBUSY; + rdev_for_each(rdev, mddev) { if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -8607,8 +8640,8 @@ static int raid5_start_reshape(struct mddev *mddev) clear_bit(MD_RECOVERY_DONE, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); @@ -9043,6 +9076,22 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } +static void raid5_prepare_suspend(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + wait_event(mddev->sb_wait, !reshape_inprogress(mddev) || + percpu_ref_is_zero(&mddev->active_io)); + if (percpu_ref_is_zero(&mddev->active_io)) + return; + + /* + * Reshape is not in progress, and array is suspended, io that is + * waiting for reshpape can never be done. + */ + wake_up(&conf->wait_for_overlap); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -9063,6 +9112,7 @@ static struct md_personality raid6_personality = .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9087,6 +9137,7 @@ static struct md_personality raid5_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9112,6 +9163,7 @@ static struct md_personality raid4_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index e873938a6125..f19707189a7b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -679,7 +679,7 @@ struct r5conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; struct r5worker_group *worker_groups; int group_cnt; |