diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-31 15:31:27 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-31 15:31:27 -0700 |
commit | 0f9a75179da33cc03594b882ed823cc5f4356d9a (patch) | |
tree | 7cb38f21b862a0d7e314100bf8935f707c60b0bd /drivers | |
parent | 6d541d6672eeaf526d67b67b5407f48fe0522c6d (diff) | |
parent | 0a751df4566c86e5a24f2a03290dad3d0f215692 (diff) | |
download | linux-0f9a75179da33cc03594b882ed823cc5f4356d9a.tar.gz linux-0f9a75179da33cc03594b882ed823cc5f4356d9a.tar.bz2 linux-0f9a75179da33cc03594b882ed823cc5f4356d9a.zip |
Merge tag 'block-6.10-20240530' of git://git.kernel.dk/linux
Pull block fixes from Jens Axboe:
- NVMe fixes via Keith:
- Removing unused fields (Kanchan)
- Large folio offsets support (Kundan)
- Multipath NUMA node initialiazation fix (Nilay)
- Multipath IO stats accounting fixes (Keith)
- Circular lockdep fix (Keith)
- Target race condition fix (Sagi)
- Target memory leak fix (Sagi)
- bcache fixes
- null_blk fixes (Damien)
- Fix regression in io.max due to throttle low removal (Waiman)
- DM limit table fixes (Christoph)
- SCSI and block limit fixes (Christoph)
- zone fixes (Damien)
- Misc fixes (Christoph, Hannes, hexue)
* tag 'block-6.10-20240530' of git://git.kernel.dk/linux: (25 commits)
blk-throttle: Fix incorrect display of io.max
block: Fix zone write plugging handling of devices with a runt zone
block: Fix validation of zoned device with a runt zone
null_blk: Do not allow runt zone with zone capacity smaller then zone size
nvmet: fix a possible leak when destroy a ctrl during qp establishment
nvme: use srcu for iterating namespace list
bcache: code cleanup in __bch_bucket_alloc_set()
bcache: call force_wake_up_gc() if necessary in check_should_bypass()
bcache: allow allocator to invalidate bucket in gc
block: check for max_hw_sectors underflow
block: stack max_user_sectors
sd: also set max_user_sectors when setting max_sectors
null_blk: Print correct max open zones limit in null_init_zoned_dev()
block: delete redundant function declaration
null_blk: Fix return value of nullb_device_power_store()
dm: make dm_set_zones_restrictions work on the queue limits
dm: remove dm_check_zoned
dm: move setting zoned_enabled to dm_table_set_restrictions
block: remove blk_queue_max_integrity_segments
nvme: adjust multiples of NVME_CTRL_PAGE_SIZE in offset
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/block/null_blk/main.c | 1 | ||||
-rw-r--r-- | drivers/block/null_blk/zoned.c | 13 | ||||
-rw-r--r-- | drivers/md/bcache/alloc.c | 21 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 1 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 7 | ||||
-rw-r--r-- | drivers/md/bcache/request.c | 16 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 15 | ||||
-rw-r--r-- | drivers/md/dm-zone.c | 72 | ||||
-rw-r--r-- | drivers/md/dm.h | 3 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 116 | ||||
-rw-r--r-- | drivers/nvme/host/ioctl.c | 15 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 26 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 7 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 3 | ||||
-rw-r--r-- | drivers/nvme/target/configfs.c | 8 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 9 | ||||
-rw-r--r-- | drivers/scsi/sd.c | 4 |
17 files changed, 203 insertions, 134 deletions
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index eb023d267369..631dca2e4e84 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -494,6 +494,7 @@ static ssize_t nullb_device_power_store(struct config_item *item, set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; + ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { dev->power = newp; diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 5b5a63adacc1..f118d304f310 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -74,6 +74,17 @@ int null_init_zoned_dev(struct nullb_device *dev, return -EINVAL; } + /* + * If a smaller zone capacity was requested, do not allow a smaller last + * zone at the same time as such zone configuration does not correspond + * to any real zoned device. + */ + if (dev->zone_capacity != dev->zone_size && + dev->size & (dev->zone_size - 1)) { + pr_err("A smaller last zone is not allowed with zone capacity smaller than zone size.\n"); + return -EINVAL; + } + zone_capacity_sects = mb_to_sects(dev->zone_capacity); dev_capacity_sects = mb_to_sects(dev->size); dev->zone_size_sects = mb_to_sects(dev->zone_size); @@ -108,7 +119,7 @@ int null_init_zoned_dev(struct nullb_device *dev, if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) { dev->zone_max_open = dev->zone_max_active; pr_info("changed the maximum number of open zones to %u\n", - dev->nr_zones); + dev->zone_max_open); } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_open = 0; pr_info("zone_max_open limit disabled, limit >= zone count\n"); diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index ce13c272c387..48ce750bf70a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -129,12 +129,9 @@ static inline bool can_inc_bucket_gen(struct bucket *b) bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b) { - BUG_ON(!ca->set->gc_mark_valid); - - return (!GC_MARK(b) || - GC_MARK(b) == GC_MARK_RECLAIMABLE) && - !atomic_read(&b->pin) && - can_inc_bucket_gen(b); + return (ca->set->gc_mark_valid || b->reclaimable_in_gc) && + ((!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) && + !atomic_read(&b->pin) && can_inc_bucket_gen(b)); } void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -148,6 +145,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) bch_inc_gen(ca, b); b->prio = INITIAL_PRIO; atomic_inc(&b->pin); + b->reclaimable_in_gc = 0; } static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) @@ -352,8 +350,7 @@ static int bch_allocator_thread(void *arg) */ retry_invalidate: - allocator_wait(ca, ca->set->gc_mark_valid && - !ca->invalidate_needs_gc); + allocator_wait(ca, !ca->invalidate_needs_gc); invalidate_buckets(ca); /* @@ -501,8 +498,8 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, ca = c->cache; b = bch_bucket_alloc(ca, reserve, wait); - if (b == -1) - goto err; + if (b < 0) + return -1; k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, bucket_to_sector(c, b), @@ -511,10 +508,6 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, SET_KEY_PTRS(k, 1); return 0; -err: - bch_bucket_free(c, k); - bkey_put(c, k); - return -1; } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 4e6afa89921f..1d33e40d26ea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -200,6 +200,7 @@ struct bucket { uint8_t gen; uint8_t last_gc; /* Most out of date gen in the btree */ uint16_t gc_mark; /* Bitfield used by GC. See below for field */ + uint16_t reclaimable_in_gc:1; }; /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index d011a7154d33..4e6ccf2c8a0b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,18 +1741,20 @@ static void btree_gc_start(struct cache_set *c) mutex_lock(&c->bucket_lock); - c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; ca = c->cache; for_each_bucket(b, ca) { b->last_gc = b->gen; + if (bch_can_invalidate_bucket(ca, b)) + b->reclaimable_in_gc = 1; if (!atomic_read(&b->pin)) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } + c->gc_mark_valid = 0; mutex_unlock(&c->bucket_lock); } @@ -1809,6 +1811,9 @@ static void bch_btree_gc_finish(struct cache_set *c) for_each_bucket(b, ca) { c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + if (b->reclaimable_in_gc) + b->reclaimable_in_gc = 0; + if (atomic_read(&b->pin)) continue; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 83d112bd2b1c..af345dc6fde1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -369,10 +369,24 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) struct io *i; if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || - c->gc_stats.in_use > CUTOFF_CACHE_ADD || (bio_op(bio) == REQ_OP_DISCARD)) goto skip; + if (c->gc_stats.in_use > CUTOFF_CACHE_ADD) { + /* + * If cached buckets are all clean now, 'true' will be + * returned and all requests will bypass the cache device. + * Then c->sectors_to_gc has no chance to be negative, and + * gc thread won't wake up and caching won't work forever. + * Here call force_wake_up_gc() to avoid such aftermath. + */ + if (BDEV_STATE(&dc->sb) == BDEV_STATE_CLEAN && + c->gc_mark_valid) + force_wake_up_gc(c); + + goto skip; + } + if (mode == CACHE_MODE_NONE || (mode == CACHE_MODE_WRITEAROUND && op_is_write(bio_op(bio)))) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index cc66a27c363a..b2d5246cff21 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1981,10 +1981,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (!dm_table_supports_secure_erase(t)) limits->max_secure_erase_sectors = 0; - r = queue_limits_set(q, limits); - if (r) - return r; - if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_FUA))) @@ -2036,15 +2032,16 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * For a zoned target, setup the zones related queue attributes * and resources necessary for zone append emulation if necessary. */ - if (blk_queue_is_zoned(q)) { - r = dm_set_zones_restrictions(t, q); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && limits->zoned) { + r = dm_set_zones_restrictions(t, q, limits); if (r) return r; - if (blk_queue_is_zoned(q) && - !static_key_enabled(&zoned_enabled.key)) - static_branch_enable(&zoned_enabled); } + r = queue_limits_set(q, limits); + if (r) + return r; + dm_update_crypto_profile(q, t); /* diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 8e6bcb0d786a..5d66d916730e 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -160,37 +160,6 @@ static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, return 0; } -static int dm_check_zoned(struct mapped_device *md, struct dm_table *t) -{ - struct gendisk *disk = md->disk; - unsigned int nr_conv_zones = 0; - int ret; - - /* Count conventional zones */ - md->zone_revalidate_map = t; - ret = dm_blk_report_zones(disk, 0, UINT_MAX, - dm_check_zoned_cb, &nr_conv_zones); - md->zone_revalidate_map = NULL; - if (ret < 0) { - DMERR("Check zoned failed %d", ret); - return ret; - } - - /* - * If we only have conventional zones, expose the mapped device as - * a regular device. - */ - if (nr_conv_zones >= ret) { - disk->queue->limits.max_open_zones = 0; - disk->queue->limits.max_active_zones = 0; - disk->queue->limits.zoned = false; - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - disk->nr_zones = 0; - } - - return 0; -} - /* * Revalidate the zones of a mapped device to initialize resource necessary * for zone append emulation. Note that we cannot simply use the block layer @@ -251,9 +220,12 @@ static bool dm_table_supports_zone_append(struct dm_table *t) return true; } -int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *lim) { struct mapped_device *md = t->md; + struct gendisk *disk = md->disk; + unsigned int nr_conv_zones = 0; int ret; /* @@ -265,21 +237,37 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); } else { set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - blk_queue_max_zone_append_sectors(q, 0); + lim->max_zone_append_sectors = 0; } if (!get_capacity(md->disk)) return 0; /* - * Check that the mapped device will indeed be zoned, that is, that it - * has sequential write required zones. + * Count conventional zones to check that the mapped device will indeed + * have sequential write required zones. */ - ret = dm_check_zoned(md, t); - if (ret) + md->zone_revalidate_map = t; + ret = dm_blk_report_zones(disk, 0, UINT_MAX, + dm_check_zoned_cb, &nr_conv_zones); + md->zone_revalidate_map = NULL; + if (ret < 0) { + DMERR("Check zoned failed %d", ret); return ret; - if (!blk_queue_is_zoned(q)) + } + + /* + * If we only have conventional zones, expose the mapped device as + * a regular device. + */ + if (nr_conv_zones >= ret) { + lim->max_open_zones = 0; + lim->max_active_zones = 0; + lim->zoned = false; + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + disk->nr_zones = 0; return 0; + } if (!md->disk->nr_zones) { DMINFO("%s using %s zone append", @@ -287,7 +275,13 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) queue_emulates_zone_append(q) ? "emulated" : "native"); } - return dm_revalidate_zones(md, t); + ret = dm_revalidate_zones(md, t); + if (ret < 0) + return ret; + + if (!static_key_enabled(&zoned_enabled.key)) + static_branch_enable(&zoned_enabled); + return 0; } /* diff --git a/drivers/md/dm.h b/drivers/md/dm.h index e0c57f19839b..53ef8207fe2c 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -101,7 +101,8 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * Zoned targets related functions. */ -int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); +int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, + struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 954f850f113a..f5d150c62955 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -414,7 +414,15 @@ static inline void nvme_end_req_zoned(struct request *req) } } -static inline void nvme_end_req(struct request *req) +static inline void __nvme_end_req(struct request *req) +{ + nvme_end_req_zoned(req); + nvme_trace_bio_complete(req); + if (req->cmd_flags & REQ_NVME_MPATH) + nvme_mpath_end_request(req); +} + +void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); @@ -424,10 +432,7 @@ static inline void nvme_end_req(struct request *req) else nvme_log_error(req); } - nvme_end_req_zoned(req); - nvme_trace_bio_complete(req); - if (req->cmd_flags & REQ_NVME_MPATH) - nvme_mpath_end_request(req); + __nvme_end_req(req); blk_mq_end_request(req, status); } @@ -476,7 +481,7 @@ void nvme_complete_batch_req(struct request *req) { trace_nvme_complete_rq(req); nvme_cleanup_cmd(req); - nvme_end_req_zoned(req); + __nvme_end_req(req); } EXPORT_SYMBOL_GPL(nvme_complete_batch_req); @@ -673,7 +678,7 @@ static void nvme_free_ns(struct kref *kref) kfree(ns); } -static inline bool nvme_get_ns(struct nvme_ns *ns) +bool nvme_get_ns(struct nvme_ns *ns) { return kref_get_unless_zero(&ns->kref); } @@ -3679,9 +3684,10 @@ out_unlock: struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns, *ret = NULL; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (ns->head->ns_id == nsid) { if (!nvme_get_ns(ns)) continue; @@ -3691,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns->head->ns_id > nsid) break; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU); @@ -3705,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns) list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) { if (tmp->head->ns_id < ns->head->ns_id) { - list_add(&ns->list, &tmp->list); + list_add_rcu(&ns->list, &tmp->list); return; } } @@ -3771,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (nvme_update_ns_info(ns, info)) goto out_unlink_ns; - down_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); /* * Ensure that no namespaces are added to the ctrl list after the queues * are frozen, thereby avoiding a deadlock between scan and reset. */ if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) { - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); goto out_unlink_ns; } nvme_ns_add_to_ctrl_list(ns); - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); nvme_get_ctrl(ctrl); if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups)) @@ -3804,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) out_cleanup_ns_from_list: nvme_put_ctrl(ctrl); - down_write(&ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); + list_del_rcu(&ns->list); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); @@ -3856,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns) nvme_cdev_del(&ns->cdev, &ns->cdev_device); del_gendisk(ns->disk); - down_write(&ns->ctrl->namespaces_rwsem); - list_del_init(&ns->list); - up_write(&ns->ctrl->namespaces_rwsem); + mutex_lock(&ns->ctrl->namespaces_lock); + list_del_rcu(&ns->list); + mutex_unlock(&ns->ctrl->namespaces_lock); + synchronize_srcu(&ns->ctrl->srcu); if (last_path) nvme_mpath_shutdown_disk(ns->head); @@ -3948,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, struct nvme_ns *ns, *next; LIST_HEAD(rm_list); - down_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { if (ns->head->ns_id > nsid) - list_move_tail(&ns->list, &rm_list); + list_splice_init_rcu(&ns->list, &rm_list, + synchronize_rcu); } - up_write(&ctrl->namespaces_rwsem); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &rm_list, list) nvme_ns_remove(ns); - } static int nvme_scan_ns_list(struct nvme_ctrl *ctrl) @@ -4127,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) /* this is a no-op when called from the controller reset handler */ nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO); - down_write(&ctrl->namespaces_rwsem); - list_splice_init(&ctrl->namespaces, &ns_list); - up_write(&ctrl->namespaces_rwsem); + mutex_lock(&ctrl->namespaces_lock); + list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu); + mutex_unlock(&ctrl->namespaces_lock); + synchronize_srcu(&ctrl->srcu); list_for_each_entry_safe(ns, next, &ns_list, list) nvme_ns_remove(ns); @@ -4577,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev) key_put(ctrl->tls_key); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); + cleanup_srcu_struct(&ctrl->srcu); nvme_auth_stop(ctrl); nvme_auth_free(ctrl); __free_page(ctrl->discard_page); @@ -4609,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->passthru_err_log_enabled = false; clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags); spin_lock_init(&ctrl->lock); + mutex_init(&ctrl->namespaces_lock); + + ret = init_srcu_struct(&ctrl->srcu); + if (ret) + return ret; + mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); xa_init(&ctrl->cels); - init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; @@ -4692,6 +4709,7 @@ out_release_instance: out: if (ctrl->discard_page) __free_page(ctrl->discard_page); + cleanup_srcu_struct(&ctrl->srcu); return ret; } EXPORT_SYMBOL_GPL(nvme_init_ctrl); @@ -4700,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl); void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mark_disk_dead(ns->disk); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead); void nvme_unfreeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mq_unfreeze_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); clear_bit(NVME_CTRL_FROZEN, &ctrl->flags); } EXPORT_SYMBOL_GPL(nvme_unfreeze); @@ -4723,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze); int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout); if (timeout <= 0) break; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return timeout; } EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); @@ -4738,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); void nvme_wait_freeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_mq_freeze_queue_wait(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_wait_freeze); void nvme_start_freeze(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; set_bit(NVME_CTRL_FROZEN, &ctrl->flags); - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_freeze_queue_start(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_start_freeze); @@ -4797,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue); void nvme_sync_io_queues(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } EXPORT_SYMBOL_GPL(nvme_sync_io_queues); diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 499a8bb7cac7..9d9d2a127c4e 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -789,15 +789,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, bool open_for_write) { struct nvme_ns *ns; - int ret; + int ret, srcu_idx; - down_read(&ctrl->namespaces_rwsem); + srcu_idx = srcu_read_lock(&ctrl->srcu); if (list_empty(&ctrl->namespaces)) { ret = -ENOTTY; goto out_unlock; } - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list); if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { dev_warn(ctrl->device, "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); @@ -807,15 +807,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, dev_warn(ctrl->device, "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); + if (!nvme_get_ns(ns)) { + ret = -ENXIO; + goto out_unlock; + } + srcu_read_unlock(&ctrl->srcu, srcu_idx); ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); nvme_put_ns(ns); return ret; out_unlock: - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return ret; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d16e976ae1a4..d8b6b4648eaf 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req) blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); + nvme_req(req)->status = 0; + nvme_end_req(req); kblockd_schedule_work(&ns->head->requeue_work); } @@ -150,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq) void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { if (!ns->head->disk) continue; kblockd_schedule_work(&ns->head->requeue_work); if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE) disk_uevent(ns->head->disk, KOBJ_CHANGE); } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } static const char *nvme_ana_state_names[] = { @@ -193,13 +195,14 @@ out: void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { struct nvme_ns *ns; + int srcu_idx; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { nvme_mpath_clear_current_path(ns); kblockd_schedule_work(&ns->head->requeue_work); } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); } void nvme_mpath_revalidate_paths(struct nvme_ns *ns) @@ -595,7 +598,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) int node, srcu_idx; srcu_idx = srcu_read_lock(&head->srcu); - for_each_node(node) + for_each_online_node(node) __nvme_find_path(head, node); srcu_read_unlock(&head->srcu, srcu_idx); } @@ -680,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0; unsigned *nr_change_groups = data; struct nvme_ns *ns; + int srcu_idx; dev_dbg(ctrl->device, "ANA group %d: %s.\n", le32_to_cpu(desc->grpid), @@ -691,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, if (!nr_nsids) return 0; - down_read(&ctrl->namespaces_rwsem); - list_for_each_entry(ns, &ctrl->namespaces, list) { + srcu_idx = srcu_read_lock(&ctrl->srcu); + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { unsigned nsid; again: nsid = le32_to_cpu(desc->nsids[n]); @@ -705,7 +709,7 @@ again: if (ns->head->ns_id > nsid) goto again; } - up_read(&ctrl->namespaces_rwsem); + srcu_read_unlock(&ctrl->srcu, srcu_idx); return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cacc56f4bbf4..f3a41133ac3f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -282,7 +282,8 @@ struct nvme_ctrl { struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; - struct rw_semaphore namespaces_rwsem; + struct mutex namespaces_lock; + struct srcu_struct srcu; struct device ctrl_device; struct device *device; /* char device */ #ifdef CONFIG_NVME_HWMON @@ -471,8 +472,6 @@ struct nvme_ns_head { u8 pi_type; u8 pi_offset; u8 guard_type; - u16 sgs; - u32 sws; #ifdef CONFIG_BLK_DEV_ZONED u64 zsze; #endif @@ -767,6 +766,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl) } } +void nvme_end_req(struct request *req); void nvme_complete_rq(struct request *req); void nvme_complete_batch_req(struct request *req); @@ -1161,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects, struct nvme_command *cmd, int status); struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); +bool nvme_get_ns(struct nvme_ns *ns); void nvme_put_ns(struct nvme_ns *ns); static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 710043086dff..102a9fb0c65f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) + if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 7c43a0ad6877..bd87dfd173a4 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -676,10 +676,18 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item, if (kstrtobool(page, &enable)) return -EINVAL; + /* + * take a global nvmet_config_sem because the disable routine has a + * window where it releases the subsys-lock, giving a chance to + * a parallel enable to concurrently execute causing the disable to + * have a misaccounting of the ns percpu_ref. + */ + down_write(&nvmet_config_sem); if (enable) ret = nvmet_ns_enable(ns); else nvmet_ns_disable(ns); + up_write(&nvmet_config_sem); return ret ? ret : count; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 2fde22323622..06f0c587f343 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -818,6 +818,15 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) percpu_ref_exit(&sq->ref); nvmet_auth_sq_free(sq); + /* + * we must reference the ctrl again after waiting for inflight IO + * to complete. Because admin connect may have sneaked in after we + * store sq->ctrl locally, but before we killed the percpu_ref. the + * admin connect allocates and assigns sq->ctrl, which now needs a + * final ref put, as this ctrl is going away. + */ + ctrl = sq->ctrl; + if (ctrl) { /* * The teardown flow may take some time, and the host may not diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 332eb9dac22d..f6c822c9cbd2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3700,8 +3700,10 @@ static int sd_revalidate_disk(struct gendisk *disk) */ if (sdkp->first_scan || q->limits.max_sectors > q->limits.max_dev_sectors || - q->limits.max_sectors > q->limits.max_hw_sectors) + q->limits.max_sectors > q->limits.max_hw_sectors) { q->limits.max_sectors = rw_max; + q->limits.max_user_sectors = rw_max; + } sdkp->first_scan = 0; |