author    Linus Torvalds <torvalds@linux-foundation.org>  2021-04-28 14:39:37 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2021-04-28 14:39:37 -0700
commit    fc0586062816559defb14c947319ef8c4c326fb3 (patch)
tree      5ca73bd1fc9de596a11e6d3549fd8fbf6f87dafc /drivers/nvme
parent    6c0029211382011af508273c4fc98a732f841d95 (diff)
parent    8324fbae75ce65fc2eb960a8434799dca48248ac (diff)
Merge tag 'for-5.13/drivers-2021-04-27' of git://git.kernel.dk/linux-block
Pull block driver updates from Jens Axboe:
- MD changes via Song:
- raid5 POWER fix
- raid1 failure fix
- UAF fix for md cluster
- mddev_find_or_alloc() cleanup
- Fix NULL pointer deref with external bitmap
- Performance improvement for raid10 discard requests
- Fix missing information in /proc/mdstat
- rsxx const qualifier removal (Arnd)
- Expose allocated brd pages (Calvin)
- rnbd via Gioh Kim:
- Change maintainer
- Change domain address of maintainers' email
- Add polling IO mode and documentation updates
- Fix memory leaks and bugs detected by static code analysis tools
- Code refactoring
- Series of floppy cleanups/fixes (Denis)
- s390 dasd fixes (Julian)
- kerneldoc fixes (Lee)
- null_blk double free (Lv)
- null_blk virtual boundary addition (Max)
- Remove xsysace driver (Michal)
- umem driver removal (Davidlohr)
- ataflop fixes (Dan)
- Revalidate disk removal (Christoph)
- Bounce buffer cleanups (Christoph)
- Mark lightnvm as deprecated (Christoph)
- mtip32xx init cleanups (Shixin)
- Various fixes (Tian, Gustavo, Coly, Yang, Zhang, Zhiqiang)
* tag 'for-5.13/drivers-2021-04-27' of git://git.kernel.dk/linux-block: (143 commits)
async_xor: increase src_offs when dropping destination page
drivers/block/null_blk/main: Fix a double free in null_init.
md/raid1: properly indicate failure when ending a failed write request
md-cluster: fix use-after-free issue when removing rdev
nvme: introduce generic per-namespace chardev
nvme: cleanup nvme_configure_apst
nvme: do not try to reconfigure APST when the controller is not live
nvme: add 'kato' sysfs attribute
nvme: sanitize KATO setting
nvmet: avoid queuing keep-alive timer if it is disabled
brd: expose number of allocated pages in debugfs
ataflop: fix off by one in ataflop_probe()
ataflop: potential out of bounds in do_format()
drbd: Fix fall-through warnings for Clang
block/rnbd: Use strscpy instead of strlcpy
block/rnbd-clt-sysfs: Remove copy buffer overlap in rnbd_clt_get_path_name
block/rnbd-clt: Remove max_segment_size
block/rnbd-clt: Generate kobject_uevent when the rnbd device state changes
block/rnbd-srv: Remove unused arguments of rnbd_srv_rdma_ev
Documentation/ABI/rnbd-clt: Add description for nr_poll_queues
...
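
The headline nvme change in this pull is the generic per-namespace character device ("nvme: introduce generic per-namespace chardev" above): each namespace now also gets a /dev/ng<ctrl>n<ns> node that accepts the same passthrough ioctls as the block device. Below is a minimal userspace sketch of issuing an Identify Controller command through it — the /dev/ng0n1 path is an assumed example, NVME_IOCTL_ADMIN_CMD needs CAP_SYS_ADMIN, and error handling is pared down:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	unsigned char id[4096];		/* NVME_IDENTIFY_DATA_SIZE */
	struct nvme_admin_cmd cmd;
	int fd, ret;

	fd = open("/dev/ng0n1", O_RDONLY);	/* assumed example node */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = 0x06;		/* Identify */
	cmd.addr = (uintptr_t)id;
	cmd.data_len = sizeof(id);
	cmd.cdw10 = 1;			/* CNS 1: Identify Controller */

	/* <0 with errno on kernel errors, >0 carries the NVMe status code */
	ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
	if (ret)
		fprintf(stderr, "identify failed: %d\n", ret);
	else
		printf("model: %.40s\n", id + 24);	/* MN, bytes 24-63 */

	close(fd);
	return ret ? 1 : 0;
}

The same program works unchanged against the block node /dev/nvme0n1; the point of the ng device is that it stays available even when the block device is hidden (the diff below sets GENHD_FL_HIDDEN when namespace probing fails with -ENODEV).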
Diffstat (limited to 'drivers/nvme')
-rw-r--r--  drivers/nvme/host/Makefile         |    2
-rw-r--r--  drivers/nvme/host/core.c           | 1076
-rw-r--r--  drivers/nvme/host/fabrics.c        |    4
-rw-r--r--  drivers/nvme/host/fc.c             |   14
-rw-r--r--  drivers/nvme/host/ioctl.c          |  481
-rw-r--r--  drivers/nvme/host/lightnvm.c       |   10
-rw-r--r--  drivers/nvme/host/multipath.c      |  114
-rw-r--r--  drivers/nvme/host/nvme.h           |   64
-rw-r--r--  drivers/nvme/host/pci.c            |   30
-rw-r--r--  drivers/nvme/host/rdma.c           |    7
-rw-r--r--  drivers/nvme/host/tcp.c            |   16
-rw-r--r--  drivers/nvme/host/zns.c            |    4
-rw-r--r--  drivers/nvme/target/admin-cmd.c    |   14
-rw-r--r--  drivers/nvme/target/configfs.c     |    6
-rw-r--r--  drivers/nvme/target/core.c         |   33
-rw-r--r--  drivers/nvme/target/discovery.c    |    6
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c  |   17
-rw-r--r--  drivers/nvme/target/fc.c           |   78
-rw-r--r--  drivers/nvme/target/loop.c         |    6
-rw-r--r--  drivers/nvme/target/nvmet.h        |    8
-rw-r--r--  drivers/nvme/target/tcp.c          |   79
21 files changed, 1250 insertions, 819 deletions
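
One note before the diff itself: the largest core.c hunks below restructure nvme_configure_apst() into an early-return style without changing its policy. The comment block carried along states the 2% rule — the controller idles for 50 * (enlat + exlat) microseconds before autonomously entering a state whose round trip costs (enlat + exlat), so transition time is bounded by 1/50 = 2% of wall time. A standalone sketch of the table-entry computation follows; the function itself is illustrative, only the constants and bit layout mirror the kernel code:

#include <stdint.h>
#include <stdio.h>

/*
 * One APST table entry, following the heuristic in nvme_configure_apst():
 * idle for 50 * (enlat + exlat) microseconds before autonomously entering
 * a non-operational state. Bits 7:3 hold the target power state (ITPS),
 * bits 31:8 the idle time prior to transition in milliseconds (ITPT).
 */
static uint64_t apst_entry(unsigned int state, uint32_t entry_lat_us,
			   uint32_t exit_lat_us)
{
	uint64_t total_latency_us = (uint64_t)entry_lat_us + exit_lat_us;
	/* 50 * total us == total / 20 ms, rounded up, capped to 24 bits */
	uint64_t transition_ms = (total_latency_us + 19) / 20;

	if (transition_ms > (1 << 24) - 1)
		transition_ms = (1 << 24) - 1;
	return ((uint64_t)state << 3) | (transition_ms << 8);
}

int main(void)
{
	/* e.g. PS4 with entry latency 2200us, exit latency 3000us */
	printf("PS4 entry = 0x%llx\n",
	       (unsigned long long)apst_entry(4, 2200, 3000));
	return 0;
}

For example, entry/exit latencies of 2200us/3000us give an idle timer of 260ms, so at most 5.2ms of every 260ms idle window — the 2% budget — is spent transitioning.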
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index d7f6a87687b8..cbc509784b2e 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_TCP) += nvme-tcp.o -nvme-core-y := core.o +nvme-core-y := core.o ioctl.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5eaaa51a5e30..b6f7815fa239 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,6 +89,10 @@ static dev_t nvme_ctrl_base_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; +static DEFINE_IDA(nvme_ns_chr_minor_ida); +static dev_t nvme_ns_chr_devt; +static struct class *nvme_ns_chr_class; + static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -112,7 +116,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) set_capacity_and_notify(ns->disk, 0); } -static void nvme_queue_scan(struct nvme_ctrl *ctrl) +void nvme_queue_scan(struct nvme_ctrl *ctrl) { /* * Only new queue scan work when admin and IO queues are both alive @@ -179,7 +183,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) { int ret; @@ -549,7 +553,12 @@ static void nvme_free_ns_head(struct kref *ref) kfree(head); } -static void nvme_put_ns_head(struct nvme_ns_head *head) +bool nvme_tryget_ns_head(struct nvme_ns_head *head) +{ + return kref_get_unless_zero(&head->ref); +} + +void nvme_put_ns_head(struct nvme_ns_head *head) { kref_put(&head->ref, nvme_free_ns_head); } @@ -575,11 +584,12 @@ EXPORT_SYMBOL_NS_GPL(nvme_put_ns, NVME_TARGET_PASSTHRU); static inline void nvme_clear_nvme_request(struct request *req) { - if (!(req->rq_flags & RQF_DONTPREP)) { - nvme_req(req)->retries = 0; - nvme_req(req)->flags = 0; - req->rq_flags |= RQF_DONTPREP; - } + struct nvme_command *cmd = nvme_req(req)->cmd; + + memset(cmd, 0, sizeof(*cmd)); + nvme_req(req)->retries = 0; + nvme_req(req)->flags = 0; + req->rq_flags |= RQF_DONTPREP; } static inline unsigned int nvme_req_op(struct nvme_command *cmd) @@ -595,9 +605,12 @@ static inline void nvme_init_request(struct request *req, else /* no queuedata implies admin queue */ req->timeout = NVME_ADMIN_TIMEOUT; + /* passthru commands should let the driver set the SGL flags */ + cmd->common.flags &= ~NVME_CMD_SGL_ALL; + req->cmd_flags |= REQ_FAILFAST_DRIVER; nvme_clear_nvme_request(req); - nvme_req(req)->cmd = cmd; + memcpy(nvme_req(req)->cmd, cmd, sizeof(*cmd)); } struct request *nvme_alloc_request(struct request_queue *q, @@ -726,14 +739,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl, req->q->write_hints[streamid] += blk_rq_bytes(req) >> 9; } -static void nvme_setup_passthrough(struct request *req, - struct nvme_command *cmd) -{ - memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd)); - /* passthru commands should let the driver set the SGL flags */ - cmd->common.flags &= ~NVME_CMD_SGL_ALL; -} - static inline void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd) { @@ -888,18 +893,18 @@ void nvme_cleanup_cmd(struct request *req) } EXPORT_SYMBOL_GPL(nvme_cleanup_cmd); -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, - struct 
nvme_command *cmd) +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req) { + struct nvme_command *cmd = nvme_req(req)->cmd; blk_status_t ret = BLK_STS_OK; - nvme_clear_nvme_request(req); + if (!(req->rq_flags & RQF_DONTPREP)) + nvme_clear_nvme_request(req); - memset(cmd, 0, sizeof(*cmd)); switch (req_op(req)) { case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: - nvme_setup_passthrough(req, cmd); + /* these are setup prior to execution in nvme_init_request() */ break; case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); @@ -1020,40 +1025,6 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); -static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, - unsigned len, u32 seed, bool write) -{ - struct bio_integrity_payload *bip; - int ret = -ENOMEM; - void *buf; - - buf = kmalloc(len, GFP_KERNEL); - if (!buf) - goto out; - - ret = -EFAULT; - if (write && copy_from_user(buf, ubuf, len)) - goto out_free_meta; - - bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); - if (IS_ERR(bip)) { - ret = PTR_ERR(bip); - goto out_free_meta; - } - - bip->bip_iter.bi_size = len; - bip->bip_iter.bi_sector = seed; - ret = bio_integrity_add_page(bio, virt_to_page(buf), len, - offset_in_page(buf)); - if (ret == len) - return buf; - ret = -ENOMEM; -out_free_meta: - kfree(buf); -out: - return ERR_PTR(ret); -} - static u32 nvme_known_admin_effects(u8 opcode) { switch (opcode) { @@ -1076,9 +1047,9 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) if (ns->head->effects) effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) - dev_warn(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", - opcode, effects); + dev_warn_once(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); return 0; } @@ -1120,7 +1091,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) - nvme_init_identify(ctrl); + nvme_init_ctrl_finish(ctrl); if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); @@ -1137,68 +1108,20 @@ void nvme_execute_passthru_rq(struct request *rq) effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); blk_execute_rq(disk, rq, 0); - nvme_passthru_end(ctrl, effects); + if (effects) /* nothing to be done for zero cmd effects */ + nvme_passthru_end(ctrl, effects); } EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); -static int nvme_submit_user_cmd(struct request_queue *q, - struct nvme_command *cmd, void __user *ubuffer, - unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) +/* + * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: + * + * The host should send Keep Alive commands at half of the Keep Alive Timeout + * accounting for transport roundtrip times [..]. + */ +static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) { - bool write = nvme_is_write(cmd); - struct nvme_ns *ns = q->queuedata; - struct block_device *bdev = ns ? 
ns->disk->part0 : NULL; - struct request *req; - struct bio *bio = NULL; - void *meta = NULL; - int ret; - - req = nvme_alloc_request(q, cmd, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - if (timeout) - req->timeout = timeout; - nvme_req(req)->flags |= NVME_REQ_USERCMD; - - if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - GFP_KERNEL); - if (ret) - goto out; - bio = req->bio; - if (bdev) - bio_set_dev(bio, bdev); - if (bdev && meta_buffer && meta_len) { - meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, - meta_seed, write); - if (IS_ERR(meta)) { - ret = PTR_ERR(meta); - goto out_unmap; - } - req->cmd_flags |= REQ_INTEGRITY; - } - } - - nvme_execute_passthru_rq(req); - if (nvme_req(req)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - else - ret = nvme_req(req)->status; - if (result) - *result = le64_to_cpu(nvme_req(req)->result.u64); - if (meta && !ret && !write) { - if (copy_to_user(meta_buffer, meta, meta_len)) - ret = -EFAULT; - } - kfree(meta); - out_unmap: - if (bio) - blk_rq_unmap_user(bio); - out: - blk_mq_free_request(req); - return ret; + queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ / 2); } static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) @@ -1223,7 +1146,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) startka = true; spin_unlock_irqrestore(&ctrl->lock, flags); if (startka) - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); } static void nvme_keep_alive_work(struct work_struct *work) @@ -1237,7 +1160,7 @@ static void nvme_keep_alive_work(struct work_struct *work) dev_dbg(ctrl->device, "reschedule traffic based keep-alive timer\n"); ctrl->comp_seen = false; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); return; } @@ -1260,7 +1183,7 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) if (unlikely(ctrl->kato == 0)) return; - queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvme_queue_keep_alive_work(ctrl); } void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) @@ -1536,170 +1459,6 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) } /* - * Convert integer values from ioctl structures to user pointers, silently - * ignoring the upper bits in the compat case to match behaviour of 32-bit - * kernels. - */ -static void __user *nvme_to_user_ptr(uintptr_t ptrval) -{ - if (in_compat_syscall()) - ptrval = (compat_uptr_t)ptrval; - return (void __user *)ptrval; -} - -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - if (io.flags) - return -EINVAL; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - break; - default: - return -EINVAL; - } - - length = (io.nblocks + 1) << ns->lba_shift; - - if ((io.control & NVME_RW_PRINFO_PRACT) && - ns->ms == sizeof(struct t10_pi_tuple)) { - /* - * Protection information is stripped/inserted by the - * controller. 
- */ - if (nvme_to_user_ptr(io.metadata)) - return -EINVAL; - meta_len = 0; - metadata = NULL; - } else { - meta_len = (io.nblocks + 1) * ns->ms; - metadata = nvme_to_user_ptr(io.metadata); - } - - if (ns->features & NVME_NS_EXT_LBAS) { - length += meta_len; - meta_len = 0; - } else if (meta_len) { - if ((io.metadata & 3) || !io.metadata) - return -EINVAL; - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->head->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - - return nvme_submit_user_cmd(ns->queue, &c, - nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); -} - -static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - unsigned timeout = 0; - u64 result; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - if (cmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); - - if (status >= 0) { - if (put_user(result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) -{ - struct nvme_passthru_cmd64 cmd; - struct nvme_command c; - unsigned timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - if (cmd.flags) - return -EINVAL; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10 = cpu_to_le32(cmd.cdw10); - c.common.cdw11 = cpu_to_le32(cmd.cdw11); - c.common.cdw12 = cpu_to_le32(cmd.cdw12); - c.common.cdw13 = cpu_to_le32(cmd.cdw13); - c.common.cdw14 = cpu_to_le32(cmd.cdw14); - c.common.cdw15 = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, - nvme_to_user_ptr(cmd.addr), cmd.data_len, - nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); - - if (status >= 0) { - if (put_user(cmd.result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -/* * Issue ioctl requests on the first available path. Note that unlike normal * block layer requests we will not retry failed request on another controller. 
*/ @@ -1729,136 +1488,12 @@ void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) srcu_read_unlock(&head->srcu, idx); } -static bool is_ctrl_ioctl(unsigned int cmd) -{ - if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) - return true; - if (is_sed_ioctl(cmd)) - return true; - return false; -} - -static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, - struct nvme_ns_head *head, - int srcu_idx) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - int ret; - - nvme_get_ctrl(ns->ctrl); - nvme_put_ns_from_disk(head, srcu_idx); - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - ret = nvme_user_cmd(ctrl, NULL, argp); - break; - case NVME_IOCTL_ADMIN64_CMD: - ret = nvme_user_cmd64(ctrl, NULL, argp); - break; - default: - ret = sed_ioctl(ctrl->opal_dev, cmd, argp); - break; - } - nvme_put_ctrl(ctrl); - return ret; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - struct nvme_ns_head *head = NULL; - void __user *argp = (void __user *)arg; - struct nvme_ns *ns; - int srcu_idx, ret; - - ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); - if (unlikely(!ns)) - return -EWOULDBLOCK; - - /* - * Handle ioctls that apply to the controller instead of the namespace - * seperately and drop the ns SRCU reference early. This avoids a - * deadlock when deleting namespaces using the passthrough interface. - */ - if (is_ctrl_ioctl(cmd)) - return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx); - - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - ret = ns->head->ns_id; - break; - case NVME_IOCTL_IO_CMD: - ret = nvme_user_cmd(ns->ctrl, ns, argp); - break; - case NVME_IOCTL_SUBMIT_IO: - ret = nvme_submit_io(ns, argp); - break; - case NVME_IOCTL_IO64_CMD: - ret = nvme_user_cmd64(ns->ctrl, ns, argp); - break; - default: - if (ns->ndev) - ret = nvme_nvm_ioctl(ns, cmd, arg); - else - ret = -ENOTTY; - } - - nvme_put_ns_from_disk(head, srcu_idx); - return ret; -} - -#ifdef CONFIG_COMPAT -struct nvme_user_io32 { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nblocks; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 slba; - __u32 dsmgmt; - __u32 reftag; - __u16 apptag; - __u16 appmask; -} __attribute__((__packed__)); - -#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) - -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +static int nvme_ns_open(struct nvme_ns *ns) { - /* - * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO - * between 32 bit programs and 64 bit kernel. - * The cause is that the results of sizeof(struct nvme_user_io), - * which is used to define NVME_IOCTL_SUBMIT_IO, - * are not same between 32 bit compiler and 64 bit compiler. - * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling - * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs. - * Other IOCTL numbers are same between 32 bit and 64 bit. - * So there is nothing to do regarding to other IOCTL numbers. 
- */ - if (cmd == NVME_IOCTL_SUBMIT_IO32) - return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif /* CONFIG_COMPAT */ - -static int nvme_open(struct block_device *bdev, fmode_t mode) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - -#ifdef CONFIG_NVME_MULTIPATH /* should never be called due to GENHD_FL_HIDDEN */ - if (WARN_ON_ONCE(ns->head->disk)) + if (WARN_ON_ONCE(nvme_ns_head_multipath(ns->head))) goto fail; -#endif if (!kref_get_unless_zero(&ns->kref)) goto fail; if (!try_module_get(ns->ctrl->ops->module)) @@ -1872,15 +1507,24 @@ fail: return -ENXIO; } -static void nvme_release(struct gendisk *disk, fmode_t mode) +static void nvme_ns_release(struct nvme_ns *ns) { - struct nvme_ns *ns = disk->private_data; module_put(ns->ctrl->ops->module); nvme_put_ns(ns); } -static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_ns_open(bdev->bd_disk->private_data); +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + nvme_ns_release(disk->private_data); +} + +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) { /* some standard values */ geo->heads = 1 << 6; @@ -1929,7 +1573,7 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) struct request_queue *queue = disk->queue; u32 size = queue_logical_block_size(queue); - if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) { + if (ctrl->max_discard_sectors == 0) { blk_queue_flag_clear(QUEUE_FLAG_DISCARD, queue); return; } @@ -1947,27 +1591,13 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) if (blk_queue_flag_test_and_set(QUEUE_FLAG_DISCARD, queue)) return; - blk_queue_max_discard_sectors(queue, UINT_MAX); - blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); + blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); + blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } -/* - * Even though NVMe spec explicitly states that MDTS is not applicable to the - * write-zeroes, we are cautious and limit the size to the controllers - * max_hw_sectors value, which is based on the MDTS field and possibly other - * limiting factors. 
- */ -static void nvme_config_write_zeroes(struct request_queue *q, - struct nvme_ctrl *ctrl) -{ - if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && - !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) - blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors); -} - static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) { return !uuid_is_null(&ids->uuid) || @@ -2137,7 +1767,8 @@ static void nvme_update_disk_info(struct gendisk *disk, set_capacity_and_notify(disk, capacity); nvme_config_discard(disk, ns); - nvme_config_write_zeroes(disk->queue, ns->ctrl); + blk_queue_max_write_zeroes_sectors(disk->queue, + ns->ctrl->max_zeroes_sectors); set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) || test_bit(NVME_NS_FORCE_RO, &ns->flags)); @@ -2206,11 +1837,10 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (blk_queue_is_zoned(ns->queue)) { ret = nvme_revalidate_zones(ns); if (ret && !nvme_first_scan(ns->disk)) - return ret; + goto out; } -#ifdef CONFIG_NVME_MULTIPATH - if (ns->head->disk) { + if (nvme_ns_head_multipath(ns->head)) { blk_mq_freeze_queue(ns->head->disk->queue); nvme_update_disk_info(ns->head->disk, ns, id); blk_stack_limits(&ns->head->disk->queue->limits, @@ -2218,11 +1848,19 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) blk_queue_update_readahead(ns->head->disk->queue); blk_mq_unfreeze_queue(ns->head->disk->queue); } -#endif return 0; out_unfreeze: blk_mq_unfreeze_queue(ns->disk->queue); +out: + /* + * If probing fails due an unsupported feature, hide the block device, + * but still allow other access. + */ + if (ret == -ENODEV) { + ns->disk->flags |= GENHD_FL_HIDDEN; + ret = 0; + } return ret; } @@ -2303,22 +1941,25 @@ static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, enum pr_type type, bool abort) { u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1); + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); } static int nvme_pr_clear(struct block_device *bdev, u64 key) { u32 cdw10 = 1 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); } static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 
1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); } -static const struct pr_ops nvme_pr_ops = { +const struct pr_ops nvme_pr_ops = { .pr_register = nvme_pr_register, .pr_reserve = nvme_pr_reserve, .pr_release = nvme_pr_release, @@ -2351,7 +1992,6 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, @@ -2360,31 +2000,25 @@ static const struct block_device_operations nvme_bdev_ops = { }; #ifdef CONFIG_NVME_MULTIPATH -static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys) { - struct nvme_ns_head *head = bdev->bd_disk->private_data; - - if (!kref_get_unless_zero(&head->ref)) - return -ENXIO; - return 0; -} + struct nvme_ctrl *ctrl; + int ret; -static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) -{ - nvme_put_ns_head(disk->private_data); + ret = mutex_lock_killable(&nvme_subsystems_lock); + if (ret) + return ERR_PTR(ret); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->state == NVME_CTRL_LIVE) + goto found; + } + mutex_unlock(&nvme_subsystems_lock); + return ERR_PTR(-EWOULDBLOCK); +found: + nvme_get_ctrl(ctrl); + mutex_unlock(&nvme_subsystems_lock); + return ctrl; } - -const struct block_device_operations nvme_ns_head_ops = { - .owner = THIS_MODULE, - .submit_bio = nvme_ns_head_submit_bio, - .open = nvme_ns_head_open, - .release = nvme_ns_head_release, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, - .getgeo = nvme_getgeo, - .report_zones = nvme_report_zones, - .pr_ops = &nvme_pr_ops, -}; #endif /* CONFIG_NVME_MULTIPATH */ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) @@ -2541,28 +2175,28 @@ static int nvme_configure_acre(struct nvme_ctrl *ctrl) return ret; } +/* + * APST (Autonomous Power State Transition) lets us program a table of power + * state transitions that the controller will perform automatically. + * We configure it with a simple heuristic: we are willing to spend at most 2% + * of the time transitioning between power states. Therefore, when running in + * any given state, we will enter the next lower-power non-operational state + * after waiting 50 * (enlat + exlat) microseconds, as long as that state's exit + * latency is under the requested maximum latency. + * + * We will not autonomously enter any non-operational state for which the total + * latency exceeds ps_max_latency_us. + * + * Users can set ps_max_latency_us to zero to turn off APST. + */ static int nvme_configure_apst(struct nvme_ctrl *ctrl) { - /* - * APST (Autonomous Power State Transition) lets us program a - * table of power state transitions that the controller will - * perform automatically. We configure it with a simple - * heuristic: we are willing to spend at most 2% of the time - * transitioning between power states. Therefore, when running - * in any given state, we will enter the next lower-power - * non-operational state after waiting 50 * (enlat + exlat) - * microseconds, as long as that state's exit latency is under - * the requested maximum latency. - * - * We will not autonomously enter any non-operational state for - * which the total latency exceeds ps_max_latency_us. Users - * can set ps_max_latency_us to zero to turn off APST. 
- */ - - unsigned apste; struct nvme_feat_auto_pst *table; + unsigned apste = 0; u64 max_lat_us = 0; + __le64 target = 0; int max_ps = -1; + int state; int ret; /* @@ -2583,83 +2217,72 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl) if (!ctrl->apst_enabled || ctrl->ps_max_latency_us == 0) { /* Turn off APST. */ - apste = 0; dev_dbg(ctrl->device, "APST disabled\n"); - } else { - __le64 target = cpu_to_le64(0); - int state; - - /* - * Walk through all states from lowest- to highest-power. - * According to the spec, lower-numbered states use more - * power. NPSS, despite the name, is the index of the - * lowest-power state, not the number of states. - */ - for (state = (int)ctrl->npss; state >= 0; state--) { - u64 total_latency_us, exit_latency_us, transition_ms; - - if (target) - table->entries[state] = target; - - /* - * Don't allow transitions to the deepest state - * if it's quirked off. - */ - if (state == ctrl->npss && - (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) - continue; - - /* - * Is this state a useful non-operational state for - * higher-power states to autonomously transition to? - */ - if (!(ctrl->psd[state].flags & - NVME_PS_FLAGS_NON_OP_STATE)) - continue; - - exit_latency_us = - (u64)le32_to_cpu(ctrl->psd[state].exit_lat); - if (exit_latency_us > ctrl->ps_max_latency_us) - continue; + goto done; + } - total_latency_us = - exit_latency_us + - le32_to_cpu(ctrl->psd[state].entry_lat); + /* + * Walk through all states from lowest- to highest-power. + * According to the spec, lower-numbered states use more power. NPSS, + * despite the name, is the index of the lowest-power state, not the + * number of states. + */ + for (state = (int)ctrl->npss; state >= 0; state--) { + u64 total_latency_us, exit_latency_us, transition_ms; - /* - * This state is good. Use it as the APST idle - * target for higher power states. - */ - transition_ms = total_latency_us + 19; - do_div(transition_ms, 20); - if (transition_ms > (1 << 24) - 1) - transition_ms = (1 << 24) - 1; + if (target) + table->entries[state] = target; - target = cpu_to_le64((state << 3) | - (transition_ms << 8)); + /* + * Don't allow transitions to the deepest state if it's quirked + * off. + */ + if (state == ctrl->npss && + (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) + continue; - if (max_ps == -1) - max_ps = state; + /* + * Is this state a useful non-operational state for higher-power + * states to autonomously transition to? + */ + if (!(ctrl->psd[state].flags & NVME_PS_FLAGS_NON_OP_STATE)) + continue; - if (total_latency_us > max_lat_us) - max_lat_us = total_latency_us; - } + exit_latency_us = (u64)le32_to_cpu(ctrl->psd[state].exit_lat); + if (exit_latency_us > ctrl->ps_max_latency_us) + continue; - apste = 1; + total_latency_us = exit_latency_us + + le32_to_cpu(ctrl->psd[state].entry_lat); - if (max_ps == -1) { - dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); - } else { - dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", - max_ps, max_lat_us, (int)sizeof(*table), table); - } + /* + * This state is good. Use it as the APST idle target for + * higher power states. 
+ */ + transition_ms = total_latency_us + 19; + do_div(transition_ms, 20); + if (transition_ms > (1 << 24) - 1) + transition_ms = (1 << 24) - 1; + + target = cpu_to_le64((state << 3) | (transition_ms << 8)); + if (max_ps == -1) + max_ps = state; + if (total_latency_us > max_lat_us) + max_lat_us = total_latency_us; } + if (max_ps == -1) + dev_dbg(ctrl->device, "APST enabled but no non-operational states are available\n"); + else + dev_dbg(ctrl->device, "APST enabled: max PS = %d, max round-trip latency = %lluus, table = %*phN\n", + max_ps, max_lat_us, (int)sizeof(*table), table); + apste = 1; + +done: ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste, table, sizeof(*table), NULL); if (ret) dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret); - kfree(table); return ret; } @@ -2681,7 +2304,8 @@ static void nvme_set_latency_tolerance(struct device *dev, s32 val) if (ctrl->ps_max_latency_us != latency) { ctrl->ps_max_latency_us = latency; - nvme_configure_apst(ctrl); + if (ctrl->state == NVME_CTRL_LIVE) + nvme_configure_apst(ctrl); } } @@ -2854,8 +2478,8 @@ static ssize_t subsys_##field##_show(struct device *dev, \ { \ struct nvme_subsystem *subsys = \ container_of(dev, struct nvme_subsystem, dev); \ - return sprintf(buf, "%.*s\n", \ - (int)sizeof(subsys->field), subsys->field); \ + return sysfs_emit(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ } \ static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); @@ -3038,28 +2662,74 @@ out: return 0; } -/* - * Initialize the cached copies of the Identify data and various controller - * register in our nvme_ctrl structure. This should be called as soon as - * the admin queue is fully up and running. - */ -int nvme_init_identify(struct nvme_ctrl *ctrl) +static inline u32 nvme_mps_to_sectors(struct nvme_ctrl *ctrl, u32 units) { - struct nvme_id_ctrl *id; - int ret, page_shift; - u32 max_hw_sectors; - bool prev_apst_enabled; + u32 page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12, val; - ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); - if (ret) { - dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); - return ret; + if (check_shl_overflow(1U, units + page_shift - 9, &val)) + return UINT_MAX; + return val; +} + +static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_nvm *id; + int ret; + + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { + ctrl->max_discard_sectors = UINT_MAX; + ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; + } else { + ctrl->max_discard_sectors = 0; + ctrl->max_discard_segments = 0; } - page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); - if (ctrl->vs >= NVME_VS(1, 1, 0)) - ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); + /* + * Even though NVMe spec explicitly states that MDTS is not applicable + * to the write-zeroes, we are cautious and limit the size to the + * controllers max_hw_sectors value, which is based on the MDTS field + * and possibly other limiting factors. 
+ */ + if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) && + !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) + ctrl->max_zeroes_sectors = ctrl->max_hw_sectors; + else + ctrl->max_zeroes_sectors = 0; + + if (nvme_ctrl_limited_cns(ctrl)) + return 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return 0; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_NVM; + + ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (ret) + goto free_data; + + if (id->dmrl) + ctrl->max_discard_segments = id->dmrl; + if (id->dmrsl) + ctrl->max_discard_sectors = le32_to_cpu(id->dmrsl); + if (id->wzsl) + ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); + +free_data: + kfree(id); + return ret; +} + +static int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u32 max_hw_sectors; + bool prev_apst_enabled; + int ret; ret = nvme_identify_ctrl(ctrl, &id); if (ret) { @@ -3077,7 +2747,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->cntlid = le16_to_cpu(id->cntlid); if (!ctrl->identified) { - int i; + unsigned int i; ret = nvme_init_subsystem(ctrl, id); if (ret) @@ -3116,7 +2786,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; if (id->mdts) - max_hw_sectors = 1 << (id->mdts + page_shift - 9); + max_hw_sectors = nvme_mps_to_sectors(ctrl, id->mdts); else max_hw_sectors = UINT_MAX; ctrl->max_hw_sectors = @@ -3190,20 +2860,51 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } ret = nvme_mpath_init(ctrl, id); - kfree(id); - if (ret < 0) - return ret; + goto out_free; if (ctrl->apst_enabled && !prev_apst_enabled) dev_pm_qos_expose_latency_tolerance(ctrl->device); else if (!ctrl->apst_enabled && prev_apst_enabled) dev_pm_qos_hide_latency_tolerance(ctrl->device); +out_free: + kfree(id); + return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. This should be called as soon as + * the admin queue is fully up and running. 
+ */ +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl) +{ + int ret; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); + return ret; + } + + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + + if (ctrl->vs >= NVME_VS(1, 1, 0)) + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); + + ret = nvme_init_identify(ctrl); + if (ret) + return ret; + + ret = nvme_init_non_mdts_limits(ctrl); + if (ret < 0) + return ret; + ret = nvme_configure_apst(ctrl); if (ret < 0) return ret; - + ret = nvme_configure_timestamp(ctrl); if (ret < 0) return ret; @@ -3225,12 +2926,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->identified = true; return 0; - -out_free: - kfree(id); - return ret; } -EXPORT_SYMBOL_GPL(nvme_init_identify); +EXPORT_SYMBOL_GPL(nvme_init_ctrl_finish); static int nvme_dev_open(struct inode *inode, struct file *file) { @@ -3264,65 +2961,6 @@ static int nvme_dev_release(struct inode *inode, struct file *file) return 0; } -static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) -{ - struct nvme_ns *ns; - int ret; - - down_read(&ctrl->namespaces_rwsem); - if (list_empty(&ctrl->namespaces)) { - ret = -ENOTTY; - goto out_unlock; - } - - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); - if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { - dev_warn(ctrl->device, - "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); - ret = -EINVAL; - goto out_unlock; - } - - dev_warn(ctrl->device, - "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); - kref_get(&ns->kref); - up_read(&ctrl->namespaces_rwsem); - - ret = nvme_user_cmd(ctrl, ns, argp); - nvme_put_ns(ns); - return ret; - -out_unlock: - up_read(&ctrl->namespaces_rwsem); - return ret; -} - -static long nvme_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ctrl *ctrl = file->private_data; - void __user *argp = (void __user *)arg; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp); - case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); - case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp); - case NVME_IOCTL_RESET: - dev_warn(ctrl->device, "resetting controller\n"); - return nvme_reset_ctrl_sync(ctrl); - case NVME_IOCTL_SUBSYS_RESET: - return nvme_reset_subsystem(ctrl); - case NVME_IOCTL_RESCAN: - nvme_queue_scan(ctrl); - return 0; - default: - return -ENOTTY; - } -} - static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, @@ -3376,13 +3014,13 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, int model_len = sizeof(subsys->model); if (!uuid_is_null(&ids->uuid)) - return sprintf(buf, "uuid.%pU\n", &ids->uuid); + return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid); if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return sprintf(buf, "eui.%16phN\n", ids->nguid); + return sysfs_emit(buf, "eui.%16phN\n", ids->nguid); if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - return sprintf(buf, "eui.%8phN\n", ids->eui64); + return sysfs_emit(buf, "eui.%8phN\n", ids->eui64); while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || subsys->serial[serial_len - 1] == '\0')) @@ -3391,7 +3029,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, subsys->model[model_len - 1] == '\0')) model_len--; - return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + 
return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, head->ns_id); } @@ -3400,7 +3038,7 @@ static DEVICE_ATTR_RO(wwid); static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); + return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); } static DEVICE_ATTR_RO(nguid); @@ -3415,23 +3053,23 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, if (uuid_is_null(&ids->uuid)) { printk_ratelimited(KERN_WARNING "No UUID available providing old NGUID\n"); - return sprintf(buf, "%pU\n", ids->nguid); + return sysfs_emit(buf, "%pU\n", ids->nguid); } - return sprintf(buf, "%pU\n", &ids->uuid); + return sysfs_emit(buf, "%pU\n", &ids->uuid); } static DEVICE_ATTR_RO(uuid); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); + return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); } static DEVICE_ATTR_RO(eui); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); + return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id); } static DEVICE_ATTR_RO(nsid); @@ -3496,7 +3134,7 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%.*s\n", \ + return sysfs_emit(buf, "%.*s\n", \ (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); @@ -3510,7 +3148,7 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%d\n", ctrl->field); \ + return sysfs_emit(buf, "%d\n", ctrl->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); @@ -3518,6 +3156,7 @@ nvme_show_int_function(cntlid); nvme_show_int_function(numa_node); nvme_show_int_function(queue_count); nvme_show_int_function(sqsize); +nvme_show_int_function(kato); static ssize_t nvme_sysfs_delete(struct device *dev, struct device_attribute *attr, const char *buf, @@ -3558,9 +3197,9 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && state_name[ctrl->state]) - return sprintf(buf, "%s\n", state_name[ctrl->state]); + return sysfs_emit(buf, "%s\n", state_name[ctrl->state]); - return sprintf(buf, "unknown state\n"); + return sysfs_emit(buf, "unknown state\n"); } static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); @@ -3612,9 +3251,9 @@ static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, struct nvmf_ctrl_options *opts = ctrl->opts; if (ctrl->opts->max_reconnects == -1) - return sprintf(buf, "off\n"); - return sprintf(buf, "%d\n", - opts->max_reconnects * opts->reconnect_delay); + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); } static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, @@ -3628,7 +3267,7 @@ static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, if (err) return -EINVAL; - else if (ctrl_loss_tmo < 0) + if (ctrl_loss_tmo < 0) opts->max_reconnects = -1; else opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, @@ -3644,8 +3283,8 @@ static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, 
struct nvme_ctrl *ctrl = dev_get_drvdata(dev); if (ctrl->opts->reconnect_delay == -1) - return sprintf(buf, "off\n"); - return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay); + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay); } static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, @@ -3665,6 +3304,36 @@ static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); +static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->fast_io_fail_tmo == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo); +} + +static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int fast_io_fail_tmo, err; + + err = kstrtoint(buf, 10, &fast_io_fail_tmo); + if (err) + return -EINVAL; + + if (fast_io_fail_tmo < 0) + opts->fast_io_fail_tmo = -1; + else + opts->fast_io_fail_tmo = fast_io_fail_tmo; + return count; +} +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3684,6 +3353,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_hostid.attr, &dev_attr_ctrl_loss_tmo.attr, &dev_attr_reconnect_delay.attr, + &dev_attr_fast_io_fail_tmo.attr, + &dev_attr_kato.attr, NULL }; @@ -3705,6 +3376,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) return 0; + if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) + return 0; return a->mode; } @@ -3727,7 +3400,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, lockdep_assert_held(&subsys->lock); list_for_each_entry(h, &subsys->nsheads, entry) { - if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) + if (h->ns_id == nsid && nvme_tryget_ns_head(h)) return h; } @@ -3750,6 +3423,66 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, return 0; } +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) +{ + cdev_device_del(cdev, cdev_device); + ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(cdev_device->devt)); +} + +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner) +{ + int minor, ret; + + minor = ida_simple_get(&nvme_ns_chr_minor_ida, 0, 0, GFP_KERNEL); + if (minor < 0) + return minor; + cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); + cdev_device->class = nvme_ns_chr_class; + device_initialize(cdev_device); + cdev_init(cdev, fops); + cdev->owner = owner; + ret = cdev_device_add(cdev, cdev_device); + if (ret) + ida_simple_remove(&nvme_ns_chr_minor_ida, minor); + return ret; +} + +static int nvme_ns_chr_open(struct inode *inode, struct file *file) +{ + return nvme_ns_open(container_of(inode->i_cdev, struct nvme_ns, cdev)); +} + +static int nvme_ns_chr_release(struct inode *inode, struct file *file) +{ + nvme_ns_release(container_of(inode->i_cdev, struct nvme_ns, cdev)); + return 0; +} + +static const struct file_operations 
nvme_ns_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_chr_open, + .release = nvme_ns_chr_release, + .unlocked_ioctl = nvme_ns_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_cdev(struct nvme_ns *ns) +{ + int ret; + + ns->cdev_device.parent = ns->ctrl->device; + ret = dev_set_name(&ns->cdev_device, "ng%dn%d", + ns->ctrl->instance, ns->head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&ns->cdev, &ns->cdev_device, &nvme_ns_chr_fops, + ns->ctrl->ops->module); + if (ret) + kfree_const(ns->cdev_device.kobj.name); + return ret; +} + static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { @@ -3890,8 +3623,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns *ns; struct gendisk *disk; struct nvme_id_ns *id; - char disk_name[DISK_NAME_LEN]; - int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT; + int node = ctrl->numa_node; if (nvme_identify_ns(ctrl, nsid, ids, &id)) return; @@ -3917,7 +3649,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (nvme_init_ns_head(ns, nsid, ids, id->nmic & NVME_NS_NMIC_SHARED)) goto out_free_queue; - nvme_set_disk_name(disk_name, ns, ctrl, &flags); disk = alloc_disk_node(0, node); if (!disk) @@ -3926,15 +3657,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, disk->fops = &nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = flags; - memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + disk->flags = GENHD_FL_EXT_DEVT; + /* + * Without the multipath code enabled, multiple controller per + * subsystems are visible as devices and thus we cannot use the + * subsystem instance. + */ + if (!nvme_mpath_set_disk_name(ns, disk->disk_name, &disk->flags)) + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, + ns->head->instance); ns->disk = disk; if (nvme_update_ns_info(ns, id)) goto out_put_disk; if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { - if (nvme_nvm_register(ns, disk_name, node)) { + if (nvme_nvm_register(ns, disk->disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); goto out_put_disk; } @@ -3947,6 +3685,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, nvme_get_ctrl(ctrl); device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); + if (!nvme_ns_head_multipath(ns->head)) + nvme_add_ns_cdev(ns); nvme_mpath_add_disk(ns, id); nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); @@ -3991,6 +3731,8 @@ static void nvme_ns_remove(struct nvme_ns *ns) synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ if (ns->disk->flags & GENHD_FL_UP) { + if (!nvme_ns_head_multipath(ns->head)) + nvme_cdev_del(&ns->cdev, &ns->cdev_device); del_gendisk(ns->disk); blk_cleanup_queue(ns->queue); if (blk_get_integrity(ns->disk)) @@ -4735,6 +4477,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_nvm) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); @@ -4780,8 +4523,24 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_subsys_class); goto destroy_class; } + + result = alloc_chrdev_region(&nvme_ns_chr_devt, 
0, NVME_MINORS, + "nvme-generic"); + if (result < 0) + goto destroy_subsys_class; + + nvme_ns_chr_class = class_create(THIS_MODULE, "nvme-generic"); + if (IS_ERR(nvme_ns_chr_class)) { + result = PTR_ERR(nvme_ns_chr_class); + goto unregister_generic_ns; + } + return 0; +unregister_generic_ns: + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); +destroy_subsys_class: + class_destroy(nvme_subsys_class); destroy_class: class_destroy(nvme_class); unregister_chrdev: @@ -4798,12 +4557,15 @@ out: static void __exit nvme_core_exit(void) { + class_destroy(nvme_ns_chr_class); class_destroy(nvme_subsys_class); class_destroy(nvme_class); + unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); + ida_destroy(&nvme_ns_chr_minor_ida); ida_destroy(&nvme_instance_ida); } diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 604ab0e5a2ad..13c2747e3d00 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -379,10 +379,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) /* * Set keep-alive timeout in seconds granularity (ms * 1000) - * and add a grace period for controller kato enforcement */ - cmd.connect.kato = ctrl->kato ? - cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0; + cmd.connect.kato = cpu_to_le32(ctrl->kato * 1000); if (ctrl->opts->disable_sqflow) cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6ffa8de2a0d7..9b9b7be0f412 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1708,7 +1708,7 @@ restart: * * If this routine returns error, the LLDD should abort the exchange. * - * @remoteport: pointer to the (registered) remote port that the LS + * @portptr: pointer to the (registered) remote port that the LS * was received from. The remoteport is associated with * a specific localport. 
* @lsrsp: pointer to a nvmefc_ls_rsp response structure to be @@ -2128,6 +2128,7 @@ nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq, op->op.fcp_req.first_sgl = op->sgl; op->op.fcp_req.private = &op->priv[0]; nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &op->op.cmd_iu.sqe; return res; } @@ -2759,8 +2760,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl = queue->ctrl; struct request *rq = bd->rq; struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; - struct nvme_command *sqe = &cmdiu->sqe; enum nvmefc_fcp_datadir io_dir; bool queue_ready = test_bit(NVME_FC_Q_LIVE, &queue->flags); u32 data_len; @@ -2770,7 +2769,7 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx, !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); - ret = nvme_setup_cmd(ns, rq, sqe); + ret = nvme_setup_cmd(ns, rq); if (ret) return ret; @@ -3086,7 +3085,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - ret = nvme_init_identify(&ctrl->ctrl); + ret = nvme_init_ctrl_finish(&ctrl->ctrl); if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) goto out_disconnect_admin_queue; @@ -3100,6 +3099,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) } /* FC-NVME supports normal SGL Data Block Descriptors */ + if (!(ctrl->ctrl.sgls & ((1 << 0) | (1 << 1)))) { + dev_err(ctrl->ctrl.device, + "Mandatory sgls are not supported!\n"); + goto out_disconnect_admin_queue; + } if (opts->queue_size > ctrl->ctrl.maxcmd) { /* warn if maxcmd is lower than queue_size */ diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c new file mode 100644 index 000000000000..502f8e4a2a1f --- /dev/null +++ b/drivers/nvme/host/ioctl.c @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2011-2014, Intel Corporation. + * Copyright (c) 2017-2021 Christoph Hellwig. + */ +#include <linux/ptrace.h> /* for force_successful_syscall_return */ +#include <linux/nvme_ioctl.h> +#include "nvme.h" + +/* + * Convert integer values from ioctl structures to user pointers, silently + * ignoring the upper bits in the compat case to match behaviour of 32-bit + * kernels. + */ +static void __user *nvme_to_user_ptr(uintptr_t ptrval) +{ + if (in_compat_syscall()) + ptrval = (compat_uptr_t)ptrval; + return (void __user *)ptrval; +} + +static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf, + unsigned len, u32 seed, bool write) +{ + struct bio_integrity_payload *bip; + int ret = -ENOMEM; + void *buf; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + goto out; + + ret = -EFAULT; + if (write && copy_from_user(buf, ubuf, len)) + goto out_free_meta; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = len; + bip->bip_iter.bi_sector = seed; + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret == len) + return buf; + ret = -ENOMEM; +out_free_meta: + kfree(buf); +out: + return ERR_PTR(ret); +} + +static int nvme_submit_user_cmd(struct request_queue *q, + struct nvme_command *cmd, void __user *ubuffer, + unsigned bufflen, void __user *meta_buffer, unsigned meta_len, + u32 meta_seed, u64 *result, unsigned timeout) +{ + bool write = nvme_is_write(cmd); + struct nvme_ns *ns = q->queuedata; + struct block_device *bdev = ns ? 
ns->disk->part0 : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + if (timeout) + req->timeout = timeout; + nvme_req(req)->flags |= NVME_REQ_USERCMD; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + if (bdev) + bio_set_dev(bio, bdev); + if (bdev && meta_buffer && meta_len) { + meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, + meta_seed, write); + if (IS_ERR(meta)) { + ret = PTR_ERR(meta); + goto out_unmap; + } + req->cmd_flags |= REQ_INTEGRITY; + } + } + + nvme_execute_passthru_rq(req); + if (nvme_req(req)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + else + ret = nvme_req(req)->status; + if (result) + *result = le64_to_cpu(nvme_req(req)->result.u64); + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + kfree(meta); + out_unmap: + if (bio) + blk_rq_unmap_user(bio); + out: + blk_mq_free_request(req); + return ret; +} + + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + if (io.flags) + return -EINVAL; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + + if ((io.control & NVME_RW_PRINFO_PRACT) && + ns->ms == sizeof(struct t10_pi_tuple)) { + /* + * Protection information is stripped/inserted by the + * controller. + */ + if (nvme_to_user_ptr(io.metadata)) + return -EINVAL; + meta_len = 0; + metadata = NULL; + } else { + meta_len = (io.nblocks + 1) * ns->ms; + metadata = nvme_to_user_ptr(io.metadata); + } + + if (ns->features & NVME_NS_EXT_LBAS) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->head->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return nvme_submit_user_cmd(ns->queue, &c, + nvme_to_user_ptr(io.addr), length, + metadata, meta_len, lower_32_bits(io.slba), NULL, 0); +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + u64 result; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + if (ns && cmd.nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", + current->comm, cmd.nsid, ns->head->ns_id); + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + 
c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &result, timeout); + + if (status >= 0) { + if (put_user(result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd64 __user *ucmd) +{ + struct nvme_passthru_cmd64 cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + if (cmd.flags) + return -EINVAL; + if (ns && cmd.nsid != ns->head->ns_id) { + dev_err(ctrl->device, + "%s: nsid (%u) in cmd does not match nsid (%u) of namespace\n", + current->comm, cmd.nsid, ns->head->ns_id); + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10 = cpu_to_le32(cmd.cdw10); + c.common.cdw11 = cpu_to_le32(cmd.cdw11); + c.common.cdw12 = cpu_to_le32(cmd.cdw12); + c.common.cdw13 = cpu_to_le32(cmd.cdw13); + c.common.cdw14 = cpu_to_le32(cmd.cdw14); + c.common.cdw15 = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + nvme_to_user_ptr(cmd.addr), cmd.data_len, + nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, + 0, &cmd.result, timeout); + + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static bool is_ctrl_ioctl(unsigned int cmd) +{ + if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD) + return true; + if (is_sed_ioctl(cmd)) + return true; + return false; +} + +static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + default: + return sed_ioctl(ctrl->opal_dev, cmd, argp); + } +} + +#ifdef COMPAT_FOR_U64_ALIGNMENT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) +#endif /* COMPAT_FOR_U64_ALIGNMENT */ + +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, + void __user *argp) +{ + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->head->ns_id; + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, argp); + /* + * struct nvme_user_io can have different padding on some 32-bit ABIs. + * Just accept the compat version as all fields that are used are the + * same size and at the same offset. 
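+ * (On 32-bit x86, for example, __u64 is only 4-byte aligned, so
+ * struct nvme_user_io loses its tail padding; the smaller sizeof()
+ * yields a different computed ioctl number, which the packed
+ * nvme_user_io32 above reproduces.)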
+ */ +#ifdef COMPAT_FOR_U64_ALIGNMENT + case NVME_IOCTL_SUBMIT_IO32: +#endif + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, argp); + case NVME_IOCTL_IO64_CMD: + return nvme_user_cmd64(ns->ctrl, ns, argp); + default: + if (!ns->ndev) + return -ENOTTY; + return nvme_nvm_ioctl(ns, cmd, argp); + } +} + +static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg) +{ + if (is_ctrl_ioctl(cmd)) + return nvme_ctrl_ioctl(ns->ctrl, cmd, arg); + return nvme_ns_ioctl(ns, cmd, arg); +} + +int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + return __nvme_ioctl(ns, cmd, (void __user *)arg); +} + +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = + container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); + + return __nvme_ioctl(ns, cmd, (void __user *)arg); +} + +#ifdef CONFIG_NVME_MULTIPATH +static int nvme_ns_head_ctrl_ioctl(struct nvme_ns_head *head, + unsigned int cmd, void __user *argp) +{ + struct nvme_ctrl *ctrl = nvme_find_get_live_ctrl(head->subsys); + int ret; + + if (IS_ERR(ctrl)) + return PTR_ERR(ctrl); + ret = nvme_ctrl_ioctl(ctrl, cmd, argp); + nvme_put_ctrl(ctrl); + return ret; +} + +static int nvme_ns_head_ns_ioctl(struct nvme_ns_head *head, + unsigned int cmd, void __user *argp) +{ + int srcu_idx = srcu_read_lock(&head->srcu); + struct nvme_ns *ns = nvme_find_path(head); + int ret = -EWOULDBLOCK; + + if (ns) + ret = nvme_ns_ioctl(ns, cmd, argp); + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + void __user *argp = (void __user *)arg; + + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(head, cmd, argp); + return nvme_ns_head_ns_ioctl(head, cmd, argp); +} + +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct cdev *cdev = file_inode(file)->i_cdev; + struct nvme_ns_head *head = + container_of(cdev, struct nvme_ns_head, cdev); + void __user *argp = (void __user *)arg; + + if (is_ctrl_ioctl(cmd)) + return nvme_ns_head_ctrl_ioctl(head, cmd, argp); + return nvme_ns_head_ns_ioctl(head, cmd, argp); +} +#endif /* CONFIG_NVME_MULTIPATH */ + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + down_read(&ctrl->namespaces_rwsem); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->device, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->device, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + up_read(&ctrl->namespaces_rwsem); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + up_read(&ctrl->namespaces_rwsem); + return ret; +} + +long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_ADMIN64_CMD: + return nvme_user_cmd64(ctrl, NULL, argp); + case 
NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->device, "resetting controller\n"); + return nvme_reset_ctrl_sync(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: + nvme_queue_scan(ctrl); + return 0; + default: + return -ENOTTY; + } +} diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index f6ca2fbb711e..e9d9ad47f70f 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -930,15 +930,15 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, return ret; } -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp) { switch (cmd) { case NVME_NVM_IOCTL_ADMIN_VIO: - return nvme_nvm_user_vcmd(ns, 1, (void __user *)arg); + return nvme_nvm_user_vcmd(ns, 1, argp); case NVME_NVM_IOCTL_IO_VIO: - return nvme_nvm_user_vcmd(ns, 0, (void __user *)arg); + return nvme_nvm_user_vcmd(ns, 0, argp); case NVME_NVM_IOCTL_SUBMIT_VIO: - return nvme_nvm_submit_vio(ns, (void __user *)arg); + return nvme_nvm_submit_vio(ns, argp); default: return -ENOTTY; } @@ -1240,7 +1240,7 @@ static struct attribute *nvm_dev_attrs[] = { static umode_t nvm_dev_attrs_visible(struct kobject *kobj, struct attribute *attr, int index) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct gendisk *disk = dev_to_disk(dev); struct nvme_ns *ns = disk->private_data; struct nvm_dev *ndev = ns->ndev; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a1d476e1ac02..0d0de3433f37 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -50,19 +50,19 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) * and those that have a single controller and use the controller node * directly. 
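 * (With multipath enabled, the subsystem instance owns the public
 * nvme%dn%d name and each per-controller path becomes a hidden
 * nvme%dc%dn%d node, as the rewritten helper below spells out.)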
*/ -void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags) -{ - if (!multipath) { - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); - } else if (ns->head->disk) { - sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, - ctrl->instance, ns->head->instance); - *flags = GENHD_FL_HIDDEN; - } else { - sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, - ns->head->instance); +bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags) +{ + if (!multipath) + return false; + if (!ns->head->disk) { + sprintf(disk_name, "nvme%dn%d", ns->ctrl->subsys->instance, + ns->head->instance); + return true; } + sprintf(disk_name, "nvme%dc%dn%d", ns->ctrl->subsys->instance, + ns->ctrl->instance, ns->head->instance); + *flags = GENHD_FL_HIDDEN; + return true; } void nvme_failover_req(struct request *req) @@ -294,7 +294,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) return false; } -blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) +static blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) { struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; struct device *dev = disk_to_dev(head->disk); @@ -334,6 +334,71 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) return ret; } +static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +{ + if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) + return -ENXIO; + return 0; +} + +static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns_head(disk->private_data); +} + +const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, + .submit_bio = nvme_ns_head_submit_bio, + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ns_head_ioctl, + .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, + .pr_ops = &nvme_pr_ops, +}; + +static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev) +{ + return container_of(cdev, struct nvme_ns_head, cdev); +} + +static int nvme_ns_head_chr_open(struct inode *inode, struct file *file) +{ + if (!nvme_tryget_ns_head(cdev_to_ns_head(inode->i_cdev))) + return -ENXIO; + return 0; +} + +static int nvme_ns_head_chr_release(struct inode *inode, struct file *file) +{ + nvme_put_ns_head(cdev_to_ns_head(inode->i_cdev)); + return 0; +} + +static const struct file_operations nvme_ns_head_chr_fops = { + .owner = THIS_MODULE, + .open = nvme_ns_head_chr_open, + .release = nvme_ns_head_chr_release, + .unlocked_ioctl = nvme_ns_head_chr_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) +{ + int ret; + + head->cdev_device.parent = &head->subsys->dev; + ret = dev_set_name(&head->cdev_device, "ng%dn%d", + head->subsys->instance, head->instance); + if (ret) + return ret; + ret = nvme_cdev_add(&head->cdev, &head->cdev_device, + &nvme_ns_head_chr_fops, THIS_MODULE); + if (ret) + kfree_const(head->cdev_device.kobj.name); + return ret; +} + static void nvme_requeue_work(struct work_struct *work) { struct nvme_ns_head *head = @@ -412,9 +477,11 @@ static void nvme_mpath_set_live(struct nvme_ns *ns) if (!head->disk) return; - if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) + if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { device_add_disk(&head->subsys->dev, head->disk, nvme_ns_id_attr_groups); + nvme_add_ns_head_cdev(head); + } mutex_lock(&head->lock); if (nvme_path_is_optimized(ns)) { @@ -602,8 +669,8 @@ static ssize_t nvme_subsys_iopolicy_show(struct device 
*dev, struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - return sprintf(buf, "%s\n", - nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); + return sysfs_emit(buf, "%s\n", + nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]); } static ssize_t nvme_subsys_iopolicy_store(struct device *dev, @@ -628,7 +695,7 @@ SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR, static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); + return sysfs_emit(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid); } DEVICE_ATTR_RO(ana_grpid); @@ -637,7 +704,7 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr, { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); + return sysfs_emit(buf, "%s\n", nvme_ana_state_names[ns->ana_state]); } DEVICE_ATTR_RO(ana_state); @@ -668,9 +735,13 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) if (desc.state) { /* found the group desc: update */ nvme_update_ns_ana_state(&desc, ns); + } else { + /* group desc not found: trigger a re-read */ + set_bit(NVME_NS_ANA_PENDING, &ns->flags); + queue_work(nvme_wq, &ns->ctrl->ana_work); } } else { - ns->ana_state = NVME_ANA_OPTIMIZED; + ns->ana_state = NVME_ANA_OPTIMIZED; nvme_mpath_set_live(ns); } @@ -687,8 +758,10 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - if (head->disk->flags & GENHD_FL_UP) + if (head->disk->flags & GENHD_FL_UP) { + nvme_cdev_del(&head->cdev, &head->cdev_device); del_gendisk(head->disk); + } blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); @@ -758,4 +831,3 @@ void nvme_mpath_uninit(struct nvme_ctrl *ctrl) kfree(ctrl->ana_log_buf); ctrl->ana_log_buf = NULL; } - diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 07b34175c6ce..773dde5b231d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -27,7 +27,6 @@ extern unsigned int admin_timeout; #define NVME_ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 -#define NVME_KATO_GRACE 10 #ifdef CONFIG_ARCH_NO_SG_CHAIN #define NVME_INLINE_SG_CNT 0 @@ -276,6 +275,9 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; + u32 max_discard_sectors; + u32 max_discard_segments; + u32 max_zeroes_sectors; #ifdef CONFIG_BLK_DEV_ZONED u32 max_zone_append; #endif @@ -410,8 +412,12 @@ struct nvme_ns_head { bool shared; int instance; struct nvme_effects_log *effects; -#ifdef CONFIG_NVME_MULTIPATH + + struct cdev cdev; + struct device cdev_device; + struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH struct bio_list requeue_list; spinlock_t requeue_lock; struct work_struct requeue_work; @@ -422,6 +428,11 @@ struct nvme_ns_head { #endif }; +static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head) +{ + return IS_ENABLED(CONFIG_NVME_MULTIPATH) && head->disk; +} + enum nvme_ns_features { NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */ NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */ @@ -457,6 +468,9 @@ struct nvme_ns { #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 + struct cdev cdev; + struct device cdev_device; + struct nvme_fault_inject fault_inject; }; @@ -599,7 +613,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); void 
nvme_start_ctrl(struct nvme_ctrl *ctrl); void nvme_stop_ctrl(struct nvme_ctrl *ctrl); -int nvme_init_identify(struct nvme_ctrl *ctrl); +int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl); void nvme_remove_namespaces(struct nvme_ctrl *ctrl); @@ -623,8 +637,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl); struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, blk_mq_req_flags_t flags); void nvme_cleanup_cmd(struct request *req); -blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, - struct nvme_command *cmd); +blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, @@ -640,16 +653,34 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid, int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); - +void nvme_queue_scan(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, int *srcu_idx); void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); +bool nvme_tryget_ns_head(struct nvme_ns_head *head); +void nvme_put_ns_head(struct nvme_ns_head *head); +struct nvme_ctrl *nvme_find_get_live_ctrl(struct nvme_subsystem *subsys); +int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, + const struct file_operations *fops, struct module *owner); +void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device); +int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg); +int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo); extern const struct attribute_group *nvme_ns_id_attr_groups[]; +extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH @@ -661,8 +692,7 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) void nvme_mpath_unfreeze(struct nvme_subsystem *subsys); void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); -void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags); +bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, int *flags); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); @@ -674,7 +704,6 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); -blk_qc_t nvme_ns_head_submit_bio(struct bio *bio); static 
inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { @@ -701,16 +730,11 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) { return false; } -/* - * Without the multipath code enabled, multiple controller per subsystems are - * visible as devices and thus we cannot use the subsystem instance. - */ -static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, - struct nvme_ctrl *ctrl, int *flags) +static inline bool nvme_mpath_set_disk_name(struct nvme_ns *ns, char *disk_name, + int *flags) { - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); + return false; } - static inline void nvme_failover_req(struct request *req) { } @@ -745,7 +769,7 @@ static inline void nvme_trace_bio_complete(struct request *req) static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { - if (ctrl->subsys->cmic & (1 << 3)) + if (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) dev_warn(ctrl->device, "Please enable CONFIG_NVME_MULTIPATH for full support of multi-port devices.\n"); return 0; @@ -798,7 +822,7 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); extern const struct attribute_group nvme_nvm_attr_group; -int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg); +int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp); #else static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node) @@ -808,7 +832,7 @@ static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, - unsigned long arg) + void __user *argp) { return -ENOTTY; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7249ae74f71f..09d4c5f99fc3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -224,6 +224,7 @@ struct nvme_queue { */ struct nvme_iod { struct nvme_request req; + struct nvme_command cmd; struct nvme_queue *nvmeq; bool use_sgl; int aborted; @@ -429,6 +430,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, iod->nvmeq = nvmeq; nvme_req(req)->ctrl = &dev->ctrl; + nvme_req(req)->cmd = &iod->cmd; return 0; } @@ -852,7 +854,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); - if (iod->nvmeq->qid && + if (iod->nvmeq->qid && sgl_threshold && dev->ctrl.sgls & ((1 << 0) | (1 << 1))) return nvme_setup_sgl_simple(dev, req, &cmnd->rw, &bv); @@ -917,7 +919,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_command cmnd; + struct nvme_command *cmnd = &iod->cmd; blk_status_t ret; iod->aborted = 0; @@ -931,24 +933,24 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags))) return BLK_STS_IOERR; - ret = nvme_setup_cmd(ns, req, &cmnd); + ret = nvme_setup_cmd(ns, req); if (ret) return ret; if (blk_rq_nr_phys_segments(req)) { - ret = nvme_map_data(dev, req, &cmnd); + ret = nvme_map_data(dev, req, cmnd); if (ret) goto out_free_cmd; } if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req, &cmnd); + ret = nvme_map_metadata(dev, req, cmnd); if (ret) goto out_unmap_data; } 
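	/* the SQE is now built in the per-request PDU (iod->cmd) instead
	 * of on this function's stack, so nvme_req(req)->cmd can point at
	 * it for the lifetime of the request */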
blk_mq_start_request(req); - nvme_submit_cmd(nvmeq, &cmnd, bd->last); + nvme_submit_cmd(nvmeq, cmnd, bd->last); return BLK_STS_OK; out_unmap_data: nvme_unmap_data(dev, req); @@ -1060,18 +1062,10 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq) static irqreturn_t nvme_irq(int irq, void *data) { struct nvme_queue *nvmeq = data; - irqreturn_t ret = IRQ_NONE; - /* - * The rmb/wmb pair ensures we see all updates from a previous run of - * the irq handler, even if that was on another CPU. - */ - rmb(); if (nvme_process_cq(nvmeq)) - ret = IRQ_HANDLED; - wmb(); - - return ret; + return IRQ_HANDLED; + return IRQ_NONE; } static irqreturn_t nvme_irq_check(int irq, void *data) @@ -2178,7 +2172,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - + clear_bit(NVMEQ_ENABLED, &adminq->flags); if (dev->cmb_use_sqes) { @@ -2653,7 +2647,7 @@ static void nvme_reset_work(struct work_struct *work) */ dev->ctrl.max_integrity_segments = 1; - result = nvme_init_identify(&dev->ctrl); + result = nvme_init_ctrl_finish(&dev->ctrl); if (result) goto out; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index be905d4fdb47..660c774fa9e1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -314,6 +314,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, NVME_RDMA_DATA_SGL_SIZE; req->queue = queue; + nvme_req(rq)->cmd = req->sqe.data; return 0; } @@ -920,7 +921,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - error = nvme_init_identify(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl); if (error) goto out_quiesce_queue; @@ -2041,7 +2042,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_qe *sqe = &req->sqe; - struct nvme_command *c = sqe->data; + struct nvme_command *c = nvme_req(rq)->cmd; struct ib_device *dev; bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); blk_status_t ret; @@ -2064,7 +2065,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); - ret = nvme_setup_cmd(ns, rq, c); + ret = nvme_setup_cmd(ns, rq); if (ret) goto unmap_qe; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a0f00cb8f9f3..75435cdb156c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -417,6 +417,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, { struct nvme_tcp_ctrl *ctrl = set->driver_data; struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu; int queue_idx = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; u8 hdgst = nvme_tcp_hdgst_len(queue); @@ -427,8 +428,10 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, if (!req->pdu) return -ENOMEM; + pdu = req->pdu; req->queue = queue; nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &pdu->cmd; return 0; } @@ -874,7 +877,7 @@ static void nvme_tcp_state_change(struct sock *sk) { struct nvme_tcp_queue *queue; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (!queue) goto done; @@ -895,7 +898,7 @@ static void nvme_tcp_state_change(struct sock *sk) queue->state_change(sk); done: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) @@ -1885,7 +1888,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) blk_mq_unquiesce_queue(ctrl->admin_q); - error = nvme_init_identify(ctrl); + error = nvme_init_ctrl_finish(ctrl); if (error) goto out_quiesce_queue; @@ -1973,6 +1976,11 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) goto destroy_admin; } + if (!(ctrl->sgls & ((1 << 0) | (1 << 1)))) { + dev_err(ctrl->device, "Mandatory sgls are not supported!\n"); + goto destroy_admin; + } + if (opts->queue_size > ctrl->sqsize + 1) dev_warn(ctrl->device, "queue_size %zu > ctrl sqsize %u, clamping down\n", @@ -2269,7 +2277,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; blk_status_t ret; - ret = nvme_setup_cmd(ns, rq, &pdu->cmd); + ret = nvme_setup_cmd(ns, rq); if (ret) return ret; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index bc2f344f0ae0..475dd45c3db4 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -96,7 +96,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) dev_warn(ns->ctrl->device, "zone operations:%x not supported for namespace:%u\n", le16_to_cpu(id->zoc), ns->head->ns_id); - status = -EINVAL; + status = -ENODEV; goto free_data; } @@ -105,7 +105,7 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) dev_warn(ns->ctrl->device, "invalid zone size:%llu for namespace:%u\n", ns->zsze, ns->head->ns_id); - status = -EINVAL; + status = -ENODEV; goto free_data; } diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index fe6b8aa90b53..d2a26ff3f7b3 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -513,7 +513,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) default: id->nuse = id->nsze; break; - } + } if (req->ns->bdev) nvmet_bdev_set_limits(req->ns->bdev, id); @@ -919,15 +919,21 @@ void nvmet_execute_async_event(struct nvmet_req *req) void nvmet_execute_keep_alive(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; if (!nvmet_check_transfer_len(req, 0)) return; + if (!ctrl->kato) { + status = NVME_SC_KA_TIMEOUT_INVALID; + goto out; + } + pr_debug("ctrl %d update keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); - mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); - nvmet_req_complete(req, 0); +out: + nvmet_req_complete(req, status); } u16 nvmet_parse_admin_cmd(struct nvmet_req *req) @@ -940,7 +946,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (nvmet_req_subsys(req)->type == NVME_NQN_DISC) return nvmet_parse_discovery_cmd(req); - ret = nvmet_check_ctrl_status(req, cmd); + ret = nvmet_check_ctrl_status(req); if 
(unlikely(ret)) return ret; diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index e5dbd1923b7b..65a0cf99f557 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1149,6 +1149,12 @@ static ssize_t nvmet_subsys_attr_model_store_locked(struct nvmet_subsys *subsys, if (!len) return -EINVAL; + if (len > NVMET_MN_MAX_SIZE) { + pr_err("Model number size can not exceed %d Bytes\n", + NVMET_MN_MAX_SIZE); + return -EINVAL; + } + for (pos = 0; pos < len; pos++) { if (!nvmet_is_ascii(page[pos])) return -EINVAL; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index a027433b8be8..25cc2ee8de3f 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -864,10 +864,9 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req) static u16 nvmet_parse_io_cmd(struct nvmet_req *req) { - struct nvme_command *cmd = req->cmd; u16 ret; - ret = nvmet_check_ctrl_status(req, cmd); + ret = nvmet_check_ctrl_status(req); if (unlikely(ret)) return ret; @@ -1190,19 +1189,19 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl) ctrl->cap |= NVMET_QUEUE_SIZE - 1; } -u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, - struct nvmet_req *req, struct nvmet_ctrl **ret) +struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, + const char *hostnqn, u16 cntlid, + struct nvmet_req *req) { + struct nvmet_ctrl *ctrl = NULL; struct nvmet_subsys *subsys; - struct nvmet_ctrl *ctrl; - u16 status = 0; subsys = nvmet_find_get_subsys(req->port, subsysnqn); if (!subsys) { pr_warn("connect request for invalid subsystem %s!\n", subsysnqn); req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); - return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + goto out; } mutex_lock(&subsys->lock); @@ -1215,33 +1214,34 @@ u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, if (!kref_get_unless_zero(&ctrl->ref)) continue; - *ret = ctrl; - goto out; + /* ctrl found */ + goto found; } } + ctrl = NULL; /* ctrl not found */ pr_warn("could not find controller %d for subsys %s / host %s\n", cntlid, subsysnqn, hostnqn); req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; -out: +found: mutex_unlock(&subsys->lock); nvmet_subsys_put(subsys); - return status; +out: + return ctrl; } -u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd) +u16 nvmet_check_ctrl_status(struct nvmet_req *req) { if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { pr_err("got cmd %d while CC.EN == 0 on qid = %d\n", - cmd->common.opcode, req->sq->qid); + req->cmd->common.opcode, req->sq->qid); return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n", - cmd->common.opcode, req->sq->qid); + req->cmd->common.opcode, req->sq->qid); return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } return 0; @@ -1322,10 +1322,10 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, pr_warn("connect request for invalid subsystem %s!\n", subsysnqn); req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); + req->error_loc = offsetof(struct nvme_common_command, dptr); goto out; } - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; down_read(&nvmet_config_sem); if (!nvmet_host_allowed(subsys, hostnqn)) { pr_info("connect by host %s for subsystem %s not allowed\n", @@ -1333,6 +1333,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, 
req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); up_read(&nvmet_config_sem); status = NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR; + req->error_loc = offsetof(struct nvme_common_command, dptr); goto out_put_subsystem; } up_read(&nvmet_config_sem); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 682854e0e079..4845d12e374a 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -178,12 +178,14 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) if (req->cmd->get_log_page.lid != NVME_LOG_DISC) { req->error_loc = offsetof(struct nvme_get_log_page_command, lid); - status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto out; } /* Spec requires dword aligned offsets */ if (offset & 0x3) { + req->error_loc = + offsetof(struct nvme_get_log_page_command, lpo); status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto out; } @@ -250,7 +252,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) if (req->cmd->identify.cns != NVME_ID_CNS_CTRL) { req->error_loc = offsetof(struct nvme_identify, cns); - status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; goto out; } diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 42bd12b8bf00..1420a8e3e0b1 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -190,12 +190,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, le32_to_cpu(c->kato), &ctrl); - if (status) { - if (status == (NVME_SC_INVALID_FIELD | NVME_SC_DNR)) - req->error_loc = - offsetof(struct nvme_common_command, opcode); + if (status) goto out; - } ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; @@ -222,7 +218,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) { struct nvmf_connect_command *c = &req->cmd->connect; struct nvmf_connect_data *d; - struct nvmet_ctrl *ctrl = NULL; + struct nvmet_ctrl *ctrl; u16 qid = le16_to_cpu(c->qid); u16 status = 0; @@ -249,11 +245,12 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) goto out; } - status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, - le16_to_cpu(d->cntlid), - req, &ctrl); - if (status) + ctrl = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, + le16_to_cpu(d->cntlid), req); + if (!ctrl) { + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; goto out; + } if (unlikely(qid > ctrl->subsys->max_qid)) { pr_warn("invalid queue id (%d)\n", qid); diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index d375745fc4ed..19e113240fff 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1021,60 +1021,75 @@ nvmet_fc_free_hostport(struct nvmet_fc_hostport *hostport) } static struct nvmet_fc_hostport * +nvmet_fc_match_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) +{ + struct nvmet_fc_hostport *host; + + lockdep_assert_held(&tgtport->lock); + + list_for_each_entry(host, &tgtport->host_list, host_list) { + if (host->hosthandle == hosthandle && !host->invalid) { + if (nvmet_fc_hostport_get(host)) + return (host); + } + } + + return NULL; +} + +static struct nvmet_fc_hostport * nvmet_fc_alloc_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) { - struct nvmet_fc_hostport *newhost, *host, *match = NULL; + struct nvmet_fc_hostport *newhost, *match = NULL; unsigned long flags; /* if LLDD not implemented, leave as NULL */ if (!hosthandle) return 
NULL; - /* take reference for what will be the newly allocated hostport */ + /* + * take reference for what will be the newly allocated hostport if + * we end up using a new allocation + */ if (!nvmet_fc_tgtport_get(tgtport)) return ERR_PTR(-EINVAL); + spin_lock_irqsave(&tgtport->lock, flags); + match = nvmet_fc_match_hostport(tgtport, hosthandle); + spin_unlock_irqrestore(&tgtport->lock, flags); + + if (match) { + /* no new allocation - release reference */ + nvmet_fc_tgtport_put(tgtport); + return match; + } + newhost = kzalloc(sizeof(*newhost), GFP_KERNEL); if (!newhost) { - spin_lock_irqsave(&tgtport->lock, flags); - list_for_each_entry(host, &tgtport->host_list, host_list) { - if (host->hosthandle == hosthandle && !host->invalid) { - if (nvmet_fc_hostport_get(host)) { - match = host; - break; - } - } - } - spin_unlock_irqrestore(&tgtport->lock, flags); - /* no allocation - release reference */ + /* no new allocation - release reference */ nvmet_fc_tgtport_put(tgtport); - return (match) ? match : ERR_PTR(-ENOMEM); + return ERR_PTR(-ENOMEM); } - newhost->tgtport = tgtport; - newhost->hosthandle = hosthandle; - INIT_LIST_HEAD(&newhost->host_list); - kref_init(&newhost->ref); - spin_lock_irqsave(&tgtport->lock, flags); - list_for_each_entry(host, &tgtport->host_list, host_list) { - if (host->hosthandle == hosthandle && !host->invalid) { - if (nvmet_fc_hostport_get(host)) { - match = host; - break; - } - } - } + match = nvmet_fc_match_hostport(tgtport, hosthandle); if (match) { + /* new allocation not needed */ kfree(newhost); - newhost = NULL; - /* releasing allocation - release reference */ + newhost = match; + /* no new allocation - release reference */ nvmet_fc_tgtport_put(tgtport); - } else + } else { + newhost->tgtport = tgtport; + newhost->hosthandle = hosthandle; + INIT_LIST_HEAD(&newhost->host_list); + kref_init(&newhost->ref); + list_add_tail(&newhost->host_list, &tgtport->host_list); + } spin_unlock_irqrestore(&tgtport->lock, flags); - return (match) ? match : newhost; + return newhost; } static void @@ -1996,6 +2011,7 @@ nvmet_fc_handle_ls_rqst_work(struct work_struct *work) * * @target_port: pointer to the (registered) target port the LS was * received on. + * @hosthandle: pointer to the host specific data, gets stored in iod. * @lsrsp: pointer to a lsrsp structure to be used to reference * the exchange corresponding to the LS. * @lsreqbuf: pointer to the buffer containing the LS Request diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 3e189e753bcf..6665da3b634f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -141,7 +141,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready)) return nvmf_fail_nonready_command(&queue->ctrl->ctrl, req); - ret = nvme_setup_cmd(ns, req, &iod->cmd); + ret = nvme_setup_cmd(ns, req); if (ret) return ret; @@ -205,8 +205,10 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set, unsigned int numa_node) { struct nvme_loop_ctrl *ctrl = set->driver_data; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); nvme_req(req)->ctrl = &ctrl->ctrl; + nvme_req(req)->cmd = &iod->cmd; return nvme_loop_init_iod(ctrl, blk_mq_rq_to_pdu(req), (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0); } @@ -396,7 +398,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - error = nvme_init_identify(&ctrl->ctrl); + error = nvme_init_ctrl_finish(&ctrl->ctrl); if (error) goto out_cleanup_queue; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 4b84edb49f22..5566ed403576 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -27,6 +27,7 @@ #define NVMET_ERROR_LOG_SLOTS 128 #define NVMET_NO_ERROR_LOC ((u16)-1) #define NVMET_DEFAULT_CTRL_MODEL "Linux" +#define NVMET_MN_MAX_SIZE 40 /* * Supported optional AENs: @@ -428,10 +429,11 @@ void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); -u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, - struct nvmet_req *req, struct nvmet_ctrl **ret); +struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, + const char *hostnqn, u16 cntlid, + struct nvmet_req *req); void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); -u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd); +u16 nvmet_check_ctrl_status(struct nvmet_req *req); struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, enum nvme_subsys_type type); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index d658c6e8263a..f9f34f6caf5e 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -29,6 +29,16 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority"); +/* Define a time period (in usecs) that io_work() shall sample an activated + * queue before determining it to be idle. This optional module behavior + * can enable NIC solutions that support socket optimized packet processing + * using advanced interrupt moderation techniques. + */ +static int idle_poll_period_usecs; +module_param(idle_poll_period_usecs, int, 0644); +MODULE_PARM_DESC(idle_poll_period_usecs, + "nvmet tcp io_work poll till idle time period in usecs"); + #define NVMET_TCP_RECV_BUDGET 8 #define NVMET_TCP_SEND_BUDGET 8 #define NVMET_TCP_IO_WORK_BUDGET 64 @@ -119,6 +129,8 @@ struct nvmet_tcp_queue { struct ahash_request *snd_hash; struct ahash_request *rcv_hash; + unsigned long poll_end; + spinlock_t state_lock; enum nvmet_tcp_queue_state state; @@ -525,11 +537,36 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req) struct nvmet_tcp_cmd *cmd = container_of(req, struct nvmet_tcp_cmd, req); struct nvmet_tcp_queue *queue = cmd->queue; + struct nvme_sgl_desc *sgl; + u32 len; + + if (unlikely(cmd == queue->cmd)) { + sgl = &cmd->req.cmd->common.dptr.sgl; + len = le32_to_cpu(sgl->length); + + /* + * Wait for inline data before processing the response. + * Avoid using helpers, this might happen before + * nvmet_req_init is completed. 
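+	 * (i.e. defer queueing when this is a write whose in-capsule
+	 * data has not been fully received; once the data is consumed,
+	 * nvmet_tcp_execute_request() queues the response again.)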
+ */ + if (queue->rcv_state == NVMET_TCP_RECV_PDU && + len && len < cmd->req.port->inline_data_size && + nvme_is_write(cmd->req.cmd)) + return; + } llist_add(&cmd->lentry, &queue->resp_list); queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); } +static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd) +{ + if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED)) + nvmet_tcp_queue_response(&cmd->req); + else + cmd->req.execute(&cmd->req); +} + static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd) { u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); @@ -961,7 +998,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) le32_to_cpu(req->cmd->common.dptr.sgl.length)); nvmet_tcp_handle_req_failure(queue, queue->cmd, req); - return -EAGAIN; + return 0; } ret = nvmet_tcp_map_data(queue->cmd); @@ -1104,10 +1141,8 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) return 0; } - if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && - cmd->rbytes_done == cmd->req.transfer_len) { - cmd->req.execute(&cmd->req); - } + if (cmd->rbytes_done == cmd->req.transfer_len) + nvmet_tcp_execute_request(cmd); nvmet_prepare_receive_pdu(queue); return 0; @@ -1144,9 +1179,9 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue) goto out; } - if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) && - cmd->rbytes_done == cmd->req.transfer_len) - cmd->req.execute(&cmd->req); + if (cmd->rbytes_done == cmd->req.transfer_len) + nvmet_tcp_execute_request(cmd); + ret = 0; out: nvmet_prepare_receive_pdu(queue); @@ -1216,6 +1251,23 @@ static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue) spin_unlock(&queue->state_lock); } +static inline void nvmet_tcp_arm_queue_deadline(struct nvmet_tcp_queue *queue) +{ + queue->poll_end = jiffies + usecs_to_jiffies(idle_poll_period_usecs); +} + +static bool nvmet_tcp_check_queue_deadline(struct nvmet_tcp_queue *queue, + int ops) +{ + if (!idle_poll_period_usecs) + return false; + + if (ops) + nvmet_tcp_arm_queue_deadline(queue); + + return !time_after(jiffies, queue->poll_end); +} + static void nvmet_tcp_io_work(struct work_struct *w) { struct nvmet_tcp_queue *queue = @@ -1241,9 +1293,10 @@ static void nvmet_tcp_io_work(struct work_struct *w) } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET); /* - * We exahusted our budget, requeue our selves + * Requeue the worker if idle deadline period is in progress or any + * ops activity was recorded during the do-while loop above. */ - if (pending) + if (nvmet_tcp_check_queue_deadline(queue, ops) || pending) queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } @@ -1434,7 +1487,7 @@ static void nvmet_tcp_state_change(struct sock *sk) { struct nvmet_tcp_queue *queue; - write_lock_bh(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (!queue) goto done; @@ -1452,7 +1505,7 @@ static void nvmet_tcp_state_change(struct sock *sk) queue->idx, sk->sk_state); } done: - write_unlock_bh(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) @@ -1501,6 +1554,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) sock->sk->sk_state_change = nvmet_tcp_state_change; queue->write_space = sock->sk->sk_write_space; sock->sk->sk_write_space = nvmet_tcp_write_space; + if (idle_poll_period_usecs) + nvmet_tcp_arm_queue_deadline(queue); queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); } write_unlock_bh(&sock->sk->sk_callback_lock);
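A minimal userspace sketch of the new generic per-namespace char device wired up above (nvme_ns_chr_ioctl() behind the "ng%dn%d" node). The /dev/ng0n1 path is an assumption for illustration, and the admin passthrough needs CAP_SYS_ADMIN:

/*
 * Hedged sketch, not part of the patch: query the NSID and issue an
 * Identify Controller through the char device. Error handling kept
 * minimal; /dev/ng0n1 is assumed to exist.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	unsigned char id[4096];
	struct nvme_passthru_cmd cmd;
	int fd, nsid;

	fd = open("/dev/ng0n1", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* NVME_IOCTL_ID reports the NSID as the syscall return value */
	nsid = ioctl(fd, NVME_IOCTL_ID);
	printf("nsid = %d\n", nsid);

	/* admin passthrough: Identify Controller (opcode 0x06, CNS 1) */
	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = 0x06;
	cmd.addr = (unsigned long)id;
	cmd.data_len = sizeof(id);
	cmd.cdw10 = 1;
	if (ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd) < 0)
		perror("NVME_IOCTL_ADMIN_CMD");
	else
		printf("model: %.40s\n", (char *)id + 24);
	return 0;
}

On the multipath head nodes the same ioctls go through nvme_ns_head_chr_ioctl(), which selects a live path first via nvme_find_path().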
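Both the FC and TCP association paths above now refuse controllers whose SGLS Identify field has neither of bits 1:0 set. A minimal restatement of that test in isolation, with a hypothetical helper name:

#include <stdbool.h>
#include <stdint.h>

/*
 * Bits 1:0 of the Identify Controller SGLS field advertise SGL
 * support; zero means no SGL support at all. Fabrics transports move
 * data with SGLs, so the host cannot fall back to PRPs here.
 */
static bool ctrl_supports_sgls(uint32_t sgls)
{
	return (sgls & ((1u << 0) | (1u << 1))) != 0;
}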
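The nvmet-tcp idle polling added above is off by default; loading the target with, say, idle_poll_period_usecs=1000 (value illustrative) keeps io_work() rescheduling itself for a millisecond past the last observed socket activity before letting the queue go idle.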