summaryrefslogtreecommitdiff
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c1178
1 files changed, 796 insertions, 382 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index dfebbf3a272a..d9529275a030 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -95,7 +95,7 @@
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
/* only define max */
-#define IORING_MAX_FIXED_FILES (1U << 15)
+#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -117,6 +117,8 @@
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
IO_REQ_CLEAN_FLAGS)
+#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
+
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
struct io_uring {
@@ -170,7 +172,7 @@ struct io_rings {
* The application needs a full memory barrier before checking
* for IORING_SQ_NEED_WAKEUP after updating the sq tail.
*/
- u32 sq_flags;
+ atomic_t sq_flags;
/*
* Runtime CQ flags
*
@@ -258,6 +260,8 @@ struct io_rsrc_put {
struct io_file_table {
struct io_fixed_file *files;
+ unsigned long *bitmap;
+ unsigned int alloc_hint;
};
struct io_rsrc_node {
@@ -282,10 +286,26 @@ struct io_rsrc_data {
bool quiesce;
};
+#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf))
struct io_buffer_list {
- struct list_head list;
- struct list_head buf_list;
+ /*
+ * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
+ * then these are classic provided buffers and ->buf_list is used.
+ */
+ union {
+ struct list_head buf_list;
+ struct {
+ struct page **buf_pages;
+ struct io_uring_buf_ring *buf_ring;
+ };
+ };
__u16 bgid;
+
+ /* below is for ring provided buffers */
+ __u16 buf_nr_pages;
+ __u16 nr_entries;
+ __u32 head;
+ __u32 mask;
};
struct io_buffer {
@@ -358,7 +378,7 @@ struct io_ev_fd {
struct rcu_head rcu;
};
-#define IO_BUFFERS_HASH_BITS 5
+#define BGID_ARRAY 64
struct io_ring_ctx {
/* const or read-mostly hot data */
@@ -367,6 +387,7 @@ struct io_ring_ctx {
struct io_rings *rings;
unsigned int flags;
+ enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
@@ -411,11 +432,14 @@ struct io_ring_ctx {
struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
+
+ struct io_buffer_list *io_bl;
+ struct xarray io_bl_xa;
+ struct list_head io_buffers_cache;
+
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct list_head *io_buffers;
- struct list_head io_buffers_cache;
struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
@@ -627,7 +651,7 @@ struct io_rw {
struct kiocb kiocb;
u64 addr;
u32 len;
- u32 flags;
+ rwf_t flags;
};
struct io_connect {
@@ -644,9 +668,9 @@ struct io_sr_msg {
void __user *buf;
};
int msg_flags;
- int bgid;
size_t len;
size_t done_io;
+ unsigned int flags;
};
struct io_open {
@@ -814,6 +838,7 @@ enum {
REQ_F_NEED_CLEANUP_BIT,
REQ_F_POLLED_BIT,
REQ_F_BUFFER_SELECTED_BIT,
+ REQ_F_BUFFER_RING_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_CREDS_BIT,
@@ -824,6 +849,7 @@ enum {
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
+ REQ_F_APOLL_MULTISHOT_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -864,6 +890,8 @@ enum {
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
+ /* buffer selected from ring, needs commit */
+ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
/* completion is deferred through io_comp_state */
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
@@ -888,6 +916,8 @@ enum {
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
/* request has already done partial IO */
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
+ /* fast poll multishot mode */
+ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
};
struct async_poll {
@@ -967,6 +997,11 @@ struct io_kiocb {
u8 opcode;
/* polled IO has completed */
u8 iopoll_completed;
+ /*
+ * Can be either a fixed buffer index, or used with provided buffers.
+ * For the latter, before issue it points to the buffer group ID,
+ * and after selection it points to the buffer ID itself.
+ */
u16 buf_index;
unsigned int flags;
@@ -976,14 +1011,26 @@ struct io_kiocb {
struct task_struct *task;
struct io_rsrc_node *rsrc_node;
- /* store used ubuf, so we can prevent reloading */
- struct io_mapped_ubuf *imu;
+
+ union {
+ /* store used ubuf, so we can prevent reloading */
+ struct io_mapped_ubuf *imu;
+
+ /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+ struct io_buffer *kbuf;
+
+ /*
+ * stores buffer ID for ring provided buffers, valid IFF
+ * REQ_F_BUFFER_RING is set.
+ */
+ struct io_buffer_list *buf_list;
+ };
union {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
/* cache ->apoll->events */
- int apoll_events;
+ __poll_t apoll_events;
};
atomic_t refs;
atomic_t poll_refs;
@@ -994,8 +1041,6 @@ struct io_kiocb {
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
- struct io_buffer *kbuf;
/* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
struct io_kiocb *link;
/* custom credentials, valid IFF REQ_F_CREDS is set */
@@ -1046,12 +1091,20 @@ struct io_op_def {
unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
+ /* supports ioprio */
+ unsigned ioprio : 1;
+ /* supports iopoll */
+ unsigned iopoll : 1;
/* size of async data needed, if any */
unsigned short async_size;
};
static const struct io_op_def io_op_defs[] = {
- [IORING_OP_NOP] = {},
+ [IORING_OP_NOP] = {
+ .audit_skip = 1,
+ .iopoll = 1,
+ .buffer_select = 1,
+ },
[IORING_OP_READV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
@@ -1060,6 +1113,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITEV] = {
@@ -1070,6 +1125,8 @@ static const struct io_op_def io_op_defs[] = {
.needs_async_setup = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FSYNC] = {
@@ -1082,6 +1139,8 @@ static const struct io_op_def io_op_defs[] = {
.pollin = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE_FIXED] = {
@@ -1091,6 +1150,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_POLL_ADD] = {
@@ -1133,6 +1194,7 @@ static const struct io_op_def io_op_defs[] = {
.unbound_nonreg_file = 1,
.pollin = 1,
.poll_exclusive = 1,
+ .ioprio = 1, /* used for flags */
},
[IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1,
@@ -1155,6 +1217,7 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_CLOSE] = {},
[IORING_OP_FILES_UPDATE] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_STATX] = {
.audit_skip = 1,
@@ -1166,6 +1229,8 @@ static const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_WRITE] = {
@@ -1175,6 +1240,8 @@ static const struct io_op_def io_op_defs[] = {
.pollout = 1,
.plug = 1,
.audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
.async_size = sizeof(struct io_async_rw),
},
[IORING_OP_FADVISE] = {
@@ -1209,9 +1276,11 @@ static const struct io_op_def io_op_defs[] = {
},
[IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
+ .iopoll = 1,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@@ -1229,6 +1298,7 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_LINKAT] = {},
[IORING_OP_MSG_RING] = {
.needs_file = 1,
+ .iopoll = 1,
},
[IORING_OP_FSETXATTR] = {
.needs_file = 1
@@ -1263,7 +1333,7 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
static void io_clean_op(struct io_kiocb *req);
static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
-static inline struct file *io_file_get_normal(struct io_kiocb *req, int fd);
+static struct file *io_file_get_normal(struct io_kiocb *req, int fd);
static void io_drop_inflight_file(struct io_kiocb *req);
static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags);
static void io_queue_sqe(struct io_kiocb *req);
@@ -1557,21 +1627,23 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
- struct io_buffer *kbuf = req->kbuf;
- unsigned int cflags;
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (req->buf_list)
+ req->buf_list->head++;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ } else {
+ list_add(&req->kbuf->list, list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ }
- cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
- req->flags &= ~REQ_F_BUFFER_SELECTED;
- list_add(&kbuf->list, list);
- req->kbuf = NULL;
- return cflags;
+ return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
lockdep_assert_held(&req->ctx->completion_lock);
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}
@@ -1581,7 +1653,7 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
{
unsigned int cflags;
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
/*
@@ -1596,7 +1668,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
* We migrate buffers from the comp_list to the issue cache list
* when we need one.
*/
- if (issue_flags & IO_URING_F_UNLOCKED) {
+ if (req->flags & REQ_F_BUFFER_RING) {
+ /* no buffers to recycle for this case */
+ cflags = __io_put_kbuf(req, NULL);
+ } else if (issue_flags & IO_URING_F_UNLOCKED) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
@@ -1614,15 +1689,10 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
- struct list_head *hash_list;
- struct io_buffer_list *bl;
-
- hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- list_for_each_entry(bl, hash_list, list)
- if (bl->bgid == bgid || bgid == -1U)
- return bl;
+ if (ctx->io_bl && bgid < BGID_ARRAY)
+ return &ctx->io_bl[bgid];
- return NULL;
+ return xa_load(&ctx->io_bl_xa, bgid);
}
static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
@@ -1631,11 +1701,23 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
struct io_buffer_list *bl;
struct io_buffer *buf;
- if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return;
/* don't recycle if we already did IO to this buffer */
if (req->flags & REQ_F_PARTIAL_IO)
return;
+ /*
+ * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear
+ * the flag and hence ensure that bl->head doesn't get incremented.
+ * If the tail has already been incremented, hang on to it.
+ */
+ if (req->flags & REQ_F_BUFFER_RING) {
+ if (req->buf_list) {
+ req->buf_index = req->buf_list->bgid;
+ req->flags &= ~REQ_F_BUFFER_RING;
+ }
+ return;
+ }
io_ring_submit_lock(ctx, issue_flags);
@@ -1643,7 +1725,7 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->kbuf = NULL;
+ req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
}
@@ -1728,12 +1810,14 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int i, hash_bits;
+ int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return NULL;
+ xa_init(&ctx->io_bl_xa);
+
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
@@ -1755,13 +1839,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
- ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
- sizeof(struct list_head), GFP_KERNEL);
- if (!ctx->io_buffers)
- goto err;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
- INIT_LIST_HEAD(&ctx->io_buffers[i]);
-
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1797,7 +1874,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
- kfree(ctx->io_buffers);
+ kfree(ctx->io_bl);
+ xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
return NULL;
}
@@ -2161,8 +2239,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
all_flushed = list_empty(&ctx->cq_overflow_list);
if (all_flushed) {
clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW);
+ atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
io_commit_cqring(ctx);
@@ -2256,8 +2333,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
}
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW);
+ atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
ocqe->cqe.user_data = user_data;
@@ -2388,6 +2464,8 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
+ if (res < 0)
+ req_set_fail(req);
__io_req_complete(req, 0, res, 0);
}
@@ -2638,6 +2716,8 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
{
if (!ctx)
return;
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (*locked) {
io_submit_flush_completions(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -2754,8 +2834,8 @@ static void tctx_task_work(struct callback_head *cb)
static void io_req_task_work_add(struct io_kiocb *req, bool priority)
{
struct task_struct *tsk = req->task;
+ struct io_ring_ctx *ctx = req->ctx;
struct io_uring_task *tctx = tsk->io_uring;
- enum task_work_notify_mode notify;
struct io_wq_work_node *node;
unsigned long flags;
bool running;
@@ -2778,18 +2858,11 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
if (running)
return;
- /*
- * SQPOLL kernel thread doesn't need notification, just a wakeup. For
- * all other cases, use TWA_SIGNAL unconditionally to ensure we're
- * processing task_work. There's no reliable way to tell if TWA_RESUME
- * will do the job.
- */
- notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL;
- if (likely(!task_work_add(tsk, &tctx->task_work, notify))) {
- if (notify == TWA_NONE)
- wake_up_process(tsk);
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+
+ if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method)))
return;
- }
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
@@ -3412,6 +3485,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->rw.addr = READ_ONCE(sqe->addr);
req->rw.len = READ_ONCE(sqe->len);
req->rw.flags = READ_ONCE(sqe->rw_flags);
+ /* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
return 0;
}
@@ -3560,56 +3634,96 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter,
return __io_import_fixed(req, rw, iter, imu);
}
-static void io_buffer_add_list(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned int bgid)
+static int io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
{
- struct list_head *list;
-
- list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
- INIT_LIST_HEAD(&bl->buf_list);
bl->bgid = bgid;
- list_add(&bl->list, list);
+ if (bgid < BGID_ARRAY)
+ return 0;
+
+ return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
-static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
- int bgid, unsigned int issue_flags)
+static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl)
{
- struct io_buffer *kbuf = req->kbuf;
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
+ if (!list_empty(&bl->buf_list)) {
+ struct io_buffer *kbuf;
- if (req->flags & REQ_F_BUFFER_SELECTED)
- return kbuf;
-
- io_ring_submit_lock(req->ctx, issue_flags);
-
- bl = io_buffer_get_list(ctx, bgid);
- if (bl && !list_empty(&bl->buf_list)) {
kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
req->kbuf = kbuf;
+ req->buf_index = kbuf->bid;
+ return u64_to_user_ptr(kbuf->addr);
+ }
+ return NULL;
+}
+
+static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
+ struct io_buffer_list *bl,
+ unsigned int issue_flags)
+{
+ struct io_uring_buf_ring *br = bl->buf_ring;
+ struct io_uring_buf *buf;
+ __u32 head = bl->head;
+
+ if (unlikely(smp_load_acquire(&br->tail) == head)) {
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ return NULL;
+ }
+
+ head &= bl->mask;
+ if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
+ buf = &br->bufs[head];
} else {
- kbuf = ERR_PTR(-ENOBUFS);
+ int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
+ int index = head / IO_BUFFER_LIST_BUF_PER_PAGE - 1;
+ buf = page_address(bl->buf_pages[index]);
+ buf += off;
}
+ if (*len > buf->len)
+ *len = buf->len;
+ req->flags |= REQ_F_BUFFER_RING;
+ req->buf_list = bl;
+ req->buf_index = buf->bid;
- io_ring_submit_unlock(req->ctx, issue_flags);
- return kbuf;
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ /*
+ * If we came in unlocked, we have no choice but to consume the
+ * buffer here. This does mean it'll be pinned until the IO
+ * completes. But coming in unlocked means we're in io-wq
+ * context, hence there should be no further retry. For the
+ * locked case, the caller must ensure to call the commit when
+ * the transfer completes (or if we get -EAGAIN and must poll
+ * or retry).
+ */
+ req->buf_list = NULL;
+ bl->head++;
+ }
+ return u64_to_user_ptr(buf->addr);
}
-static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
+static void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
+ unsigned int issue_flags)
{
- struct io_buffer *kbuf;
- u16 bgid;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ void __user *ret = NULL;
- bgid = req->buf_index;
- kbuf = io_buffer_select(req, len, bgid, issue_flags);
- if (IS_ERR(kbuf))
- return kbuf;
- return u64_to_user_ptr(kbuf->addr);
+ io_ring_submit_lock(req->ctx, issue_flags);
+
+ bl = io_buffer_get_list(ctx, req->buf_index);
+ if (likely(bl)) {
+ if (bl->buf_nr_pages)
+ ret = io_ring_buffer_select(req, len, bl, issue_flags);
+ else
+ ret = io_provided_buffer_select(req, len, bl);
+ }
+ io_ring_submit_unlock(req->ctx, issue_flags);
+ return ret;
}
#ifdef CONFIG_COMPAT
@@ -3619,7 +3733,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
struct compat_iovec __user *uiov;
compat_ssize_t clen;
void __user *buf;
- ssize_t len;
+ size_t len;
uiov = u64_to_user_ptr(req->rw.addr);
if (!access_ok(uiov, sizeof(*uiov)))
@@ -3630,11 +3744,12 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov,
return -EINVAL;
len = clen;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ req->rw.addr = (unsigned long) buf;
iov[0].iov_base = buf;
- iov[0].iov_len = (compat_size_t) len;
+ req->rw.len = iov[0].iov_len = (compat_size_t) len;
return 0;
}
#endif
@@ -3652,22 +3767,21 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
len = iov[0].iov_len;
if (len < 0)
return -EINVAL;
- buf = io_rw_buffer_select(req, &len, issue_flags);
- if (IS_ERR(buf))
- return PTR_ERR(buf);
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ req->rw.addr = (unsigned long) buf;
iov[0].iov_base = buf;
- iov[0].iov_len = len;
+ req->rw.len = iov[0].iov_len = len;
return 0;
}
static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
unsigned int issue_flags)
{
- if (req->flags & REQ_F_BUFFER_SELECTED) {
- struct io_buffer *kbuf = req->kbuf;
-
- iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- iov[0].iov_len = kbuf->len;
+ if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
+ iov[0].iov_base = u64_to_user_ptr(req->rw.addr);
+ iov[0].iov_len = req->rw.len;
return 0;
}
if (req->rw.len != 1)
@@ -3681,6 +3795,13 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov,
return __io_iov_buffer_select(req, iov, issue_flags);
}
+static inline bool io_do_buffer_select(struct io_kiocb *req)
+{
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ return false;
+ return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING));
+}
+
static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
struct io_rw_state *s,
unsigned int issue_flags)
@@ -3699,18 +3820,15 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req,
return NULL;
}
- /* buffer index only valid with fixed read/write, or buffer select */
- if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)))
- return ERR_PTR(-EINVAL);
-
buf = u64_to_user_ptr(req->rw.addr);
sqe_len = req->rw.len;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
- if (req->flags & REQ_F_BUFFER_SELECT) {
- buf = io_rw_buffer_select(req, &sqe_len, issue_flags);
- if (IS_ERR(buf))
- return ERR_CAST(buf);
+ if (io_do_buffer_select(req)) {
+ buf = io_buffer_select(req, &sqe_len, issue_flags);
+ if (!buf)
+ return ERR_PTR(-ENOBUFS);
+ req->rw.addr = (unsigned long) buf;
req->rw.len = sqe_len;
}
@@ -4012,6 +4130,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
return -EOPNOTSUPP;
+ kiocb->private = NULL;
kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
@@ -4276,9 +4395,7 @@ static int io_renameat_prep(struct io_kiocb *req,
struct io_rename *ren = &req->rename;
const char __user *oldf, *newf;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4315,8 +4432,6 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
ren->newpath, ren->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4337,9 +4452,6 @@ static void io_xattr_finish(struct io_kiocb *req, int ret)
req->flags &= ~REQ_F_NEED_CLEANUP;
__io_xattr_finish(req);
- if (ret < 0)
- req_set_fail(req);
-
io_req_complete(req, ret);
}
@@ -4578,10 +4690,7 @@ static int io_unlinkat_prep(struct io_kiocb *req,
struct io_unlink *un = &req->unlink;
const char __user *fname;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4615,8 +4724,6 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_unlinkat(un->dfd, un->filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4627,10 +4734,7 @@ static int io_mkdirat_prep(struct io_kiocb *req,
struct io_mkdir *mkd = &req->mkdir;
const char __user *fname;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4658,8 +4762,6 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4670,10 +4772,7 @@ static int io_symlinkat_prep(struct io_kiocb *req,
struct io_symlink *sl = &req->symlink;
const char __user *oldpath, *newpath;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index ||
- sqe->splice_fd_in)
+ if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4707,8 +4806,6 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4719,9 +4816,7 @@ static int io_linkat_prep(struct io_kiocb *req,
struct io_hardlink *lnk = &req->hardlink;
const char __user *oldf, *newf;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -4758,8 +4853,6 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
lnk->newpath, lnk->flags);
req->flags &= ~REQ_F_NEED_CLEANUP;
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -4768,9 +4861,7 @@ static int io_shutdown_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags ||
+ if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
@@ -4795,8 +4886,6 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
ret = __sys_shutdown_sock(sock, req->shutdown.how);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
#else
@@ -4810,9 +4899,6 @@ static int __io_splice_prep(struct io_kiocb *req,
struct io_splice *sp = &req->splice;
unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
-
sp->len = READ_ONCE(sqe->len);
sp->flags = READ_ONCE(sqe->splice_flags);
if (unlikely(sp->flags & ~valid_flags))
@@ -4857,7 +4943,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
done:
if (ret != sp->len)
req_set_fail(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, 0, ret, 0);
return 0;
}
@@ -4902,7 +4988,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
done:
if (ret != sp->len)
req_set_fail(req);
- io_req_complete(req, ret);
+ __io_req_complete(req, 0, ret, 0);
return 0;
}
@@ -4911,20 +4997,25 @@ done:
*/
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_ring_ctx *ctx = req->ctx;
+ void __user *buf;
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ size_t len = 1;
+
+ buf = io_buffer_select(req, &len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ }
- __io_req_complete(req, issue_flags, 0, 0);
+ __io_req_complete(req, issue_flags, 0, io_put_kbuf(req, issue_flags));
return 0;
}
static int io_msg_ring_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
- sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in ||
+ sqe->buf_index || sqe->personality))
return -EINVAL;
req->msg.user_data = READ_ONCE(sqe->off);
@@ -4960,17 +5051,15 @@ done:
if (ret < 0)
req_set_fail(req);
__io_req_complete(req, issue_flags, ret, 0);
+ /* put file to avoid an attempt to IOPOLL the req */
+ io_put_file(req->file);
+ req->file = NULL;
return 0;
}
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
+ if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->sync.flags = READ_ONCE(sqe->fsync_flags);
@@ -4994,8 +5083,6 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fsync_range(req->file, req->sync.off,
end > 0 ? end : LLONG_MAX,
req->sync.flags & IORING_FSYNC_DATASYNC);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -5003,10 +5090,7 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -5024,9 +5108,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
- if (ret < 0)
- req_set_fail(req);
- else
+ if (ret >= 0)
fsnotify_modify(req->file);
io_req_complete(req, ret);
return 0;
@@ -5037,9 +5119,7 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->ioprio || sqe->buf_index))
+ if (unlikely(sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@@ -5094,6 +5174,61 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_openat_prep(req, sqe);
}
+static int io_file_bitmap_get(struct io_ring_ctx *ctx)
+{
+ struct io_file_table *table = &ctx->file_table;
+ unsigned long nr = ctx->nr_user_files;
+ int ret;
+
+ if (table->alloc_hint >= nr)
+ table->alloc_hint = 0;
+
+ do {
+ ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint);
+ if (ret != nr) {
+ table->alloc_hint = ret + 1;
+ return ret;
+ }
+ if (!table->alloc_hint)
+ break;
+
+ nr = table->alloc_hint;
+ table->alloc_hint = 0;
+ } while (1);
+
+ return -ENFILE;
+}
+
+static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
+ struct file *file, unsigned int file_slot)
+{
+ bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ if (alloc_slot) {
+ io_ring_submit_lock(ctx, issue_flags);
+ ret = io_file_bitmap_get(ctx);
+ if (unlikely(ret < 0)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+ }
+
+ file_slot = ret;
+ } else {
+ file_slot--;
+ }
+
+ ret = io_install_fixed_file(req, file, issue_flags, file_slot);
+ if (alloc_slot) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ if (!ret)
+ return file_slot;
+ }
+
+ return ret;
+}
+
static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
{
struct open_flags op;
@@ -5149,8 +5284,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if (!fixed)
fd_install(ret, file);
else
- ret = io_install_fixed_file(req, file, issue_flags,
- req->open.file_slot - 1);
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ req->open.file_slot);
err:
putname(req->open.filename);
req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -5171,7 +5306,7 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
+ if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off ||
sqe->splice_fd_in)
return -EINVAL;
@@ -5194,6 +5329,20 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs)
return 0;
+ if (bl->buf_nr_pages) {
+ int j;
+
+ i = bl->buf_ring->tail - bl->head;
+ for (j = 0; j < bl->buf_nr_pages; j++)
+ unpin_user_page(bl->buf_pages[j]);
+ kvfree(bl->buf_pages);
+ bl->buf_pages = NULL;
+ bl->buf_nr_pages = 0;
+ /* make sure it's seen as empty */
+ INIT_LIST_HEAD(&bl->buf_list);
+ return i;
+ }
+
/* the head kbuf is the list itself */
while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
@@ -5220,8 +5369,12 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
ret = -ENOENT;
bl = io_buffer_get_list(ctx, p->bgid);
- if (bl)
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
+ if (bl) {
+ ret = -EINVAL;
+ /* can't use provide/remove buffers command on mapped buffers */
+ if (!bl->buf_nr_pages)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
+ }
if (ret < 0)
req_set_fail(req);
@@ -5238,7 +5391,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
struct io_provide_buf *p = &req->pbuf;
u64 tmp;
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
@@ -5335,6 +5488,23 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
return i ? 0 : -ENOMEM;
}
+static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
+{
+ int i;
+
+ ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
+ GFP_KERNEL);
+ if (!ctx->io_bl)
+ return -ENOMEM;
+
+ for (i = 0; i < BGID_ARRAY; i++) {
+ INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
+ ctx->io_bl[i].bgid = i;
+ }
+
+ return 0;
+}
+
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
@@ -5344,14 +5514,30 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
io_ring_submit_lock(ctx, issue_flags);
+ if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) {
+ ret = io_init_bl_list(ctx);
+ if (ret)
+ goto err;
+ }
+
bl = io_buffer_get_list(ctx, p->bgid);
if (unlikely(!bl)) {
- bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl) {
ret = -ENOMEM;
goto err;
}
- io_buffer_add_list(ctx, bl, p->bgid);
+ INIT_LIST_HEAD(&bl->buf_list);
+ ret = io_buffer_add_list(ctx, bl, p->bgid);
+ if (ret) {
+ kfree(bl);
+ goto err;
+ }
+ }
+ /* can't add buffers via this command for a mapped buffer ring */
+ if (bl->buf_nr_pages) {
+ ret = -EINVAL;
+ goto err;
}
ret = io_add_buffers(ctx, p, bl);
@@ -5368,9 +5554,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
@@ -5414,9 +5598,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
- if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
return -EINVAL;
req->madvise.addr = READ_ONCE(sqe->addr);
@@ -5438,8 +5620,6 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN;
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
#else
@@ -5449,9 +5629,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in)
- return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
return -EINVAL;
req->fadvise.offset = READ_ONCE(sqe->off);
@@ -5487,9 +5665,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *path;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -5525,19 +5701,13 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags)
ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
ctx->buffer);
-
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
- sqe->rw_flags || sqe->buf_index)
+ if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (req->flags & REQ_F_FIXED_FILE)
return -EBADF;
@@ -5569,7 +5739,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags)
spin_unlock(&files->file_lock);
goto err;
}
- file = fdt->fd[close->fd];
+ file = rcu_dereference_protected(fdt->fd[close->fd],
+ lockdep_is_held(&files->file_lock));
if (!file || file->f_op == &io_uring_fops) {
spin_unlock(&files->file_lock);
file = NULL;
@@ -5603,12 +5774,7 @@ err:
static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- struct io_ring_ctx *ctx = req->ctx;
-
- if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index ||
- sqe->splice_fd_in))
+ if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
@@ -5627,8 +5793,6 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
- if (ret < 0)
- req_set_fail(req);
io_req_complete(req, ret);
return 0;
}
@@ -5686,11 +5850,16 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(sqe->file_index))
+ return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
+ sr->flags = READ_ONCE(sqe->addr2);
+ if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -5725,7 +5894,11 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg = &iomsg;
}
- flags = req->sr_msg.msg_flags;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return io_setup_async_msg(req, kmsg);
+
+ flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -5767,6 +5940,10 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
int min_ret = 0;
int ret;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@@ -5780,7 +5957,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_controllen = 0;
msg.msg_namelen = 0;
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (issue_flags & IO_URING_F_NONBLOCK)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -5897,14 +6074,6 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
return __io_recvmsg_copy_hdr(req, iomsg);
}
-static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
- unsigned int issue_flags)
-{
- struct io_sr_msg *sr = &req->sr_msg;
-
- return io_buffer_select(req, &sr->len, sr->bgid, issue_flags);
-}
-
static int io_recvmsg_prep_async(struct io_kiocb *req)
{
int ret;
@@ -5919,12 +6088,16 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_sr_msg *sr = &req->sr_msg;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(sqe->file_index))
+ return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->file_index))
return -EINVAL;
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
- sr->bgid = READ_ONCE(sqe->buf_group);
+ sr->flags = READ_ONCE(sqe->addr2);
+ if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+ return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
@@ -5942,7 +6115,6 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
struct io_async_msghdr iomsg, *kmsg;
struct io_sr_msg *sr = &req->sr_msg;
struct socket *sock;
- struct io_buffer *kbuf;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@@ -5960,24 +6132,29 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
kmsg = &iomsg;
}
- if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, issue_flags);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
- kmsg->fast_iov[0].iov_len = req->sr_msg.len;
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov,
- 1, req->sr_msg.len);
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return io_setup_async_msg(req, kmsg);
+
+ if (io_do_buffer_select(req)) {
+ void __user *buf;
+
+ buf = io_buffer_select(req, &sr->len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ kmsg->fast_iov[0].iov_base = buf;
+ kmsg->fast_iov[0].iov_len = sr->len;
+ iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
+ sr->len);
}
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
- ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
- kmsg->uaddr, flags);
+ ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock)
return io_setup_async_msg(req, kmsg);
@@ -6007,28 +6184,32 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
- struct io_buffer *kbuf;
struct io_sr_msg *sr = &req->sr_msg;
struct msghdr msg;
- void __user *buf = sr->buf;
struct socket *sock;
struct iovec iov;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
- if (req->flags & REQ_F_BUFFER_SELECT) {
- kbuf = io_recv_buffer_select(req, issue_flags);
- if (IS_ERR(kbuf))
- return PTR_ERR(kbuf);
- buf = u64_to_user_ptr(kbuf->addr);
+ if (io_do_buffer_select(req)) {
+ void __user *buf;
+
+ buf = io_buffer_select(req, &sr->len, issue_flags);
+ if (!buf)
+ return -ENOBUFS;
+ sr->buf = buf;
}
- ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
+ ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
goto out_free;
@@ -6039,7 +6220,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
msg.msg_iocb = NULL;
msg.msg_flags = 0;
- flags = req->sr_msg.msg_flags;
+ flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
@@ -6075,29 +6256,39 @@ out_free:
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_accept *accept = &req->accept;
+ unsigned flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index)
+ if (sqe->len || sqe->buf_index)
return -EINVAL;
accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
accept->flags = READ_ONCE(sqe->accept_flags);
accept->nofile = rlimit(RLIMIT_NOFILE);
+ flags = READ_ONCE(sqe->ioprio);
+ if (flags & ~IORING_ACCEPT_MULTISHOT)
+ return -EINVAL;
accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
- return -EINVAL;
+ if (accept->file_slot) {
+ if (accept->flags & SOCK_CLOEXEC)
+ return -EINVAL;
+ if (flags & IORING_ACCEPT_MULTISHOT &&
+ accept->file_slot != IORING_FILE_INDEX_ALLOC)
+ return -EINVAL;
+ }
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+ if (flags & IORING_ACCEPT_MULTISHOT)
+ req->flags |= REQ_F_APOLL_MULTISHOT;
return 0;
}
static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
+ struct io_ring_ctx *ctx = req->ctx;
struct io_accept *accept = &req->accept;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -6105,6 +6296,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
struct file *file;
int ret, fd;
+retry:
if (!fixed) {
fd = __get_unused_fd_flags(accept->flags, accept->nofile);
if (unlikely(fd < 0))
@@ -6116,8 +6308,17 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
if (!fixed)
put_unused_fd(fd);
ret = PTR_ERR(file);
- if (ret == -EAGAIN && force_nonblock)
- return -EAGAIN;
+ if (ret == -EAGAIN && force_nonblock) {
+ /*
+ * if it's multishot and polled, we don't need to
+ * return EAGAIN to arm the poll infra since it
+ * has already been done
+ */
+ if ((req->flags & IO_APOLL_MULTI_POLLED) ==
+ IO_APOLL_MULTI_POLLED)
+ ret = 0;
+ return ret;
+ }
if (ret == -ERESTARTSYS)
ret = -EINTR;
req_set_fail(req);
@@ -6125,11 +6326,30 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags)
fd_install(fd, file);
ret = fd;
} else {
- ret = io_install_fixed_file(req, file, issue_flags,
- accept->file_slot - 1);
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ accept->file_slot);
}
- __io_req_complete(req, issue_flags, ret, 0);
- return 0;
+
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
+ }
+ if (ret >= 0) {
+ bool filled;
+
+ spin_lock(&ctx->completion_lock);
+ filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret,
+ IORING_CQE_F_MORE);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ if (filled) {
+ io_cqring_ev_posted(ctx);
+ goto retry;
+ }
+ ret = -ECANCELED;
+ }
+
+ return ret;
}
static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -6200,10 +6420,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_connect *conn = &req->connect;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags ||
- sqe->splice_fd_in)
+ if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -6386,6 +6603,7 @@ static void io_poll_remove_entries(struct io_kiocb *req)
rcu_read_unlock();
}
+static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags);
/*
* All poll tw should go through this. Checks for poll events, manages
* references, does rewait, etc.
@@ -6394,10 +6612,10 @@ static void io_poll_remove_entries(struct io_kiocb *req)
* either spurious wakeup or multishot CQE is served. 0 when it's done with
* the request, then the mask is stored in req->cqe.res.
*/
-static int io_poll_check_events(struct io_kiocb *req, bool locked)
+static int io_poll_check_events(struct io_kiocb *req, bool *locked)
{
struct io_ring_ctx *ctx = req->ctx;
- int v;
+ int v, ret;
/* req->task == current here, checking PF_EXITING is safe */
if (unlikely(req->task->flags & PF_EXITING))
@@ -6421,23 +6639,37 @@ static int io_poll_check_events(struct io_kiocb *req, bool locked)
req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events;
}
- /* multishot, just fill an CQE and proceed */
- if (req->cqe.res && !(req->apoll_events & EPOLLONESHOT)) {
- __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events);
+ if ((unlikely(!req->cqe.res)))
+ continue;
+ if (req->apoll_events & EPOLLONESHOT)
+ return 0;
+
+ /* multishot, just fill a CQE and proceed */
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+ __poll_t mask = mangle_poll(req->cqe.res &
+ req->apoll_events);
bool filled;
spin_lock(&ctx->completion_lock);
- filled = io_fill_cqe_aux(ctx, req->cqe.user_data, mask,
- IORING_CQE_F_MORE);
+ filled = io_fill_cqe_aux(ctx, req->cqe.user_data,
+ mask, IORING_CQE_F_MORE);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
- if (unlikely(!filled))
- return -ECANCELED;
- io_cqring_ev_posted(ctx);
- } else if (req->cqe.res) {
- return 0;
+ if (filled) {
+ io_cqring_ev_posted(ctx);
+ continue;
+ }
+ return -ECANCELED;
}
+ io_tw_lock(req->ctx, locked);
+ if (unlikely(req->task->flags & PF_EXITING))
+ return -EFAULT;
+ ret = io_issue_sqe(req,
+ IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ if (ret)
+ return ret;
+
/*
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
@@ -6452,7 +6684,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = io_poll_check_events(req, *locked);
+ ret = io_poll_check_events(req, locked);
if (ret > 0)
return;
@@ -6477,7 +6709,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
struct io_ring_ctx *ctx = req->ctx;
int ret;
- ret = io_poll_check_events(req, *locked);
+ ret = io_poll_check_events(req, locked);
if (ret > 0)
return;
@@ -6492,7 +6724,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
+static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
{
req->cqe.res = mask;
/*
@@ -6511,7 +6743,8 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
io_req_task_work_add(req, false);
}
-static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
+static inline void io_poll_execute(struct io_kiocb *req, int res,
+ __poll_t events)
{
if (io_poll_get_ownership(req))
__io_poll_execute(req, res, events);
@@ -6526,6 +6759,7 @@ static void io_poll_cancel_req(struct io_kiocb *req)
#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1))
#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1)
+#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI)
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
@@ -6560,7 +6794,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
}
/* for instances that support it check for an event match first */
- if (mask && !(mask & poll->events))
+ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON)))
return 0;
if (io_poll_get_ownership(req)) {
@@ -6717,7 +6951,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
struct io_ring_ctx *ctx = req->ctx;
struct async_poll *apoll;
struct io_poll_table ipt;
- __poll_t mask = EPOLLONESHOT | POLLERR | POLLPRI;
+ __poll_t mask = POLLPRI | POLLERR;
int ret;
if (!def->pollin && !def->pollout)
@@ -6726,16 +6960,18 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
return IO_APOLL_ABORTED;
if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
return IO_APOLL_ABORTED;
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT))
+ mask |= EPOLLONESHOT;
if (def->pollin) {
- mask |= POLLIN | POLLRDNORM;
+ mask |= EPOLLIN | EPOLLRDNORM;
/* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */
if ((req->opcode == IORING_OP_RECVMSG) &&
(req->sr_msg.msg_flags & MSG_ERRQUEUE))
- mask &= ~POLLIN;
+ mask &= ~EPOLLIN;
} else {
- mask |= POLLOUT | POLLWRNORM;
+ mask |= EPOLLOUT | EPOLLWRNORM;
}
if (def->poll_exclusive)
mask |= EPOLLEXCLUSIVE;
@@ -6887,9 +7123,7 @@ static int io_poll_update_prep(struct io_kiocb *req,
struct io_poll_update *upd = &req->poll_update;
u32 flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA |
@@ -6919,9 +7153,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
struct io_poll_iocb *poll = &req->poll;
u32 flags;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->addr)
+ if (sqe->buf_index || sqe->off || sqe->addr)
return -EINVAL;
flags = READ_ONCE(sqe->len);
if (flags & ~IORING_POLL_ADD_MULTI)
@@ -7128,11 +7360,9 @@ static int io_timeout_remove_prep(struct io_kiocb *req,
{
struct io_timeout_rem *tr = &req->timeout_rem;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->len || sqe->splice_fd_in)
return -EINVAL;
tr->ltimeout = false;
@@ -7202,10 +7432,7 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
unsigned flags;
u32 off = READ_ONCE(sqe->off);
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
- if (sqe->ioprio || sqe->buf_index || sqe->len != 1 ||
- sqe->splice_fd_in)
+ if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in)
return -EINVAL;
if (off && is_timeout_link)
return -EINVAL;
@@ -7387,11 +7614,9 @@ out:
static int io_async_cancel_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
- return -EINVAL;
if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
- if (sqe->ioprio || sqe->off || sqe->len || sqe->splice_fd_in)
+ if (sqe->off || sqe->len || sqe->splice_fd_in)
return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr);
@@ -7477,7 +7702,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req,
{
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
- if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in)
+ if (sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
req->rsrc_update.offset = READ_ONCE(sqe->off);
@@ -7609,7 +7834,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static int io_req_prep_async(struct io_kiocb *req)
{
- if (!io_op_defs[req->opcode].needs_async_setup)
+ const struct io_op_def *def = &io_op_defs[req->opcode];
+
+ /* assign early for deferred execution for non-fixed file */
+ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+ req->file = io_file_get_normal(req, req->cqe.fd);
+ if (!def->needs_async_setup)
return 0;
if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
@@ -7995,6 +8225,8 @@ fail:
* wait for request slots on the block side.
*/
if (!needs_poll) {
+ if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
+ break;
cond_resched();
continue;
}
@@ -8051,6 +8283,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
/* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT);
io_req_set_rsrc_node(req, ctx, 0);
+ WARN_ON_ONCE(file && !test_bit(fd, ctx->file_table.bitmap));
out:
io_ring_submit_unlock(ctx, issue_flags);
return file;
@@ -8296,9 +8529,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* enforce forwards compatibility on users */
if (sqe_flags & ~SQE_VALID_FLAGS)
return -EINVAL;
- if ((sqe_flags & IOSQE_BUFFER_SELECT) &&
- !io_op_defs[opcode].buffer_select)
- return -EOPNOTSUPP;
+ if (sqe_flags & IOSQE_BUFFER_SELECT) {
+ if (!io_op_defs[opcode].buffer_select)
+ return -EOPNOTSUPP;
+ req->buf_index = READ_ONCE(sqe->buf_group);
+ }
if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS)
ctx->drain_disabled = true;
if (sqe_flags & IOSQE_IO_DRAIN) {
@@ -8321,6 +8556,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
}
+ if (!io_op_defs[opcode].ioprio && sqe->ioprio)
+ return -EINVAL;
+ if (!io_op_defs[opcode].iopoll && (ctx->flags & IORING_SETUP_IOPOLL))
+ return -EINVAL;
+
if (io_op_defs[opcode].needs_file) {
struct io_submit_state *state = &ctx->submit_state;
@@ -8578,23 +8818,6 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
return READ_ONCE(sqd->state);
}
-static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
-{
- /* Tell userspace we may need a wakeup call */
- spin_lock(&ctx->completion_lock);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP);
- spin_unlock(&ctx->completion_lock);
-}
-
-static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
-{
- spin_lock(&ctx->completion_lock);
- WRITE_ONCE(ctx->rings->sq_flags,
- ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP);
- spin_unlock(&ctx->completion_lock);
-}
-
static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
unsigned int to_submit;
@@ -8710,8 +8933,8 @@ static int io_sq_thread(void *data)
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
- io_ring_set_wakeup_flag(ctx);
-
+ atomic_or(IORING_SQ_NEED_WAKEUP,
+ &ctx->rings->sq_flags);
if ((ctx->flags & IORING_SETUP_IOPOLL) &&
!wq_list_empty(&ctx->iopoll_list)) {
needs_sched = false;
@@ -8722,7 +8945,7 @@ static int io_sq_thread(void *data)
* Ensure the store of the wakeup flag is not
* reordered with the load of the SQ tail
*/
- smp_mb();
+ smp_mb__after_atomic();
if (io_sqring_entries(ctx)) {
needs_sched = false;
@@ -8736,7 +8959,8 @@ static int io_sq_thread(void *data)
mutex_lock(&sqd->lock);
}
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_clear_wakeup_flag(ctx);
+ atomic_andnot(IORING_SQ_NEED_WAKEUP,
+ &ctx->rings->sq_flags);
}
finish_wait(&sqd->wait, &wait);
@@ -8746,7 +8970,7 @@ static int io_sq_thread(void *data)
io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
- io_ring_set_wakeup_flag(ctx);
+ atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
io_run_task_work();
mutex_unlock(&sqd->lock);
@@ -9126,20 +9350,45 @@ static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
{
table->files = kvcalloc(nr_files, sizeof(table->files[0]),
GFP_KERNEL_ACCOUNT);
- return !!table->files;
+ if (unlikely(!table->files))
+ return false;
+
+ table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
+ if (unlikely(!table->bitmap)) {
+ kvfree(table->files);
+ return false;
+ }
+
+ return true;
}
static void io_free_file_tables(struct io_file_table *table)
{
kvfree(table->files);
+ bitmap_free(table->bitmap);
table->files = NULL;
+ table->bitmap = NULL;
+}
+
+static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
+{
+ WARN_ON_ONCE(test_bit(bit, table->bitmap));
+ __set_bit(bit, table->bitmap);
+ if (bit == table->alloc_hint)
+ table->alloc_hint++;
+}
+
+static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
+{
+ __clear_bit(bit, table->bitmap);
+ table->alloc_hint = bit;
}
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
+#if !defined(IO_URING_SCM_ALL)
int i;
-#if !defined(IO_URING_SCM_ALL)
for (i = 0; i < ctx->nr_user_files; i++) {
struct file *file = io_file_from_index(ctx, i);
@@ -9147,6 +9396,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
continue;
if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
continue;
+ io_file_bitmap_clear(&ctx->file_table, i);
fput(file);
}
#endif
@@ -9515,12 +9765,12 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
struct io_fixed_file *file_slot;
- if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
+ if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
ret = -EFAULT;
goto fail;
}
/* allow sparse sets */
- if (fd == -1) {
+ if (!fds || fd == -1) {
ret = -EINVAL;
if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
goto fail;
@@ -9550,6 +9800,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
}
file_slot = io_fixed_file_slot(&ctx->file_table, i);
io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, i);
}
io_rsrc_node_switch(ctx, NULL);
@@ -9610,6 +9861,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
if (ret)
goto err;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
}
@@ -9617,6 +9869,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
if (!ret) {
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, slot_index);
}
err:
if (needs_switch)
@@ -9658,6 +9911,7 @@ static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags)
goto out;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
ret = 0;
out:
@@ -9707,6 +9961,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
if (err)
break;
file_slot->file_ptr = 0;
+ io_file_bitmap_clear(&ctx->file_table, i);
needs_switch = true;
}
if (fd != -1) {
@@ -9735,6 +9990,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
}
*io_get_tag_slot(data, i) = tag;
io_fixed_file_set(file_slot, file);
+ io_file_bitmap_set(&ctx->file_table, i);
}
}
@@ -10162,30 +10418,18 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
- struct io_mapped_ubuf **pimu,
- struct page **last_hpage)
+static struct page **io_pin_pages(unsigned long ubuf, unsigned long len,
+ int *npages)
{
- struct io_mapped_ubuf *imu = NULL;
+ unsigned long start, end, nr_pages;
struct vm_area_struct **vmas = NULL;
struct page **pages = NULL;
- unsigned long off, start, end, ubuf;
- size_t size;
- int ret, pret, nr_pages, i;
-
- if (!iov->iov_base) {
- *pimu = ctx->dummy_ubuf;
- return 0;
- }
+ int i, pret, ret = -ENOMEM;
- ubuf = (unsigned long) iov->iov_base;
- end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
- *pimu = NULL;
- ret = -ENOMEM;
-
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
goto done;
@@ -10195,10 +10439,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
if (!vmas)
goto done;
- imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
- if (!imu)
- goto done;
-
ret = 0;
mmap_read_lock(current->mm);
pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
@@ -10216,6 +10456,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
break;
}
}
+ *npages = nr_pages;
} else {
ret = pret < 0 ? pret : -EFAULT;
}
@@ -10229,14 +10470,53 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
unpin_user_pages(pages, pret);
goto done;
}
+ ret = 0;
+done:
+ kvfree(vmas);
+ if (ret < 0) {
+ kvfree(pages);
+ pages = ERR_PTR(ret);
+ }
+ return pages;
+}
+
+static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
+ struct io_mapped_ubuf **pimu,
+ struct page **last_hpage)
+{
+ struct io_mapped_ubuf *imu = NULL;
+ struct page **pages = NULL;
+ unsigned long off;
+ size_t size;
+ int ret, nr_pages, i;
+
+ if (!iov->iov_base) {
+ *pimu = ctx->dummy_ubuf;
+ return 0;
+ }
+
+ *pimu = NULL;
+ ret = -ENOMEM;
+
+ pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
+ &nr_pages);
+ if (IS_ERR(pages)) {
+ ret = PTR_ERR(pages);
+ pages = NULL;
+ goto done;
+ }
- ret = io_buffer_account_pin(ctx, pages, pret, imu, last_hpage);
+ imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
+ if (!imu)
+ goto done;
+
+ ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
if (ret) {
- unpin_user_pages(pages, pret);
+ unpin_user_pages(pages, nr_pages);
goto done;
}
- off = ubuf & ~PAGE_MASK;
+ off = (unsigned long) iov->iov_base & ~PAGE_MASK;
size = iov->iov_len;
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
@@ -10249,8 +10529,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
size -= vec_len;
}
/* store original address for later verification */
- imu->ubuf = ubuf;
- imu->ubuf_end = ubuf + iov->iov_len;
+ imu->ubuf = (unsigned long) iov->iov_base;
+ imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
*pimu = imu;
ret = 0;
@@ -10258,7 +10538,6 @@ done:
if (ret)
kvfree(imu);
kvfree(pages);
- kvfree(vmas);
return ret;
}
@@ -10317,12 +10596,17 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
- ret = io_copy_iov(ctx, &iov, arg, i);
- if (ret)
- break;
- ret = io_buffer_validate(&iov);
- if (ret)
- break;
+ if (arg) {
+ ret = io_copy_iov(ctx, &iov, arg, i);
+ if (ret)
+ break;
+ ret = io_buffer_validate(&iov);
+ if (ret)
+ break;
+ } else {
+ memset(&iov, 0, sizeof(iov));
+ }
+
if (!iov.iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL;
break;
@@ -10461,19 +10745,19 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
+ struct io_buffer_list *bl;
+ unsigned long index;
int i;
- for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
- struct list_head *list = &ctx->io_buffers[i];
-
- while (!list_empty(list)) {
- struct io_buffer_list *bl;
+ for (i = 0; i < BGID_ARRAY; i++) {
+ if (!ctx->io_bl)
+ break;
+ __io_remove_buffers(ctx, &ctx->io_bl[i], -1U);
+ }
- bl = list_first_entry(list, struct io_buffer_list, list);
- __io_remove_buffers(ctx, bl, -1U);
- list_del(&bl->list);
- kfree(bl);
- }
+ xa_for_each(&ctx->io_bl_xa, index, bl) {
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ __io_remove_buffers(ctx, bl, -1U);
}
while (!list_empty(&ctx->io_buffers_pages)) {
@@ -10582,7 +10866,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_wq_put_hash(ctx->hash_map);
kfree(ctx->cancel_hash);
kfree(ctx->dummy_ubuf);
- kfree(ctx->io_buffers);
+ kfree(ctx->io_bl);
+ xa_destroy(&ctx->io_bl_xa);
kfree(ctx);
}
@@ -11210,7 +11495,7 @@ static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
ret = -EFAULT;
break;
}
- if (reg.resv || reg.offset >= IO_RINGFD_REG_MAX) {
+ if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
ret = -EINVAL;
break;
}
@@ -11824,6 +12109,25 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
ctx->user = get_uid(current_user());
/*
+ * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if
+ * COOP_TASKRUN is set, then IPIs are never needed by the app.
+ */
+ ret = -EINVAL;
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ /* IPI related flags don't make sense with SQPOLL */
+ if (ctx->flags & (IORING_SETUP_COOP_TASKRUN |
+ IORING_SETUP_TASKRUN_FLAG))
+ goto err;
+ ctx->notify_method = TWA_SIGNAL_NO_IPI;
+ } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) {
+ ctx->notify_method = TWA_SIGNAL_NO_IPI;
+ } else {
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
+ goto err;
+ ctx->notify_method = TWA_SIGNAL;
+ }
+
+ /*
* This is just grabbed for accounting purposes. When a process exits,
* the mm is exited and dropped before the files, hence we need to hang
* on to this mm purely for the purposes of being able to unaccount
@@ -11920,10 +12224,11 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
+ IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG))
return -EINVAL;
- return io_uring_create(entries, &p, params);
+ return io_uring_create(entries, &p, params);
}
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
@@ -12136,14 +12441,20 @@ static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
memset(&rr, 0, sizeof(rr));
if (copy_from_user(&rr, arg, size))
return -EFAULT;
- if (!rr.nr || rr.resv || rr.resv2)
+ if (!rr.nr || rr.resv2)
+ return -EINVAL;
+ if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
return -EINVAL;
switch (type) {
case IORING_RSRC_FILE:
+ if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
+ break;
return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
case IORING_RSRC_BUFFER:
+ if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
+ break;
return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
rr.nr, u64_to_user_ptr(rr.tags));
}
@@ -12278,6 +12589,85 @@ err:
return ret;
}
+static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_buf_ring *br;
+ struct io_uring_buf_reg reg;
+ struct io_buffer_list *bl;
+ struct page **pages;
+ int nr_pages;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+
+ if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
+ return -EINVAL;
+ if (!reg.ring_addr)
+ return -EFAULT;
+ if (reg.ring_addr & ~PAGE_MASK)
+ return -EINVAL;
+ if (!is_power_of_2(reg.ring_entries))
+ return -EINVAL;
+
+ if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) {
+ int ret = io_init_bl_list(ctx);
+ if (ret)
+ return ret;
+ }
+
+ bl = io_buffer_get_list(ctx, reg.bgid);
+ if (bl) {
+ /* if mapped buffer ring OR classic exists, don't allow */
+ if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
+ return -EEXIST;
+ } else {
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return -ENOMEM;
+ }
+
+ pages = io_pin_pages(reg.ring_addr,
+ struct_size(br, bufs, reg.ring_entries),
+ &nr_pages);
+ if (IS_ERR(pages)) {
+ kfree(bl);
+ return PTR_ERR(pages);
+ }
+
+ br = page_address(pages[0]);
+ bl->buf_pages = pages;
+ bl->buf_nr_pages = nr_pages;
+ bl->nr_entries = reg.ring_entries;
+ bl->buf_ring = br;
+ bl->mask = reg.ring_entries - 1;
+ io_buffer_add_list(ctx, bl, reg.bgid);
+ return 0;
+}
+
+static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_buf_reg reg;
+ struct io_buffer_list *bl;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+ if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
+ return -EINVAL;
+
+ bl = io_buffer_get_list(ctx, reg.bgid);
+ if (!bl)
+ return -ENOENT;
+ if (!bl->buf_nr_pages)
+ return -EINVAL;
+
+ __io_remove_buffers(ctx, bl, -1U);
+ if (bl->bgid >= BGID_ARRAY) {
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ kfree(bl);
+ }
+ return 0;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -12303,6 +12693,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
switch (opcode) {
case IORING_REGISTER_BUFFERS:
+ ret = -EFAULT;
+ if (!arg)
+ break;
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_BUFFERS:
@@ -12312,6 +12705,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_sqe_buffers_unregister(ctx);
break;
case IORING_REGISTER_FILES:
+ ret = -EFAULT;
+ if (!arg)
+ break;
ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
break;
case IORING_UNREGISTER_FILES:
@@ -12406,6 +12802,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_UNREGISTER_RING_FDS:
ret = io_ringfd_unregister(ctx, arg, nr_args);
break;
+ case IORING_REGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_pbuf_ring(ctx, arg);
+ break;
+ case IORING_UNREGISTER_PBUF_RING:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_unregister_pbuf_ring(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
@@ -12491,6 +12899,10 @@ static int __init io_uring_init(void)
/* ->buf_index is u16 */
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
+ BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE);
+ BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
+ BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
+ offsetof(struct io_uring_buf_ring, tail));
/* should fit into one byte */
BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
@@ -12500,6 +12912,8 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int));
+ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
+
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT);
return 0;