for-5.5/io_uring-20191121
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl3WxNwQHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgps4kD/9SIDXhYhhE8fNqeAF7Uouu8fxgwnkY3hSI
43vJwCziiDxWWJH5mYW7/83VNOMZKHIbiYMnU6iEUsRQ/sG/wI0wEfAQZDHLzCKt
cko2q7zAC1/4rtoslwJ3q04hE2Ap/nb93ELZBVr7fOAuODBNFUp/vifAojvsMPKz
hNMNPq/vYg7c/iYMZKSBdtjE3tqceFNBjAVNMB9dHKQLeexEy4ve7AjBeawWsSi7
GesnQ5w5u5LqkMYwLslpv/oVjHiiFWgGnDAvBNvykQvVy+DfB54KSqMV11W1aqdU
l6L+ENfZasEvlk1yMAth2Foq4vlscm5MKEb6VdJhXWHHXtXkcBmz7RBqPmjSvXCY
wS5GZRw8oYtTcid0aQf+t/wgRNTDJsGsnsT32qto41No3Z7vlIDHUDxHZGTA+gEL
E8j9rDx6EXMTo3EFbC8XZcfsorhPJ1HKAyw1YFczHtYzJEQUR9jJe3f/Q9u6K2Vy
s/EhkVeHa/lEd7kb6mI+6lQjGe1FXl7AHauDuaaEfIOZA/xJB3Bad5Wjq1va1cUO
TX+37zjzFzJghhSIBGYq7G7iT4AMecPQgxHzCdCyYfW5S4Uur9tMmIElwVPI/Pjl
kDZ9gdg9lm6JifZ9Ab8QcGhuQQTF3frwX9VfgrVgcqyvm38AiYzVgL9ZJnxRS/Cy
ZfLNkACXqQ==
=YZ9s
-----END PGP SIGNATURE-----

Merge tag 'for-5.5/io_uring-20191121' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "A lot of stuff has been going on this cycle, with improving the
  support for networked IO (and hence unbounded request completion
  times) being one of the major themes. There's been a set of fixes
  done this week, I'll send those out as well once we're certain we're
  fully happy with them.

  This contains:

   - Unification of the "normal" submit path and the SQPOLL path (Pavel)

   - Support for sparse (and bigger) file sets, and updating of those
     file sets without needing to unregister/register again.

   - Independently sized CQ ring, instead of just making it always 2x
     the SQ ring size. This makes it more flexible for networked
     applications.

   - Support for overflowed CQ ring, never dropping events but
     providing backpressure on submits.

   - Add support for absolute timeouts, not just relative ones.

   - Support for generic cancellations. This divorces io_uring from
     workqueues as well, which additionally gets us one step closer to
     generic async system call support.

   - With cancellations, we can support grabbing the process file table
     as well, just like we do mm context. This allows support for
     system calls that create file descriptors, like the accept4()
     support that's built on top of it.

   - Support for io_uring tracing (Dmitrii)

   - Support for linked timeouts. These abort an operation if it isn't
     completed by the time noted in the linked timeout.

   - Speedup tracking of poll requests

   - Various cleanups making the code easier to follow (Jackie, Pavel,
     Bob, YueHaibing, me)

   - Update MAINTAINERS with the new io_uring list"

* tag 'for-5.5/io_uring-20191121' of git://git.kernel.dk/linux-block: (64 commits)
  io_uring: make POLL_ADD/POLL_REMOVE scale better
  io-wq: remove now redundant struct io_wq_nulls_list
  io_uring: Fix getting file for non-fd opcodes
  io_uring: introduce req_need_defer()
  io_uring: clean up io_uring_cancel_files()
  io-wq: ensure free/busy list browsing see all items
  io-wq: ensure we have a stable view of ->cur_work for cancellations
  io_wq: add get/put_work handlers to io_wq_create()
  io_uring: check for validity of ->rings in teardown
  io_uring: fix potential deadlock in io_poll_wake()
  io_uring: use correct "is IO worker" helper
  io_uring: fix -ENOENT issue with linked timer with short timeout
  io_uring: don't do flush cancel under inflight_lock
  io_uring: flag SQPOLL busy condition to userspace
  io_uring: make ASYNC_CANCEL work with poll and timeout
  io_uring: provide fallback request for OOM situations
  io_uring: convert accept4() -ERESTARTSYS into -EINTR
  io_uring: fix error clear of ->file_table in io_sqe_files_register()
  io_uring: separate the io_free_req and io_free_req_find_next interface
  io_uring: keep io_put_req only responsible for release and put req
  ...
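The second bullet above, updating registered file sets in place, is exposed to userspace through the new IORING_REGISTER_FILES_UPDATE opcode added further down in this diff. Below is a minimal, hypothetical sketch of that call, assuming a kernel and headers from this series (where struct io_uring_files_update still carries a __s32 *fds pointer; later kernels changed that field), a ring_fd obtained from io_uring_setup(2), and a file table previously registered with IORING_REGISTER_FILES:

	/* Sketch: replace one entry of an already-registered file set in place.
	 * A value of -1 in fds[] would clear the slot (sparse sets are allowed).
	 */
	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int update_registered_file(int ring_fd, unsigned slot, int new_fd)
	{
		__s32 fds[1] = { new_fd };
		struct io_uring_files_update up = {
			.offset = slot,
			.fds = fds,
		};

		/* last argument = number of entries in fds[] */
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_FILES_UPDATE, &up, 1);
	}

liburing wraps the same operation as io_uring_register_files_update().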
commit fb4b3d3fd0
MAINTAINERS

@@ -8564,12 +8564,13 @@ F:	include/linux/iova.h
 
 IO_URING
 M:	Jens Axboe <axboe@kernel.dk>
-L:	linux-block@vger.kernel.org
-L:	linux-fsdevel@vger.kernel.org
+L:	io-uring@vger.kernel.org
 T:	git git://git.kernel.dk/linux-block
 T:	git git://git.kernel.dk/liburing
 S:	Maintained
 F:	fs/io_uring.c
+F:	fs/io-wq.c
+F:	fs/io-wq.h
 F:	include/uapi/linux/io_uring.h
 
 IPMI SUBSYSTEM
fs/Kconfig

@@ -322,4 +322,7 @@ source "fs/nls/Kconfig"
 source "fs/dlm/Kconfig"
 source "fs/unicode/Kconfig"
 
+config IO_WQ
+	bool
+
 endmenu
fs/Makefile

@@ -32,6 +32,7 @@ obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)		+= aio.o
 obj-$(CONFIG_IO_URING)		+= io_uring.o
+obj-$(CONFIG_IO_WQ)		+= io-wq.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
 obj-$(CONFIG_FS_VERITY)		+= verity/
fs/io-wq.c (new file): diff suppressed because it is too large.
fs/io-wq.h (new file)

@@ -0,0 +1,74 @@
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H

struct io_wq;

enum {
	IO_WQ_WORK_CANCEL	= 1,
	IO_WQ_WORK_HAS_MM	= 2,
	IO_WQ_WORK_HASHED	= 4,
	IO_WQ_WORK_NEEDS_USER	= 8,
	IO_WQ_WORK_NEEDS_FILES	= 16,
	IO_WQ_WORK_UNBOUND	= 32,
	IO_WQ_WORK_INTERNAL	= 64,

	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
};

enum io_wq_cancel {
	IO_WQ_CANCEL_OK,	/* cancelled before started */
	IO_WQ_CANCEL_RUNNING,	/* found, running, and attempted cancelled */
	IO_WQ_CANCEL_NOTFOUND,	/* work not found */
};

struct io_wq_work {
	struct list_head list;
	void (*func)(struct io_wq_work **);
	unsigned flags;
	struct files_struct *files;
};

#define INIT_IO_WORK(work, _func)	\
	do {				\
		(work)->func = _func;	\
		(work)->flags = 0;	\
		(work)->files = NULL;	\
	} while (0)			\

typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);

struct io_wq *io_wq_create(unsigned bounded, struct mm_struct *mm,
			   struct user_struct *user,
			   get_work_fn *get_work, put_work_fn *put_work);
void io_wq_destroy(struct io_wq *wq);

void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_enqueue_hashed(struct io_wq *wq, struct io_wq_work *work, void *val);
void io_wq_flush(struct io_wq *wq);

void io_wq_cancel_all(struct io_wq *wq);
enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork);

typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
					void *data);

#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
#else
static inline void io_wq_worker_sleeping(struct task_struct *tsk)
{
}
static inline void io_wq_worker_running(struct task_struct *tsk)
{
}
#endif

static inline bool io_wq_current_is_worker(void)
{
	return in_task() && (current->flags & PF_IO_WORKER);
}
#endif
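io_uring (fs/io_uring.c) is the only in-tree user of this interface, but the header above is largely self-describing. The following is a minimal, hypothetical sketch of how an in-kernel caller might drive the io-wq API, assuming process context with a valid mm; the demo_* names are invented for illustration and the optional get/put work handlers are left NULL:

	/* Hypothetical in-kernel user of the io-wq API declared above. */
	#include <linux/cred.h>
	#include <linux/err.h>
	#include <linux/sched.h>
	#include "io-wq.h"

	static void demo_work_fn(struct io_wq_work **workptr)
	{
		struct io_wq_work *work = *workptr;

		/* ... perform the (possibly blocking) operation here ... */
		(void)work;
	}

	static int demo_io_wq_setup(void)
	{
		static struct io_wq_work work;
		struct io_wq *wq;

		/* Small bounded worker pool, charged to the current task. */
		wq = io_wq_create(4, current->mm, current_user(), NULL, NULL);
		if (IS_ERR(wq))
			return PTR_ERR(wq);

		INIT_IO_WORK(&work, demo_work_fn);
		io_wq_enqueue(wq, &work);

		io_wq_flush(wq);	/* wait for pending work to drain */
		io_wq_destroy(wq);
		return 0;
	}

The IO_WQ_WORK_* flags set on a work item tell the worker what context it has to assume (mm, creds, files) before running it; hashed enqueue serializes work items that target the same key.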
fs/io_uring.c (2221 changed lines): diff suppressed because it is too large.
include/Kbuild

@@ -1027,6 +1027,7 @@ header-test-			+= trace/events/fsi_master_gpio.h
 header-test-			+= trace/events/huge_memory.h
 header-test-			+= trace/events/ib_mad.h
 header-test-			+= trace/events/ib_umad.h
+header-test-			+= trace/events/io_uring.h
 header-test-			+= trace/events/iscsi.h
 header-test-			+= trace/events/jbd2.h
 header-test-			+= trace/events/kvm.h
|
|||
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
|
||||
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
|
||||
#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
|
||||
#define PF_IO_WORKER 0x20000000 /* Task is an IO worker */
|
||||
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
|
||||
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
|
||||
|
||||
|
|
|
include/linux/socket.h

@@ -392,6 +392,9 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
 extern int __sys_sendto(int fd, void __user *buff, size_t len,
 			unsigned int flags, struct sockaddr __user *addr,
 			int addr_len);
+extern int __sys_accept4_file(struct file *file, unsigned file_flags,
+			struct sockaddr __user *upeer_sockaddr,
+			int __user *upeer_addrlen, int flags);
 extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 			 int __user *upeer_addrlen, int flags);
 extern int __sys_socket(int family, int type, int protocol);
include/trace/events/io_uring.h (new file)

@@ -0,0 +1,358 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM io_uring

#if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IO_URING_H

#include <linux/tracepoint.h>

struct io_wq_work;

/**
 * io_uring_create - called after a new io_uring context was prepared
 *
 * @fd:		corresponding file descriptor
 * @ctx:	pointer to a ring context structure
 * @sq_entries:	actual SQ size
 * @cq_entries:	actual CQ size
 * @flags:	SQ ring flags, provided to io_uring_setup(2)
 *
 * Allows to trace io_uring creation and provide pointer to a context, that can
 * be used later to find correlated events.
 */
TRACE_EVENT(io_uring_create,

	TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags),

	TP_ARGS(fd, ctx, sq_entries, cq_entries, flags),

	TP_STRUCT__entry (
		__field( int,		fd		)
		__field( void *,	ctx		)
		__field( u32,		sq_entries	)
		__field( u32,		cq_entries	)
		__field( u32,		flags		)
	),

	TP_fast_assign(
		__entry->fd		= fd;
		__entry->ctx		= ctx;
		__entry->sq_entries	= sq_entries;
		__entry->cq_entries	= cq_entries;
		__entry->flags		= flags;
	),

	TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d",
			__entry->ctx, __entry->fd, __entry->sq_entries,
			__entry->cq_entries, __entry->flags)
);

/**
 * io_uring_register - called after a buffer/file/eventfd was succesfully
 *			registered for a ring
 *
 * @ctx:		pointer to a ring context structure
 * @opcode:		describes which operation to perform
 * @nr_user_files:	number of registered files
 * @nr_user_bufs:	number of registered buffers
 * @cq_ev_fd:		whether eventfs registered or not
 * @ret:		return code
 *
 * Allows to trace fixed files/buffers/eventfds, that could be registered to
 * avoid an overhead of getting references to them for every operation. This
 * event, together with io_uring_file_get, can provide a full picture of how
 * much overhead one can reduce via fixing.
 */
TRACE_EVENT(io_uring_register,

	TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files,
			unsigned nr_bufs, bool eventfd, long ret),

	TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret),

	TP_STRUCT__entry (
		__field( void *,	ctx		)
		__field( unsigned,	opcode		)
		__field( unsigned,	nr_files	)
		__field( unsigned,	nr_bufs		)
		__field( bool,		eventfd		)
		__field( long,		ret		)
	),

	TP_fast_assign(
		__entry->ctx		= ctx;
		__entry->opcode		= opcode;
		__entry->nr_files	= nr_files;
		__entry->nr_bufs	= nr_bufs;
		__entry->eventfd	= eventfd;
		__entry->ret		= ret;
	),

	TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, "
			"eventfd %d, ret %ld",
			__entry->ctx, __entry->opcode, __entry->nr_files,
			__entry->nr_bufs, __entry->eventfd, __entry->ret)
);

/**
 * io_uring_file_get - called before getting references to an SQE file
 *
 * @ctx:	pointer to a ring context structure
 * @fd:		SQE file descriptor
 *
 * Allows to trace out how often an SQE file reference is obtained, which can
 * help figuring out if it makes sense to use fixed files, or check that fixed
 * files are used correctly.
 */
TRACE_EVENT(io_uring_file_get,

	TP_PROTO(void *ctx, int fd),

	TP_ARGS(ctx, fd),

	TP_STRUCT__entry (
		__field( void *,	ctx	)
		__field( int,		fd	)
	),

	TP_fast_assign(
		__entry->ctx	= ctx;
		__entry->fd	= fd;
	),

	TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd)
);

/**
 * io_uring_queue_async_work - called before submitting a new async work
 *
 * @ctx:	pointer to a ring context structure
 * @hashed:	type of workqueue, hashed or normal
 * @req:	pointer to a submitted request
 * @work:	pointer to a submitted io_wq_work
 *
 * Allows to trace asynchronous work submission.
 */
TRACE_EVENT(io_uring_queue_async_work,

	TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work,
			unsigned int flags),

	TP_ARGS(ctx, rw, req, work, flags),

	TP_STRUCT__entry (
		__field( void *,		ctx	)
		__field( int,			rw	)
		__field( void *,		req	)
		__field( struct io_wq_work *,	work	)
		__field( unsigned int,		flags	)
	),

	TP_fast_assign(
		__entry->ctx	= ctx;
		__entry->rw	= rw;
		__entry->req	= req;
		__entry->work	= work;
		__entry->flags	= flags;
	),

	TP_printk("ring %p, request %p, flags %d, %s queue, work %p",
			__entry->ctx, __entry->req, __entry->flags,
			__entry->rw ? "hashed" : "normal", __entry->work)
);

/**
 * io_uring_defer_list - called before the io_uring work added into defer_list
 *
 * @ctx:	pointer to a ring context structure
 * @req:	pointer to a deferred request
 * @shadow:	whether request is shadow or not
 *
 * Allows to track deferred requests, to get an insight about what requests are
 * not started immediately.
 */
TRACE_EVENT(io_uring_defer,

	TP_PROTO(void *ctx, void *req, bool shadow),

	TP_ARGS(ctx, req, shadow),

	TP_STRUCT__entry (
		__field( void *,	ctx	)
		__field( void *,	req	)
		__field( bool,		shadow	)
	),

	TP_fast_assign(
		__entry->ctx	= ctx;
		__entry->req	= req;
		__entry->shadow	= shadow;
	),

	TP_printk("ring %p, request %p%s", __entry->ctx, __entry->req,
			__entry->shadow ? ", shadow": "")
);

/**
 * io_uring_link - called before the io_uring request added into link_list of
 *		   another request
 *
 * @ctx:		pointer to a ring context structure
 * @req:		pointer to a linked request
 * @target_req:		pointer to a previous request, that would contain @req
 *
 * Allows to track linked requests, to understand dependencies between requests
 * and how does it influence their execution flow.
 */
TRACE_EVENT(io_uring_link,

	TP_PROTO(void *ctx, void *req, void *target_req),

	TP_ARGS(ctx, req, target_req),

	TP_STRUCT__entry (
		__field( void *,	ctx		)
		__field( void *,	req		)
		__field( void *,	target_req	)
	),

	TP_fast_assign(
		__entry->ctx		= ctx;
		__entry->req		= req;
		__entry->target_req	= target_req;
	),

	TP_printk("ring %p, request %p linked after %p",
			__entry->ctx, __entry->req, __entry->target_req)
);

/**
 * io_uring_cqring_wait - called before start waiting for an available CQE
 *
 * @ctx:		pointer to a ring context structure
 * @min_events:	minimal number of events to wait for
 *
 * Allows to track waiting for CQE, so that we can e.g. troubleshoot
 * situations, when an application wants to wait for an event, that never
 * comes.
 */
TRACE_EVENT(io_uring_cqring_wait,

	TP_PROTO(void *ctx, int min_events),

	TP_ARGS(ctx, min_events),

	TP_STRUCT__entry (
		__field( void *,	ctx		)
		__field( int,		min_events	)
	),

	TP_fast_assign(
		__entry->ctx		= ctx;
		__entry->min_events	= min_events;
	),

	TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events)
);

/**
 * io_uring_fail_link - called before failing a linked request
 *
 * @req:	request, which links were cancelled
 * @link:	cancelled link
 *
 * Allows to track linked requests cancellation, to see not only that some work
 * was cancelled, but also which request was the reason.
 */
TRACE_EVENT(io_uring_fail_link,

	TP_PROTO(void *req, void *link),

	TP_ARGS(req, link),

	TP_STRUCT__entry (
		__field( void *,	req	)
		__field( void *,	link	)
	),

	TP_fast_assign(
		__entry->req	= req;
		__entry->link	= link;
	),

	TP_printk("request %p, link %p", __entry->req, __entry->link)
);

/**
 * io_uring_complete - called when completing an SQE
 *
 * @ctx:		pointer to a ring context structure
 * @user_data:		user data associated with the request
 * @res:		result of the request
 *
 */
TRACE_EVENT(io_uring_complete,

	TP_PROTO(void *ctx, u64 user_data, long res),

	TP_ARGS(ctx, user_data, res),

	TP_STRUCT__entry (
		__field( void *,	ctx		)
		__field( u64,		user_data	)
		__field( long,		res		)
	),

	TP_fast_assign(
		__entry->ctx		= ctx;
		__entry->user_data	= user_data;
		__entry->res		= res;
	),

	TP_printk("ring %p, user_data 0x%llx, result %ld",
			__entry->ctx, (unsigned long long)__entry->user_data,
			__entry->res)
);


/**
 * io_uring_submit_sqe - called before submitting one SQE
 *
 * @ctx:		pointer to a ring context structure
 * @user_data:		user data associated with the request
 * @force_nonblock:	whether a context blocking or not
 * @sq_thread:		true if sq_thread has submitted this SQE
 *
 * Allows to track SQE submitting, to understand what was the source of it, SQ
 * thread or io_uring_enter call.
 */
TRACE_EVENT(io_uring_submit_sqe,

	TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),

	TP_ARGS(ctx, user_data, force_nonblock, sq_thread),

	TP_STRUCT__entry (
		__field( void *,	ctx		)
		__field( u64,		user_data	)
		__field( bool,		force_nonblock	)
		__field( bool,		sq_thread	)
	),

	TP_fast_assign(
		__entry->ctx		= ctx;
		__entry->user_data	= user_data;
		__entry->force_nonblock	= force_nonblock;
		__entry->sq_thread	= sq_thread;
	),

	TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
			__entry->ctx, (unsigned long long) __entry->user_data,
			__entry->force_nonblock, __entry->sq_thread)
);

#endif /* _TRACE_IO_URING_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
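The TRACE_EVENT() definitions above generate trace_io_uring_*() wrapper functions that fs/io_uring.c fires at the corresponding points; at runtime the events can be enabled under /sys/kernel/debug/tracing/events/io_uring/. A simplified, illustrative call site (not the exact fs/io_uring.c code) might look like this:

	/* Illustrative only: firing the io_uring_create tracepoint defined above.
	 * Exactly one .c file must define CREATE_TRACE_POINTS before including
	 * the header so that the tracepoint bodies are emitted.
	 */
	#include <linux/types.h>

	#define CREATE_TRACE_POINTS
	#include <trace/events/io_uring.h>

	static void demo_report_ring_created(int fd, void *ctx, u32 sq_entries,
					     u32 cq_entries, u32 flags)
	{
		/* No-op unless the io_uring_create event is enabled via tracefs */
		trace_io_uring_create(fd, ctx, sq_entries, cq_entries, flags);
	}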
include/uapi/linux/io_uring.h

@@ -19,7 +19,10 @@ struct io_uring_sqe {
 	__u8	flags;		/* IOSQE_ flags */
 	__u16	ioprio;		/* ioprio for the request */
 	__s32	fd;		/* file descriptor to do IO on */
-	__u64	off;		/* offset into file */
+	union {
+		__u64	off;	/* offset into file */
+		__u64	addr2;
+	};
 	__u64	addr;		/* pointer to buffer or iovecs */
 	__u32	len;		/* buffer size or number of iovecs */
 	union {

@@ -29,6 +32,8 @@ struct io_uring_sqe {
 		__u32		sync_range_flags;
 		__u32		msg_flags;
 		__u32		timeout_flags;
+		__u32		accept_flags;
+		__u32		cancel_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {

@@ -50,6 +55,7 @@ struct io_uring_sqe {
 #define IORING_SETUP_IOPOLL	(1U << 0)	/* io_context is polled */
 #define IORING_SETUP_SQPOLL	(1U << 1)	/* SQ poll thread */
 #define IORING_SETUP_SQ_AFF	(1U << 2)	/* sq_thread_cpu is valid */
+#define IORING_SETUP_CQSIZE	(1U << 3)	/* app defines CQ size */
 
 #define IORING_OP_NOP		0
 #define IORING_OP_READV		1

@@ -63,12 +69,21 @@ struct io_uring_sqe {
 #define IORING_OP_SENDMSG	9
 #define IORING_OP_RECVMSG	10
 #define IORING_OP_TIMEOUT	11
+#define IORING_OP_TIMEOUT_REMOVE	12
+#define IORING_OP_ACCEPT	13
+#define IORING_OP_ASYNC_CANCEL	14
+#define IORING_OP_LINK_TIMEOUT	15
 
 /*
  * sqe->fsync_flags
  */
 #define IORING_FSYNC_DATASYNC	(1U << 0)
 
+/*
+ * sqe->timeout_flags
+ */
+#define IORING_TIMEOUT_ABS	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */

@@ -140,6 +155,7 @@ struct io_uring_params {
  * io_uring_params->features flags
  */
 #define IORING_FEAT_SINGLE_MMAP	(1U << 0)
+#define IORING_FEAT_NODROP	(1U << 1)
 
 /*
  * io_uring_register(2) opcodes and arguments

@@ -150,5 +166,11 @@ struct io_uring_params {
 #define IORING_UNREGISTER_FILES	3
 #define IORING_REGISTER_EVENTFD	4
 #define IORING_UNREGISTER_EVENTFD	5
+#define IORING_REGISTER_FILES_UPDATE	6
+
+struct io_uring_files_update {
+	__u32 offset;
+	__s32 *fds;
+};
 
 #endif
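Two of the userspace-visible additions above, IORING_SETUP_CQSIZE and IORING_FEAT_NODROP, pair naturally: an application can now size the CQ ring independently of the SQ ring and rely on the kernel buffering overflowed completions instead of dropping them. Below is a small sketch using the raw syscall, assuming headers new enough to define __NR_io_uring_setup; liburing's io_uring_queue_init_params() wraps the same call:

	/* Sketch: request a CQ ring four times larger than the SQ ring and
	 * check whether the kernel advertises the new no-drop CQ behaviour.
	 */
	#include <linux/io_uring.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct io_uring_params p;
		int ring_fd;

		memset(&p, 0, sizeof(p));
		p.flags = IORING_SETUP_CQSIZE;
		p.cq_entries = 4096;		/* SQ ring: 1024, CQ ring: 4096 */

		ring_fd = syscall(__NR_io_uring_setup, 1024, &p);
		if (ring_fd < 0) {
			perror("io_uring_setup");
			return 1;
		}

		if (p.features & IORING_FEAT_NODROP)
			printf("overflowed CQEs are buffered, not dropped\n");

		close(ring_fd);
		return 0;
	}

The absolute-timeout support follows the same pattern: an IORING_OP_TIMEOUT SQE with IORING_TIMEOUT_ABS set in sqe->timeout_flags treats the supplied timespec as an absolute CLOCK_MONOTONIC deadline rather than a relative delay.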
init/Kconfig

@@ -1548,6 +1548,7 @@ config AIO
 config IO_URING
 	bool "Enable IO uring support" if EXPERT
 	select ANON_INODES
+	select IO_WQ
 	default y
 	help
 	  This option enables support for the io_uring interface, enabling
kernel/sched/core.c

@@ -16,6 +16,7 @@
 #include <asm/tlb.h>
 
 #include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
 #include "../smpboot.h"
 
 #include "pelt.h"

@@ -4112,9 +4113,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 	 * we disable preemption to avoid it calling schedule() again
 	 * in the possible wakeup of a kworker.
 	 */
-	if (tsk->flags & PF_WQ_WORKER) {
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
 		preempt_disable();
-		wq_worker_sleeping(tsk);
+		if (tsk->flags & PF_WQ_WORKER)
+			wq_worker_sleeping(tsk);
+		else
+			io_wq_worker_sleeping(tsk);
 		preempt_enable_no_resched();
 	}
 

@@ -4131,8 +4135,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 
 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & PF_WQ_WORKER)
-		wq_worker_running(tsk);
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+		if (tsk->flags & PF_WQ_WORKER)
+			wq_worker_running(tsk);
+		else
+			io_wq_worker_running(tsk);
+	}
 }
 
 asmlinkage __visible void __sched schedule(void)
net/socket.c (65 changed lines)

@@ -1691,24 +1691,13 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
 	return __sys_listen(fd, backlog);
 }
 
-/*
- * For accept, we attempt to create a new socket, set up the link
- * with the client, wake up the client, then return the new
- * connected fd. We collect the address of the connector in kernel
- * space and move it to user at the very end. This is unclean because
- * we open the socket then return an error.
- *
- * 1003.1g adds the ability to recvmsg() to query connection pending
- * status to recvmsg. We need to add that support in a way thats
- * clean when we restructure accept also.
- */
-
-int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
-		  int __user *upeer_addrlen, int flags)
+int __sys_accept4_file(struct file *file, unsigned file_flags,
+		       struct sockaddr __user *upeer_sockaddr,
+		       int __user *upeer_addrlen, int flags)
 {
 	struct socket *sock, *newsock;
 	struct file *newfile;
-	int err, len, newfd, fput_needed;
+	int err, len, newfd;
 	struct sockaddr_storage address;
 
 	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))

@@ -1717,14 +1706,14 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
 		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
 
-	sock = sockfd_lookup_light(fd, &err, &fput_needed);
+	sock = sock_from_file(file, &err);
 	if (!sock)
 		goto out;
 
 	err = -ENFILE;
 	newsock = sock_alloc();
 	if (!newsock)
-		goto out_put;
+		goto out;
 
 	newsock->type = sock->type;
 	newsock->ops = sock->ops;

@@ -1739,20 +1728,21 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 	if (unlikely(newfd < 0)) {
 		err = newfd;
 		sock_release(newsock);
-		goto out_put;
+		goto out;
 	}
 	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
 	if (IS_ERR(newfile)) {
 		err = PTR_ERR(newfile);
 		put_unused_fd(newfd);
-		goto out_put;
+		goto out;
 	}
 
 	err = security_socket_accept(sock, newsock);
 	if (err)
 		goto out_fd;
 
-	err = sock->ops->accept(sock, newsock, sock->file->f_flags, false);
+	err = sock->ops->accept(sock, newsock, sock->file->f_flags | file_flags,
+				false);
 	if (err < 0)
 		goto out_fd;
 

@@ -1773,15 +1763,42 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 
 	fd_install(newfd, newfile);
 	err = newfd;
 
-out_put:
-	fput_light(sock->file, fput_needed);
 out:
 	return err;
 out_fd:
 	fput(newfile);
 	put_unused_fd(newfd);
-	goto out_put;
+	goto out;
 }
 
+/*
+ * For accept, we attempt to create a new socket, set up the link
+ * with the client, wake up the client, then return the new
+ * connected fd. We collect the address of the connector in kernel
+ * space and move it to user at the very end. This is unclean because
+ * we open the socket then return an error.
+ *
+ * 1003.1g adds the ability to recvmsg() to query connection pending
+ * status to recvmsg. We need to add that support in a way thats
+ * clean when we restructure accept also.
+ */
+
+int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
+		  int __user *upeer_addrlen, int flags)
+{
+	int ret = -EBADF;
+	struct fd f;
+
+	f = fdget(fd);
+	if (f.file) {
+		ret = __sys_accept4_file(f.file, 0, upeer_sockaddr,
+					 upeer_addrlen, flags);
+		if (f.flags)
+			fput(f.file);
+	}
+
+	return ret;
+}
+
 SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
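The point of __sys_accept4_file() is to let a caller that already holds a struct file, such as io_uring's IORING_OP_ACCEPT handler, accept a connection without an fd lookup, and to force per-call O_NONBLOCK behaviour through file_flags. The sketch below is illustrative only and is not the actual io_accept() code in fs/io_uring.c; it assumes the SQE layout added earlier in this diff (addr/addr2 carry the sockaddr and addrlen pointers, accept_flags the accept4 flags):

	/* Hypothetical io_uring-style opcode handler built on the new helper. */
	#include <linux/fs.h>
	#include <linux/kernel.h>
	#include <linux/socket.h>
	#include <uapi/linux/io_uring.h>

	static int demo_io_accept(struct file *file, const struct io_uring_sqe *sqe,
				  bool force_nonblock)
	{
		struct sockaddr __user *addr = u64_to_user_ptr(sqe->addr);
		int __user *addr_len = u64_to_user_ptr(sqe->addr2);
		unsigned file_flags = force_nonblock ? O_NONBLOCK : 0;
		int ret;

		ret = __sys_accept4_file(file, file_flags, addr, addr_len,
					 sqe->accept_flags);
		if (ret == -EAGAIN && force_nonblock)
			return -EAGAIN;	/* punt to io-wq for a blocking retry */
		return ret;
	}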