
aio-posix: add aio_add_sqe() API for user-defined io_uring requests

Introduce the aio_add_sqe() API for submitting io_uring requests in the
current AioContext. This allows other components in QEMU, like the block
layer, to take advantage of io_uring features without creating their own
io_uring context.

This API supports nested event loops just like file descriptor
monitoring and BHs do. This comes at a complexity cost: CQE callbacks
must be placed on a list so that nested event loops can invoke pending
CQE callbacks from parent event loops. If you're wondering why
CqeHandler exists instead of just a callback function pointer, this is
why.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-ID: <20251104022933.618123-14-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
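For illustration, a minimal sketch (not part of this commit) of how a caller might submit a read request with the new API. The MyRequest type and the my_prep_read(), my_submit_read(), and my_cqe_cb() functions are hypothetical names invented for this example; the completion side, my_cqe_cb(), is sketched after the include/block/aio.h hunks below.

/* Hypothetical request type that embeds its unique CqeHandler */
typedef struct {
    CqeHandler cqe_handler; /* must stay alive until ->cb() is invoked */
    int fd;
    void *buf;
    size_t len;
    uint64_t offset;
} MyRequest;

static void my_cqe_cb(CqeHandler *cqe_handler); /* see completion sketch below */

/* Invoked by aio_add_sqe() with an sqe to fill in */
static void my_prep_read(struct io_uring_sqe *sqe, void *opaque)
{
    MyRequest *req = opaque;

    io_uring_prep_read(sqe, req->fd, req->buf, req->len, req->offset);
    /* do not call io_uring_sqe_set_data(); the AioContext owns user_data */
}

static void my_submit_read(MyRequest *req)
{
    assert(aio_has_io_uring()); /* callers must check availability first */

    req->cqe_handler.cb = my_cqe_cb;
    aio_add_sqe(my_prep_read, req, &req->cqe_handler);
}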
Stefan Hajnoczi authored 5 months ago, committed by Kevin Wolf
commit 1eebdab3c3
 include/block/aio.h   |  83
 util/aio-posix.c      |   9
 util/aio-posix.h      |   1
 util/fdmon-io_uring.c | 109
 util/trace-events     |   4

include/block/aio.h

@@ -61,6 +61,27 @@ typedef struct LuringState LuringState;
 /* Is polling disabled? */
 bool aio_poll_disabled(AioContext *ctx);
 
+#ifdef CONFIG_LINUX_IO_URING
+/*
+ * Each io_uring request must have a unique CqeHandler that processes the cqe.
+ * The lifetime of a CqeHandler must be at least from aio_add_sqe() until
+ * ->cb() invocation.
+ */
+typedef struct CqeHandler CqeHandler;
+struct CqeHandler {
+    /* Called by the AioContext when the request has completed */
+    void (*cb)(CqeHandler *handler);
+
+    /* Used internally, do not access this */
+    QSIMPLEQ_ENTRY(CqeHandler) next;
+
+    /* This field is filled in before ->cb() is called */
+    struct io_uring_cqe cqe;
+};
+
+typedef QSIMPLEQ_HEAD(, CqeHandler) CqeHandlerSimpleQ;
+#endif /* CONFIG_LINUX_IO_URING */
+
 /* Callbacks for file descriptor monitoring implementations */
 typedef struct {
     /*
@@ -157,6 +178,27 @@ typedef struct {
      * Called with list_lock incremented.
      */
     void (*gsource_dispatch)(AioContext *ctx, AioHandlerList *ready_list);
+
+#ifdef CONFIG_LINUX_IO_URING
+    /**
+     * add_sqe: Add an io_uring sqe for submission.
+     * @prep_sqe: invoked with an sqe that should be prepared for submission
+     * @opaque: user-defined argument to @prep_sqe()
+     * @cqe_handler: the unique cqe handler associated with this request
+     *
+     * The caller's @prep_sqe() function is invoked to fill in the details of
+     * the sqe. Do not call io_uring_sqe_set_data() on this sqe.
+     *
+     * The kernel may see the sqe as soon as @prep_sqe() returns or it may take
+     * until the next event loop iteration.
+     *
+     * This function is called from the current AioContext and is not
+     * thread-safe.
+     */
+    void (*add_sqe)(AioContext *ctx,
+                    void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+                    void *opaque, CqeHandler *cqe_handler);
+#endif /* CONFIG_LINUX_IO_URING */
 } FDMonOps;
 
 /*
@@ -274,7 +316,10 @@ struct AioContext {
     struct io_uring fdmon_io_uring;
     AioHandlerSList submit_list;
     void *io_uring_fd_tag;
-#endif
+
+    /* Pending callback state for cqe handlers */
+    CqeHandlerSimpleQ cqe_handler_ready_list;
+#endif /* CONFIG_LINUX_IO_URING */
 
     /* TimerLists for calling timers - one per clock type. Has its own
      * locking.
@@ -782,4 +827,40 @@ void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch);
  */
 void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
                                         int64_t max, Error **errp);
+
+#ifdef CONFIG_LINUX_IO_URING
+/**
+ * aio_has_io_uring: Return whether io_uring is available.
+ *
+ * io_uring is either available in all AioContexts or in none, so this only
+ * needs to be called once from within any thread's AioContext.
+ */
+static inline bool aio_has_io_uring(void)
+{
+    AioContext *ctx = qemu_get_current_aio_context();
+    return ctx->fdmon_ops->add_sqe;
+}
+
+/**
+ * aio_add_sqe: Add an io_uring sqe for submission.
+ * @prep_sqe: invoked with an sqe that should be prepared for submission
+ * @opaque: user-defined argument to @prep_sqe()
+ * @cqe_handler: the unique cqe handler associated with this request
+ *
+ * The caller's @prep_sqe() function is invoked to fill in the details of the
+ * sqe. Do not call io_uring_sqe_set_data() on this sqe.
+ *
+ * The sqe is submitted by the current AioContext. The kernel may see the sqe
+ * as soon as @prep_sqe() returns or it may take until the next event loop
+ * iteration.
+ *
+ * When the AioContext is destroyed, pending sqes are ignored and their
+ * CqeHandlers are not invoked.
+ *
+ * This function must be called only when aio_has_io_uring() returns true.
+ */
+void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+                 void *opaque, CqeHandler *cqe_handler);
+#endif /* CONFIG_LINUX_IO_URING */
+
 #endif
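Continuing the hypothetical example from the commit message above: the handler's cqe field is filled in before ->cb() is invoked, so a completion callback can read cqe.res for the result and recover its enclosing request with container_of(). A hedged sketch, assuming the MyRequest type from the submission example:

/* Hypothetical completion callback for the MyRequest submission sketch */
static void my_cqe_cb(CqeHandler *cqe_handler)
{
    MyRequest *req = container_of(cqe_handler, MyRequest, cqe_handler);
    int res = cqe_handler->cqe.res; /* filled in before ->cb() was called */

    if (res < 0) {
        /* io_uring reports errors as negative errno values */
        error_report("read failed: %s", strerror(-res));
    }

    /* the CqeHandler lifetime requirement ends here; req may be freed */
    g_free(req);
}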

util/aio-posix.c

@@ -806,3 +806,12 @@ void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch)
 
     aio_notify(ctx);
 }
+
+#ifdef CONFIG_LINUX_IO_URING
+void aio_add_sqe(void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+                 void *opaque, CqeHandler *cqe_handler)
+{
+    AioContext *ctx = qemu_get_current_aio_context();
+    ctx->fdmon_ops->add_sqe(ctx, prep_sqe, opaque, cqe_handler);
+}
+#endif /* CONFIG_LINUX_IO_URING */

util/aio-posix.h

@@ -36,6 +36,7 @@ struct AioHandler {
 #ifdef CONFIG_LINUX_IO_URING
     QSLIST_ENTRY(AioHandler) node_submitted;
     unsigned flags; /* see fdmon-io_uring.c */
+    CqeHandler internal_cqe_handler; /* used for POLL_ADD/POLL_REMOVE */
 #endif
     int64_t poll_idle_timeout; /* when to stop userspace polling */
     bool poll_ready; /* has polling detected an event? */

util/fdmon-io_uring.c

@@ -46,8 +46,10 @@
 #include "qemu/osdep.h"
 #include <poll.h>
 #include "qapi/error.h"
+#include "qemu/defer-call.h"
 #include "qemu/rcu_queue.h"
 #include "aio-posix.h"
+#include "trace.h"
 
 enum {
     FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
@@ -76,8 +78,8 @@ static inline int pfd_events_from_poll(int poll_events)
 }
 
 /*
- * Returns an sqe for submitting a request. Only be called within
- * fdmon_io_uring_wait().
+ * Returns an sqe for submitting a request. Only called from the AioContext
+ * thread.
  */
 static struct io_uring_sqe *get_sqe(AioContext *ctx)
 {
@@ -168,23 +170,46 @@ static void fdmon_io_uring_update(AioContext *ctx,
     }
 }
 
+static void fdmon_io_uring_add_sqe(AioContext *ctx,
+        void (*prep_sqe)(struct io_uring_sqe *sqe, void *opaque),
+        void *opaque, CqeHandler *cqe_handler)
+{
+    struct io_uring_sqe *sqe = get_sqe(ctx);
+
+    prep_sqe(sqe, opaque);
+    io_uring_sqe_set_data(sqe, cqe_handler);
+    trace_fdmon_io_uring_add_sqe(ctx, opaque, sqe->opcode, sqe->fd, sqe->off,
+                                 cqe_handler);
+}
+
+static void fdmon_special_cqe_handler(CqeHandler *cqe_handler)
+{
+    /*
+     * This is an empty function that is never called. It is used as a function
+     * pointer to distinguish it from ordinary cqe handlers.
+     */
+}
+
 static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
 {
     struct io_uring_sqe *sqe = get_sqe(ctx);
     int events = poll_events_from_pfd(node->pfd.events);
 
     io_uring_prep_poll_add(sqe, node->pfd.fd, events);
-    io_uring_sqe_set_data(sqe, node);
+
+    node->internal_cqe_handler.cb = fdmon_special_cqe_handler;
+    io_uring_sqe_set_data(sqe, &node->internal_cqe_handler);
 }
 
 static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
 {
     struct io_uring_sqe *sqe = get_sqe(ctx);
+    CqeHandler *cqe_handler = &node->internal_cqe_handler;
 
 #ifdef LIBURING_HAVE_DATA64
-    io_uring_prep_poll_remove(sqe, (uintptr_t)node);
+    io_uring_prep_poll_remove(sqe, (uintptr_t)cqe_handler);
 #else
-    io_uring_prep_poll_remove(sqe, node);
+    io_uring_prep_poll_remove(sqe, cqe_handler);
 #endif
     io_uring_sqe_set_data(sqe, NULL);
 }
@@ -219,19 +244,13 @@ static void fill_sq_ring(AioContext *ctx)
     }
 }
 
-/* Returns true if a handler became ready */
-static bool process_cqe(AioContext *ctx,
-                        AioHandlerList *ready_list,
-                        struct io_uring_cqe *cqe)
+static bool process_cqe_aio_handler(AioContext *ctx,
+                                    AioHandlerList *ready_list,
+                                    AioHandler *node,
+                                    struct io_uring_cqe *cqe)
 {
-    AioHandler *node = io_uring_cqe_get_data(cqe);
     unsigned flags;
 
-    /* poll_timeout and poll_remove have a zero user_data field */
-    if (!node) {
-        return false;
-    }
-
     /*
      * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
      * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
@@ -255,6 +274,35 @@ static bool process_cqe(AioContext *ctx,
     return true;
 }
 
+/* Returns true if a handler became ready */
+static bool process_cqe(AioContext *ctx,
+                        AioHandlerList *ready_list,
+                        struct io_uring_cqe *cqe)
+{
+    CqeHandler *cqe_handler = io_uring_cqe_get_data(cqe);
+
+    /* poll_timeout and poll_remove have a zero user_data field */
+    if (!cqe_handler) {
+        return false;
+    }
+
+    /*
+     * Special handling for AioHandler cqes. They need ready_list and have a
+     * return value.
+     */
+    if (cqe_handler->cb == fdmon_special_cqe_handler) {
+        AioHandler *node = container_of(cqe_handler, AioHandler,
+                                        internal_cqe_handler);
+        return process_cqe_aio_handler(ctx, ready_list, node, cqe);
+    }
+
+    cqe_handler->cqe = *cqe;
+
+    /* Handlers are invoked later by fdmon_io_uring_dispatch() */
+    QSIMPLEQ_INSERT_TAIL(&ctx->cqe_handler_ready_list, cqe_handler, next);
+    return false;
+}
+
 static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
 {
     struct io_uring *ring = &ctx->fdmon_io_uring;
@@ -299,6 +347,32 @@ static bool fdmon_io_uring_gsource_check(AioContext *ctx)
     return g_source_query_unix_fd(&ctx->source, tag) & G_IO_IN;
 }
 
+/* Dispatch CQE handlers that are ready */
+static bool fdmon_io_uring_dispatch(AioContext *ctx)
+{
+    CqeHandlerSimpleQ *ready_list = &ctx->cqe_handler_ready_list;
+    bool progress = false;
+
+    /* Handlers may use defer_call() to coalesce frequent operations */
+    defer_call_begin();
+
+    while (!QSIMPLEQ_EMPTY(ready_list)) {
+        CqeHandler *cqe_handler = QSIMPLEQ_FIRST(ready_list);
+
+        QSIMPLEQ_REMOVE_HEAD(ready_list, next);
+
+        trace_fdmon_io_uring_cqe_handler(ctx, cqe_handler,
+                                         cqe_handler->cqe.res);
+
+        cqe_handler->cb(cqe_handler);
+        progress = true;
+    }
+
+    defer_call_end();
+
+    return progress;
+}
+
 /* This is where CQEs are processed in the glib event loop */
 static void fdmon_io_uring_gsource_dispatch(AioContext *ctx,
                                             AioHandlerList *ready_list)
@@ -371,9 +445,11 @@ static const FDMonOps fdmon_io_uring_ops = {
     .update = fdmon_io_uring_update,
     .wait = fdmon_io_uring_wait,
     .need_wait = fdmon_io_uring_need_wait,
+    .dispatch = fdmon_io_uring_dispatch,
     .gsource_prepare = fdmon_io_uring_gsource_prepare,
     .gsource_check = fdmon_io_uring_gsource_check,
     .gsource_dispatch = fdmon_io_uring_gsource_dispatch,
+    .add_sqe = fdmon_io_uring_add_sqe,
 };
 
 bool fdmon_io_uring_setup(AioContext *ctx, Error **errp)
bool fdmon_io_uring_setup(AioContext *ctx, Error **errp)
@@ -389,6 +465,7 @@ bool fdmon_io_uring_setup(AioContext *ctx, Error **errp)
     }
 
     QSLIST_INIT(&ctx->submit_list);
+    QSIMPLEQ_INIT(&ctx->cqe_handler_ready_list);
     ctx->fdmon_ops = &fdmon_io_uring_ops;
     ctx->io_uring_fd_tag = g_source_add_unix_fd(&ctx->source,
             ctx->fdmon_io_uring.ring_fd, G_IO_IN);
@@ -425,6 +502,8 @@ void fdmon_io_uring_destroy(AioContext *ctx)
         g_source_remove_unix_fd(&ctx->source, ctx->io_uring_fd_tag);
         ctx->io_uring_fd_tag = NULL;
 
+        assert(QSIMPLEQ_EMPTY(&ctx->cqe_handler_ready_list));
+
         qemu_lockcnt_lock(&ctx->list_lock);
         fdmon_poll_downgrade(ctx);
         qemu_lockcnt_unlock(&ctx->list_lock);

util/trace-events

@@ -24,6 +24,10 @@ buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes
 buffer_move(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
 buffer_free(const char *buf, size_t len) "%s: capacity %zd"
 
+# fdmon-io_uring.c
+fdmon_io_uring_add_sqe(void *ctx, void *opaque, int opcode, int fd, uint64_t off, void *cqe_handler) "ctx %p opaque %p opcode %d fd %d off %"PRId64" cqe_handler %p"
+fdmon_io_uring_cqe_handler(void *ctx, void *cqe_handler, int cqe_res) "ctx %p cqe_handler %p cqe_res %d"
+
 # filemonitor-inotify.c
 qemu_file_monitor_add_watch(void *mon, const char *dirpath, const char *filename, void *cb, void *opaque, int64_t id) "File monitor %p add watch dir='%s' file='%s' cb=%p opaque=%p id=%" PRId64
 qemu_file_monitor_remove_watch(void *mon, const char *dirpath, int64_t id) "File monitor %p remove watch dir='%s' id=%" PRId64
