From 0c330a734b51c177ab8488932ac3b0c4d63a718a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:19 +0100 Subject: aio: introduce aio_co_schedule and aio_co_wake aio_co_wake provides the infrastructure to start a coroutine on a "home" AioContext. It will be used by CoMutex and CoQueue, so that coroutines don't jump from one context to another when they go to sleep on a mutex or waitqueue. However, it can also be used as a more efficient alternative to one-shot bottom halves, and saves the effort of tracking which AioContext a coroutine is running on. aio_co_schedule is the part of aio_co_wake that starts a coroutine on a remote AioContext, but it is also useful to implement e.g. bdrv_set_aio_context callbacks. The implementation of aio_co_schedule is based on a lock-free multiple-producer, single-consumer queue. The multiple producers use cmpxchg to add to a LIFO stack. The consumer (a per-AioContext bottom half) grabs all items added so far, inverts the list to make it FIFO, and goes through it one item at a time until it's empty. The data structure was inspired by OSv, which uses it in the very code we'll "port" to QEMU for the thread-safe CoMutex. Most of the new code is really tests. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213135235.12274-3-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/block/aio.h | 32 ++++++++++++++++++++++++++++++++ include/qemu/coroutine_int.h | 11 ++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/block/aio.h b/include/block/aio.h index 7df271d2b9..614cbc6982 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -47,6 +47,7 @@ typedef void QEMUBHFunc(void *opaque); typedef bool AioPollFn(void *opaque); typedef void IOHandler(void *opaque); +struct Coroutine; struct ThreadPool; struct LinuxAioState; @@ -108,6 +109,9 @@ struct AioContext { bool notified; EventNotifier notifier; + QSLIST_HEAD(, Coroutine) scheduled_coroutines; + QEMUBH *co_schedule_bh; + /* Thread pool for performing work and receiving completion callbacks. * Has its own locking. */ @@ -482,6 +486,34 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external) return !is_external || !atomic_read(&ctx->external_disable_cnt); } +/** + * aio_co_schedule: + * @ctx: the aio context + * @co: the coroutine + * + * Start a coroutine on a remote AioContext. + * + * The coroutine must not be entered by anyone else while aio_co_schedule() + * is active. In addition the coroutine must have yielded unless ctx + * is the context in which the coroutine is running (i.e. the value of + * qemu_get_current_aio_context() from the coroutine itself). + */ +void aio_co_schedule(AioContext *ctx, struct Coroutine *co); + +/** + * aio_co_wake: + * @co: the coroutine + * + * Restart a coroutine on the AioContext where it was running last, thus + * preventing coroutines from jumping from one context to another when they + * go to sleep. + * + * aio_co_wake may be executed either in coroutine or non-coroutine + * context. The coroutine must not be entered by anyone else while + * aio_co_wake() is active. + */ +void aio_co_wake(struct Coroutine *co); + /** * Return the AioContext whose event loop runs in the current thread.
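
The queue described in the commit message above can be sketched in isolation. The following is a simplified illustration of the pattern only, not QEMU's implementation (which stores Coroutines on a QSLIST); the Node type is a stand-in, and the atomic_* helpers are assumed to come from qemu/atomic.h:

    /* Illustration only: lock-free multiple-producer, single-consumer
     * stack. Producers push with cmpxchg; the single consumer detaches
     * the whole stack at once and reverses it into FIFO order. */
    typedef struct Node { struct Node *next; } Node;

    static Node *head;                      /* shared LIFO stack */

    static void push(Node *n)               /* many producers */
    {
        Node *old;
        do {
            old = atomic_read(&head);
            n->next = old;
        } while (atomic_cmpxchg(&head, old, n) != old);
    }

    static Node *pop_all_fifo(void)         /* single consumer (the BH) */
    {
        Node *lifo = atomic_xchg(&head, NULL);  /* grab everything */
        Node *fifo = NULL;

        while (lifo) {                      /* reverse LIFO into FIFO */
            Node *next = lifo->next;
            lifo->next = fifo;
            fifo = lifo;
            lifo = next;
        }
        return fifo;
    }

In these terms, aio_co_schedule() plays the producer role and can therefore be called from any thread, while the per-AioContext bottom half is the single consumer that enters the scheduled coroutines in FIFO order.
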
* diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h index 14d4f1d1f2..cb98892bba 100644 --- a/include/qemu/coroutine_int.h +++ b/include/qemu/coroutine_int.h @@ -40,12 +40,21 @@ struct Coroutine { CoroutineEntry *entry; void *entry_arg; Coroutine *caller; + + /* Only used when the coroutine has terminated. */ QSLIST_ENTRY(Coroutine) pool_next; + size_t locks_held; - /* Coroutines that should be woken up when we yield or terminate */ + /* Coroutines that should be woken up when we yield or terminate. + * Only used when the coroutine is running. + */ QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup; + + /* Only used when the coroutine has yielded. */ + AioContext *ctx; QSIMPLEQ_ENTRY(Coroutine) co_queue_next; + QSLIST_ENTRY(Coroutine) co_scheduled_next; }; Coroutine *qemu_coroutine_new(void); -- cgit v1.2.3-55-g7522 From bf88c1247f80ac6d62710d5d0d0d9ce3a53e99ec Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:22 +0100 Subject: io: add methods to set I/O handlers on AioContext This is in preparation for making qio_channel_yield work on AioContexts other than the main one. Reviewed-by: Daniel P. Berrange Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213135235.12274-6-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/io/channel.h | 25 +++++++++++++++++++++++++ io/channel-command.c | 13 +++++++++++++ io/channel-file.c | 11 +++++++++++ io/channel-socket.c | 16 +++++++++++----- io/channel-tls.c | 12 ++++++++++++ io/channel-watch.c | 6 ++++++ io/channel.c | 11 +++++++++++ 7 files changed, 89 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/io/channel.h b/include/io/channel.h index 32a9470794..0bc7c3f47f 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -23,6 +23,7 @@ #include "qemu-common.h" #include "qom/object.h" +#include "block/aio.h" #define TYPE_QIO_CHANNEL "qio-channel" #define QIO_CHANNEL(obj) \ @@ -132,6 +133,11 @@ struct QIOChannelClass { off_t offset, int whence, Error **errp); + void (*io_set_aio_fd_handler)(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque); }; /* General I/O handling functions */ @@ -525,4 +531,23 @@ void qio_channel_yield(QIOChannel *ioc, void qio_channel_wait(QIOChannel *ioc, GIOCondition condition); +/** + * qio_channel_set_aio_fd_handler: + * @ioc: the channel object + * @ctx: the AioContext to set the handlers on + * @io_read: the read handler + * @io_write: the write handler + * @opaque: the opaque value passed to the handler + * + * This is used internally by qio_channel_yield(). It can + * be used by channel implementations to forward the handlers + * to another channel (e.g. from #QIOChannelTLS to the + * underlying socket). 
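
For orientation, the consumer that these per-channel handlers ultimately serve is qio_channel_yield(), whose declaration appears in the hunk above. A coroutine typically uses it roughly as in this hedged sketch: read_some() is a made-up helper, while qio_channel_read(), QIO_CHANNEL_ERR_BLOCK and G_IO_IN are existing parts of the API:

    /* Sketch: read from a channel inside a coroutine, yielding instead
     * of blocking the thread when no data is available. */
    static ssize_t coroutine_fn read_some(QIOChannel *ioc,
                                          char *buf, size_t len,
                                          Error **errp)
    {
        for (;;) {
            ssize_t n = qio_channel_read(ioc, buf, len, errp);
            if (n == QIO_CHANNEL_ERR_BLOCK) {
                /* Suspend until the channel is readable; after the
                 * following patches this registers handlers on the
                 * channel's AioContext. */
                qio_channel_yield(ioc, G_IO_IN);
                continue;
            }
            return n;   /* >= 0 on success or EOF, negative on error */
        }
    }
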
+ */ +void qio_channel_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque); + #endif /* QIO_CHANNEL_H */ diff --git a/io/channel-command.c b/io/channel-command.c index ad25313be1..319c5ed50c 100644 --- a/io/channel-command.c +++ b/io/channel-command.c @@ -328,6 +328,18 @@ static int qio_channel_command_close(QIOChannel *ioc, } +static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc); + aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque); + aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque); +} + + static GSource *qio_channel_command_create_watch(QIOChannel *ioc, GIOCondition condition) { @@ -349,6 +361,7 @@ static void qio_channel_command_class_init(ObjectClass *klass, ioc_klass->io_set_blocking = qio_channel_command_set_blocking; ioc_klass->io_close = qio_channel_command_close; ioc_klass->io_create_watch = qio_channel_command_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler; } static const TypeInfo qio_channel_command_info = { diff --git a/io/channel-file.c b/io/channel-file.c index e1da2435e6..b383273201 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -186,6 +186,16 @@ static int qio_channel_file_close(QIOChannel *ioc, } +static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); + aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque); +} + static GSource *qio_channel_file_create_watch(QIOChannel *ioc, GIOCondition condition) { @@ -206,6 +216,7 @@ static void qio_channel_file_class_init(ObjectClass *klass, ioc_klass->io_seek = qio_channel_file_seek; ioc_klass->io_close = qio_channel_file_close; ioc_klass->io_create_watch = qio_channel_file_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler; } static const TypeInfo qio_channel_file_info = { diff --git a/io/channel-socket.c b/io/channel-socket.c index f385233f18..f546c6830e 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -649,11 +649,6 @@ qio_channel_socket_set_blocking(QIOChannel *ioc, qemu_set_block(sioc->fd); } else { qemu_set_nonblock(sioc->fd); -#ifdef WIN32 - WSAEventSelect(sioc->fd, ioc->event, - FD_READ | FD_ACCEPT | FD_CLOSE | - FD_CONNECT | FD_WRITE | FD_OOB); -#endif } return 0; } @@ -733,6 +728,16 @@ qio_channel_socket_shutdown(QIOChannel *ioc, return 0; } +static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); + aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque); +} + static GSource *qio_channel_socket_create_watch(QIOChannel *ioc, GIOCondition condition) { @@ -755,6 +760,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_set_cork = qio_channel_socket_set_cork; ioc_klass->io_set_delay = qio_channel_socket_set_delay; ioc_klass->io_create_watch = qio_channel_socket_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler; } static const TypeInfo qio_channel_socket_info = { diff --git a/io/channel-tls.c b/io/channel-tls.c index f25ab0ae53..6182702dab 100644 --- a/io/channel-tls.c +++ 
b/io/channel-tls.c @@ -345,6 +345,17 @@ static int qio_channel_tls_close(QIOChannel *ioc, return qio_channel_close(tioc->master, errp); } +static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc); + + qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque); +} + static GSource *qio_channel_tls_create_watch(QIOChannel *ioc, GIOCondition condition) { @@ -372,6 +383,7 @@ static void qio_channel_tls_class_init(ObjectClass *klass, ioc_klass->io_close = qio_channel_tls_close; ioc_klass->io_shutdown = qio_channel_tls_shutdown; ioc_klass->io_create_watch = qio_channel_tls_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler; } static const TypeInfo qio_channel_tls_info = { diff --git a/io/channel-watch.c b/io/channel-watch.c index cf1cdff896..8640d1c464 100644 --- a/io/channel-watch.c +++ b/io/channel-watch.c @@ -285,6 +285,12 @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc, GSource *source; QIOChannelSocketSource *ssource; +#ifdef WIN32 + WSAEventSelect(socket, ioc->event, + FD_READ | FD_ACCEPT | FD_CLOSE | + FD_CONNECT | FD_WRITE | FD_OOB); +#endif + source = g_source_new(&qio_channel_socket_source_funcs, sizeof(QIOChannelSocketSource)); ssource = (QIOChannelSocketSource *)source; diff --git a/io/channel.c b/io/channel.c index 80924c1772..ce470d7c5d 100644 --- a/io/channel.c +++ b/io/channel.c @@ -154,6 +154,17 @@ GSource *qio_channel_create_watch(QIOChannel *ioc, } +void qio_channel_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque); +} + guint qio_channel_add_watch(QIOChannel *ioc, GIOCondition condition, QIOChannelFunc func, -- cgit v1.2.3-55-g7522 From c4c497d27f0be8552ae244e76ba2bce66bd2443e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:23 +0100 Subject: io: make qio_channel_yield aware of AioContexts Support separate coroutines for reading and writing, and place the read/write handlers on the AioContext that the QIOChannel is registered with. Reviewed-by: Daniel P. 
Berrange Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213135235.12274-7-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/io/channel.h | 47 ++++++++++++++++++++++++++-- io/channel.c | 86 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 109 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/io/channel.h b/include/io/channel.h index 0bc7c3f47f..5d48906998 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -23,6 +23,7 @@ #include "qemu-common.h" #include "qom/object.h" +#include "qemu/coroutine.h" #include "block/aio.h" #define TYPE_QIO_CHANNEL "qio-channel" @@ -81,6 +82,9 @@ struct QIOChannel { Object parent; unsigned int features; /* bitmask of QIOChannelFeatures */ char *name; + AioContext *ctx; + Coroutine *read_coroutine; + Coroutine *write_coroutine; #ifdef _WIN32 HANDLE event; /* For use with GSource on Win32 */ #endif @@ -502,14 +506,51 @@ guint qio_channel_add_watch(QIOChannel *ioc, GDestroyNotify notify); +/** + * qio_channel_attach_aio_context: + * @ioc: the channel object + * @ctx: the #AioContext to set the handlers on + * + * Request that qio_channel_yield() sets I/O handlers on + * the given #AioContext. If @ctx is %NULL, qio_channel_yield() + * uses QEMU's main thread event loop. + * + * You can move a #QIOChannel from one #AioContext to another even if + * I/O handlers are set for a coroutine. However, #QIOChannel provides + * no synchronization between the calls to qio_channel_yield() and + * qio_channel_attach_aio_context(). + * + * Therefore you should first call qio_channel_detach_aio_context() + * to ensure that the coroutine is not entered concurrently. Then, + * while the coroutine has yielded, call qio_channel_attach_aio_context(), + * and then aio_co_schedule() to place the coroutine on the new + * #AioContext. The calls to qio_channel_detach_aio_context() + * and qio_channel_attach_aio_context() should be protected with + * aio_context_acquire() and aio_context_release(). + */ +void qio_channel_attach_aio_context(QIOChannel *ioc, + AioContext *ctx); + +/** + * qio_channel_detach_aio_context: + * @ioc: the channel object + * + * Disable any I/O handlers set by qio_channel_yield(). With the + * help of aio_co_schedule(), this allows moving a coroutine that was + * paused by qio_channel_yield() to another context. + */ +void qio_channel_detach_aio_context(QIOChannel *ioc); + /** * qio_channel_yield: + * @ioc: the channel object + * @condition: the I/O condition to wait for + * - * Yields execution from the current coroutine until - * the condition indicated by @condition becomes - * available. + * Yields execution from the current coroutine until the condition + * indicated by @condition becomes available. @condition must + * be either %G_IO_IN or %G_IO_OUT; it cannot contain both. In + * addition, no two coroutines can be waiting on the same condition + * and channel at the same time.
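
Following the recipe in qio_channel_attach_aio_context()'s documentation above, the hand-over of a channel (and of the coroutine parked in qio_channel_yield()) to another context might look like the sketch below; ioc, old_ctx, new_ctx and co are assumed to be tracked by the caller, with co saved before detaching:

    /* Detach from the old context while holding its lock, so that the
     * coroutine can no longer be entered behind our back... */
    aio_context_acquire(old_ctx);
    qio_channel_detach_aio_context(ioc);
    aio_context_release(old_ctx);

    /* ...then attach to the new context and reschedule the coroutine. */
    aio_context_acquire(new_ctx);
    qio_channel_attach_aio_context(ioc, new_ctx);
    aio_co_schedule(new_ctx, co);   /* resume it on the new context */
    aio_context_release(new_ctx);
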
* * This must only be called from coroutine context */ diff --git a/io/channel.c b/io/channel.c index ce470d7c5d..cdf74540c1 100644 --- a/io/channel.c +++ b/io/channel.c @@ -21,7 +21,7 @@ #include "qemu/osdep.h" #include "io/channel.h" #include "qapi/error.h" -#include "qemu/coroutine.h" +#include "qemu/main-loop.h" bool qio_channel_has_feature(QIOChannel *ioc, QIOChannelFeature feature) @@ -238,36 +238,80 @@ off_t qio_channel_io_seek(QIOChannel *ioc, } -typedef struct QIOChannelYieldData QIOChannelYieldData; -struct QIOChannelYieldData { - QIOChannel *ioc; - Coroutine *co; -}; +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc); + +static void qio_channel_restart_read(void *opaque) +{ + QIOChannel *ioc = opaque; + Coroutine *co = ioc->read_coroutine; + ioc->read_coroutine = NULL; + qio_channel_set_aio_fd_handlers(ioc); + aio_co_wake(co); +} -static gboolean qio_channel_yield_enter(QIOChannel *ioc, - GIOCondition condition, - gpointer opaque) +static void qio_channel_restart_write(void *opaque) { - QIOChannelYieldData *data = opaque; - qemu_coroutine_enter(data->co); - return FALSE; + QIOChannel *ioc = opaque; + Coroutine *co = ioc->write_coroutine; + + ioc->write_coroutine = NULL; + qio_channel_set_aio_fd_handlers(ioc); + aio_co_wake(co); } +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc) +{ + IOHandler *rd_handler = NULL, *wr_handler = NULL; + AioContext *ctx; + + if (ioc->read_coroutine) { + rd_handler = qio_channel_restart_read; + } + if (ioc->write_coroutine) { + wr_handler = qio_channel_restart_write; + } + + ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context(); + qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc); +} + +void qio_channel_attach_aio_context(QIOChannel *ioc, + AioContext *ctx) +{ + AioContext *old_ctx; + if (ioc->ctx == ctx) { + return; + } + + old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context(); + qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL); + ioc->ctx = ctx; + qio_channel_set_aio_fd_handlers(ioc); +} + +void qio_channel_detach_aio_context(QIOChannel *ioc) +{ + ioc->read_coroutine = NULL; + ioc->write_coroutine = NULL; + qio_channel_set_aio_fd_handlers(ioc); + ioc->ctx = NULL; +} void coroutine_fn qio_channel_yield(QIOChannel *ioc, GIOCondition condition) { - QIOChannelYieldData data; - assert(qemu_in_coroutine()); - data.ioc = ioc; - data.co = qemu_coroutine_self(); - qio_channel_add_watch(ioc, - condition, - qio_channel_yield_enter, - &data, - NULL); + if (condition == G_IO_IN) { + assert(!ioc->read_coroutine); + ioc->read_coroutine = qemu_coroutine_self(); + } else if (condition == G_IO_OUT) { + assert(!ioc->write_coroutine); + ioc->write_coroutine = qemu_coroutine_self(); + } else { + abort(); + } + qio_channel_set_aio_fd_handlers(ioc); qemu_coroutine_yield(); } -- cgit v1.2.3-55-g7522 From a153bf52b37e148f052b0869600877130671a03d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:33 +0100 Subject: aio-posix: partially inline aio_dispatch into aio_poll This patch prepares for the removal of unnecessary lockcnt inc/dec pairs. Extract the dispatching loop for file descriptor handlers into a new function aio_dispatch_handlers, and then inline aio_dispatch into aio_poll. aio_dispatch can now become void. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Reviewed-by: Daniel P. 
Berrange Message-id: 20170213135235.12274-17-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/block/aio.h | 6 +----- util/aio-posix.c | 44 ++++++++++++++------------------------------ util/aio-win32.c | 13 ++++--------- util/async.c | 2 +- 4 files changed, 20 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/block/aio.h b/include/block/aio.h index 614cbc6982..677b6ffc25 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -310,12 +310,8 @@ bool aio_pending(AioContext *ctx); /* Dispatch any pending callbacks from the GSource attached to the AioContext. * * This is used internally in the implementation of the GSource. - * - * @dispatch_fds: true to process fds, false to skip them - * (can be used as an optimization by callers that know there - * are no fds ready) */ -bool aio_dispatch(AioContext *ctx, bool dispatch_fds); +void aio_dispatch(AioContext *ctx); /* Progress in completing AIO work to occur. This can issue new pending * aio as a result of executing I/O completion or bh callbacks. diff --git a/util/aio-posix.c b/util/aio-posix.c index 84cee4315d..2173378570 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -386,12 +386,6 @@ static bool aio_dispatch_handlers(AioContext *ctx) AioHandler *node, *tmp; bool progress = false; - /* - * We have to walk very carefully in case aio_set_fd_handler is - * called while we're walking. - */ - qemu_lockcnt_inc(&ctx->list_lock); - QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) { int revents; @@ -426,33 +420,18 @@ static bool aio_dispatch_handlers(AioContext *ctx) } } - qemu_lockcnt_dec(&ctx->list_lock); return progress; } -/* - * Note that dispatch_fds == false has the side-effect of post-poning the - * freeing of deleted handlers. - */ -bool aio_dispatch(AioContext *ctx, bool dispatch_fds) +void aio_dispatch(AioContext *ctx) { - bool progress; - - /* - * If there are callbacks left that have been queued, we need to call them. - * Do not call select in this case, because it is possible that the caller - * does not need a complete flush (as is the case for aio_poll loops). 
- */ - progress = aio_bh_poll(ctx); + aio_bh_poll(ctx); - if (dispatch_fds) { - progress |= aio_dispatch_handlers(ctx); - } - - /* Run our timers */ - progress |= timerlistgroup_run_timers(&ctx->tlg); + qemu_lockcnt_inc(&ctx->list_lock); + aio_dispatch_handlers(ctx); + qemu_lockcnt_dec(&ctx->list_lock); - return progress; + timerlistgroup_run_timers(&ctx->tlg); } /* These thread-local variables are used only in a small part of aio_poll @@ -702,11 +681,16 @@ bool aio_poll(AioContext *ctx, bool blocking) npfd = 0; qemu_lockcnt_dec(&ctx->list_lock); - /* Run dispatch even if there were no readable fds to run timers */ - if (aio_dispatch(ctx, ret > 0)) { - progress = true; + progress |= aio_bh_poll(ctx); + + if (ret > 0) { + qemu_lockcnt_inc(&ctx->list_lock); + progress |= aio_dispatch_handlers(ctx); + qemu_lockcnt_dec(&ctx->list_lock); } + progress |= timerlistgroup_run_timers(&ctx->tlg); + return progress; } diff --git a/util/aio-win32.c b/util/aio-win32.c index 20b63ce875..442a1792b7 100644 --- a/util/aio-win32.c +++ b/util/aio-win32.c @@ -309,16 +309,11 @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) return progress; } -bool aio_dispatch(AioContext *ctx, bool dispatch_fds) +void aio_dispatch(AioContext *ctx) { - bool progress; - - progress = aio_bh_poll(ctx); - if (dispatch_fds) { - progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE); - } - progress |= timerlistgroup_run_timers(&ctx->tlg); - return progress; + aio_bh_poll(ctx); + aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE); + timerlistgroup_run_timers(&ctx->tlg); } bool aio_poll(AioContext *ctx, bool blocking) diff --git a/util/async.c b/util/async.c index c54da7164e..187bc5bc4b 100644 --- a/util/async.c +++ b/util/async.c @@ -258,7 +258,7 @@ aio_ctx_dispatch(GSource *source, AioContext *ctx = (AioContext *) source; assert(callback == NULL); - aio_dispatch(ctx, true); + aio_dispatch(ctx); return true; } -- cgit v1.2.3-55-g7522 From 91bcea4899017891983b9149bd50cb283e78dfc0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:35 +0100 Subject: block: document fields protected by AioContext lock Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Reviewed-by: Daniel P. Berrange Message-id: 20170213135235.12274-19-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/block/block_int.h | 64 +++++++++++++++++++++++++----------------- include/sysemu/block-backend.h | 14 ++++++--- 2 files changed, 49 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/block/block_int.h b/include/block/block_int.h index 2d92d7edfe..1670941da9 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -430,8 +430,9 @@ struct BdrvChild { * copied as well. */ struct BlockDriverState { - int64_t total_sectors; /* if we are reading a disk image, give its - size in sectors */ + /* Protected by big QEMU lock or read-only after opening. No special + * locking needed during I/O... + */ int open_flags; /* flags used to open the file, re-used for re-open */ bool read_only; /* if true, the media is read only */ bool encrypted; /* if true, the media is encrypted */ @@ -439,14 +440,6 @@ struct BlockDriverState { bool sg; /* if true, the device is a /dev/sg* */ bool probed; /* if true, format was probed rather than specified */ - int copy_on_read; /* if nonzero, copy read backing sectors into image. - note this is a reference count */ - - CoQueue flush_queue; /* Serializing flush queue */ - bool active_flush_req; /* Flush request in flight? 
*/ - unsigned int write_gen; /* Current data generation */ - unsigned int flushed_gen; /* Flushed write generation */ - BlockDriver *drv; /* NULL means no media */ void *opaque; @@ -468,18 +461,6 @@ struct BlockDriverState { BdrvChild *backing; BdrvChild *file; - /* Callback before write request is processed */ - NotifierWithReturnList before_write_notifiers; - - /* number of in-flight requests; overall and serialising */ - unsigned int in_flight; - unsigned int serialising_in_flight; - - bool wakeup; - - /* Offset after the highest byte written to */ - uint64_t wr_highest_offset; - /* I/O Limits */ BlockLimits bl; @@ -497,11 +478,8 @@ struct BlockDriverState { QTAILQ_ENTRY(BlockDriverState) bs_list; /* element of the list of monitor-owned BDS */ QTAILQ_ENTRY(BlockDriverState) monitor_list; - QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; int refcnt; - QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; - /* operation blockers */ QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX]; @@ -522,6 +500,31 @@ struct BlockDriverState { /* The error object in use for blocking operations on backing_hd */ Error *backing_blocker; + /* Protected by AioContext lock */ + + /* If true, copy read backing sectors into image. Can be >1 if more + * than one client has requested copy-on-read. + */ + int copy_on_read; + + /* If we are reading a disk image, give its size in sectors. + * Generally read-only; it is written to by load_vmstate and save_vmstate, + * but the block layer is quiescent during those. + */ + int64_t total_sectors; + + /* Callback before write request is processed */ + NotifierWithReturnList before_write_notifiers; + + /* number of in-flight requests; overall and serialising */ + unsigned int in_flight; + unsigned int serialising_in_flight; + + bool wakeup; + + /* Offset after the highest byte written to */ + uint64_t wr_highest_offset; + /* threshold limit for writes, in bytes. "High water mark". */ uint64_t write_threshold_offset; NotifierWithReturn write_threshold_notifier; @@ -529,6 +532,17 @@ struct BlockDriverState { /* counter for nested bdrv_io_plug */ unsigned io_plugged; + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; + CoQueue flush_queue; /* Serializing flush queue */ + bool active_flush_req; /* Flush request in flight? */ + unsigned int write_gen; /* Current data generation */ + unsigned int flushed_gen; /* Flushed write generation */ + + QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; + + /* do we need to tell the guest if we have a volatile write cache? */ + int enable_write_cache; + int quiesce_counter; }; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 6444e41d39..f365a51acf 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -64,14 +64,20 @@ typedef struct BlockDevOps { * fields that must be public. This is in particular for QLIST_ENTRY() and * friends so that BlockBackends can be kept in lists outside block-backend.c */ typedef struct BlockBackendPublic { - /* I/O throttling. - * throttle_state tells us if this BlockBackend has I/O limits configured. - * io_limits_disabled tells us if they are currently being enforced */ + /* I/O throttling has its own locking, but also some fields are + * protected by the AioContext lock. + */ + + /* Protected by AioContext lock. */ CoQueue throttled_reqs[2]; + + /* Nonzero if the I/O limits are currently being ignored; generally + * it is zero. */ unsigned int io_limits_disabled; /* The following fields are protected by the ThrottleGroup lock.
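
As an illustration of the locking discipline these comments establish, code running outside a BlockDriverState's home AioContext would bracket access to the "Protected by AioContext lock" fields as in the following sketch (the helper name is hypothetical; bdrv_get_aio_context(), aio_context_acquire() and aio_context_release() are existing API):

    /* Hypothetical helper: sample an AioContext-protected field from
     * outside the BDS's home thread. */
    static unsigned int query_in_flight(BlockDriverState *bs)
    {
        AioContext *ctx = bdrv_get_aio_context(bs);
        unsigned int n;

        aio_context_acquire(ctx);   /* serialize with the owning iothread */
        n = bs->in_flight;
        aio_context_release(ctx);
        return n;
    }
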
- * See the ThrottleGroup documentation for details. */ + * See the ThrottleGroup documentation for details. + * throttle_state tells us if I/O limits are configured. */ ThrottleState *throttle_state; ThrottleTimers throttle_timers; unsigned pending_reqs[2]; -- cgit v1.2.3-55-g7522 From fed20a70e39bb9385020bdc4e8839d95326df8e2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:39 +0100 Subject: coroutine-lock: make CoMutex thread-safe This uses the lock-free mutex described in the paper '"Blocking without Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and Papatriantafilou. The same technique is used in OSv, and in fact the code is essentially a conversion to C of OSv's code. [Added missing coroutine_fn in tests/test-aio-multithread.c. --Stefan] Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-2-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/qemu/coroutine.h | 17 ++++- tests/test-aio-multithread.c | 86 ++++++++++++++++++++++++ util/qemu-coroutine-lock.c | 153 ++++++++++++++++++++++++++++++++++++++++--- util/trace-events | 1 + 4 files changed, 245 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 12584ed1b7..fce228f68a 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -160,10 +160,23 @@ bool qemu_co_queue_empty(CoQueue *queue); /** * Provides a mutex that can be used to synchronise coroutines */ +struct CoWaitRecord; typedef struct CoMutex { - bool locked; + /* Count of pending lockers; 0 for a free mutex, 1 for an + * uncontended mutex. + */ + unsigned locked; + + /* A queue of waiters. Elements are added atomically in front of + * from_push. to_pop is only populated, and popped from, by whoever + * is in charge of the next wakeup. This can be an unlocker or, + * through the handoff protocol, a locker that is about to go to sleep. + */ + QSLIST_HEAD(, CoWaitRecord) from_push, to_pop; + + unsigned handoff, sequence; + Coroutine *holder; - CoQueue queue; } CoMutex; /** diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c index 534807d45b..4fa2e9bb74 100644 --- a/tests/test-aio-multithread.c +++ b/tests/test-aio-multithread.c @@ -196,6 +196,88 @@ static void test_multi_co_schedule_10(void) test_multi_co_schedule(10); } +/* CoMutex thread-safety. */ + +static uint32_t atomic_counter; +static uint32_t running; +static uint32_t counter; +static CoMutex comutex; + +static void coroutine_fn test_multi_co_mutex_entry(void *opaque) +{ + while (!atomic_mb_read(&now_stopping)) { + qemu_co_mutex_lock(&comutex); + counter++; + qemu_co_mutex_unlock(&comutex); + + /* Increase atomic_counter *after* releasing the mutex. Otherwise + * there is a chance (it happens about 1 in 3 runs) that the iothread + * exits before the coroutine is woken up, causing a spurious + * assertion failure. 
+ */ + atomic_inc(&atomic_counter); + } + atomic_dec(&running); +} + +static void test_multi_co_mutex(int threads, int seconds) +{ + int i; + + qemu_co_mutex_init(&comutex); + counter = 0; + atomic_counter = 0; + now_stopping = false; + + create_aio_contexts(); + assert(threads <= NUM_CONTEXTS); + running = threads; + for (i = 0; i < threads; i++) { + Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL); + aio_co_schedule(ctx[i], co1); + } + + g_usleep(seconds * 1000000); + + atomic_mb_set(&now_stopping, true); + while (running > 0) { + g_usleep(100000); + } + + join_aio_contexts(); + g_test_message("%d iterations/second\n", counter / seconds); + g_assert_cmpint(counter, ==, atomic_counter); +} + +/* Testing with NUM_CONTEXTS threads focuses on the queue. The mutex however + * is too contended (and the threads spend too much time in aio_poll) + * to actually stress the handoff protocol. + */ +static void test_multi_co_mutex_1(void) +{ + test_multi_co_mutex(NUM_CONTEXTS, 1); +} + +static void test_multi_co_mutex_10(void) +{ + test_multi_co_mutex(NUM_CONTEXTS, 10); +} + +/* Testing with fewer threads stresses the handoff protocol too. Still, the + * case where the locker _can_ pick up a handoff is very rare, happening + * about 10 times in 1 million, so increase the runtime a bit compared to + * other "quick" testcases that only run for 1 second. + */ +static void test_multi_co_mutex_2_3(void) +{ + test_multi_co_mutex(2, 3); +} + +static void test_multi_co_mutex_2_30(void) +{ + test_multi_co_mutex(2, 30); +} + /* End of tests. */ int main(int argc, char **argv) @@ -206,8 +288,12 @@ int main(int argc, char **argv) g_test_add_func("/aio/multi/lifecycle", test_lifecycle); if (g_test_quick()) { g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1); + g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1); + g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3); } else { g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10); + g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10); + g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30); } return g_test_run(); } diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index e6afd1aa6c..25da9fa8d0 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -20,6 +20,10 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. + * + * The lock-free mutex implementation is based on OSv + * (core/lfmutex.cc, include/lockfree/mutex.hh). + * Copyright (C) 2013 Cloudius Systems, Ltd. */ #include "qemu/osdep.h" @@ -111,27 +115,119 @@ bool qemu_co_queue_empty(CoQueue *queue) return QSIMPLEQ_FIRST(&queue->entries) == NULL; } +/* The wait records are handled with a multiple-producer, single-consumer + * lock-free queue. There cannot be two concurrent pop_waiter() calls + * because pop_waiter() can only be called while mutex->handoff is zero. + * This can happen in three cases: + * - in qemu_co_mutex_unlock, before the hand-off protocol has started. + * In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and + * not take part in the handoff. + * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from + * qemu_co_mutex_unlock. In this case, qemu_co_mutex_unlock will fail + * the cmpxchg (it will see either 0 or the next sequence value) and + * exit. 
The next hand-off cannot begin until qemu_co_mutex_lock has + * woken up someone. + * - in qemu_co_mutex_unlock, if it takes the hand-off token itself. + * In this case another iteration starts with mutex->handoff == 0; + * a concurrent qemu_co_mutex_lock will fail the cmpxchg, and + * qemu_co_mutex_unlock will go back to case (1). + * + * The following functions manage this queue. + */ +typedef struct CoWaitRecord { + Coroutine *co; + QSLIST_ENTRY(CoWaitRecord) next; +} CoWaitRecord; + +static void push_waiter(CoMutex *mutex, CoWaitRecord *w) +{ + w->co = qemu_coroutine_self(); + QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next); +} + +static void move_waiters(CoMutex *mutex) +{ + QSLIST_HEAD(, CoWaitRecord) reversed; + QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push); + while (!QSLIST_EMPTY(&reversed)) { + CoWaitRecord *w = QSLIST_FIRST(&reversed); + QSLIST_REMOVE_HEAD(&reversed, next); + QSLIST_INSERT_HEAD(&mutex->to_pop, w, next); + } +} + +static CoWaitRecord *pop_waiter(CoMutex *mutex) +{ + CoWaitRecord *w; + + if (QSLIST_EMPTY(&mutex->to_pop)) { + move_waiters(mutex); + if (QSLIST_EMPTY(&mutex->to_pop)) { + return NULL; + } + } + w = QSLIST_FIRST(&mutex->to_pop); + QSLIST_REMOVE_HEAD(&mutex->to_pop, next); + return w; +} + +static bool has_waiters(CoMutex *mutex) +{ + return !QSLIST_EMPTY(&mutex->to_pop) || !QSLIST_EMPTY(&mutex->from_push); +} + void qemu_co_mutex_init(CoMutex *mutex) { memset(mutex, 0, sizeof(*mutex)); - qemu_co_queue_init(&mutex->queue); } -void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) +static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) { Coroutine *self = qemu_coroutine_self(); + CoWaitRecord w; + unsigned old_handoff; trace_qemu_co_mutex_lock_entry(mutex, self); + w.co = self; + push_waiter(mutex, &w); + + /* This is the "Responsibility Hand-Off" protocol; a lock() picks from + * a concurrent unlock() the responsibility of waking somebody up. + */ + old_handoff = atomic_mb_read(&mutex->handoff); + if (old_handoff && + has_waiters(mutex) && + atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) { + /* There can be no concurrent pops, because there can be only + * one active handoff at a time. + */ + CoWaitRecord *to_wake = pop_waiter(mutex); + Coroutine *co = to_wake->co; + if (co == self) { + /* We got the lock ourselves! */ + assert(to_wake == &w); + return; + } - while (mutex->locked) { - qemu_co_queue_wait(&mutex->queue); + aio_co_wake(co); } - mutex->locked = true; + qemu_coroutine_yield(); + trace_qemu_co_mutex_lock_return(mutex, self); +} + +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) +{ + Coroutine *self = qemu_coroutine_self(); + + if (atomic_fetch_inc(&mutex->locked) == 0) { + /* Uncontended. */ + trace_qemu_co_mutex_lock_uncontended(mutex, self); + } else { + qemu_co_mutex_lock_slowpath(mutex); + } mutex->holder = self; self->locks_held++; - - trace_qemu_co_mutex_lock_return(mutex, self); } void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) @@ -140,14 +236,51 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) trace_qemu_co_mutex_unlock_entry(mutex, self); - assert(mutex->locked == true); + assert(mutex->locked); assert(mutex->holder == self); assert(qemu_in_coroutine()); - mutex->locked = false; mutex->holder = NULL; self->locks_held--; - qemu_co_queue_next(&mutex->queue); + if (atomic_fetch_dec(&mutex->locked) == 1) { + /* No waiting qemu_co_mutex_lock(). Pfew, that was easy!
*/ + return; + } + + for (;;) { + CoWaitRecord *to_wake = pop_waiter(mutex); + unsigned our_handoff; + + if (to_wake) { + Coroutine *co = to_wake->co; + aio_co_wake(co); + break; + } + + /* Some concurrent lock() is in progress (we know this because + * mutex->locked was >1) but it hasn't yet put itself on the wait + * queue. Pick a sequence number for the handoff protocol (not 0). + */ + if (++mutex->sequence == 0) { + mutex->sequence = 1; + } + + our_handoff = mutex->sequence; + atomic_mb_set(&mutex->handoff, our_handoff); + if (!has_waiters(mutex)) { + /* The concurrent lock has not added itself yet, so it + * will be able to pick our handoff. + */ + break; + } + + /* Try to do the handoff protocol ourselves; if somebody else has + * already taken it, however, we're done and they're responsible. + */ + if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) { + break; + } + } trace_qemu_co_mutex_unlock_return(mutex, self); } diff --git a/util/trace-events b/util/trace-events index 65c978715a..ac27d94a97 100644 --- a/util/trace-events +++ b/util/trace-events @@ -28,6 +28,7 @@ qemu_coroutine_terminate(void *co) "self %p" # util/qemu-coroutine-lock.c qemu_co_queue_run_restart(void *co) "co %p" +qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p" qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p" qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p" qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p" -- cgit v1.2.3-55-g7522 From 480cff632221dc4d4889bf72dd0f09cd35096bc1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:40 +0100 Subject: coroutine-lock: add limited spinning to CoMutex Running a very small critical section on pthread_mutex_t and CoMutex shows that pthread_mutex_t is much faster because it doesn't actually go to sleep. What happens is that the critical section is shorter than the latency of entering the kernel and thus FUTEX_WAIT always fails. With CoMutex there is no such latency but you still want to avoid wait and wakeup. So introduce it artificially. This only works with a single waiter; because CoMutex is fair, it will always have more waits and wakeups than a pthread_mutex_t. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-3-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/qemu/coroutine.h | 5 +++++ util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------ util/qemu-coroutine.c | 2 +- 3 files changed, 51 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index fce228f68a..12ce8e109e 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -167,6 +167,11 @@ typedef struct CoMutex { */ unsigned locked; + /* Context that is holding the lock. Useful to avoid spinning + * when two coroutines on the same AioContext try to get the lock. + */ + AioContext *ctx; + /* A queue of waiters. Elements are added atomically in front of * from_push. to_pop is only populated, and popped from, by whoever * is in charge of the next wakeup.
This can be an unlocker or, diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 25da9fa8d0..73fe77cc80 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -30,6 +30,7 @@ #include "qemu-common.h" #include "qemu/coroutine.h" #include "qemu/coroutine_int.h" +#include "qemu/processor.h" #include "qemu/queue.h" #include "block/aio.h" #include "trace.h" @@ -181,7 +182,18 @@ void qemu_co_mutex_init(CoMutex *mutex) memset(mutex, 0, sizeof(*mutex)); } -static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) +static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co) +{ + /* Read co before co->ctx; pairs with smp_wmb() in + * qemu_coroutine_enter(). + */ + smp_read_barrier_depends(); + mutex->ctx = co->ctx; + aio_co_wake(co); +} + +static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx, + CoMutex *mutex) { Coroutine *self = qemu_coroutine_self(); CoWaitRecord w; @@ -206,10 +218,11 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) if (co == self) { /* We got the lock ourselves! */ assert(to_wake == &w); + mutex->ctx = ctx; return; } - aio_co_wake(co); + qemu_co_mutex_wake(mutex, co); } qemu_coroutine_yield(); @@ -218,13 +231,39 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) { + AioContext *ctx = qemu_get_current_aio_context(); Coroutine *self = qemu_coroutine_self(); + int waiters, i; + + /* Running a very small critical section on pthread_mutex_t and CoMutex + * shows that pthread_mutex_t is much faster because it doesn't actually + * go to sleep. What happens is that the critical section is shorter + * than the latency of entering the kernel and thus FUTEX_WAIT always + * fails. With CoMutex there is no such latency but you still want to + * avoid wait and wakeup. So introduce it artificially. + */ + i = 0; +retry_fast_path: + waiters = atomic_cmpxchg(&mutex->locked, 0, 1); + if (waiters != 0) { + while (waiters == 1 && ++i < 1000) { + if (atomic_read(&mutex->ctx) == ctx) { + break; + } + if (atomic_read(&mutex->locked) == 0) { + goto retry_fast_path; + } + cpu_relax(); + } + waiters = atomic_fetch_inc(&mutex->locked); + } - if (atomic_fetch_inc(&mutex->locked) == 0) { + if (waiters == 0) { /* Uncontended. */ trace_qemu_co_mutex_lock_uncontended(mutex, self); + mutex->ctx = ctx; } else { - qemu_co_mutex_lock_slowpath(mutex); + qemu_co_mutex_lock_slowpath(ctx, mutex); } mutex->holder = self; self->locks_held++; @@ -240,6 +279,7 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) assert(mutex->holder == self); assert(qemu_in_coroutine()); + mutex->ctx = NULL; mutex->holder = NULL; self->locks_held--; if (atomic_fetch_dec(&mutex->locked) == 1) { @@ -252,8 +292,7 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) unsigned our_handoff; if (to_wake) { - Coroutine *co = to_wake->co; - aio_co_wake(co); + qemu_co_mutex_wake(mutex, to_wake->co); break; } diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 415600dc30..72412e5649 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -118,7 +118,7 @@ void qemu_coroutine_enter(Coroutine *co) co->ctx = qemu_get_current_aio_context(); /* Store co->ctx before anything that stores co. Matches - * barrier in aio_co_wake. + * barrier in aio_co_wake and qemu_co_mutex_wake. 
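
Taken together with the thread-safe slow path from the previous patch, CoMutex now behaves as an ordinary mutex for coroutines that may run on different AioContexts. A minimal usage sketch (worker() and the counter are illustrative names, not part of the patch):

    /* Illustrative only; real callers initialize the mutex once with
     * qemu_co_mutex_init() and run worker() in several coroutines that
     * may live on different AioContexts. */
    static CoMutex comutex;
    static long shared_count;

    static void coroutine_fn worker(void *opaque)
    {
        qemu_co_mutex_lock(&comutex);   /* fast path, brief spin, or yield;
                                         * never blocks the thread */
        shared_count++;
        qemu_co_mutex_unlock(&comutex);
    }
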
*/ smp_wmb(); -- cgit v1.2.3-55-g7522 From f8c6e1cbc3d397207bedabdb2932fd6e1d7484df Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:42 +0100 Subject: coroutine-lock: place CoMutex before CoQueue in header This will avoid forward references in the next patch. It is also more logical because CoQueue is no longer the basic primitive. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-5-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 12ce8e109e..9f685794d2 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -112,51 +112,6 @@ bool qemu_in_coroutine(void); */ bool qemu_coroutine_entered(Coroutine *co); - -/** - * CoQueues are a mechanism to queue coroutines in order to continue executing - * them later. They provide the fundamental primitives on which coroutine locks - * are built. - */ -typedef struct CoQueue { - QSIMPLEQ_HEAD(, Coroutine) entries; -} CoQueue; - -/** - * Initialise a CoQueue. This must be called before any other operation is used - * on the CoQueue. - */ -void qemu_co_queue_init(CoQueue *queue); - -/** - * Adds the current coroutine to the CoQueue and transfers control to the - * caller of the coroutine. - */ -void coroutine_fn qemu_co_queue_wait(CoQueue *queue); - -/** - * Restarts the next coroutine in the CoQueue and removes it from the queue. - * - * Returns true if a coroutine was restarted, false if the queue is empty. - */ -bool coroutine_fn qemu_co_queue_next(CoQueue *queue); - -/** - * Restarts all coroutines in the CoQueue and leaves the queue empty. - */ -void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue); - -/** - * Enter the next coroutine in the queue - */ -bool qemu_co_enter_next(CoQueue *queue); - -/** - * Checks if the CoQueue is empty. - */ -bool qemu_co_queue_empty(CoQueue *queue); - - /** * Provides a mutex that can be used to synchronise coroutines */ @@ -202,6 +157,50 @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex); */ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); + +/** + * CoQueues are a mechanism to queue coroutines in order to continue executing + * them later. + */ +typedef struct CoQueue { + QSIMPLEQ_HEAD(, Coroutine) entries; +} CoQueue; + +/** + * Initialise a CoQueue. This must be called before any other operation is used + * on the CoQueue. + */ +void qemu_co_queue_init(CoQueue *queue); + +/** + * Adds the current coroutine to the CoQueue and transfers control to the + * caller of the coroutine. + */ +void coroutine_fn qemu_co_queue_wait(CoQueue *queue); + +/** + * Restarts the next coroutine in the CoQueue and removes it from the queue. + * + * Returns true if a coroutine was restarted, false if the queue is empty. + */ +bool coroutine_fn qemu_co_queue_next(CoQueue *queue); + +/** + * Restarts all coroutines in the CoQueue and leaves the queue empty. + */ +void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue); + +/** + * Enter the next coroutine in the queue + */ +bool qemu_co_enter_next(CoQueue *queue); + +/** + * Checks if the CoQueue is empty.
+ */ +bool qemu_co_queue_empty(CoQueue *queue); + + typedef struct CoRwlock { bool writer; int reader; -- cgit v1.2.3-55-g7522 From 1ace7ceac507d90d50ecb2e13f7222beadb64d92 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:43 +0100 Subject: coroutine-lock: add mutex argument to CoQueue APIs All that CoQueue needs in order to become thread-safe is help from an external mutex. Add this to the API. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-6-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- block/backup.c | 2 +- block/io.c | 4 ++-- block/nbd-client.c | 2 +- block/qcow2-cluster.c | 4 +--- block/sheepdog.c | 2 +- block/throttle-groups.c | 2 +- hw/9pfs/9p.c | 2 +- include/qemu/coroutine.h | 8 +++++--- util/qemu-coroutine-lock.c | 24 +++++++++++++++++++++--- 9 files changed, 34 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/block/backup.c b/block/backup.c index ea38733849..fe010e78e3 100644 --- a/block/backup.c +++ b/block/backup.c @@ -64,7 +64,7 @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, retry = false; QLIST_FOREACH(req, &job->inflight_reqs, list) { if (end > req->start && start < req->end) { - qemu_co_queue_wait(&req->wait_queue); + qemu_co_queue_wait(&req->wait_queue, NULL); retry = true; break; } diff --git a/block/io.c b/block/io.c index a5c7d36d8c..d5c45447fd 100644 --- a/block/io.c +++ b/block/io.c @@ -539,7 +539,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) * (instead of producing a deadlock in the former case). */ if (!req->waiting_for) { self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue); + qemu_co_queue_wait(&req->wait_queue, NULL); self->waiting_for = NULL; retry = true; waited = true; @@ -2275,7 +2275,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) /* Wait until any previous flushes are completed */ while (bs->active_flush_req) { - qemu_co_queue_wait(&bs->flush_queue); + qemu_co_queue_wait(&bs->flush_queue, NULL); } bs->active_flush_req = true; diff --git a/block/nbd-client.c b/block/nbd-client.c index 10fcc9e81d..0dc12c2d67 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -182,7 +182,7 @@ static void nbd_coroutine_start(NBDClientSession *s, /* Poor man semaphore. The free_sema is locked when no other request * can be accepted, and unlocked after receiving one reply. */ if (s->in_flight == MAX_NBD_REQUESTS) { - qemu_co_queue_wait(&s->free_sema); + qemu_co_queue_wait(&s->free_sema, NULL); assert(s->in_flight < MAX_NBD_REQUESTS); } s->in_flight++; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 928c1e298d..78c11d4948 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -932,9 +932,7 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, if (bytes == 0) { /* Wait for the dependency to complete. We need to recheck * the free/allocated clusters when we continue. 
*/ - qemu_co_mutex_unlock(&s->lock); - qemu_co_queue_wait(&old_alloc->dependent_requests); - qemu_co_mutex_lock(&s->lock); + qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); return -EAGAIN; } } diff --git a/block/sheepdog.c b/block/sheepdog.c index 32c4e4c507..860ba61502 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -486,7 +486,7 @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb) retry: QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { if (AIOCBOverlapping(acb, cb)) { - qemu_co_queue_wait(&s->overlapping_queue); + qemu_co_queue_wait(&s->overlapping_queue, NULL); goto retry; } } diff --git a/block/throttle-groups.c b/block/throttle-groups.c index aade5def39..b73e7a800b 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -326,7 +326,7 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, if (must_wait || blkp->pending_reqs[is_write]) { blkp->pending_reqs[is_write]++; qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&blkp->throttled_reqs[is_write]); + qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL); qemu_mutex_lock(&tg->lock); blkp->pending_reqs[is_write]--; } diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 99e94723b9..3af1c93dc8 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -2374,7 +2374,7 @@ static void coroutine_fn v9fs_flush(void *opaque) /* * Wait for pdu to complete. */ - qemu_co_queue_wait(&cancel_pdu->complete); + qemu_co_queue_wait(&cancel_pdu->complete, NULL); cancel_pdu->cancelled = 0; pdu_free(cancel_pdu); } diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 9f685794d2..d2de268981 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -160,7 +160,8 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); /** * CoQueues are a mechanism to queue coroutines in order to continue executing - * them later. + * them later. They are similar to condition variables, but they need help + * from an external mutex in order to maintain thread-safety. */ typedef struct CoQueue { QSIMPLEQ_HEAD(, Coroutine) entries; @@ -174,9 +175,10 @@ void qemu_co_queue_init(CoQueue *queue); /** * Adds the current coroutine to the CoQueue and transfers control to the - * caller of the coroutine. + * caller of the coroutine. The mutex is unlocked during the wait and + * locked again afterwards. */ -void coroutine_fn qemu_co_queue_wait(CoQueue *queue); +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex); /** * Restarts the next coroutine in the CoQueue and removes it from the queue. diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 73fe77cc80..b0a554f40d 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -40,12 +40,30 @@ void qemu_co_queue_init(CoQueue *queue) QSIMPLEQ_INIT(&queue->entries); } -void coroutine_fn qemu_co_queue_wait(CoQueue *queue) +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex) { Coroutine *self = qemu_coroutine_self(); QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next); + + if (mutex) { + qemu_co_mutex_unlock(mutex); + } + + /* There is no race condition here. Other threads will call + * aio_co_schedule on our AioContext, which can reenter this + * coroutine but only after this yield and after the main loop + * has gone through the next iteration. 
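
With the new mutex argument, qemu_co_queue_wait() follows the classic condition-variable pattern. A hedged sketch of the intended usage (the function names and the resources counter are illustrative; the CoMutex and CoQueue would be initialized once with qemu_co_mutex_init() and qemu_co_queue_init()):

    static CoMutex lock;
    static CoQueue queue;
    static int resources;   /* the predicate the queue protects */

    static void coroutine_fn consume_one(void *opaque)
    {
        qemu_co_mutex_lock(&lock);
        while (resources == 0) {
            /* Drops 'lock' while sleeping, reacquires it on wakeup. */
            qemu_co_queue_wait(&queue, &lock);
        }
        resources--;
        qemu_co_mutex_unlock(&lock);
    }

    static void coroutine_fn produce_one(void *opaque)
    {
        qemu_co_mutex_lock(&lock);
        resources++;
        qemu_co_queue_next(&queue);    /* wake one waiter, if any */
        qemu_co_mutex_unlock(&lock);
    }
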
+ */ qemu_coroutine_yield(); assert(qemu_in_coroutine()); + + /* TODO: OSv implements wait morphing here, where the wakeup + * primitive automatically places the woken coroutine on the + * mutex's queue. This avoids the thundering herd effect. + */ + if (mutex) { + qemu_co_mutex_lock(mutex); + } } /** @@ -335,7 +353,7 @@ void qemu_co_rwlock_rdlock(CoRwlock *lock) Coroutine *self = qemu_coroutine_self(); while (lock->writer) { - qemu_co_queue_wait(&lock->queue); + qemu_co_queue_wait(&lock->queue, NULL); } lock->reader++; self->locks_held++; @@ -365,7 +383,7 @@ void qemu_co_rwlock_wrlock(CoRwlock *lock) Coroutine *self = qemu_coroutine_self(); while (lock->writer || lock->reader) { - qemu_co_queue_wait(&lock->queue); + qemu_co_queue_wait(&lock->queue, NULL); } lock->writer = true; self->locks_held++; -- cgit v1.2.3-55-g7522 From a7b91d35bab97a2d3e779d0c64c9b837b52a6cf7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:44 +0100 Subject: coroutine-lock: make CoRwlock thread-safe and fair This adds a CoMutex around the existing CoQueue. Because the write-side can just take CoMutex, the old "writer" field is not necessary anymore. Instead of removing it altogether, count the number of pending writers during a read-side critical section and forbid further readers from entering. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-7-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/qemu/coroutine.h | 3 ++- util/qemu-coroutine-lock.c | 35 ++++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index d2de268981..e60beaff81 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -204,8 +204,9 @@ bool qemu_co_queue_empty(CoQueue *queue); typedef struct CoRwlock { - bool writer; + int pending_writer; int reader; + CoMutex mutex; CoQueue queue; } CoRwlock; diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index b0a554f40d..6328eed26b 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -346,16 +346,22 @@ void qemu_co_rwlock_init(CoRwlock *lock) { memset(lock, 0, sizeof(*lock)); qemu_co_queue_init(&lock->queue); + qemu_co_mutex_init(&lock->mutex); } void qemu_co_rwlock_rdlock(CoRwlock *lock) { Coroutine *self = qemu_coroutine_self(); - while (lock->writer) { - qemu_co_queue_wait(&lock->queue, NULL); + qemu_co_mutex_lock(&lock->mutex); + /* For fairness, wait if a writer is in line. */ + while (lock->pending_writer) { + qemu_co_queue_wait(&lock->queue, &lock->mutex); } lock->reader++; + qemu_co_mutex_unlock(&lock->mutex); + + /* The rest of the read-side critical section is run without the mutex. */ self->locks_held++; } @@ -364,10 +370,13 @@ void qemu_co_rwlock_unlock(CoRwlock *lock) Coroutine *self = qemu_coroutine_self(); assert(qemu_in_coroutine()); - if (lock->writer) { - lock->writer = false; + if (!lock->reader) { + /* The critical section started in qemu_co_rwlock_wrlock. 
*/ qemu_co_queue_restart_all(&lock->queue); } else { + self->locks_held--; + + qemu_co_mutex_lock(&lock->mutex); lock->reader--; assert(lock->reader >= 0); /* Wakeup only one waiting writer */ @@ -375,16 +384,20 @@ void qemu_co_rwlock_unlock(CoRwlock *lock) qemu_co_queue_next(&lock->queue); } } - self->locks_held--; + qemu_co_mutex_unlock(&lock->mutex); } void qemu_co_rwlock_wrlock(CoRwlock *lock) { - Coroutine *self = qemu_coroutine_self(); - - while (lock->writer || lock->reader) { - qemu_co_queue_wait(&lock->queue, NULL); + qemu_co_mutex_lock(&lock->mutex); + lock->pending_writer++; + while (lock->reader) { + qemu_co_queue_wait(&lock->queue, &lock->mutex); } - lock->writer = true; - self->locks_held++; + lock->pending_writer--; + + /* The rest of the write-side critical section is run with + * the mutex taken, so that lock->reader remains zero. + * There is no need to update self->locks_held. + */ } -- cgit v1.2.3-55-g7522
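
To close, a sketch of how the resulting CoRwlock is meant to be used from coroutine context (reader()/writer() are illustrative names; the lock would be initialized once with qemu_co_rwlock_init()):

    static CoRwlock rwlock;

    static void coroutine_fn reader(void *opaque)
    {
        qemu_co_rwlock_rdlock(&rwlock);
        /* Read-side critical section: runs without rwlock.mutex held,
         * so many readers can proceed in parallel. */
        qemu_co_rwlock_unlock(&rwlock);
    }

    static void coroutine_fn writer(void *opaque)
    {
        qemu_co_rwlock_wrlock(&rwlock);
        /* Write-side critical section: rwlock.mutex is held throughout,
         * keeping lock->reader at zero until unlock. */
        qemu_co_rwlock_unlock(&rwlock);
    }
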