From 0c330a734b51c177ab8488932ac3b0c4d63a718a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:19 +0100 Subject: aio: introduce aio_co_schedule and aio_co_wake aio_co_wake provides the infrastructure to start a coroutine on a "home" AioContext. It will be used by CoMutex and CoQueue, so that coroutines don't jump from one context to another when they go to sleep on a mutex or waitqueue. However, it can also be used as a more efficient alternative to one-shot bottom halves, and saves the effort of tracking which AioContext a coroutine is running on. aio_co_schedule is the part of aio_co_wake that starts a coroutine on a remote AioContext, but it is also useful to implement e.g. bdrv_set_aio_context callbacks. The implementation of aio_co_schedule is based on a lock-free multiple-producer, single-consumer queue. The multiple producers use cmpxchg to add to a LIFO stack. The consumer (a per-AioContext bottom half) grabs all items added so far, inverts the list to make it FIFO, and goes through it one item at a time until it's empty. The data structure was inspired by OSv, which uses it in the very code we'll "port" to QEMU for the thread-safe CoMutex. Most of the new code is really tests. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213135235.12274-3-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/block/aio.h | 32 ++++++++++++++++++++++++++++++++ include/qemu/coroutine_int.h | 11 ++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/block/aio.h b/include/block/aio.h index 7df271d2b9..614cbc6982 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -47,6 +47,7 @@ typedef void QEMUBHFunc(void *opaque); typedef bool AioPollFn(void *opaque); typedef void IOHandler(void *opaque); +struct Coroutine; struct ThreadPool; struct LinuxAioState; @@ -108,6 +109,9 @@ struct AioContext { bool notified; EventNotifier notifier; + QSLIST_HEAD(, Coroutine) scheduled_coroutines; + QEMUBH *co_schedule_bh; + /* Thread pool for performing work and receiving completion callbacks. * Has its own locking. */ @@ -482,6 +486,34 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external) return !is_external || !atomic_read(&ctx->external_disable_cnt); } +/** + * aio_co_schedule: + * @ctx: the aio context + * @co: the coroutine + * + * Start a coroutine on a remote AioContext. + * + * The coroutine must not be entered by anyone else while aio_co_schedule() + * is active. In addition the coroutine must have yielded unless ctx + * is the context in which the coroutine is running (i.e. the value of + * qemu_get_current_aio_context() from the coroutine itself). + */ +void aio_co_schedule(AioContext *ctx, struct Coroutine *co); + +/** + * aio_co_wake: + * @co: the coroutine + * + * Restart a coroutine on the AioContext where it was running last, thus + * preventing coroutines from jumping from one context to another when they + * go to sleep. + * + * aio_co_wake may be executed either in coroutine or non-coroutine + * context. The coroutine must not be entered by anyone else while + * aio_co_wake() is active. + */ +void aio_co_wake(struct Coroutine *co); + /** * Return the AioContext whose event loop runs in the current thread.
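
The queue described in the commit message above can be sketched in isolation. The following is a simplified illustration of the pattern only, not QEMU's implementation (which stores Coroutines on a QSLIST); the Node type is a stand-in, and the atomic_* helpers are assumed to come from qemu/atomic.h:

    /* Illustration only: lock-free multiple-producer, single-consumer
     * stack. Producers push with cmpxchg; the single consumer detaches
     * the whole stack at once and reverses it into FIFO order. */
    typedef struct Node { struct Node *next; } Node;

    static Node *head;                      /* shared LIFO stack */

    static void push(Node *n)               /* many producers */
    {
        Node *old;
        do {
            old = atomic_read(&head);
            n->next = old;
        } while (atomic_cmpxchg(&head, old, n) != old);
    }

    static Node *pop_all_fifo(void)         /* single consumer (the BH) */
    {
        Node *lifo = atomic_xchg(&head, NULL);  /* grab everything */
        Node *fifo = NULL;

        while (lifo) {                      /* reverse LIFO into FIFO */
            Node *next = lifo->next;
            lifo->next = fifo;
            fifo = lifo;
            lifo = next;
        }
        return fifo;
    }

In these terms, aio_co_schedule() plays the producer role and can therefore be called from any thread, while the per-AioContext bottom half is the single consumer that enters the scheduled coroutines in FIFO order.
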
* diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h index 14d4f1d1f2..cb98892bba 100644 --- a/include/qemu/coroutine_int.h +++ b/include/qemu/coroutine_int.h @@ -40,12 +40,21 @@ struct Coroutine { CoroutineEntry *entry; void *entry_arg; Coroutine *caller; + + /* Only used when the coroutine has terminated. */ QSLIST_ENTRY(Coroutine) pool_next; + size_t locks_held; - /* Coroutines that should be woken up when we yield or terminate */ + /* Coroutines that should be woken up when we yield or terminate. + * Only used when the coroutine is running. + */ QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup; + + /* Only used when the coroutine has yielded. */ + AioContext *ctx; QSIMPLEQ_ENTRY(Coroutine) co_queue_next; + QSLIST_ENTRY(Coroutine) co_scheduled_next; }; Coroutine *qemu_coroutine_new(void); -- cgit v1.2.3-55-g7522 From bf88c1247f80ac6d62710d5d0d0d9ce3a53e99ec Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:22 +0100 Subject: io: add methods to set I/O handlers on AioContext This is in preparation for making qio_channel_yield work on AioContexts other than the main one. Reviewed-by: Daniel P. Berrange Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213135235.12274-6-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/io/channel.h | 25 +++++++++++++++++++++++++ io/channel-command.c | 13 +++++++++++++ io/channel-file.c | 11 +++++++++++ io/channel-socket.c | 16 +++++++++++----- io/channel-tls.c | 12 ++++++++++++ io/channel-watch.c | 6 ++++++ io/channel.c | 11 +++++++++++ 7 files changed, 89 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/io/channel.h b/include/io/channel.h index 32a9470794..0bc7c3f47f 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -23,6 +23,7 @@ #include "qemu-common.h" #include "qom/object.h" +#include "block/aio.h" #define TYPE_QIO_CHANNEL "qio-channel" #define QIO_CHANNEL(obj) \ @@ -132,6 +133,11 @@ struct QIOChannelClass { off_t offset, int whence, Error **errp); + void (*io_set_aio_fd_handler)(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque); }; /* General I/O handling functions */ @@ -525,4 +531,23 @@ void qio_channel_yield(QIOChannel *ioc, void qio_channel_wait(QIOChannel *ioc, GIOCondition condition); +/** + * qio_channel_set_aio_fd_handler: + * @ioc: the channel object + * @ctx: the AioContext to set the handlers on + * @io_read: the read handler + * @io_write: the write handler + * @opaque: the opaque value passed to the handler + * + * This is used internally by qio_channel_yield(). It can + * be used by channel implementations to forward the handlers + * to another channel (e.g. from #QIOChannelTLS to the + * underlying socket). 
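
For orientation, the consumer that these per-channel handlers ultimately serve is qio_channel_yield(), whose declaration appears in the hunk above. A coroutine typically uses it roughly as in this hedged sketch: read_some() is a made-up helper, while qio_channel_read(), QIO_CHANNEL_ERR_BLOCK and G_IO_IN are existing parts of the API:

    /* Sketch: read from a channel inside a coroutine, yielding instead
     * of blocking the thread when no data is available. */
    static ssize_t coroutine_fn read_some(QIOChannel *ioc,
                                          char *buf, size_t len,
                                          Error **errp)
    {
        for (;;) {
            ssize_t n = qio_channel_read(ioc, buf, len, errp);
            if (n == QIO_CHANNEL_ERR_BLOCK) {
                /* Suspend until the channel is readable; after the
                 * following patches this registers handlers on the
                 * channel's AioContext. */
                qio_channel_yield(ioc, G_IO_IN);
                continue;
            }
            return n;   /* >= 0 on success or EOF, negative on error */
        }
    }
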
+ */ +void qio_channel_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque); + #endif /* QIO_CHANNEL_H */ diff --git a/io/channel-command.c b/io/channel-command.c index ad25313be1..319c5ed50c 100644 --- a/io/channel-command.c +++ b/io/channel-command.c @@ -328,6 +328,18 @@ static int qio_channel_command_close(QIOChannel *ioc, } +static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc); + aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque); + aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque); +} + + static GSource *qio_channel_command_create_watch(QIOChannel *ioc, GIOCondition condition) { @@ -349,6 +361,7 @@ static void qio_channel_command_class_init(ObjectClass *klass, ioc_klass->io_set_blocking = qio_channel_command_set_blocking; ioc_klass->io_close = qio_channel_command_close; ioc_klass->io_create_watch = qio_channel_command_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler; } static const TypeInfo qio_channel_command_info = { diff --git a/io/channel-file.c b/io/channel-file.c index e1da2435e6..b383273201 100644 --- a/io/channel-file.c +++ b/io/channel-file.c @@ -186,6 +186,16 @@ static int qio_channel_file_close(QIOChannel *ioc, } +static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc); + aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque); +} + static GSource *qio_channel_file_create_watch(QIOChannel *ioc, GIOCondition condition) { @@ -206,6 +216,7 @@ static void qio_channel_file_class_init(ObjectClass *klass, ioc_klass->io_seek = qio_channel_file_seek; ioc_klass->io_close = qio_channel_file_close; ioc_klass->io_create_watch = qio_channel_file_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler; } static const TypeInfo qio_channel_file_info = { diff --git a/io/channel-socket.c b/io/channel-socket.c index f385233f18..f546c6830e 100644 --- a/io/channel-socket.c +++ b/io/channel-socket.c @@ -649,11 +649,6 @@ qio_channel_socket_set_blocking(QIOChannel *ioc, qemu_set_block(sioc->fd); } else { qemu_set_nonblock(sioc->fd); -#ifdef WIN32 - WSAEventSelect(sioc->fd, ioc->event, - FD_READ | FD_ACCEPT | FD_CLOSE | - FD_CONNECT | FD_WRITE | FD_OOB); -#endif } return 0; } @@ -733,6 +728,16 @@ qio_channel_socket_shutdown(QIOChannel *ioc, return 0; } +static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc); + aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque); +} + static GSource *qio_channel_socket_create_watch(QIOChannel *ioc, GIOCondition condition) { @@ -755,6 +760,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass, ioc_klass->io_set_cork = qio_channel_socket_set_cork; ioc_klass->io_set_delay = qio_channel_socket_set_delay; ioc_klass->io_create_watch = qio_channel_socket_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler; } static const TypeInfo qio_channel_socket_info = { diff --git a/io/channel-tls.c b/io/channel-tls.c index f25ab0ae53..6182702dab 100644 --- a/io/channel-tls.c +++ 
b/io/channel-tls.c @@ -345,6 +345,17 @@ static int qio_channel_tls_close(QIOChannel *ioc, return qio_channel_close(tioc->master, errp); } +static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc); + + qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque); +} + static GSource *qio_channel_tls_create_watch(QIOChannel *ioc, GIOCondition condition) { @@ -372,6 +383,7 @@ static void qio_channel_tls_class_init(ObjectClass *klass, ioc_klass->io_close = qio_channel_tls_close; ioc_klass->io_shutdown = qio_channel_tls_shutdown; ioc_klass->io_create_watch = qio_channel_tls_create_watch; + ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler; } static const TypeInfo qio_channel_tls_info = { diff --git a/io/channel-watch.c b/io/channel-watch.c index cf1cdff896..8640d1c464 100644 --- a/io/channel-watch.c +++ b/io/channel-watch.c @@ -285,6 +285,12 @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc, GSource *source; QIOChannelSocketSource *ssource; +#ifdef WIN32 + WSAEventSelect(socket, ioc->event, + FD_READ | FD_ACCEPT | FD_CLOSE | + FD_CONNECT | FD_WRITE | FD_OOB); +#endif + source = g_source_new(&qio_channel_socket_source_funcs, sizeof(QIOChannelSocketSource)); ssource = (QIOChannelSocketSource *)source; diff --git a/io/channel.c b/io/channel.c index 80924c1772..ce470d7c5d 100644 --- a/io/channel.c +++ b/io/channel.c @@ -154,6 +154,17 @@ GSource *qio_channel_create_watch(QIOChannel *ioc, } +void qio_channel_set_aio_fd_handler(QIOChannel *ioc, + AioContext *ctx, + IOHandler *io_read, + IOHandler *io_write, + void *opaque) +{ + QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc); + + klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque); +} + guint qio_channel_add_watch(QIOChannel *ioc, GIOCondition condition, QIOChannelFunc func, -- cgit v1.2.3-55-g7522 From c4c497d27f0be8552ae244e76ba2bce66bd2443e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:23 +0100 Subject: io: make qio_channel_yield aware of AioContexts Support separate coroutines for reading and writing, and place the read/write handlers on the AioContext that the QIOChannel is registered with. Reviewed-by: Daniel P. 
Berrange Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213135235.12274-7-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/io/channel.h | 47 ++++++++++++++++++++++++++-- io/channel.c | 86 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 109 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/io/channel.h b/include/io/channel.h index 0bc7c3f47f..5d48906998 100644 --- a/include/io/channel.h +++ b/include/io/channel.h @@ -23,6 +23,7 @@ #include "qemu-common.h" #include "qom/object.h" +#include "qemu/coroutine.h" #include "block/aio.h" #define TYPE_QIO_CHANNEL "qio-channel" @@ -81,6 +82,9 @@ struct QIOChannel { Object parent; unsigned int features; /* bitmask of QIOChannelFeatures */ char *name; + AioContext *ctx; + Coroutine *read_coroutine; + Coroutine *write_coroutine; #ifdef _WIN32 HANDLE event; /* For use with GSource on Win32 */ #endif @@ -502,14 +506,51 @@ guint qio_channel_add_watch(QIOChannel *ioc, GDestroyNotify notify); +/** + * qio_channel_attach_aio_context: + * @ioc: the channel object + * @ctx: the #AioContext to set the handlers on + * + * Request that qio_channel_yield() sets I/O handlers on + * the given #AioContext. If @ctx is %NULL, qio_channel_yield() + * uses QEMU's main thread event loop. + * + * You can move a #QIOChannel from one #AioContext to another even if + * I/O handlers are set for a coroutine. However, #QIOChannel provides + * no synchronization between the calls to qio_channel_yield() and + * qio_channel_attach_aio_context(). + * + * Therefore you should first call qio_channel_detach_aio_context() + * to ensure that the coroutine is not entered concurrently. Then, + * while the coroutine has yielded, call qio_channel_attach_aio_context(), + * and then aio_co_schedule() to place the coroutine on the new + * #AioContext. The calls to qio_channel_detach_aio_context() + * and qio_channel_attach_aio_context() should be protected with + * aio_context_acquire() and aio_context_release(). + */ +void qio_channel_attach_aio_context(QIOChannel *ioc, + AioContext *ctx); + +/** + * qio_channel_detach_aio_context: + * @ioc: the channel object + * + * Disable any I/O handlers set by qio_channel_yield(). With the + * help of aio_co_schedule(), this allows moving a coroutine that was + * paused by qio_channel_yield() to another context. + */ +void qio_channel_detach_aio_context(QIOChannel *ioc); + /** * qio_channel_yield: + * @ioc: the channel object + * @condition: the I/O condition to wait for + * - * Yields execution from the current coroutine until - * the condition indicated by @condition becomes - * available. + * Yields execution from the current coroutine until the condition + * indicated by @condition becomes available. @condition must + * be either %G_IO_IN or %G_IO_OUT; it cannot contain both. In + * addition, no two coroutines can be waiting on the same condition + * and channel at the same time.
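
Following the recipe in qio_channel_attach_aio_context()'s documentation above, the hand-over of a channel (and of the coroutine parked in qio_channel_yield()) to another context might look like the sketch below; ioc, old_ctx, new_ctx and co are assumed to be tracked by the caller, with co saved before detaching:

    /* Detach from the old context while holding its lock, so that the
     * coroutine can no longer be entered behind our back... */
    aio_context_acquire(old_ctx);
    qio_channel_detach_aio_context(ioc);
    aio_context_release(old_ctx);

    /* ...then attach to the new context and reschedule the coroutine. */
    aio_context_acquire(new_ctx);
    qio_channel_attach_aio_context(ioc, new_ctx);
    aio_co_schedule(new_ctx, co);   /* resume it on the new context */
    aio_context_release(new_ctx);
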
* * This must only be called from coroutine context */ diff --git a/io/channel.c b/io/channel.c index ce470d7c5d..cdf74540c1 100644 --- a/io/channel.c +++ b/io/channel.c @@ -21,7 +21,7 @@ #include "qemu/osdep.h" #include "io/channel.h" #include "qapi/error.h" -#include "qemu/coroutine.h" +#include "qemu/main-loop.h" bool qio_channel_has_feature(QIOChannel *ioc, QIOChannelFeature feature) @@ -238,36 +238,80 @@ off_t qio_channel_io_seek(QIOChannel *ioc, } -typedef struct QIOChannelYieldData QIOChannelYieldData; -struct QIOChannelYieldData { - QIOChannel *ioc; - Coroutine *co; -}; +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc); + +static void qio_channel_restart_read(void *opaque) +{ + QIOChannel *ioc = opaque; + Coroutine *co = ioc->read_coroutine; + ioc->read_coroutine = NULL; + qio_channel_set_aio_fd_handlers(ioc); + aio_co_wake(co); +} -static gboolean qio_channel_yield_enter(QIOChannel *ioc, - GIOCondition condition, - gpointer opaque) +static void qio_channel_restart_write(void *opaque) { - QIOChannelYieldData *data = opaque; - qemu_coroutine_enter(data->co); - return FALSE; + QIOChannel *ioc = opaque; + Coroutine *co = ioc->write_coroutine; + + ioc->write_coroutine = NULL; + qio_channel_set_aio_fd_handlers(ioc); + aio_co_wake(co); } +static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc) +{ + IOHandler *rd_handler = NULL, *wr_handler = NULL; + AioContext *ctx; + + if (ioc->read_coroutine) { + rd_handler = qio_channel_restart_read; + } + if (ioc->write_coroutine) { + wr_handler = qio_channel_restart_write; + } + + ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context(); + qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc); +} + +void qio_channel_attach_aio_context(QIOChannel *ioc, + AioContext *ctx) +{ + AioContext *old_ctx; + if (ioc->ctx == ctx) { + return; + } + + old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context(); + qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL); + ioc->ctx = ctx; + qio_channel_set_aio_fd_handlers(ioc); +} + +void qio_channel_detach_aio_context(QIOChannel *ioc) +{ + ioc->read_coroutine = NULL; + ioc->write_coroutine = NULL; + qio_channel_set_aio_fd_handlers(ioc); + ioc->ctx = NULL; +} void coroutine_fn qio_channel_yield(QIOChannel *ioc, GIOCondition condition) { - QIOChannelYieldData data; - assert(qemu_in_coroutine()); - data.ioc = ioc; - data.co = qemu_coroutine_self(); - qio_channel_add_watch(ioc, - condition, - qio_channel_yield_enter, - &data, - NULL); + if (condition == G_IO_IN) { + assert(!ioc->read_coroutine); + ioc->read_coroutine = qemu_coroutine_self(); + } else if (condition == G_IO_OUT) { + assert(!ioc->write_coroutine); + ioc->write_coroutine = qemu_coroutine_self(); + } else { + abort(); + } + qio_channel_set_aio_fd_handlers(ioc); qemu_coroutine_yield(); } -- cgit v1.2.3-55-g7522 From a153bf52b37e148f052b0869600877130671a03d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:33 +0100 Subject: aio-posix: partially inline aio_dispatch into aio_poll This patch prepares for the removal of unnecessary lockcnt inc/dec pairs. Extract the dispatching loop for file descriptor handlers into a new function aio_dispatch_handlers, and then inline aio_dispatch into aio_poll. aio_dispatch can now become void. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Reviewed-by: Daniel P. 
Berrange Message-id: 20170213135235.12274-17-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/block/aio.h | 6 +----- util/aio-posix.c | 44 ++++++++++++++------------------------------ util/aio-win32.c | 13 ++++--------- util/async.c | 2 +- 4 files changed, 20 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/block/aio.h b/include/block/aio.h index 614cbc6982..677b6ffc25 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -310,12 +310,8 @@ bool aio_pending(AioContext *ctx); /* Dispatch any pending callbacks from the GSource attached to the AioContext. * * This is used internally in the implementation of the GSource. - * - * @dispatch_fds: true to process fds, false to skip them - * (can be used as an optimization by callers that know there - * are no fds ready) */ -bool aio_dispatch(AioContext *ctx, bool dispatch_fds); +void aio_dispatch(AioContext *ctx); /* Progress in completing AIO work to occur. This can issue new pending * aio as a result of executing I/O completion or bh callbacks. diff --git a/util/aio-posix.c b/util/aio-posix.c index 84cee4315d..2173378570 100644 --- a/util/aio-posix.c +++ b/util/aio-posix.c @@ -386,12 +386,6 @@ static bool aio_dispatch_handlers(AioContext *ctx) AioHandler *node, *tmp; bool progress = false; - /* - * We have to walk very carefully in case aio_set_fd_handler is - * called while we're walking. - */ - qemu_lockcnt_inc(&ctx->list_lock); - QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) { int revents; @@ -426,33 +420,18 @@ static bool aio_dispatch_handlers(AioContext *ctx) } } - qemu_lockcnt_dec(&ctx->list_lock); return progress; } -/* - * Note that dispatch_fds == false has the side-effect of post-poning the - * freeing of deleted handlers. - */ -bool aio_dispatch(AioContext *ctx, bool dispatch_fds) +void aio_dispatch(AioContext *ctx) { - bool progress; - - /* - * If there are callbacks left that have been queued, we need to call them. - * Do not call select in this case, because it is possible that the caller - * does not need a complete flush (as is the case for aio_poll loops). 
- */ - progress = aio_bh_poll(ctx); + aio_bh_poll(ctx); - if (dispatch_fds) { - progress |= aio_dispatch_handlers(ctx); - } - - /* Run our timers */ - progress |= timerlistgroup_run_timers(&ctx->tlg); + qemu_lockcnt_inc(&ctx->list_lock); + aio_dispatch_handlers(ctx); + qemu_lockcnt_dec(&ctx->list_lock); - return progress; + timerlistgroup_run_timers(&ctx->tlg); } /* These thread-local variables are used only in a small part of aio_poll @@ -702,11 +681,16 @@ bool aio_poll(AioContext *ctx, bool blocking) npfd = 0; qemu_lockcnt_dec(&ctx->list_lock); - /* Run dispatch even if there were no readable fds to run timers */ - if (aio_dispatch(ctx, ret > 0)) { - progress = true; + progress |= aio_bh_poll(ctx); + + if (ret > 0) { + qemu_lockcnt_inc(&ctx->list_lock); + progress |= aio_dispatch_handlers(ctx); + qemu_lockcnt_dec(&ctx->list_lock); } + progress |= timerlistgroup_run_timers(&ctx->tlg); + return progress; } diff --git a/util/aio-win32.c b/util/aio-win32.c index 20b63ce875..442a1792b7 100644 --- a/util/aio-win32.c +++ b/util/aio-win32.c @@ -309,16 +309,11 @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) return progress; } -bool aio_dispatch(AioContext *ctx, bool dispatch_fds) +void aio_dispatch(AioContext *ctx) { - bool progress; - - progress = aio_bh_poll(ctx); - if (dispatch_fds) { - progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE); - } - progress |= timerlistgroup_run_timers(&ctx->tlg); - return progress; + aio_bh_poll(ctx); + aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE); + timerlistgroup_run_timers(&ctx->tlg); } bool aio_poll(AioContext *ctx, bool blocking) diff --git a/util/async.c b/util/async.c index c54da7164e..187bc5bc4b 100644 --- a/util/async.c +++ b/util/async.c @@ -258,7 +258,7 @@ aio_ctx_dispatch(GSource *source, AioContext *ctx = (AioContext *) source; assert(callback == NULL); - aio_dispatch(ctx, true); + aio_dispatch(ctx); return true; } -- cgit v1.2.3-55-g7522 From 91bcea4899017891983b9149bd50cb283e78dfc0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 14:52:35 +0100 Subject: block: document fields protected by AioContext lock Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Reviewed-by: Daniel P. Berrange Message-id: 20170213135235.12274-19-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/block/block_int.h | 64 +++++++++++++++++++++++++----------------- include/sysemu/block-backend.h | 14 ++++++--- 2 files changed, 49 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/block/block_int.h b/include/block/block_int.h index 2d92d7edfe..1670941da9 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -430,8 +430,9 @@ struct BdrvChild { * copied as well. */ struct BlockDriverState { - int64_t total_sectors; /* if we are reading a disk image, give its - size in sectors */ + /* Protected by big QEMU lock or read-only after opening. No special + * locking needed during I/O... + */ int open_flags; /* flags used to open the file, re-used for re-open */ bool read_only; /* if true, the media is read only */ bool encrypted; /* if true, the media is encrypted */ @@ -439,14 +440,6 @@ struct BlockDriverState { bool sg; /* if true, the device is a /dev/sg* */ bool probed; /* if true, format was probed rather than specified */ - int copy_on_read; /* if nonzero, copy read backing sectors into image. - note this is a reference count */ - - CoQueue flush_queue; /* Serializing flush queue */ - bool active_flush_req; /* Flush request in flight? 
*/ - unsigned int write_gen; /* Current data generation */ - unsigned int flushed_gen; /* Flushed write generation */ - BlockDriver *drv; /* NULL means no media */ void *opaque; @@ -468,18 +461,6 @@ struct BlockDriverState { BdrvChild *backing; BdrvChild *file; - /* Callback before write request is processed */ - NotifierWithReturnList before_write_notifiers; - - /* number of in-flight requests; overall and serialising */ - unsigned int in_flight; - unsigned int serialising_in_flight; - - bool wakeup; - - /* Offset after the highest byte written to */ - uint64_t wr_highest_offset; - /* I/O Limits */ BlockLimits bl; @@ -497,11 +478,8 @@ struct BlockDriverState { QTAILQ_ENTRY(BlockDriverState) bs_list; /* element of the list of monitor-owned BDS */ QTAILQ_ENTRY(BlockDriverState) monitor_list; - QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; int refcnt; - QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; - /* operation blockers */ QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX]; @@ -522,6 +500,31 @@ struct BlockDriverState { /* The error object in use for blocking operations on backing_hd */ Error *backing_blocker; + /* Protected by AioContext lock */ + + /* If true, copy read backing sectors into image. Can be >1 if more + * than one client has requested copy-on-read. + */ + int copy_on_read; + + /* If we are reading a disk image, give its size in sectors. + * Generally read-only; it is written to by load_vmstate and save_vmstate, + * but the block layer is quiescent during those. + */ + int64_t total_sectors; + + /* Callback before write request is processed */ + NotifierWithReturnList before_write_notifiers; + + /* number of in-flight requests; overall and serialising */ + unsigned int in_flight; + unsigned int serialising_in_flight; + + bool wakeup; + + /* Offset after the highest byte written to */ + uint64_t wr_highest_offset; + /* threshold limit for writes, in bytes. "High water mark". */ uint64_t write_threshold_offset; NotifierWithReturn write_threshold_notifier; @@ -529,6 +532,17 @@ struct BlockDriverState { /* counter for nested bdrv_io_plug */ unsigned io_plugged; + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; + CoQueue flush_queue; /* Serializing flush queue */ + bool active_flush_req; /* Flush request in flight? */ + unsigned int write_gen; /* Current data generation */ + unsigned int flushed_gen; /* Flushed write generation */ + + QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps; + + /* do we need to tell the guest if we have a volatile write cache? */ + int enable_write_cache; + int quiesce_counter; }; diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index 6444e41d39..f365a51acf 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -64,14 +64,20 @@ typedef struct BlockDevOps { * fields that must be public. This is in particular for QLIST_ENTRY() and * friends so that BlockBackends can be kept in lists outside block-backend.c */ typedef struct BlockBackendPublic { - /* I/O throttling. - * throttle_state tells us if this BlockBackend has I/O limits configured. - * io_limits_disabled tells us if they are currently being enforced */ + /* I/O throttling has its own locking, but also some fields are + * protected by the AioContext lock. + */ + + /* Protected by AioContext lock. */ CoQueue throttled_reqs[2]; + + /* Nonzero if the I/O limits are currently being ignored; generally + * it is zero. */ unsigned int io_limits_disabled; /* The following fields are protected by the ThrottleGroup lock.
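
As an illustration of the locking discipline these comments establish, code running outside a BlockDriverState's home AioContext would bracket access to the "Protected by AioContext lock" fields as in the following sketch (the helper name is hypothetical; bdrv_get_aio_context(), aio_context_acquire() and aio_context_release() are existing API):

    /* Hypothetical helper: sample an AioContext-protected field from
     * outside the BDS's home thread. */
    static unsigned int query_in_flight(BlockDriverState *bs)
    {
        AioContext *ctx = bdrv_get_aio_context(bs);
        unsigned int n;

        aio_context_acquire(ctx);   /* serialize with the owning iothread */
        n = bs->in_flight;
        aio_context_release(ctx);
        return n;
    }
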
- * See the ThrottleGroup documentation for details. */ + * See the ThrottleGroup documentation for details. + * throttle_state tells us if I/O limits are configured. */ ThrottleState *throttle_state; ThrottleTimers throttle_timers; unsigned pending_reqs[2]; -- cgit v1.2.3-55-g7522 From fed20a70e39bb9385020bdc4e8839d95326df8e2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:39 +0100 Subject: coroutine-lock: make CoMutex thread-safe This uses the lock-free mutex described in the paper '"Blocking without Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and Papatriantafilou. The same technique is used in OSv, and in fact the code is essentially a conversion to C of OSv's code. [Added missing coroutine_fn in tests/test-aio-multithread.c. --Stefan] Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-2-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/qemu/coroutine.h | 17 ++++- tests/test-aio-multithread.c | 86 ++++++++++++++++++++++++ util/qemu-coroutine-lock.c | 153 ++++++++++++++++++++++++++++++++++++++++--- util/trace-events | 1 + 4 files changed, 245 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 12584ed1b7..fce228f68a 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -160,10 +160,23 @@ bool qemu_co_queue_empty(CoQueue *queue); /** * Provides a mutex that can be used to synchronise coroutines */ +struct CoWaitRecord; typedef struct CoMutex { - bool locked; + /* Count of pending lockers; 0 for a free mutex, 1 for an + * uncontended mutex. + */ + unsigned locked; + + /* A queue of waiters. Elements are added atomically in front of + * from_push. to_pop is only populated, and popped from, by whoever + * is in charge of the next wakeup. This can be an unlocker or, + * through the handoff protocol, a locker that is about to go to sleep. + */ + QSLIST_HEAD(, CoWaitRecord) from_push, to_pop; + + unsigned handoff, sequence; + Coroutine *holder; - CoQueue queue; } CoMutex; /** diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c index 534807d45b..4fa2e9bb74 100644 --- a/tests/test-aio-multithread.c +++ b/tests/test-aio-multithread.c @@ -196,6 +196,88 @@ static void test_multi_co_schedule_10(void) test_multi_co_schedule(10); } +/* CoMutex thread-safety. */ + +static uint32_t atomic_counter; +static uint32_t running; +static uint32_t counter; +static CoMutex comutex; + +static void coroutine_fn test_multi_co_mutex_entry(void *opaque) +{ + while (!atomic_mb_read(&now_stopping)) { + qemu_co_mutex_lock(&comutex); + counter++; + qemu_co_mutex_unlock(&comutex); + + /* Increase atomic_counter *after* releasing the mutex. Otherwise + * there is a chance (it happens about 1 in 3 runs) that the iothread + * exits before the coroutine is woken up, causing a spurious + * assertion failure. 
+ */ + atomic_inc(&atomic_counter); + } + atomic_dec(&running); +} + +static void test_multi_co_mutex(int threads, int seconds) +{ + int i; + + qemu_co_mutex_init(&comutex); + counter = 0; + atomic_counter = 0; + now_stopping = false; + + create_aio_contexts(); + assert(threads <= NUM_CONTEXTS); + running = threads; + for (i = 0; i < threads; i++) { + Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL); + aio_co_schedule(ctx[i], co1); + } + + g_usleep(seconds * 1000000); + + atomic_mb_set(&now_stopping, true); + while (running > 0) { + g_usleep(100000); + } + + join_aio_contexts(); + g_test_message("%d iterations/second\n", counter / seconds); + g_assert_cmpint(counter, ==, atomic_counter); +} + +/* Testing with NUM_CONTEXTS threads focuses on the queue. The mutex however + * is too contended (and the threads spend too much time in aio_poll) + * to actually stress the handoff protocol. + */ +static void test_multi_co_mutex_1(void) +{ + test_multi_co_mutex(NUM_CONTEXTS, 1); +} + +static void test_multi_co_mutex_10(void) +{ + test_multi_co_mutex(NUM_CONTEXTS, 10); +} + +/* Testing with fewer threads stresses the handoff protocol too. Still, the + * case where the locker _can_ pick up a handoff is very rare, happening + * about 10 times in 1 million, so increase the runtime a bit compared to + * other "quick" testcases that only run for 1 second. + */ +static void test_multi_co_mutex_2_3(void) +{ + test_multi_co_mutex(2, 3); +} + +static void test_multi_co_mutex_2_30(void) +{ + test_multi_co_mutex(2, 30); +} + /* End of tests. */ int main(int argc, char **argv) @@ -206,8 +288,12 @@ int main(int argc, char **argv) g_test_add_func("/aio/multi/lifecycle", test_lifecycle); if (g_test_quick()) { g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1); + g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1); + g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3); } else { g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10); + g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10); + g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30); } return g_test_run(); } diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index e6afd1aa6c..25da9fa8d0 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -20,6 +20,10 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. + * + * The lock-free mutex implementation is based on OSv + * (core/lfmutex.cc, include/lockfree/mutex.hh). + * Copyright (C) 2013 Cloudius Systems, Ltd. */ #include "qemu/osdep.h" @@ -111,27 +115,119 @@ bool qemu_co_queue_empty(CoQueue *queue) return QSIMPLEQ_FIRST(&queue->entries) == NULL; } +/* The wait records are handled with a multiple-producer, single-consumer + * lock-free queue. There cannot be two concurrent pop_waiter() calls + * because pop_waiter() can only be called while mutex->handoff is zero. + * This can happen in three cases: + * - in qemu_co_mutex_unlock, before the hand-off protocol has started. + * In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and + * not take part in the handoff. + * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from + * qemu_co_mutex_unlock. In this case, qemu_co_mutex_unlock will fail + * the cmpxchg (it will see either 0 or the next sequence value) and + * exit. 
The next hand-off cannot begin until qemu_co_mutex_lock has + * woken up someone. + * - in qemu_co_mutex_unlock, if it takes the hand-off token itself. + * In this case another iteration starts with mutex->handoff == 0; + * a concurrent qemu_co_mutex_lock will fail the cmpxchg, and + * qemu_co_mutex_unlock will go back to case (1). + * + * The following functions manage this queue. + */ +typedef struct CoWaitRecord { + Coroutine *co; + QSLIST_ENTRY(CoWaitRecord) next; +} CoWaitRecord; + +static void push_waiter(CoMutex *mutex, CoWaitRecord *w) +{ + w->co = qemu_coroutine_self(); + QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next); +} + +static void move_waiters(CoMutex *mutex) +{ + QSLIST_HEAD(, CoWaitRecord) reversed; + QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push); + while (!QSLIST_EMPTY(&reversed)) { + CoWaitRecord *w = QSLIST_FIRST(&reversed); + QSLIST_REMOVE_HEAD(&reversed, next); + QSLIST_INSERT_HEAD(&mutex->to_pop, w, next); + } +} + +static CoWaitRecord *pop_waiter(CoMutex *mutex) +{ + CoWaitRecord *w; + + if (QSLIST_EMPTY(&mutex->to_pop)) { + move_waiters(mutex); + if (QSLIST_EMPTY(&mutex->to_pop)) { + return NULL; + } + } + w = QSLIST_FIRST(&mutex->to_pop); + QSLIST_REMOVE_HEAD(&mutex->to_pop, next); + return w; +} + +static bool has_waiters(CoMutex *mutex) +{ + return !QSLIST_EMPTY(&mutex->to_pop) || !QSLIST_EMPTY(&mutex->from_push); +} + void qemu_co_mutex_init(CoMutex *mutex) { memset(mutex, 0, sizeof(*mutex)); - qemu_co_queue_init(&mutex->queue); } -void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) +static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) { Coroutine *self = qemu_coroutine_self(); + CoWaitRecord w; + unsigned old_handoff; trace_qemu_co_mutex_lock_entry(mutex, self); + w.co = self; + push_waiter(mutex, &w); + + /* This is the "Responsibility Hand-Off" protocol; a lock() picks from + * a concurrent unlock() the responsibility of waking somebody up. + */ + old_handoff = atomic_mb_read(&mutex->handoff); + if (old_handoff && + has_waiters(mutex) && + atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) { + /* There can be no concurrent pops, because there can be only + * one active handoff at a time. + */ + CoWaitRecord *to_wake = pop_waiter(mutex); + Coroutine *co = to_wake->co; + if (co == self) { + /* We got the lock ourselves! */ + assert(to_wake == &w); + return; + } - while (mutex->locked) { - qemu_co_queue_wait(&mutex->queue); + aio_co_wake(co); } - mutex->locked = true; + qemu_coroutine_yield(); + trace_qemu_co_mutex_lock_return(mutex, self); +} + +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) +{ + Coroutine *self = qemu_coroutine_self(); + + if (atomic_fetch_inc(&mutex->locked) == 0) { + /* Uncontended. */ + trace_qemu_co_mutex_lock_uncontended(mutex, self); + } else { + qemu_co_mutex_lock_slowpath(mutex); + } mutex->holder = self; self->locks_held++; - - trace_qemu_co_mutex_lock_return(mutex, self); } void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) @@ -140,14 +236,51 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) trace_qemu_co_mutex_unlock_entry(mutex, self); - assert(mutex->locked == true); + assert(mutex->locked); assert(mutex->holder == self); assert(qemu_in_coroutine()); - mutex->locked = false; mutex->holder = NULL; self->locks_held--; - qemu_co_queue_next(&mutex->queue); + if (atomic_fetch_dec(&mutex->locked) == 1) { + /* No waiting qemu_co_mutex_lock(). Pfew, that was easy!
*/ + return; + } + + for (;;) { + CoWaitRecord *to_wake = pop_waiter(mutex); + unsigned our_handoff; + + if (to_wake) { + Coroutine *co = to_wake->co; + aio_co_wake(co); + break; + } + + /* Some concurrent lock() is in progress (we know this because + * mutex->locked was >1) but it hasn't yet put itself on the wait + * queue. Pick a sequence number for the handoff protocol (not 0). + */ + if (++mutex->sequence == 0) { + mutex->sequence = 1; + } + + our_handoff = mutex->sequence; + atomic_mb_set(&mutex->handoff, our_handoff); + if (!has_waiters(mutex)) { + /* The concurrent lock has not added itself yet, so it + * will be able to pick our handoff. + */ + break; + } + + /* Try to do the handoff protocol ourselves; if somebody else has + * already taken it, however, we're done and they're responsible. + */ + if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) { + break; + } + } trace_qemu_co_mutex_unlock_return(mutex, self); } diff --git a/util/trace-events b/util/trace-events index 65c978715a..ac27d94a97 100644 --- a/util/trace-events +++ b/util/trace-events @@ -28,6 +28,7 @@ qemu_coroutine_terminate(void *co) "self %p" # util/qemu-coroutine-lock.c qemu_co_queue_run_restart(void *co) "co %p" +qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p" qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p" qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p" qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p" -- cgit v1.2.3-55-g7522 From 480cff632221dc4d4889bf72dd0f09cd35096bc1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:40 +0100 Subject: coroutine-lock: add limited spinning to CoMutex Running a very small critical section on pthread_mutex_t and CoMutex shows that pthread_mutex_t is much faster because it doesn't actually go to sleep. What happens is that the critical section is shorter than the latency of entering the kernel and thus FUTEX_WAIT always fails. With CoMutex there is no such latency but you still want to avoid wait and wakeup. So introduce it artificially. This only works with a single waiter; because CoMutex is fair, it will always have more waits and wakeups than a pthread_mutex_t. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-3-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/qemu/coroutine.h | 5 +++++ util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------ util/qemu-coroutine.c | 2 +- 3 files changed, 51 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index fce228f68a..12ce8e109e 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -167,6 +167,11 @@ typedef struct CoMutex { */ unsigned locked; + /* Context that is holding the lock. Useful to avoid spinning + * when two coroutines on the same AioContext try to get the lock. + */ + AioContext *ctx; + /* A queue of waiters. Elements are added atomically in front of * from_push. to_pop is only populated, and popped from, by whoever * is in charge of the next wakeup.
This can be an unlocker or, diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 25da9fa8d0..73fe77cc80 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -30,6 +30,7 @@ #include "qemu-common.h" #include "qemu/coroutine.h" #include "qemu/coroutine_int.h" +#include "qemu/processor.h" #include "qemu/queue.h" #include "block/aio.h" #include "trace.h" @@ -181,7 +182,18 @@ void qemu_co_mutex_init(CoMutex *mutex) memset(mutex, 0, sizeof(*mutex)); } -static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) +static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co) +{ + /* Read co before co->ctx; pairs with smp_wmb() in + * qemu_coroutine_enter(). + */ + smp_read_barrier_depends(); + mutex->ctx = co->ctx; + aio_co_wake(co); +} + +static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx, + CoMutex *mutex) { Coroutine *self = qemu_coroutine_self(); CoWaitRecord w; @@ -206,10 +218,11 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) if (co == self) { /* We got the lock ourselves! */ assert(to_wake == &w); + mutex->ctx = ctx; return; } - aio_co_wake(co); + qemu_co_mutex_wake(mutex, co); } qemu_coroutine_yield(); @@ -218,13 +231,39 @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex) void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) { + AioContext *ctx = qemu_get_current_aio_context(); Coroutine *self = qemu_coroutine_self(); + int waiters, i; + + /* Running a very small critical section on pthread_mutex_t and CoMutex + * shows that pthread_mutex_t is much faster because it doesn't actually + * go to sleep. What happens is that the critical section is shorter + * than the latency of entering the kernel and thus FUTEX_WAIT always + * fails. With CoMutex there is no such latency but you still want to + * avoid wait and wakeup. So introduce it artificially. + */ + i = 0; +retry_fast_path: + waiters = atomic_cmpxchg(&mutex->locked, 0, 1); + if (waiters != 0) { + while (waiters == 1 && ++i < 1000) { + if (atomic_read(&mutex->ctx) == ctx) { + break; + } + if (atomic_read(&mutex->locked) == 0) { + goto retry_fast_path; + } + cpu_relax(); + } + waiters = atomic_fetch_inc(&mutex->locked); + } - if (atomic_fetch_inc(&mutex->locked) == 0) { + if (waiters == 0) { /* Uncontended. */ trace_qemu_co_mutex_lock_uncontended(mutex, self); + mutex->ctx = ctx; } else { - qemu_co_mutex_lock_slowpath(mutex); + qemu_co_mutex_lock_slowpath(ctx, mutex); } mutex->holder = self; self->locks_held++; @@ -240,6 +279,7 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) assert(mutex->holder == self); assert(qemu_in_coroutine()); + mutex->ctx = NULL; mutex->holder = NULL; self->locks_held--; if (atomic_fetch_dec(&mutex->locked) == 1) { @@ -252,8 +292,7 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) unsigned our_handoff; if (to_wake) { - Coroutine *co = to_wake->co; - aio_co_wake(co); + qemu_co_mutex_wake(mutex, to_wake->co); break; } diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 415600dc30..72412e5649 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -118,7 +118,7 @@ void qemu_coroutine_enter(Coroutine *co) co->ctx = qemu_get_current_aio_context(); /* Store co->ctx before anything that stores co. Matches - * barrier in aio_co_wake. + * barrier in aio_co_wake and qemu_co_mutex_wake. 
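
Taken together with the thread-safe slow path from the previous patch, CoMutex now behaves as an ordinary mutex for coroutines that may run on different AioContexts. A minimal usage sketch (worker() and the counter are illustrative names, not part of the patch):

    /* Illustrative only; real callers initialize the mutex once with
     * qemu_co_mutex_init() and run worker() in several coroutines that
     * may live on different AioContexts. */
    static CoMutex comutex;
    static long shared_count;

    static void coroutine_fn worker(void *opaque)
    {
        qemu_co_mutex_lock(&comutex);   /* fast path, brief spin, or yield;
                                         * never blocks the thread */
        shared_count++;
        qemu_co_mutex_unlock(&comutex);
    }
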
*/ smp_wmb(); -- cgit v1.2.3-55-g7522 From f8c6e1cbc3d397207bedabdb2932fd6e1d7484df Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:42 +0100 Subject: coroutine-lock: place CoMutex before CoQueue in header This will avoid forward references in the next patch. It is also more logical because CoQueue is no longer the basic primitive. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-5-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 12ce8e109e..9f685794d2 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -112,51 +112,6 @@ bool qemu_in_coroutine(void); */ bool qemu_coroutine_entered(Coroutine *co); - -/** - * CoQueues are a mechanism to queue coroutines in order to continue executing - * them later. They provide the fundamental primitives on which coroutine locks - * are built. - */ -typedef struct CoQueue { - QSIMPLEQ_HEAD(, Coroutine) entries; -} CoQueue; - -/** - * Initialise a CoQueue. This must be called before any other operation is used - * on the CoQueue. - */ -void qemu_co_queue_init(CoQueue *queue); - -/** - * Adds the current coroutine to the CoQueue and transfers control to the - * caller of the coroutine. - */ -void coroutine_fn qemu_co_queue_wait(CoQueue *queue); - -/** - * Restarts the next coroutine in the CoQueue and removes it from the queue. - * - * Returns true if a coroutine was restarted, false if the queue is empty. - */ -bool coroutine_fn qemu_co_queue_next(CoQueue *queue); - -/** - * Restarts all coroutines in the CoQueue and leaves the queue empty. - */ -void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue); - -/** - * Enter the next coroutine in the queue - */ -bool qemu_co_enter_next(CoQueue *queue); - -/** - * Checks if the CoQueue is empty. - */ -bool qemu_co_queue_empty(CoQueue *queue); - - /** * Provides a mutex that can be used to synchronise coroutines */ @@ -202,6 +157,50 @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex); */ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); + +/** + * CoQueues are a mechanism to queue coroutines in order to continue executing + * them later. + */ +typedef struct CoQueue { + QSIMPLEQ_HEAD(, Coroutine) entries; +} CoQueue; + +/** + * Initialise a CoQueue. This must be called before any other operation is used + * on the CoQueue. + */ +void qemu_co_queue_init(CoQueue *queue); + +/** + * Adds the current coroutine to the CoQueue and transfers control to the + * caller of the coroutine. + */ +void coroutine_fn qemu_co_queue_wait(CoQueue *queue); + +/** + * Restarts the next coroutine in the CoQueue and removes it from the queue. + * + * Returns true if a coroutine was restarted, false if the queue is empty. + */ +bool coroutine_fn qemu_co_queue_next(CoQueue *queue); + +/** + * Restarts all coroutines in the CoQueue and leaves the queue empty. + */ +void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue); + +/** + * Enter the next coroutine in the queue + */ +bool qemu_co_enter_next(CoQueue *queue); + +/** + * Checks if the CoQueue is empty.
+ */ +bool qemu_co_queue_empty(CoQueue *queue); + + typedef struct CoRwlock { bool writer; int reader; -- cgit v1.2.3-55-g7522 From 1ace7ceac507d90d50ecb2e13f7222beadb64d92 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:43 +0100 Subject: coroutine-lock: add mutex argument to CoQueue APIs All that CoQueue needs in order to become thread-safe is help from an external mutex. Add this to the API. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-6-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- block/backup.c | 2 +- block/io.c | 4 ++-- block/nbd-client.c | 2 +- block/qcow2-cluster.c | 4 +--- block/sheepdog.c | 2 +- block/throttle-groups.c | 2 +- hw/9pfs/9p.c | 2 +- include/qemu/coroutine.h | 8 +++++--- util/qemu-coroutine-lock.c | 24 +++++++++++++++++++++--- 9 files changed, 34 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/block/backup.c b/block/backup.c index ea38733849..fe010e78e3 100644 --- a/block/backup.c +++ b/block/backup.c @@ -64,7 +64,7 @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, retry = false; QLIST_FOREACH(req, &job->inflight_reqs, list) { if (end > req->start && start < req->end) { - qemu_co_queue_wait(&req->wait_queue); + qemu_co_queue_wait(&req->wait_queue, NULL); retry = true; break; } diff --git a/block/io.c b/block/io.c index a5c7d36d8c..d5c45447fd 100644 --- a/block/io.c +++ b/block/io.c @@ -539,7 +539,7 @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) * (instead of producing a deadlock in the former case). */ if (!req->waiting_for) { self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue); + qemu_co_queue_wait(&req->wait_queue, NULL); self->waiting_for = NULL; retry = true; waited = true; @@ -2275,7 +2275,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) /* Wait until any previous flushes are completed */ while (bs->active_flush_req) { - qemu_co_queue_wait(&bs->flush_queue); + qemu_co_queue_wait(&bs->flush_queue, NULL); } bs->active_flush_req = true; diff --git a/block/nbd-client.c b/block/nbd-client.c index 10fcc9e81d..0dc12c2d67 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -182,7 +182,7 @@ static void nbd_coroutine_start(NBDClientSession *s, /* Poor man semaphore. The free_sema is locked when no other request * can be accepted, and unlocked after receiving one reply. */ if (s->in_flight == MAX_NBD_REQUESTS) { - qemu_co_queue_wait(&s->free_sema); + qemu_co_queue_wait(&s->free_sema, NULL); assert(s->in_flight < MAX_NBD_REQUESTS); } s->in_flight++; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 928c1e298d..78c11d4948 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -932,9 +932,7 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, if (bytes == 0) { /* Wait for the dependency to complete. We need to recheck * the free/allocated clusters when we continue. 
*/ - qemu_co_mutex_unlock(&s->lock); - qemu_co_queue_wait(&old_alloc->dependent_requests); - qemu_co_mutex_lock(&s->lock); + qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock); return -EAGAIN; } } diff --git a/block/sheepdog.c b/block/sheepdog.c index 32c4e4c507..860ba61502 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -486,7 +486,7 @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb) retry: QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { if (AIOCBOverlapping(acb, cb)) { - qemu_co_queue_wait(&s->overlapping_queue); + qemu_co_queue_wait(&s->overlapping_queue, NULL); goto retry; } } diff --git a/block/throttle-groups.c b/block/throttle-groups.c index aade5def39..b73e7a800b 100644 --- a/block/throttle-groups.c +++ b/block/throttle-groups.c @@ -326,7 +326,7 @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk, if (must_wait || blkp->pending_reqs[is_write]) { blkp->pending_reqs[is_write]++; qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&blkp->throttled_reqs[is_write]); + qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL); qemu_mutex_lock(&tg->lock); blkp->pending_reqs[is_write]--; } diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c index 99e94723b9..3af1c93dc8 100644 --- a/hw/9pfs/9p.c +++ b/hw/9pfs/9p.c @@ -2374,7 +2374,7 @@ static void coroutine_fn v9fs_flush(void *opaque) /* * Wait for pdu to complete. */ - qemu_co_queue_wait(&cancel_pdu->complete); + qemu_co_queue_wait(&cancel_pdu->complete, NULL); cancel_pdu->cancelled = 0; pdu_free(cancel_pdu); } diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index 9f685794d2..d2de268981 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -160,7 +160,8 @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex); /** * CoQueues are a mechanism to queue coroutines in order to continue executing - * them later. + * them later. They are similar to condition variables, but they need help + * from an external mutex in order to maintain thread-safety. */ typedef struct CoQueue { QSIMPLEQ_HEAD(, Coroutine) entries; @@ -174,9 +175,10 @@ void qemu_co_queue_init(CoQueue *queue); /** * Adds the current coroutine to the CoQueue and transfers control to the - * caller of the coroutine. + * caller of the coroutine. The mutex is unlocked during the wait and + * locked again afterwards. */ -void coroutine_fn qemu_co_queue_wait(CoQueue *queue); +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex); /** * Restarts the next coroutine in the CoQueue and removes it from the queue. diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index 73fe77cc80..b0a554f40d 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -40,12 +40,30 @@ void qemu_co_queue_init(CoQueue *queue) QSIMPLEQ_INIT(&queue->entries); } -void coroutine_fn qemu_co_queue_wait(CoQueue *queue) +void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex) { Coroutine *self = qemu_coroutine_self(); QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next); + + if (mutex) { + qemu_co_mutex_unlock(mutex); + } + + /* There is no race condition here. Other threads will call + * aio_co_schedule on our AioContext, which can reenter this + * coroutine but only after this yield and after the main loop + * has gone through the next iteration. 
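
With the new mutex argument, qemu_co_queue_wait() follows the classic condition-variable pattern. A hedged sketch of the intended usage (the function names and the resources counter are illustrative; the CoMutex and CoQueue would be initialized once with qemu_co_mutex_init() and qemu_co_queue_init()):

    static CoMutex lock;
    static CoQueue queue;
    static int resources;   /* the predicate the queue protects */

    static void coroutine_fn consume_one(void *opaque)
    {
        qemu_co_mutex_lock(&lock);
        while (resources == 0) {
            /* Drops 'lock' while sleeping, reacquires it on wakeup. */
            qemu_co_queue_wait(&queue, &lock);
        }
        resources--;
        qemu_co_mutex_unlock(&lock);
    }

    static void coroutine_fn produce_one(void *opaque)
    {
        qemu_co_mutex_lock(&lock);
        resources++;
        qemu_co_queue_next(&queue);    /* wake one waiter, if any */
        qemu_co_mutex_unlock(&lock);
    }
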
+ */ qemu_coroutine_yield(); assert(qemu_in_coroutine()); + + /* TODO: OSv implements wait morphing here, where the wakeup + * primitive automatically places the woken coroutine on the + * mutex's queue. This avoids the thundering herd effect. + */ + if (mutex) { + qemu_co_mutex_lock(mutex); + } } /** @@ -335,7 +353,7 @@ void qemu_co_rwlock_rdlock(CoRwlock *lock) Coroutine *self = qemu_coroutine_self(); while (lock->writer) { - qemu_co_queue_wait(&lock->queue); + qemu_co_queue_wait(&lock->queue, NULL); } lock->reader++; self->locks_held++; @@ -365,7 +383,7 @@ void qemu_co_rwlock_wrlock(CoRwlock *lock) Coroutine *self = qemu_coroutine_self(); while (lock->writer || lock->reader) { - qemu_co_queue_wait(&lock->queue); + qemu_co_queue_wait(&lock->queue, NULL); } lock->writer = true; self->locks_held++; -- cgit v1.2.3-55-g7522 From a7b91d35bab97a2d3e779d0c64c9b837b52a6cf7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Feb 2017 19:12:44 +0100 Subject: coroutine-lock: make CoRwlock thread-safe and fair This adds a CoMutex around the existing CoQueue. Because the write-side can just take CoMutex, the old "writer" field is not necessary anymore. Instead of removing it altogether, count the number of pending writers during a read-side critical section and forbid further readers from entering. Signed-off-by: Paolo Bonzini Reviewed-by: Fam Zheng Message-id: 20170213181244.16297-7-pbonzini@redhat.com Signed-off-by: Stefan Hajnoczi --- include/qemu/coroutine.h | 3 ++- util/qemu-coroutine-lock.c | 35 ++++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index d2de268981..e60beaff81 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -204,8 +204,9 @@ bool qemu_co_queue_empty(CoQueue *queue); typedef struct CoRwlock { - bool writer; + int pending_writer; int reader; + CoMutex mutex; CoQueue queue; } CoRwlock; diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c index b0a554f40d..6328eed26b 100644 --- a/util/qemu-coroutine-lock.c +++ b/util/qemu-coroutine-lock.c @@ -346,16 +346,22 @@ void qemu_co_rwlock_init(CoRwlock *lock) { memset(lock, 0, sizeof(*lock)); qemu_co_queue_init(&lock->queue); + qemu_co_mutex_init(&lock->mutex); } void qemu_co_rwlock_rdlock(CoRwlock *lock) { Coroutine *self = qemu_coroutine_self(); - while (lock->writer) { - qemu_co_queue_wait(&lock->queue, NULL); + qemu_co_mutex_lock(&lock->mutex); + /* For fairness, wait if a writer is in line. */ + while (lock->pending_writer) { + qemu_co_queue_wait(&lock->queue, &lock->mutex); } lock->reader++; + qemu_co_mutex_unlock(&lock->mutex); + + /* The rest of the read-side critical section is run without the mutex. */ self->locks_held++; } @@ -364,10 +370,13 @@ void qemu_co_rwlock_unlock(CoRwlock *lock) Coroutine *self = qemu_coroutine_self(); assert(qemu_in_coroutine()); - if (lock->writer) { - lock->writer = false; + if (!lock->reader) { + /* The critical section started in qemu_co_rwlock_wrlock. 
*/ qemu_co_queue_restart_all(&lock->queue); } else { + self->locks_held--; + + qemu_co_mutex_lock(&lock->mutex); lock->reader--; assert(lock->reader >= 0); /* Wakeup only one waiting writer */ @@ -375,16 +384,20 @@ void qemu_co_rwlock_unlock(CoRwlock *lock) qemu_co_queue_next(&lock->queue); } } - self->locks_held--; + qemu_co_mutex_unlock(&lock->mutex); } void qemu_co_rwlock_wrlock(CoRwlock *lock) { - Coroutine *self = qemu_coroutine_self(); - - while (lock->writer || lock->reader) { - qemu_co_queue_wait(&lock->queue, NULL); + qemu_co_mutex_lock(&lock->mutex); + lock->pending_writer++; + while (lock->reader) { + qemu_co_queue_wait(&lock->queue, &lock->mutex); } - lock->writer = true; - self->locks_held++; + lock->pending_writer--; + + /* The rest of the write-side critical section is run with + * the mutex taken, so that lock->reader remains zero. + * There is no need to update self->locks_held. + */ } -- cgit v1.2.3-55-g7522
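
To close, a sketch of how the resulting CoRwlock is meant to be used from coroutine context (reader()/writer() are illustrative names; the lock would be initialized once with qemu_co_rwlock_init()):

    static CoRwlock rwlock;

    static void coroutine_fn reader(void *opaque)
    {
        qemu_co_rwlock_rdlock(&rwlock);
        /* Read-side critical section: runs without rwlock.mutex held,
         * so many readers can proceed in parallel. */
        qemu_co_rwlock_unlock(&rwlock);
    }

    static void coroutine_fn writer(void *opaque)
    {
        qemu_co_rwlock_wrlock(&rwlock);
        /* Write-side critical section: rwlock.mutex is held throughout,
         * keeping lock->reader at zero until unlock. */
        qemu_co_rwlock_unlock(&rwlock);
    }
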