From cdb3081269347fd9271fd1b7a9df312e2953bdd9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 23 Sep 2016 13:02:26 +0800 Subject: memory: introduce IOMMUNotifier and its caps
The IOMMU notifier list is used for notifying IO address mapping changes. Currently VFIO is the only user. However, it is possible that future consumers such as vhost may want to listen to only part of the notifications (e.g., cache invalidations).
This patch introduces IOMMUNotifier and IOMMUNotifierFlag bits for finer-grained control. IOMMUNotifier contains a bitfield in which the consumer describes what kinds of notification it is interested in. Currently two kinds of notifications are defined:
- IOMMU_NOTIFIER_MAP: for newly mapped entries (additions)
- IOMMU_NOTIFIER_UNMAP: for entries to be removed (cache invalidates)
When registering the IOMMU notifier, we need to specify one or more types of messages to listen to. When a notification is triggered, its type is checked against each notifier's type bits, and only notifiers that registered a matching bit are notified. (For any IOMMU implementation, an in-place mapping change should be notified with an UNMAP followed by a MAP.)
Signed-off-by: Peter Xu Message-Id: <1474606948-14391-2-git-send-email-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- hw/vfio/common.c | 4 ++-- include/exec/memory.h | 47 ++++++++++++++++++++++++++++++++++++------- include/hw/vfio/vfio-common.h | 2 +- memory.c | 37 +++++++++++++++++++++++++--------- 4 files changed, 71 insertions(+), 19 deletions(-)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b313e7c2c6..29188a12fc 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -293,11 +293,10 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } -static void vfio_iommu_map_notify(Notifier *n, void *data) +static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) { VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); VFIOContainer *container = giommu->container; - IOMMUTLBEntry *iotlb = data; hwaddr iova = iotlb->iova + giommu->iommu_offset; MemoryRegion *mr; hwaddr xlat; @@ -454,6 +453,7 @@ static void vfio_listener_region_add(MemoryListener *listener, section->offset_within_region; giommu->container = container; giommu->n.notify = vfio_iommu_map_notify; + giommu->n.notifier_flags = IOMMU_NOTIFIER_ALL; QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); memory_region_register_iommu_notifier(giommu->iommu, &giommu->n);
diff --git a/include/exec/memory.h b/include/exec/memory.h index 3e4d4164cd..14cda673f9 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -67,6 +67,27 @@ struct IOMMUTLBEntry { IOMMUAccessFlags perm; }; +/* + * Bitmap for different IOMMUNotifier capabilities. Each notifier can + * register with one or multiple IOMMU Notifier capability bit(s). + */ +typedef enum { + IOMMU_NOTIFIER_NONE = 0, + /* Notify cache invalidations */ + IOMMU_NOTIFIER_UNMAP = 0x1, + /* Notify entry changes (newly created entries) */ + IOMMU_NOTIFIER_MAP = 0x2, +} IOMMUNotifierFlag; + +#define IOMMU_NOTIFIER_ALL (IOMMU_NOTIFIER_MAP | IOMMU_NOTIFIER_UNMAP) + +struct IOMMUNotifier { + void (*notify)(struct IOMMUNotifier *notifier, IOMMUTLBEntry *data); + IOMMUNotifierFlag notifier_flags; + QLIST_ENTRY(IOMMUNotifier) node; +}; +typedef struct IOMMUNotifier IOMMUNotifier; + /* New-style MMIO accessors can indicate that the transaction failed.
* A zero (MEMTX_OK) response means success; anything else is a failure * of some kind. The memory subsystem will bitwise-OR together results @@ -201,7 +222,7 @@ struct MemoryRegion { const char *name; unsigned ioeventfd_nb; MemoryRegionIoeventfd *ioeventfds; - NotifierList iommu_notify; + QLIST_HEAD(, IOMMUNotifier) iommu_notify; }; /** @@ -607,6 +628,15 @@ uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr); /** * memory_region_notify_iommu: notify a change in an IOMMU translation entry. * + * The notification type will be decided by entry.perm bits: + * + * - For UNMAP (cache invalidation) notifies: set entry.perm to IOMMU_NONE. + * - For MAP (newly added entry) notifies: set entry.perm to the + * permission of the page (which is definitely !IOMMU_NONE). + * + * Note: for any IOMMU implementation, an in-place mapping change + * should be notified with an UNMAP followed by a MAP. + * * @mr: the memory region that was changed * @entry: the new entry in the IOMMU translation table. The entry * replaces all old entries for the same virtual I/O address range. @@ -620,11 +650,12 @@ void memory_region_notify_iommu(MemoryRegion *mr, * IOMMU translation entries. * * @mr: the memory region to observe - * @n: the notifier to be added; the notifier receives a pointer to an - * #IOMMUTLBEntry as the opaque value; the pointer ceases to be - * valid on exit from the notifier. + * @n: the IOMMUNotifier to be added; the notify callback receives a + * pointer to an #IOMMUTLBEntry as the opaque value; the pointer + * ceases to be valid on exit from the notifier. */ -void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n); +void memory_region_register_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n); /** * memory_region_iommu_replay: replay existing IOMMU translations to @@ -636,7 +667,8 @@ void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n); * @is_write: Whether to treat the replay as a translate "write" * through the iommu */ -void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write); +void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, + bool is_write); /** * memory_region_unregister_iommu_notifier: unregister a notifier for @@ -646,7 +678,8 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write); * needs to be called * @n: the notifier to be removed. 
*/ -void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n); +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n); /** * memory_region_name: get a memory region's name diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 94dfae387a..c17602e533 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -93,7 +93,7 @@ typedef struct VFIOGuestIOMMU { VFIOContainer *container; MemoryRegion *iommu; hwaddr iommu_offset; - Notifier n; + IOMMUNotifier n; QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; } VFIOGuestIOMMU; diff --git a/memory.c b/memory.c index 1a1baf574c..69d9d9ab7f 100644 --- a/memory.c +++ b/memory.c @@ -1413,7 +1413,7 @@ void memory_region_init_iommu(MemoryRegion *mr, memory_region_init(mr, owner, name, size); mr->iommu_ops = ops, mr->terminates = true; /* then re-forwards */ - notifier_list_init(&mr->iommu_notify); + QLIST_INIT(&mr->iommu_notify); } static void memory_region_finalize(Object *obj) @@ -1508,13 +1508,16 @@ bool memory_region_is_logging(MemoryRegion *mr, uint8_t client) return memory_region_get_dirty_log_mask(mr) & (1 << client); } -void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n) +void memory_region_register_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n) { + /* We need to register for at least one bitfield */ + assert(n->notifier_flags != IOMMU_NOTIFIER_NONE); if (mr->iommu_ops->notify_started && - QLIST_EMPTY(&mr->iommu_notify.notifiers)) { + QLIST_EMPTY(&mr->iommu_notify)) { mr->iommu_ops->notify_started(mr); } - notifier_list_add(&mr->iommu_notify, n); + QLIST_INSERT_HEAD(&mr->iommu_notify, n, node); } uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr) @@ -1526,7 +1529,8 @@ uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr) return TARGET_PAGE_SIZE; } -void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write) +void memory_region_iommu_replay(MemoryRegion *mr, IOMMUNotifier *n, + bool is_write) { hwaddr addr, granularity; IOMMUTLBEntry iotlb; @@ -1547,11 +1551,12 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write) } } -void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n) +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, + IOMMUNotifier *n) { - notifier_remove(n); + QLIST_REMOVE(n, node); if (mr->iommu_ops->notify_stopped && - QLIST_EMPTY(&mr->iommu_notify.notifiers)) { + QLIST_EMPTY(&mr->iommu_notify)) { mr->iommu_ops->notify_stopped(mr); } } @@ -1559,8 +1564,22 @@ void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n) void memory_region_notify_iommu(MemoryRegion *mr, IOMMUTLBEntry entry) { + IOMMUNotifier *iommu_notifier; + IOMMUNotifierFlag request_flags; + assert(memory_region_is_iommu(mr)); - notifier_list_notify(&mr->iommu_notify, &entry); + + if (entry.perm & IOMMU_RW) { + request_flags = IOMMU_NOTIFIER_MAP; + } else { + request_flags = IOMMU_NOTIFIER_UNMAP; + } + + QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) { + if (iommu_notifier->notifier_flags & request_flags) { + iommu_notifier->notify(iommu_notifier, &entry); + } + } } void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) -- cgit v1.2.3-55-g7522 From 5bf3d319030b1e95116ba49b31339f2bdd1d3b2a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 23 Sep 2016 13:02:27 +0800 Subject: memory: introduce IOMMUOps.notify_flag_changed The new interface can be used to replace the old notify_started() and notify_stopped(). 
Meanwhile, it provides explicit flags so that each IOMMU knows what kinds of notifications it has been asked to deliver. Acked-by: David Gibson Signed-off-by: Peter Xu Message-Id: <1474606948-14391-3-git-send-email-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- hw/i386/amd_iommu.c | 6 ++++-- hw/i386/intel_iommu.c | 6 ++++-- hw/ppc/spapr_iommu.c | 18 ++++++++++-------- include/exec/memory.h | 9 +++++---- memory.c | 29 +++++++++++++++++++++-------- 5 files changed, 44 insertions(+), 24 deletions(-)
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index a91a1798cb..a868539fb0 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -1066,7 +1066,9 @@ static const MemoryRegionOps mmio_mem_ops = { } }; -static void amdvi_iommu_notify_started(MemoryRegion *iommu) +static void amdvi_iommu_notify_flag_changed(MemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new) { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); @@ -1080,7 +1082,7 @@ static void amdvi_init(AMDVIState *s) amdvi_iotlb_reset(s); s->iommu_ops.translate = amdvi_translate; - s->iommu_ops.notify_started = amdvi_iommu_notify_started; + s->iommu_ops.notify_flag_changed = amdvi_iommu_notify_flag_changed; s->devtab_len = 0; s->cmdbuf_len = 0; s->cmdbuf_head = 0;
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index d6e02c821a..b6cc38c73d 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1974,7 +1974,9 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, return ret; } -static void vtd_iommu_notify_started(MemoryRegion *iommu) +static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new) { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); @@ -2348,7 +2350,7 @@ static void vtd_init(IntelIOMMUState *s) memset(s->womask, 0, DMAR_REG_SIZE); s->iommu_ops.translate = vtd_iommu_translate; - s->iommu_ops.notify_started = vtd_iommu_notify_started; + s->iommu_ops.notify_flag_changed = vtd_iommu_notify_flag_changed; s->root = 0; s->root_extended = false; s->dmar_enabled = false;
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index f20b0b884f..ae30bbe30f 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -156,14 +156,17 @@ static uint64_t spapr_tce_get_min_page_size(MemoryRegion *iommu) return 1ULL << tcet->page_shift; } -static void spapr_tce_notify_started(MemoryRegion *iommu) +static void spapr_tce_notify_flag_changed(MemoryRegion *iommu, + IOMMUNotifierFlag old, + IOMMUNotifierFlag new) { - spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), true); -} + struct sPAPRTCETable *tbl = container_of(iommu, sPAPRTCETable, iommu); -static void spapr_tce_notify_stopped(MemoryRegion *iommu) -{ - spapr_tce_set_need_vfio(container_of(iommu, sPAPRTCETable, iommu), false); + if (old == IOMMU_NOTIFIER_NONE && new != IOMMU_NOTIFIER_NONE) { + spapr_tce_set_need_vfio(tbl, true); + } else if (old != IOMMU_NOTIFIER_NONE && new == IOMMU_NOTIFIER_NONE) { + spapr_tce_set_need_vfio(tbl, false); + } } static int spapr_tce_table_post_load(void *opaque, int version_id) @@ -246,8 +249,7 @@ static const VMStateDescription vmstate_spapr_tce_table = { static MemoryRegionIOMMUOps spapr_iommu_ops = { .translate = spapr_tce_translate_iommu, .get_min_page_size = spapr_tce_get_min_page_size, - .notify_started = spapr_tce_notify_started, - .notify_stopped = spapr_tce_notify_stopped, + .notify_flag_changed = spapr_tce_notify_flag_changed, }; static int
spapr_tce_table_realize(DeviceState *dev)
diff --git a/include/exec/memory.h b/include/exec/memory.h index 14cda673f9..a3f988b640 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -174,10 +174,10 @@ struct MemoryRegionIOMMUOps { IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool is_write); /* Returns minimum supported page size */ uint64_t (*get_min_page_size)(MemoryRegion *iommu); - /* Called when the first notifier is set */ - void (*notify_started)(MemoryRegion *iommu); - /* Called when the last notifier is removed */ - void (*notify_stopped)(MemoryRegion *iommu); + /* Called when IOMMU Notifier flag changed */ + void (*notify_flag_changed)(MemoryRegion *iommu, + IOMMUNotifierFlag old_flags, + IOMMUNotifierFlag new_flags); }; typedef struct CoalescedMemoryRange CoalescedMemoryRange; @@ -223,6 +223,7 @@ struct MemoryRegion { unsigned ioeventfd_nb; MemoryRegionIoeventfd *ioeventfds; QLIST_HEAD(, IOMMUNotifier) iommu_notify; + IOMMUNotifierFlag iommu_notify_flags; }; /**
diff --git a/memory.c b/memory.c index 69d9d9ab7f..27a3f2fca2 100644 --- a/memory.c +++ b/memory.c @@ -1414,6 +1414,7 @@ void memory_region_init_iommu(MemoryRegion *mr, mr->iommu_ops = ops, mr->terminates = true; /* then re-forwards */ QLIST_INIT(&mr->iommu_notify); + mr->iommu_notify_flags = IOMMU_NOTIFIER_NONE; } static void memory_region_finalize(Object *obj) @@ -1508,16 +1509,31 @@ bool memory_region_is_logging(MemoryRegion *mr, uint8_t client) return memory_region_get_dirty_log_mask(mr) & (1 << client); } +static void memory_region_update_iommu_notify_flags(MemoryRegion *mr) +{ + IOMMUNotifierFlag flags = IOMMU_NOTIFIER_NONE; + IOMMUNotifier *iommu_notifier; + + QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) { + flags |= iommu_notifier->notifier_flags; + } + + if (flags != mr->iommu_notify_flags && + mr->iommu_ops->notify_flag_changed) { + mr->iommu_ops->notify_flag_changed(mr, mr->iommu_notify_flags, + flags); + } + + mr->iommu_notify_flags = flags; +} + void memory_region_register_iommu_notifier(MemoryRegion *mr, IOMMUNotifier *n) { /* We need to register for at least one bitfield */ assert(n->notifier_flags != IOMMU_NOTIFIER_NONE); - if (mr->iommu_ops->notify_started && - QLIST_EMPTY(&mr->iommu_notify)) { - mr->iommu_ops->notify_started(mr); - } QLIST_INSERT_HEAD(&mr->iommu_notify, n, node); + memory_region_update_iommu_notify_flags(mr); } uint64_t memory_region_iommu_get_min_page_size(MemoryRegion *mr) @@ -1555,10 +1571,7 @@ void memory_region_unregister_iommu_notifier(MemoryRegion *mr, IOMMUNotifier *n) { QLIST_REMOVE(n, node); - if (mr->iommu_ops->notify_stopped && - QLIST_EMPTY(&mr->iommu_notify)) { - mr->iommu_ops->notify_stopped(mr); - } + memory_region_update_iommu_notify_flags(mr); } void memory_region_notify_iommu(MemoryRegion *mr, -- cgit v1.2.3-55-g7522
From a3276f786c84d3410be5bc3a4b3e5036745e5a90 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 23 Sep 2016 13:02:28 +0800 Subject: intel_iommu, amd_iommu: allow UNMAP notifiers
x86 vIOMMUs still lack a complete IOMMU notifier mechanism. Before that is achieved, let's open the door for vhost DMAR support, which only requires cache invalidations (UNMAP operations). Meanwhile, convert hw_error() to error_report() and exit(1), to make the error messages cleaner and more obvious (no CPU registers will be dumped).
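For illustration, an UNMAP-only consumer would hook into the notifier API introduced earlier in this series roughly as follows. This is only a sketch: the vhost_* names are hypothetical, while the IOMMUNotifier fields and the registration call are the real ones.

    static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
    {
        /* Only UNMAP was requested, so iotlb->perm is IOMMU_NONE here;
         * drop whatever cached translation covers iotlb->iova. */
    }

    static IOMMUNotifier vhost_iommu_notifier = {
        .notify         = vhost_iommu_unmap_notify,
        .notifier_flags = IOMMU_NOTIFIER_UNMAP,
    };

    static void vhost_iommu_attach(MemoryRegion *iommu_mr)
    {
        /* Accepted by the vIOMMUs below, since no MAP bit is set */
        memory_region_register_iommu_notifier(iommu_mr, &vhost_iommu_notifier);
    }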
Reviewed-by: David Gibson Signed-off-by: Peter Xu Message-Id: <1474606948-14391-4-git-send-email-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- hw/i386/amd_iommu.c | 10 +++++++--- hw/i386/intel_iommu.c | 12 ++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index a868539fb0..023de526f6 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -21,6 +21,7 @@ */ #include "qemu/osdep.h" #include "hw/i386/amd_iommu.h" +#include "qemu/error-report.h" #include "trace.h" /* used AMD-Vi MMIO registers */ @@ -1072,9 +1073,12 @@ static void amdvi_iommu_notify_flag_changed(MemoryRegion *iommu, { AMDVIAddressSpace *as = container_of(iommu, AMDVIAddressSpace, iommu); - hw_error("device %02x.%02x.%x requires iommu notifier which is not " - "currently supported", as->bus_num, PCI_SLOT(as->devfn), - PCI_FUNC(as->devfn)); + if (new & IOMMU_NOTIFIER_MAP) { + error_report("device %02x.%02x.%x requires iommu notifier which is not " + "currently supported", as->bus_num, PCI_SLOT(as->devfn), + PCI_FUNC(as->devfn)); + exit(1); + } } static void amdvi_init(AMDVIState *s) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index b6cc38c73d..9f4e64af1a 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -1980,10 +1980,14 @@ static void vtd_iommu_notify_flag_changed(MemoryRegion *iommu, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); - hw_error("Device at bus %s addr %02x.%d requires iommu notifier which " - "is currently not supported by intel-iommu emulation", - vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), - PCI_FUNC(vtd_as->devfn)); + if (new & IOMMU_NOTIFIER_MAP) { + error_report("Device at bus %s addr %02x.%d requires iommu " + "notifier which is currently not supported by " + "intel-iommu emulation", + vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); + exit(1); + } } static const VMStateDescription vtd_vmstate = { -- cgit v1.2.3-55-g7522 From 048a2e8869cb7e26013e40d860c9ebdf8e28c2ac Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 23 Sep 2016 13:33:15 +0800 Subject: x86: ioapic: boost default version to 0x20 It's 2.8 now, and maybe it's time to switch IOAPIC default version to 0x20. 
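Older machine types are unaffected: the compat entry added below pins their "version" property back to 0x11. The old value can also be forced by hand on a newer machine type through a global property, along the lines of (illustrative invocation, not part of the patch):

    qemu-system-x86_64 -global ioapic.version=0x11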
Signed-off-by: Peter Xu Message-Id: <1474608795-23058-1-git-send-email-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- hw/intc/ioapic.c | 2 +- include/hw/compat.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c index 31791b0986..fd9208fde0 100644 --- a/hw/intc/ioapic.c +++ b/hw/intc/ioapic.c @@ -416,7 +416,7 @@ static void ioapic_realize(DeviceState *dev, Error **errp) } static Property ioapic_properties[] = { - DEFINE_PROP_UINT8("version", IOAPICCommonState, version, 0x11), + DEFINE_PROP_UINT8("version", IOAPICCommonState, version, 0x20), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/compat.h b/include/hw/compat.h index a1d6694492..46412b229a 100644 --- a/include/hw/compat.h +++ b/include/hw/compat.h @@ -6,6 +6,10 @@ .driver = "virtio-pci",\ .property = "page-per-vq",\ .value = "on",\ + },{\ + .driver = "ioapic",\ + .property = "version",\ + .value = "0x11",\ }, #define HW_COMPAT_2_6 \ -- cgit v1.2.3-55-g7522 From 63ae8b942d8e4095179bbf1a8946348bc71b7973 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 21 Sep 2016 18:49:07 +0200 Subject: checkpatch: downgrade "architecture specific defines should be avoided" Signed-off-by: Paolo Bonzini --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index dde3f5f9d9..3afa19a766 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2407,7 +2407,7 @@ sub process { # we have e.g. CONFIG_LINUX and CONFIG_WIN32 for common cases # where they might be necessary. if ($line =~ m@^.\s*\#\s*if.*\b__@) { - ERROR("architecture specific defines should be avoided\n" . $herecurr); + WARN("architecture specific defines should be avoided\n" . $herecurr); } # Check that the storage class is at the beginning of a declaration -- cgit v1.2.3-55-g7522 From cc9d8a3b2c41c22fb09f90f3085e6036c199c3ca Mon Sep 17 00:00:00 2001 From: Felipe Franciosi Date: Fri, 23 Sep 2016 16:02:51 +0100 Subject: compiler: Swap 'public domain' header for license As discussed on the list [1], having a comment stating that this file is "public domain" is arguably wrong and not legally binding. This patch replaces that comment with a clear GPLv2+ license as proposed in [2]. [1] http://lists.nongnu.org/archive/html/qemu-devel/2016-09/msg06151.html [2] http://lists.nongnu.org/archive/html/qemu-devel/2016-09/msg06217.html Worth noting, compiler.h was originally created on 5c026320 by splitting qemu-common.h. At the time, qemu-common.h was already GPLv2+. Signed-off-by: Felipe Franciosi Message-Id: <1474642971-11866-1-git-send-email-felipe@nutanix.com> Signed-off-by: Paolo Bonzini --- include/qemu/compiler.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h index 338d3a65b3..157698bfa9 100644 --- a/include/qemu/compiler.h +++ b/include/qemu/compiler.h @@ -1,4 +1,8 @@ -/* public domain */ +/* compiler.h: macros to abstract away compiler specifics + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ #ifndef COMPILER_H #define COMPILER_H -- cgit v1.2.3-55-g7522 From 9c1f8f4493e8355d0e48f7d1eebdf86893ba082d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 22 Sep 2016 16:08:31 +0200 Subject: migration: sync all address spaces Migrating a VM during reboot sometimes results in differences between the source and destination in the SMRAM area. 
This is because migration_bitmap_sync() only fetches from KVM the dirty log of address_space_memory. SMRAM memory slots are ignored and the modifications to SMRAM are not sent to the destination. Reported-by: He Rongguang Reviewed-by: He Rongguang Signed-off-by: Paolo Bonzini --- include/exec/memory.h | 7 +++---- memory.c | 46 +++++++++++++++++++++++++++++++++------------- migration/ram.c | 2 +- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index a3f988b640..10d7eacc40 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1188,12 +1188,11 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr, hwaddr addr, uint64_t size); /** - * address_space_sync_dirty_bitmap: synchronize the dirty log for all memory + * memory_global_dirty_log_sync: synchronize the dirty log for all memory * - * Synchronizes the dirty page log for an entire address space. - * @as: the address space that contains the memory being synchronized + * Synchronizes the dirty page log for all address spaces. */ -void address_space_sync_dirty_bitmap(AddressSpace *as); +void memory_global_dirty_log_sync(void); /** * memory_region_transaction_begin: Start a transaction. diff --git a/memory.c b/memory.c index 27a3f2fca2..58f92693e2 100644 --- a/memory.c +++ b/memory.c @@ -158,14 +158,10 @@ static bool memory_listener_match(MemoryListener *listener, /* No need to ref/unref .mr, the FlatRange keeps it alive. */ #define MEMORY_LISTENER_UPDATE_REGION(fr, as, dir, callback, _args...) \ - MEMORY_LISTENER_CALL(callback, dir, (&(MemoryRegionSection) { \ - .mr = (fr)->mr, \ - .address_space = (as), \ - .offset_within_region = (fr)->offset_in_region, \ - .size = (fr)->addr.size, \ - .offset_within_address_space = int128_get64((fr)->addr.start), \ - .readonly = (fr)->readonly, \ - }), ##_args) + do { \ + MemoryRegionSection mrs = section_from_flat_range(fr, as); \ + MEMORY_LISTENER_CALL(callback, dir, &mrs, ##_args); \ + } while(0) struct CoalescedMemoryRange { AddrRange addr; @@ -245,6 +241,19 @@ typedef struct AddressSpaceOps AddressSpaceOps; #define FOR_EACH_FLAT_RANGE(var, view) \ for (var = (view)->ranges; var < (view)->ranges + (view)->nr; ++var) +static inline MemoryRegionSection +section_from_flat_range(FlatRange *fr, AddressSpace *as) +{ + return (MemoryRegionSection) { + .mr = fr->mr, + .address_space = as, + .offset_within_region = fr->offset_in_region, + .size = fr->addr.size, + .offset_within_address_space = int128_get64(fr->addr.start), + .readonly = fr->readonly, + }; +} + static bool flatrange_equal(FlatRange *a, FlatRange *b) { return a->mr == b->mr @@ -2156,16 +2165,27 @@ bool memory_region_present(MemoryRegion *container, hwaddr addr) return mr && mr != container; } -void address_space_sync_dirty_bitmap(AddressSpace *as) +void memory_global_dirty_log_sync(void) { + MemoryListener *listener; + AddressSpace *as; FlatView *view; FlatRange *fr; - view = address_space_get_flatview(as); - FOR_EACH_FLAT_RANGE(fr, view) { - MEMORY_LISTENER_UPDATE_REGION(fr, as, Forward, log_sync); + QTAILQ_FOREACH(listener, &memory_listeners, link) { + if (!listener->log_sync) { + continue; + } + /* Global listeners are being phased out. 
*/ + assert(listener->address_space_filter); + as = listener->address_space_filter; + view = address_space_get_flatview(as); + FOR_EACH_FLAT_RANGE(fr, view) { + MemoryRegionSection mrs = section_from_flat_range(fr, as); + listener->log_sync(listener, &mrs); + } + flatview_unref(view); } - flatview_unref(view); } void memory_global_dirty_log_start(void) diff --git a/migration/ram.c b/migration/ram.c index a6e1c63a08..c8ec9f268f 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -626,7 +626,7 @@ static void migration_bitmap_sync(void) } trace_migration_bitmap_sync_start(); - address_space_sync_dirty_bitmap(&address_space_memory); + memory_global_dirty_log_sync(); qemu_mutex_lock(&migration_bitmap_mutex); rcu_read_lock(); -- cgit v1.2.3-55-g7522 From 1f04b992cf643cd9eb98394d1960b1a5bdfa4b42 Mon Sep 17 00:00:00 2001 From: Marc-André Lureau Date: Mon, 26 Sep 2016 00:57:47 +0400 Subject: build-sys: remove unused GLIB_CFLAGS Message-Id: <20160925205748.6280-1-marcandre.lureau@redhat.com> Signed-off-by: Paolo Bonzini --- configure | 1 - 1 file changed, 1 deletion(-) diff --git a/configure b/configure index 8fa62ade57..c83160025e 100755 --- a/configure +++ b/configure @@ -5140,7 +5140,6 @@ fi if test "$glib_subprocess" = "yes" ; then echo "CONFIG_HAS_GLIB_SUBPROCESS_TESTS=y" >> $config_host_mak fi -echo "GLIB_CFLAGS=$glib_cflags" >> $config_host_mak if test "$gtk" = "yes" ; then echo "CONFIG_GTK=y" >> $config_host_mak echo "CONFIG_GTKABI=$gtkabi" >> $config_host_mak -- cgit v1.2.3-55-g7522 From 4a0588996a5848ce9550188d0f60642636815059 Mon Sep 17 00:00:00 2001 From: Marc-André Lureau Date: Mon, 26 Sep 2016 00:57:48 +0400 Subject: build-sys: put glib_cflags in QEMU_CFLAGS This way, overriding CFLAGS on make command line keeps glib-cflags and doesn't break the build. Signed-off-by: Marc-André Lureau Message-Id: <20160925205748.6280-2-marcandre.lureau@redhat.com> Signed-off-by: Paolo Bonzini --- configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index c83160025e..5412d4f0b7 100755 --- a/configure +++ b/configure @@ -2933,7 +2933,7 @@ for i in $glib_modules; do if $pkg_config --atleast-version=$glib_req_ver $i; then glib_cflags=$($pkg_config --cflags $i) glib_libs=$($pkg_config --libs $i) - CFLAGS="$glib_cflags $CFLAGS" + QEMU_CFLAGS="$glib_cflags $QEMU_CFLAGS" LIBS="$glib_libs $LIBS" libs_qga="$glib_libs $libs_qga" else -- cgit v1.2.3-55-g7522 From e0eeb4a21a3ca4b296220ce4449d8acef9de9049 Mon Sep 17 00:00:00 2001 From: Alex Bennée Date: Tue, 2 Aug 2016 18:27:33 +0100 Subject: cpus: pass CPUState to run_on_cpu helpers CPUState is a fairly common pointer to pass to these helpers. This means if you need other arguments for the async_run_on_cpu case you end up having to do a g_malloc to stuff additional data into the routine. For the current users this isn't a massive deal but for MTTCG this gets cumbersome when the only other parameter is often an address. This adds the typedef run_on_cpu_func for helper functions which has an explicit CPUState * passed as the first parameter. All the users of run_on_cpu and async_run_on_cpu have had their helpers updated to use CPUState where available. 
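As a sketch of the conversion pattern (do_sync_old/do_sync_new are made-up names; the real instances are in the diff below):

    /* Before: the target CPU had to ride along in the opaque pointer. */
    static void do_sync_old(void *data)
    {
        CPUState *cpu = data;

        cpu_synchronize_state(cpu);
    }
    /* caller: run_on_cpu(cpu, do_sync_old, cpu); */

    /* After: the CPU arrives as the explicit first argument, so the
     * data slot is free for a real payload (or simply NULL). */
    static void do_sync_new(CPUState *cpu, void *data)
    {
        cpu_synchronize_state(cpu);
    }
    /* caller: run_on_cpu(cpu, do_sync_new, NULL); */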
Signed-off-by: Alex Bennée [Sergey Fedorov: - eliminate more CPUState in user data; - remove unnecessary user data passing; - fix target-s390x/kvm.c and target-s390x/misc_helper.c] Signed-off-by: Sergey Fedorov Acked-by: David Gibson (ppc parts) Reviewed-by: Christian Borntraeger (s390 parts) Signed-off-by: Alex Bennée Message-Id: <1470158864-17651-3-git-send-email-alex.bennee@linaro.org> Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- cpus.c | 15 ++++--- hw/i386/kvm/apic.c | 5 +-- hw/i386/kvmvapic.c | 6 +-- hw/ppc/ppce500_spin.c | 31 +++++---------- hw/ppc/spapr.c | 6 +-- hw/ppc/spapr_hcall.c | 17 ++++---- include/qom/cpu.h | 8 ++-- kvm-all.c | 21 ++++------ target-i386/helper.c | 19 ++++----- target-i386/kvm.c | 6 +-- target-s390x/cpu.c | 4 +- target-s390x/cpu.h | 7 +--- target-s390x/kvm.c | 98 +++++++++++++++++++++++----------------------- target-s390x/misc_helper.c | 4 +- 14 files changed, 109 insertions(+), 138 deletions(-) diff --git a/cpus.c b/cpus.c index e39ccb7f30..1a2a9b0334 100644 --- a/cpus.c +++ b/cpus.c @@ -557,9 +557,8 @@ static const VMStateDescription vmstate_timers = { } }; -static void cpu_throttle_thread(void *opaque) +static void cpu_throttle_thread(CPUState *cpu, void *opaque) { - CPUState *cpu = opaque; double pct; double throttle_ratio; long sleeptime_ns; @@ -589,7 +588,7 @@ static void cpu_throttle_timer_tick(void *opaque) } CPU_FOREACH(cpu) { if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) { - async_run_on_cpu(cpu, cpu_throttle_thread, cpu); + async_run_on_cpu(cpu, cpu_throttle_thread, NULL); } } @@ -917,12 +916,12 @@ void qemu_init_cpu_loop(void) qemu_thread_get_self(&io_thread); } -void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data) +void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) { struct qemu_work_item wi; if (qemu_cpu_is_self(cpu)) { - func(data); + func(cpu, data); return; } @@ -950,12 +949,12 @@ void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data) } } -void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data) +void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) { struct qemu_work_item *wi; if (qemu_cpu_is_self(cpu)) { - func(data); + func(cpu, data); return; } @@ -1006,7 +1005,7 @@ static void flush_queued_work(CPUState *cpu) cpu->queued_work_last = NULL; } qemu_mutex_unlock(&cpu->work_mutex); - wi->func(wi->data); + wi->func(cpu, wi->data); qemu_mutex_lock(&cpu->work_mutex); if (wi->free) { g_free(wi); diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c index f57fed1cb0..c016e63fc2 100644 --- a/hw/i386/kvm/apic.c +++ b/hw/i386/kvm/apic.c @@ -125,7 +125,7 @@ static void kvm_apic_vapic_base_update(APICCommonState *s) } } -static void kvm_apic_put(void *data) +static void kvm_apic_put(CPUState *cs, void *data) { APICCommonState *s = data; struct kvm_lapic_state kapic; @@ -146,10 +146,9 @@ static void kvm_apic_post_load(APICCommonState *s) run_on_cpu(CPU(s->cpu), kvm_apic_put, s); } -static void do_inject_external_nmi(void *data) +static void do_inject_external_nmi(CPUState *cpu, void *data) { APICCommonState *s = data; - CPUState *cpu = CPU(s->cpu); uint32_t lvt; int ret; diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c index a1cd9b5a29..74a549becf 100644 --- a/hw/i386/kvmvapic.c +++ b/hw/i386/kvmvapic.c @@ -483,7 +483,7 @@ typedef struct VAPICEnableTPRReporting { bool enable; } VAPICEnableTPRReporting; -static void vapic_do_enable_tpr_reporting(void *data) +static void vapic_do_enable_tpr_reporting(CPUState *cpu, void *data) { 
VAPICEnableTPRReporting *info = data; @@ -734,10 +734,10 @@ static void vapic_realize(DeviceState *dev, Error **errp) nb_option_roms++; } -static void do_vapic_enable(void *data) +static void do_vapic_enable(CPUState *cs, void *data) { VAPICROMState *s = data; - X86CPU *cpu = X86_CPU(first_cpu); + X86CPU *cpu = X86_CPU(cs); static const uint8_t enabled = 1; cpu_physical_memory_write(s->vapic_paddr + offsetof(VAPICState, enabled), diff --git a/hw/ppc/ppce500_spin.c b/hw/ppc/ppce500_spin.c index 22c584eb8d..8e16f651ea 100644 --- a/hw/ppc/ppce500_spin.c +++ b/hw/ppc/ppce500_spin.c @@ -54,11 +54,6 @@ typedef struct SpinState { SpinInfo spin[MAX_CPUS]; } SpinState; -typedef struct spin_kick { - PowerPCCPU *cpu; - SpinInfo *spin; -} SpinKick; - static void spin_reset(void *opaque) { SpinState *s = opaque; @@ -89,16 +84,15 @@ static void mmubooke_create_initial_mapping(CPUPPCState *env, env->tlb_dirty = true; } -static void spin_kick(void *data) +static void spin_kick(CPUState *cs, void *data) { - SpinKick *kick = data; - CPUState *cpu = CPU(kick->cpu); - CPUPPCState *env = &kick->cpu->env; - SpinInfo *curspin = kick->spin; + PowerPCCPU *cpu = POWERPC_CPU(cs); + CPUPPCState *env = &cpu->env; + SpinInfo *curspin = data; hwaddr map_size = 64 * 1024 * 1024; hwaddr map_start; - cpu_synchronize_state(cpu); + cpu_synchronize_state(cs); stl_p(&curspin->pir, env->spr[SPR_BOOKE_PIR]); env->nip = ldq_p(&curspin->addr) & (map_size - 1); env->gpr[3] = ldq_p(&curspin->r3); @@ -112,10 +106,10 @@ static void spin_kick(void *data) map_start = ldq_p(&curspin->addr) & ~(map_size - 1); mmubooke_create_initial_mapping(env, 0, map_start, map_size); - cpu->halted = 0; - cpu->exception_index = -1; - cpu->stopped = false; - qemu_cpu_kick(cpu); + cs->halted = 0; + cs->exception_index = -1; + cs->stopped = false; + qemu_cpu_kick(cs); } static void spin_write(void *opaque, hwaddr addr, uint64_t value, @@ -153,12 +147,7 @@ static void spin_write(void *opaque, hwaddr addr, uint64_t value, if (!(ldq_p(&curspin->addr) & 1)) { /* run CPU */ - SpinKick kick = { - .cpu = POWERPC_CPU(cpu), - .spin = curspin, - }; - - run_on_cpu(cpu, spin_kick, &kick); + run_on_cpu(cpu, spin_kick, curspin); } } diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 9b506d5d3a..aa067aefdb 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2134,10 +2134,8 @@ static void spapr_machine_finalizefn(Object *obj) g_free(spapr->kvm_type); } -static void ppc_cpu_do_nmi_on_cpu(void *arg) +static void ppc_cpu_do_nmi_on_cpu(CPUState *cs, void *arg) { - CPUState *cs = arg; - cpu_synchronize_state(cs); ppc_cpu_do_system_reset(cs); } @@ -2147,7 +2145,7 @@ static void spapr_nmi(NMIState *n, int cpu_index, Error **errp) CPUState *cs; CPU_FOREACH(cs) { - async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, cs); + async_run_on_cpu(cs, ppc_cpu_do_nmi_on_cpu, NULL); } } diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 290a7122d4..c5e7e8c995 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -13,19 +13,18 @@ #include "kvm_ppc.h" struct SPRSyncState { - CPUState *cs; int spr; target_ulong value; target_ulong mask; }; -static void do_spr_sync(void *arg) +static void do_spr_sync(CPUState *cs, void *arg) { struct SPRSyncState *s = arg; - PowerPCCPU *cpu = POWERPC_CPU(s->cs); + PowerPCCPU *cpu = POWERPC_CPU(cs); CPUPPCState *env = &cpu->env; - cpu_synchronize_state(s->cs); + cpu_synchronize_state(cs); env->spr[s->spr] &= ~s->mask; env->spr[s->spr] |= s->value; } @@ -34,7 +33,6 @@ static void set_spr(CPUState *cs, int spr, target_ulong value, target_ulong mask) 
{ struct SPRSyncState s = { - .cs = cs, .spr = spr, .value = value, .mask = mask @@ -909,17 +907,17 @@ static target_ulong cas_get_option_vector(int vector, target_ulong table) } typedef struct { - PowerPCCPU *cpu; uint32_t cpu_version; Error *err; } SetCompatState; -static void do_set_compat(void *arg) +static void do_set_compat(CPUState *cs, void *arg) { + PowerPCCPU *cpu = POWERPC_CPU(cs); SetCompatState *s = arg; - cpu_synchronize_state(CPU(s->cpu)); - ppc_set_compat(s->cpu, s->cpu_version, &s->err); + cpu_synchronize_state(cs); + ppc_set_compat(cpu, s->cpu_version, &s->err); } #define get_compat_level(cpuver) ( \ @@ -1015,7 +1013,6 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu_, if (old_cpu_version != cpu_version) { CPU_FOREACH(cs) { SetCompatState s = { - .cpu = POWERPC_CPU(cs), .cpu_version = cpu_version, .err = NULL, }; diff --git a/include/qom/cpu.h b/include/qom/cpu.h index ce0c406f27..4aa9e61c5d 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -232,9 +232,11 @@ struct kvm_run; #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS) /* work queue */ +typedef void (*run_on_cpu_func)(CPUState *cpu, void *data); + struct qemu_work_item { struct qemu_work_item *next; - void (*func)(void *data); + run_on_cpu_func func; void *data; int done; bool free; @@ -623,7 +625,7 @@ bool cpu_is_stopped(CPUState *cpu); * * Schedules the function @func for execution on the vCPU @cpu. */ -void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data); +void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data); /** * async_run_on_cpu: @@ -633,7 +635,7 @@ void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data); * * Schedules the function @func for execution on the vCPU @cpu asynchronously. */ -void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data); +void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data); /** * qemu_get_cpu: diff --git a/kvm-all.c b/kvm-all.c index 8a4382eed8..fc2898a951 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1847,10 +1847,8 @@ void kvm_flush_coalesced_mmio_buffer(void) s->coalesced_flush_in_progress = false; } -static void do_kvm_cpu_synchronize_state(void *arg) +static void do_kvm_cpu_synchronize_state(CPUState *cpu, void *arg) { - CPUState *cpu = arg; - if (!cpu->kvm_vcpu_dirty) { kvm_arch_get_registers(cpu); cpu->kvm_vcpu_dirty = true; @@ -1860,34 +1858,30 @@ static void do_kvm_cpu_synchronize_state(void *arg) void kvm_cpu_synchronize_state(CPUState *cpu) { if (!cpu->kvm_vcpu_dirty) { - run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu); + run_on_cpu(cpu, do_kvm_cpu_synchronize_state, NULL); } } -static void do_kvm_cpu_synchronize_post_reset(void *arg) +static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, void *arg) { - CPUState *cpu = arg; - kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); cpu->kvm_vcpu_dirty = false; } void kvm_cpu_synchronize_post_reset(CPUState *cpu) { - run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, cpu); + run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, NULL); } -static void do_kvm_cpu_synchronize_post_init(void *arg) +static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, void *arg) { - CPUState *cpu = arg; - kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); cpu->kvm_vcpu_dirty = false; } void kvm_cpu_synchronize_post_init(CPUState *cpu) { - run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, cpu); + run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, NULL); } int kvm_cpu_exec(CPUState *cpu) @@ -2216,7 +2210,7 @@ struct kvm_set_guest_debug_data { int 
err; }; -static void kvm_invoke_set_guest_debug(void *data) +static void kvm_invoke_set_guest_debug(CPUState *unused_cpu, void *data) { struct kvm_set_guest_debug_data *dbg_data = data; @@ -2234,7 +2228,6 @@ int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; } kvm_arch_update_guest_debug(cpu, &data.dbg); - data.cpu = cpu; run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data); return data.err; diff --git a/target-i386/helper.c b/target-i386/helper.c index 1c250b8245..9bc961bff3 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -1113,7 +1113,6 @@ out: typedef struct MCEInjectionParams { Monitor *mon; - X86CPU *cpu; int bank; uint64_t status; uint64_t mcg_status; @@ -1122,14 +1121,14 @@ typedef struct MCEInjectionParams { int flags; } MCEInjectionParams; -static void do_inject_x86_mce(void *data) +static void do_inject_x86_mce(CPUState *cs, void *data) { MCEInjectionParams *params = data; - CPUX86State *cenv = ¶ms->cpu->env; - CPUState *cpu = CPU(params->cpu); + X86CPU *cpu = X86_CPU(cs); + CPUX86State *cenv = &cpu->env; uint64_t *banks = cenv->mce_banks + 4 * params->bank; - cpu_synchronize_state(cpu); + cpu_synchronize_state(cs); /* * If there is an MCE exception being processed, ignore this SRAO MCE @@ -1149,7 +1148,7 @@ static void do_inject_x86_mce(void *data) if ((cenv->mcg_cap & MCG_CTL_P) && cenv->mcg_ctl != ~(uint64_t)0) { monitor_printf(params->mon, "CPU %d: Uncorrected error reporting disabled\n", - cpu->cpu_index); + cs->cpu_index); return; } @@ -1161,7 +1160,7 @@ static void do_inject_x86_mce(void *data) monitor_printf(params->mon, "CPU %d: Uncorrected error reporting disabled for" " bank %d\n", - cpu->cpu_index, params->bank); + cs->cpu_index, params->bank); return; } @@ -1170,7 +1169,7 @@ static void do_inject_x86_mce(void *data) monitor_printf(params->mon, "CPU %d: Previous MCE still in progress, raising" " triple fault\n", - cpu->cpu_index); + cs->cpu_index); qemu_log_mask(CPU_LOG_RESET, "Triple fault\n"); qemu_system_reset_request(); return; @@ -1182,7 +1181,7 @@ static void do_inject_x86_mce(void *data) banks[3] = params->misc; cenv->mcg_status = params->mcg_status; banks[1] = params->status; - cpu_interrupt(cpu, CPU_INTERRUPT_MCE); + cpu_interrupt(cs, CPU_INTERRUPT_MCE); } else if (!(banks[1] & MCI_STATUS_VAL) || !(banks[1] & MCI_STATUS_UC)) { if (banks[1] & MCI_STATUS_VAL) { @@ -1204,7 +1203,6 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int bank, CPUX86State *cenv = &cpu->env; MCEInjectionParams params = { .mon = mon, - .cpu = cpu, .bank = bank, .status = status, .mcg_status = mcg_status, @@ -1245,7 +1243,6 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int bank, if (other_cs == cs) { continue; } - params.cpu = X86_CPU(other_cs); run_on_cpu(other_cs, do_inject_x86_mce, ¶ms); } } diff --git a/target-i386/kvm.c b/target-i386/kvm.c index a0e42b2c4e..1955a6b3a4 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -156,10 +156,8 @@ static int kvm_get_tsc(CPUState *cs) return 0; } -static inline void do_kvm_synchronize_tsc(void *arg) +static inline void do_kvm_synchronize_tsc(CPUState *cpu, void *arg) { - CPUState *cpu = arg; - kvm_get_tsc(cpu); } @@ -169,7 +167,7 @@ void kvm_synchronize_all_tsc(void) if (kvm_enabled()) { CPU_FOREACH(cpu) { - run_on_cpu(cpu, do_kvm_synchronize_tsc, cpu); + run_on_cpu(cpu, do_kvm_synchronize_tsc, NULL); } } } diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c index 2f3c8e245d..35ae2cec4b 100644 --- a/target-s390x/cpu.c +++ 
b/target-s390x/cpu.c @@ -164,7 +164,7 @@ static void s390_cpu_machine_reset_cb(void *opaque) { S390CPU *cpu = opaque; - run_on_cpu(CPU(cpu), s390_do_cpu_full_reset, CPU(cpu)); + run_on_cpu(CPU(cpu), s390_do_cpu_full_reset, NULL); } #endif @@ -220,7 +220,7 @@ static void s390_cpu_realizefn(DeviceState *dev, Error **errp) s390_cpu_gdb_init(cs); qemu_init_vcpu(cs); #if !defined(CONFIG_USER_ONLY) - run_on_cpu(cs, s390_do_cpu_full_reset, cs); + run_on_cpu(cs, s390_do_cpu_full_reset, NULL); #else cpu_reset(cs); #endif diff --git a/target-s390x/cpu.h b/target-s390x/cpu.h index 5645e063af..4fb34b598d 100644 --- a/target-s390x/cpu.h +++ b/target-s390x/cpu.h @@ -502,17 +502,14 @@ static inline hwaddr decode_basedisp_s(CPUS390XState *env, uint32_t ipb, #define decode_basedisp_rs decode_basedisp_s /* helper functions for run_on_cpu() */ -static inline void s390_do_cpu_reset(void *arg) +static inline void s390_do_cpu_reset(CPUState *cs, void *arg) { - CPUState *cs = arg; S390CPUClass *scc = S390_CPU_GET_CLASS(cs); scc->cpu_reset(cs); } -static inline void s390_do_cpu_full_reset(void *arg) +static inline void s390_do_cpu_full_reset(CPUState *cs, void *arg) { - CPUState *cs = arg; - cpu_reset(cs); } diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 4b847a3be4..fd929e8351 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -1385,7 +1385,6 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb) } typedef struct SigpInfo { - S390CPU *cpu; uint64_t param; int cc; uint64_t *status_reg; @@ -1398,38 +1397,40 @@ static void set_sigp_status(SigpInfo *si, uint64_t status) si->cc = SIGP_CC_STATUS_STORED; } -static void sigp_start(void *arg) +static void sigp_start(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; - if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { + if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) { si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; return; } - s390_cpu_set_state(CPU_STATE_OPERATING, si->cpu); + s390_cpu_set_state(CPU_STATE_OPERATING, cpu); si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_stop(void *arg) +static void sigp_stop(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; struct kvm_s390_irq irq = { .type = KVM_S390_SIGP_STOP, }; - if (s390_cpu_get_state(si->cpu) != CPU_STATE_OPERATING) { + if (s390_cpu_get_state(cpu) != CPU_STATE_OPERATING) { si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; return; } /* disabled wait - sleeping in user space */ - if (CPU(si->cpu)->halted) { - s390_cpu_set_state(CPU_STATE_STOPPED, si->cpu); + if (cs->halted) { + s390_cpu_set_state(CPU_STATE_STOPPED, cpu); } else { /* execute the stop function */ - si->cpu->env.sigp_order = SIGP_STOP; - kvm_s390_vcpu_interrupt(si->cpu, &irq); + cpu->env.sigp_order = SIGP_STOP; + kvm_s390_vcpu_interrupt(cpu, &irq); } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } @@ -1496,56 +1497,58 @@ static int kvm_s390_store_status(S390CPU *cpu, hwaddr addr, bool store_arch) return 0; } -static void sigp_stop_and_store_status(void *arg) +static void sigp_stop_and_store_status(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; struct kvm_s390_irq irq = { .type = KVM_S390_SIGP_STOP, }; /* disabled wait - sleeping in user space */ - if (s390_cpu_get_state(si->cpu) == CPU_STATE_OPERATING && - CPU(si->cpu)->halted) { - s390_cpu_set_state(CPU_STATE_STOPPED, si->cpu); + if (s390_cpu_get_state(cpu) == CPU_STATE_OPERATING && cs->halted) { + s390_cpu_set_state(CPU_STATE_STOPPED, cpu); } - switch (s390_cpu_get_state(si->cpu)) { + 
switch (s390_cpu_get_state(cpu)) { case CPU_STATE_OPERATING: - si->cpu->env.sigp_order = SIGP_STOP_STORE_STATUS; - kvm_s390_vcpu_interrupt(si->cpu, &irq); + cpu->env.sigp_order = SIGP_STOP_STORE_STATUS; + kvm_s390_vcpu_interrupt(cpu, &irq); /* store will be performed when handling the stop intercept */ break; case CPU_STATE_STOPPED: /* already stopped, just store the status */ - cpu_synchronize_state(CPU(si->cpu)); - kvm_s390_store_status(si->cpu, KVM_S390_STORE_STATUS_DEF_ADDR, true); + cpu_synchronize_state(cs); + kvm_s390_store_status(cpu, KVM_S390_STORE_STATUS_DEF_ADDR, true); break; } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_store_status_at_address(void *arg) +static void sigp_store_status_at_address(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; uint32_t address = si->param & 0x7ffffe00u; /* cpu has to be stopped */ - if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { + if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) { set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); return; } - cpu_synchronize_state(CPU(si->cpu)); + cpu_synchronize_state(cs); - if (kvm_s390_store_status(si->cpu, address, false)) { + if (kvm_s390_store_status(cpu, address, false)) { set_sigp_status(si, SIGP_STAT_INVALID_PARAMETER); return; } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_store_adtl_status(void *arg) +static void sigp_store_adtl_status(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; if (!s390_has_feat(S390_FEAT_VECTOR)) { @@ -1554,7 +1557,7 @@ static void sigp_store_adtl_status(void *arg) } /* cpu has to be stopped */ - if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { + if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) { set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); return; } @@ -1565,31 +1568,32 @@ static void sigp_store_adtl_status(void *arg) return; } - cpu_synchronize_state(CPU(si->cpu)); + cpu_synchronize_state(cs); - if (kvm_s390_store_adtl_status(si->cpu, si->param)) { + if (kvm_s390_store_adtl_status(cpu, si->param)) { set_sigp_status(si, SIGP_STAT_INVALID_PARAMETER); return; } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_restart(void *arg) +static void sigp_restart(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; struct kvm_s390_irq irq = { .type = KVM_S390_RESTART, }; - switch (s390_cpu_get_state(si->cpu)) { + switch (s390_cpu_get_state(cpu)) { case CPU_STATE_STOPPED: /* the restart irq has to be delivered prior to any other pending irq */ - cpu_synchronize_state(CPU(si->cpu)); - do_restart_interrupt(&si->cpu->env); - s390_cpu_set_state(CPU_STATE_OPERATING, si->cpu); + cpu_synchronize_state(cs); + do_restart_interrupt(&cpu->env); + s390_cpu_set_state(CPU_STATE_OPERATING, cpu); break; case CPU_STATE_OPERATING: - kvm_s390_vcpu_interrupt(si->cpu, &irq); + kvm_s390_vcpu_interrupt(cpu, &irq); break; } si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; @@ -1597,20 +1601,18 @@ static void sigp_restart(void *arg) int kvm_s390_cpu_restart(S390CPU *cpu) { - SigpInfo si = { - .cpu = cpu, - }; + SigpInfo si = {}; run_on_cpu(CPU(cpu), sigp_restart, &si); DPRINTF("DONE: KVM cpu restart: %p\n", &cpu->env); return 0; } -static void sigp_initial_cpu_reset(void *arg) +static void sigp_initial_cpu_reset(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); + S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); SigpInfo *si = arg; - CPUState *cs = CPU(si->cpu); - S390CPUClass *scc = S390_CPU_GET_CLASS(si->cpu); cpu_synchronize_state(cs); scc->initial_cpu_reset(cs); @@ -1618,11 
+1620,11 @@ static void sigp_initial_cpu_reset(void *arg) si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_cpu_reset(void *arg) +static void sigp_cpu_reset(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); + S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); SigpInfo *si = arg; - CPUState *cs = CPU(si->cpu); - S390CPUClass *scc = S390_CPU_GET_CLASS(si->cpu); cpu_synchronize_state(cs); scc->cpu_reset(cs); @@ -1630,12 +1632,13 @@ static void sigp_cpu_reset(void *arg) si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } -static void sigp_set_prefix(void *arg) +static void sigp_set_prefix(CPUState *cs, void *arg) { + S390CPU *cpu = S390_CPU(cs); SigpInfo *si = arg; uint32_t addr = si->param & 0x7fffe000u; - cpu_synchronize_state(CPU(si->cpu)); + cpu_synchronize_state(cs); if (!address_space_access_valid(&address_space_memory, addr, sizeof(struct LowCore), false)) { @@ -1644,13 +1647,13 @@ static void sigp_set_prefix(void *arg) } /* cpu has to be stopped */ - if (s390_cpu_get_state(si->cpu) != CPU_STATE_STOPPED) { + if (s390_cpu_get_state(cpu) != CPU_STATE_STOPPED) { set_sigp_status(si, SIGP_STAT_INCORRECT_STATE); return; } - si->cpu->env.psa = addr; - cpu_synchronize_post_init(CPU(si->cpu)); + cpu->env.psa = addr; + cpu_synchronize_post_init(cs); si->cc = SIGP_CC_ORDER_CODE_ACCEPTED; } @@ -1658,7 +1661,6 @@ static int handle_sigp_single_dst(S390CPU *dst_cpu, uint8_t order, uint64_t param, uint64_t *status_reg) { SigpInfo si = { - .cpu = dst_cpu, .param = param, .status_reg = status_reg, }; diff --git a/target-s390x/misc_helper.c b/target-s390x/misc_helper.c index 86da1947b9..4df2ec6c7d 100644 --- a/target-s390x/misc_helper.c +++ b/target-s390x/misc_helper.c @@ -126,7 +126,7 @@ static int modified_clear_reset(S390CPU *cpu) pause_all_vcpus(); cpu_synchronize_all_states(); CPU_FOREACH(t) { - run_on_cpu(t, s390_do_cpu_full_reset, t); + run_on_cpu(t, s390_do_cpu_full_reset, NULL); } s390_cmma_reset(); subsystem_reset(); @@ -145,7 +145,7 @@ static int load_normal_reset(S390CPU *cpu) pause_all_vcpus(); cpu_synchronize_all_states(); CPU_FOREACH(t) { - run_on_cpu(t, s390_do_cpu_reset, t); + run_on_cpu(t, s390_do_cpu_reset, NULL); } s390_cmma_reset(); subsystem_reset(); -- cgit v1.2.3-55-g7522 From fd38b25103ad9e3b51862424074d4eef75975623 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Tue, 2 Aug 2016 18:27:34 +0100 Subject: cpus: Move common code out of {async_, }run_on_cpu() Move the code common between run_on_cpu() and async_run_on_cpu() into a new function queue_work_on_cpu(). 
Signed-off-by: Sergey Fedorov Signed-off-by: Sergey Fedorov Reviewed-by: Alex Bennée Signed-off-by: Alex Bennée Message-Id: <1470158864-17651-4-git-send-email-alex.bennee@linaro.org> Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- cpus.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-)
diff --git a/cpus.c b/cpus.c index 1a2a9b0334..ed7d30a6c1 100644 --- a/cpus.c +++ b/cpus.c @@ -916,6 +916,22 @@ void qemu_init_cpu_loop(void) qemu_thread_get_self(&io_thread); } +static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) +{ + qemu_mutex_lock(&cpu->work_mutex); + if (cpu->queued_work_first == NULL) { + cpu->queued_work_first = wi; + } else { + cpu->queued_work_last->next = wi; + } + cpu->queued_work_last = wi; + wi->next = NULL; + wi->done = false; + qemu_mutex_unlock(&cpu->work_mutex); + + qemu_cpu_kick(cpu); +} + void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) { struct qemu_work_item wi; @@ -929,18 +945,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) wi.data = data; wi.free = false; - qemu_mutex_lock(&cpu->work_mutex); - if (cpu->queued_work_first == NULL) { - cpu->queued_work_first = &wi; - } else { - cpu->queued_work_last->next = &wi; - } - cpu->queued_work_last = &wi; - wi.next = NULL; - wi.done = false; - qemu_mutex_unlock(&cpu->work_mutex); - - qemu_cpu_kick(cpu); + queue_work_on_cpu(cpu, &wi); while (!atomic_mb_read(&wi.done)) { CPUState *self_cpu = current_cpu; @@ -963,18 +968,7 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) wi->data = data; wi->free = true; - qemu_mutex_lock(&cpu->work_mutex); - if (cpu->queued_work_first == NULL) { - cpu->queued_work_first = wi; - } else { - cpu->queued_work_last->next = wi; - } - cpu->queued_work_last = wi; - wi->next = NULL; - wi->done = false; - qemu_mutex_unlock(&cpu->work_mutex); - - qemu_cpu_kick(cpu); + queue_work_on_cpu(cpu, wi); } static void qemu_kvm_destroy_vcpu(CPUState *cpu) -- cgit v1.2.3-55-g7522
From 959f593c0e010cc0ee2e47e7f45e66c0695683b7 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Tue, 2 Aug 2016 18:27:37 +0100 Subject: linux-user: Use QemuMutex and QemuCond
Convert pthread_mutex_t and pthread_cond_t to QemuMutex and QemuCond. This will make it possible to share some locks and condition variables between user and system mode emulation.
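One detail worth spelling out: unlike PTHREAD_MUTEX_INITIALIZER and PTHREAD_COND_INITIALIZER, QemuMutex and QemuCond have no static initializers, which is why an explicit qemu_init_cpu_loop() hook is added. The pattern in miniature (a sketch, not a hunk from this patch; wait_for_exclusive_to_finish is a made-up name):

    static QemuMutex exclusive_lock;
    static QemuCond exclusive_resume;
    static int pending_cpus;

    void qemu_init_cpu_loop(void)
    {
        /* Must run before any thread can touch the lock */
        qemu_mutex_init(&exclusive_lock);
        qemu_cond_init(&exclusive_resume);
    }

    static void wait_for_exclusive_to_finish(void)
    {
        qemu_mutex_lock(&exclusive_lock);
        while (pending_cpus) {
            /* Drops and reacquires the mutex, as pthread_cond_wait() does */
            qemu_cond_wait(&exclusive_resume, &exclusive_lock);
        }
        qemu_mutex_unlock(&exclusive_lock);
    }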
Signed-off-by: Sergey Fedorov Signed-off-by: Sergey Fedorov Reviewed-by: Alex Bennée Signed-off-by: Alex Bennée Message-Id: <1470158864-17651-7-git-send-email-alex.bennee@linaro.org> Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- linux-user/main.c | 55 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/linux-user/main.c b/linux-user/main.c index 8daebe0767..7a056fcfd6 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -111,17 +111,25 @@ int cpu_get_pic_interrupt(CPUX86State *env) We don't require a full sync, only that no cpus are executing guest code. The alternative is to map target atomic ops onto host equivalents, which requires quite a lot of per host/target work. */ -static pthread_mutex_t cpu_list_mutex = PTHREAD_MUTEX_INITIALIZER; -static pthread_mutex_t exclusive_lock = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t exclusive_cond = PTHREAD_COND_INITIALIZER; -static pthread_cond_t exclusive_resume = PTHREAD_COND_INITIALIZER; +static QemuMutex cpu_list_lock; +static QemuMutex exclusive_lock; +static QemuCond exclusive_cond; +static QemuCond exclusive_resume; static int pending_cpus; +void qemu_init_cpu_loop(void) +{ + qemu_mutex_init(&cpu_list_lock); + qemu_mutex_init(&exclusive_lock); + qemu_cond_init(&exclusive_cond); + qemu_cond_init(&exclusive_resume); +} + /* Make sure everything is in a consistent state for calling fork(). */ void fork_start(void) { qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); - pthread_mutex_lock(&exclusive_lock); + qemu_mutex_lock(&exclusive_lock); mmap_fork_start(); } @@ -138,14 +146,14 @@ void fork_end(int child) } } pending_cpus = 0; - pthread_mutex_init(&exclusive_lock, NULL); - pthread_mutex_init(&cpu_list_mutex, NULL); - pthread_cond_init(&exclusive_cond, NULL); - pthread_cond_init(&exclusive_resume, NULL); + qemu_mutex_init(&exclusive_lock); + qemu_mutex_init(&cpu_list_lock); + qemu_cond_init(&exclusive_cond); + qemu_cond_init(&exclusive_resume); qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); gdbserver_fork(thread_cpu); } else { - pthread_mutex_unlock(&exclusive_lock); + qemu_mutex_unlock(&exclusive_lock); qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); } } @@ -155,7 +163,7 @@ void fork_end(int child) static inline void exclusive_idle(void) { while (pending_cpus) { - pthread_cond_wait(&exclusive_resume, &exclusive_lock); + qemu_cond_wait(&exclusive_resume, &exclusive_lock); } } @@ -165,7 +173,7 @@ static inline void start_exclusive(void) { CPUState *other_cpu; - pthread_mutex_lock(&exclusive_lock); + qemu_mutex_lock(&exclusive_lock); exclusive_idle(); pending_cpus = 1; @@ -176,8 +184,8 @@ static inline void start_exclusive(void) cpu_exit(other_cpu); } } - if (pending_cpus > 1) { - pthread_cond_wait(&exclusive_cond, &exclusive_lock); + while (pending_cpus > 1) { + qemu_cond_wait(&exclusive_cond, &exclusive_lock); } } @@ -185,42 +193,42 @@ static inline void start_exclusive(void) static inline void __attribute__((unused)) end_exclusive(void) { pending_cpus = 0; - pthread_cond_broadcast(&exclusive_resume); - pthread_mutex_unlock(&exclusive_lock); + qemu_cond_broadcast(&exclusive_resume); + qemu_mutex_unlock(&exclusive_lock); } /* Wait for exclusive ops to finish, and begin cpu execution. 
*/ static inline void cpu_exec_start(CPUState *cpu) { - pthread_mutex_lock(&exclusive_lock); + qemu_mutex_lock(&exclusive_lock); exclusive_idle(); cpu->running = true; - pthread_mutex_unlock(&exclusive_lock); + qemu_mutex_unlock(&exclusive_lock); } /* Mark cpu as not executing, and release pending exclusive ops. */ static inline void cpu_exec_end(CPUState *cpu) { - pthread_mutex_lock(&exclusive_lock); + qemu_mutex_lock(&exclusive_lock); cpu->running = false; if (pending_cpus > 1) { pending_cpus--; if (pending_cpus == 1) { - pthread_cond_signal(&exclusive_cond); + qemu_cond_signal(&exclusive_cond); } } exclusive_idle(); - pthread_mutex_unlock(&exclusive_lock); + qemu_mutex_unlock(&exclusive_lock); } void cpu_list_lock(void) { - pthread_mutex_lock(&cpu_list_mutex); + qemu_mutex_lock(&cpu_list_lock); } void cpu_list_unlock(void) { - pthread_mutex_unlock(&cpu_list_mutex); + qemu_mutex_unlock(&cpu_list_lock); } @@ -4211,6 +4219,7 @@ int main(int argc, char **argv, char **envp) int ret; int execfd; + qemu_init_cpu_loop(); module_call_init(MODULE_INIT_QOM); if ((envlist = envlist_create()) == NULL) { -- cgit v1.2.3-55-g7522 From 178f94297a23e68183ce08bb841cf5d209208b03 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Tue, 2 Aug 2016 18:27:39 +0100 Subject: linux-user: Add qemu_cpu_is_self() and qemu_cpu_kick() Signed-off-by: Sergey Fedorov Signed-off-by: Sergey Fedorov Reviewed-by: Alex Bennée Signed-off-by: Alex Bennée Message-Id: <1470158864-17651-9-git-send-email-alex.bennee@linaro.org> Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- linux-user/main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/linux-user/main.c b/linux-user/main.c index 7a056fcfd6..6e14010229 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -3777,6 +3777,16 @@ void cpu_loop(CPUTLGState *env) THREAD CPUState *thread_cpu; +bool qemu_cpu_is_self(CPUState *cpu) +{ + return thread_cpu == cpu; +} + +void qemu_cpu_kick(CPUState *cpu) +{ + cpu_exit(cpu); +} + void task_settid(TaskState *ts) { if (ts->ts_tid == 0) { -- cgit v1.2.3-55-g7522 From 267f685b8b20784c97251618b515fcd17b42aad6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 28 Aug 2016 03:45:14 +0200 Subject: cpus-common: move CPU list management to common code Add a mutex for the CPU list to system emulation, as it will be used to manage safe work. Abstract manipulation of the CPU list in new functions cpu_list_add and cpu_list_remove. 
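As an illustrative aside (the wrapper function below is invented; cpu_list_add() and cpu_list_remove() are the helpers this patch introduces), the lifecycle they encapsulate looks like:

    #include "qom/cpu.h"

    static void example_vcpu_lifecycle(CPUState *cpu)
    {
        /* Takes qemu_cpu_list_lock, assigns cpu->cpu_index if it is
         * still UNASSIGNED_CPU_INDEX, and links the CPU into &cpus. */
        cpu_list_add(cpu);

        /* ... the vCPU thread does its work ... */

        /* Takes the lock again, unlinks the CPU from &cpus and
         * resets cpu_index to UNASSIGNED_CPU_INDEX. */
        cpu_list_remove(cpu);
    }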
Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Paolo Bonzini --- Makefile.objs | 2 +- bsd-user/main.c | 9 +---- cpus-common.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++ exec.c | 37 ++------------------- include/exec/cpu-common.h | 5 +++ include/exec/exec-all.h | 11 ------- include/qom/cpu.h | 12 +++++++ linux-user/main.c | 17 +++------- vl.c | 1 + 9 files changed, 109 insertions(+), 68 deletions(-) create mode 100644 cpus-common.c diff --git a/Makefile.objs b/Makefile.objs index 7301544cdd..a8e022452f 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -89,7 +89,7 @@ endif ####################################################################### # Target-independent parts used in system and user emulation -common-obj-y += tcg-runtime.o +common-obj-y += tcg-runtime.o cpus-common.o common-obj-y += hw/ common-obj-y += qom/ common-obj-y += disas/ diff --git a/bsd-user/main.c b/bsd-user/main.c index 0fb08e405d..591c424e6e 100644 --- a/bsd-user/main.c +++ b/bsd-user/main.c @@ -95,14 +95,6 @@ void fork_end(int child) } } -void cpu_list_lock(void) -{ -} - -void cpu_list_unlock(void) -{ -} - #ifdef TARGET_I386 /***********************************************************/ /* CPUX86 core interface */ @@ -748,6 +740,7 @@ int main(int argc, char **argv) if (argc <= 1) usage(); + qemu_init_cpu_list(); module_call_init(MODULE_INIT_QOM); if ((envlist = envlist_create()) == NULL) { diff --git a/cpus-common.c b/cpus-common.c new file mode 100644 index 0000000000..fda3848b7d --- /dev/null +++ b/cpus-common.c @@ -0,0 +1,83 @@ +/* + * CPU thread main loop - common bits for user and system mode emulation + * + * Copyright (c) 2003-2005 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "exec/cpu-common.h" +#include "qom/cpu.h" +#include "sysemu/cpus.h" + +static QemuMutex qemu_cpu_list_lock; + +void qemu_init_cpu_list(void) +{ + qemu_mutex_init(&qemu_cpu_list_lock); +} + +void cpu_list_lock(void) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); +} + +void cpu_list_unlock(void) +{ + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + +static bool cpu_index_auto_assigned; + +static int cpu_get_free_index(void) +{ + CPUState *some_cpu; + int cpu_index = 0; + + cpu_index_auto_assigned = true; + CPU_FOREACH(some_cpu) { + cpu_index++; + } + return cpu_index; +} + +void cpu_list_add(CPUState *cpu) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); + if (cpu->cpu_index == UNASSIGNED_CPU_INDEX) { + cpu->cpu_index = cpu_get_free_index(); + assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX); + } else { + assert(!cpu_index_auto_assigned); + } + QTAILQ_INSERT_TAIL(&cpus, cpu, node); + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + +void cpu_list_remove(CPUState *cpu) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); + if (!QTAILQ_IN_USE(cpu, node)) { + /* there is nothing to undo since cpu_exec_init() hasn't been called */ + qemu_mutex_unlock(&qemu_cpu_list_lock); + return; + } + + assert(!(cpu_index_auto_assigned && cpu != QTAILQ_LAST(&cpus, CPUTailQ))); + + QTAILQ_REMOVE(&cpus, cpu, node); + cpu->cpu_index = UNASSIGNED_CPU_INDEX; + qemu_mutex_unlock(&qemu_cpu_list_lock); +} diff --git a/exec.c b/exec.c index c81d5ab981..c8389f93c3 100644 --- a/exec.c +++ b/exec.c @@ -598,36 +598,11 @@ AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) } #endif -static bool cpu_index_auto_assigned; - -static int cpu_get_free_index(void) -{ - CPUState *some_cpu; - int cpu_index = 0; - - cpu_index_auto_assigned = true; - CPU_FOREACH(some_cpu) { - cpu_index++; - } - return cpu_index; -} - void cpu_exec_exit(CPUState *cpu) { CPUClass *cc = CPU_GET_CLASS(cpu); - cpu_list_lock(); - if (!QTAILQ_IN_USE(cpu, node)) { - /* there is nothing to undo since cpu_exec_init() hasn't been called */ - cpu_list_unlock(); - return; - } - - assert(!(cpu_index_auto_assigned && cpu != QTAILQ_LAST(&cpus, CPUTailQ))); - - QTAILQ_REMOVE(&cpus, cpu, node); - cpu->cpu_index = UNASSIGNED_CPU_INDEX; - cpu_list_unlock(); + cpu_list_remove(cpu); if (cc->vmsd != NULL) { vmstate_unregister(NULL, cc->vmsd, cpu); @@ -663,15 +638,7 @@ void cpu_exec_init(CPUState *cpu, Error **errp) object_ref(OBJECT(cpu->memory)); #endif - cpu_list_lock(); - if (cpu->cpu_index == UNASSIGNED_CPU_INDEX) { - cpu->cpu_index = cpu_get_free_index(); - assert(cpu->cpu_index != UNASSIGNED_CPU_INDEX); - } else { - assert(!cpu_index_auto_assigned); - } - QTAILQ_INSERT_TAIL(&cpus, cpu, node); - cpu_list_unlock(); + cpu_list_add(cpu); #ifndef CONFIG_USER_ONLY if (qdev_get_vmsd(DEVICE(cpu)) == NULL) { diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 952bcfeb4c..869ba41b0c 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -23,6 +23,11 @@ typedef struct CPUListState { FILE *file; } CPUListState; +/* The CPU list lock nests outside tb_lock/tb_unlock. 
*/ +void qemu_init_cpu_list(void); +void cpu_list_lock(void); +void cpu_list_unlock(void); + #if !defined(CONFIG_USER_ONLY) enum device_endian { diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 008e09a3c1..336a57cde6 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -56,17 +56,6 @@ TranslationBlock *tb_gen_code(CPUState *cpu, target_ulong pc, target_ulong cs_base, uint32_t flags, int cflags); -#if defined(CONFIG_USER_ONLY) -void cpu_list_lock(void); -void cpu_list_unlock(void); -#else -static inline void cpu_list_unlock(void) -{ -} -static inline void cpu_list_lock(void) -{ -} -#endif void cpu_exec_init(CPUState *cpu, Error **errp); void QEMU_NORETURN cpu_loop_exit(CPUState *cpu); diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 4aa9e61c5d..ea3233ff5b 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -544,6 +544,18 @@ static inline int cpu_asidx_from_attrs(CPUState *cpu, MemTxAttrs attrs) } #endif +/** + * cpu_list_add: + * @cpu: The CPU to be added to the list of CPUs. + */ +void cpu_list_add(CPUState *cpu); + +/** + * cpu_list_remove: + * @cpu: The CPU to be removed from the list of CPUs. + */ +void cpu_list_remove(CPUState *cpu); + /** * cpu_reset: * @cpu: The CPU whose state is to be reset. diff --git a/linux-user/main.c b/linux-user/main.c index 6e14010229..719f0462e4 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -111,7 +111,6 @@ int cpu_get_pic_interrupt(CPUX86State *env) We don't require a full sync, only that no cpus are executing guest code. The alternative is to map target atomic ops onto host equivalents, which requires quite a lot of per host/target work. */ -static QemuMutex cpu_list_lock; static QemuMutex exclusive_lock; static QemuCond exclusive_cond; static QemuCond exclusive_resume; @@ -119,7 +118,6 @@ static int pending_cpus; void qemu_init_cpu_loop(void) { - qemu_mutex_init(&cpu_list_lock); qemu_mutex_init(&exclusive_lock); qemu_cond_init(&exclusive_cond); qemu_cond_init(&exclusive_resume); @@ -128,6 +126,7 @@ void qemu_init_cpu_loop(void) /* Make sure everything is in a consistent state for calling fork(). 
*/ void fork_start(void) { + cpu_list_lock(); qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); qemu_mutex_lock(&exclusive_lock); mmap_fork_start(); @@ -147,14 +146,15 @@ void fork_end(int child) } pending_cpus = 0; qemu_mutex_init(&exclusive_lock); - qemu_mutex_init(&cpu_list_lock); qemu_cond_init(&exclusive_cond); qemu_cond_init(&exclusive_resume); qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); + qemu_init_cpu_list(); gdbserver_fork(thread_cpu); } else { qemu_mutex_unlock(&exclusive_lock); qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); + cpu_list_unlock(); } } @@ -221,16 +221,6 @@ static inline void cpu_exec_end(CPUState *cpu) qemu_mutex_unlock(&exclusive_lock); } -void cpu_list_lock(void) -{ - qemu_mutex_lock(&cpu_list_lock); -} - -void cpu_list_unlock(void) -{ - qemu_mutex_unlock(&cpu_list_lock); -} - #ifdef TARGET_I386 /***********************************************************/ @@ -4229,6 +4219,7 @@ int main(int argc, char **argv, char **envp) int ret; int execfd; + qemu_init_cpu_list(); qemu_init_cpu_loop(); module_call_init(MODULE_INIT_QOM); diff --git a/vl.c b/vl.c index 215a6f9c7a..eda83fa93d 100644 --- a/vl.c +++ b/vl.c @@ -3017,6 +3017,7 @@ int main(int argc, char **argv, char **envp) Error *err = NULL; bool list_data_dirs = false; + qemu_init_cpu_list(); qemu_init_cpu_loop(); qemu_mutex_lock_iothread(); -- cgit v1.2.3-55-g7522 From d148d90ee83738d45a90dc0b2fb7b1712f164103 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Mon, 29 Aug 2016 09:51:00 +0200 Subject: cpus-common: move CPU work item management to common code Make CPU work core functions common between system and user-mode emulation. User-mode does not use run_on_cpu, so do not implement it. Signed-off-by: Sergey Fedorov Signed-off-by: Sergey Fedorov Reviewed-by: Alex Bennée Signed-off-by: Alex Bennée Message-Id: <1470158864-17651-10-git-send-email-alex.bennee@linaro.org> Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- bsd-user/main.c | 11 +++++-- cpus-common.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ cpus.c | 82 +----------------------------------------------- include/qom/cpu.h | 27 +++++++++++----- linux-user/main.c | 25 +++++++++++++++ 5 files changed, 148 insertions(+), 91 deletions(-) diff --git a/bsd-user/main.c b/bsd-user/main.c index 591c424e6e..6dfa91230f 100644 --- a/bsd-user/main.c +++ b/bsd-user/main.c @@ -68,11 +68,11 @@ int cpu_get_pic_interrupt(CPUX86State *env) #endif /* These are no-ops because we are not threadsafe. 
*/ -static inline void cpu_exec_start(CPUArchState *env) +static inline void cpu_exec_start(CPUState *cpu) { } -static inline void cpu_exec_end(CPUArchState *env) +static inline void cpu_exec_end(CPUState *cpu) { } @@ -164,7 +164,11 @@ void cpu_loop(CPUX86State *env) //target_siginfo_t info; for(;;) { + cpu_exec_start(cs); trapnr = cpu_exec(cs); + cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case 0x80: /* syscall from int $0x80 */ @@ -505,7 +509,10 @@ void cpu_loop(CPUSPARCState *env) //target_siginfo_t info; while (1) { + cpu_exec_start(cs); trapnr = cpu_exec(cs); + cpu_exec_end(cs); + process_queued_cpu_work(cs); switch (trapnr) { #ifndef TARGET_SPARC64 diff --git a/cpus-common.c b/cpus-common.c index fda3848b7d..2005bfe41f 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -23,10 +23,12 @@ #include "sysemu/cpus.h" static QemuMutex qemu_cpu_list_lock; +static QemuCond qemu_work_cond; void qemu_init_cpu_list(void) { qemu_mutex_init(&qemu_cpu_list_lock); + qemu_cond_init(&qemu_work_cond); } void cpu_list_lock(void) @@ -81,3 +83,95 @@ void cpu_list_remove(CPUState *cpu) cpu->cpu_index = UNASSIGNED_CPU_INDEX; qemu_mutex_unlock(&qemu_cpu_list_lock); } + +struct qemu_work_item { + struct qemu_work_item *next; + run_on_cpu_func func; + void *data; + int done; + bool free; +}; + +static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) +{ + qemu_mutex_lock(&cpu->work_mutex); + if (cpu->queued_work_first == NULL) { + cpu->queued_work_first = wi; + } else { + cpu->queued_work_last->next = wi; + } + cpu->queued_work_last = wi; + wi->next = NULL; + wi->done = false; + qemu_mutex_unlock(&cpu->work_mutex); + + qemu_cpu_kick(cpu); +} + +void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data, + QemuMutex *mutex) +{ + struct qemu_work_item wi; + + if (qemu_cpu_is_self(cpu)) { + func(cpu, data); + return; + } + + wi.func = func; + wi.data = data; + wi.free = false; + + queue_work_on_cpu(cpu, &wi); + while (!atomic_mb_read(&wi.done)) { + CPUState *self_cpu = current_cpu; + + qemu_cond_wait(&qemu_work_cond, mutex); + current_cpu = self_cpu; + } +} + +void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) +{ + struct qemu_work_item *wi; + + if (qemu_cpu_is_self(cpu)) { + func(cpu, data); + return; + } + + wi = g_malloc0(sizeof(struct qemu_work_item)); + wi->func = func; + wi->data = data; + wi->free = true; + + queue_work_on_cpu(cpu, wi); +} + +void process_queued_cpu_work(CPUState *cpu) +{ + struct qemu_work_item *wi; + + if (cpu->queued_work_first == NULL) { + return; + } + + qemu_mutex_lock(&cpu->work_mutex); + while (cpu->queued_work_first != NULL) { + wi = cpu->queued_work_first; + cpu->queued_work_first = wi->next; + if (!cpu->queued_work_first) { + cpu->queued_work_last = NULL; + } + qemu_mutex_unlock(&cpu->work_mutex); + wi->func(cpu, wi->data); + qemu_mutex_lock(&cpu->work_mutex); + if (wi->free) { + g_free(wi); + } else { + atomic_mb_set(&wi->done, true); + } + } + qemu_mutex_unlock(&cpu->work_mutex); + qemu_cond_broadcast(&qemu_work_cond); +} diff --git a/cpus.c b/cpus.c index 28d62062f3..c3afd18ffb 100644 --- a/cpus.c +++ b/cpus.c @@ -902,73 +902,21 @@ static QemuThread io_thread; static QemuCond qemu_cpu_cond; /* system init */ static QemuCond qemu_pause_cond; -static QemuCond qemu_work_cond; void qemu_init_cpu_loop(void) { qemu_init_sigbus(); qemu_cond_init(&qemu_cpu_cond); qemu_cond_init(&qemu_pause_cond); - qemu_cond_init(&qemu_work_cond); qemu_cond_init(&qemu_io_proceeded_cond); qemu_mutex_init(&qemu_global_mutex); 
qemu_thread_get_self(&io_thread); } -static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) -{ - qemu_mutex_lock(&cpu->work_mutex); - if (cpu->queued_work_first == NULL) { - cpu->queued_work_first = wi; - } else { - cpu->queued_work_last->next = wi; - } - cpu->queued_work_last = wi; - wi->next = NULL; - wi->done = false; - qemu_mutex_unlock(&cpu->work_mutex); - - qemu_cpu_kick(cpu); -} - void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) { - struct qemu_work_item wi; - - if (qemu_cpu_is_self(cpu)) { - func(cpu, data); - return; - } - - wi.func = func; - wi.data = data; - wi.free = false; - - queue_work_on_cpu(cpu, &wi); - while (!atomic_mb_read(&wi.done)) { - CPUState *self_cpu = current_cpu; - - qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex); - current_cpu = self_cpu; - } -} - -void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) -{ - struct qemu_work_item *wi; - - if (qemu_cpu_is_self(cpu)) { - func(cpu, data); - return; - } - - wi = g_malloc0(sizeof(struct qemu_work_item)); - wi->func = func; - wi->data = data; - wi->free = true; - - queue_work_on_cpu(cpu, wi); + do_run_on_cpu(cpu, func, data, &qemu_global_mutex); } static void qemu_kvm_destroy_vcpu(CPUState *cpu) @@ -983,34 +931,6 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu) { } -static void process_queued_cpu_work(CPUState *cpu) -{ - struct qemu_work_item *wi; - - if (cpu->queued_work_first == NULL) { - return; - } - - qemu_mutex_lock(&cpu->work_mutex); - while (cpu->queued_work_first != NULL) { - wi = cpu->queued_work_first; - cpu->queued_work_first = wi->next; - if (!cpu->queued_work_first) { - cpu->queued_work_last = NULL; - } - qemu_mutex_unlock(&cpu->work_mutex); - wi->func(cpu, wi->data); - qemu_mutex_lock(&cpu->work_mutex); - if (wi->free) { - g_free(wi); - } else { - atomic_mb_set(&wi->done, true); - } - } - qemu_mutex_unlock(&cpu->work_mutex); - qemu_cond_broadcast(&qemu_work_cond); -} - static void qemu_wait_io_event_common(CPUState *cpu) { if (cpu->stop) { diff --git a/include/qom/cpu.h b/include/qom/cpu.h index ea3233ff5b..c04e510ef1 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -233,14 +233,7 @@ struct kvm_run; /* work queue */ typedef void (*run_on_cpu_func)(CPUState *cpu, void *data); - -struct qemu_work_item { - struct qemu_work_item *next; - run_on_cpu_func func; - void *data; - int done; - bool free; -}; +struct qemu_work_item; /** * CPUState: @@ -629,6 +622,18 @@ void qemu_cpu_kick(CPUState *cpu); */ bool cpu_is_stopped(CPUState *cpu); +/** + * do_run_on_cpu: + * @cpu: The vCPU to run on. + * @func: The function to be executed. + * @data: Data to pass to the function. + * @mutex: Mutex to release while waiting for @func to run. + * + * Used internally in the implementation of run_on_cpu. + */ +void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data, + QemuMutex *mutex); + /** * run_on_cpu: * @cpu: The vCPU to run on. @@ -807,6 +812,12 @@ void cpu_remove(CPUState *cpu); */ void cpu_remove_sync(CPUState *cpu); +/** + * process_queued_cpu_work() - process all items on CPU work queue + * @cpu: The CPU which work queue to process. + */ +void process_queued_cpu_work(CPUState *cpu); + /** * qemu_init_vcpu: * @cpu: The vCPU to initialize. 
diff --git a/linux-user/main.c b/linux-user/main.c index 719f0462e4..e3eca40a75 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -294,6 +294,8 @@ void cpu_loop(CPUX86State *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case 0x80: /* linux syscall from int $0x80 */ @@ -735,6 +737,8 @@ void cpu_loop(CPUARMState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case EXCP_UDEF: { @@ -1071,6 +1075,7 @@ void cpu_loop(CPUARMState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); switch (trapnr) { case EXCP_SWI: @@ -1159,6 +1164,8 @@ void cpu_loop(CPUUniCore32State *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case UC32_EXCP_PRIV: { @@ -1364,6 +1371,7 @@ void cpu_loop (CPUSPARCState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); /* Compute PSR before exposing state. */ if (env->cc_op != CC_OP_FLAGS) { @@ -1636,6 +1644,8 @@ void cpu_loop(CPUPPCState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case POWERPC_EXCP_NONE: /* Just go on */ @@ -2482,6 +2492,8 @@ void cpu_loop(CPUMIPSState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case EXCP_SYSCALL: env->active_tc.PC += 4; @@ -2722,6 +2734,7 @@ void cpu_loop(CPUOpenRISCState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); gdbsig = 0; switch (trapnr) { @@ -2816,6 +2829,7 @@ void cpu_loop(CPUSH4State *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); switch (trapnr) { case 0x160: @@ -2882,6 +2896,8 @@ void cpu_loop(CPUCRISState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case 0xaa: { @@ -2947,6 +2963,8 @@ void cpu_loop(CPUMBState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case 0xaa: { @@ -3064,6 +3082,8 @@ void cpu_loop(CPUM68KState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch(trapnr) { case EXCP_ILLEGAL: { @@ -3207,6 +3227,7 @@ void cpu_loop(CPUAlphaState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); /* All of the traps imply a transition through PALcode, which implies an REI instruction has been executed. Which means @@ -3399,6 +3420,8 @@ void cpu_loop(CPUS390XState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case EXCP_INTERRUPT: /* Just indicate that signals should be handled asap. 
*/ @@ -3708,6 +3731,8 @@ void cpu_loop(CPUTLGState *env) cpu_exec_start(cs); trapnr = cpu_exec(cs); cpu_exec_end(cs); + process_queued_cpu_work(cs); + switch (trapnr) { case TILEGX_EXCP_SYSCALL: { -- cgit v1.2.3-55-g7522 From 0e55539c076a61b0b10a1aea1158fc20fb159d99 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 Sep 2016 17:28:03 +0200 Subject: cpus-common: fix uninitialized variable use in run_on_cpu Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- cpus-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index 2005bfe41f..d6cd426235 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -88,8 +88,7 @@ struct qemu_work_item { struct qemu_work_item *next; run_on_cpu_func func; void *data; - int done; - bool free; + bool free, done; }; static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) @@ -120,6 +119,7 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data, wi.func = func; wi.data = data; + wi.done = false; wi.free = false; queue_work_on_cpu(cpu, &wi); -- cgit v1.2.3-55-g7522 From ab129972c8b41e15b0521895a46fd9c752b68a5e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Aug 2016 16:56:04 +0200 Subject: cpus-common: move exclusive work infrastructure from linux-user This will serve as the base for async_safe_run_on_cpu. Because start_exclusive uses CPU_FOREACH, merge exclusive_lock with qemu_cpu_list_lock: together with a call to exclusive_idle (via cpu_exec_start/end) in cpu_list_add, this protects exclusive work against concurrent CPU addition and removal. Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- bsd-user/main.c | 17 ----------- cpus-common.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++ cpus.c | 2 ++ include/qom/cpu.h | 44 +++++++++++++++++++++++++++- linux-user/main.c | 87 ------------------------------------------------------- 5 files changed, 127 insertions(+), 105 deletions(-) diff --git a/bsd-user/main.c b/bsd-user/main.c index 6dfa91230f..35125b720f 100644 --- a/bsd-user/main.c +++ b/bsd-user/main.c @@ -67,23 +67,6 @@ int cpu_get_pic_interrupt(CPUX86State *env) } #endif -/* These are no-ops because we are not threadsafe. */ -static inline void cpu_exec_start(CPUState *cpu) -{ -} - -static inline void cpu_exec_end(CPUState *cpu) -{ -} - -static inline void start_exclusive(void) -{ -} - -static inline void end_exclusive(void) -{ -} - void fork_start(void) { } diff --git a/cpus-common.c b/cpus-common.c index d6cd426235..7d935fd3dd 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -23,11 +23,21 @@ #include "sysemu/cpus.h" static QemuMutex qemu_cpu_list_lock; +static QemuCond exclusive_cond; +static QemuCond exclusive_resume; static QemuCond qemu_work_cond; +static int pending_cpus; + void qemu_init_cpu_list(void) { + /* This is needed because qemu_init_cpu_list is also called by the + * child process in a fork. 
*/ + pending_cpus = 0; + qemu_mutex_init(&qemu_cpu_list_lock); + qemu_cond_init(&exclusive_cond); + qemu_cond_init(&exclusive_resume); qemu_cond_init(&qemu_work_cond); } @@ -55,6 +65,12 @@ static int cpu_get_free_index(void) return cpu_index; } +static void finish_safe_work(CPUState *cpu) +{ + cpu_exec_start(cpu); + cpu_exec_end(cpu); +} + void cpu_list_add(CPUState *cpu) { qemu_mutex_lock(&qemu_cpu_list_lock); @@ -66,6 +82,8 @@ void cpu_list_add(CPUState *cpu) } QTAILQ_INSERT_TAIL(&cpus, cpu, node); qemu_mutex_unlock(&qemu_cpu_list_lock); + + finish_safe_work(cpu); } void cpu_list_remove(CPUState *cpu) @@ -148,6 +166,70 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) queue_work_on_cpu(cpu, wi); } +/* Wait for pending exclusive operations to complete. The CPU list lock + must be held. */ +static inline void exclusive_idle(void) +{ + while (pending_cpus) { + qemu_cond_wait(&exclusive_resume, &qemu_cpu_list_lock); + } +} + +/* Start an exclusive operation. + Must only be called from outside cpu_exec, takes + qemu_cpu_list_lock. */ +void start_exclusive(void) +{ + CPUState *other_cpu; + + qemu_mutex_lock(&qemu_cpu_list_lock); + exclusive_idle(); + + /* Make all other cpus stop executing. */ + pending_cpus = 1; + CPU_FOREACH(other_cpu) { + if (other_cpu->running) { + pending_cpus++; + qemu_cpu_kick(other_cpu); + } + } + while (pending_cpus > 1) { + qemu_cond_wait(&exclusive_cond, &qemu_cpu_list_lock); + } +} + +/* Finish an exclusive operation. Releases qemu_cpu_list_lock. */ +void end_exclusive(void) +{ + pending_cpus = 0; + qemu_cond_broadcast(&exclusive_resume); + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + +/* Wait for exclusive ops to finish, and begin cpu execution. */ +void cpu_exec_start(CPUState *cpu) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); + exclusive_idle(); + cpu->running = true; + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + +/* Mark cpu as not executing, and release pending exclusive ops. */ +void cpu_exec_end(CPUState *cpu) +{ + qemu_mutex_lock(&qemu_cpu_list_lock); + cpu->running = false; + if (pending_cpus > 1) { + pending_cpus--; + if (pending_cpus == 1) { + qemu_cond_signal(&exclusive_cond); + } + } + exclusive_idle(); + qemu_mutex_unlock(&qemu_cpu_list_lock); +} + void process_queued_cpu_work(CPUState *cpu) { struct qemu_work_item *wi; diff --git a/cpus.c b/cpus.c index c3afd18ffb..fbd70f59f7 100644 --- a/cpus.c +++ b/cpus.c @@ -1457,7 +1457,9 @@ static int tcg_cpu_exec(CPUState *cpu) cpu->icount_decr.u16.low = decr; cpu->icount_extra = count; } + cpu_exec_start(cpu); ret = cpu_exec(cpu); + cpu_exec_end(cpu); #ifdef CONFIG_PROFILER tcg_time += profile_getclock() - ti; #endif diff --git a/include/qom/cpu.h b/include/qom/cpu.h index c04e510ef1..f8726140f6 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -242,7 +242,8 @@ struct qemu_work_item; * @nr_threads: Number of threads within this CPU. * @numa_node: NUMA node this CPU is belonging to. * @host_tid: Host thread ID. - * @running: #true if CPU is currently running (usermode). + * @running: #true if CPU is currently running; + * valid under cpu_list_lock. * @created: Indicates whether the CPU thread has been successfully created. * @interrupt_request: Indicates a pending interrupt request. * @halted: Nonzero if the CPU is in suspended state. @@ -818,6 +819,47 @@ void cpu_remove_sync(CPUState *cpu); */ void process_queued_cpu_work(CPUState *cpu); +/** + * cpu_exec_start: + * @cpu: The CPU for the current thread. 
+ * + * Record that a CPU has started execution and can be interrupted with + * cpu_exit. + */ +void cpu_exec_start(CPUState *cpu); + +/** + * cpu_exec_end: + * @cpu: The CPU for the current thread. + * + * Record that a CPU has stopped execution and exclusive sections + * can be executed without interrupting it. + */ +void cpu_exec_end(CPUState *cpu); + +/** + * start_exclusive: + * + * Wait for a concurrent exclusive section to end, and then start + * a section of work that is run while other CPUs are not running + * between cpu_exec_start and cpu_exec_end. CPUs that are running + * cpu_exec are exited immediately. CPUs that call cpu_exec_start + * during the exclusive section go to sleep until this CPU calls + * end_exclusive. + * + * Returns with the CPU list lock taken (which nests outside all + * other locks except the BQL). + */ +void start_exclusive(void); + +/** + * end_exclusive: + * + * Concludes an exclusive execution section started by start_exclusive. + * Releases the CPU list lock. + */ +void end_exclusive(void); + /** * qemu_init_vcpu: * @cpu: The vCPU to initialize. diff --git a/linux-user/main.c b/linux-user/main.c index e3eca40a75..c8f8573614 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -107,28 +107,11 @@ int cpu_get_pic_interrupt(CPUX86State *env) /***********************************************************/ /* Helper routines for implementing atomic operations. */ -/* To implement exclusive operations we force all cpus to syncronise. - We don't require a full sync, only that no cpus are executing guest code. - The alternative is to map target atomic ops onto host equivalents, - which requires quite a lot of per host/target work. */ -static QemuMutex exclusive_lock; -static QemuCond exclusive_cond; -static QemuCond exclusive_resume; -static int pending_cpus; - -void qemu_init_cpu_loop(void) -{ - qemu_mutex_init(&exclusive_lock); - qemu_cond_init(&exclusive_cond); - qemu_cond_init(&exclusive_resume); -} - /* Make sure everything is in a consistent state for calling fork(). */ void fork_start(void) { cpu_list_lock(); qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); - qemu_mutex_lock(&exclusive_lock); mmap_fork_start(); } @@ -144,84 +127,15 @@ void fork_end(int child) QTAILQ_REMOVE(&cpus, cpu, node); } } - pending_cpus = 0; - qemu_mutex_init(&exclusive_lock); - qemu_cond_init(&exclusive_cond); - qemu_cond_init(&exclusive_resume); qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); qemu_init_cpu_list(); gdbserver_fork(thread_cpu); } else { - qemu_mutex_unlock(&exclusive_lock); qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); cpu_list_unlock(); } } -/* Wait for pending exclusive operations to complete. The exclusive lock - must be held. */ -static inline void exclusive_idle(void) -{ - while (pending_cpus) { - qemu_cond_wait(&exclusive_resume, &exclusive_lock); - } -} - -/* Start an exclusive operation. - Must only be called from outside cpu_exec. */ -static inline void start_exclusive(void) -{ - CPUState *other_cpu; - - qemu_mutex_lock(&exclusive_lock); - exclusive_idle(); - - pending_cpus = 1; - /* Make all other cpus stop executing. */ - CPU_FOREACH(other_cpu) { - if (other_cpu->running) { - pending_cpus++; - cpu_exit(other_cpu); - } - } - while (pending_cpus > 1) { - qemu_cond_wait(&exclusive_cond, &exclusive_lock); - } -} - -/* Finish an exclusive operation. 
*/ -static inline void __attribute__((unused)) end_exclusive(void) -{ - pending_cpus = 0; - qemu_cond_broadcast(&exclusive_resume); - qemu_mutex_unlock(&exclusive_lock); -} - -/* Wait for exclusive ops to finish, and begin cpu execution. */ -static inline void cpu_exec_start(CPUState *cpu) -{ - qemu_mutex_lock(&exclusive_lock); - exclusive_idle(); - cpu->running = true; - qemu_mutex_unlock(&exclusive_lock); -} - -/* Mark cpu as not executing, and release pending exclusive ops. */ -static inline void cpu_exec_end(CPUState *cpu) -{ - qemu_mutex_lock(&exclusive_lock); - cpu->running = false; - if (pending_cpus > 1) { - pending_cpus--; - if (pending_cpus == 1) { - qemu_cond_signal(&exclusive_cond); - } - } - exclusive_idle(); - qemu_mutex_unlock(&exclusive_lock); -} - - #ifdef TARGET_I386 /***********************************************************/ /* CPUX86 core interface */ @@ -4245,7 +4159,6 @@ int main(int argc, char **argv, char **envp) int execfd; qemu_init_cpu_list(); - qemu_init_cpu_loop(); module_call_init(MODULE_INIT_QOM); if ((envlist = envlist_create()) == NULL) { -- cgit v1.2.3-55-g7522 From a200f2fb571f337db37f865aec18f655fa3c872b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 2 Sep 2016 23:35:55 +0200 Subject: docs: include formal model for TCG exclusive sections Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- docs/tcg-exclusive.promela | 177 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 docs/tcg-exclusive.promela diff --git a/docs/tcg-exclusive.promela b/docs/tcg-exclusive.promela new file mode 100644 index 0000000000..5889b40638 --- /dev/null +++ b/docs/tcg-exclusive.promela @@ -0,0 +1,177 @@ +/* + * This model describes the implementation of exclusive sections in + * cpus-common.c (start_exclusive, end_exclusive, cpu_exec_start, + * cpu_exec_end). + * + * Author: Paolo Bonzini + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To verify it: + * spin -a docs/tcg-exclusive.promela + * gcc pan.c -O2 + * ./a.out -a + * + * Tunable processor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, TEST_EXPENSIVE. + */ + +// Define the missing parameters for the model +#ifndef N_CPUS +#define N_CPUS 2 +#warning defaulting to 2 CPU processes +#endif + +// the expensive test is not so expensive for <= 3 CPUs +#if N_CPUS <= 3 +#define TEST_EXPENSIVE +#endif + +#ifndef N_EXCLUSIVE +# if !defined N_CYCLES || N_CYCLES <= 1 || defined TEST_EXPENSIVE +# define N_EXCLUSIVE 2 +# warning defaulting to 2 concurrent exclusive sections +# else +# define N_EXCLUSIVE 1 +# warning defaulting to 1 concurrent exclusive sections +# endif +#endif +#ifndef N_CYCLES +# if N_EXCLUSIVE <= 1 || defined TEST_EXPENSIVE +# define N_CYCLES 2 +# warning defaulting to 2 CPU cycles +# else +# define N_CYCLES 1 +# warning defaulting to 1 CPU cycles +# endif +#endif + + +// synchronization primitives. condition variables require a +// process-local "cond_t saved;" variable. 
+ +#define mutex_t byte +#define MUTEX_LOCK(m) atomic { m == 0 -> m = 1 } +#define MUTEX_UNLOCK(m) m = 0 + +#define cond_t int +#define COND_WAIT(c, m) { \ + saved = c; \ + MUTEX_UNLOCK(m); \ + c != saved -> MUTEX_LOCK(m); \ + } +#define COND_BROADCAST(c) c++ + +// this is the logic from cpus-common.c + +mutex_t mutex; +cond_t exclusive_cond; +cond_t exclusive_resume; +byte pending_cpus; + +byte running[N_CPUS]; +byte has_waiter[N_CPUS]; + +#define exclusive_idle() \ + do \ + :: pending_cpus -> COND_WAIT(exclusive_resume, mutex); \ + :: else -> break; \ + od + +#define start_exclusive() \ + MUTEX_LOCK(mutex); \ + exclusive_idle(); \ + pending_cpus = 1; \ + \ + i = 0; \ + do \ + :: i < N_CPUS -> { \ + if \ + :: running[i] -> has_waiter[i] = 1; pending_cpus++; \ + :: else -> skip; \ + fi; \ + i++; \ + } \ + :: else -> break; \ + od; \ + \ + do \ + :: pending_cpus > 1 -> COND_WAIT(exclusive_cond, mutex); \ + :: else -> break; \ + od + +#define end_exclusive() \ + pending_cpus = 0; \ + COND_BROADCAST(exclusive_resume); \ + MUTEX_UNLOCK(mutex); + +#define cpu_exec_start(id) \ + MUTEX_LOCK(mutex); \ + exclusive_idle(); \ + running[id] = 1; \ + MUTEX_UNLOCK(mutex); + +#define cpu_exec_end(id) \ + MUTEX_LOCK(mutex); \ + running[id] = 0; \ + if \ + :: pending_cpus -> { \ + pending_cpus--; \ + if \ + :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \ + :: else -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + exclusive_idle(); \ + MUTEX_UNLOCK(mutex); + +// Promela processes + +byte done_cpu; +byte in_cpu; +active[N_CPUS] proctype cpu() +{ + byte id = _pid % N_CPUS; + byte cycles = 0; + cond_t saved; + + do + :: cycles == N_CYCLES -> break; + :: else -> { + cycles++; + cpu_exec_start(id) + in_cpu++; + done_cpu++; + in_cpu--; + cpu_exec_end(id) + } + od; +} + +byte done_exclusive; +byte in_exclusive; +active[N_EXCLUSIVE] proctype exclusive() +{ + cond_t saved; + byte i; + + start_exclusive(); + in_exclusive = 1; + done_exclusive++; + in_exclusive = 0; + end_exclusive(); +} + +#define LIVENESS (done_cpu == N_CPUS * N_CYCLES && done_exclusive == N_EXCLUSIVE) +#define SAFETY !(in_exclusive && in_cpu) + +never { /* ! ([] SAFETY && <> [] LIVENESS) */ + do + // once the liveness property is satisfied, this is not executable + // and the never clause is not accepted + :: ! LIVENESS -> accept_liveness: skip + :: 1 -> assert(SAFETY) + od; +} -- cgit v1.2.3-55-g7522 From c978b3168727d3a76ffcb18462ea972f50b53634 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Aug 2016 18:03:39 +0200 Subject: cpus-common: always defer async_run_on_cpu work items async_run_on_cpu is only called from the I/O thread, not from CPU threads, so it doesn't make any difference. It will make a difference however for async_safe_run_on_cpu. 
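As an illustrative aside, deferral is safe because every user-mode cpu_loop() in this series drains the queue at a known-safe point, outside cpu_exec(). A condensed sketch of that loop shape, with an invented name and per-target trap handling elided:

    static void example_vcpu_loop(CPUState *cs)
    {
        int trapnr;

        for (;;) {
            cpu_exec_start(cs);
            trapnr = cpu_exec(cs);        /* guest code; qemu_cpu_kick() can interrupt it */
            cpu_exec_end(cs);
            process_queued_cpu_work(cs);  /* deferred items run here, at a safe point */
            (void)trapnr;                 /* trap handling elided */
        }
    }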
Reviewed-by: Alex Bennée Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- cpus-common.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index 7d935fd3dd..115f3d45df 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -153,11 +153,6 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) { struct qemu_work_item *wi; - if (qemu_cpu_is_self(cpu)) { - func(cpu, data); - return; - } - wi = g_malloc0(sizeof(struct qemu_work_item)); wi->func = func; wi->data = data; -- cgit v1.2.3-55-g7522 From cf07da65f335b9a74e62f5413078f67280572f36 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 2 Sep 2016 21:02:10 +0200 Subject: cpus-common: remove redundant call to exclusive_idle() No need to call exclusive_idle() from cpu_exec_end since it is done immediately afterwards in cpu_exec_start. Any exclusive section could run as soon as cpu_exec_end leaves, because cpu->running is false and the mutex is not taken, so the call does not add any protection either. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Paolo Bonzini --- cpus-common.c | 1 - docs/tcg-exclusive.promela | 1 - 2 files changed, 2 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index 115f3d45df..80aaf9b42d 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -221,7 +221,6 @@ void cpu_exec_end(CPUState *cpu) qemu_cond_signal(&exclusive_cond); } } - exclusive_idle(); qemu_mutex_unlock(&qemu_cpu_list_lock); } diff --git a/docs/tcg-exclusive.promela b/docs/tcg-exclusive.promela index 5889b40638..8bb0967df6 100644 --- a/docs/tcg-exclusive.promela +++ b/docs/tcg-exclusive.promela @@ -124,7 +124,6 @@ byte has_waiter[N_CPUS]; } \ :: else -> skip; \ fi; \ - exclusive_idle(); \ MUTEX_UNLOCK(mutex); // Promela processes -- cgit v1.2.3-55-g7522 From 758e1b2b622d7c177dc2d95e887a11aa069b7e68 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 2 Sep 2016 23:33:38 +0200 Subject: cpus-common: simplify locking for start_exclusive/end_exclusive It is not necessary to hold qemu_cpu_list_lock throughout the exclusive section, because no other exclusive section can run while pending_cpus != 0. exclusive_idle() is called in cpu_exec_start(), and that prevents any CPUs created after start_exclusive() from entering cpu_exec() during an exclusive section. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Paolo Bonzini --- cpus-common.c | 11 ++++++++--- docs/tcg-exclusive.promela | 4 +++- include/qom/cpu.h | 4 ---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index 80aaf9b42d..429652c7fd 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -171,8 +171,7 @@ static inline void exclusive_idle(void) } /* Start an exclusive operation. - Must only be called from outside cpu_exec, takes - qemu_cpu_list_lock. */ + Must only be called from outside cpu_exec. */ void start_exclusive(void) { CPUState *other_cpu; @@ -191,11 +190,17 @@ void start_exclusive(void) while (pending_cpus > 1) { qemu_cond_wait(&exclusive_cond, &qemu_cpu_list_lock); } + + /* Can release mutex, no one will enter another exclusive + * section until end_exclusive resets pending_cpus to 0. + */ + qemu_mutex_unlock(&qemu_cpu_list_lock); } -/* Finish an exclusive operation. Releases qemu_cpu_list_lock. */ +/* Finish an exclusive operation. 
*/ void end_exclusive(void) { + qemu_mutex_lock(&qemu_cpu_list_lock); pending_cpus = 0; qemu_cond_broadcast(&exclusive_resume); qemu_mutex_unlock(&qemu_cpu_list_lock); diff --git a/docs/tcg-exclusive.promela b/docs/tcg-exclusive.promela index 8bb0967df6..feac679b9a 100644 --- a/docs/tcg-exclusive.promela +++ b/docs/tcg-exclusive.promela @@ -98,9 +98,11 @@ byte has_waiter[N_CPUS]; do \ :: pending_cpus > 1 -> COND_WAIT(exclusive_cond, mutex); \ :: else -> break; \ - od + od; \ + MUTEX_UNLOCK(mutex); #define end_exclusive() \ + MUTEX_LOCK(mutex); \ pending_cpus = 0; \ COND_BROADCAST(exclusive_resume); \ MUTEX_UNLOCK(mutex); diff --git a/include/qom/cpu.h b/include/qom/cpu.h index f8726140f6..934c07afbf 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -846,9 +846,6 @@ void cpu_exec_end(CPUState *cpu); * cpu_exec are exited immediately. CPUs that call cpu_exec_start * during the exclusive section go to sleep until this CPU calls * end_exclusive. - * - * Returns with the CPU list lock taken (which nests outside all - * other locks except the BQL). */ void start_exclusive(void); @@ -856,7 +853,6 @@ void start_exclusive(void); * end_exclusive: * * Concludes an exclusive execution section started by start_exclusive. - * Releases the CPU list lock. */ void end_exclusive(void); -- cgit v1.2.3-55-g7522 From 53f5ed95064fe6807890cd5535445a05d3361bd2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sun, 28 Aug 2016 05:38:24 +0200 Subject: cpus-common: Introduce async_safe_run_on_cpu() Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Paolo Bonzini --- cpus-common.c | 33 +++++++++++++++++++++++++++++++-- include/qom/cpu.h | 14 ++++++++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index 429652c7fd..38b1d553fb 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -18,6 +18,7 @@ */ #include "qemu/osdep.h" +#include "qemu/main-loop.h" #include "exec/cpu-common.h" #include "qom/cpu.h" #include "sysemu/cpus.h" @@ -106,7 +107,7 @@ struct qemu_work_item { struct qemu_work_item *next; run_on_cpu_func func; void *data; - bool free, done; + bool free, exclusive, done; }; static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) @@ -139,6 +140,7 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data, wi.data = data; wi.done = false; wi.free = false; + wi.exclusive = false; queue_work_on_cpu(cpu, &wi); while (!atomic_mb_read(&wi.done)) { @@ -229,6 +231,19 @@ void cpu_exec_end(CPUState *cpu) qemu_mutex_unlock(&qemu_cpu_list_lock); } +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) +{ + struct qemu_work_item *wi; + + wi = g_malloc0(sizeof(struct qemu_work_item)); + wi->func = func; + wi->data = data; + wi->free = true; + wi->exclusive = true; + + queue_work_on_cpu(cpu, wi); +} + void process_queued_cpu_work(CPUState *cpu) { struct qemu_work_item *wi; @@ -245,7 +260,21 @@ void process_queued_cpu_work(CPUState *cpu) cpu->queued_work_last = NULL; } qemu_mutex_unlock(&cpu->work_mutex); - wi->func(cpu, wi->data); + if (wi->exclusive) { + /* Running work items outside the BQL avoids the following deadlock: + * 1) start_exclusive() is called with the BQL taken while another + * CPU is running; 2) cpu_exec in the other CPU tries to takes the + * BQL, so it goes to sleep; start_exclusive() is sleeping too, so + * neither CPU can proceed. 
+ */ + qemu_mutex_unlock_iothread(); + start_exclusive(); + wi->func(cpu, wi->data); + end_exclusive(); + qemu_mutex_lock_iothread(); + } else { + wi->func(cpu, wi->data); + } qemu_mutex_lock(&cpu->work_mutex); if (wi->free) { g_free(wi); diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 934c07afbf..4092dd919b 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -655,6 +655,20 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data); */ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data); +/** + * async_safe_run_on_cpu: + * @cpu: The vCPU to run on. + * @func: The function to be executed. + * @data: Data to pass to the function. + * + * Schedules the function @func for execution on the vCPU @cpu asynchronously, + * while all other vCPUs are sleeping. + * + * Unlike run_on_cpu and async_run_on_cpu, the function is run outside the + * BQL. + */ +void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data); + /** * qemu_get_cpu: * @index: The CPUState@cpu_index value of the CPU to obtain. -- cgit v1.2.3-55-g7522 From 3359baad36889b83df40b637ed993a4b816c4906 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Tue, 2 Aug 2016 18:27:43 +0100 Subject: tcg: Make tb_flush() thread safe Use async_safe_run_on_cpu() to make tb_flush() thread safe. This is possible now that code generation does not happen in the middle of execution. It can happen that multiple threads schedule safe work items to flush the translation buffer. To keep statistics and debugging output sane, always check if the translation buffer has already been flushed. Signed-off-by: Sergey Fedorov Signed-off-by: Sergey Fedorov [AJB: minor re-base fixes] Signed-off-by: Alex Bennée Message-Id: <1470158864-17651-13-git-send-email-alex.bennee@linaro.org> Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- cpu-exec.c | 12 ++---------- include/exec/tb-context.h | 2 +- include/qom/cpu.h | 2 -- translate-all.c | 38 ++++++++++++++++++++++++++++---------- 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/cpu-exec.c b/cpu-exec.c index 9f4bd0b6dd..8823d23df7 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -204,20 +204,16 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles, TranslationBlock *orig_tb, bool ignore_icount) { TranslationBlock *tb; - bool old_tb_flushed; /* Should never happen. We only end up here when an existing TB is too long. */ if (max_cycles > CF_COUNT_MASK) max_cycles = CF_COUNT_MASK; - old_tb_flushed = cpu->tb_flushed; - cpu->tb_flushed = false; tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags, max_cycles | CF_NOCACHE | (ignore_icount ? CF_IGNORE_ICOUNT : 0)); tb->orig_tb = cpu->tb_flushed ? 
NULL : orig_tb; - cpu->tb_flushed |= old_tb_flushed; + tb->orig_tb = orig_tb; /* execute the generated code */ trace_exec_tb_nocache(tb, tb->pc); cpu_tb_exec(cpu, tb); @@ -338,10 +334,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu, tb_lock(); have_tb_lock = true; } - /* Check if translation buffer has been flushed */ - if (cpu->tb_flushed) { - cpu->tb_flushed = false; - } else if (!tb->invalid) { + if (!tb->invalid) { tb_add_jump(last_tb, tb_exit, tb); } } @@ -606,7 +599,6 @@ int cpu_exec(CPUState *cpu) break; } - atomic_mb_set(&cpu->tb_flushed, false); /* reset before first TB lookup */ for(;;) { cpu_handle_interrupt(cpu, &last_tb); tb = tb_find(cpu, last_tb, tb_exit); diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index dce95d92d6..c7f17f26e0 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -38,7 +38,7 @@ struct TBContext { QemuMutex tb_lock; /* statistics */ - int tb_flush_count; + unsigned tb_flush_count; int tb_phys_invalidate_count; }; diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 4092dd919b..5dfe74a0e7 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -253,7 +253,6 @@ struct qemu_work_item; * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this * CPU and return to its top level loop. - * @tb_flushed: Indicates the translation buffer has been flushed. * @singlestep_enabled: Flags for single-stepping. * @icount_extra: Instructions until next timer event. * @icount_decr: Number of cycles left, with interrupt flag in high bit. @@ -306,7 +305,6 @@ struct CPUState { bool unplug; bool crash_occurred; bool exit_request; - bool tb_flushed; uint32_t interrupt_request; int singlestep_enabled; int64_t icount_extra; diff --git a/translate-all.c b/translate-all.c index e9bc90c654..8ca393c9d0 100644 --- a/translate-all.c +++ b/translate-all.c @@ -834,12 +834,19 @@ static void page_flush_tb(void) } /* flush all the translation blocks */ -/* XXX: tb_flush is currently not thread safe */ -void tb_flush(CPUState *cpu) +static void do_tb_flush(CPUState *cpu, void *data) { - if (!tcg_enabled()) { - return; + unsigned tb_flush_req = (unsigned) (uintptr_t) data; + + tb_lock(); + + /* If it's already been done on request of another CPU, + * just retry. 
+ */ + if (tcg_ctx.tb_ctx.tb_flush_count != tb_flush_req) { + goto done; + } + #if defined(DEBUG_FLUSH) printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n", (unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer), @@ -858,7 +865,6 @@ void tb_flush(CPUState *cpu) for (i = 0; i < TB_JMP_CACHE_SIZE; ++i) { atomic_set(&cpu->tb_jmp_cache[i], NULL); } - atomic_mb_set(&cpu->tb_flushed, true); } tcg_ctx.tb_ctx.nb_tbs = 0; @@ -868,7 +874,19 @@ tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; /* XXX: flush processor icache at this point if cache flush is expensive */ - tcg_ctx.tb_ctx.tb_flush_count++; + atomic_mb_set(&tcg_ctx.tb_ctx.tb_flush_count, + tcg_ctx.tb_ctx.tb_flush_count + 1); + +done: + tb_unlock(); +} + +void tb_flush(CPUState *cpu) +{ + if (tcg_enabled()) { + uintptr_t tb_flush_req = atomic_mb_read(&tcg_ctx.tb_ctx.tb_flush_count); + async_safe_run_on_cpu(cpu, do_tb_flush, (void *) tb_flush_req); + } +} #ifdef DEBUG_TB_CHECK @@ -1175,9 +1193,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu, buffer_overflow: /* flush must be done */ tb_flush(cpu); - /* cannot fail at this point */ - tb = tb_alloc(pc); - assert(tb != NULL); + mmap_unlock(); + cpu_loop_exit(cpu); } gen_code_buf = tcg_ctx.code_gen_ptr; @@ -1775,7 +1792,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) qht_statistics_destroy(&hst); cpu_fprintf(f, "\nStatistics:\n"); - cpu_fprintf(f, "TB flush count %d\n", tcg_ctx.tb_ctx.tb_flush_count); + cpu_fprintf(f, "TB flush count %u\n", + atomic_read(&tcg_ctx.tb_ctx.tb_flush_count)); cpu_fprintf(f, "TB invalidate count %d\n", tcg_ctx.tb_ctx.tb_phys_invalidate_count); cpu_fprintf(f, "TLB flush count %d\n", tlb_flush_count); -- cgit v1.2.3-55-g7522 From c265e976f4669fd65f5b47e6865f50d1cb66bd02 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Aug 2016 21:33:58 +0200 Subject: cpus-common: lock-free fast path for cpu_exec_start/end Set cpu->running without taking the cpu_list lock, only requiring it if there is a concurrent exclusive section. This requires adding a new field to CPUState, which records whether a running CPU is being counted in pending_cpus. When an exclusive section is started concurrently with cpu_exec_start, cpu_exec_start can use the new field to determine if it has to wait for the end of the exclusive section. Likewise, cpu_exec_end can use it to see if start_exclusive is waiting for that CPU. This is a separate patch for easier bisection of issues. Signed-off-by: Paolo Bonzini --- cpus-common.c | 95 ++++++++++++++++++++++++++++++++++++++-------- docs/tcg-exclusive.promela | 53 ++++++++++++++++++++++++-- include/qom/cpu.h | 5 ++- 3 files changed, 133 insertions(+), 20 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index 38b1d553fb..3e114529c9 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -28,6 +28,9 @@ static QemuCond exclusive_cond; static QemuCond exclusive_resume; static QemuCond qemu_work_cond; +/* >= 1 if a thread is inside start_exclusive/end_exclusive. Written + * under qemu_cpu_list_lock, read with atomic operations. + */ static int pending_cpus; void qemu_init_cpu_list(void) @@ -177,18 +180,26 @@ static inline void exclusive_idle(void) void start_exclusive(void) { CPUState *other_cpu; + int running_cpus; qemu_mutex_lock(&qemu_cpu_list_lock); exclusive_idle(); /* Make all other cpus stop executing. */ - pending_cpus = 1; + atomic_set(&pending_cpus, 1); + + /* Write pending_cpus before reading other_cpu->running. 
*/ + smp_mb(); + running_cpus = 0; CPU_FOREACH(other_cpu) { - if (other_cpu->running) { - pending_cpus++; + if (atomic_read(&other_cpu->running)) { + other_cpu->has_waiter = true; + running_cpus++; qemu_cpu_kick(other_cpu); } } + + atomic_set(&pending_cpus, running_cpus + 1); while (pending_cpus > 1) { qemu_cond_wait(&exclusive_cond, &qemu_cpu_list_lock); } @@ -203,7 +214,7 @@ void start_exclusive(void) void end_exclusive(void) { qemu_mutex_lock(&qemu_cpu_list_lock); - pending_cpus = 0; + atomic_set(&pending_cpus, 0); qemu_cond_broadcast(&exclusive_resume); qemu_mutex_unlock(&qemu_cpu_list_lock); } @@ -211,24 +222,78 @@ void end_exclusive(void) /* Wait for exclusive ops to finish, and begin cpu execution. */ void cpu_exec_start(CPUState *cpu) { - qemu_mutex_lock(&qemu_cpu_list_lock); - exclusive_idle(); - cpu->running = true; - qemu_mutex_unlock(&qemu_cpu_list_lock); + atomic_set(&cpu->running, true); + + /* Write cpu->running before reading pending_cpus. */ + smp_mb(); + + /* 1. start_exclusive saw cpu->running == true and pending_cpus >= 1. + * After taking the lock we'll see cpu->has_waiter == true and run---not + * for long because start_exclusive kicked us. cpu_exec_end will + * decrement pending_cpus and signal the waiter. + * + * 2. start_exclusive saw cpu->running == false but pending_cpus >= 1. + * This includes the case when an exclusive item is running now. + * Then we'll see cpu->has_waiter == false and wait for the item to + * complete. + * + * 3. pending_cpus == 0. Then start_exclusive is definitely going to + * see cpu->running == true, and it will kick the CPU. + */ + if (unlikely(atomic_read(&pending_cpus))) { + qemu_mutex_lock(&qemu_cpu_list_lock); + if (!cpu->has_waiter) { + /* Not counted in pending_cpus, let the exclusive item + * run. Since we have the lock, just set cpu->running to true + * while holding it; no need to check pending_cpus again. + */ + atomic_set(&cpu->running, false); + exclusive_idle(); + /* Now pending_cpus is zero. */ + atomic_set(&cpu->running, true); + } else { + /* Counted in pending_cpus, go ahead and release the + * waiter at cpu_exec_end. + */ + } + qemu_mutex_unlock(&qemu_cpu_list_lock); + } } /* Mark cpu as not executing, and release pending exclusive ops. */ void cpu_exec_end(CPUState *cpu) { - qemu_mutex_lock(&qemu_cpu_list_lock); - cpu->running = false; - if (pending_cpus > 1) { - pending_cpus--; - if (pending_cpus == 1) { - qemu_cond_signal(&exclusive_cond); + atomic_set(&cpu->running, false); + + /* Write cpu->running before reading pending_cpus. */ + smp_mb(); + + /* 1. start_exclusive saw cpu->running == true. Then it will increment + * pending_cpus and wait for exclusive_cond. After taking the lock + * we'll see cpu->has_waiter == true. + * + * 2. start_exclusive saw cpu->running == false but here pending_cpus >= 1. + * This includes the case when an exclusive item started after setting + * cpu->running to false and before we read pending_cpus. Then we'll see + * cpu->has_waiter == false and not touch pending_cpus. The next call to + * cpu_exec_start will run exclusive_idle if still necessary, thus waiting + * for the item to complete. + * + * 3. pending_cpus == 0. Then start_exclusive is definitely going to + * see cpu->running == false, and it can ignore this CPU until the + * next cpu_exec_start. 
+ */ + if (unlikely(atomic_read(&pending_cpus))) { + qemu_mutex_lock(&qemu_cpu_list_lock); + if (cpu->has_waiter) { + cpu->has_waiter = false; + atomic_set(&pending_cpus, pending_cpus - 1); + if (pending_cpus == 1) { + qemu_cond_signal(&exclusive_cond); + } } + qemu_mutex_unlock(&qemu_cpu_list_lock); } - qemu_mutex_unlock(&qemu_cpu_list_lock); } void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data) diff --git a/docs/tcg-exclusive.promela b/docs/tcg-exclusive.promela index feac679b9a..c91cfca9f7 100644 --- a/docs/tcg-exclusive.promela +++ b/docs/tcg-exclusive.promela @@ -13,7 +13,8 @@ * gcc pan.c -O2 * ./a.out -a * - * Tunable processor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, TEST_EXPENSIVE. + * Tunable processor macros: N_CPUS, N_EXCLUSIVE, N_CYCLES, USE_MUTEX, + * TEST_EXPENSIVE. */ // Define the missing parameters for the model @@ -22,8 +23,10 @@ #warning defaulting to 2 CPU processes #endif -// the expensive test is not so expensive for <= 3 CPUs -#if N_CPUS <= 3 +// the expensive test is not so expensive for <= 2 CPUs +// If the mutex is used, it's also cheap (300 MB / 4 seconds) for 3 CPUs +// For 3 CPUs and the lock-free option it needs 1.5 GB of RAM +#if N_CPUS <= 2 || (N_CPUS <= 3 && defined USE_MUTEX) #define TEST_EXPENSIVE #endif @@ -107,6 +110,8 @@ byte has_waiter[N_CPUS]; COND_BROADCAST(exclusive_resume); \ MUTEX_UNLOCK(mutex); +#ifdef USE_MUTEX +// Simple version using mutexes #define cpu_exec_start(id) \ MUTEX_LOCK(mutex); \ exclusive_idle(); \ @@ -127,6 +132,48 @@ byte has_waiter[N_CPUS]; :: else -> skip; \ fi; \ MUTEX_UNLOCK(mutex); +#else +// Wait-free fast path, only needs mutex when concurrent with +// an exclusive section +#define cpu_exec_start(id) \ + running[id] = 1; \ + if \ + :: pending_cpus -> { \ + MUTEX_LOCK(mutex); \ + if \ + :: !has_waiter[id] -> { \ + running[id] = 0; \ + exclusive_idle(); \ + running[id] = 1; \ + } \ + :: else -> skip; \ + fi; \ + MUTEX_UNLOCK(mutex); \ + } \ + :: else -> skip; \ + fi; + +#define cpu_exec_end(id) \ + running[id] = 0; \ + if \ + :: pending_cpus -> { \ + MUTEX_LOCK(mutex); \ + if \ + :: has_waiter[id] -> { \ + has_waiter[id] = 0; \ + pending_cpus--; \ + if \ + :: pending_cpus == 1 -> COND_BROADCAST(exclusive_cond); \ + :: else -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + MUTEX_UNLOCK(mutex); \ + } \ + :: else -> skip; \ + fi +#endif // Promela processes diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 5dfe74a0e7..22b54d6d93 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -242,7 +242,8 @@ struct qemu_work_item; * @nr_threads: Number of threads within this CPU. * @numa_node: NUMA node this CPU is belonging to. * @host_tid: Host thread ID. - * @running: #true if CPU is currently running; + * @running: #true if CPU is currently running (lockless). + * @has_waiter: #true if a CPU is currently waiting for the cpu_exec_end; * valid under cpu_list_lock. * @created: Indicates whether the CPU thread has been successfully created. * @interrupt_request: Indicates a pending interrupt request. @@ -296,7 +297,7 @@ struct CPUState { #endif int thread_id; uint32_t host_tid; - bool running; + bool running, has_waiter; struct QemuCond *halt_cond; bool thread_kicked; bool created; -- cgit v1.2.3-55-g7522 From f186d64d8fda4bb22c15beb8e45b7814fbd8b51e Mon Sep 17 00:00:00 2001 From: Pavel Dovgalyuk Date: Mon, 26 Sep 2016 11:08:04 +0300 Subject: replay: move internal data to the structure This patch moves replay static variables into the structure to allow saving and loading them with savevm/loadvm. 
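As an illustrative aside, consolidating the fields into ReplayState is what makes a single VMStateDescription possible. A hedged sketch of what such a description could look like (the real wiring is left to the follow-up patch; the field encodings and the name vmstate_replay_example are assumptions, not taken from the patch):

    #include "migration/vmstate.h"
    #include "replay-internal.h"   /* for ReplayState; assumed visible here */

    static const VMStateDescription vmstate_replay_example = {
        .name = "replay",
        .version_id = 1,
        .minimum_version_id = 1,
        .fields = (VMStateField[]) {
            VMSTATE_UINT64(current_step, ReplayState),
            VMSTATE_INT32(instructions_count, ReplayState),
            VMSTATE_UINT32(data_kind, ReplayState),
            VMSTATE_UINT32(has_unread_data, ReplayState),
            VMSTATE_END_OF_LIST()
        }
    };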
Reviewed-by: Paolo Bonzini Signed-off-by: Pavel Dovgalyuk Message-Id: <20160926080804.6992.87687.stgit@PASHA-ISP> Signed-off-by: Paolo Bonzini --- replay/replay-events.c | 2 +- replay/replay-internal.c | 20 +++++++++----------- replay/replay-internal.h | 8 +++++--- replay/replay-time.c | 2 +- replay/replay.c | 15 ++++++++------- 5 files changed, 24 insertions(+), 23 deletions(-) diff --git a/replay/replay-events.c b/replay/replay-events.c index 3807245ae7..4eb2ea3604 100644 --- a/replay/replay-events.c +++ b/replay/replay-events.c @@ -279,7 +279,7 @@ static Event *replay_read_event(int checkpoint) /* Called with replay mutex locked */ void replay_read_events(int checkpoint) { - while (replay_data_kind == EVENT_ASYNC) { + while (replay_state.data_kind == EVENT_ASYNC) { Event *event = replay_read_event(checkpoint); if (!event) { break; diff --git a/replay/replay-internal.c b/replay/replay-internal.c index 5835e8def3..bea7b4aa6b 100644 --- a/replay/replay-internal.c +++ b/replay/replay-internal.c @@ -16,11 +16,8 @@ #include "qemu/error-report.h" #include "sysemu/sysemu.h" -unsigned int replay_data_kind = -1; -static unsigned int replay_has_unread_data; - /* Mutex to protect reading and writing events to the log. - replay_data_kind and replay_has_unread_data are also protected + data_kind and has_unread_data are also protected by this mutex. It also protects replay events queue which stores events to be written or read to the log. */ @@ -150,15 +147,16 @@ void replay_check_error(void) void replay_fetch_data_kind(void) { if (replay_file) { - if (!replay_has_unread_data) { - replay_data_kind = replay_get_byte(); - if (replay_data_kind == EVENT_INSTRUCTION) { + if (!replay_state.has_unread_data) { + replay_state.data_kind = replay_get_byte(); + if (replay_state.data_kind == EVENT_INSTRUCTION) { replay_state.instructions_count = replay_get_dword(); } replay_check_error(); - replay_has_unread_data = 1; - if (replay_data_kind >= EVENT_COUNT) { - error_report("Replay: unknown event kind %d", replay_data_kind); + replay_state.has_unread_data = 1; + if (replay_state.data_kind >= EVENT_COUNT) { + error_report("Replay: unknown event kind %d", + replay_state.data_kind); exit(1); } } @@ -167,7 +165,7 @@ void replay_fetch_data_kind(void) void replay_finish_event(void) { - replay_has_unread_data = 0; + replay_state.has_unread_data = 0; replay_fetch_data_kind(); } diff --git a/replay/replay-internal.h b/replay/replay-internal.h index efbf14c8a7..9b02d7d6aa 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -62,11 +62,13 @@ typedef struct ReplayState { uint64_t current_step; /*! Number of instructions to be executed before other events happen. */ int instructions_count; + /*! Type of the currently executed event. */ + unsigned int data_kind; + /*! Flag which indicates that event is not processed yet. */ + unsigned int has_unread_data; } ReplayState; extern ReplayState replay_state; -extern unsigned int replay_data_kind; - /* File for replay writing */ extern FILE *replay_file; @@ -98,7 +100,7 @@ void replay_check_error(void); the next event from the log. */ void replay_finish_event(void); /*! Reads data type from the file and stores it in the - replay_data_kind variable. */ + data_kind variable. */ void replay_fetch_data_kind(void); /*! Saves queued events (like instructions and sound). 
*/ diff --git a/replay/replay-time.c b/replay/replay-time.c index fffe072c55..f70382a88f 100644 --- a/replay/replay-time.c +++ b/replay/replay-time.c @@ -31,7 +31,7 @@ int64_t replay_save_clock(ReplayClockKind kind, int64_t clock) void replay_read_next_clock(ReplayClockKind kind) { - unsigned int read_kind = replay_data_kind - EVENT_CLOCK; + unsigned int read_kind = replay_state.data_kind - EVENT_CLOCK; assert(read_kind == kind); diff --git a/replay/replay.c b/replay/replay.c index 167fd2942d..cc2238d077 100644 --- a/replay/replay.c +++ b/replay/replay.c @@ -38,15 +38,15 @@ bool replay_next_event_is(int event) /* nothing to skip - not all instructions used */ if (replay_state.instructions_count != 0) { - assert(replay_data_kind == EVENT_INSTRUCTION); + assert(replay_state.data_kind == EVENT_INSTRUCTION); return event == EVENT_INSTRUCTION; } while (true) { - if (event == replay_data_kind) { + if (event == replay_state.data_kind) { res = true; } - switch (replay_data_kind) { + switch (replay_state.data_kind) { case EVENT_SHUTDOWN: replay_finish_event(); qemu_system_shutdown_request(); @@ -85,7 +85,7 @@ void replay_account_executed_instructions(void) replay_state.instructions_count -= count; replay_state.current_step += count; if (replay_state.instructions_count == 0) { - assert(replay_data_kind == EVENT_INSTRUCTION); + assert(replay_state.data_kind == EVENT_INSTRUCTION); replay_finish_event(); /* Wake up iothread. This is required because timers will not expire until clock counters @@ -188,7 +188,7 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint) if (replay_mode == REPLAY_MODE_PLAY) { if (replay_next_event_is(EVENT_CHECKPOINT + checkpoint)) { replay_finish_event(); - } else if (replay_data_kind != EVENT_ASYNC) { + } else if (replay_state.data_kind != EVENT_ASYNC) { res = false; goto out; } @@ -196,7 +196,7 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint) /* replay_read_events may leave some unread events. Return false if not all of the events associated with checkpoint were processed */ - res = replay_data_kind != EVENT_ASYNC; + res = replay_state.data_kind != EVENT_ASYNC; } else if (replay_mode == REPLAY_MODE_RECORD) { replay_put_event(EVENT_CHECKPOINT + checkpoint); replay_save_events(checkpoint); @@ -237,9 +237,10 @@ static void replay_enable(const char *fname, int mode) replay_filename = g_strdup(fname); replay_mode = mode; - replay_data_kind = -1; + replay_state.data_kind = -1; replay_state.instructions_count = 0; replay_state.current_step = 0; + replay_state.has_unread_data = 0; /* skip file header for RECORD and check it for PLAY */ if (replay_mode == REPLAY_MODE_RECORD) { -- cgit v1.2.3-55-g7522 From 306e196fa24c46d384577fb9c16e7cdb80f26d17 Mon Sep 17 00:00:00 2001 From: Pavel Dovgalyuk Date: Mon, 26 Sep 2016 11:08:10 +0300 Subject: replay: vmstate for replay module This patch introduces vmstate for replay data structures. It allows saving and loading vmstate while replaying. 
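For readers new to the migration API: a VMStateDescription lists a
structure's fields declaratively, and vmstate_register() attaches the
description to the savevm machinery; optional pre_save/post_load hooks let a
module fix up derived state around the snapshot, as this patch does with the
log file position. A minimal sketch under those assumptions (CounterState and
its fields are hypothetical; the VMSTATE_* macros and vmstate_register() are
the real helpers the patch uses):

#include "qemu/osdep.h"
#include "migration/vmstate.h"

typedef struct CounterState {
    uint64_t ticks;
    uint32_t enabled;
} CounterState;

static CounterState counter_state;

static const VMStateDescription vmstate_counter = {
    .name = "counter",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_UINT64(ticks, CounterState),
        VMSTATE_UINT32(enabled, CounterState),
        VMSTATE_END_OF_LIST()
    },
};

void counter_vmstate_register(void)
{
    /* NULL device and instance id 0: module-global state, one instance,
     * included in every snapshot, just as the replay module registers it. */
    vmstate_register(NULL, 0, &vmstate_counter, &counter_state);
}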
Signed-off-by: Pavel Dovgalyuk Message-Id: <20160926080810.6992.68420.stgit@PASHA-ISP> Signed-off-by: Paolo Bonzini --- replay/Makefile.objs | 1 + replay/replay-internal.h | 9 ++++++++ replay/replay-snapshot.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ replay/replay.c | 1 + 4 files changed, 71 insertions(+) create mode 100644 replay/replay-snapshot.c diff --git a/replay/Makefile.objs b/replay/Makefile.objs index fcb3f74d60..c8ad3ebb89 100644 --- a/replay/Makefile.objs +++ b/replay/Makefile.objs @@ -4,3 +4,4 @@ common-obj-y += replay-events.o common-obj-y += replay-time.o common-obj-y += replay-input.o common-obj-y += replay-char.o +common-obj-y += replay-snapshot.o diff --git a/replay/replay-internal.h b/replay/replay-internal.h index 9b02d7d6aa..e07eb7d45d 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -66,6 +66,8 @@ typedef struct ReplayState { unsigned int data_kind; /*! Flag which indicates that event is not processed yet. */ unsigned int has_unread_data; + /*! Temporary variable for saving current log offset. */ + uint64_t file_offset; } ReplayState; extern ReplayState replay_state; @@ -157,4 +159,11 @@ void replay_event_char_read_save(void *opaque); /*! Reads char event read from the file. */ void *replay_event_char_read_load(void); +/* VMState-related functions */ + +/* Registers replay VMState. + Should be called before virtual devices initialization + to make cached timers available for post_load functions. */ +void replay_vmstate_register(void); + #endif diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c new file mode 100644 index 0000000000..a17e80e26c --- /dev/null +++ b/replay/replay-snapshot.c @@ -0,0 +1,60 @@ +/* + * replay-snapshot.c + * + * Copyright (c) 2010-2016 Institute for System Programming + * of the Russian Academy of Sciences. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu-common.h" +#include "sysemu/replay.h" +#include "replay-internal.h" +#include "sysemu/sysemu.h" +#include "monitor/monitor.h" +#include "qapi/qmp/qstring.h" +#include "qemu/error-report.h" +#include "migration/vmstate.h" + +static void replay_pre_save(void *opaque) +{ + ReplayState *state = opaque; + state->file_offset = ftell(replay_file); +} + +static int replay_post_load(void *opaque, int version_id) +{ + ReplayState *state = opaque; + fseek(replay_file, state->file_offset, SEEK_SET); + /* If this was a vmstate, saved in recording mode, + we need to initialize replay data fields. 
+       */
+    replay_fetch_data_kind();
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_replay = {
+    .name = "replay",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .pre_save = replay_pre_save,
+    .post_load = replay_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_INT64_ARRAY(cached_clock, ReplayState, REPLAY_CLOCK_COUNT),
+        VMSTATE_UINT64(current_step, ReplayState),
+        VMSTATE_INT32(instructions_count, ReplayState),
+        VMSTATE_UINT32(data_kind, ReplayState),
+        VMSTATE_UINT32(has_unread_data, ReplayState),
+        VMSTATE_UINT64(file_offset, ReplayState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+void replay_vmstate_register(void)
+{
+    vmstate_register(NULL, 0, &vmstate_replay, &replay_state);
+}
diff --git a/replay/replay.c b/replay/replay.c
index cc2238d077..c797aeae8a 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -292,6 +292,7 @@ void replay_configure(QemuOpts *opts)
         exit(1);
     }
 
+    replay_vmstate_register();
     replay_enable(fname, mode);
 
 out:
--
cgit v1.2.3-55-g7522


From 6d0ceb80ffe18ad4b28aab7356f440636c0be7be Mon Sep 17 00:00:00 2001
From: Pavel Dovgalyuk
Date: Mon, 26 Sep 2016 11:08:16 +0300
Subject: replay: allow replay stopping and restarting

This patch fixes a bug with stopping and restarting replay
through the monitor.

Signed-off-by: Pavel Dovgalyuk
Message-Id: <20160926080815.6992.71818.stgit@PASHA-ISP>
Signed-off-by: Paolo Bonzini
---
 block/blkreplay.c        | 15 +++++----------
 cpus.c                   |  1 +
 include/sysemu/replay.h  |  4 ++++
 replay/replay-events.c   |  8 ++++++++
 replay/replay-internal.h |  6 ++++--
 replay/replay-snapshot.c |  1 +
 stubs/replay.c           |  5 +++++
 vl.c                     |  1 +
 8 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/block/blkreplay.c b/block/blkreplay.c
index 30f9d5ff6c..a741654d35 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -20,11 +20,6 @@ typedef struct Request {
     QEMUBH *bh;
 } Request;
 
-/* Next request id.
-   This counter is global, because requests from different
-   block devices should not get overlapping ids.
*/ -static uint64_t request_id; - static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -84,7 +79,7 @@ static void block_request_create(uint64_t reqid, BlockDriverState *bs, static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -95,7 +90,7 @@ static int coroutine_fn blkreplay_co_preadv(BlockDriverState *bs, static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -106,7 +101,7 @@ static int coroutine_fn blkreplay_co_pwritev(BlockDriverState *bs, static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int count, BdrvRequestFlags flags) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pwrite_zeroes(bs->file, offset, count, flags); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -117,7 +112,7 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs, static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_pdiscard(bs->file->bs, offset, count); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); @@ -127,7 +122,7 @@ static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs, static int coroutine_fn blkreplay_co_flush(BlockDriverState *bs) { - uint64_t reqid = request_id++; + uint64_t reqid = blkreplay_next_id(); int ret = bdrv_co_flush(bs->file->bs); block_request_create(reqid, bs, qemu_coroutine_self()); qemu_coroutine_yield(); diff --git a/cpus.c b/cpus.c index fbd70f59f7..b2fbe33304 100644 --- a/cpus.c +++ b/cpus.c @@ -750,6 +750,7 @@ static int do_vm_stop(RunState state) } bdrv_drain_all(); + replay_disable_events(); ret = blk_flush_all(); return ret; diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h index 0a88393d2b..f80d6d28e8 100644 --- a/include/sysemu/replay.h +++ b/include/sysemu/replay.h @@ -105,6 +105,8 @@ bool replay_checkpoint(ReplayCheckpoint checkpoint); /*! Disables storing events in the queue */ void replay_disable_events(void); +/*! Enables storing events in the queue */ +void replay_enable_events(void); /*! Returns true when saving events is enabled */ bool replay_events_enabled(void); /*! Adds bottom half event to the queue */ @@ -115,6 +117,8 @@ void replay_input_event(QemuConsole *src, InputEvent *evt); void replay_input_sync_event(void); /*! Adds block layer event to the queue */ void replay_block_event(QEMUBH *bh, uint64_t id); +/*! 
Returns ID for the next block event */ +uint64_t blkreplay_next_id(void); /* Character device */ diff --git a/replay/replay-events.c b/replay/replay-events.c index 4eb2ea3604..c513913671 100644 --- a/replay/replay-events.c +++ b/replay/replay-events.c @@ -309,3 +309,11 @@ bool replay_events_enabled(void) { return events_enabled; } + +uint64_t blkreplay_next_id(void) +{ + if (replay_events_enabled()) { + return replay_state.block_request_id++; + } + return 0; +} diff --git a/replay/replay-internal.h b/replay/replay-internal.h index e07eb7d45d..9117e442d0 100644 --- a/replay/replay-internal.h +++ b/replay/replay-internal.h @@ -68,6 +68,10 @@ typedef struct ReplayState { unsigned int has_unread_data; /*! Temporary variable for saving current log offset. */ uint64_t file_offset; + /*! Next block operation id. + This counter is global, because requests from different + block devices should not get overlapping ids. */ + uint64_t block_request_id; } ReplayState; extern ReplayState replay_state; @@ -123,8 +127,6 @@ void replay_read_next_clock(unsigned int kind); void replay_init_events(void); /*! Clears internal data structures for events handling */ void replay_finish_events(void); -/*! Enables storing events in the queue */ -void replay_enable_events(void); /*! Flushes events queue */ void replay_flush_events(void); /*! Clears events list before loading new VM state */ diff --git a/replay/replay-snapshot.c b/replay/replay-snapshot.c index a17e80e26c..498059734d 100644 --- a/replay/replay-snapshot.c +++ b/replay/replay-snapshot.c @@ -50,6 +50,7 @@ static const VMStateDescription vmstate_replay = { VMSTATE_UINT32(data_kind, ReplayState), VMSTATE_UINT32(has_unread_data, ReplayState), VMSTATE_UINT64(file_offset, ReplayState), + VMSTATE_UINT64(block_request_id, ReplayState), VMSTATE_END_OF_LIST() }, }; diff --git a/stubs/replay.c b/stubs/replay.c index de9fa1ec98..d9a6da99d2 100644 --- a/stubs/replay.c +++ b/stubs/replay.c @@ -67,3 +67,8 @@ void replay_char_read_all_save_buf(uint8_t *buf, int offset) void replay_block_event(QEMUBH *bh, uint64_t id) { } + +uint64_t blkreplay_next_id(void) +{ + return 0; +} diff --git a/vl.c b/vl.c index eda83fa93d..5759e0ad51 100644 --- a/vl.c +++ b/vl.c @@ -784,6 +784,7 @@ void vm_start(void) if (runstate_is_running()) { qapi_event_send_stop(&error_abort); } else { + replay_enable_events(); cpu_enable_ticks(); runstate_set(RUN_STATE_RUNNING); vm_state_notify(1, RUN_STATE_RUNNING); -- cgit v1.2.3-55-g7522
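Two details make the last patch work: the request-ID counter lives in
ReplayState rather than in a file-local static, so the vmstate registered
earlier saves and restores it, and callers obtain IDs through
blkreplay_next_id(), which only advances the counter while replay events are
enabled. Side by side (the static is the code the patch deletes; the
function is the one it adds):

/* Deleted: a translation-unit static is not covered by savevm, so after
 * loadvm the IDs handed out during replay would diverge from the ones
 * recorded in the log. */
static uint64_t request_id;

/* Added: the counter is part of replay_state, which vmstate_replay saves
 * via VMSTATE_UINT64(block_request_id, ReplayState), so record and replay
 * produce identical request IDs even across a snapshot. */
uint64_t blkreplay_next_id(void)
{
    if (replay_events_enabled()) {
        return replay_state.block_request_id++;
    }
    return 0;
}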