From 53f09a1076f5efbba7d751a8005e2fcf008606db Mon Sep 17 00:00:00 2001
From: Pavel Butsykin
Date: Fri, 3 Feb 2017 18:23:20 +0300
Subject: add 'release-ram' migrate capability

This feature frees the migrated memory on the source during postcopy-ram
migration. In the second step of postcopy-ram migration when the source vm
is put on pause we can free unnecessary memory. It will allow, in particular,
to start relaxing the memory stress on the source host in a load-balancing
scenario.

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Message-Id: <20170203152321.19739-3-pbutsykin@virtuozzo.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
   Manually merged in Pavel's 'migration: madvise error_report fixup!'
---
 include/migration/migration.h | 1 +
 include/migration/qemu-file.h | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 7528cc2fbc..b9b706a7e3 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -304,6 +304,7 @@ int migrate_add_blocker(Error *reason, Error **errp);
  */
 void migrate_del_blocker(Error *reason);
 
+bool migrate_release_ram(void);
 bool migrate_postcopy_ram(void);
 bool migrate_zero_blocks(void);
 
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index abedd466c9..0cd648a733 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -132,7 +132,8 @@ void qemu_put_byte(QEMUFile *f, int v);
  * put_buffer without copying the buffer.
  * The buffer should be available till it is sent asynchronously.
  */
-void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size);
+void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size,
+                           bool may_free);
 bool qemu_file_mode_is_not_valid(const char *mode);
 bool qemu_file_is_writable(QEMUFile *f);
 
-- 
cgit v1.2.3-55-g7522


From ced1c6166ef901a91578d54ff39f46b3e7ae870f Mon Sep 17 00:00:00 2001
From: Pavel Butsykin
Date: Fri, 3 Feb 2017 18:23:21 +0300
Subject: migration: discard non-dirty ram pages after the start of postcopy

After the start of postcopy migration there are some non-dirty pages which have
already been migrated. These pages are no longer needed on the source vm, so
we can free them; doing so doesn't hurt completing the migration.

Signed-off-by: Pavel Butsykin <pbutsykin@virtuozzo.com>
Message-Id: <20170203152321.19739-4-pbutsykin@virtuozzo.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/migration.h |  1 +
 migration/migration.c         |  4 ++++
 migration/ram.c               | 19 +++++++++++++++++++
 3 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/migration/migration.h b/include/migration/migration.h
index b9b706a7e3..71ce19062c 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -285,6 +285,7 @@ int ram_postcopy_send_discard_bitmap(MigrationState *ms);
 int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
                       uint64_t start, size_t length);
 int ram_postcopy_incoming_init(MigrationIncomingState *mis);
+void ram_postcopy_migrated_memory_release(MigrationState *ms);
 
 /**
  * @migrate_add_blocker - prevent migration from proceeding
diff --git a/migration/migration.c b/migration/migration.c
index 68afc07016..2a26a20aaf 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1722,6 +1722,10 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
      */
     qemu_savevm_send_ping(ms->to_dst_file, 4);
 
+    if (migrate_release_ram()) {
+        ram_postcopy_migrated_memory_release(ms);
+    }
+
     ret = qemu_file_get_error(ms->to_dst_file);
     if (ret) {
         error_report("postcopy_start: Migration stream errored");
diff --git a/migration/ram.c b/migration/ram.c
index c22209db30..67f2efbc59 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1537,6 +1537,25 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
 
 /* **** functions for postcopy ***** */
 
+void ram_postcopy_migrated_memory_release(MigrationState *ms)
+{
+    struct RAMBlock *block;
+    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        unsigned long first = block->offset >> TARGET_PAGE_BITS;
+        unsigned long range = first + (block->used_length >> TARGET_PAGE_BITS);
+        unsigned long run_start = find_next_zero_bit(bitmap, range, first);
+
+        while (run_start < range) {
+            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
+            ram_discard_range(NULL, block->idstr, run_start << TARGET_PAGE_BITS,
+                              (run_end - run_start) << TARGET_PAGE_BITS);
+            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
+        }
+    }
+}
+
 /*
  * Callback from postcopy_each_ram_send_discard for each RAMBlock
  * Note: At this point the 'unsentmap' is the processed bitmap combined
-- 
cgit v1.2.3-55-g7522


From 59046ec29ad4c24391bb9fe1fbdced33557aaa70 Mon Sep 17 00:00:00 2001
From: Halil Pasic
Date: Fri, 3 Feb 2017 18:52:17 +0100
Subject: migration: consolidate VMStateField.start

The member VMStateField.start is used for two things, partial data
migration for VBUFFER data (basically provide migration for a
sub-buffer) and for locating next in QTAILQ.

The implementation of the VBUFFER feature is broken when VMSTATE_ALLOC
is used. This however goes unnoticed because actually partial migration
for VBUFFER is not used at all.

Let's consolidate the usage of VMStateField.start by removing support
for partial migration for VBUFFER.

Signed-off-by: Halil Pasic <pasic@linux.vnet.ibm.com>

Message-Id: <20170203175217.45562-1-pasic@linux.vnet.ibm.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 hw/char/exynos4210_uart.c   |  2 +-
 hw/display/g364fb.c         |  2 +-
 hw/dma/pl330.c              |  8 ++++----
 hw/intc/exynos4210_gic.c    |  2 +-
 hw/ipmi/isa_ipmi_bt.c       |  6 ++----
 hw/net/vmxnet3.c            |  2 +-
 hw/nvram/mac_nvram.c        |  2 +-
 hw/nvram/spapr_nvram.c      |  2 +-
 hw/sd/sdhci.c               |  2 +-
 hw/timer/m48t59.c           |  2 +-
 include/migration/vmstate.h | 21 ++++++++-------------
 migration/savevm.c          |  2 +-
 migration/vmstate.c         |  4 ++--
 target/s390x/machine.c      |  2 +-
 util/fifo8.c                |  2 +-
 15 files changed, 27 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/hw/char/exynos4210_uart.c b/hw/char/exynos4210_uart.c
index 7c16e894e2..b75f28d473 100644
--- a/hw/char/exynos4210_uart.c
+++ b/hw/char/exynos4210_uart.c
@@ -561,7 +561,7 @@ static const VMStateDescription vmstate_exynos4210_uart_fifo = {
     .fields = (VMStateField[]) {
         VMSTATE_UINT32(sp, Exynos4210UartFIFO),
         VMSTATE_UINT32(rp, Exynos4210UartFIFO),
-        VMSTATE_VBUFFER_UINT32(data, Exynos4210UartFIFO, 1, NULL, 0, size),
+        VMSTATE_VBUFFER_UINT32(data, Exynos4210UartFIFO, 1, NULL, size),
         VMSTATE_END_OF_LIST()
     }
 };
diff --git a/hw/display/g364fb.c b/hw/display/g364fb.c
index 70ef2c7453..8cdc205dd9 100644
--- a/hw/display/g364fb.c
+++ b/hw/display/g364fb.c
@@ -464,7 +464,7 @@ static const VMStateDescription vmstate_g364fb = {
     .minimum_version_id = 1,
     .post_load = g364fb_post_load,
     .fields = (VMStateField[]) {
-        VMSTATE_VBUFFER_UINT32(vram, G364State, 1, NULL, 0, vram_size),
+        VMSTATE_VBUFFER_UINT32(vram, G364State, 1, NULL, vram_size),
         VMSTATE_BUFFER_UNSAFE(color_palette, G364State, 0, 256 * 3),
         VMSTATE_BUFFER_UNSAFE(cursor_palette, G364State, 0, 9),
         VMSTATE_UINT16_ARRAY(cursor, G364State, 512),
diff --git a/hw/dma/pl330.c b/hw/dma/pl330.c
index c0bd9fec30..32cf8399b8 100644
--- a/hw/dma/pl330.c
+++ b/hw/dma/pl330.c
@@ -173,8 +173,8 @@ static const VMStateDescription vmstate_pl330_fifo = {
     .version_id = 1,
     .minimum_version_id = 1,
     .fields = (VMStateField[]) {
-        VMSTATE_VBUFFER_UINT32(buf, PL330Fifo, 1, NULL, 0, buf_size),
-        VMSTATE_VBUFFER_UINT32(tag, PL330Fifo, 1, NULL, 0, buf_size),
+        VMSTATE_VBUFFER_UINT32(buf, PL330Fifo, 1, NULL, buf_size),
+        VMSTATE_VBUFFER_UINT32(tag, PL330Fifo, 1, NULL, buf_size),
         VMSTATE_UINT32(head, PL330Fifo),
         VMSTATE_UINT32(num, PL330Fifo),
         VMSTATE_UINT32(buf_size, PL330Fifo),
@@ -282,8 +282,8 @@ static const VMStateDescription vmstate_pl330 = {
         VMSTATE_STRUCT(manager, PL330State, 0, vmstate_pl330_chan, PL330Chan),
         VMSTATE_STRUCT_VARRAY_UINT32(chan, PL330State, num_chnls, 0,
                                      vmstate_pl330_chan, PL330Chan),
-        VMSTATE_VBUFFER_UINT32(lo_seqn, PL330State, 1, NULL, 0, num_chnls),
-        VMSTATE_VBUFFER_UINT32(hi_seqn, PL330State, 1, NULL, 0, num_chnls),
+        VMSTATE_VBUFFER_UINT32(lo_seqn, PL330State, 1, NULL, num_chnls),
+        VMSTATE_VBUFFER_UINT32(hi_seqn, PL330State, 1, NULL, num_chnls),
         VMSTATE_STRUCT(fifo, PL330State, 0, vmstate_pl330_fifo, PL330Fifo),
         VMSTATE_STRUCT(read_queue, PL330State, 0, vmstate_pl330_queue,
                        PL330Queue),
diff --git a/hw/intc/exynos4210_gic.c b/hw/intc/exynos4210_gic.c
index fd7a8f3058..2a55817b76 100644
--- a/hw/intc/exynos4210_gic.c
+++ b/hw/intc/exynos4210_gic.c
@@ -393,7 +393,7 @@ static const VMStateDescription vmstate_exynos4210_irq_gate = {
     .version_id = 2,
     .minimum_version_id = 2,
     .fields = (VMStateField[]) {
-        VMSTATE_VBUFFER_UINT32(level, Exynos4210IRQGateState, 1, NULL, 0, n_in),
+        VMSTATE_VBUFFER_UINT32(level, Exynos4210IRQGateState, 1, NULL, n_in),
         VMSTATE_END_OF_LIST()
     }
 };
diff --git a/hw/ipmi/isa_ipmi_bt.c b/hw/ipmi/isa_ipmi_bt.c
index f03661715c..1c69cb33f8 100644
--- a/hw/ipmi/isa_ipmi_bt.c
+++ b/hw/ipmi/isa_ipmi_bt.c
@@ -471,10 +471,8 @@ static const VMStateDescription vmstate_ISAIPMIBTDevice = {
         VMSTATE_BOOL(bt.use_irq, ISAIPMIBTDevice),
         VMSTATE_BOOL(bt.irqs_enabled, ISAIPMIBTDevice),
         VMSTATE_UINT32(bt.outpos, ISAIPMIBTDevice),
-        VMSTATE_VBUFFER_UINT32(bt.outmsg, ISAIPMIBTDevice, 1, NULL, 0,
-                               bt.outlen),
-        VMSTATE_VBUFFER_UINT32(bt.inmsg, ISAIPMIBTDevice, 1, NULL, 0,
-                               bt.inlen),
+        VMSTATE_VBUFFER_UINT32(bt.outmsg, ISAIPMIBTDevice, 1, NULL, bt.outlen),
+        VMSTATE_VBUFFER_UINT32(bt.inmsg, ISAIPMIBTDevice, 1, NULL, bt.inlen),
         VMSTATE_UINT8(bt.control_reg, ISAIPMIBTDevice),
         VMSTATE_UINT8(bt.mask_reg, ISAIPMIBTDevice),
         VMSTATE_UINT8(bt.waiting_rsp, ISAIPMIBTDevice),
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 7dd456551c..e13a798b3b 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -2397,7 +2397,7 @@ static const VMStateDescription vmxstate_vmxnet3_mcast_list = {
     .pre_load = vmxnet3_mcast_list_pre_load,
     .needed = vmxnet3_mc_list_needed,
     .fields = (VMStateField[]) {
-        VMSTATE_VBUFFER_UINT32(mcast_list, VMXNET3State, 0, NULL, 0,
+        VMSTATE_VBUFFER_UINT32(mcast_list, VMXNET3State, 0, NULL,
             mcast_list_buff_size),
         VMSTATE_END_OF_LIST()
     }
diff --git a/hw/nvram/mac_nvram.c b/hw/nvram/mac_nvram.c
index 63f9ed1d82..aef80e64df 100644
--- a/hw/nvram/mac_nvram.c
+++ b/hw/nvram/mac_nvram.c
@@ -82,7 +82,7 @@ static const VMStateDescription vmstate_macio_nvram = {
     .version_id = 1,
     .minimum_version_id = 1,
     .fields = (VMStateField[]) {
-        VMSTATE_VBUFFER_UINT32(data, MacIONVRAMState, 0, NULL, 0, size),
+        VMSTATE_VBUFFER_UINT32(data, MacIONVRAMState, 0, NULL, size),
         VMSTATE_END_OF_LIST()
     }
 };
diff --git a/hw/nvram/spapr_nvram.c b/hw/nvram/spapr_nvram.c
index eb42ea323f..65ba188555 100644
--- a/hw/nvram/spapr_nvram.c
+++ b/hw/nvram/spapr_nvram.c
@@ -224,7 +224,7 @@ static const VMStateDescription vmstate_spapr_nvram = {
     .post_load = spapr_nvram_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT32(size, sPAPRNVRAM),
-        VMSTATE_VBUFFER_ALLOC_UINT32(buf, sPAPRNVRAM, 1, NULL, 0, size),
+        VMSTATE_VBUFFER_ALLOC_UINT32(buf, sPAPRNVRAM, 1, NULL, size),
         VMSTATE_END_OF_LIST()
     },
 };
diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 5bd5ab6319..da32b5f709 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -1253,7 +1253,7 @@ const VMStateDescription sdhci_vmstate = {
         VMSTATE_UINT16(data_count, SDHCIState),
         VMSTATE_UINT64(admasysaddr, SDHCIState),
         VMSTATE_UINT8(stopped_state, SDHCIState),
-        VMSTATE_VBUFFER_UINT32(fifo_buffer, SDHCIState, 1, NULL, 0, buf_maxsz),
+        VMSTATE_VBUFFER_UINT32(fifo_buffer, SDHCIState, 1, NULL, buf_maxsz),
         VMSTATE_TIMER_PTR(insert_timer, SDHCIState),
         VMSTATE_TIMER_PTR(transfer_timer, SDHCIState),
         VMSTATE_END_OF_LIST()
diff --git a/hw/timer/m48t59.c b/hw/timer/m48t59.c
index 015797732f..474981a6ac 100644
--- a/hw/timer/m48t59.c
+++ b/hw/timer/m48t59.c
@@ -563,7 +563,7 @@ static const VMStateDescription vmstate_m48t59 = {
     .fields = (VMStateField[]) {
         VMSTATE_UINT8(lock, M48t59State),
         VMSTATE_UINT16(addr, M48t59State),
-        VMSTATE_VBUFFER_UINT32(buffer, M48t59State, 0, NULL, 0, size),
+        VMSTATE_VBUFFER_UINT32(buffer, M48t59State, 0, NULL, size),
         VMSTATE_END_OF_LIST()
     }
 };
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 6233fe2e5b..39db47eb3f 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -587,7 +587,8 @@ extern const VMStateInfo vmstate_info_qtailq;
     .offset       = vmstate_offset_buffer(_state, _field) + _start,  \
 }
 
-#define VMSTATE_VBUFFER_MULTIPLY(_field, _state, _version, _test, _start, _field_size, _multiply) { \
+#define VMSTATE_VBUFFER_MULTIPLY(_field, _state, _version, _test,    \
+                                 _field_size, _multiply) {           \
     .name         = (stringify(_field)),                             \
     .version_id   = (_version),                                      \
     .field_exists = (_test),                                         \
@@ -596,10 +597,9 @@ extern const VMStateInfo vmstate_info_qtailq;
     .info         = &vmstate_info_buffer,                            \
     .flags        = VMS_VBUFFER|VMS_POINTER|VMS_MULTIPLY,            \
     .offset       = offsetof(_state, _field),                        \
-    .start        = (_start),                                        \
 }
 
-#define VMSTATE_VBUFFER(_field, _state, _version, _test, _start, _field_size) { \
+#define VMSTATE_VBUFFER(_field, _state, _version, _test, _field_size) { \
     .name         = (stringify(_field)),                             \
     .version_id   = (_version),                                      \
     .field_exists = (_test),                                         \
@@ -607,10 +607,9 @@ extern const VMStateInfo vmstate_info_qtailq;
     .info         = &vmstate_info_buffer,                            \
     .flags        = VMS_VBUFFER|VMS_POINTER,                         \
     .offset       = offsetof(_state, _field),                        \
-    .start        = (_start),                                        \
 }
 
-#define VMSTATE_VBUFFER_UINT32(_field, _state, _version, _test, _start, _field_size) { \
+#define VMSTATE_VBUFFER_UINT32(_field, _state, _version, _test, _field_size) { \
     .name         = (stringify(_field)),                             \
     .version_id   = (_version),                                      \
     .field_exists = (_test),                                         \
@@ -618,10 +617,10 @@ extern const VMStateInfo vmstate_info_qtailq;
     .info         = &vmstate_info_buffer,                            \
     .flags        = VMS_VBUFFER|VMS_POINTER,                         \
     .offset       = offsetof(_state, _field),                        \
-    .start        = (_start),                                        \
 }
 
-#define VMSTATE_VBUFFER_ALLOC_UINT32(_field, _state, _version, _test, _start, _field_size) { \
+#define VMSTATE_VBUFFER_ALLOC_UINT32(_field, _state, _version,       \
+                                     _test, _field_size) {           \
     .name         = (stringify(_field)),                             \
     .version_id   = (_version),                                      \
     .field_exists = (_test),                                         \
@@ -629,7 +628,6 @@ extern const VMStateInfo vmstate_info_qtailq;
     .info         = &vmstate_info_buffer,                            \
     .flags        = VMS_VBUFFER|VMS_POINTER|VMS_ALLOC,               \
     .offset       = offsetof(_state, _field),                        \
-    .start        = (_start),                                        \
 }
 
 #define VMSTATE_BUFFER_UNSAFE_INFO_TEST(_field, _state, _test, _version, _info, _size) { \
@@ -948,13 +946,10 @@ extern const VMStateInfo vmstate_info_qtailq;
     VMSTATE_BUFFER_START_MIDDLE_V(_f, _s, _start, 0)
 
 #define VMSTATE_PARTIAL_VBUFFER(_f, _s, _size)                        \
-    VMSTATE_VBUFFER(_f, _s, 0, NULL, 0, _size)
+    VMSTATE_VBUFFER(_f, _s, 0, NULL, _size)
 
 #define VMSTATE_PARTIAL_VBUFFER_UINT32(_f, _s, _size)                        \
-    VMSTATE_VBUFFER_UINT32(_f, _s, 0, NULL, 0, _size)
-
-#define VMSTATE_SUB_VBUFFER(_f, _s, _start, _size)                    \
-    VMSTATE_VBUFFER(_f, _s, 0, NULL, _start, _size)
+    VMSTATE_VBUFFER_UINT32(_f, _s, 0, NULL, _size)
 
 #define VMSTATE_BUFFER_TEST(_f, _s, _test)                            \
     VMSTATE_STATIC_BUFFER(_f, _s, 0, _test, 0, sizeof(typeof_field(_s, _f)))
diff --git a/migration/savevm.c b/migration/savevm.c
index 01997687c4..5ecd264134 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -356,7 +356,7 @@ static const VMStateDescription vmstate_configuration = {
     .pre_save = configuration_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT32(len, SaveState),
-        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len),
+        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
         VMSTATE_END_OF_LIST()
     },
     .subsections = (const VMStateDescription*[]) {
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 2b2b3a58e6..520341a2de 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -68,10 +68,10 @@ static void *vmstate_base_addr(void *opaque, VMStateField *field, bool alloc)
                 }
             }
             if (size) {
-                *((void **)base_addr + field->start) = g_malloc(size);
+                *(void **)base_addr = g_malloc(size);
             }
         }
-        base_addr = *(void **)base_addr + field->start;
+        base_addr = *(void **)base_addr;
     }
 
     return base_addr;
diff --git a/target/s390x/machine.c b/target/s390x/machine.c
index edc3a4717b..8503fa1c8d 100644
--- a/target/s390x/machine.c
+++ b/target/s390x/machine.c
@@ -180,7 +180,7 @@ const VMStateDescription vmstate_s390_cpu = {
         VMSTATE_UINT8(env.cpu_state, S390CPU),
         VMSTATE_UINT8(env.sigp_order, S390CPU),
         VMSTATE_UINT32_V(irqstate_saved_size, S390CPU, 4),
-        VMSTATE_VBUFFER_UINT32(irqstate, S390CPU, 4, NULL, 0,
+        VMSTATE_VBUFFER_UINT32(irqstate, S390CPU, 4, NULL,
                                irqstate_saved_size),
         VMSTATE_END_OF_LIST()
     },
diff --git a/util/fifo8.c b/util/fifo8.c
index 5c64101b33..d38b3bdaa5 100644
--- a/util/fifo8.c
+++ b/util/fifo8.c
@@ -118,7 +118,7 @@ const VMStateDescription vmstate_fifo8 = {
     .version_id = 1,
     .minimum_version_id = 1,
     .fields = (VMStateField[]) {
-        VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, 0, capacity),
+        VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, capacity),
         VMSTATE_UINT32(head, Fifo8),
         VMSTATE_UINT32(num, Fifo8),
         VMSTATE_END_OF_LIST()
-- 
cgit v1.2.3-55-g7522


From 479125d53eb8509d69a0548f131028a65fcbd65a Mon Sep 17 00:00:00 2001
From: zhanghailiang
Date: Tue, 17 Jan 2017 20:57:42 +0800
Subject: COLO: fix setting checkpoint-delay not working properly

If checkpoint-delay is changed through the 'migrate-set-parameters'
command, the new value does not take effect until the sleep for the
previous checkpoint-delay has finished. That is especially annoying
when we want to change it from an extremely large value to a proper one.

Fix it by using a timer to implement checkpoint-delay.

Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Message-Id: <1484657864-21708-2-git-send-email-zhang.zhanghailiang@huawei.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/colo.h      |  2 ++
 include/migration/migration.h |  5 +++++
 migration/colo.c              | 33 +++++++++++++++++++++++----------
 migration/migration.c         |  3 +++
 4 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/migration/colo.h b/include/migration/colo.h
index e32eef4763..2bbff9e6c2 100644
--- a/include/migration/colo.h
+++ b/include/migration/colo.h
@@ -35,4 +35,6 @@ COLOMode get_colo_mode(void);
 
 /* failover */
 void colo_do_failover(MigrationState *s);
+
+void colo_checkpoint_notify(void *opaque);
 #endif
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 71ce19062c..cb83f1688e 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -188,6 +188,11 @@ struct MigrationState
     /* The RAMBlock used in the last src_page_request */
     RAMBlock *last_req_rb;
 
+    /* The semaphore is used to notify COLO thread to do checkpoint */
+    QemuSemaphore colo_checkpoint_sem;
+    int64_t colo_checkpoint_time;
+    QEMUTimer *colo_delay_timer;
+
     /* The last error that occurred */
     Error *error;
 };
diff --git a/migration/colo.c b/migration/colo.c
index 93c85c538b..08b2e46dac 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -302,7 +302,7 @@ static void colo_process_checkpoint(MigrationState *s)
 {
     QIOChannelBuffer *bioc;
     QEMUFile *fb = NULL;
-    int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     Error *local_err = NULL;
     int ret;
 
@@ -332,26 +332,21 @@ static void colo_process_checkpoint(MigrationState *s)
     qemu_mutex_unlock_iothread();
     trace_colo_vm_state_change("stop", "run");
 
+    timer_mod(s->colo_delay_timer,
+            current_time + s->parameters.x_checkpoint_delay);
+
     while (s->state == MIGRATION_STATUS_COLO) {
         if (failover_get_state() != FAILOVER_STATUS_NONE) {
             error_report("failover request");
             goto out;
         }
 
-        current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
-        if (current_time - checkpoint_time <
-            s->parameters.x_checkpoint_delay) {
-            int64_t delay_ms;
+        qemu_sem_wait(&s->colo_checkpoint_sem);
 
-            delay_ms = s->parameters.x_checkpoint_delay -
-                       (current_time - checkpoint_time);
-            g_usleep(delay_ms * 1000);
-        }
         ret = colo_do_checkpoint_transaction(s, bioc, fb);
         if (ret < 0) {
             goto out;
         }
-        checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     }
 
 out:
@@ -364,14 +359,32 @@ out:
         qemu_fclose(fb);
     }
 
+    timer_del(s->colo_delay_timer);
+
     if (s->rp_state.from_dst_file) {
         qemu_fclose(s->rp_state.from_dst_file);
     }
 }
 
+void colo_checkpoint_notify(void *opaque)
+{
+    MigrationState *s = opaque;
+    int64_t next_notify_time;
+
+    qemu_sem_post(&s->colo_checkpoint_sem);
+    s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+    next_notify_time = s->colo_checkpoint_time +
+                    s->parameters.x_checkpoint_delay;
+    timer_mod(s->colo_delay_timer, next_notify_time);
+}
+
 void migrate_start_colo_process(MigrationState *s)
 {
     qemu_mutex_unlock_iothread();
+    qemu_sem_init(&s->colo_checkpoint_sem, 0);
+    s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
+                                colo_checkpoint_notify, s);
+
     migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_COLO);
     colo_process_checkpoint(s);
diff --git a/migration/migration.c b/migration/migration.c
index 2a26a20aaf..c6ae69d371 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -891,6 +891,9 @@ void qmp_migrate_set_parameters(MigrationParameters *params, Error **errp)
 
     if (params->has_x_checkpoint_delay) {
         s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
+        if (migration_in_colo_state()) {
+            colo_checkpoint_notify(s);
+        }
     }
 }
 
-- 
cgit v1.2.3-55-g7522


From c937b9a6db2d564b96aae35a6757bb4144ea5184 Mon Sep 17 00:00:00 2001
From: zhanghailiang
Date: Tue, 17 Jan 2017 20:57:43 +0800
Subject: COLO: Shutdown related socket fd while do failover

If the network connection between the primary host and the secondary
host breaks while the COLO/COLO incoming threads are in read() or
write(), the call blocks until the connection times out, and the
failover process is blocked along with it.

So it is necessary to shut down all the socket fds used by COLO
to avoid this situation. Besides, we should close the corresponding
file descriptors only after the failover BH has shut them down,
or there will be an error.

Signed-off-by: zhanghailiang <zhang.zhanghailiang@huawei.com>
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <1484657864-21708-3-git-send-email-zhang.zhanghailiang@huawei.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/migration.h |  3 +++
 migration/colo.c              | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/include/migration/migration.h b/include/migration/migration.h
index cb83f1688e..1735d66512 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -116,6 +116,7 @@ struct MigrationIncomingState {
     QemuThread colo_incoming_thread;
     /* The coroutine we should enter (back) after failover */
     Coroutine *migration_incoming_co;
+    QemuSemaphore colo_incoming_sem;
 
     /* See savevm.c */
     LoadStateEntry_Head loadvm_handlers;
@@ -187,6 +188,8 @@ struct MigrationState
     QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests;
     /* The RAMBlock used in the last src_page_request */
     RAMBlock *last_req_rb;
+    /* The semaphore is used to notify COLO thread that failover is finished */
+    QemuSemaphore colo_exit_sem;
 
     /* The semaphore is used to notify COLO thread to do checkpoint */
     QemuSemaphore colo_checkpoint_sem;
diff --git a/migration/colo.c b/migration/colo.c
index 08b2e46dac..3222812d96 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -59,6 +59,18 @@ static void secondary_vm_do_failover(void)
         /* recover runstate to normal migration finish state */
         autostart = true;
     }
+    /*
+     * Make sure the COLO incoming thread is not blocked in recv or send.
+     * If mis->from_src_file and mis->to_src_file use the same fd,
+     * the second shutdown() will return -1; we ignore this value
+     * because it is harmless.
+     */
+    if (mis->from_src_file) {
+        qemu_file_shutdown(mis->from_src_file);
+    }
+    if (mis->to_src_file) {
+        qemu_file_shutdown(mis->to_src_file);
+    }
 
     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
                                    FAILOVER_STATUS_COMPLETED);
@@ -67,6 +79,8 @@ static void secondary_vm_do_failover(void)
                      "secondary VM", FailoverStatus_lookup[old_state]);
         return;
     }
+    /* Notify COLO incoming thread that failover work is finished */
+    qemu_sem_post(&mis->colo_incoming_sem);
     /* For Secondary VM, jump to incoming co */
     if (mis->migration_incoming_co) {
         qemu_coroutine_enter(mis->migration_incoming_co);
@@ -81,6 +95,18 @@ static void primary_vm_do_failover(void)
     migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
                       MIGRATION_STATUS_COMPLETED);
 
+    /*
+     * Wake up the COLO thread, which may be blocked in recv() or send().
+     * s->rp_state.from_dst_file and s->to_dst_file may use the same fd,
+     * but shutting down the fd twice is harmless.
+     */
+    if (s->to_dst_file) {
+        qemu_file_shutdown(s->to_dst_file);
+    }
+    if (s->rp_state.from_dst_file) {
+        qemu_file_shutdown(s->rp_state.from_dst_file);
+    }
+
     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
                                    FAILOVER_STATUS_COMPLETED);
     if (old_state != FAILOVER_STATUS_ACTIVE) {
@@ -88,6 +114,8 @@ static void primary_vm_do_failover(void)
                      FailoverStatus_lookup[old_state]);
         return;
     }
+    /* Notify COLO thread that failover work is finished */
+    qemu_sem_post(&s->colo_exit_sem);
 }
 
 void colo_do_failover(MigrationState *s)
@@ -361,6 +389,14 @@ out:
 
     timer_del(s->colo_delay_timer);
 
+    /* Hope this not to be too long to wait here */
+    qemu_sem_wait(&s->colo_exit_sem);
+    qemu_sem_destroy(&s->colo_exit_sem);
+    /*
+     * Must be called after the failover BH has completed,
+     * or the failover BH may shut down the wrong fd, one that was
+     * re-used by another thread after we released it here.
+     */
     if (s->rp_state.from_dst_file) {
         qemu_fclose(s->rp_state.from_dst_file);
     }
@@ -385,6 +421,7 @@ void migrate_start_colo_process(MigrationState *s)
     s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
                                 colo_checkpoint_notify, s);
 
+    qemu_sem_init(&s->colo_exit_sem, 0);
     migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_COLO);
     colo_process_checkpoint(s);
@@ -423,6 +460,8 @@ void *colo_process_incoming_thread(void *opaque)
     uint64_t value;
     Error *local_err = NULL;
 
+    qemu_sem_init(&mis->colo_incoming_sem, 0);
+
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                       MIGRATION_STATUS_COLO);
 
@@ -533,6 +572,10 @@ out:
         qemu_fclose(fb);
     }
 
+    /* Hope this not to be too long to loop here */
+    qemu_sem_wait(&mis->colo_incoming_sem);
+    qemu_sem_destroy(&mis->colo_incoming_sem);
+    /* Must be called after failover BH is completed */
     if (mis->to_src_file) {
         qemu_fclose(mis->to_src_file);
     }
-- 
cgit v1.2.3-55-g7522


From b5b5c569570c414bf0aa80a5ae9480debe07ed58 Mon Sep 17 00:00:00 2001
From: Dr. David Alan Gilbert
Date: Fri, 3 Feb 2017 16:06:48 +0000
Subject: migration: Add VMSTATE_UNUSED_VARRAY_UINT32

VMSTATE_UNUSED_VARRAY_UINT32 is used to skip a chunk of the stream
that's an n-element array;  note the array size and the dynamic value
read never get multiplied so there's no overflow risk.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Message-Id: <20170203160651.19917-2-dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/vmstate.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 39db47eb3f..7339594337 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -658,6 +658,17 @@ extern const VMStateInfo vmstate_info_qtailq;
     .flags        = VMS_BUFFER,                                      \
 }
 
+/* Discard size * field_num bytes, where field_num is a uint32 member */
+#define VMSTATE_UNUSED_VARRAY_UINT32(_state, _test, _version, _field_num, _size) {\
+    .name         = "unused",                                        \
+    .field_exists = (_test),                                         \
+    .num_offset   = vmstate_offset_value(_state, _field_num, uint32_t),\
+    .version_id   = (_version),                                      \
+    .size         = (_size),                                         \
+    .info         = &vmstate_info_unused_buffer,                     \
+    .flags        = VMS_VARRAY_UINT32 | VMS_BUFFER,                  \
+}
+
 /* _field_size should be a int32_t field in the _state struct giving the
  * size of the bitmap _field in bits.
  */
-- 
cgit v1.2.3-55-g7522


From bcf45131293664118355df03d2ce5458156deaad Mon Sep 17 00:00:00 2001
From: Dr. David Alan Gilbert
Date: Fri, 3 Feb 2017 16:06:49 +0000
Subject: migration: Add VMSTATE_WITH_TMP

VMSTATE_WITH_TMP is for handling structures where some calculation
or rearrangement of the data needs to be performed before the data
hits the wire.
For example, where the value on the wire is an offset from a
non-migrated base, but the data in the structure is the actual pointer.

To use it, a temporary type is created and a vmsd used on that type.
The first element of the type must be 'parent', a pointer back to the
type of the main structure.  VMSTATE_WITH_TMP takes care of allocating
the temporary before running the child vmsd and freeing it afterwards.

The post_load/pre_save on the child vmsd can copy things from the parent
to the temporary using the parent pointer and do any other calculations
needed; it can then use normal VMSD entries to do the actual data
storage without having to fiddle around with qemu_get_*/qemu_put_*.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Message-Id: <20170203160651.19917-3-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/vmstate.h | 19 +++++++++++++++++++
 migration/vmstate.c         | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

(limited to 'include')

diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 7339594337..63e7b02e05 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -259,6 +259,7 @@ extern const VMStateInfo vmstate_info_cpudouble;
 extern const VMStateInfo vmstate_info_timer;
 extern const VMStateInfo vmstate_info_buffer;
 extern const VMStateInfo vmstate_info_unused_buffer;
+extern const VMStateInfo vmstate_info_tmp;
 extern const VMStateInfo vmstate_info_bitmap;
 extern const VMStateInfo vmstate_info_qtailq;
 
@@ -649,6 +650,24 @@ extern const VMStateInfo vmstate_info_qtailq;
     .offset     = offsetof(_state, _field),                          \
 }
 
+/* Allocate a temporary of type 'tmp_type', set tmp->parent to _state
+ * and execute the vmsd on the temporary.  Note that we're working with
+ * the whole of _state here, not a field within it.
+ * At compile time we check:
+ *    That _tmp_type contains a 'parent' member that's a pointer to the
+ *        '_state' type
+ *    That the pointer is right at the start of _tmp_type.
+ */
+#define VMSTATE_WITH_TMP(_state, _tmp_type, _vmsd) {                 \
+    .name         = "tmp",                                           \
+    .size         = sizeof(_tmp_type) +                              \
+                    QEMU_BUILD_BUG_ON_ZERO(offsetof(_tmp_type, parent) != 0) + \
+                    type_check_pointer(_state,                       \
+                        typeof_field(_tmp_type, parent)),            \
+    .vmsd         = &(_vmsd),                                        \
+    .info         = &vmstate_info_tmp,                               \
+}
+
 #define VMSTATE_UNUSED_BUFFER(_test, _version, _size) {              \
     .name         = "unused",                                        \
     .field_exists = (_test),                                         \
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 520341a2de..b4d8ae982a 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -935,6 +935,46 @@ const VMStateInfo vmstate_info_unused_buffer = {
     .put  = put_unused_buffer,
 };
 
+/* vmstate_info_tmp, see VMSTATE_WITH_TMP.  The idea is that we allocate
+ * a temporary buffer and the pre_load/pre_save methods in the child vmsd
+ * copy stuff from the parent into the child and do calculations to fill
+ * in fields that don't really exist in the parent but need to be in the
+ * stream.
+ */
+static int get_tmp(QEMUFile *f, void *pv, size_t size, VMStateField *field)
+{
+    int ret;
+    const VMStateDescription *vmsd = field->vmsd;
+    int version_id = field->version_id;
+    void *tmp = g_malloc(size);
+
+    /* Writes the parent field which is at the start of the tmp */
+    *(void **)tmp = pv;
+    ret = vmstate_load_state(f, vmsd, tmp, version_id);
+    g_free(tmp);
+    return ret;
+}
+
+static int put_tmp(QEMUFile *f, void *pv, size_t size, VMStateField *field,
+                    QJSON *vmdesc)
+{
+    const VMStateDescription *vmsd = field->vmsd;
+    void *tmp = g_malloc(size);
+
+    /* Writes the parent field which is at the start of the tmp */
+    *(void **)tmp = pv;
+    vmstate_save_state(f, vmsd, tmp, vmdesc);
+    g_free(tmp);
+
+    return 0;
+}
+
+const VMStateInfo vmstate_info_tmp = {
+    .name = "tmp",
+    .get = get_tmp,
+    .put = put_tmp,
+};
+
 /* bitmaps (as defined by bitmap.h). Note that size here is the size
  * of the bitmap in bits. The on-the-wire format of a bitmap is 64
  * bit words with the bits in big endian order. The in-memory format
-- 
cgit v1.2.3-55-g7522


From 982b78c5e37864c06fd7b5f156d80bf02628a855 Mon Sep 17 00:00:00 2001
From: Dr. David Alan Gilbert
Date: Fri, 3 Feb 2017 16:06:51 +0000
Subject: virtio/migration: Migrate virtio-net to VMState

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Message-Id: <20170203160651.19917-5-dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
  Merge fix against Halil's removal of the '_start' field in
    VMSTATE_VBUFFER_MULTIPLY
---
 hw/net/virtio-net.c            | 316 +++++++++++++++++++++++++++--------------
 include/hw/virtio/virtio-net.h |   4 +-
 2 files changed, 213 insertions(+), 107 deletions(-)

(limited to 'include')

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 7b3ad4a9f0..354a19eab8 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1557,119 +1557,22 @@ static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue)
     virtio_net_set_queues(n);
 }
 
-static void virtio_net_save_device(VirtIODevice *vdev, QEMUFile *f)
+static int virtio_net_post_load_device(void *opaque, int version_id)
 {
-    VirtIONet *n = VIRTIO_NET(vdev);
-    int i;
-
-    qemu_put_buffer(f, n->mac, ETH_ALEN);
-    qemu_put_be32(f, n->vqs[0].tx_waiting);
-    qemu_put_be32(f, n->mergeable_rx_bufs);
-    qemu_put_be16(f, n->status);
-    qemu_put_byte(f, n->promisc);
-    qemu_put_byte(f, n->allmulti);
-    qemu_put_be32(f, n->mac_table.in_use);
-    qemu_put_buffer(f, n->mac_table.macs, n->mac_table.in_use * ETH_ALEN);
-    qemu_put_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3);
-    qemu_put_be32(f, n->has_vnet_hdr);
-    qemu_put_byte(f, n->mac_table.multi_overflow);
-    qemu_put_byte(f, n->mac_table.uni_overflow);
-    qemu_put_byte(f, n->alluni);
-    qemu_put_byte(f, n->nomulti);
-    qemu_put_byte(f, n->nouni);
-    qemu_put_byte(f, n->nobcast);
-    qemu_put_byte(f, n->has_ufo);
-    if (n->max_queues > 1) {
-        qemu_put_be16(f, n->max_queues);
-        qemu_put_be16(f, n->curr_queues);
-        for (i = 1; i < n->curr_queues; i++) {
-            qemu_put_be32(f, n->vqs[i].tx_waiting);
-        }
-    }
-
-    if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
-        qemu_put_be64(f, n->curr_guest_offloads);
-    }
-}
-
-static int virtio_net_load_device(VirtIODevice *vdev, QEMUFile *f,
-                                  int version_id)
-{
-    VirtIONet *n = VIRTIO_NET(vdev);
+    VirtIONet *n = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(n);
     int i, link_down;
 
-    qemu_get_buffer(f, n->mac, ETH_ALEN);
-    n->vqs[0].tx_waiting = qemu_get_be32(f);
-
-    virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f),
+    virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs,
                                virtio_vdev_has_feature(vdev,
                                                        VIRTIO_F_VERSION_1));
 
-    n->status = qemu_get_be16(f);
-
-    n->promisc = qemu_get_byte(f);
-    n->allmulti = qemu_get_byte(f);
-
-    n->mac_table.in_use = qemu_get_be32(f);
     /* MAC_TABLE_ENTRIES may be different from the saved image */
-    if (n->mac_table.in_use <= MAC_TABLE_ENTRIES) {
-        qemu_get_buffer(f, n->mac_table.macs,
-                        n->mac_table.in_use * ETH_ALEN);
-    } else {
-        int64_t i;
-
-        /* Overflow detected - can happen if source has a larger MAC table.
-         * We simply set overflow flag so there's no need to maintain the
-         * table of addresses, discard them all.
-         * Note: 64 bit math to avoid integer overflow.
-         */
-        for (i = 0; i < (int64_t)n->mac_table.in_use * ETH_ALEN; ++i) {
-            qemu_get_byte(f);
-        }
-        n->mac_table.multi_overflow = n->mac_table.uni_overflow = 1;
+    if (n->mac_table.in_use > MAC_TABLE_ENTRIES) {
         n->mac_table.in_use = 0;
     }
- 
-    qemu_get_buffer(f, (uint8_t *)n->vlans, MAX_VLAN >> 3);
-
-    if (qemu_get_be32(f) && !peer_has_vnet_hdr(n)) {
-        error_report("virtio-net: saved image requires vnet_hdr=on");
-        return -1;
-    }
-
-    n->mac_table.multi_overflow = qemu_get_byte(f);
-    n->mac_table.uni_overflow = qemu_get_byte(f);
-
-    n->alluni = qemu_get_byte(f);
-    n->nomulti = qemu_get_byte(f);
-    n->nouni = qemu_get_byte(f);
-    n->nobcast = qemu_get_byte(f);
-
-    if (qemu_get_byte(f) && !peer_has_ufo(n)) {
-        error_report("virtio-net: saved image requires TUN_F_UFO support");
-        return -1;
-    }
 
-    if (n->max_queues > 1) {
-        if (n->max_queues != qemu_get_be16(f)) {
-            error_report("virtio-net: different max_queues ");
-            return -1;
-        }
-
-        n->curr_queues = qemu_get_be16(f);
-        if (n->curr_queues > n->max_queues) {
-            error_report("virtio-net: curr_queues %x > max_queues %x",
-                         n->curr_queues, n->max_queues);
-            return -1;
-        }
-        for (i = 1; i < n->curr_queues; i++) {
-            n->vqs[i].tx_waiting = qemu_get_be32(f);
-        }
-    }
-
-    if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
-        n->curr_guest_offloads = qemu_get_be64(f);
-    } else {
+    if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
         n->curr_guest_offloads = virtio_net_supported_guest_offloads(n);
     }
 
@@ -1703,6 +1606,210 @@ static int virtio_net_load_device(VirtIODevice *vdev, QEMUFile *f,
     return 0;
 }
 
+/* tx_waiting field of a VirtIONetQueue */
+static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = {
+    .name = "virtio-net-queue-tx_waiting",
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(tx_waiting, VirtIONetQueue),
+        VMSTATE_END_OF_LIST()
+   },
+};
+
+static bool max_queues_gt_1(void *opaque, int version_id)
+{
+    return VIRTIO_NET(opaque)->max_queues > 1;
+}
+
+static bool has_ctrl_guest_offloads(void *opaque, int version_id)
+{
+    return virtio_vdev_has_feature(VIRTIO_DEVICE(opaque),
+                                   VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
+}
+
+static bool mac_table_fits(void *opaque, int version_id)
+{
+    return VIRTIO_NET(opaque)->mac_table.in_use <= MAC_TABLE_ENTRIES;
+}
+
+static bool mac_table_doesnt_fit(void *opaque, int version_id)
+{
+    return !mac_table_fits(opaque, version_id);
+}
+
+/* This temporary type is shared by all the WITH_TMP methods
+ * although only some fields are used by each.
+ */
+struct VirtIONetMigTmp {
+    VirtIONet      *parent;
+    VirtIONetQueue *vqs_1;
+    uint16_t        curr_queues_1;
+    uint8_t         has_ufo;
+    uint32_t        has_vnet_hdr;
+};
+
+/* The 2nd and subsequent tx_waiting flags are loaded later than
+ * the 1st entry in the queues and only if there's more than one
+ * entry.  We use the tmp mechanism to calculate a temporary
+ * pointer and count and also validate the count.
+ */
+
+static void virtio_net_tx_waiting_pre_save(void *opaque)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+
+    tmp->vqs_1 = tmp->parent->vqs + 1;
+    tmp->curr_queues_1 = tmp->parent->curr_queues - 1;
+    if (tmp->parent->curr_queues == 0) {
+        tmp->curr_queues_1 = 0;
+    }
+}
+
+static int virtio_net_tx_waiting_pre_load(void *opaque)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+
+    /* Reuse the pointer setup from save */
+    virtio_net_tx_waiting_pre_save(opaque);
+
+    if (tmp->parent->curr_queues > tmp->parent->max_queues) {
+        error_report("virtio-net: curr_queues %x > max_queues %x",
+            tmp->parent->curr_queues, tmp->parent->max_queues);
+
+        return -EINVAL;
+    }
+
+    return 0; /* all good */
+}
+
+static const VMStateDescription vmstate_virtio_net_tx_waiting = {
+    .name      = "virtio-net-tx_waiting",
+    .pre_load  = virtio_net_tx_waiting_pre_load,
+    .pre_save  = virtio_net_tx_waiting_pre_save,
+    .fields    = (VMStateField[]) {
+        VMSTATE_STRUCT_VARRAY_POINTER_UINT16(vqs_1, struct VirtIONetMigTmp,
+                                     curr_queues_1,
+                                     vmstate_virtio_net_queue_tx_waiting,
+                                     struct VirtIONetQueue),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+/* the 'has_ufo' flag is just tested; if the incoming stream has the
+ * flag set we need to check that we have it
+ */
+static int virtio_net_ufo_post_load(void *opaque, int version_id)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+
+    if (tmp->has_ufo && !peer_has_ufo(tmp->parent)) {
+        error_report("virtio-net: saved image requires TUN_F_UFO support");
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static void virtio_net_ufo_pre_save(void *opaque)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+
+    tmp->has_ufo = tmp->parent->has_ufo;
+}
+
+static const VMStateDescription vmstate_virtio_net_has_ufo = {
+    .name      = "virtio-net-ufo",
+    .post_load = virtio_net_ufo_post_load,
+    .pre_save  = virtio_net_ufo_pre_save,
+    .fields    = (VMStateField[]) {
+        VMSTATE_UINT8(has_ufo, struct VirtIONetMigTmp),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+/* the 'has_vnet_hdr' flag is just tested; if the incoming stream has the
+ * flag set we need to check that we have it
+ */
+static int virtio_net_vnet_post_load(void *opaque, int version_id)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+
+    if (tmp->has_vnet_hdr && !peer_has_vnet_hdr(tmp->parent)) {
+        error_report("virtio-net: saved image requires vnet_hdr=on");
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static void virtio_net_vnet_pre_save(void *opaque)
+{
+    struct VirtIONetMigTmp *tmp = opaque;
+
+    tmp->has_vnet_hdr = tmp->parent->has_vnet_hdr;
+}
+
+static const VMStateDescription vmstate_virtio_net_has_vnet = {
+    .name      = "virtio-net-vnet",
+    .post_load = virtio_net_vnet_post_load,
+    .pre_save  = virtio_net_vnet_pre_save,
+    .fields    = (VMStateField[]) {
+        VMSTATE_UINT32(has_vnet_hdr, struct VirtIONetMigTmp),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static const VMStateDescription vmstate_virtio_net_device = {
+    .name = "virtio-net-device",
+    .version_id = VIRTIO_NET_VM_VERSION,
+    .minimum_version_id = VIRTIO_NET_VM_VERSION,
+    .post_load = virtio_net_post_load_device,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT8_ARRAY(mac, VirtIONet, ETH_ALEN),
+        VMSTATE_STRUCT_POINTER(vqs, VirtIONet,
+                               vmstate_virtio_net_queue_tx_waiting,
+                               VirtIONetQueue),
+        VMSTATE_UINT32(mergeable_rx_bufs, VirtIONet),
+        VMSTATE_UINT16(status, VirtIONet),
+        VMSTATE_UINT8(promisc, VirtIONet),
+        VMSTATE_UINT8(allmulti, VirtIONet),
+        VMSTATE_UINT32(mac_table.in_use, VirtIONet),
+
+        /* Guarded pair: If it fits we load it, else we throw it away
+         * - can happen if source has a larger MAC table; post-load
+         *  sets flags in this case.
+         */
+        VMSTATE_VBUFFER_MULTIPLY(mac_table.macs, VirtIONet,
+                                0, mac_table_fits, mac_table.in_use,
+                                 ETH_ALEN),
+        VMSTATE_UNUSED_VARRAY_UINT32(VirtIONet, mac_table_doesnt_fit, 0,
+                                     mac_table.in_use, ETH_ALEN),
+
+        /* Note: This is an array of uint32's that's always been saved as a
+         * buffer; hold onto your endiannesses; it's actually used as a bitmap
+         * but based on the uint.
+         */
+        VMSTATE_BUFFER_POINTER_UNSAFE(vlans, VirtIONet, 0, MAX_VLAN >> 3),
+        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
+                         vmstate_virtio_net_has_vnet),
+        VMSTATE_UINT8(mac_table.multi_overflow, VirtIONet),
+        VMSTATE_UINT8(mac_table.uni_overflow, VirtIONet),
+        VMSTATE_UINT8(alluni, VirtIONet),
+        VMSTATE_UINT8(nomulti, VirtIONet),
+        VMSTATE_UINT8(nouni, VirtIONet),
+        VMSTATE_UINT8(nobcast, VirtIONet),
+        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
+                         vmstate_virtio_net_has_ufo),
+        VMSTATE_SINGLE_TEST(max_queues, VirtIONet, max_queues_gt_1, 0,
+                            vmstate_info_uint16_equal, uint16_t),
+        VMSTATE_UINT16_TEST(curr_queues, VirtIONet, max_queues_gt_1),
+        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
+                         vmstate_virtio_net_tx_waiting),
+        VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
+                            has_ctrl_guest_offloads),
+        VMSTATE_END_OF_LIST()
+   },
+};
+
 static NetClientInfo net_virtio_info = {
     .type = NET_CLIENT_DRIVER_NIC,
     .size = sizeof(NICState),
@@ -1989,9 +2096,8 @@ static void virtio_net_class_init(ObjectClass *klass, void *data)
     vdc->set_status = virtio_net_set_status;
     vdc->guest_notifier_mask = virtio_net_guest_notifier_mask;
     vdc->guest_notifier_pending = virtio_net_guest_notifier_pending;
-    vdc->load = virtio_net_load_device;
-    vdc->save = virtio_net_save_device;
     vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO);
+    vdc->vmsd = &vmstate_virtio_net_device;
 }
 
 static const TypeInfo virtio_net_info = {
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index 8ea56a8f60..1eec9a2da3 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -47,7 +47,7 @@ typedef struct VirtIONetQueue {
     VirtQueue *tx_vq;
     QEMUTimer *tx_timer;
     QEMUBH *tx_bh;
-    int tx_waiting;
+    uint32_t tx_waiting;
     struct {
         VirtQueueElement *elem;
     } async_tx;
@@ -68,7 +68,7 @@ typedef struct VirtIONet {
     size_t guest_hdr_len;
     uint32_t host_features;
     uint8_t has_ufo;
-    int mergeable_rx_bufs;
+    uint32_t mergeable_rx_bufs;
     uint8_t promisc;
     uint8_t allmulti;
     uint8_t alluni;
-- 
cgit v1.2.3-55-g7522