From 0ce265ffef87f19f4dd1ff0663e09a63d66ae408 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Nov 2016 11:34:02 +0100 Subject: exec: introduce memory_ldst.inc.c Templatize the address_space_* and *_phys functions, so that we can add similar functions in the next patch that work with a lightweight, cache-like version of address_space_map/unmap. Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini --- include/exec/cpu-common.h | 15 --------------- include/exec/memory.h | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index cffdc130e6..bd15853e51 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -94,21 +94,6 @@ bool cpu_physical_memory_is_io(hwaddr phys_addr); */ void qemu_flush_coalesced_mmio_buffer(void); -uint32_t ldub_phys(AddressSpace *as, hwaddr addr); -uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr); -uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr); -uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr); -uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr); -uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr); -uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr); -void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val); -void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val); -void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val); -void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val); -void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val); -void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val); -void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val); - void cpu_physical_memory_write_rom(AddressSpace *as, hwaddr addr, const uint8_t *buf, int len); void cpu_flush_icache_range(hwaddr start, int len); diff --git a/include/exec/memory.h b/include/exec/memory.h index 9728a2fb1a..f35b6125ab 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ 
-1404,6 +1404,21 @@ void address_space_stq_le(AddressSpace *as, hwaddr addr, uint64_t val, void address_space_stq_be(AddressSpace *as, hwaddr addr, uint64_t val, MemTxAttrs attrs, MemTxResult *result); +uint32_t ldub_phys(AddressSpace *as, hwaddr addr); +uint32_t lduw_le_phys(AddressSpace *as, hwaddr addr); +uint32_t lduw_be_phys(AddressSpace *as, hwaddr addr); +uint32_t ldl_le_phys(AddressSpace *as, hwaddr addr); +uint32_t ldl_be_phys(AddressSpace *as, hwaddr addr); +uint64_t ldq_le_phys(AddressSpace *as, hwaddr addr); +uint64_t ldq_be_phys(AddressSpace *as, hwaddr addr); +void stb_phys(AddressSpace *as, hwaddr addr, uint32_t val); +void stw_le_phys(AddressSpace *as, hwaddr addr, uint32_t val); +void stw_be_phys(AddressSpace *as, hwaddr addr, uint32_t val); +void stl_le_phys(AddressSpace *as, hwaddr addr, uint32_t val); +void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val); +void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val); +void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val); + /* address_space_translate: translate an address range into an address space * into a MemoryRegion and an address range into that section. Should be * called from an RCU critical section, to avoid that the last reference -- cgit v1.2.3-55-g7522 From 1f4e496e1fc2eb6c8bf377a0f9695930c380bfd3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Nov 2016 12:04:52 +0100 Subject: exec: introduce MemoryRegionCache Device models often have to perform multiple accesses to a single memory region that is known in advance, but would like to use "DMA-style" functions instead of address_space_map/unmap. This can happen for example when the data has to undergo endianness conversion. Introduce a new data structure to cache the result of address_space_translate without forcing usage of a host address like address_space_map does. 
Signed-off-by: Paolo Bonzini --- exec.c | 76 ++++++++++++++++++++++++ include/exec/cpu-all.h | 23 ++++++++ include/exec/memory.h | 151 ++++++++++++++++++++++++++++++++++++++++++++++++ include/qemu/typedefs.h | 1 + 4 files changed, 251 insertions(+) (limited to 'include') diff --git a/exec.c b/exec.c index d4b36568f1..8d4bb0e8c1 100644 --- a/exec.c +++ b/exec.c @@ -3077,6 +3077,82 @@ void cpu_physical_memory_unmap(void *buffer, hwaddr len, #define RCU_READ_UNLOCK(...) rcu_read_unlock() #include "memory_ldst.inc.c" +int64_t address_space_cache_init(MemoryRegionCache *cache, + AddressSpace *as, + hwaddr addr, + hwaddr len, + bool is_write) +{ + hwaddr l, xlat; + MemoryRegion *mr; + void *ptr; + + assert(len > 0); + + l = len; + mr = address_space_translate(as, addr, &xlat, &l, is_write); + if (!memory_access_is_direct(mr, is_write)) { + return -EINVAL; + } + + l = address_space_extend_translation(as, addr, len, mr, xlat, l, is_write); + ptr = qemu_ram_ptr_length(mr->ram_block, xlat, &l); + + cache->xlat = xlat; + cache->is_write = is_write; + cache->mr = mr; + cache->ptr = ptr; + cache->len = l; + memory_region_ref(cache->mr); + + return l; +} + +void address_space_cache_invalidate(MemoryRegionCache *cache, + hwaddr addr, + hwaddr access_len) +{ + assert(cache->is_write); + invalidate_and_set_dirty(cache->mr, addr + cache->xlat, access_len); +} + +void address_space_cache_destroy(MemoryRegionCache *cache) +{ + if (!cache->mr) { + return; + } + + if (xen_enabled()) { + xen_invalidate_map_cache_entry(cache->ptr); + } + memory_region_unref(cache->mr); +} + +/* Called from RCU critical section. This function has the same + * semantics as address_space_translate, but it only works on a + * predefined range of a MemoryRegion that was mapped with + * address_space_cache_init. 
+ */ +static inline MemoryRegion *address_space_translate_cached( + MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat, + hwaddr *plen, bool is_write) +{ + assert(addr < cache->len && *plen <= cache->len - addr); + *xlat = addr + cache->xlat; + return cache->mr; +} + +#define ARG1_DECL MemoryRegionCache *cache +#define ARG1 cache +#define SUFFIX _cached +#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__) +#define IS_DIRECT(mr, is_write) true +#define MAP_RAM(mr, ofs) (cache->ptr + (ofs - cache->xlat)) +#define INVALIDATE(mr, ofs, len) ((void)0) +#define RCU_READ_LOCK() ((void)0) +#define RCU_READ_UNLOCK() ((void)0) +#include "memory_ldst.inc.c" + /* virtual memory access for debug (includes writing to ROM) */ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, uint8_t *buf, int len, int is_write) diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index e9004e5798..ffe43d5654 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -186,6 +186,29 @@ void address_space_stl(AddressSpace *as, hwaddr addr, uint32_t val, MemTxAttrs attrs, MemTxResult *result); void address_space_stq(AddressSpace *as, hwaddr addr, uint64_t val, MemTxAttrs attrs, MemTxResult *result); + +uint32_t lduw_phys_cached(MemoryRegionCache *cache, hwaddr addr); +uint32_t ldl_phys_cached(MemoryRegionCache *cache, hwaddr addr); +uint64_t ldq_phys_cached(MemoryRegionCache *cache, hwaddr addr); +void stl_phys_notdirty_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val); +void stw_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val); +void stl_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val); +void stq_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint64_t val); + +uint32_t address_space_lduw_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +uint32_t address_space_ldl_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +uint64_t 
address_space_ldq_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stl_notdirty_cached(MemoryRegionCache *cache, hwaddr addr, + uint32_t val, MemTxAttrs attrs, MemTxResult *result); +void address_space_stw_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stl_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stq_cached(MemoryRegionCache *cache, hwaddr addr, uint64_t val, + MemTxAttrs attrs, MemTxResult *result); #endif /* page related stuff */ diff --git a/include/exec/memory.h b/include/exec/memory.h index f35b6125ab..64560f61b4 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -1419,6 +1419,125 @@ void stl_be_phys(AddressSpace *as, hwaddr addr, uint32_t val); void stq_le_phys(AddressSpace *as, hwaddr addr, uint64_t val); void stq_be_phys(AddressSpace *as, hwaddr addr, uint64_t val); +struct MemoryRegionCache { + hwaddr xlat; + void *ptr; + hwaddr len; + MemoryRegion *mr; + bool is_write; +}; + +/* address_space_cache_init: prepare for repeated access to a physical + * memory region + * + * @cache: #MemoryRegionCache to be filled + * @as: #AddressSpace to be accessed + * @addr: address within that address space + * @len: length of buffer + * @is_write: indicates the transfer direction + * + * Will only work with RAM, and may map a subset of the requested range by + * returning a value that is less than @len. On failure, return a negative + * errno value. + * + * Because it only works with RAM, this function can be used for + * read-modify-write operations. In this case, is_write should be %true. + * + * Note that addresses passed to the address_space_*_cached functions + * are relative to @addr. 
+ */ +int64_t address_space_cache_init(MemoryRegionCache *cache, + AddressSpace *as, + hwaddr addr, + hwaddr len, + bool is_write); + +/** + * address_space_cache_invalidate: complete a write to a #MemoryRegionCache + * + * @cache: The #MemoryRegionCache to operate on. + * @addr: The first physical address that was written, relative to the + * address that was passed to @address_space_cache_init. + * @access_len: The number of bytes that were written starting at @addr. + */ +void address_space_cache_invalidate(MemoryRegionCache *cache, + hwaddr addr, + hwaddr access_len); + +/** + * address_space_cache_destroy: free a #MemoryRegionCache + * + * @cache: The #MemoryRegionCache whose memory should be released. + */ +void address_space_cache_destroy(MemoryRegionCache *cache); + +/* address_space_ld*_cached: load from a cached #MemoryRegion + * address_space_st*_cached: store into a cached #MemoryRegion + * + * These functions perform a load or store of the byte, word, + * longword or quad to the specified address. The address is + * a physical address in the AddressSpace, but it must lie within + * a #MemoryRegion that was mapped with address_space_cache_init. + * + * The _le suffixed functions treat the data as little endian; + * _be indicates big endian; no suffix indicates "same endianness + * as guest CPU". + * + * The "guest CPU endianness" accessors are deprecated for use outside + * target-* code; devices should be CPU-agnostic and use either the LE + * or the BE accessors. 
+ * + * @cache: previously initialized #MemoryRegionCache to be accessed + * @addr: address within the address space + * @val: data value, for stores + * @attrs: memory transaction attributes + * @result: location to write the success/failure of the transaction; + * if NULL, this information is discarded + */ +uint32_t address_space_ldub_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +uint32_t address_space_lduw_le_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +uint32_t address_space_lduw_be_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +uint32_t address_space_ldl_le_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +uint32_t address_space_ldl_be_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +uint64_t address_space_ldq_le_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +uint64_t address_space_ldq_be_cached(MemoryRegionCache *cache, hwaddr addr, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stb_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stw_le_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stw_be_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stl_le_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stl_be_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stq_le_cached(MemoryRegionCache *cache, hwaddr addr, uint64_t val, + MemTxAttrs attrs, MemTxResult *result); +void address_space_stq_be_cached(MemoryRegionCache *cache, hwaddr addr, uint64_t 
val, + MemTxAttrs attrs, MemTxResult *result); + +uint32_t ldub_phys_cached(MemoryRegionCache *cache, hwaddr addr); +uint32_t lduw_le_phys_cached(MemoryRegionCache *cache, hwaddr addr); +uint32_t lduw_be_phys_cached(MemoryRegionCache *cache, hwaddr addr); +uint32_t ldl_le_phys_cached(MemoryRegionCache *cache, hwaddr addr); +uint32_t ldl_be_phys_cached(MemoryRegionCache *cache, hwaddr addr); +uint64_t ldq_le_phys_cached(MemoryRegionCache *cache, hwaddr addr); +uint64_t ldq_be_phys_cached(MemoryRegionCache *cache, hwaddr addr); +void stb_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val); +void stw_le_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val); +void stw_be_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val); +void stl_le_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val); +void stl_be_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint32_t val); +void stq_le_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint64_t val); +void stq_be_phys_cached(MemoryRegionCache *cache, hwaddr addr, uint64_t val); + /* address_space_translate: translate an address range into an address space * into a MemoryRegion and an address range into that section. 
Should be * called from an RCU critical section, to avoid that the last reference @@ -1544,6 +1663,38 @@ MemTxResult address_space_read(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, return result; } +/** + * address_space_read_cached: read from a cached RAM region + * + * @cache: Cached region to be addressed + * @addr: address relative to the base of the RAM region + * @buf: buffer with the data transferred + * @len: length of the data transferred + */ +static inline void +address_space_read_cached(MemoryRegionCache *cache, hwaddr addr, + void *buf, int len) +{ + assert(addr < cache->len && len <= cache->len - addr); + memcpy(buf, cache->ptr + addr, len); +} + +/** + * address_space_write_cached: write to a cached RAM region + * + * @cache: Cached region to be addressed + * @addr: address relative to the base of the RAM region + * @buf: buffer with the data transferred + * @len: length of the data transferred + */ +static inline void +address_space_write_cached(MemoryRegionCache *cache, hwaddr addr, + void *buf, int len) +{ + assert(addr < cache->len && len <= cache->len - addr); + memcpy(cache->ptr + addr, buf, len); +} + #endif #endif diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 1b8c30a7a0..9a8bcbde36 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -45,6 +45,7 @@ typedef struct MachineState MachineState; typedef struct MemoryListener MemoryListener; typedef struct MemoryMappingList MemoryMappingList; typedef struct MemoryRegion MemoryRegion; +typedef struct MemoryRegionCache MemoryRegionCache; typedef struct MemoryRegionSection MemoryRegionSection; typedef struct MigrationIncomingState MigrationIncomingState; typedef struct MigrationParams MigrationParams; -- cgit v1.2.3-55-g7522 From 45241cf9d7b141cf1090366597923fc1a5366a3f Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 30 Nov 2016 23:30:38 -0500 Subject: timer: fix misleading comment in timer.h It's timer to expire, not clock. 
Signed-off-by: Yaowei Bai Message-Id: <1480566640-27264-1-git-send-email-baiyaowei@cmss.chinamobile.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini --- include/qemu/timer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/qemu/timer.h b/include/qemu/timer.h index bdfae004e4..9abed51ae8 100644 --- a/include/qemu/timer.h +++ b/include/qemu/timer.h @@ -133,7 +133,7 @@ bool qemu_clock_has_timers(QEMUClockType type); * @type: the clock type * * Determines whether a clock's default timer list - * has an expired clock. + * has an expired timer. * * Returns: true if the clock's default timer list has * an expired timer -- cgit v1.2.3-55-g7522 From 11717bc93a2a6bae12d1a59170e09b3f68840097 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 30 Nov 2016 23:30:39 -0500 Subject: main-loop: update comment for qemu_mutex_lock/unlock_iothread Commit 49cf57281b7 (vl: delay thread initialization after daemonization) causes the global mutex to be taken after daemonization, instead of before daemonization by qemu_init_main_loop(). Signed-off-by: Yaowei Bai Message-Id: <1480566640-27264-2-git-send-email-baiyaowei@cmss.chinamobile.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini --- include/qemu/main-loop.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h index 470f600bbc..a9d4f23cd9 100644 --- a/include/qemu/main-loop.h +++ b/include/qemu/main-loop.h @@ -238,7 +238,7 @@ bool qemu_mutex_iothread_locked(void); * qemu_mutex_lock_iothread: Lock the main loop mutex. * * This function locks the main loop mutex. The mutex is taken by - * qemu_init_main_loop and always taken except while waiting on + * main() in vl.c and always taken except while waiting on * external events (such as with select). 
The mutex should be taken * by threads other than the main loop thread when calling * qemu_bh_new(), qemu_set_fd_handler() and basically all other @@ -253,7 +253,7 @@ void qemu_mutex_lock_iothread(void); * qemu_mutex_unlock_iothread: Unlock the main loop mutex. * * This function unlocks the main loop mutex. The mutex is taken by - * qemu_init_main_loop and always taken except while waiting on + * main() in vl.c and always taken except while waiting on * external events (such as with select). The mutex should be unlocked * as soon as possible by threads other than the main loop thread, * because it prevents the main loop from processing callbacks, -- cgit v1.2.3-55-g7522 From 722f8d90990b5623f51e3b1dce07dd6ed210be8d Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Wed, 30 Nov 2016 23:30:40 -0500 Subject: block: drop remaining legacy aio functions in comment Commit 87f68d318222563822b5c6b28192215fc4b4e441 (block: drop aio functions that operate on the main AioContext) dropped most qemu_aio_wait function references but left these behind; clean them up. Signed-off-by: Yaowei Bai Message-Id: <1480566640-27264-3-git-send-email-baiyaowei@cmss.chinamobile.com> Reviewed-by: Stefan Hajnoczi Signed-off-by: Paolo Bonzini --- include/block/aio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/block/aio.h b/include/block/aio.h index c7ae27c91c..ca551e346f 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -195,8 +195,8 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque); * aio_notify: Force processing of pending events. * * Similar to signaling a condition variable, aio_notify forces - * aio_wait to exit, so that the next call will re-examine pending events. - * The caller of aio_notify will usually call aio_wait again very soon, + * aio_poll to exit, so that the next call will re-examine pending events. 
+ * The caller of aio_notify will usually call aio_poll again very soon, * or go through another iteration of the GLib main loop. Hence, aio_notify * also has the side effect of recalculating the sets of file descriptors * that the main loop waits for. -- cgit v1.2.3-55-g7522 From be232eb07692a5d10d07aee8386faa7e860e73ea Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Sat, 5 Nov 2016 03:19:48 -0400 Subject: pc: make smbus configurable Signed-off-by: Chao Peng Message-Id: <1478330391-74060-2-git-send-email-chao.p.peng@linux.intel.com> Signed-off-by: Paolo Bonzini --- hw/i386/pc.c | 18 ++++++++++++++++++ hw/i386/pc_q35.c | 12 +++++++----- include/hw/i386/pc.h | 2 ++ 3 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index a9e64a88e5..40bec88681 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -2158,6 +2158,20 @@ static void pc_machine_set_nvdimm(Object *obj, bool value, Error **errp) pcms->acpi_nvdimm_state.is_enabled = value; } +static bool pc_machine_get_smbus(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->smbus; +} + +static void pc_machine_set_smbus(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->smbus = value; +} + static void pc_machine_initfn(Object *obj) { PCMachineState *pcms = PC_MACHINE(obj); @@ -2169,6 +2183,7 @@ static void pc_machine_initfn(Object *obj) pcms->acpi_nvdimm_state.is_enabled = false; /* acpi build is enabled by default if machine supports it */ pcms->acpi_build_enabled = PC_MACHINE_GET_CLASS(pcms)->has_acpi_build; + pcms->smbus = true; } static void pc_machine_reset(void) @@ -2329,6 +2344,9 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) object_class_property_add_bool(oc, PC_MACHINE_NVDIMM, pc_machine_get_nvdimm, pc_machine_set_nvdimm, &error_abort); + + object_class_property_add_bool(oc, PC_MACHINE_SMBUS, + pc_machine_get_smbus, pc_machine_set_smbus, &error_abort); } static 
const TypeInfo pc_machine_info = { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index b40d19ee00..5efc65aec6 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -248,11 +248,13 @@ static void pc_q35_init(MachineState *machine) ehci_create_ich9_with_companions(host_bus, 0x1d); } - /* TODO: Populate SPD eeprom data. */ - smbus_eeprom_init(ich9_smb_init(host_bus, - PCI_DEVFN(ICH9_SMB_DEV, ICH9_SMB_FUNC), - 0xb100), - 8, NULL, 0); + if (pcms->smbus) { + /* TODO: Populate SPD eeprom data. */ + smbus_eeprom_init(ich9_smb_init(host_bus, + PCI_DEVFN(ICH9_SMB_DEV, ICH9_SMB_FUNC), + 0xb100), + 8, NULL, 0); + } pc_cmos_init(pcms, idebus[0], idebus[1], rtc_state); diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 4b74130559..e5447a08c6 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -63,6 +63,7 @@ struct PCMachineState { AcpiNVDIMMState acpi_nvdimm_state; bool acpi_build_enabled; + bool smbus; /* RAM information (sizes, addresses, configuration): */ ram_addr_t below_4g_mem_size, above_4g_mem_size; @@ -88,6 +89,7 @@ struct PCMachineState { #define PC_MACHINE_VMPORT "vmport" #define PC_MACHINE_SMM "smm" #define PC_MACHINE_NVDIMM "nvdimm" +#define PC_MACHINE_SMBUS "smbus" /** * PCMachineClass: -- cgit v1.2.3-55-g7522 From 272f042877d3ad95be99ea313789972752781c0b Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Sat, 5 Nov 2016 03:19:49 -0400 Subject: pc: make sata configurable Signed-off-by: Chao Peng Message-Id: <1478330391-74060-3-git-send-email-chao.p.peng@linux.intel.com> Signed-off-by: Paolo Bonzini --- hw/i386/pc.c | 29 ++++++++++++++++++++++++----- hw/i386/pc_q35.c | 24 ++++++++++++++---------- include/hw/i386/pc.h | 2 ++ 3 files changed, 40 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 40bec88681..b0beea3ef9 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -400,13 +400,13 @@ static void pc_cmos_init_late(void *opaque) int i, trans; val = 0; - if (ide_get_geometry(arg->idebus[0], 0, - 
&cylinders, &heads, &sectors) >= 0) { + if (arg->idebus[0] && ide_get_geometry(arg->idebus[0], 0, + &cylinders, &heads, &sectors) >= 0) { cmos_init_hd(s, 0x19, 0x1b, cylinders, heads, sectors); val |= 0xf0; } - if (ide_get_geometry(arg->idebus[0], 1, - &cylinders, &heads, &sectors) >= 0) { + if (arg->idebus[0] && ide_get_geometry(arg->idebus[0], 1, + &cylinders, &heads, &sectors) >= 0) { cmos_init_hd(s, 0x1a, 0x24, cylinders, heads, sectors); val |= 0x0f; } @@ -418,7 +418,8 @@ static void pc_cmos_init_late(void *opaque) geometry. It is always such that: 1 <= sects <= 63, 1 <= heads <= 16, 1 <= cylinders <= 16383. The BIOS geometry can be different if a translation is done. */ - if (ide_get_geometry(arg->idebus[i / 2], i % 2, + if (arg->idebus[i / 2] && + ide_get_geometry(arg->idebus[i / 2], i % 2, &cylinders, &heads, &sectors) >= 0) { trans = ide_get_bios_chs_trans(arg->idebus[i / 2], i % 2) - 1; assert((trans & ~3) == 0); @@ -2172,6 +2173,20 @@ static void pc_machine_set_smbus(Object *obj, bool value, Error **errp) pcms->smbus = value; } +static bool pc_machine_get_sata(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->sata; +} + +static void pc_machine_set_sata(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->sata = value; +} + static void pc_machine_initfn(Object *obj) { PCMachineState *pcms = PC_MACHINE(obj); @@ -2184,6 +2199,7 @@ static void pc_machine_initfn(Object *obj) /* acpi build is enabled by default if machine supports it */ pcms->acpi_build_enabled = PC_MACHINE_GET_CLASS(pcms)->has_acpi_build; pcms->smbus = true; + pcms->sata = true; } static void pc_machine_reset(void) @@ -2347,6 +2363,9 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) object_class_property_add_bool(oc, PC_MACHINE_SMBUS, pc_machine_get_smbus, pc_machine_set_smbus, &error_abort); + + object_class_property_add_bool(oc, PC_MACHINE_SATA, + pc_machine_get_sata, pc_machine_set_sata, &error_abort); } static 
const TypeInfo pc_machine_info = { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 5efc65aec6..205c33e1a2 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -232,16 +232,20 @@ static void pc_q35_init(MachineState *machine) /* connect pm stuff to lpc */ ich9_lpc_pm_init(lpc, pc_machine_is_smm_enabled(pcms)); - /* ahci and SATA device, for q35 1 ahci controller is built-in */ - ahci = pci_create_simple_multifunction(host_bus, - PCI_DEVFN(ICH9_SATA1_DEV, - ICH9_SATA1_FUNC), - true, "ich9-ahci"); - idebus[0] = qdev_get_child_bus(&ahci->qdev, "ide.0"); - idebus[1] = qdev_get_child_bus(&ahci->qdev, "ide.1"); - g_assert(MAX_SATA_PORTS == ICH_AHCI(ahci)->ahci.ports); - ide_drive_get(hd, ICH_AHCI(ahci)->ahci.ports); - ahci_ide_create_devs(ahci, hd); + if (pcms->sata) { + /* ahci and SATA device, for q35 1 ahci controller is built-in */ + ahci = pci_create_simple_multifunction(host_bus, + PCI_DEVFN(ICH9_SATA1_DEV, + ICH9_SATA1_FUNC), + true, "ich9-ahci"); + idebus[0] = qdev_get_child_bus(&ahci->qdev, "ide.0"); + idebus[1] = qdev_get_child_bus(&ahci->qdev, "ide.1"); + g_assert(MAX_SATA_PORTS == ICH_AHCI(ahci)->ahci.ports); + ide_drive_get(hd, ICH_AHCI(ahci)->ahci.ports); + ahci_ide_create_devs(ahci, hd); + } else { + idebus[0] = idebus[1] = NULL; + } if (machine_usb(machine)) { /* Should we create 6 UHCI according to ich9 spec? 
*/ diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index e5447a08c6..9535f4a252 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -64,6 +64,7 @@ struct PCMachineState { bool acpi_build_enabled; bool smbus; + bool sata; /* RAM information (sizes, addresses, configuration): */ ram_addr_t below_4g_mem_size, above_4g_mem_size; @@ -90,6 +91,7 @@ struct PCMachineState { #define PC_MACHINE_SMM "smm" #define PC_MACHINE_NVDIMM "nvdimm" #define PC_MACHINE_SMBUS "smbus" +#define PC_MACHINE_SATA "sata" /** * PCMachineClass: -- cgit v1.2.3-55-g7522 From feddd2fd91524b3457f3f4a20c22285acf887ba8 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Sat, 5 Nov 2016 03:19:50 -0400 Subject: pc: make pit configurable Signed-off-by: Chao Peng Message-Id: <1478330391-74060-4-git-send-email-chao.p.peng@linux.intel.com> Signed-off-by: Paolo Bonzini --- hw/i386/pc.c | 21 ++++++++++++++++++++- hw/i386/pc_piix.c | 2 +- hw/i386/pc_q35.c | 3 ++- include/hw/i386/pc.h | 3 +++ 4 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/hw/i386/pc.c b/hw/i386/pc.c index b0beea3ef9..25e8586b48 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1536,6 +1536,7 @@ void pc_basic_device_init(ISABus *isa_bus, qemu_irq *gsi, ISADevice **rtc_state, bool create_fdctrl, bool no_vmport, + bool has_pit, uint32_t hpet_irqs) { int i; @@ -1589,7 +1590,7 @@ void pc_basic_device_init(ISABus *isa_bus, qemu_irq *gsi, qemu_register_boot_set(pc_boot_set, *rtc_state); - if (!xen_enabled()) { + if (!xen_enabled() && has_pit) { if (kvm_pit_in_kernel()) { pit = kvm_pit_init(isa_bus, 0x40); } else { @@ -2187,6 +2188,20 @@ static void pc_machine_set_sata(Object *obj, bool value, Error **errp) pcms->sata = value; } +static bool pc_machine_get_pit(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->pit; +} + +static void pc_machine_set_pit(Object *obj, bool value, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->pit = value; +} + 
static void pc_machine_initfn(Object *obj) { PCMachineState *pcms = PC_MACHINE(obj); @@ -2200,6 +2215,7 @@ static void pc_machine_initfn(Object *obj) pcms->acpi_build_enabled = PC_MACHINE_GET_CLASS(pcms)->has_acpi_build; pcms->smbus = true; pcms->sata = true; + pcms->pit = true; } static void pc_machine_reset(void) @@ -2366,6 +2382,9 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) object_class_property_add_bool(oc, PC_MACHINE_SATA, pc_machine_get_sata, pc_machine_set_sata, &error_abort); + + object_class_property_add_bool(oc, PC_MACHINE_PIT, + pc_machine_get_pit, pc_machine_set_pit, &error_abort); } static const TypeInfo pc_machine_info = { diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index a54a468c0a..5e1adbe53c 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -235,7 +235,7 @@ static void pc_init1(MachineState *machine, /* init basic PC hardware */ pc_basic_device_init(isa_bus, pcms->gsi, &rtc_state, true, - (pcms->vmport != ON_OFF_AUTO_ON), 0x4); + (pcms->vmport != ON_OFF_AUTO_ON), pcms->pit, 0x4); pc_nic_init(isa_bus, pci_bus); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 205c33e1a2..d042fe0843 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -227,7 +227,8 @@ static void pc_q35_init(MachineState *machine) /* init basic PC hardware */ pc_basic_device_init(isa_bus, pcms->gsi, &rtc_state, !mc->no_floppy, - (pcms->vmport != ON_OFF_AUTO_ON), 0xff0104); + (pcms->vmport != ON_OFF_AUTO_ON), pcms->pit, + 0xff0104); /* connect pm stuff to lpc */ ich9_lpc_pm_init(lpc, pc_machine_is_smm_enabled(pcms)); diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 9535f4a252..b37bc5b139 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -65,6 +65,7 @@ struct PCMachineState { bool acpi_build_enabled; bool smbus; bool sata; + bool pit; /* RAM information (sizes, addresses, configuration): */ ram_addr_t below_4g_mem_size, above_4g_mem_size; @@ -92,6 +93,7 @@ struct PCMachineState { #define PC_MACHINE_NVDIMM 
"nvdimm" #define PC_MACHINE_SMBUS "smbus" #define PC_MACHINE_SATA "sata" +#define PC_MACHINE_PIT "pit" /** * PCMachineClass: @@ -264,6 +266,7 @@ void pc_basic_device_init(ISABus *isa_bus, qemu_irq *gsi, ISADevice **rtc_state, bool create_fdctrl, bool no_vmport, + bool has_pit, uint32_t hpet_irqs); void pc_init_ne2k_isa(ISABus *bus, NICInfo *nd); void pc_cmos_init(PCMachineState *pcms, -- cgit v1.2.3-55-g7522 From bc20403598702fac96b5a732bdb184ccbe1fcb48 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 10 Dec 2016 15:21:49 -0200 Subject: kvm: sync linux headers Import KVM_CLOCK_TSC_STABLE. Signed-off-by: Marcelo Tosatti Message-Id: <20161210172324.402794293@redhat.com> Signed-off-by: Paolo Bonzini --- include/standard-headers/linux/input.h | 1 + include/standard-headers/linux/pci_regs.h | 15 ++++++++++++++- linux-headers/asm-arm/kvm.h | 7 +++++++ linux-headers/asm-x86/unistd_32.h | 3 +++ linux-headers/asm-x86/unistd_64.h | 3 +++ linux-headers/asm-x86/unistd_x32.h | 3 +++ linux-headers/linux/kvm.h | 7 +++++++ 7 files changed, 38 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/standard-headers/linux/input.h b/include/standard-headers/linux/input.h index 7361a16b50..b472b8530c 100644 --- a/include/standard-headers/linux/input.h +++ b/include/standard-headers/linux/input.h @@ -245,6 +245,7 @@ struct input_mask { #define BUS_SPI 0x1C #define BUS_RMI 0x1D #define BUS_CEC 0x1E +#define BUS_INTEL_ISHTP 0x1F /* * MT_TOOL types diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h index 404095124a..e5a2e68b22 100644 --- a/include/standard-headers/linux/pci_regs.h +++ b/include/standard-headers/linux/pci_regs.h @@ -612,6 +612,8 @@ */ #define PCI_EXP_DEVCAP2 36 /* Device Capabilities 2 */ #define PCI_EXP_DEVCAP2_ARI 0x00000020 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCAP2_ATOMIC_ROUTE 0x00000040 /* Atomic Op routing */ +#define PCI_EXP_DEVCAP2_ATOMIC_COMP64 0x00000100 /* Atomic 64-bit compare 
*/ #define PCI_EXP_DEVCAP2_LTR 0x00000800 /* Latency tolerance reporting */ #define PCI_EXP_DEVCAP2_OBFF_MASK 0x000c0000 /* OBFF support mechanism */ #define PCI_EXP_DEVCAP2_OBFF_MSG 0x00040000 /* New message signaling */ @@ -619,6 +621,7 @@ #define PCI_EXP_DEVCTL2 40 /* Device Control 2 */ #define PCI_EXP_DEVCTL2_COMP_TIMEOUT 0x000f /* Completion Timeout Value */ #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */ +#define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 /* Set Atomic requests */ #define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */ #define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */ #define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */ @@ -671,7 +674,8 @@ #define PCI_EXT_CAP_ID_PMUX 0x1A /* Protocol Multiplexing */ #define PCI_EXT_CAP_ID_PASID 0x1B /* Process Address Space ID */ #define PCI_EXT_CAP_ID_DPC 0x1D /* Downstream Port Containment */ -#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_DPC +#define PCI_EXT_CAP_ID_PTM 0x1F /* Precision Time Measurement */ +#define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PTM #define PCI_EXT_CAP_DSN_SIZEOF 12 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40 @@ -964,4 +968,13 @@ #define PCI_EXP_DPC_SOURCE_ID 10 /* DPC Source Identifier */ +/* Precision Time Measurement */ +#define PCI_PTM_CAP 0x04 /* PTM Capability */ +#define PCI_PTM_CAP_REQ 0x00000001 /* Requester capable */ +#define PCI_PTM_CAP_ROOT 0x00000004 /* Root capable */ +#define PCI_PTM_GRANULARITY_MASK 0x0000FF00 /* Clock granularity */ +#define PCI_PTM_CTRL 0x08 /* PTM Control */ +#define PCI_PTM_CTRL_ENABLE 0x00000001 /* PTM enable */ +#define PCI_PTM_CTRL_ROOT 0x00000002 /* Root select */ + #endif /* LINUX_PCI_REGS_H */ diff --git a/linux-headers/asm-arm/kvm.h b/linux-headers/asm-arm/kvm.h index 541268c946..2fb7859465 100644 --- a/linux-headers/asm-arm/kvm.h +++ b/linux-headers/asm-arm/kvm.h @@ -84,6 +84,13 @@ struct kvm_regs { #define KVM_VGIC_V2_DIST_SIZE 0x1000 #define KVM_VGIC_V2_CPU_SIZE 0x2000 +/* Supported 
VGICv3 address types */ +#define KVM_VGIC_V3_ADDR_TYPE_DIST 2 +#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 + +#define KVM_VGIC_V3_DIST_SIZE SZ_64K +#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) + #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */ diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index abeaf40d37..d45ea28e15 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -377,5 +377,8 @@ #define __NR_copy_file_range 377 #define __NR_preadv2 378 #define __NR_pwritev2 379 +#define __NR_pkey_mprotect 380 +#define __NR_pkey_alloc 381 +#define __NR_pkey_free 382 #endif /* _ASM_X86_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index 73c3d1f66a..e22db9171e 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ b/linux-headers/asm-x86/unistd_64.h @@ -330,5 +330,8 @@ #define __NR_copy_file_range 326 #define __NR_preadv2 327 #define __NR_pwritev2 328 +#define __NR_pkey_mprotect 329 +#define __NR_pkey_alloc 330 +#define __NR_pkey_free 331 #endif /* _ASM_X86_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index e5aea761f8..84e58b202d 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -283,6 +283,9 @@ #define __NR_membarrier (__X32_SYSCALL_BIT + 324) #define __NR_mlock2 (__X32_SYSCALL_BIT + 325) #define __NR_copy_file_range (__X32_SYSCALL_BIT + 326) +#define __NR_pkey_mprotect (__X32_SYSCALL_BIT + 329) +#define __NR_pkey_alloc (__X32_SYSCALL_BIT + 330) +#define __NR_pkey_free (__X32_SYSCALL_BIT + 331) #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) #define __NR_ioctl (__X32_SYSCALL_BIT + 514) diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 4806e069e7..bb0ed71223 100644 --- a/linux-headers/linux/kvm.h +++ 
b/linux-headers/linux/kvm.h @@ -972,12 +972,19 @@ struct kvm_irqfd { __u8 pad[16]; }; +/* For KVM_CAP_ADJUST_CLOCK */ + +/* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags. */ +#define KVM_CLOCK_TSC_STABLE 2 + struct kvm_clock_data { __u64 clock; __u32 flags; __u32 pad[9]; }; +/* For KVM_CAP_SW_TLB */ + #define KVM_MMU_FSL_BOOKE_NOHV 0 #define KVM_MMU_FSL_BOOKE_HV 1 -- cgit v1.2.3-55-g7522 From 6053a86fe7bd3d5b07b49dae6c05f2cd0d44e687 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 21 Nov 2016 08:50:04 -0200 Subject: kvmclock: reduce kvmclock difference on migration Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which indicates that KVM_GET_CLOCK returns a value as seen by the guest at that moment. For new machine types, use this value rather than reading from guest memory. This reduces kvmclock difference on migration from 5s to 0.1s (when max_downtime == 5s). Signed-off-by: Marcelo Tosatti Message-Id: <20161121105052.598267440@redhat.com> [Add comment explaining what is going on. 
- Paolo] Signed-off-by: Paolo Bonzini --- hw/i386/kvm/clock.c | 142 +++++++++++++++++++++++++++++++++++++++++++------ include/hw/i386/pc.h | 5 ++ target/i386/kvm.c | 7 +++ target/i386/kvm_i386.h | 1 + 4 files changed, 140 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c index 0f75dd385a..ef9d560f9c 100644 --- a/hw/i386/kvm/clock.c +++ b/hw/i386/kvm/clock.c @@ -36,6 +36,13 @@ typedef struct KVMClockState { uint64_t clock; bool clock_valid; + + /* whether machine type supports reliable KVM_GET_CLOCK */ + bool mach_use_reliable_get_clock; + + /* whether the 'clock' value was obtained in a host with + * reliable KVM_GET_CLOCK */ + bool clock_is_reliable; } KVMClockState; struct pvclock_vcpu_time_info { @@ -81,6 +88,60 @@ static uint64_t kvmclock_current_nsec(KVMClockState *s) return nsec + time.system_time; } +static void kvm_update_clock(KVMClockState *s) +{ + struct kvm_clock_data data; + int ret; + + ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); + if (ret < 0) { + fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret)); + abort(); + } + s->clock = data.clock; + + /* If kvm_has_adjust_clock_stable() is false, KVM_GET_CLOCK returns + * essentially CLOCK_MONOTONIC plus a guest-specific adjustment. This + * can drift from the TSC-based value that is computed by the guest, + * so we need to go through kvmclock_current_nsec(). If + * kvm_has_adjust_clock_stable() is true, and the flags contain + * KVM_CLOCK_TSC_STABLE, then KVM_GET_CLOCK returns a TSC-based value + * and kvmclock_current_nsec() is not necessary. + * + * Here, however, we need not check KVM_CLOCK_TSC_STABLE. This is because: + * + * - if the host has disabled the kvmclock master clock, the guest already + * has protection against time going backwards. 
This "safety net" is only + * absent when kvmclock is stable; + * + * - therefore, we can replace a check like + * + * if last KVM_GET_CLOCK was not reliable then + * read from memory + * + * with + * + * if last KVM_GET_CLOCK was not reliable && masterclock is enabled + * read from memory + * + * However: + * + * - if kvm_has_adjust_clock_stable() returns false, the left side is + * always true (KVM_GET_CLOCK is never reliable), and the right side is + * unknown (because we don't have data.flags). We must assume it's true + * and read from memory. + * + * - if kvm_has_adjust_clock_stable() returns true, the result of the && + * is always false (masterclock is enabled iff KVM_GET_CLOCK is reliable) + * + * So we can just use this instead: + * + * if !kvm_has_adjust_clock_stable() then + * read from memory + */ + s->clock_is_reliable = kvm_has_adjust_clock_stable(); +} + static void kvmclock_vm_state_change(void *opaque, int running, RunState state) { @@ -91,15 +152,21 @@ static void kvmclock_vm_state_change(void *opaque, int running, if (running) { struct kvm_clock_data data = {}; - uint64_t time_at_migration = kvmclock_current_nsec(s); - - s->clock_valid = false; - /* We can't rely on the migrated clock value, just discard it */ - if (time_at_migration) { - s->clock = time_at_migration; + /* + * If the host where s->clock was read did not support reliable + * KVM_GET_CLOCK, read kvmclock value from memory. 
+ */ + if (!s->clock_is_reliable) { + uint64_t pvclock_via_mem = kvmclock_current_nsec(s); + /* We can't rely on the saved clock value, just discard it */ + if (pvclock_via_mem) { + s->clock = pvclock_via_mem; + } } + s->clock_valid = false; + data.clock = s->clock; ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data); if (ret < 0) { @@ -120,8 +187,6 @@ static void kvmclock_vm_state_change(void *opaque, int running, } } } else { - struct kvm_clock_data data; - int ret; if (s->clock_valid) { return; @@ -129,13 +194,7 @@ static void kvmclock_vm_state_change(void *opaque, int running, kvm_synchronize_all_tsc(); - ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); - if (ret < 0) { - fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret)); - abort(); - } - s->clock = data.clock; - + kvm_update_clock(s); /* * If the VM is stopped, declare the clock state valid to * avoid re-reading it on next vmsave (which would return @@ -149,25 +208,78 @@ static void kvmclock_realize(DeviceState *dev, Error **errp) { KVMClockState *s = KVM_CLOCK(dev); + kvm_update_clock(s); + qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s); } +static bool kvmclock_clock_is_reliable_needed(void *opaque) +{ + KVMClockState *s = opaque; + + return s->mach_use_reliable_get_clock; +} + +static const VMStateDescription kvmclock_reliable_get_clock = { + .name = "kvmclock/clock_is_reliable", + .version_id = 1, + .minimum_version_id = 1, + .needed = kvmclock_clock_is_reliable_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(clock_is_reliable, KVMClockState), + VMSTATE_END_OF_LIST() + } +}; + +/* + * When migrating, read the clock just before migration, + * so that the guest clock counts during the events + * between: + * + * * vm_stop() + * * + * * pre_save() + * + * This reduces kvmclock difference on migration from 5s + * to 0.1s (when max_downtime == 5s), because sending the + * final pages of memory (which happens between vm_stop() + * and pre_save()) takes max_downtime. 
+ */ +static void kvmclock_pre_save(void *opaque) +{ + KVMClockState *s = opaque; + + kvm_update_clock(s); +} + static const VMStateDescription kvmclock_vmsd = { .name = "kvmclock", .version_id = 1, .minimum_version_id = 1, + .pre_save = kvmclock_pre_save, .fields = (VMStateField[]) { VMSTATE_UINT64(clock, KVMClockState), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * []) { + &kvmclock_reliable_get_clock, + NULL } }; +static Property kvmclock_properties[] = { + DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState, + mach_use_reliable_get_clock, true), + DEFINE_PROP_END_OF_LIST(), +}; + static void kvmclock_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); dc->realize = kvmclock_realize; dc->vmsd = &kvmclock_vmsd; + dc->props = kvmclock_properties; } static const TypeInfo kvmclock_info = { diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index b37bc5b139..b22e699c46 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -378,6 +378,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); #define PC_COMPAT_2_7 \ HW_COMPAT_2_7 \ + {\ + .driver = "kvmclock",\ + .property = "x-mach-use-reliable-get-clock",\ + .value = "off",\ + },\ {\ .driver = TYPE_X86_CPU,\ .property = "l3-cache",\ diff --git a/target/i386/kvm.c b/target/i386/kvm.c index f62264a7a8..10a9cd8f7f 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -117,6 +117,13 @@ bool kvm_has_smm(void) return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM); } +bool kvm_has_adjust_clock_stable(void) +{ + int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK); + + return (ret == KVM_CLOCK_TSC_STABLE); +} + bool kvm_allows_irq0_override(void) { return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing(); diff --git a/target/i386/kvm_i386.h b/target/i386/kvm_i386.h index 76079295b2..bfce427f86 100644 --- a/target/i386/kvm_i386.h +++ b/target/i386/kvm_i386.h @@ -17,6 +17,7 @@ bool kvm_allows_irq0_override(void); bool 
kvm_has_smm(void); +bool kvm_has_adjust_clock_stable(void); void kvm_synchronize_all_tsc(void); void kvm_arch_reset_vcpu(X86CPU *cs); void kvm_arch_do_init_vcpu(X86CPU *cs); -- cgit v1.2.3-55-g7522