summaryrefslogtreecommitdiffstats
path: root/accel
diff options
context:
space:
mode:
Diffstat (limited to 'accel')
-rw-r--r--accel/Kconfig3
-rw-r--r--accel/accel-common.c32
-rw-r--r--accel/kvm/kvm-all.c615
-rw-r--r--accel/kvm/trace-events7
-rw-r--r--accel/tcg/cpu-exec.c5
-rw-r--r--accel/tcg/cputlb.c233
-rw-r--r--accel/tcg/tb-context.h41
-rw-r--r--accel/tcg/tb-hash.h69
-rw-r--r--accel/tcg/tb-lookup.h49
-rw-r--r--accel/tcg/tcg-runtime-gvec.c36
-rw-r--r--accel/tcg/tcg-runtime.c2
-rw-r--r--accel/tcg/translate-all.c15
12 files changed, 875 insertions, 232 deletions
diff --git a/accel/Kconfig b/accel/Kconfig
index 461104c771..8bdedb7d15 100644
--- a/accel/Kconfig
+++ b/accel/Kconfig
@@ -1,6 +1,9 @@
config WHPX
bool
+config NVMM
+ bool
+
config HAX
bool
diff --git a/accel/accel-common.c b/accel/accel-common.c
index 9901b0531c..cf07f78421 100644
--- a/accel/accel-common.c
+++ b/accel/accel-common.c
@@ -54,10 +54,23 @@ static void accel_init_cpu_int_aux(ObjectClass *klass, void *opaque)
CPUClass *cc = CPU_CLASS(klass);
AccelCPUClass *accel_cpu = opaque;
+ /*
+ * The first callback allows accel-cpu to run initializations
+ * for the CPU, customizing CPU behavior according to the accelerator.
+ *
+ * The second one allows the CPU to customize the accel-cpu
+ * behavior according to the CPU.
+ *
+ * The second is currently only used by TCG, to specialize the
+ * TCGCPUOps depending on the CPU type.
+ */
cc->accel_cpu = accel_cpu;
if (accel_cpu->cpu_class_init) {
accel_cpu->cpu_class_init(cc);
}
+ if (cc->init_accel_cpu) {
+ cc->init_accel_cpu(accel_cpu, cc);
+ }
}
/* initialize the arch-specific accel CpuClass interfaces */
@@ -89,6 +102,25 @@ void accel_init_interfaces(AccelClass *ac)
accel_init_cpu_interfaces(ac);
}
+void accel_cpu_instance_init(CPUState *cpu)
+{
+ CPUClass *cc = CPU_GET_CLASS(cpu);
+
+ if (cc->accel_cpu && cc->accel_cpu->cpu_instance_init) {
+ cc->accel_cpu->cpu_instance_init(cpu);
+ }
+}
+
+bool accel_cpu_realizefn(CPUState *cpu, Error **errp)
+{
+ CPUClass *cc = CPU_GET_CLASS(cpu);
+
+ if (cc->accel_cpu && cc->accel_cpu->cpu_realizefn) {
+ return cc->accel_cpu->cpu_realizefn(cpu, errp);
+ }
+ return true;
+}
+
static const TypeInfo accel_cpu_type = {
.name = TYPE_ACCEL_CPU,
.parent = TYPE_OBJECT,
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 4e0168e88b..c7ec538850 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -15,6 +15,7 @@
#include "qemu/osdep.h"
#include <sys/ioctl.h>
+#include <poll.h>
#include <linux/kvm.h>
@@ -78,6 +79,25 @@ struct KVMParkedVcpu {
QLIST_ENTRY(KVMParkedVcpu) node;
};
+enum KVMDirtyRingReaperState {
+ KVM_DIRTY_RING_REAPER_NONE = 0,
+ /* The reaper is sleeping */
+ KVM_DIRTY_RING_REAPER_WAIT,
+ /* The reaper is reaping for dirty pages */
+ KVM_DIRTY_RING_REAPER_REAPING,
+};
+
+/*
+ * KVM reaper instance, responsible for collecting the KVM dirty bits
+ * via the dirty ring.
+ */
+struct KVMDirtyRingReaper {
+ /* The reaper thread */
+ QemuThread reaper_thr;
+ volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
+ volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
+};
+
struct KVMState
{
AccelState parent_obj;
@@ -126,6 +146,9 @@ struct KVMState
KVMMemoryListener *ml;
AddressSpace *as;
} *as;
+ uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */
+ uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */
+ struct KVMDirtyRingReaper reaper;
};
KVMState *kvm_state;
@@ -172,8 +195,12 @@ typedef struct KVMResampleFd KVMResampleFd;
static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
-#define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock)
-#define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock)
+static QemuMutex kml_slots_lock;
+
+#define kvm_slots_lock() qemu_mutex_lock(&kml_slots_lock)
+#define kvm_slots_unlock() qemu_mutex_unlock(&kml_slots_lock)
+
+static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
static inline void kvm_resample_fd_remove(int gsi)
{
@@ -239,9 +266,9 @@ bool kvm_has_free_slot(MachineState *ms)
bool result;
KVMMemoryListener *kml = &s->memory_listener;
- kvm_slots_lock(kml);
+ kvm_slots_lock();
result = !!kvm_get_free_slot(kml);
- kvm_slots_unlock(kml);
+ kvm_slots_unlock();
return result;
}
@@ -307,7 +334,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
KVMMemoryListener *kml = &s->memory_listener;
int i, ret = 0;
- kvm_slots_lock(kml);
+ kvm_slots_lock();
for (i = 0; i < s->nr_slots; i++) {
KVMSlot *mem = &kml->slots[i];
@@ -317,7 +344,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
break;
}
}
- kvm_slots_unlock(kml);
+ kvm_slots_unlock();
return ret;
}
@@ -383,6 +410,13 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
goto err;
}
+ if (cpu->kvm_dirty_gfns) {
+ ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
+ if (ret < 0) {
+ goto err;
+ }
+ }
+
vcpu = g_malloc0(sizeof(*vcpu));
vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
vcpu->kvm_fd = cpu->kvm_fd;
@@ -459,6 +493,19 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
(void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
}
+ if (s->kvm_dirty_ring_size) {
+ /* Use MAP_SHARED to share pages with the kernel */
+ cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ cpu->kvm_fd,
+ PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
+ if (cpu->kvm_dirty_gfns == MAP_FAILED) {
+ ret = -errno;
+ DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
+ goto err;
+ }
+ }
+
ret = kvm_arch_init_vcpu(cpu);
if (ret < 0) {
error_setg_errno(errp, -ret,
@@ -498,6 +545,7 @@ static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
return 0;
}
+ kvm_slot_init_dirty_bitmap(mem);
return kvm_set_user_memory_region(kml, mem, false);
}
@@ -513,7 +561,7 @@ static int kvm_section_update_flags(KVMMemoryListener *kml,
return 0;
}
- kvm_slots_lock(kml);
+ kvm_slots_lock();
while (size && !ret) {
slot_size = MIN(kvm_max_slot_size, size);
@@ -529,7 +577,7 @@ static int kvm_section_update_flags(KVMMemoryListener *kml,
}
out:
- kvm_slots_unlock(kml);
+ kvm_slots_unlock();
return ret;
}
@@ -568,22 +616,28 @@ static void kvm_log_stop(MemoryListener *listener,
}
/* get kvm's dirty pages bitmap and update qemu's */
-static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
- unsigned long *bitmap)
+static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
{
- ram_addr_t start = section->offset_within_region +
- memory_region_get_ram_addr(section->mr);
- ram_addr_t pages = int128_get64(section->size) / qemu_real_host_page_size;
+ ram_addr_t start = slot->ram_start_offset;
+ ram_addr_t pages = slot->memory_size / qemu_real_host_page_size;
- cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
- return 0;
+ cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
+}
+
+static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
+{
+ memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
}
#define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
/* Allocate the dirty bitmap for a slot */
-static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
+static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
{
+ if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
+ return;
+ }
+
/*
* XXX bad kernel interface alert
* For dirty bitmap, kernel allocates array of size aligned to
@@ -604,6 +658,196 @@ static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size,
/*HOST_LONG_BITS*/ 64) / 8;
mem->dirty_bmap = g_malloc0(bitmap_size);
+ mem->dirty_bmap_size = bitmap_size;
+}
+
+/*
+ * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
+ * succeeded, false otherwise
+ */
+static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
+{
+ struct kvm_dirty_log d = {};
+ int ret;
+
+ d.dirty_bitmap = slot->dirty_bmap;
+ d.slot = slot->slot | (slot->as_id << 16);
+ ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
+
+ if (ret == -ENOENT) {
+ /* kernel does not have dirty bitmap in this slot */
+ ret = 0;
+ }
+ if (ret) {
+ error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
+ __func__, ret);
+ }
+ return ret == 0;
+}
+
+/* Should be with all slots_lock held for the address spaces. */
+static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
+ uint32_t slot_id, uint64_t offset)
+{
+ KVMMemoryListener *kml;
+ KVMSlot *mem;
+
+ if (as_id >= s->nr_as) {
+ return;
+ }
+
+ kml = s->as[as_id].ml;
+ mem = &kml->slots[slot_id];
+
+ if (!mem->memory_size || offset >=
+ (mem->memory_size / qemu_real_host_page_size)) {
+ return;
+ }
+
+ set_bit(offset, mem->dirty_bmap);
+}
+
+static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
+{
+ return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
+}
+
+static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
+{
+ gfn->flags = KVM_DIRTY_GFN_F_RESET;
+}
+
+/*
+ * Should be with all slots_lock held for the address spaces. It returns the
+ * dirty page we've collected on this dirty ring.
+ */
+static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
+{
+ struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
+ uint32_t ring_size = s->kvm_dirty_ring_size;
+ uint32_t count = 0, fetch = cpu->kvm_fetch_index;
+
+ assert(dirty_gfns && ring_size);
+ trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
+
+ while (true) {
+ cur = &dirty_gfns[fetch % ring_size];
+ if (!dirty_gfn_is_dirtied(cur)) {
+ break;
+ }
+ kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
+ cur->offset);
+ dirty_gfn_set_collected(cur);
+ trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
+ fetch++;
+ count++;
+ }
+ cpu->kvm_fetch_index = fetch;
+
+ return count;
+}
+
+/* Must be with slots_lock held */
+static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
+{
+ int ret;
+ CPUState *cpu;
+ uint64_t total = 0;
+ int64_t stamp;
+
+ stamp = get_clock();
+
+ CPU_FOREACH(cpu) {
+ total += kvm_dirty_ring_reap_one(s, cpu);
+ }
+
+ if (total) {
+ ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
+ assert(ret == total);
+ }
+
+ stamp = get_clock() - stamp;
+
+ if (total) {
+ trace_kvm_dirty_ring_reap(total, stamp / 1000);
+ }
+
+ return total;
+}
+
+/*
+ * Currently for simplicity, we must hold BQL before calling this. We can
+ * consider to drop the BQL if we're clear with all the race conditions.
+ */
+static uint64_t kvm_dirty_ring_reap(KVMState *s)
+{
+ uint64_t total;
+
+ /*
+ * We need to lock all kvm slots for all address spaces here,
+ * because:
+ *
+ * (1) We need to mark dirty for dirty bitmaps in multiple slots
+ * and for tons of pages, so it's better to take the lock here
+ * once rather than once per page. And more importantly,
+ *
+ * (2) We must _NOT_ publish dirty bits to the other threads
+ * (e.g., the migration thread) via the kvm memory slot dirty
+ * bitmaps before correctly re-protect those dirtied pages.
+ * Otherwise we can have potential risk of data corruption if
+ * the page data is read in the other thread before we do
+ * reset below.
+ */
+ kvm_slots_lock();
+ total = kvm_dirty_ring_reap_locked(s);
+ kvm_slots_unlock();
+
+ return total;
+}
+
+static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
+{
+ /* No need to do anything */
+}
+
+/*
+ * Kick all vcpus out in a synchronized way. When returned, we
+ * guarantee that every vcpu has been kicked and at least returned to
+ * userspace once.
+ */
+static void kvm_cpu_synchronize_kick_all(void)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
+ }
+}
+
+/*
+ * Flush all the existing dirty pages to the KVM slot buffers. When
+ * this call returns, we guarantee that all the touched dirty pages
+ * before calling this function have been put into the per-kvmslot
+ * dirty bitmap.
+ *
+ * This function must be called with BQL held.
+ */
+static void kvm_dirty_ring_flush(void)
+{
+ trace_kvm_dirty_ring_flush(0);
+ /*
+ * The function needs to be serialized. Since this function
+ * should always be with BQL held, serialization is guaranteed.
+ * However, let's be sure of it.
+ */
+ assert(qemu_mutex_iothread_locked());
+ /*
+ * First make sure to flush the hardware buffers by kicking all
+ * vcpus out in a synchronous way.
+ */
+ kvm_cpu_synchronize_kick_all();
+ kvm_dirty_ring_reap(kvm_state);
+ trace_kvm_dirty_ring_flush(1);
}
/**
@@ -617,53 +861,28 @@ static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
* @kml: the KVM memory listener object
* @section: the memory section to sync the dirty bitmap with
*/
-static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
- MemoryRegionSection *section)
+static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
+ MemoryRegionSection *section)
{
KVMState *s = kvm_state;
- struct kvm_dirty_log d = {};
KVMSlot *mem;
hwaddr start_addr, size;
- hwaddr slot_size, slot_offset = 0;
- int ret = 0;
+ hwaddr slot_size;
size = kvm_align_section(section, &start_addr);
while (size) {
- MemoryRegionSection subsection = *section;
-
slot_size = MIN(kvm_max_slot_size, size);
mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
if (!mem) {
/* We don't have a slot if we want to trap every access. */
- goto out;
- }
-
- if (!mem->dirty_bmap) {
- /* Allocate on the first log_sync, once and for all */
- kvm_memslot_init_dirty_bitmap(mem);
+ return;
}
-
- d.dirty_bitmap = mem->dirty_bmap;
- d.slot = mem->slot | (kml->as_id << 16);
- ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
- if (ret == -ENOENT) {
- /* kernel does not have dirty bitmap in this slot */
- ret = 0;
- } else if (ret < 0) {
- error_report("ioctl KVM_GET_DIRTY_LOG failed: %d", errno);
- goto out;
- } else {
- subsection.offset_within_region += slot_offset;
- subsection.size = int128_make64(slot_size);
- kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap);
+ if (kvm_slot_get_dirty_log(s, mem)) {
+ kvm_slot_sync_dirty_pages(mem);
}
-
- slot_offset += slot_size;
start_addr += slot_size;
size -= slot_size;
}
-out:
- return ret;
}
/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
@@ -810,7 +1029,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
return ret;
}
- kvm_slots_lock(kml);
+ kvm_slots_lock();
for (i = 0; i < s->nr_slots; i++) {
mem = &kml->slots[i];
@@ -836,7 +1055,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
}
}
- kvm_slots_unlock(kml);
+ kvm_slots_unlock();
return ret;
}
@@ -1119,7 +1338,8 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
int err;
MemoryRegion *mr = section->mr;
bool writeable = !mr->readonly && !mr->rom_device;
- hwaddr start_addr, size, slot_size;
+ hwaddr start_addr, size, slot_size, mr_offset;
+ ram_addr_t ram_start_offset;
void *ram;
if (!memory_region_is_ram(mr)) {
@@ -1137,11 +1357,15 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
return;
}
- /* use aligned delta to align the ram address */
- ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
- (start_addr - section->offset_within_address_space);
+ /* The offset of the kvmslot within the memory region */
+ mr_offset = section->offset_within_region + start_addr -
+ section->offset_within_address_space;
- kvm_slots_lock(kml);
+ /* use aligned delta to align the ram address and offset */
+ ram = memory_region_get_ram_ptr(mr) + mr_offset;
+ ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
+
+ kvm_slots_lock();
if (!add) {
do {
@@ -1151,7 +1375,25 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
goto out;
}
if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
- kvm_physical_sync_dirty_bitmap(kml, section);
+ /*
+ * NOTE: We should be aware of the fact that here we're only
+ * doing a best effort to sync dirty bits. No matter whether
+ * we're using dirty log or dirty ring, we ignored two facts:
+ *
+ * (1) dirty bits can reside in hardware buffers (PML)
+ *
+ * (2) after we collected dirty bits here, pages can be dirtied
+ * again before we do the final KVM_SET_USER_MEMORY_REGION to
+ * remove the slot.
+ *
+ * Not easy. Let's cross the fingers until it's fixed.
+ */
+ if (kvm_state->kvm_dirty_ring_size) {
+ kvm_dirty_ring_reap_locked(kvm_state);
+ } else {
+ kvm_slot_get_dirty_log(kvm_state, mem);
+ }
+ kvm_slot_sync_dirty_pages(mem);
}
/* unregister the slot */
@@ -1175,18 +1417,13 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
do {
slot_size = MIN(kvm_max_slot_size, size);
mem = kvm_alloc_slot(kml);
+ mem->as_id = kml->as_id;
mem->memory_size = slot_size;
mem->start_addr = start_addr;
+ mem->ram_start_offset = ram_start_offset;
mem->ram = ram;
mem->flags = kvm_mem_flags(mr);
-
- if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
- /*
- * Reallocate the bmap; it means it doesn't disappear in
- * middle of a migrate.
- */
- kvm_memslot_init_dirty_bitmap(mem);
- }
+ kvm_slot_init_dirty_bitmap(mem);
err = kvm_set_user_memory_region(kml, mem, true);
if (err) {
fprintf(stderr, "%s: error registering slot: %s\n", __func__,
@@ -1194,12 +1431,58 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
abort();
}
start_addr += slot_size;
+ ram_start_offset += slot_size;
ram += slot_size;
size -= slot_size;
} while (size);
out:
- kvm_slots_unlock(kml);
+ kvm_slots_unlock();
+}
+
+static void *kvm_dirty_ring_reaper_thread(void *data)
+{
+ KVMState *s = data;
+ struct KVMDirtyRingReaper *r = &s->reaper;
+
+ rcu_register_thread();
+
+ trace_kvm_dirty_ring_reaper("init");
+
+ while (true) {
+ r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
+ trace_kvm_dirty_ring_reaper("wait");
+ /*
+ * TODO: provide a smarter timeout rather than a constant?
+ */
+ sleep(1);
+
+ trace_kvm_dirty_ring_reaper("wakeup");
+ r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
+
+ qemu_mutex_lock_iothread();
+ kvm_dirty_ring_reap(s);
+ qemu_mutex_unlock_iothread();
+
+ r->reaper_iteration++;
+ }
+
+ trace_kvm_dirty_ring_reaper("exit");
+
+ rcu_unregister_thread();
+
+ return NULL;
+}
+
+static int kvm_dirty_ring_reaper_init(KVMState *s)
+{
+ struct KVMDirtyRingReaper *r = &s->reaper;
+
+ qemu_thread_create(&r->reaper_thr, "kvm-reaper",
+ kvm_dirty_ring_reaper_thread,
+ s, QEMU_THREAD_JOINABLE);
+
+ return 0;
}
static void kvm_region_add(MemoryListener *listener,
@@ -1224,14 +1507,40 @@ static void kvm_log_sync(MemoryListener *listener,
MemoryRegionSection *section)
{
KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
- int r;
- kvm_slots_lock(kml);
- r = kvm_physical_sync_dirty_bitmap(kml, section);
- kvm_slots_unlock(kml);
- if (r < 0) {
- abort();
+ kvm_slots_lock();
+ kvm_physical_sync_dirty_bitmap(kml, section);
+ kvm_slots_unlock();
+}
+
+static void kvm_log_sync_global(MemoryListener *l)
+{
+ KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
+ KVMState *s = kvm_state;
+ KVMSlot *mem;
+ int i;
+
+ /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
+ kvm_dirty_ring_flush();
+
+ /*
+ * TODO: make this faster when nr_slots is big while there are
+ * only a few used slots (small VMs).
+ */
+ kvm_slots_lock();
+ for (i = 0; i < s->nr_slots; i++) {
+ mem = &kml->slots[i];
+ if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ kvm_slot_sync_dirty_pages(mem);
+ /*
+ * This is not needed by KVM_GET_DIRTY_LOG because the
+ * ioctl will unconditionally overwrite the whole region.
+ * However kvm dirty ring has no such side effect.
+ */
+ kvm_slot_reset_dirty_pages(mem);
+ }
}
+ kvm_slots_unlock();
}
static void kvm_log_clear(MemoryListener *listener,
@@ -1328,7 +1637,6 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
{
int i;
- qemu_mutex_init(&kml->slots_lock);
kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
kml->as_id = as_id;
@@ -1340,10 +1648,15 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
kml->listener.region_del = kvm_region_del;
kml->listener.log_start = kvm_log_start;
kml->listener.log_stop = kvm_log_stop;
- kml->listener.log_sync = kvm_log_sync;
- kml->listener.log_clear = kvm_log_clear;
kml->listener.priority = 10;
+ if (s->kvm_dirty_ring_size) {
+ kml->listener.log_sync_global = kvm_log_sync_global;
+ } else {
+ kml->listener.log_sync = kvm_log_sync;
+ kml->listener.log_clear = kvm_log_clear;
+ }
+
memory_listener_register(&kml->listener, as);
for (i = 0; i < s->nr_as; ++i) {
@@ -2001,6 +2314,8 @@ static int kvm_init(MachineState *ms)
int type = 0;
uint64_t dirty_log_manual_caps;
+ qemu_mutex_init(&kml_slots_lock);
+
s = KVM_STATE(ms->accelerator);
/*
@@ -2017,7 +2332,6 @@ static int kvm_init(MachineState *ms)
QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
QLIST_INIT(&s->kvm_parked_vcpus);
- s->vmfd = -1;
s->fd = qemu_open_old("/dev/kvm", O_RDWR);
if (s->fd == -1) {
fprintf(stderr, "Could not access KVM kernel module: %m\n");
@@ -2125,20 +2439,70 @@ static int kvm_init(MachineState *ms)
s->coalesced_pio = s->coalesced_mmio &&
kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
- dirty_log_manual_caps =
- kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
- dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
- KVM_DIRTY_LOG_INITIALLY_SET);
- s->manual_dirty_log_protect = dirty_log_manual_caps;
- if (dirty_log_manual_caps) {
- ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
- dirty_log_manual_caps);
- if (ret) {
- warn_report("Trying to enable capability %"PRIu64" of "
- "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
- "Falling back to the legacy mode. ",
- dirty_log_manual_caps);
- s->manual_dirty_log_protect = 0;
+ /*
+ * Enable KVM dirty ring if supported, otherwise fall back to
+ * dirty logging mode
+ */
+ if (s->kvm_dirty_ring_size > 0) {
+ uint64_t ring_bytes;
+
+ ring_bytes = s->kvm_dirty_ring_size * sizeof(struct kvm_dirty_gfn);
+
+ /* Read the max supported pages */
+ ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING);
+ if (ret > 0) {
+ if (ring_bytes > ret) {
+ error_report("KVM dirty ring size %" PRIu32 " too big "
+ "(maximum is %ld). Please use a smaller value.",
+ s->kvm_dirty_ring_size,
+ (long)ret / sizeof(struct kvm_dirty_gfn));
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, ring_bytes);
+ if (ret) {
+ error_report("Enabling of KVM dirty ring failed: %s. "
+ "Suggested mininum value is 1024.", strerror(-ret));
+ goto err;
+ }
+
+ s->kvm_dirty_ring_bytes = ring_bytes;
+ } else {
+ warn_report("KVM dirty ring not available, using bitmap method");
+ s->kvm_dirty_ring_size = 0;
+ }
+ }
+
+ /*
+ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
+ * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
+ * page is wr-protected initially, which is against how kvm dirty ring is
+ * usage - kvm dirty ring requires all pages are wr-protected at the very
+ * beginning. Enabling this feature for dirty ring causes data corruption.
+ *
+ * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
+ * we may expect a higher stall time when starting the migration. In the
+ * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
+ * instead of clearing dirty bit, it can be a way to explicitly wr-protect
+ * guest pages.
+ */
+ if (!s->kvm_dirty_ring_size) {
+ dirty_log_manual_caps =
+ kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+ dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+ KVM_DIRTY_LOG_INITIALLY_SET);
+ s->manual_dirty_log_protect = dirty_log_manual_caps;
+ if (dirty_log_manual_caps) {
+ ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
+ dirty_log_manual_caps);
+ if (ret) {
+ warn_report("Trying to enable capability %"PRIu64" of "
+ "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
+ "Falling back to the legacy mode. ",
+ dirty_log_manual_caps);
+ s->manual_dirty_log_protect = 0;
+ }
}
}
@@ -2224,6 +2588,14 @@ static int kvm_init(MachineState *ms)
ret = ram_block_discard_disable(true);
assert(!ret);
}
+
+ if (s->kvm_dirty_ring_size) {
+ ret = kvm_dirty_ring_reaper_init(s);
+ if (ret) {
+ goto err;
+ }
+ }
+
return 0;
err:
@@ -2536,6 +2908,17 @@ int kvm_cpu_exec(CPUState *cpu)
case KVM_EXIT_INTERNAL_ERROR:
ret = kvm_handle_internal_error(cpu, run);
break;
+ case KVM_EXIT_DIRTY_RING_FULL:
+ /*
+ * We shouldn't continue if the dirty ring of this vcpu is
+ * still full. Got kicked by KVM_RESET_DIRTY_RINGS.
+ */
+ trace_kvm_dirty_ring_full(cpu->cpu_index);
+ qemu_mutex_lock_iothread();
+ kvm_dirty_ring_reap(kvm_state);
+ qemu_mutex_unlock_iothread();
+ ret = 0;
+ break;
case KVM_EXIT_SYSTEM_EVENT:
switch (run->system_event.type) {
case KVM_SYSTEM_EVENT_SHUTDOWN:
@@ -3112,6 +3495,11 @@ static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
KVMState *s = KVM_STATE(obj);
int64_t value;
+ if (s->fd != -1) {
+ error_setg(errp, "Cannot set properties after the accelerator has been initialized");
+ return;
+ }
+
if (!visit_type_int(v, name, &value, errp)) {
return;
}
@@ -3126,6 +3514,11 @@ static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
KVMState *s = KVM_STATE(obj);
OnOffSplit mode;
+ if (s->fd != -1) {
+ error_setg(errp, "Cannot set properties after the accelerator has been initialized");
+ return;
+ }
+
if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
return;
}
@@ -3168,13 +3561,53 @@ bool kvm_kernel_irqchip_split(void)
return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
}
+static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ KVMState *s = KVM_STATE(obj);
+ uint32_t value = s->kvm_dirty_ring_size;
+
+ visit_type_uint32(v, name, &value, errp);
+}
+
+static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ KVMState *s = KVM_STATE(obj);
+ Error *error = NULL;
+ uint32_t value;
+
+ if (s->fd != -1) {
+ error_setg(errp, "Cannot set properties after the accelerator has been initialized");
+ return;
+ }
+
+ visit_type_uint32(v, name, &value, &error);
+ if (error) {
+ error_propagate(errp, error);
+ return;
+ }
+ if (value & (value - 1)) {
+ error_setg(errp, "dirty-ring-size must be a power of two.");
+ return;
+ }
+
+ s->kvm_dirty_ring_size = value;
+}
+
static void kvm_accel_instance_init(Object *obj)
{
KVMState *s = KVM_STATE(obj);
+ s->fd = -1;
+ s->vmfd = -1;
s->kvm_shadow_mem = -1;
s->kernel_irqchip_allowed = true;
s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
+ /* KVM dirty ring is by default off */
+ s->kvm_dirty_ring_size = 0;
}
static void kvm_accel_class_init(ObjectClass *oc, void *data)
@@ -3196,6 +3629,12 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data)
NULL, NULL);
object_class_property_set_description(oc, "kvm-shadow-mem",
"KVM shadow MMU size");
+
+ object_class_property_add(oc, "dirty-ring-size", "uint32",
+ kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
+ NULL, NULL);
+ object_class_property_set_description(oc, "dirty-ring-size",
+ "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
}
static const TypeInfo kvm_accel_type = {
diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
index e15ae8980d..72a01320a1 100644
--- a/accel/kvm/trace-events
+++ b/accel/kvm/trace-events
@@ -18,4 +18,11 @@ kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t
kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
kvm_resample_fd_notify(int gsi) "gsi %d"
+kvm_dirty_ring_full(int id) "vcpu %d"
+kvm_dirty_ring_reap_vcpu(int id) "vcpu %d"
+kvm_dirty_ring_page(int vcpu, uint32_t slot, uint64_t offset) "vcpu %d fetch %"PRIu32" offset 0x%"PRIx64
+kvm_dirty_ring_reaper(const char *s) "%s"
+kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)"
+kvm_dirty_ring_reaper_kick(const char *reason) "%s"
+kvm_dirty_ring_flush(int finished) "%d"
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index 0dc5271715..ad1279d2ed 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -29,8 +29,6 @@
#include "qemu/compiler.h"
#include "qemu/timer.h"
#include "qemu/rcu.h"
-#include "exec/tb-hash.h"
-#include "exec/tb-lookup.h"
#include "exec/log.h"
#include "qemu/main-loop.h"
#if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
@@ -40,6 +38,9 @@
#include "exec/cpu-all.h"
#include "sysemu/cpu-timers.h"
#include "sysemu/replay.h"
+#include "tb-hash.h"
+#include "tb-lookup.h"
+#include "tb-context.h"
#include "internal.h"
/* -icount align implementation. */
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 84e7d91a5c..f24348e979 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -24,7 +24,6 @@
#include "exec/memory.h"
#include "exec/cpu_ldst.h"
#include "exec/cputlb.h"
-#include "exec/tb-hash.h"
#include "exec/memory-internal.h"
#include "exec/ram_addr.h"
#include "tcg/tcg.h"
@@ -36,6 +35,7 @@
#include "exec/translate-all.h"
#include "trace/trace-root.h"
#include "trace/mem.h"
+#include "tb-hash.h"
#include "internal.h"
#ifdef CONFIG_PLUGIN
#include "qemu/plugin-memory.h"
@@ -707,8 +707,9 @@ void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr)
tlb_flush_page_by_mmuidx_all_cpus_synced(src, addr, ALL_MMUIDX_BITS);
}
-static void tlb_flush_page_bits_locked(CPUArchState *env, int midx,
- target_ulong page, unsigned bits)
+static void tlb_flush_range_locked(CPUArchState *env, int midx,
+ target_ulong addr, target_ulong len,
+ unsigned bits)
{
CPUTLBDesc *d = &env_tlb(env)->d[midx];
CPUTLBDescFast *f = &env_tlb(env)->f[midx];
@@ -718,20 +719,26 @@ static void tlb_flush_page_bits_locked(CPUArchState *env, int midx,
* If @bits is smaller than the tlb size, there may be multiple entries
* within the TLB; otherwise all addresses that match under @mask hit
* the same TLB entry.
- *
* TODO: Perhaps allow bits to be a few bits less than the size.
* For now, just flush the entire TLB.
+ *
+ * If @len is larger than the tlb size, then it will take longer to
+ * test all of the entries in the TLB than it will to flush it all.
*/
- if (mask < f->mask) {
+ if (mask < f->mask || len > f->mask) {
tlb_debug("forcing full flush midx %d ("
- TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
- midx, page, mask);
+ TARGET_FMT_lx "/" TARGET_FMT_lx "+" TARGET_FMT_lx ")\n",
+ midx, addr, mask, len);
tlb_flush_one_mmuidx_locked(env, midx, get_clock_realtime());
return;
}
- /* Check if we need to flush due to large pages. */
- if ((page & d->large_page_mask) == d->large_page_addr) {
+ /*
+ * Check if we need to flush due to large pages.
+ * Because large_page_mask contains all 1's from the msb,
+ * we only need to test the end of the range.
+ */
+ if (((addr + len - 1) & d->large_page_mask) == d->large_page_addr) {
tlb_debug("forcing full flush midx %d ("
TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
midx, d->large_page_addr, d->large_page_mask);
@@ -739,85 +746,67 @@ static void tlb_flush_page_bits_locked(CPUArchState *env, int midx,
return;
}
- if (tlb_flush_entry_mask_locked(tlb_entry(env, midx, page), page, mask)) {
- tlb_n_used_entries_dec(env, midx);
+ for (target_ulong i = 0; i < len; i += TARGET_PAGE_SIZE) {
+ target_ulong page = addr + i;
+ CPUTLBEntry *entry = tlb_entry(env, midx, page);
+
+ if (tlb_flush_entry_mask_locked(entry, page, mask)) {
+ tlb_n_used_entries_dec(env, midx);
+ }
+ tlb_flush_vtlb_page_mask_locked(env, midx, page, mask);
}
- tlb_flush_vtlb_page_mask_locked(env, midx, page, mask);
}
typedef struct {
target_ulong addr;
+ target_ulong len;
uint16_t idxmap;
uint16_t bits;
-} TLBFlushPageBitsByMMUIdxData;
+} TLBFlushRangeData;
-static void
-tlb_flush_page_bits_by_mmuidx_async_0(CPUState *cpu,
- TLBFlushPageBitsByMMUIdxData d)
+static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu,
+ TLBFlushRangeData d)
{
CPUArchState *env = cpu->env_ptr;
int mmu_idx;
assert_cpu_is_self(cpu);
- tlb_debug("page addr:" TARGET_FMT_lx "/%u mmu_map:0x%x\n",
- d.addr, d.bits, d.idxmap);
+ tlb_debug("range:" TARGET_FMT_lx "/%u+" TARGET_FMT_lx " mmu_map:0x%x\n",
+ d.addr, d.bits, d.len, d.idxmap);
qemu_spin_lock(&env_tlb(env)->c.lock);
for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
if ((d.idxmap >> mmu_idx) & 1) {
- tlb_flush_page_bits_locked(env, mmu_idx, d.addr, d.bits);
+ tlb_flush_range_locked(env, mmu_idx, d.addr, d.len, d.bits);
}
}
qemu_spin_unlock(&env_tlb(env)->c.lock);
- tb_flush_jmp_cache(cpu, d.addr);
-}
-
-static bool encode_pbm_to_runon(run_on_cpu_data *out,
- TLBFlushPageBitsByMMUIdxData d)
-{
- /* We need 6 bits to hold to hold @bits up to 63. */
- if (d.idxmap <= MAKE_64BIT_MASK(0, TARGET_PAGE_BITS - 6)) {
- *out = RUN_ON_CPU_TARGET_PTR(d.addr | (d.idxmap << 6) | d.bits);
- return true;
+ for (target_ulong i = 0; i < d.len; i += TARGET_PAGE_SIZE) {
+ tb_flush_jmp_cache(cpu, d.addr + i);
}
- return false;
-}
-
-static TLBFlushPageBitsByMMUIdxData
-decode_runon_to_pbm(run_on_cpu_data data)
-{
- target_ulong addr_map_bits = (target_ulong) data.target_ptr;
- return (TLBFlushPageBitsByMMUIdxData){
- .addr = addr_map_bits & TARGET_PAGE_MASK,
- .idxmap = (addr_map_bits & ~TARGET_PAGE_MASK) >> 6,
- .bits = addr_map_bits & 0x3f
- };
}
-static void tlb_flush_page_bits_by_mmuidx_async_1(CPUState *cpu,
- run_on_cpu_data runon)
+static void tlb_flush_range_by_mmuidx_async_1(CPUState *cpu,
+ run_on_cpu_data data)
{
- tlb_flush_page_bits_by_mmuidx_async_0(cpu, decode_runon_to_pbm(runon));
-}
-
-static void tlb_flush_page_bits_by_mmuidx_async_2(CPUState *cpu,
- run_on_cpu_data data)
-{
- TLBFlushPageBitsByMMUIdxData *d = data.host_ptr;
- tlb_flush_page_bits_by_mmuidx_async_0(cpu, *d);
+ TLBFlushRangeData *d = data.host_ptr;
+ tlb_flush_range_by_mmuidx_async_0(cpu, *d);
g_free(d);
}
-void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, target_ulong addr,
- uint16_t idxmap, unsigned bits)
+void tlb_flush_range_by_mmuidx(CPUState *cpu, target_ulong addr,
+ target_ulong len, uint16_t idxmap,
+ unsigned bits)
{
- TLBFlushPageBitsByMMUIdxData d;
- run_on_cpu_data runon;
+ TLBFlushRangeData d;
- /* If all bits are significant, this devolves to tlb_flush_page. */
- if (bits >= TARGET_LONG_BITS) {
+ /*
+ * If all bits are significant, and len is small,
+ * this devolves to tlb_flush_page.
+ */
+ if (bits >= TARGET_LONG_BITS && len <= TARGET_PAGE_SIZE) {
tlb_flush_page_by_mmuidx(cpu, addr, idxmap);
return;
}
@@ -829,34 +818,38 @@ void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, target_ulong addr,
/* This should already be page aligned */
d.addr = addr & TARGET_PAGE_MASK;
+ d.len = len;
d.idxmap = idxmap;
d.bits = bits;
if (qemu_cpu_is_self(cpu)) {
- tlb_flush_page_bits_by_mmuidx_async_0(cpu, d);
- } else if (encode_pbm_to_runon(&runon, d)) {
- async_run_on_cpu(cpu, tlb_flush_page_bits_by_mmuidx_async_1, runon);
+ tlb_flush_range_by_mmuidx_async_0(cpu, d);
} else {
- TLBFlushPageBitsByMMUIdxData *p
- = g_new(TLBFlushPageBitsByMMUIdxData, 1);
-
/* Otherwise allocate a structure, freed by the worker. */
- *p = d;
- async_run_on_cpu(cpu, tlb_flush_page_bits_by_mmuidx_async_2,
+ TLBFlushRangeData *p = g_memdup(&d, sizeof(d));
+ async_run_on_cpu(cpu, tlb_flush_range_by_mmuidx_async_1,
RUN_ON_CPU_HOST_PTR(p));
}
}
-void tlb_flush_page_bits_by_mmuidx_all_cpus(CPUState *src_cpu,
- target_ulong addr,
- uint16_t idxmap,
- unsigned bits)
+void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, target_ulong addr,
+ uint16_t idxmap, unsigned bits)
+{
+ tlb_flush_range_by_mmuidx(cpu, addr, TARGET_PAGE_SIZE, idxmap, bits);
+}
+
+void tlb_flush_range_by_mmuidx_all_cpus(CPUState *src_cpu,
+ target_ulong addr, target_ulong len,
+ uint16_t idxmap, unsigned bits)
{
- TLBFlushPageBitsByMMUIdxData d;
- run_on_cpu_data runon;
+ TLBFlushRangeData d;
+ CPUState *dst_cpu;
- /* If all bits are significant, this devolves to tlb_flush_page. */
- if (bits >= TARGET_LONG_BITS) {
+ /*
+ * If all bits are significant, and len is small,
+ * this devolves to tlb_flush_page.
+ */
+ if (bits >= TARGET_LONG_BITS && len <= TARGET_PAGE_SIZE) {
tlb_flush_page_by_mmuidx_all_cpus(src_cpu, addr, idxmap);
return;
}
@@ -868,40 +861,45 @@ void tlb_flush_page_bits_by_mmuidx_all_cpus(CPUState *src_cpu,
/* This should already be page aligned */
d.addr = addr & TARGET_PAGE_MASK;
+ d.len = len;
d.idxmap = idxmap;
d.bits = bits;
- if (encode_pbm_to_runon(&runon, d)) {
- flush_all_helper(src_cpu, tlb_flush_page_bits_by_mmuidx_async_1, runon);
- } else {
- CPUState *dst_cpu;
- TLBFlushPageBitsByMMUIdxData *p;
-
- /* Allocate a separate data block for each destination cpu. */
- CPU_FOREACH(dst_cpu) {
- if (dst_cpu != src_cpu) {
- p = g_new(TLBFlushPageBitsByMMUIdxData, 1);
- *p = d;
- async_run_on_cpu(dst_cpu,
- tlb_flush_page_bits_by_mmuidx_async_2,
- RUN_ON_CPU_HOST_PTR(p));
- }
+ /* Allocate a separate data block for each destination cpu. */
+ CPU_FOREACH(dst_cpu) {
+ if (dst_cpu != src_cpu) {
+ TLBFlushRangeData *p = g_memdup(&d, sizeof(d));
+ async_run_on_cpu(dst_cpu,
+ tlb_flush_range_by_mmuidx_async_1,
+ RUN_ON_CPU_HOST_PTR(p));
}
}
- tlb_flush_page_bits_by_mmuidx_async_0(src_cpu, d);
+ tlb_flush_range_by_mmuidx_async_0(src_cpu, d);
}
-void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
- target_ulong addr,
- uint16_t idxmap,
- unsigned bits)
+void tlb_flush_page_bits_by_mmuidx_all_cpus(CPUState *src_cpu,
+ target_ulong addr,
+ uint16_t idxmap, unsigned bits)
{
- TLBFlushPageBitsByMMUIdxData d;
- run_on_cpu_data runon;
+ tlb_flush_range_by_mmuidx_all_cpus(src_cpu, addr, TARGET_PAGE_SIZE,
+ idxmap, bits);
+}
+
+void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
+ target_ulong addr,
+ target_ulong len,
+ uint16_t idxmap,
+ unsigned bits)
+{
+ TLBFlushRangeData d, *p;
+ CPUState *dst_cpu;
- /* If all bits are significant, this devolves to tlb_flush_page. */
- if (bits >= TARGET_LONG_BITS) {
+ /*
+ * If all bits are significant, and len is small,
+ * this devolves to tlb_flush_page.
+ */
+ if (bits >= TARGET_LONG_BITS && len <= TARGET_PAGE_SIZE) {
tlb_flush_page_by_mmuidx_all_cpus_synced(src_cpu, addr, idxmap);
return;
}
@@ -913,32 +911,31 @@ void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
/* This should already be page aligned */
d.addr = addr & TARGET_PAGE_MASK;
+ d.len = len;
d.idxmap = idxmap;
d.bits = bits;
- if (encode_pbm_to_runon(&runon, d)) {
- flush_all_helper(src_cpu, tlb_flush_page_bits_by_mmuidx_async_1, runon);
- async_safe_run_on_cpu(src_cpu, tlb_flush_page_bits_by_mmuidx_async_1,
- runon);
- } else {
- CPUState *dst_cpu;
- TLBFlushPageBitsByMMUIdxData *p;
-
- /* Allocate a separate data block for each destination cpu. */
- CPU_FOREACH(dst_cpu) {
- if (dst_cpu != src_cpu) {
- p = g_new(TLBFlushPageBitsByMMUIdxData, 1);
- *p = d;
- async_run_on_cpu(dst_cpu, tlb_flush_page_bits_by_mmuidx_async_2,
- RUN_ON_CPU_HOST_PTR(p));
- }
+ /* Allocate a separate data block for each destination cpu. */
+ CPU_FOREACH(dst_cpu) {
+ if (dst_cpu != src_cpu) {
+ p = g_memdup(&d, sizeof(d));
+ async_run_on_cpu(dst_cpu, tlb_flush_range_by_mmuidx_async_1,
+ RUN_ON_CPU_HOST_PTR(p));
}
-
- p = g_new(TLBFlushPageBitsByMMUIdxData, 1);
- *p = d;
- async_safe_run_on_cpu(src_cpu, tlb_flush_page_bits_by_mmuidx_async_2,
- RUN_ON_CPU_HOST_PTR(p));
}
+
+ p = g_memdup(&d, sizeof(d));
+ async_safe_run_on_cpu(src_cpu, tlb_flush_range_by_mmuidx_async_1,
+ RUN_ON_CPU_HOST_PTR(p));
+}
+
+void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu,
+ target_ulong addr,
+ uint16_t idxmap,
+ unsigned bits)
+{
+ tlb_flush_range_by_mmuidx_all_cpus_synced(src_cpu, addr, TARGET_PAGE_SIZE,
+ idxmap, bits);
}
/* update the TLBs so that writes to code in the virtual page 'addr'
diff --git a/accel/tcg/tb-context.h b/accel/tcg/tb-context.h
new file mode 100644
index 0000000000..cc33979113
--- /dev/null
+++ b/accel/tcg/tb-context.h
@@ -0,0 +1,41 @@
+/*
+ * Internal structs that QEMU exports to TCG
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef QEMU_TB_CONTEXT_H
+#define QEMU_TB_CONTEXT_H
+
+#include "qemu/thread.h"
+#include "qemu/qht.h"
+
+#define CODE_GEN_HTABLE_BITS 15
+#define CODE_GEN_HTABLE_SIZE (1 << CODE_GEN_HTABLE_BITS)
+
+typedef struct TBContext TBContext;
+
+struct TBContext {
+
+ struct qht htable;
+
+ /* statistics */
+ unsigned tb_flush_count;
+};
+
+extern TBContext tb_ctx;
+
+#endif
diff --git a/accel/tcg/tb-hash.h b/accel/tcg/tb-hash.h
new file mode 100644
index 0000000000..0a273d9605
--- /dev/null
+++ b/accel/tcg/tb-hash.h
@@ -0,0 +1,69 @@
+/*
+ * internal execution defines for qemu
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EXEC_TB_HASH_H
+#define EXEC_TB_HASH_H
+
+#include "exec/cpu-defs.h"
+#include "exec/exec-all.h"
+#include "qemu/xxhash.h"
+
+#ifdef CONFIG_SOFTMMU
+
+/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for
+ addresses on the same page. The top bits are the same. This allows
+ TLB invalidation to quickly clear a subset of the hash table. */
+#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2)
+#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS)
+#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1)
+#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE)
+
+static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc)
+{
+ target_ulong tmp;
+ tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS));
+ return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK;
+}
+
+static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
+{
+ target_ulong tmp;
+ tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS));
+ return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK)
+ | (tmp & TB_JMP_ADDR_MASK));
+}
+
+#else
+
+/* In user-mode we can get better hashing because we do not have a TLB */
+static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
+{
+ return (pc ^ (pc >> TB_JMP_CACHE_BITS)) & (TB_JMP_CACHE_SIZE - 1);
+}
+
+#endif /* CONFIG_SOFTMMU */
+
+static inline
+uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags,
+ uint32_t cf_mask, uint32_t trace_vcpu_dstate)
+{
+ return qemu_xxhash7(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate);
+}
+
+#endif
diff --git a/accel/tcg/tb-lookup.h b/accel/tcg/tb-lookup.h
new file mode 100644
index 0000000000..9c9e0079da
--- /dev/null
+++ b/accel/tcg/tb-lookup.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef EXEC_TB_LOOKUP_H
+#define EXEC_TB_LOOKUP_H
+
+#ifdef NEED_CPU_H
+#include "cpu.h"
+#else
+#include "exec/poison.h"
+#endif
+
+#include "exec/exec-all.h"
+#include "tb-hash.h"
+
+/* Might cause an exception, so have a longjmp destination ready */
+static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
+ target_ulong cs_base,
+ uint32_t flags, uint32_t cflags)
+{
+ TranslationBlock *tb;
+ uint32_t hash;
+
+ /* we should never be trying to look up an INVALID tb */
+ tcg_debug_assert(!(cflags & CF_INVALID));
+
+ hash = tb_jmp_cache_hash_func(pc);
+ tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]);
+
+ if (likely(tb &&
+ tb->pc == pc &&
+ tb->cs_base == cs_base &&
+ tb->flags == flags &&
+ tb->trace_vcpu_dstate == *cpu->trace_dstate &&
+ tb_cflags(tb) == cflags)) {
+ return tb;
+ }
+ tb = tb_htable_lookup(cpu, pc, cs_base, flags, cflags);
+ if (tb == NULL) {
+ return NULL;
+ }
+ qatomic_set(&cpu->tb_jmp_cache[hash], tb);
+ return tb;
+}
+
+#endif /* EXEC_TB_LOOKUP_H */
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index 521da4a813..ac7d28c251 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -1073,9 +1073,8 @@ void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc)
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
int32_t ai = *(int32_t *)(a + i);
int32_t bi = *(int32_t *)(b + i);
- int32_t di = ai + bi;
- if (((di ^ ai) &~ (ai ^ bi)) < 0) {
- /* Signed overflow. */
+ int32_t di;
+ if (sadd32_overflow(ai, bi, &di)) {
di = (di < 0 ? INT32_MAX : INT32_MIN);
}
*(int32_t *)(d + i) = di;
@@ -1091,9 +1090,8 @@ void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc)
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
int64_t ai = *(int64_t *)(a + i);
int64_t bi = *(int64_t *)(b + i);
- int64_t di = ai + bi;
- if (((di ^ ai) &~ (ai ^ bi)) < 0) {
- /* Signed overflow. */
+ int64_t di;
+ if (sadd64_overflow(ai, bi, &di)) {
di = (di < 0 ? INT64_MAX : INT64_MIN);
}
*(int64_t *)(d + i) = di;
@@ -1143,9 +1141,8 @@ void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc)
for (i = 0; i < oprsz; i += sizeof(int32_t)) {
int32_t ai = *(int32_t *)(a + i);
int32_t bi = *(int32_t *)(b + i);
- int32_t di = ai - bi;
- if (((di ^ ai) & (ai ^ bi)) < 0) {
- /* Signed overflow. */
+ int32_t di;
+ if (ssub32_overflow(ai, bi, &di)) {
di = (di < 0 ? INT32_MAX : INT32_MIN);
}
*(int32_t *)(d + i) = di;
@@ -1161,9 +1158,8 @@ void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc)
for (i = 0; i < oprsz; i += sizeof(int64_t)) {
int64_t ai = *(int64_t *)(a + i);
int64_t bi = *(int64_t *)(b + i);
- int64_t di = ai - bi;
- if (((di ^ ai) & (ai ^ bi)) < 0) {
- /* Signed overflow. */
+ int64_t di;
+ if (ssub64_overflow(ai, bi, &di)) {
di = (di < 0 ? INT64_MAX : INT64_MIN);
}
*(int64_t *)(d + i) = di;
@@ -1209,8 +1205,8 @@ void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc)
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
uint32_t ai = *(uint32_t *)(a + i);
uint32_t bi = *(uint32_t *)(b + i);
- uint32_t di = ai + bi;
- if (di < ai) {
+ uint32_t di;
+ if (uadd32_overflow(ai, bi, &di)) {
di = UINT32_MAX;
}
*(uint32_t *)(d + i) = di;
@@ -1226,8 +1222,8 @@ void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc)
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
uint64_t ai = *(uint64_t *)(a + i);
uint64_t bi = *(uint64_t *)(b + i);
- uint64_t di = ai + bi;
- if (di < ai) {
+ uint64_t di;
+ if (uadd64_overflow(ai, bi, &di)) {
di = UINT64_MAX;
}
*(uint64_t *)(d + i) = di;
@@ -1273,8 +1269,8 @@ void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc)
for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
uint32_t ai = *(uint32_t *)(a + i);
uint32_t bi = *(uint32_t *)(b + i);
- uint32_t di = ai - bi;
- if (ai < bi) {
+ uint32_t di;
+ if (usub32_overflow(ai, bi, &di)) {
di = 0;
}
*(uint32_t *)(d + i) = di;
@@ -1290,8 +1286,8 @@ void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
uint64_t ai = *(uint64_t *)(a + i);
uint64_t bi = *(uint64_t *)(b + i);
- uint64_t di = ai - bi;
- if (ai < bi) {
+ uint64_t di;
+ if (usub64_overflow(ai, bi, &di)) {
di = 0;
}
*(uint64_t *)(d + i) = di;
diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c
index 49f5de37e8..66ac830e2f 100644
--- a/accel/tcg/tcg-runtime.c
+++ b/accel/tcg/tcg-runtime.c
@@ -30,7 +30,7 @@
#include "disas/disas.h"
#include "exec/log.h"
#include "tcg/tcg.h"
-#include "exec/tb-lookup.h"
+#include "tb-lookup.h"
/* 32-bit helpers */
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index ae7e873713..1eefe6ea8d 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -47,7 +47,6 @@
#endif
#include "exec/cputlb.h"
-#include "exec/tb-hash.h"
#include "exec/translate-all.h"
#include "qemu/bitmap.h"
#include "qemu/error-report.h"
@@ -60,6 +59,8 @@
#include "sysemu/tcg.h"
#include "qapi/error.h"
#include "hw/core/tcg-cpu-ops.h"
+#include "tb-hash.h"
+#include "tb-context.h"
#include "internal.h"
/* #define DEBUG_TB_INVALIDATE */
@@ -1912,6 +1913,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
tcg_ctx->cpu = env_cpu(env);
gen_intermediate_code(cpu, tb, max_insns);
+ assert(tb->size != 0);
tcg_ctx->cpu = NULL;
max_insns = tb->icount;
@@ -2042,8 +2044,15 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
int i;
qemu_log(" data: [size=%d]\n", data_size);
for (i = 0; i < data_size / sizeof(tcg_target_ulong); i++) {
- qemu_log("0x%08" PRIxPTR ": .quad 0x%" TCG_PRIlx "\n",
- (uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
+ if (sizeof(tcg_target_ulong) == 8) {
+ qemu_log("0x%08" PRIxPTR ": .quad 0x%016" TCG_PRIlx "\n",
+ (uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
+ } else if (sizeof(tcg_target_ulong) == 4) {
+ qemu_log("0x%08" PRIxPTR ": .long 0x%08" TCG_PRIlx "\n",
+ (uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]);
+ } else {
+ qemu_build_not_reached();
+ }
}
}
qemu_log("\n");