diff options
Diffstat (limited to 'accel')
| -rw-r--r-- | accel/Kconfig | 3 | ||||
| -rw-r--r-- | accel/accel-common.c | 32 | ||||
| -rw-r--r-- | accel/kvm/kvm-all.c | 615 | ||||
| -rw-r--r-- | accel/kvm/trace-events | 7 | ||||
| -rw-r--r-- | accel/tcg/cpu-exec.c | 5 | ||||
| -rw-r--r-- | accel/tcg/cputlb.c | 233 | ||||
| -rw-r--r-- | accel/tcg/tb-context.h | 41 | ||||
| -rw-r--r-- | accel/tcg/tb-hash.h | 69 | ||||
| -rw-r--r-- | accel/tcg/tb-lookup.h | 49 | ||||
| -rw-r--r-- | accel/tcg/tcg-runtime-gvec.c | 36 | ||||
| -rw-r--r-- | accel/tcg/tcg-runtime.c | 2 | ||||
| -rw-r--r-- | accel/tcg/translate-all.c | 15 |
12 files changed, 875 insertions, 232 deletions
diff --git a/accel/Kconfig b/accel/Kconfig index 461104c771..8bdedb7d15 100644 --- a/accel/Kconfig +++ b/accel/Kconfig @@ -1,6 +1,9 @@ config WHPX bool +config NVMM + bool + config HAX bool diff --git a/accel/accel-common.c b/accel/accel-common.c index 9901b0531c..cf07f78421 100644 --- a/accel/accel-common.c +++ b/accel/accel-common.c @@ -54,10 +54,23 @@ static void accel_init_cpu_int_aux(ObjectClass *klass, void *opaque) CPUClass *cc = CPU_CLASS(klass); AccelCPUClass *accel_cpu = opaque; + /* + * The first callback allows accel-cpu to run initializations + * for the CPU, customizing CPU behavior according to the accelerator. + * + * The second one allows the CPU to customize the accel-cpu + * behavior according to the CPU. + * + * The second is currently only used by TCG, to specialize the + * TCGCPUOps depending on the CPU type. + */ cc->accel_cpu = accel_cpu; if (accel_cpu->cpu_class_init) { accel_cpu->cpu_class_init(cc); } + if (cc->init_accel_cpu) { + cc->init_accel_cpu(accel_cpu, cc); + } } /* initialize the arch-specific accel CpuClass interfaces */ @@ -89,6 +102,25 @@ void accel_init_interfaces(AccelClass *ac) accel_init_cpu_interfaces(ac); } +void accel_cpu_instance_init(CPUState *cpu) +{ + CPUClass *cc = CPU_GET_CLASS(cpu); + + if (cc->accel_cpu && cc->accel_cpu->cpu_instance_init) { + cc->accel_cpu->cpu_instance_init(cpu); + } +} + +bool accel_cpu_realizefn(CPUState *cpu, Error **errp) +{ + CPUClass *cc = CPU_GET_CLASS(cpu); + + if (cc->accel_cpu && cc->accel_cpu->cpu_realizefn) { + return cc->accel_cpu->cpu_realizefn(cpu, errp); + } + return true; +} + static const TypeInfo accel_cpu_type = { .name = TYPE_ACCEL_CPU, .parent = TYPE_OBJECT, diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index 4e0168e88b..c7ec538850 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -15,6 +15,7 @@ #include "qemu/osdep.h" #include <sys/ioctl.h> +#include <poll.h> #include <linux/kvm.h> @@ -78,6 +79,25 @@ struct KVMParkedVcpu { QLIST_ENTRY(KVMParkedVcpu) node; }; +enum KVMDirtyRingReaperState { + KVM_DIRTY_RING_REAPER_NONE = 0, + /* The reaper is sleeping */ + KVM_DIRTY_RING_REAPER_WAIT, + /* The reaper is reaping for dirty pages */ + KVM_DIRTY_RING_REAPER_REAPING, +}; + +/* + * KVM reaper instance, responsible for collecting the KVM dirty bits + * via the dirty ring. + */ +struct KVMDirtyRingReaper { + /* The reaper thread */ + QemuThread reaper_thr; + volatile uint64_t reaper_iteration; /* iteration number of reaper thr */ + volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */ +}; + struct KVMState { AccelState parent_obj; @@ -126,6 +146,9 @@ struct KVMState KVMMemoryListener *ml; AddressSpace *as; } *as; + uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */ + uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */ + struct KVMDirtyRingReaper reaper; }; KVMState *kvm_state; @@ -172,8 +195,12 @@ typedef struct KVMResampleFd KVMResampleFd; static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list = QLIST_HEAD_INITIALIZER(kvm_resample_fd_list); -#define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock) -#define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock) +static QemuMutex kml_slots_lock; + +#define kvm_slots_lock() qemu_mutex_lock(&kml_slots_lock) +#define kvm_slots_unlock() qemu_mutex_unlock(&kml_slots_lock) + +static void kvm_slot_init_dirty_bitmap(KVMSlot *mem); static inline void kvm_resample_fd_remove(int gsi) { @@ -239,9 +266,9 @@ bool kvm_has_free_slot(MachineState *ms) bool result; KVMMemoryListener *kml = &s->memory_listener; - kvm_slots_lock(kml); + kvm_slots_lock(); result = !!kvm_get_free_slot(kml); - kvm_slots_unlock(kml); + kvm_slots_unlock(); return result; } @@ -307,7 +334,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, KVMMemoryListener *kml = &s->memory_listener; int i, ret = 0; - kvm_slots_lock(kml); + kvm_slots_lock(); for (i = 0; i < s->nr_slots; i++) { KVMSlot *mem = &kml->slots[i]; @@ -317,7 +344,7 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, break; } } - kvm_slots_unlock(kml); + kvm_slots_unlock(); return ret; } @@ -383,6 +410,13 @@ static int do_kvm_destroy_vcpu(CPUState *cpu) goto err; } + if (cpu->kvm_dirty_gfns) { + ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size); + if (ret < 0) { + goto err; + } + } + vcpu = g_malloc0(sizeof(*vcpu)); vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); vcpu->kvm_fd = cpu->kvm_fd; @@ -459,6 +493,19 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp) (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; } + if (s->kvm_dirty_ring_size) { + /* Use MAP_SHARED to share pages with the kernel */ + cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size, + PROT_READ | PROT_WRITE, MAP_SHARED, + cpu->kvm_fd, + PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET); + if (cpu->kvm_dirty_gfns == MAP_FAILED) { + ret = -errno; + DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret); + goto err; + } + } + ret = kvm_arch_init_vcpu(cpu); if (ret < 0) { error_setg_errno(errp, -ret, @@ -498,6 +545,7 @@ static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, return 0; } + kvm_slot_init_dirty_bitmap(mem); return kvm_set_user_memory_region(kml, mem, false); } @@ -513,7 +561,7 @@ static int kvm_section_update_flags(KVMMemoryListener *kml, return 0; } - kvm_slots_lock(kml); + kvm_slots_lock(); while (size && !ret) { slot_size = MIN(kvm_max_slot_size, size); @@ -529,7 +577,7 @@ static int kvm_section_update_flags(KVMMemoryListener *kml, } out: - kvm_slots_unlock(kml); + kvm_slots_unlock(); return ret; } @@ -568,22 +616,28 @@ static void kvm_log_stop(MemoryListener *listener, } /* get kvm's dirty pages bitmap and update qemu's */ -static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, - unsigned long *bitmap) +static void kvm_slot_sync_dirty_pages(KVMSlot *slot) { - ram_addr_t start = section->offset_within_region + - memory_region_get_ram_addr(section->mr); - ram_addr_t pages = int128_get64(section->size) / qemu_real_host_page_size; + ram_addr_t start = slot->ram_start_offset; + ram_addr_t pages = slot->memory_size / qemu_real_host_page_size; - cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages); - return 0; + cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages); +} + +static void kvm_slot_reset_dirty_pages(KVMSlot *slot) +{ + memset(slot->dirty_bmap, 0, slot->dirty_bmap_size); } #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) /* Allocate the dirty bitmap for a slot */ -static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem) +static void kvm_slot_init_dirty_bitmap(KVMSlot *mem) { + if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) { + return; + } + /* * XXX bad kernel interface alert * For dirty bitmap, kernel allocates array of size aligned to @@ -604,6 +658,196 @@ static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem) hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size, /*HOST_LONG_BITS*/ 64) / 8; mem->dirty_bmap = g_malloc0(bitmap_size); + mem->dirty_bmap_size = bitmap_size; +} + +/* + * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if + * succeeded, false otherwise + */ +static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot) +{ + struct kvm_dirty_log d = {}; + int ret; + + d.dirty_bitmap = slot->dirty_bmap; + d.slot = slot->slot | (slot->as_id << 16); + ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d); + + if (ret == -ENOENT) { + /* kernel does not have dirty bitmap in this slot */ + ret = 0; + } + if (ret) { + error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d", + __func__, ret); + } + return ret == 0; +} + +/* Should be with all slots_lock held for the address spaces. */ +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id, + uint32_t slot_id, uint64_t offset) +{ + KVMMemoryListener *kml; + KVMSlot *mem; + + if (as_id >= s->nr_as) { + return; + } + + kml = s->as[as_id].ml; + mem = &kml->slots[slot_id]; + + if (!mem->memory_size || offset >= + (mem->memory_size / qemu_real_host_page_size)) { + return; + } + + set_bit(offset, mem->dirty_bmap); +} + +static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn) +{ + return gfn->flags == KVM_DIRTY_GFN_F_DIRTY; +} + +static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn) +{ + gfn->flags = KVM_DIRTY_GFN_F_RESET; +} + +/* + * Should be with all slots_lock held for the address spaces. It returns the + * dirty page we've collected on this dirty ring. + */ +static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu) +{ + struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur; + uint32_t ring_size = s->kvm_dirty_ring_size; + uint32_t count = 0, fetch = cpu->kvm_fetch_index; + + assert(dirty_gfns && ring_size); + trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index); + + while (true) { + cur = &dirty_gfns[fetch % ring_size]; + if (!dirty_gfn_is_dirtied(cur)) { + break; + } + kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff, + cur->offset); + dirty_gfn_set_collected(cur); + trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset); + fetch++; + count++; + } + cpu->kvm_fetch_index = fetch; + + return count; +} + +/* Must be with slots_lock held */ +static uint64_t kvm_dirty_ring_reap_locked(KVMState *s) +{ + int ret; + CPUState *cpu; + uint64_t total = 0; + int64_t stamp; + + stamp = get_clock(); + + CPU_FOREACH(cpu) { + total += kvm_dirty_ring_reap_one(s, cpu); + } + + if (total) { + ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS); + assert(ret == total); + } + + stamp = get_clock() - stamp; + + if (total) { + trace_kvm_dirty_ring_reap(total, stamp / 1000); + } + + return total; +} + +/* + * Currently for simplicity, we must hold BQL before calling this. We can + * consider to drop the BQL if we're clear with all the race conditions. + */ +static uint64_t kvm_dirty_ring_reap(KVMState *s) +{ + uint64_t total; + + /* + * We need to lock all kvm slots for all address spaces here, + * because: + * + * (1) We need to mark dirty for dirty bitmaps in multiple slots + * and for tons of pages, so it's better to take the lock here + * once rather than once per page. And more importantly, + * + * (2) We must _NOT_ publish dirty bits to the other threads + * (e.g., the migration thread) via the kvm memory slot dirty + * bitmaps before correctly re-protect those dirtied pages. + * Otherwise we can have potential risk of data corruption if + * the page data is read in the other thread before we do + * reset below. + */ + kvm_slots_lock(); + total = kvm_dirty_ring_reap_locked(s); + kvm_slots_unlock(); + + return total; +} + +static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg) +{ + /* No need to do anything */ +} + +/* + * Kick all vcpus out in a synchronized way. When returned, we + * guarantee that every vcpu has been kicked and at least returned to + * userspace once. + */ +static void kvm_cpu_synchronize_kick_all(void) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL); + } +} + +/* + * Flush all the existing dirty pages to the KVM slot buffers. When + * this call returns, we guarantee that all the touched dirty pages + * before calling this function have been put into the per-kvmslot + * dirty bitmap. + * + * This function must be called with BQL held. + */ +static void kvm_dirty_ring_flush(void) +{ + trace_kvm_dirty_ring_flush(0); + /* + * The function needs to be serialized. Since this function + * should always be with BQL held, serialization is guaranteed. + * However, let's be sure of it. + */ + assert(qemu_mutex_iothread_locked()); + /* + * First make sure to flush the hardware buffers by kicking all + * vcpus out in a synchronous way. + */ + kvm_cpu_synchronize_kick_all(); + kvm_dirty_ring_reap(kvm_state); + trace_kvm_dirty_ring_flush(1); } /** @@ -617,53 +861,28 @@ static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem) * @kml: the KVM memory listener object * @section: the memory section to sync the dirty bitmap with */ -static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, - MemoryRegionSection *section) +static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, + MemoryRegionSection *section) { KVMState *s = kvm_state; - struct kvm_dirty_log d = {}; KVMSlot *mem; hwaddr start_addr, size; - hwaddr slot_size, slot_offset = 0; - int ret = 0; + hwaddr slot_size; size = kvm_align_section(section, &start_addr); while (size) { - MemoryRegionSection subsection = *section; - slot_size = MIN(kvm_max_slot_size, size); mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); if (!mem) { /* We don't have a slot if we want to trap every access. */ - goto out; - } - - if (!mem->dirty_bmap) { - /* Allocate on the first log_sync, once and for all */ - kvm_memslot_init_dirty_bitmap(mem); + return; } - - d.dirty_bitmap = mem->dirty_bmap; - d.slot = mem->slot | (kml->as_id << 16); - ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d); - if (ret == -ENOENT) { - /* kernel does not have dirty bitmap in this slot */ - ret = 0; - } else if (ret < 0) { - error_report("ioctl KVM_GET_DIRTY_LOG failed: %d", errno); - goto out; - } else { - subsection.offset_within_region += slot_offset; - subsection.size = int128_make64(slot_size); - kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap); + if (kvm_slot_get_dirty_log(s, mem)) { + kvm_slot_sync_dirty_pages(mem); } - - slot_offset += slot_size; start_addr += slot_size; size -= slot_size; } -out: - return ret; } /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */ @@ -810,7 +1029,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml, return ret; } - kvm_slots_lock(kml); + kvm_slots_lock(); for (i = 0; i < s->nr_slots; i++) { mem = &kml->slots[i]; @@ -836,7 +1055,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml, } } - kvm_slots_unlock(kml); + kvm_slots_unlock(); return ret; } @@ -1119,7 +1338,8 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, int err; MemoryRegion *mr = section->mr; bool writeable = !mr->readonly && !mr->rom_device; - hwaddr start_addr, size, slot_size; + hwaddr start_addr, size, slot_size, mr_offset; + ram_addr_t ram_start_offset; void *ram; if (!memory_region_is_ram(mr)) { @@ -1137,11 +1357,15 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, return; } - /* use aligned delta to align the ram address */ - ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + - (start_addr - section->offset_within_address_space); + /* The offset of the kvmslot within the memory region */ + mr_offset = section->offset_within_region + start_addr - + section->offset_within_address_space; - kvm_slots_lock(kml); + /* use aligned delta to align the ram address and offset */ + ram = memory_region_get_ram_ptr(mr) + mr_offset; + ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset; + + kvm_slots_lock(); if (!add) { do { @@ -1151,7 +1375,25 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, goto out; } if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { - kvm_physical_sync_dirty_bitmap(kml, section); + /* + * NOTE: We should be aware of the fact that here we're only + * doing a best effort to sync dirty bits. No matter whether + * we're using dirty log or dirty ring, we ignored two facts: + * + * (1) dirty bits can reside in hardware buffers (PML) + * + * (2) after we collected dirty bits here, pages can be dirtied + * again before we do the final KVM_SET_USER_MEMORY_REGION to + * remove the slot. + * + * Not easy. Let's cross the fingers until it's fixed. + */ + if (kvm_state->kvm_dirty_ring_size) { + kvm_dirty_ring_reap_locked(kvm_state); + } else { + kvm_slot_get_dirty_log(kvm_state, mem); + } + kvm_slot_sync_dirty_pages(mem); } /* unregister the slot */ @@ -1175,18 +1417,13 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, do { slot_size = MIN(kvm_max_slot_size, size); mem = kvm_alloc_slot(kml); + mem->as_id = kml->as_id; mem->memory_size = slot_size; mem->start_addr = start_addr; + mem->ram_start_offset = ram_start_offset; mem->ram = ram; mem->flags = kvm_mem_flags(mr); - - if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { - /* - * Reallocate the bmap; it means it doesn't disappear in - * middle of a migrate. - */ - kvm_memslot_init_dirty_bitmap(mem); - } + kvm_slot_init_dirty_bitmap(mem); err = kvm_set_user_memory_region(kml, mem, true); if (err) { fprintf(stderr, "%s: error registering slot: %s\n", __func__, @@ -1194,12 +1431,58 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, abort(); } start_addr += slot_size; + ram_start_offset += slot_size; ram += slot_size; size -= slot_size; } while (size); out: - kvm_slots_unlock(kml); + kvm_slots_unlock(); +} + +static void *kvm_dirty_ring_reaper_thread(void *data) +{ + KVMState *s = data; + struct KVMDirtyRingReaper *r = &s->reaper; + + rcu_register_thread(); + + trace_kvm_dirty_ring_reaper("init"); + + while (true) { + r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT; + trace_kvm_dirty_ring_reaper("wait"); + /* + * TODO: provide a smarter timeout rather than a constant? + */ + sleep(1); + + trace_kvm_dirty_ring_reaper("wakeup"); + r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING; + + qemu_mutex_lock_iothread(); + kvm_dirty_ring_reap(s); + qemu_mutex_unlock_iothread(); + + r->reaper_iteration++; + } + + trace_kvm_dirty_ring_reaper("exit"); + + rcu_unregister_thread(); + + return NULL; +} + +static int kvm_dirty_ring_reaper_init(KVMState *s) +{ + struct KVMDirtyRingReaper *r = &s->reaper; + + qemu_thread_create(&r->reaper_thr, "kvm-reaper", + kvm_dirty_ring_reaper_thread, + s, QEMU_THREAD_JOINABLE); + + return 0; } static void kvm_region_add(MemoryListener *listener, @@ -1224,14 +1507,40 @@ static void kvm_log_sync(MemoryListener *listener, MemoryRegionSection *section) { KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); - int r; - kvm_slots_lock(kml); - r = kvm_physical_sync_dirty_bitmap(kml, section); - kvm_slots_unlock(kml); - if (r < 0) { - abort(); + kvm_slots_lock(); + kvm_physical_sync_dirty_bitmap(kml, section); + kvm_slots_unlock(); +} + +static void kvm_log_sync_global(MemoryListener *l) +{ + KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener); + KVMState *s = kvm_state; + KVMSlot *mem; + int i; + + /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */ + kvm_dirty_ring_flush(); + + /* + * TODO: make this faster when nr_slots is big while there are + * only a few used slots (small VMs). + */ + kvm_slots_lock(); + for (i = 0; i < s->nr_slots; i++) { + mem = &kml->slots[i]; + if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { + kvm_slot_sync_dirty_pages(mem); + /* + * This is not needed by KVM_GET_DIRTY_LOG because the + * ioctl will unconditionally overwrite the whole region. + * However kvm dirty ring has no such side effect. + */ + kvm_slot_reset_dirty_pages(mem); + } } + kvm_slots_unlock(); } static void kvm_log_clear(MemoryListener *listener, @@ -1328,7 +1637,6 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, { int i; - qemu_mutex_init(&kml->slots_lock); kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); kml->as_id = as_id; @@ -1340,10 +1648,15 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, kml->listener.region_del = kvm_region_del; kml->listener.log_start = kvm_log_start; kml->listener.log_stop = kvm_log_stop; - kml->listener.log_sync = kvm_log_sync; - kml->listener.log_clear = kvm_log_clear; kml->listener.priority = 10; + if (s->kvm_dirty_ring_size) { + kml->listener.log_sync_global = kvm_log_sync_global; + } else { + kml->listener.log_sync = kvm_log_sync; + kml->listener.log_clear = kvm_log_clear; + } + memory_listener_register(&kml->listener, as); for (i = 0; i < s->nr_as; ++i) { @@ -2001,6 +2314,8 @@ static int kvm_init(MachineState *ms) int type = 0; uint64_t dirty_log_manual_caps; + qemu_mutex_init(&kml_slots_lock); + s = KVM_STATE(ms->accelerator); /* @@ -2017,7 +2332,6 @@ static int kvm_init(MachineState *ms) QTAILQ_INIT(&s->kvm_sw_breakpoints); #endif QLIST_INIT(&s->kvm_parked_vcpus); - s->vmfd = -1; s->fd = qemu_open_old("/dev/kvm", O_RDWR); if (s->fd == -1) { fprintf(stderr, "Could not access KVM kernel module: %m\n"); @@ -2125,20 +2439,70 @@ static int kvm_init(MachineState *ms) s->coalesced_pio = s->coalesced_mmio && kvm_check_extension(s, KVM_CAP_COALESCED_PIO); - dirty_log_manual_caps = - kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); - dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | - KVM_DIRTY_LOG_INITIALLY_SET); - s->manual_dirty_log_protect = dirty_log_manual_caps; - if (dirty_log_manual_caps) { - ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, - dirty_log_manual_caps); - if (ret) { - warn_report("Trying to enable capability %"PRIu64" of " - "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " - "Falling back to the legacy mode. ", - dirty_log_manual_caps); - s->manual_dirty_log_protect = 0; + /* + * Enable KVM dirty ring if supported, otherwise fall back to + * dirty logging mode + */ + if (s->kvm_dirty_ring_size > 0) { + uint64_t ring_bytes; + + ring_bytes = s->kvm_dirty_ring_size * sizeof(struct kvm_dirty_gfn); + + /* Read the max supported pages */ + ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING); + if (ret > 0) { + if (ring_bytes > ret) { + error_report("KVM dirty ring size %" PRIu32 " too big " + "(maximum is %ld). Please use a smaller value.", + s->kvm_dirty_ring_size, + (long)ret / sizeof(struct kvm_dirty_gfn)); + ret = -EINVAL; + goto err; + } + + ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, ring_bytes); + if (ret) { + error_report("Enabling of KVM dirty ring failed: %s. " + "Suggested mininum value is 1024.", strerror(-ret)); + goto err; + } + + s->kvm_dirty_ring_bytes = ring_bytes; + } else { + warn_report("KVM dirty ring not available, using bitmap method"); + s->kvm_dirty_ring_size = 0; + } + } + + /* + * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is + * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no + * page is wr-protected initially, which is against how kvm dirty ring is + * usage - kvm dirty ring requires all pages are wr-protected at the very + * beginning. Enabling this feature for dirty ring causes data corruption. + * + * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log, + * we may expect a higher stall time when starting the migration. In the + * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too: + * instead of clearing dirty bit, it can be a way to explicitly wr-protect + * guest pages. + */ + if (!s->kvm_dirty_ring_size) { + dirty_log_manual_caps = + kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + s->manual_dirty_log_protect = dirty_log_manual_caps; + if (dirty_log_manual_caps) { + ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, + dirty_log_manual_caps); + if (ret) { + warn_report("Trying to enable capability %"PRIu64" of " + "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " + "Falling back to the legacy mode. ", + dirty_log_manual_caps); + s->manual_dirty_log_protect = 0; + } } } @@ -2224,6 +2588,14 @@ static int kvm_init(MachineState *ms) ret = ram_block_discard_disable(true); assert(!ret); } + + if (s->kvm_dirty_ring_size) { + ret = kvm_dirty_ring_reaper_init(s); + if (ret) { + goto err; + } + } + return 0; err: @@ -2536,6 +2908,17 @@ int kvm_cpu_exec(CPUState *cpu) case KVM_EXIT_INTERNAL_ERROR: ret = kvm_handle_internal_error(cpu, run); break; + case KVM_EXIT_DIRTY_RING_FULL: + /* + * We shouldn't continue if the dirty ring of this vcpu is + * still full. Got kicked by KVM_RESET_DIRTY_RINGS. + */ + trace_kvm_dirty_ring_full(cpu->cpu_index); + qemu_mutex_lock_iothread(); + kvm_dirty_ring_reap(kvm_state); + qemu_mutex_unlock_iothread(); + ret = 0; + break; case KVM_EXIT_SYSTEM_EVENT: switch (run->system_event.type) { case KVM_SYSTEM_EVENT_SHUTDOWN: @@ -3112,6 +3495,11 @@ static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v, KVMState *s = KVM_STATE(obj); int64_t value; + if (s->fd != -1) { + error_setg(errp, "Cannot set properties after the accelerator has been initialized"); + return; + } + if (!visit_type_int(v, name, &value, errp)) { return; } @@ -3126,6 +3514,11 @@ static void kvm_set_kernel_irqchip(Object *obj, Visitor *v, KVMState *s = KVM_STATE(obj); OnOffSplit mode; + if (s->fd != -1) { + error_setg(errp, "Cannot set properties after the accelerator has been initialized"); + return; + } + if (!visit_type_OnOffSplit(v, name, &mode, errp)) { return; } @@ -3168,13 +3561,53 @@ bool kvm_kernel_irqchip_split(void) return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON; } +static void kvm_get_dirty_ring_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + KVMState *s = KVM_STATE(obj); + uint32_t value = s->kvm_dirty_ring_size; + + visit_type_uint32(v, name, &value, errp); +} + +static void kvm_set_dirty_ring_size(Object *obj, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + KVMState *s = KVM_STATE(obj); + Error *error = NULL; + uint32_t value; + + if (s->fd != -1) { + error_setg(errp, "Cannot set properties after the accelerator has been initialized"); + return; + } + + visit_type_uint32(v, name, &value, &error); + if (error) { + error_propagate(errp, error); + return; + } + if (value & (value - 1)) { + error_setg(errp, "dirty-ring-size must be a power of two."); + return; + } + + s->kvm_dirty_ring_size = value; +} + static void kvm_accel_instance_init(Object *obj) { KVMState *s = KVM_STATE(obj); + s->fd = -1; + s->vmfd = -1; s->kvm_shadow_mem = -1; s->kernel_irqchip_allowed = true; s->kernel_irqchip_split = ON_OFF_AUTO_AUTO; + /* KVM dirty ring is by default off */ + s->kvm_dirty_ring_size = 0; } static void kvm_accel_class_init(ObjectClass *oc, void *data) @@ -3196,6 +3629,12 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data) NULL, NULL); object_class_property_set_description(oc, "kvm-shadow-mem", "KVM shadow MMU size"); + + object_class_property_add(oc, "dirty-ring-size", "uint32", + kvm_get_dirty_ring_size, kvm_set_dirty_ring_size, + NULL, NULL); + object_class_property_set_description(oc, "dirty-ring-size", + "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)"); } static const TypeInfo kvm_accel_type = { diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events index e15ae8980d..72a01320a1 100644 --- a/accel/kvm/trace-events +++ b/accel/kvm/trace-events @@ -18,4 +18,11 @@ kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d" kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32 kvm_resample_fd_notify(int gsi) "gsi %d" +kvm_dirty_ring_full(int id) "vcpu %d" +kvm_dirty_ring_reap_vcpu(int id) "vcpu %d" +kvm_dirty_ring_page(int vcpu, uint32_t slot, uint64_t offset) "vcpu %d fetch %"PRIu32" offset 0x%"PRIx64 +kvm_dirty_ring_reaper(const char *s) "%s" +kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)" +kvm_dirty_ring_reaper_kick(const char *reason) "%s" +kvm_dirty_ring_flush(int finished) "%d" diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 0dc5271715..ad1279d2ed 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -29,8 +29,6 @@ #include "qemu/compiler.h" #include "qemu/timer.h" #include "qemu/rcu.h" -#include "exec/tb-hash.h" -#include "exec/tb-lookup.h" #include "exec/log.h" #include "qemu/main-loop.h" #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY) @@ -40,6 +38,9 @@ #include "exec/cpu-all.h" #include "sysemu/cpu-timers.h" #include "sysemu/replay.h" +#include "tb-hash.h" +#include "tb-lookup.h" +#include "tb-context.h" #include "internal.h" /* -icount align implementation. */ diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c index 84e7d91a5c..f24348e979 100644 --- a/accel/tcg/cputlb.c +++ b/accel/tcg/cputlb.c @@ -24,7 +24,6 @@ #include "exec/memory.h" #include "exec/cpu_ldst.h" #include "exec/cputlb.h" -#include "exec/tb-hash.h" #include "exec/memory-internal.h" #include "exec/ram_addr.h" #include "tcg/tcg.h" @@ -36,6 +35,7 @@ #include "exec/translate-all.h" #include "trace/trace-root.h" #include "trace/mem.h" +#include "tb-hash.h" #include "internal.h" #ifdef CONFIG_PLUGIN #include "qemu/plugin-memory.h" @@ -707,8 +707,9 @@ void tlb_flush_page_all_cpus_synced(CPUState *src, target_ulong addr) tlb_flush_page_by_mmuidx_all_cpus_synced(src, addr, ALL_MMUIDX_BITS); } -static void tlb_flush_page_bits_locked(CPUArchState *env, int midx, - target_ulong page, unsigned bits) +static void tlb_flush_range_locked(CPUArchState *env, int midx, + target_ulong addr, target_ulong len, + unsigned bits) { CPUTLBDesc *d = &env_tlb(env)->d[midx]; CPUTLBDescFast *f = &env_tlb(env)->f[midx]; @@ -718,20 +719,26 @@ static void tlb_flush_page_bits_locked(CPUArchState *env, int midx, * If @bits is smaller than the tlb size, there may be multiple entries * within the TLB; otherwise all addresses that match under @mask hit * the same TLB entry. - * * TODO: Perhaps allow bits to be a few bits less than the size. * For now, just flush the entire TLB. + * + * If @len is larger than the tlb size, then it will take longer to + * test all of the entries in the TLB than it will to flush it all. */ - if (mask < f->mask) { + if (mask < f->mask || len > f->mask) { tlb_debug("forcing full flush midx %d (" - TARGET_FMT_lx "/" TARGET_FMT_lx ")\n", - midx, page, mask); + TARGET_FMT_lx "/" TARGET_FMT_lx "+" TARGET_FMT_lx ")\n", + midx, addr, mask, len); tlb_flush_one_mmuidx_locked(env, midx, get_clock_realtime()); return; } - /* Check if we need to flush due to large pages. */ - if ((page & d->large_page_mask) == d->large_page_addr) { + /* + * Check if we need to flush due to large pages. + * Because large_page_mask contains all 1's from the msb, + * we only need to test the end of the range. + */ + if (((addr + len - 1) & d->large_page_mask) == d->large_page_addr) { tlb_debug("forcing full flush midx %d (" TARGET_FMT_lx "/" TARGET_FMT_lx ")\n", midx, d->large_page_addr, d->large_page_mask); @@ -739,85 +746,67 @@ static void tlb_flush_page_bits_locked(CPUArchState *env, int midx, return; } - if (tlb_flush_entry_mask_locked(tlb_entry(env, midx, page), page, mask)) { - tlb_n_used_entries_dec(env, midx); + for (target_ulong i = 0; i < len; i += TARGET_PAGE_SIZE) { + target_ulong page = addr + i; + CPUTLBEntry *entry = tlb_entry(env, midx, page); + + if (tlb_flush_entry_mask_locked(entry, page, mask)) { + tlb_n_used_entries_dec(env, midx); + } + tlb_flush_vtlb_page_mask_locked(env, midx, page, mask); } - tlb_flush_vtlb_page_mask_locked(env, midx, page, mask); } typedef struct { target_ulong addr; + target_ulong len; uint16_t idxmap; uint16_t bits; -} TLBFlushPageBitsByMMUIdxData; +} TLBFlushRangeData; -static void -tlb_flush_page_bits_by_mmuidx_async_0(CPUState *cpu, - TLBFlushPageBitsByMMUIdxData d) +static void tlb_flush_range_by_mmuidx_async_0(CPUState *cpu, + TLBFlushRangeData d) { CPUArchState *env = cpu->env_ptr; int mmu_idx; assert_cpu_is_self(cpu); - tlb_debug("page addr:" TARGET_FMT_lx "/%u mmu_map:0x%x\n", - d.addr, d.bits, d.idxmap); + tlb_debug("range:" TARGET_FMT_lx "/%u+" TARGET_FMT_lx " mmu_map:0x%x\n", + d.addr, d.bits, d.len, d.idxmap); qemu_spin_lock(&env_tlb(env)->c.lock); for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { if ((d.idxmap >> mmu_idx) & 1) { - tlb_flush_page_bits_locked(env, mmu_idx, d.addr, d.bits); + tlb_flush_range_locked(env, mmu_idx, d.addr, d.len, d.bits); } } qemu_spin_unlock(&env_tlb(env)->c.lock); - tb_flush_jmp_cache(cpu, d.addr); -} - -static bool encode_pbm_to_runon(run_on_cpu_data *out, - TLBFlushPageBitsByMMUIdxData d) -{ - /* We need 6 bits to hold to hold @bits up to 63. */ - if (d.idxmap <= MAKE_64BIT_MASK(0, TARGET_PAGE_BITS - 6)) { - *out = RUN_ON_CPU_TARGET_PTR(d.addr | (d.idxmap << 6) | d.bits); - return true; + for (target_ulong i = 0; i < d.len; i += TARGET_PAGE_SIZE) { + tb_flush_jmp_cache(cpu, d.addr + i); } - return false; -} - -static TLBFlushPageBitsByMMUIdxData -decode_runon_to_pbm(run_on_cpu_data data) -{ - target_ulong addr_map_bits = (target_ulong) data.target_ptr; - return (TLBFlushPageBitsByMMUIdxData){ - .addr = addr_map_bits & TARGET_PAGE_MASK, - .idxmap = (addr_map_bits & ~TARGET_PAGE_MASK) >> 6, - .bits = addr_map_bits & 0x3f - }; } -static void tlb_flush_page_bits_by_mmuidx_async_1(CPUState *cpu, - run_on_cpu_data runon) +static void tlb_flush_range_by_mmuidx_async_1(CPUState *cpu, + run_on_cpu_data data) { - tlb_flush_page_bits_by_mmuidx_async_0(cpu, decode_runon_to_pbm(runon)); -} - -static void tlb_flush_page_bits_by_mmuidx_async_2(CPUState *cpu, - run_on_cpu_data data) -{ - TLBFlushPageBitsByMMUIdxData *d = data.host_ptr; - tlb_flush_page_bits_by_mmuidx_async_0(cpu, *d); + TLBFlushRangeData *d = data.host_ptr; + tlb_flush_range_by_mmuidx_async_0(cpu, *d); g_free(d); } -void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, target_ulong addr, - uint16_t idxmap, unsigned bits) +void tlb_flush_range_by_mmuidx(CPUState *cpu, target_ulong addr, + target_ulong len, uint16_t idxmap, + unsigned bits) { - TLBFlushPageBitsByMMUIdxData d; - run_on_cpu_data runon; + TLBFlushRangeData d; - /* If all bits are significant, this devolves to tlb_flush_page. */ - if (bits >= TARGET_LONG_BITS) { + /* + * If all bits are significant, and len is small, + * this devolves to tlb_flush_page. + */ + if (bits >= TARGET_LONG_BITS && len <= TARGET_PAGE_SIZE) { tlb_flush_page_by_mmuidx(cpu, addr, idxmap); return; } @@ -829,34 +818,38 @@ void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, target_ulong addr, /* This should already be page aligned */ d.addr = addr & TARGET_PAGE_MASK; + d.len = len; d.idxmap = idxmap; d.bits = bits; if (qemu_cpu_is_self(cpu)) { - tlb_flush_page_bits_by_mmuidx_async_0(cpu, d); - } else if (encode_pbm_to_runon(&runon, d)) { - async_run_on_cpu(cpu, tlb_flush_page_bits_by_mmuidx_async_1, runon); + tlb_flush_range_by_mmuidx_async_0(cpu, d); } else { - TLBFlushPageBitsByMMUIdxData *p - = g_new(TLBFlushPageBitsByMMUIdxData, 1); - /* Otherwise allocate a structure, freed by the worker. */ - *p = d; - async_run_on_cpu(cpu, tlb_flush_page_bits_by_mmuidx_async_2, + TLBFlushRangeData *p = g_memdup(&d, sizeof(d)); + async_run_on_cpu(cpu, tlb_flush_range_by_mmuidx_async_1, RUN_ON_CPU_HOST_PTR(p)); } } -void tlb_flush_page_bits_by_mmuidx_all_cpus(CPUState *src_cpu, - target_ulong addr, - uint16_t idxmap, - unsigned bits) +void tlb_flush_page_bits_by_mmuidx(CPUState *cpu, target_ulong addr, + uint16_t idxmap, unsigned bits) +{ + tlb_flush_range_by_mmuidx(cpu, addr, TARGET_PAGE_SIZE, idxmap, bits); +} + +void tlb_flush_range_by_mmuidx_all_cpus(CPUState *src_cpu, + target_ulong addr, target_ulong len, + uint16_t idxmap, unsigned bits) { - TLBFlushPageBitsByMMUIdxData d; - run_on_cpu_data runon; + TLBFlushRangeData d; + CPUState *dst_cpu; - /* If all bits are significant, this devolves to tlb_flush_page. */ - if (bits >= TARGET_LONG_BITS) { + /* + * If all bits are significant, and len is small, + * this devolves to tlb_flush_page. + */ + if (bits >= TARGET_LONG_BITS && len <= TARGET_PAGE_SIZE) { tlb_flush_page_by_mmuidx_all_cpus(src_cpu, addr, idxmap); return; } @@ -868,40 +861,45 @@ void tlb_flush_page_bits_by_mmuidx_all_cpus(CPUState *src_cpu, /* This should already be page aligned */ d.addr = addr & TARGET_PAGE_MASK; + d.len = len; d.idxmap = idxmap; d.bits = bits; - if (encode_pbm_to_runon(&runon, d)) { - flush_all_helper(src_cpu, tlb_flush_page_bits_by_mmuidx_async_1, runon); - } else { - CPUState *dst_cpu; - TLBFlushPageBitsByMMUIdxData *p; - - /* Allocate a separate data block for each destination cpu. */ - CPU_FOREACH(dst_cpu) { - if (dst_cpu != src_cpu) { - p = g_new(TLBFlushPageBitsByMMUIdxData, 1); - *p = d; - async_run_on_cpu(dst_cpu, - tlb_flush_page_bits_by_mmuidx_async_2, - RUN_ON_CPU_HOST_PTR(p)); - } + /* Allocate a separate data block for each destination cpu. */ + CPU_FOREACH(dst_cpu) { + if (dst_cpu != src_cpu) { + TLBFlushRangeData *p = g_memdup(&d, sizeof(d)); + async_run_on_cpu(dst_cpu, + tlb_flush_range_by_mmuidx_async_1, + RUN_ON_CPU_HOST_PTR(p)); } } - tlb_flush_page_bits_by_mmuidx_async_0(src_cpu, d); + tlb_flush_range_by_mmuidx_async_0(src_cpu, d); } -void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu, - target_ulong addr, - uint16_t idxmap, - unsigned bits) +void tlb_flush_page_bits_by_mmuidx_all_cpus(CPUState *src_cpu, + target_ulong addr, + uint16_t idxmap, unsigned bits) { - TLBFlushPageBitsByMMUIdxData d; - run_on_cpu_data runon; + tlb_flush_range_by_mmuidx_all_cpus(src_cpu, addr, TARGET_PAGE_SIZE, + idxmap, bits); +} + +void tlb_flush_range_by_mmuidx_all_cpus_synced(CPUState *src_cpu, + target_ulong addr, + target_ulong len, + uint16_t idxmap, + unsigned bits) +{ + TLBFlushRangeData d, *p; + CPUState *dst_cpu; - /* If all bits are significant, this devolves to tlb_flush_page. */ - if (bits >= TARGET_LONG_BITS) { + /* + * If all bits are significant, and len is small, + * this devolves to tlb_flush_page. + */ + if (bits >= TARGET_LONG_BITS && len <= TARGET_PAGE_SIZE) { tlb_flush_page_by_mmuidx_all_cpus_synced(src_cpu, addr, idxmap); return; } @@ -913,32 +911,31 @@ void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu, /* This should already be page aligned */ d.addr = addr & TARGET_PAGE_MASK; + d.len = len; d.idxmap = idxmap; d.bits = bits; - if (encode_pbm_to_runon(&runon, d)) { - flush_all_helper(src_cpu, tlb_flush_page_bits_by_mmuidx_async_1, runon); - async_safe_run_on_cpu(src_cpu, tlb_flush_page_bits_by_mmuidx_async_1, - runon); - } else { - CPUState *dst_cpu; - TLBFlushPageBitsByMMUIdxData *p; - - /* Allocate a separate data block for each destination cpu. */ - CPU_FOREACH(dst_cpu) { - if (dst_cpu != src_cpu) { - p = g_new(TLBFlushPageBitsByMMUIdxData, 1); - *p = d; - async_run_on_cpu(dst_cpu, tlb_flush_page_bits_by_mmuidx_async_2, - RUN_ON_CPU_HOST_PTR(p)); - } + /* Allocate a separate data block for each destination cpu. */ + CPU_FOREACH(dst_cpu) { + if (dst_cpu != src_cpu) { + p = g_memdup(&d, sizeof(d)); + async_run_on_cpu(dst_cpu, tlb_flush_range_by_mmuidx_async_1, + RUN_ON_CPU_HOST_PTR(p)); } - - p = g_new(TLBFlushPageBitsByMMUIdxData, 1); - *p = d; - async_safe_run_on_cpu(src_cpu, tlb_flush_page_bits_by_mmuidx_async_2, - RUN_ON_CPU_HOST_PTR(p)); } + + p = g_memdup(&d, sizeof(d)); + async_safe_run_on_cpu(src_cpu, tlb_flush_range_by_mmuidx_async_1, + RUN_ON_CPU_HOST_PTR(p)); +} + +void tlb_flush_page_bits_by_mmuidx_all_cpus_synced(CPUState *src_cpu, + target_ulong addr, + uint16_t idxmap, + unsigned bits) +{ + tlb_flush_range_by_mmuidx_all_cpus_synced(src_cpu, addr, TARGET_PAGE_SIZE, + idxmap, bits); } /* update the TLBs so that writes to code in the virtual page 'addr' diff --git a/accel/tcg/tb-context.h b/accel/tcg/tb-context.h new file mode 100644 index 0000000000..cc33979113 --- /dev/null +++ b/accel/tcg/tb-context.h @@ -0,0 +1,41 @@ +/* + * Internal structs that QEMU exports to TCG + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef QEMU_TB_CONTEXT_H +#define QEMU_TB_CONTEXT_H + +#include "qemu/thread.h" +#include "qemu/qht.h" + +#define CODE_GEN_HTABLE_BITS 15 +#define CODE_GEN_HTABLE_SIZE (1 << CODE_GEN_HTABLE_BITS) + +typedef struct TBContext TBContext; + +struct TBContext { + + struct qht htable; + + /* statistics */ + unsigned tb_flush_count; +}; + +extern TBContext tb_ctx; + +#endif diff --git a/accel/tcg/tb-hash.h b/accel/tcg/tb-hash.h new file mode 100644 index 0000000000..0a273d9605 --- /dev/null +++ b/accel/tcg/tb-hash.h @@ -0,0 +1,69 @@ +/* + * internal execution defines for qemu + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef EXEC_TB_HASH_H +#define EXEC_TB_HASH_H + +#include "exec/cpu-defs.h" +#include "exec/exec-all.h" +#include "qemu/xxhash.h" + +#ifdef CONFIG_SOFTMMU + +/* Only the bottom TB_JMP_PAGE_BITS of the jump cache hash bits vary for + addresses on the same page. The top bits are the same. This allows + TLB invalidation to quickly clear a subset of the hash table. */ +#define TB_JMP_PAGE_BITS (TB_JMP_CACHE_BITS / 2) +#define TB_JMP_PAGE_SIZE (1 << TB_JMP_PAGE_BITS) +#define TB_JMP_ADDR_MASK (TB_JMP_PAGE_SIZE - 1) +#define TB_JMP_PAGE_MASK (TB_JMP_CACHE_SIZE - TB_JMP_PAGE_SIZE) + +static inline unsigned int tb_jmp_cache_hash_page(target_ulong pc) +{ + target_ulong tmp; + tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); + return (tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK; +} + +static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) +{ + target_ulong tmp; + tmp = pc ^ (pc >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)); + return (((tmp >> (TARGET_PAGE_BITS - TB_JMP_PAGE_BITS)) & TB_JMP_PAGE_MASK) + | (tmp & TB_JMP_ADDR_MASK)); +} + +#else + +/* In user-mode we can get better hashing because we do not have a TLB */ +static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc) +{ + return (pc ^ (pc >> TB_JMP_CACHE_BITS)) & (TB_JMP_CACHE_SIZE - 1); +} + +#endif /* CONFIG_SOFTMMU */ + +static inline +uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags, + uint32_t cf_mask, uint32_t trace_vcpu_dstate) +{ + return qemu_xxhash7(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate); +} + +#endif diff --git a/accel/tcg/tb-lookup.h b/accel/tcg/tb-lookup.h new file mode 100644 index 0000000000..9c9e0079da --- /dev/null +++ b/accel/tcg/tb-lookup.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2017, Emilio G. Cota <cota@braap.org> + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#ifndef EXEC_TB_LOOKUP_H +#define EXEC_TB_LOOKUP_H + +#ifdef NEED_CPU_H +#include "cpu.h" +#else +#include "exec/poison.h" +#endif + +#include "exec/exec-all.h" +#include "tb-hash.h" + +/* Might cause an exception, so have a longjmp destination ready */ +static inline TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc, + target_ulong cs_base, + uint32_t flags, uint32_t cflags) +{ + TranslationBlock *tb; + uint32_t hash; + + /* we should never be trying to look up an INVALID tb */ + tcg_debug_assert(!(cflags & CF_INVALID)); + + hash = tb_jmp_cache_hash_func(pc); + tb = qatomic_rcu_read(&cpu->tb_jmp_cache[hash]); + + if (likely(tb && + tb->pc == pc && + tb->cs_base == cs_base && + tb->flags == flags && + tb->trace_vcpu_dstate == *cpu->trace_dstate && + tb_cflags(tb) == cflags)) { + return tb; + } + tb = tb_htable_lookup(cpu, pc, cs_base, flags, cflags); + if (tb == NULL) { + return NULL; + } + qatomic_set(&cpu->tb_jmp_cache[hash], tb); + return tb; +} + +#endif /* EXEC_TB_LOOKUP_H */ diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c index 521da4a813..ac7d28c251 100644 --- a/accel/tcg/tcg-runtime-gvec.c +++ b/accel/tcg/tcg-runtime-gvec.c @@ -1073,9 +1073,8 @@ void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc) for (i = 0; i < oprsz; i += sizeof(int32_t)) { int32_t ai = *(int32_t *)(a + i); int32_t bi = *(int32_t *)(b + i); - int32_t di = ai + bi; - if (((di ^ ai) &~ (ai ^ bi)) < 0) { - /* Signed overflow. */ + int32_t di; + if (sadd32_overflow(ai, bi, &di)) { di = (di < 0 ? INT32_MAX : INT32_MIN); } *(int32_t *)(d + i) = di; @@ -1091,9 +1090,8 @@ void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc) for (i = 0; i < oprsz; i += sizeof(int64_t)) { int64_t ai = *(int64_t *)(a + i); int64_t bi = *(int64_t *)(b + i); - int64_t di = ai + bi; - if (((di ^ ai) &~ (ai ^ bi)) < 0) { - /* Signed overflow. */ + int64_t di; + if (sadd64_overflow(ai, bi, &di)) { di = (di < 0 ? INT64_MAX : INT64_MIN); } *(int64_t *)(d + i) = di; @@ -1143,9 +1141,8 @@ void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc) for (i = 0; i < oprsz; i += sizeof(int32_t)) { int32_t ai = *(int32_t *)(a + i); int32_t bi = *(int32_t *)(b + i); - int32_t di = ai - bi; - if (((di ^ ai) & (ai ^ bi)) < 0) { - /* Signed overflow. */ + int32_t di; + if (ssub32_overflow(ai, bi, &di)) { di = (di < 0 ? INT32_MAX : INT32_MIN); } *(int32_t *)(d + i) = di; @@ -1161,9 +1158,8 @@ void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc) for (i = 0; i < oprsz; i += sizeof(int64_t)) { int64_t ai = *(int64_t *)(a + i); int64_t bi = *(int64_t *)(b + i); - int64_t di = ai - bi; - if (((di ^ ai) & (ai ^ bi)) < 0) { - /* Signed overflow. */ + int64_t di; + if (ssub64_overflow(ai, bi, &di)) { di = (di < 0 ? INT64_MAX : INT64_MIN); } *(int64_t *)(d + i) = di; @@ -1209,8 +1205,8 @@ void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc) for (i = 0; i < oprsz; i += sizeof(uint32_t)) { uint32_t ai = *(uint32_t *)(a + i); uint32_t bi = *(uint32_t *)(b + i); - uint32_t di = ai + bi; - if (di < ai) { + uint32_t di; + if (uadd32_overflow(ai, bi, &di)) { di = UINT32_MAX; } *(uint32_t *)(d + i) = di; @@ -1226,8 +1222,8 @@ void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc) for (i = 0; i < oprsz; i += sizeof(uint64_t)) { uint64_t ai = *(uint64_t *)(a + i); uint64_t bi = *(uint64_t *)(b + i); - uint64_t di = ai + bi; - if (di < ai) { + uint64_t di; + if (uadd64_overflow(ai, bi, &di)) { di = UINT64_MAX; } *(uint64_t *)(d + i) = di; @@ -1273,8 +1269,8 @@ void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc) for (i = 0; i < oprsz; i += sizeof(uint32_t)) { uint32_t ai = *(uint32_t *)(a + i); uint32_t bi = *(uint32_t *)(b + i); - uint32_t di = ai - bi; - if (ai < bi) { + uint32_t di; + if (usub32_overflow(ai, bi, &di)) { di = 0; } *(uint32_t *)(d + i) = di; @@ -1290,8 +1286,8 @@ void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc) for (i = 0; i < oprsz; i += sizeof(uint64_t)) { uint64_t ai = *(uint64_t *)(a + i); uint64_t bi = *(uint64_t *)(b + i); - uint64_t di = ai - bi; - if (ai < bi) { + uint64_t di; + if (usub64_overflow(ai, bi, &di)) { di = 0; } *(uint64_t *)(d + i) = di; diff --git a/accel/tcg/tcg-runtime.c b/accel/tcg/tcg-runtime.c index 49f5de37e8..66ac830e2f 100644 --- a/accel/tcg/tcg-runtime.c +++ b/accel/tcg/tcg-runtime.c @@ -30,7 +30,7 @@ #include "disas/disas.h" #include "exec/log.h" #include "tcg/tcg.h" -#include "exec/tb-lookup.h" +#include "tb-lookup.h" /* 32-bit helpers */ diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index ae7e873713..1eefe6ea8d 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -47,7 +47,6 @@ #endif #include "exec/cputlb.h" -#include "exec/tb-hash.h" #include "exec/translate-all.h" #include "qemu/bitmap.h" #include "qemu/error-report.h" @@ -60,6 +59,8 @@ #include "sysemu/tcg.h" #include "qapi/error.h" #include "hw/core/tcg-cpu-ops.h" +#include "tb-hash.h" +#include "tb-context.h" #include "internal.h" /* #define DEBUG_TB_INVALIDATE */ @@ -1912,6 +1913,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, tcg_ctx->cpu = env_cpu(env); gen_intermediate_code(cpu, tb, max_insns); + assert(tb->size != 0); tcg_ctx->cpu = NULL; max_insns = tb->icount; @@ -2042,8 +2044,15 @@ TranslationBlock *tb_gen_code(CPUState *cpu, int i; qemu_log(" data: [size=%d]\n", data_size); for (i = 0; i < data_size / sizeof(tcg_target_ulong); i++) { - qemu_log("0x%08" PRIxPTR ": .quad 0x%" TCG_PRIlx "\n", - (uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]); + if (sizeof(tcg_target_ulong) == 8) { + qemu_log("0x%08" PRIxPTR ": .quad 0x%016" TCG_PRIlx "\n", + (uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]); + } else if (sizeof(tcg_target_ulong) == 4) { + qemu_log("0x%08" PRIxPTR ": .long 0x%08" TCG_PRIlx "\n", + (uintptr_t)&rx_data_gen_ptr[i], rx_data_gen_ptr[i]); + } else { + qemu_build_not_reached(); + } } } qemu_log("\n"); |
