From 2ac01d6dafabd4a726254eea98824c798d416ee4 Mon Sep 17 00:00:00 2001 From: Emilio G. Cota Date: Fri, 23 Jun 2017 19:00:11 -0400 Subject: translate-all: use a binary search tree to track TBs in TBContext This is a prerequisite for supporting multiple TCG contexts, since we will have threads generating code in separate regions of code_gen_buffer. For this we need a new field (.size) in struct tb_tc to keep track of the size of the translated code. This field uses a size_t to avoid adding a hole to the struct, although really an unsigned int would have been enough. The comparison function we use is optimized for the common case: insertions. Profiling shows that upon booting debian-arm, 98% of comparisons are between existing tb's (i.e. a->size and b->size are both !0), which happens during insertions (and removals, but those are rare). The remaining cases are lookups. From reading the glib sources we see that the first key is always the lookup key. However, the code does not assume this to always be the case because this behaviour is not guaranteed in the glib docs. However, we embed this knowledge in the code as a branch hint for the compiler. Note that tb_free does not free space in the code_gen_buffer anymore, since we cannot easily know whether the tb is the last one inserted in code_gen_buffer. The next patch in this series renames tb_free to tb_remove to reflect this. Performance-wise, lookups in tb_find_pc are the same as before: O(log n). However, insertions are O(log n) instead of O(1), which results in a small slowdown when booting debian-arm: Performance counter stats for 'build/arm-softmmu/qemu-system-arm \ -machine type=virt -nographic -smp 1 -m 4096 \ -netdev user,id=unet,hostfwd=tcp::2222-:22 \ -device virtio-net-device,netdev=unet \ -drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \ -device virtio-blk-device,drive=myblock \ -kernel img/arm/aarch32-current-linux-kernel-only.img \ -append console=ttyAMA0 root=/dev/vda1 \ -name arm,debug-threads=on -smp 1' (10 runs): - Before: 8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% ) 16,974 context-switches # 0.002 M/sec ( +- 0.12% ) 0 cpu-migrations # 0.000 K/sec 10,125 page-faults # 0.001 M/sec ( +- 1.23% ) 35,144,901,879 cycles # 4.367 GHz ( +- 0.14% ) stalled-cycles-frontend stalled-cycles-backend 65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% ) 10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% ) 192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% ) 8.640869419 seconds time elapsed ( +- 0.57% ) - After: 8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% ) 17,016 context-switches # 0.002 M/sec ( +- 0.40% ) 0 cpu-migrations # 0.000 K/sec 18,769 page-faults # 0.002 M/sec ( +- 0.45% ) 35,660,956,120 cycles # 4.378 GHz ( +- 1.22% ) stalled-cycles-frontend stalled-cycles-backend 65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% ) 10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% ) 195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% ) 8.828660235 seconds time elapsed ( +- 0.38% ) Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- include/exec/tb-context.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/exec/tb-context.h') diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index 25c2afe753..1fa8dcc737 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -31,10 +31,8 @@ typedef struct TBContext TBContext; struct TBContext { - TranslationBlock **tbs; + GTree *tb_tree; struct qht htable; - size_t tbs_size; - int nb_tbs; /* any access to the tbs or the page table must use this lock */ QemuMutex tb_lock; -- cgit v1.2.3-55-g7522 From 44ded3d04821bec57407cc26a8b4db620da2be04 Mon Sep 17 00:00:00 2001 From: Emilio G. Cota Date: Fri, 23 Jun 2017 20:04:43 -0400 Subject: tcg: take tb_ctx out of TCGContext Groundwork for supporting multiple TCG contexts. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Richard Henderson --- accel/tcg/cpu-exec.c | 2 +- accel/tcg/translate-all.c | 57 +++++++++++++++++++++++------------------------ include/exec/tb-context.h | 2 ++ linux-user/main.c | 6 ++--- tcg/tcg.h | 2 -- 5 files changed, 34 insertions(+), 35 deletions(-) (limited to 'include/exec/tb-context.h') diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c index 9b58cdee28..4318441e4c 100644 --- a/accel/tcg/cpu-exec.c +++ b/accel/tcg/cpu-exec.c @@ -327,7 +327,7 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc, phys_pc = get_page_addr_code(desc.env, pc); desc.phys_page1 = phys_pc & TARGET_PAGE_MASK; h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate); - return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h); + return qht_lookup(&tb_ctx.htable, tb_cmp, &desc, h); } void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr) diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c index 7e2c0cdb98..b238b724a8 100644 --- a/accel/tcg/translate-all.c +++ b/accel/tcg/translate-all.c @@ -154,6 +154,7 @@ static void *l1_map[V_L1_MAX_SIZE]; /* code generation context */ TCGContext tcg_ctx; +TBContext tb_ctx; bool parallel_cpus; /* translation block context */ @@ -185,7 +186,7 @@ static void page_table_config_init(void) void tb_lock(void) { assert_tb_unlocked(); - qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); + qemu_mutex_lock(&tb_ctx.tb_lock); have_tb_lock++; } @@ -193,13 +194,13 @@ void tb_unlock(void) { assert_tb_locked(); have_tb_lock--; - qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); + qemu_mutex_unlock(&tb_ctx.tb_lock); } void tb_lock_reset(void) { if (have_tb_lock) { - qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); + qemu_mutex_unlock(&tb_ctx.tb_lock); have_tb_lock = 0; } } @@ -824,15 +825,15 @@ static inline void code_gen_alloc(size_t tb_size) fprintf(stderr, "Could not allocate dynamic translator buffer\n"); exit(1); } - tcg_ctx.tb_ctx.tb_tree = g_tree_new(tb_tc_cmp); - qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); + tb_ctx.tb_tree = g_tree_new(tb_tc_cmp); + qemu_mutex_init(&tb_ctx.tb_lock); } static void tb_htable_init(void) { unsigned int mode = QHT_MODE_AUTO_RESIZE; - qht_init(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE, mode); + qht_init(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE, mode); } /* Must be called before using the QEMU cpus. 'tb_size' is the size @@ -876,7 +877,7 @@ void tb_remove(TranslationBlock *tb) { assert_tb_locked(); - g_tree_remove(tcg_ctx.tb_ctx.tb_tree, &tb->tc); + g_tree_remove(tb_ctx.tb_tree, &tb->tc); } static inline void invalidate_page_bitmap(PageDesc *p) @@ -938,15 +939,15 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) /* If it is already been done on request of another CPU, * just retry. */ - if (tcg_ctx.tb_ctx.tb_flush_count != tb_flush_count.host_int) { + if (tb_ctx.tb_flush_count != tb_flush_count.host_int) { goto done; } if (DEBUG_TB_FLUSH_GATE) { - size_t nb_tbs = g_tree_nnodes(tcg_ctx.tb_ctx.tb_tree); + size_t nb_tbs = g_tree_nnodes(tb_ctx.tb_tree); size_t host_size = 0; - g_tree_foreach(tcg_ctx.tb_ctx.tb_tree, tb_host_size_iter, &host_size); + g_tree_foreach(tb_ctx.tb_tree, tb_host_size_iter, &host_size); printf("qemu: flush code_size=%td nb_tbs=%zu avg_tb_size=%zu\n", tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer, nb_tbs, nb_tbs > 0 ? host_size / nb_tbs : 0); @@ -961,17 +962,16 @@ static void do_tb_flush(CPUState *cpu, run_on_cpu_data tb_flush_count) } /* Increment the refcount first so that destroy acts as a reset */ - g_tree_ref(tcg_ctx.tb_ctx.tb_tree); - g_tree_destroy(tcg_ctx.tb_ctx.tb_tree); + g_tree_ref(tb_ctx.tb_tree); + g_tree_destroy(tb_ctx.tb_tree); - qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE); + qht_reset_size(&tb_ctx.htable, CODE_GEN_HTABLE_SIZE); page_flush_tb(); tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; /* XXX: flush processor icache at this point if cache flush is expensive */ - atomic_mb_set(&tcg_ctx.tb_ctx.tb_flush_count, - tcg_ctx.tb_ctx.tb_flush_count + 1); + atomic_mb_set(&tb_ctx.tb_flush_count, tb_ctx.tb_flush_count + 1); done: tb_unlock(); @@ -980,7 +980,7 @@ done: void tb_flush(CPUState *cpu) { if (tcg_enabled()) { - unsigned tb_flush_count = atomic_mb_read(&tcg_ctx.tb_ctx.tb_flush_count); + unsigned tb_flush_count = atomic_mb_read(&tb_ctx.tb_flush_count); async_safe_run_on_cpu(cpu, do_tb_flush, RUN_ON_CPU_HOST_INT(tb_flush_count)); } @@ -1013,7 +1013,7 @@ do_tb_invalidate_check(struct qht *ht, void *p, uint32_t hash, void *userp) static void tb_invalidate_check(target_ulong address) { address &= TARGET_PAGE_MASK; - qht_iter(&tcg_ctx.tb_ctx.htable, do_tb_invalidate_check, &address); + qht_iter(&tb_ctx.htable, do_tb_invalidate_check, &address); } static void @@ -1033,7 +1033,7 @@ do_tb_page_check(struct qht *ht, void *p, uint32_t hash, void *userp) /* verify that all the pages have correct rights for code */ static void tb_page_check(void) { - qht_iter(&tcg_ctx.tb_ctx.htable, do_tb_page_check, NULL); + qht_iter(&tb_ctx.htable, do_tb_page_check, NULL); } #endif /* CONFIG_USER_ONLY */ @@ -1133,7 +1133,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK); h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK, tb->trace_vcpu_dstate); - qht_remove(&tcg_ctx.tb_ctx.htable, tb, h); + qht_remove(&tb_ctx.htable, tb, h); /* remove the TB from the page list */ if (tb->page_addr[0] != page_addr) { @@ -1162,7 +1162,7 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr) /* suppress any remaining jumps to this TB */ tb_jmp_unlink(tb); - tcg_ctx.tb_ctx.tb_phys_invalidate_count++; + tb_ctx.tb_phys_invalidate_count++; } #ifdef CONFIG_SOFTMMU @@ -1278,7 +1278,7 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc, /* add in the hash table */ h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK, tb->trace_vcpu_dstate); - qht_insert(&tcg_ctx.tb_ctx.htable, tb, h); + qht_insert(&tb_ctx.htable, tb, h); #ifdef CONFIG_USER_ONLY if (DEBUG_TB_CHECK_GATE) { @@ -1441,7 +1441,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu, * through the physical hash table and physical page list. */ tb_link_page(tb, phys_pc, phys_page2); - g_tree_insert(tcg_ctx.tb_ctx.tb_tree, &tb->tc, tb); + g_tree_insert(tb_ctx.tb_tree, &tb->tc, tb); return tb; } @@ -1713,7 +1713,7 @@ static TranslationBlock *tb_find_pc(uintptr_t tc_ptr) { struct tb_tc s = { .ptr = (void *)tc_ptr }; - return g_tree_lookup(tcg_ctx.tb_ctx.tb_tree, &s); + return g_tree_lookup(tb_ctx.tb_tree, &s); } #if !defined(CONFIG_USER_ONLY) @@ -1930,8 +1930,8 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) tb_lock(); - nb_tbs = g_tree_nnodes(tcg_ctx.tb_ctx.tb_tree); - g_tree_foreach(tcg_ctx.tb_ctx.tb_tree, tb_tree_stats_iter, &tst); + nb_tbs = g_tree_nnodes(tb_ctx.tb_tree); + g_tree_foreach(tb_ctx.tb_tree, tb_tree_stats_iter, &tst); /* XXX: avoid using doubles ? */ cpu_fprintf(f, "Translation buffer state:\n"); /* @@ -1957,15 +1957,14 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) tst.direct_jmp2_count, nb_tbs ? (tst.direct_jmp2_count * 100) / nb_tbs : 0); - qht_statistics_init(&tcg_ctx.tb_ctx.htable, &hst); + qht_statistics_init(&tb_ctx.htable, &hst); print_qht_statistics(f, cpu_fprintf, hst); qht_statistics_destroy(&hst); cpu_fprintf(f, "\nStatistics:\n"); cpu_fprintf(f, "TB flush count %u\n", - atomic_read(&tcg_ctx.tb_ctx.tb_flush_count)); - cpu_fprintf(f, "TB invalidate count %d\n", - tcg_ctx.tb_ctx.tb_phys_invalidate_count); + atomic_read(&tb_ctx.tb_flush_count)); + cpu_fprintf(f, "TB invalidate count %d\n", tb_ctx.tb_phys_invalidate_count); cpu_fprintf(f, "TLB flush count %zu\n", tlb_flush_count()); tcg_dump_info(f, cpu_fprintf); diff --git a/include/exec/tb-context.h b/include/exec/tb-context.h index 1fa8dcc737..1d41202485 100644 --- a/include/exec/tb-context.h +++ b/include/exec/tb-context.h @@ -41,4 +41,6 @@ struct TBContext { int tb_phys_invalidate_count; }; +extern TBContext tb_ctx; + #endif diff --git a/linux-user/main.c b/linux-user/main.c index dde04c769a..0a2a0d75b3 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -129,7 +129,7 @@ int cpu_get_pic_interrupt(CPUX86State *env) void fork_start(void) { cpu_list_lock(); - qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock); + qemu_mutex_lock(&tb_ctx.tb_lock); mmap_fork_start(); } @@ -145,11 +145,11 @@ void fork_end(int child) QTAILQ_REMOVE(&cpus, cpu, node); } } - qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock); + qemu_mutex_init(&tb_ctx.tb_lock); qemu_init_cpu_list(); gdbserver_fork(thread_cpu); } else { - qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock); + qemu_mutex_unlock(&tb_ctx.tb_lock); cpu_list_unlock(); } } diff --git a/tcg/tcg.h b/tcg/tcg.h index 7c39eac428..76324c9ad6 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -663,8 +663,6 @@ struct TCGContext { /* Threshold to flush the translated code buffer. */ void *code_gen_highwater; - TBContext tb_ctx; - /* Track which vCPU triggers events */ CPUState *cpu; /* *_trans */ TCGv_env tcg_env; /* *_exec */ -- cgit v1.2.3-55-g7522