diff options
Diffstat (limited to 'tcg/i386/tcg-target.c')
-rw-r--r-- | tcg/i386/tcg-target.c | 285 |
1 files changed, 222 insertions, 63 deletions
diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index 495b901080..f832282d1a 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -88,6 +88,11 @@ static const int tcg_target_call_oarg_regs[] = { #endif }; +/* Constants we accept. */ +#define TCG_CT_CONST_S32 0x100 +#define TCG_CT_CONST_U32 0x200 +#define TCG_CT_CONST_I32 0x400 + /* Registers used with L constraint, which are the first argument registers on x86_64, and two random call clobbered registers on i386. */ @@ -99,18 +104,41 @@ static const int tcg_target_call_oarg_regs[] = { # define TCG_REG_L1 TCG_REG_EDX #endif +/* The host compiler should supply <cpuid.h> to enable runtime features + detection, as we're not going to go so far as our own inline assembly. + If not available, default values will be assumed. */ +#if defined(CONFIG_CPUID_H) +#include <cpuid.h> +#endif + /* For 32-bit, we are going to attempt to determine at runtime whether cmov - is available. However, the host compiler must supply <cpuid.h>, as we're - not going to go so far as our own inline assembly. */ + is available. */ #if TCG_TARGET_REG_BITS == 64 # define have_cmov 1 -#elif defined(CONFIG_CPUID_H) -#include <cpuid.h> +#elif defined(CONFIG_CPUID_H) && defined(bit_CMOV) static bool have_cmov; #else # define have_cmov 0 #endif +/* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are + going to attempt to determine at runtime whether movbe is available. */ +#if defined(CONFIG_CPUID_H) && defined(bit_MOVBE) +static bool have_movbe; +#else +# define have_movbe 0 +#endif + +/* We need this symbol in tcg-target.h, and we can't properly conditionalize + it there. Therefore we always define the variable. */ +bool have_bmi1; + +#if defined(CONFIG_CPUID_H) && defined(bit_BMI2) +static bool have_bmi2; +#else +# define have_bmi2 0 +#endif + static uint8_t *tb_ret_addr; static void patch_reloc(uint8_t *code_ptr, int type, @@ -153,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX); break; case 'c': + case_c: ct->ct |= TCG_CT_REG; tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX); break; @@ -181,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set32(ct->u.regs, 0, 0xf); break; case 'r': + case_r: ct->ct |= TCG_CT_REG; if (TCG_TARGET_REG_BITS == 64) { tcg_regset_set32(ct->u.regs, 0, 0xffff); @@ -188,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set32(ct->u.regs, 0, 0xff); } break; + case 'C': + /* With SHRX et al, we need not use ECX as shift count register. */ + if (have_bmi2) { + goto case_r; + } else { + goto case_c; + } /* qemu_ld/st address constraint */ case 'L': @@ -207,6 +244,9 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) case 'Z': ct->ct |= TCG_CT_CONST_U32; break; + case 'I': + ct->ct |= TCG_CT_CONST_I32; + break; default: return -1; @@ -230,6 +270,9 @@ static inline int tcg_target_const_match(tcg_target_long val, if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { return 1; } + if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { + return 1; + } return 0; } @@ -240,13 +283,14 @@ static inline int tcg_target_const_match(tcg_target_long val, #endif #define P_EXT 0x100 /* 0x0f opcode prefix */ -#define P_DATA16 0x200 /* 0x66 opcode prefix */ +#define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */ +#define P_DATA16 0x400 /* 0x66 opcode prefix */ #if TCG_TARGET_REG_BITS == 64 -# define P_ADDR32 0x400 /* 0x67 opcode prefix */ -# define P_REXW 0x800 /* Set REX.W = 1 */ -# define P_REXB_R 0x1000 /* REG field as byte register */ -# define P_REXB_RM 0x2000 /* R/M field as byte register */ -# define P_GS 0x4000 /* gs segment override */ +# define P_ADDR32 0x800 /* 0x67 opcode prefix */ +# define P_REXW 0x1000 /* Set REX.W = 1 */ +# define P_REXB_R 0x2000 /* REG field as byte register */ +# define P_REXB_RM 0x4000 /* R/M field as byte register */ +# define P_GS 0x8000 /* gs segment override */ #else # define P_ADDR32 0 # define P_REXW 0 @@ -254,10 +298,13 @@ static inline int tcg_target_const_match(tcg_target_long val, # define P_REXB_RM 0 # define P_GS 0 #endif +#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */ +#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */ #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ +#define OPC_ANDN (0xf2 | P_EXT38) #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) #define OPC_BSWAP (0xc8 | P_EXT) #define OPC_CALL_Jz (0xe8) @@ -279,6 +326,8 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_MOVB_EvIz (0xc6) #define OPC_MOVL_EvIz (0xc7) #define OPC_MOVL_Iv (0xb8) +#define OPC_MOVBE_GyMy (0xf0 | P_EXT38) +#define OPC_MOVBE_MyGy (0xf1 | P_EXT38) #define OPC_MOVSBL (0xbe | P_EXT) #define OPC_MOVSWL (0xbf | P_EXT) #define OPC_MOVSLQ (0x63 | P_REXW) @@ -293,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_SHIFT_1 (0xd1) #define OPC_SHIFT_Ib (0xc1) #define OPC_SHIFT_cl (0xd3) +#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) +#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) +#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) #define OPC_TESTL (0x85) #define OPC_XCHG_ax_r32 (0x90) @@ -381,10 +433,10 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) } rex = 0; - rex |= (opc & P_REXW) >> 8; /* REX.W */ - rex |= (r & 8) >> 1; /* REX.R */ - rex |= (x & 8) >> 2; /* REX.X */ - rex |= (rm & 8) >> 3; /* REX.B */ + rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ + rex |= (r & 8) >> 1; /* REX.R */ + rex |= (x & 8) >> 2; /* REX.X */ + rex |= (rm & 8) >> 3; /* REX.B */ /* P_REXB_{R,RM} indicates that the given register is the low byte. For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, @@ -398,9 +450,13 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) tcg_out8(s, (uint8_t)(rex | 0x40)); } - if (opc & P_EXT) { + if (opc & (P_EXT | P_EXT38)) { tcg_out8(s, 0x0f); + if (opc & P_EXT38) { + tcg_out8(s, 0x38); + } } + tcg_out8(s, opc); } #else @@ -409,8 +465,11 @@ static void tcg_out_opc(TCGContext *s, int opc) if (opc & P_DATA16) { tcg_out8(s, 0x66); } - if (opc & P_EXT) { + if (opc & (P_EXT | P_EXT38)) { tcg_out8(s, 0x0f); + if (opc & P_EXT38) { + tcg_out8(s, 0x38); + } } tcg_out8(s, opc); } @@ -426,6 +485,48 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } +static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) +{ + int tmp; + + if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) { + /* Three byte VEX prefix. */ + tcg_out8(s, 0xc4); + + /* VEX.m-mmmm */ + if (opc & P_EXT38) { + tmp = 2; + } else if (opc & P_EXT) { + tmp = 1; + } else { + tcg_abort(); + } + tmp |= 0x40; /* VEX.X */ + tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ + tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ + tcg_out8(s, tmp); + + tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ + } else { + /* Two byte VEX prefix. */ + tcg_out8(s, 0xc5); + + tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ + } + /* VEX.pp */ + if (opc & P_DATA16) { + tmp |= 1; /* 0x66 */ + } else if (opc & P_SIMDF3) { + tmp |= 2; /* 0xf3 */ + } else if (opc & P_SIMDF2) { + tmp |= 3; /* 0xf2 */ + } + tmp |= (~v & 15) << 3; /* VEX.vvvv */ + tcg_out8(s, tmp); + tcg_out8(s, opc); + tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); +} + /* Output an opcode with a full "rm + (index<<shift) + offset" address mode. We handle either RM and INDEX missing with a negative value. In 64-bit mode for absolute addresses, ~RM is the size of the immediate operand @@ -1336,7 +1437,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, TCGReg base, intptr_t ofs, int seg, TCGMemOp memop) { - const TCGMemOp bswap = memop & MO_BSWAP; + const TCGMemOp real_bswap = memop & MO_BSWAP; + TCGMemOp bswap = real_bswap; + int movop = OPC_MOVL_GvEv; + + if (have_movbe && real_bswap) { + bswap = 0; + movop = OPC_MOVBE_GyMy; + } switch (memop & MO_SSIZE) { case MO_UB: @@ -1347,14 +1455,19 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, break; case MO_UW: tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); - if (bswap) { + if (real_bswap) { tcg_out_rolw_8(s, datalo); } break; case MO_SW: - if (bswap) { - tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); - tcg_out_rolw_8(s, datalo); + if (real_bswap) { + if (have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, + datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); + tcg_out_rolw_8(s, datalo); + } tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo); } else { tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg, @@ -1362,16 +1475,18 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, } break; case MO_UL: - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs); if (bswap) { tcg_out_bswap32(s, datalo); } break; #if TCG_TARGET_REG_BITS == 64 case MO_SL: - if (bswap) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); - tcg_out_bswap32(s, datalo); + if (real_bswap) { + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs); + if (bswap) { + tcg_out_bswap32(s, datalo); + } tcg_out_ext32s(s, datalo, datalo); } else { tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs); @@ -1380,27 +1495,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, #endif case MO_Q: if (TCG_TARGET_REG_BITS == 64) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg, - datalo, base, ofs); + tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs); if (bswap) { tcg_out_bswap64(s, datalo); } } else { - if (bswap) { + if (real_bswap) { int t = datalo; datalo = datahi; datahi = t; } if (base != datalo) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datalo, base, ofs); - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datahi, base, ofs + 4); + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs); + tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4); } else { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datahi, base, ofs + 4); - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datalo, base, ofs); + tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4); + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs); } if (bswap) { tcg_out_bswap32(s, datalo); @@ -1476,13 +1586,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, TCGReg base, intptr_t ofs, int seg, TCGMemOp memop) { - const TCGMemOp bswap = memop & MO_BSWAP; - /* ??? Ideally we wouldn't need a scratch register. For user-only, we could perform the bswap twice to restore the original value instead of moving to the scratch. But as it is, the L constraint means that TCG_REG_L0 is definitely free here. */ const TCGReg scratch = TCG_REG_L0; + const TCGMemOp real_bswap = memop & MO_BSWAP; + TCGMemOp bswap = real_bswap; + int movop = OPC_MOVL_EvGv; + + if (have_movbe && real_bswap) { + bswap = 0; + movop = OPC_MOVBE_MyGy; + } switch (memop & MO_SIZE) { case MO_8: @@ -1501,8 +1617,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, tcg_out_rolw_8(s, scratch); datalo = scratch; } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg, - datalo, base, ofs); + tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs); break; case MO_32: if (bswap) { @@ -1510,7 +1625,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, tcg_out_bswap32(s, scratch); datalo = scratch; } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs); + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs); break; case MO_64: if (TCG_TARGET_REG_BITS == 64) { @@ -1519,8 +1634,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, tcg_out_bswap64(s, scratch); datalo = scratch; } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg, - datalo, base, ofs); + tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs); } else if (bswap) { tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi); tcg_out_bswap32(s, scratch); @@ -1529,8 +1643,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, tcg_out_bswap32(s, scratch); tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4); } else { - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs); - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4); + if (real_bswap) { + int t = datalo; + datalo = datahi; + datahi = t; + } + tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs); + tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4); } break; default: @@ -1597,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) { - int c, rexw = 0; + int c, vexop, rexw = 0; #if TCG_TARGET_REG_BITS == 64 # define OP_32_64(x) \ @@ -1733,6 +1852,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, } break; + OP_32_64(andc): + if (const_args[2]) { + tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, + args[0], args[1]); + tgen_arithi(s, ARITH_AND + rexw, args[0], ~args[2], 0); + } else { + tcg_out_vex_modrm(s, OPC_ANDN + rexw, args[0], args[2], args[1]); + } + break; + OP_32_64(mul): if (const_args[2]) { int32_t val; @@ -1758,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, OP_32_64(shl): c = SHIFT_SHL; - goto gen_shift; + vexop = OPC_SHLX; + goto gen_shift_maybe_vex; OP_32_64(shr): c = SHIFT_SHR; - goto gen_shift; + vexop = OPC_SHRX; + goto gen_shift_maybe_vex; OP_32_64(sar): c = SHIFT_SAR; - goto gen_shift; + vexop = OPC_SARX; + goto gen_shift_maybe_vex; OP_32_64(rotl): c = SHIFT_ROL; goto gen_shift; OP_32_64(rotr): c = SHIFT_ROR; goto gen_shift; + gen_shift_maybe_vex: + if (have_bmi2 && !const_args[2]) { + tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); + break; + } + /* FALLTHRU */ gen_shift: if (const_args[2]) { tcg_out_shifti(s, c + rexw, args[0], args[2]); @@ -1961,10 +2099,11 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_and_i32, { "r", "0", "ri" } }, { INDEX_op_or_i32, { "r", "0", "ri" } }, { INDEX_op_xor_i32, { "r", "0", "ri" } }, + { INDEX_op_andc_i32, { "r", "r", "ri" } }, - { INDEX_op_shl_i32, { "r", "0", "ci" } }, - { INDEX_op_shr_i32, { "r", "0", "ci" } }, - { INDEX_op_sar_i32, { "r", "0", "ci" } }, + { INDEX_op_shl_i32, { "r", "0", "Ci" } }, + { INDEX_op_shr_i32, { "r", "0", "Ci" } }, + { INDEX_op_sar_i32, { "r", "0", "Ci" } }, { INDEX_op_rotl_i32, { "r", "0", "ci" } }, { INDEX_op_rotr_i32, { "r", "0", "ci" } }, @@ -1985,9 +2124,7 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_setcond_i32, { "q", "r", "ri" } }, { INDEX_op_deposit_i32, { "Q", "0", "Q" } }, -#if TCG_TARGET_HAS_movcond_i32 { INDEX_op_movcond_i32, { "r", "r", "ri", "r", "0" } }, -#endif { INDEX_op_mulu2_i32, { "a", "d", "a", "r" } }, { INDEX_op_muls2_i32, { "a", "d", "a", "r" } }, @@ -2020,10 +2157,11 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_and_i64, { "r", "0", "reZ" } }, { INDEX_op_or_i64, { "r", "0", "re" } }, { INDEX_op_xor_i64, { "r", "0", "re" } }, + { INDEX_op_andc_i64, { "r", "r", "rI" } }, - { INDEX_op_shl_i64, { "r", "0", "ci" } }, - { INDEX_op_shr_i64, { "r", "0", "ci" } }, - { INDEX_op_sar_i64, { "r", "0", "ci" } }, + { INDEX_op_shl_i64, { "r", "0", "Ci" } }, + { INDEX_op_shr_i64, { "r", "0", "Ci" } }, + { INDEX_op_sar_i64, { "r", "0", "Ci" } }, { INDEX_op_rotl_i64, { "r", "0", "ci" } }, { INDEX_op_rotr_i64, { "r", "0", "ci" } }, @@ -2157,13 +2295,34 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_target_init(TCGContext *s) { - /* For 32-bit, 99% certainty that we're running on hardware that supports - cmov, but we still need to check. In case cmov is not available, we'll - use a small forward branch. */ +#ifdef CONFIG_CPUID_H + unsigned a, b, c, d; + int max = __get_cpuid_max(0, 0); + + if (max >= 1) { + __cpuid(1, a, b, c, d); #ifndef have_cmov - { - unsigned a, b, c, d; - have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV)); + /* For 32-bit, 99% certainty that we're running on hardware that + supports cmov, but we still need to check. In case cmov is not + available, we'll use a small forward branch. */ + have_cmov = (d & bit_CMOV) != 0; +#endif +#ifndef have_movbe + /* MOVBE is only available on Intel Atom and Haswell CPUs, so we + need to probe for it. */ + have_movbe = (c & bit_MOVBE) != 0; +#endif + } + + if (max >= 7) { + /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ + __cpuid_count(7, 0, a, b, c, d); +#ifdef bit_BMI + have_bmi1 = (b & bit_BMI) != 0; +#endif +#ifndef have_bmi2 + have_bmi2 = (b & bit_BMI2) != 0; +#endif } #endif |