author    | Stefan Hajnoczi | 2022-09-02 19:24:28 +0200
committer | Stefan Hajnoczi | 2022-09-02 19:24:28 +0200
commit    | 61fd710b8da8aedcea9b4f197283dc38638e4b60 (patch)
tree      | fc9d835042c6f1cfe17d9c6b71e22303098db8f6 /target
parent    | Merge tag 'net-pull-request' of https://github.com/jasowang/qemu into staging (diff)
parent    | target/i386: AVX+AES helpers prep (diff)
Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging
* SCSI fixes for Mac OS 9
* Fix CPU reset for x86/KVM nested virtualization state
* remove feature_not_found() from the configure script
* Meson cleanups from muon
* improved i386 TCG tests for BMI and SSE
* SSE bugfixes
# -----BEGIN PGP SIGNATURE-----
#
# iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmMQ+IQUHHBib256aW5p
# QHJlZGhhdC5jb20ACgkQv/vSX3jHroNofQgArLRlbhua699UyDkTEGGv+gBVRRKg
# qJndTFJp5cvjJo3fNeE1XyZGY0PGLH09ilwHKnGqvn7Bc996ty6zi3sLDC+iT/SO
# cRik6EVgZH/0QseYZijviuz7NklL8so/bgn7sORP9ibRWwiojBzm91emUt4X2l5N
# WOmxLYNIPXR/G8LOSv5Dh4C4WXU3zuaLvTmg/fWPoWTF8P+9LU0gEKUzyk0jMJu4
# hb9lVLXyNbgEcdtK+VewWjsdJcdmF1tMAR94GTmbUdwxbwmATqX8w16jGUbnXPt2
# FZfmjS6CJO90uV7wBA91NnFlrJpWyDn1dKQ+ozpW0ZOAO+wfghpVq7/IRA==
# =VRK4
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 01 Sep 2022 14:23:00 EDT
# gpg: using RSA key F13338574B662389866C7682BFFBD25F78C7AE83
# gpg: issuer "pbonzini@redhat.com"
# gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full]
# gpg: aka "Paolo Bonzini <pbonzini@redhat.com>" [full]
# Primary key fingerprint: 46F5 9FBD 57D6 12E7 BFD4 E2F7 7E15 100C CD36 69B1
# Subkey fingerprint: F133 3857 4B66 2389 866C 7682 BFFB D25F 78C7 AE83
* tag 'for-upstream' of https://gitlab.com/bonzini/qemu: (39 commits)
target/i386: AVX+AES helpers prep
target/i386: AVX pclmulqdq prep
target/i386: Rewrite blendv helpers
target/i386: Misc AVX helper prep
target/i386: Destructive FP helpers for AVX
target/i386: Dot product AVX helper prep
target/i386: reimplement AVX comparison helpers
target/i386: Floating point arithmetic helper AVX prep
target/i386: Destructive vector helpers for AVX
target/i386: Misc integer AVX helper prep
target/i386: Rewrite simple integer vector helpers
target/i386: Rewrite vector shift helper
target/i386: rewrite destructive 3DNow operations
target/i386: Add CHECK_NO_VEX
target/i386: do not cast gen_helper_* function pointers
target/i386: Add size suffix to vector FP helpers
target/i386: isolate MMX code more
target/i386: check SSE table flags instead of hardcoding opcodes
target/i386: Move 3DNOW decoder
target/i386: Rework sse_op_table6/7
...
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
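The nested-virtualization fix in this pull (see the kvm.c hunk below) moves the nested-state initialization into a helper, kvm_init_nested_state(), and calls it from kvm_arch_reset_vcpu() as well as from vCPU creation, so a reset wipes stale VMX/SVM state instead of leaving it behind. A minimal standalone sketch of the pattern, with simplified stand-in types rather than the real kvm_nested_state ABI:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for the kernel's kvm_nested_state layout. */
struct nested_state {
    uint32_t size;      /* total size of this buffer, fixed at allocation */
    uint32_t format;
    uint64_t vmxon_pa;
    uint64_t vmcs12_pa;
    uint8_t  data[];    /* variable-size payload, as in the kernel ABI */
};

/* Wipe the buffer back to a clean state while preserving its size,
 * mirroring what kvm_init_nested_state() does in the patch. */
static void reset_nested_state(struct nested_state *ns, int has_vmx)
{
    uint32_t size = ns->size;

    memset(ns, 0, size);       /* clear header and payload */
    ns->size = size;           /* the size must survive the wipe */
    if (has_vmx) {
        ns->format = 1;        /* stands in for KVM_STATE_NESTED_FORMAT_VMX */
        ns->vmxon_pa = -1ull;  /* no VMXON region: VMX not in operation */
        ns->vmcs12_pa = -1ull;
    }
}

int main(void)
{
    uint32_t len = sizeof(struct nested_state) + 4096;
    struct nested_state *ns = calloc(1, len);

    ns->size = len;
    reset_nested_state(ns, 1);  /* called at vCPU init *and* on reset */
    free(ns);
    return 0;
}
```

The same hunk also reorders kvm_arch_put_registers() so that MSR_IA32_FEATURE_CONTROL is written before the nested state, which takes the VM out of VMX root operation before the freshly zeroed nested state is pushed to KVM.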
Diffstat (limited to 'target')
-rw-r--r--  target/i386/kvm/kvm.c        |   54
-rw-r--r--  target/i386/ops_sse.h        | 1835
-rw-r--r--  target/i386/ops_sse_header.h |   70
-rw-r--r--  target/i386/tcg/translate.c  |  833
-rw-r--r--  target/riscv/meson.build     |    2
5 files changed, 1465 insertions(+), 1329 deletions(-)
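Most of the ops_sse.h churn below follows one pattern: hand-unrolled per-element statements guarded by `#if SHIFT == 1` are replaced with loops whose trip counts are derived from SHIFT (1 << SHIFT quadwords, 2 << SHIFT dwords, 4 << SHIFT words, 8 << SHIFT bytes), so the same helper body can later be instantiated a third time for 256-bit AVX. A toy model of that parameterization, using plain C types instead of QEMU's Reg/glue machinery:

```c
#include <stdint.h>
#include <stdio.h>

/* One helper body, compiled per vector width.  SHIFT==0 models MMX
 * (64-bit), SHIFT==1 SSE (128-bit); the rewrite's point is that
 * SHIFT==2 (AVX, 256-bit) then falls out of the same source for free. */
#define SHIFT 1
#define NQ (1 << SHIFT)   /* number of 64-bit lanes */
#define NW (4 << SHIFT)   /* number of 16-bit lanes */

typedef union {
    uint64_t q[NQ];
    uint16_t w[NW];
} Reg;

/* Word-granularity logical right shift, shaped like the reworked
 * helper_psrlw: a count above 15 clears the register; otherwise every
 * 16-bit lane is shifted by the count in the source's low byte. */
static void psrlw(Reg *d, const Reg *c)
{
    if (c->q[0] > 15) {
        for (int i = 0; i < NQ; i++) {
            d->q[i] = 0;
        }
    } else {
        int shift = c->q[0] & 0xff;
        for (int i = 0; i < NW; i++) {
            d->w[i] >>= shift;
        }
    }
}

int main(void)
{
    Reg d = { .w = { 0x8000, 0x0040, 0x1234, 0xffff } };
    Reg c = { .q = { 4 } };

    psrlw(&d, &c);
    printf("%04x %04x\n", d.w[0], d.w[1]);  /* prints: 0800 0004 */
    return 0;
}
```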
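One other self-contained idiom from the rewrite is worth calling out: the new movmskps/movmskpd bodies gather sign bits with a shift-and-mask per lane instead of naming each lane. Shifting lane i right by (31 - i) parks its sign bit at position i, and the (1 << i) mask discards the rest. A quick check of the idiom in isolation (a hypothetical standalone function, not QEMU code):

```c
#include <stdint.h>
#include <stdio.h>

/* Collect the sign bit of each 32-bit lane into a bitmask. */
static uint32_t movmskps(const uint32_t *lanes, int n)
{
    uint32_t mask = 0;

    for (int i = 0; i < n; i++) {
        mask |= (lanes[i] >> (31 - i)) & (1u << i);
    }
    return mask;
}

int main(void)
{
    uint32_t v[4] = { 0x80000000, 0x7fffffff, 0xffffffff, 0x00000001 };

    printf("%x\n", movmskps(v, 4));  /* prints 5: lanes 0 and 2 are negative */
    return 0;
}
```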
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index f148a6d52f..a1fd1f5379 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -1695,6 +1695,30 @@ static void kvm_init_xsave(CPUX86State *env) env->xsave_buf_len); } +static void kvm_init_nested_state(CPUX86State *env) +{ + struct kvm_vmx_nested_state_hdr *vmx_hdr; + uint32_t size; + + if (!env->nested_state) { + return; + } + + size = env->nested_state->size; + + memset(env->nested_state, 0, size); + env->nested_state->size = size; + + if (cpu_has_vmx(env)) { + env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX; + vmx_hdr = &env->nested_state->hdr.vmx; + vmx_hdr->vmxon_pa = -1ull; + vmx_hdr->vmcs12_pa = -1ull; + } else if (cpu_has_svm(env)) { + env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM; + } +} + int kvm_arch_init_vcpu(CPUState *cs) { struct { @@ -2122,19 +2146,10 @@ int kvm_arch_init_vcpu(CPUState *cs) assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data)); if (cpu_has_vmx(env) || cpu_has_svm(env)) { - struct kvm_vmx_nested_state_hdr *vmx_hdr; - env->nested_state = g_malloc0(max_nested_state_len); env->nested_state->size = max_nested_state_len; - if (cpu_has_vmx(env)) { - env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX; - vmx_hdr = &env->nested_state->hdr.vmx; - vmx_hdr->vmxon_pa = -1ull; - vmx_hdr->vmcs12_pa = -1ull; - } else { - env->nested_state->format = KVM_STATE_NESTED_FORMAT_SVM; - } + kvm_init_nested_state(env); } } @@ -2199,6 +2214,8 @@ void kvm_arch_reset_vcpu(X86CPU *cpu) /* enabled by default */ env->poll_control_msr = 1; + kvm_init_nested_state(env); + sev_es_set_reset_vector(CPU(cpu)); } @@ -4512,6 +4529,18 @@ int kvm_arch_put_registers(CPUState *cpu, int level) assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu)); + /* + * Put MSR_IA32_FEATURE_CONTROL first, this ensures the VM gets out of VMX + * root operation upon vCPU reset. kvm_put_msr_feature_control() should also + * preceed kvm_put_nested_state() when 'real' nested state is set. + */ + if (level >= KVM_PUT_RESET_STATE) { + ret = kvm_put_msr_feature_control(x86_cpu); + if (ret < 0) { + return ret; + } + } + /* must be before kvm_put_nested_state so that EFER.SVME is set */ ret = has_sregs2 ? kvm_put_sregs2(x86_cpu) : kvm_put_sregs(x86_cpu); if (ret < 0) { @@ -4523,11 +4552,6 @@ int kvm_arch_put_registers(CPUState *cpu, int level) if (ret < 0) { return ret; } - - ret = kvm_put_msr_feature_control(x86_cpu); - if (ret < 0) { - return ret; - } } if (level == KVM_PUT_FULL_STATE) { diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 535440f882..c0766de18d 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -22,7 +22,6 @@ #if SHIFT == 0 #define Reg MMXReg -#define SIZE 8 #define XMM_ONLY(...) #define B(n) MMX_B(n) #define W(n) MMX_W(n) @@ -31,7 +30,6 @@ #define SUFFIX _mmx #else #define Reg ZMMReg -#define SIZE 16 #define XMM_ONLY(...) __VA_ARGS__ #define B(n) ZMM_B(n) #define W(n) ZMM_W(n) @@ -40,275 +38,210 @@ #define SUFFIX _xmm #endif -/* - * Copy the relevant parts of a Reg value around. In the case where - * sizeof(Reg) > SIZE, these helpers operate only on the lower bytes of - * a 64 byte ZMMReg, so we must copy only those and keep the top bytes - * untouched in the guest-visible destination destination register. - * Note that the "lower bytes" are placed last in memory on big-endian - * hosts, which store the vector backwards in memory. In that case the - * copy *starts* at B(SIZE - 1) and ends at B(0), the opposite of - * the little-endian case. 
- */ -#if HOST_BIG_ENDIAN -#define MOVE(d, r) memcpy(&((d).B(SIZE - 1)), &(r).B(SIZE - 1), SIZE) -#else -#define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE) +#define LANE_WIDTH (SHIFT ? 16 : 8) +#define PACK_WIDTH (LANE_WIDTH / 2) + +#if SHIFT == 0 +#define FPSRL(x, c) ((x) >> shift) +#define FPSRAW(x, c) ((int16_t)(x) >> shift) +#define FPSRAL(x, c) ((int32_t)(x) >> shift) +#define FPSLL(x, c) ((x) << shift) #endif -void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { + Reg *s = d; int shift; - - if (s->Q(0) > 15) { - d->Q(0) = 0; -#if SHIFT == 1 - d->Q(1) = 0; -#endif + if (c->Q(0) > 15) { + for (int i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } } else { - shift = s->B(0); - d->W(0) >>= shift; - d->W(1) >>= shift; - d->W(2) >>= shift; - d->W(3) >>= shift; -#if SHIFT == 1 - d->W(4) >>= shift; - d->W(5) >>= shift; - d->W(6) >>= shift; - d->W(7) >>= shift; -#endif + shift = c->B(0); + for (int i = 0; i < 4 << SHIFT; i++) { + d->W(i) = FPSRL(s->W(i), shift); + } } } -void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { + Reg *s = d; int shift; - - if (s->Q(0) > 15) { - shift = 15; + if (c->Q(0) > 15) { + for (int i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } } else { - shift = s->B(0); + shift = c->B(0); + for (int i = 0; i < 4 << SHIFT; i++) { + d->W(i) = FPSLL(s->W(i), shift); + } } - d->W(0) = (int16_t)d->W(0) >> shift; - d->W(1) = (int16_t)d->W(1) >> shift; - d->W(2) = (int16_t)d->W(2) >> shift; - d->W(3) = (int16_t)d->W(3) >> shift; -#if SHIFT == 1 - d->W(4) = (int16_t)d->W(4) >> shift; - d->W(5) = (int16_t)d->W(5) >> shift; - d->W(6) = (int16_t)d->W(6) >> shift; - d->W(7) = (int16_t)d->W(7) >> shift; -#endif } -void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { + Reg *s = d; int shift; - - if (s->Q(0) > 15) { - d->Q(0) = 0; -#if SHIFT == 1 - d->Q(1) = 0; -#endif + if (c->Q(0) > 15) { + shift = 15; } else { - shift = s->B(0); - d->W(0) <<= shift; - d->W(1) <<= shift; - d->W(2) <<= shift; - d->W(3) <<= shift; -#if SHIFT == 1 - d->W(4) <<= shift; - d->W(5) <<= shift; - d->W(6) <<= shift; - d->W(7) <<= shift; -#endif + shift = c->B(0); + } + for (int i = 0; i < 4 << SHIFT; i++) { + d->W(i) = FPSRAW(s->W(i), shift); } } -void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { + Reg *s = d; int shift; - - if (s->Q(0) > 31) { - d->Q(0) = 0; -#if SHIFT == 1 - d->Q(1) = 0; -#endif + if (c->Q(0) > 31) { + for (int i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } } else { - shift = s->B(0); - d->L(0) >>= shift; - d->L(1) >>= shift; -#if SHIFT == 1 - d->L(2) >>= shift; - d->L(3) >>= shift; -#endif + shift = c->B(0); + for (int i = 0; i < 2 << SHIFT; i++) { + d->L(i) = FPSRL(s->L(i), shift); + } } } -void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { + Reg *s = d; int shift; - - if (s->Q(0) > 31) { - shift = 31; + if (c->Q(0) > 31) { + for (int i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } } else { - shift = s->B(0); + shift = c->B(0); + for (int i = 0; i < 2 << SHIFT; i++) { + d->L(i) = FPSLL(s->L(i), shift); + } } - d->L(0) = (int32_t)d->L(0) >> shift; - d->L(1) = (int32_t)d->L(1) >> shift; -#if SHIFT == 1 - d->L(2) = (int32_t)d->L(2) >> shift; - d->L(3) = (int32_t)d->L(3) >> shift; 
-#endif } -void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { + Reg *s = d; int shift; - - if (s->Q(0) > 31) { - d->Q(0) = 0; -#if SHIFT == 1 - d->Q(1) = 0; -#endif + if (c->Q(0) > 31) { + shift = 31; } else { - shift = s->B(0); - d->L(0) <<= shift; - d->L(1) <<= shift; -#if SHIFT == 1 - d->L(2) <<= shift; - d->L(3) <<= shift; -#endif + shift = c->B(0); + } + for (int i = 0; i < 2 << SHIFT; i++) { + d->L(i) = FPSRAL(s->L(i), shift); } } -void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { + Reg *s = d; int shift; - - if (s->Q(0) > 63) { - d->Q(0) = 0; -#if SHIFT == 1 - d->Q(1) = 0; -#endif + if (c->Q(0) > 63) { + for (int i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } } else { - shift = s->B(0); - d->Q(0) >>= shift; -#if SHIFT == 1 - d->Q(1) >>= shift; -#endif + shift = c->B(0); + for (int i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = FPSRL(s->Q(i), shift); + } } } -void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { + Reg *s = d; int shift; - - if (s->Q(0) > 63) { - d->Q(0) = 0; -#if SHIFT == 1 - d->Q(1) = 0; -#endif + if (c->Q(0) > 63) { + for (int i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } } else { - shift = s->B(0); - d->Q(0) <<= shift; -#if SHIFT == 1 - d->Q(1) <<= shift; -#endif + shift = c->B(0); + for (int i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = FPSLL(s->Q(i), shift); + } } } -#if SHIFT == 1 -void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +#if SHIFT >= 1 +void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { - int shift, i; + Reg *s = d; + int shift, i, j; - shift = s->L(0); + shift = c->L(0); if (shift > 16) { shift = 16; } - for (i = 0; i < 16 - shift; i++) { - d->B(i) = d->B(i + shift); - } - for (i = 16 - shift; i < 16; i++) { - d->B(i) = 0; + for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) { + for (i = 0; i < 16 - shift; i++) { + d->B(j + i) = s->B(j + i + shift); + } + for (i = 16 - shift; i < 16; i++) { + d->B(j + i) = 0; + } } } -void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c) { - int shift, i; + Reg *s = d; + int shift, i, j; - shift = s->L(0); + shift = c->L(0); if (shift > 16) { shift = 16; } - for (i = 15; i >= shift; i--) { - d->B(i) = d->B(i - shift); - } - for (i = 0; i < shift; i++) { - d->B(i) = 0; + for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) { + for (i = 15; i >= shift; i--) { + d->B(j + i) = s->B(j + i - shift); + } + for (i = 0; i < shift; i++) { + d->B(j + i) = 0; + } } } #endif -#define SSE_HELPER_B(name, F) \ +#define SSE_HELPER_1(name, elem, num, F) \ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ { \ - d->B(0) = F(d->B(0), s->B(0)); \ - d->B(1) = F(d->B(1), s->B(1)); \ - d->B(2) = F(d->B(2), s->B(2)); \ - d->B(3) = F(d->B(3), s->B(3)); \ - d->B(4) = F(d->B(4), s->B(4)); \ - d->B(5) = F(d->B(5), s->B(5)); \ - d->B(6) = F(d->B(6), s->B(6)); \ - d->B(7) = F(d->B(7), s->B(7)); \ - XMM_ONLY( \ - d->B(8) = F(d->B(8), s->B(8)); \ - d->B(9) = F(d->B(9), s->B(9)); \ - d->B(10) = F(d->B(10), s->B(10)); \ - d->B(11) = F(d->B(11), s->B(11)); \ - d->B(12) = F(d->B(12), s->B(12)); \ - d->B(13) = F(d->B(13), s->B(13)); \ - d->B(14) = F(d->B(14), s->B(14)); \ - d->B(15) = F(d->B(15), s->B(15)); \ - ) \ - } + int n = num; \ + for (int i = 0; i < n; i++) { \ + d->elem(i) = F(s->elem(i)); \ + 
} \ + } -#define SSE_HELPER_W(name, F) \ +#define SSE_HELPER_2(name, elem, num, F) \ void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ { \ - d->W(0) = F(d->W(0), s->W(0)); \ - d->W(1) = F(d->W(1), s->W(1)); \ - d->W(2) = F(d->W(2), s->W(2)); \ - d->W(3) = F(d->W(3), s->W(3)); \ - XMM_ONLY( \ - d->W(4) = F(d->W(4), s->W(4)); \ - d->W(5) = F(d->W(5), s->W(5)); \ - d->W(6) = F(d->W(6), s->W(6)); \ - d->W(7) = F(d->W(7), s->W(7)); \ - ) \ - } + Reg *v = d; \ + int n = num; \ + for (int i = 0; i < n; i++) { \ + d->elem(i) = F(v->elem(i), s->elem(i)); \ + } \ + } + +#define SSE_HELPER_B(name, F) \ + SSE_HELPER_2(name, B, 8 << SHIFT, F) + +#define SSE_HELPER_W(name, F) \ + SSE_HELPER_2(name, W, 4 << SHIFT, F) #define SSE_HELPER_L(name, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ - { \ - d->L(0) = F(d->L(0), s->L(0)); \ - d->L(1) = F(d->L(1), s->L(1)); \ - XMM_ONLY( \ - d->L(2) = F(d->L(2), s->L(2)); \ - d->L(3) = F(d->L(3), s->L(3)); \ - ) \ - } + SSE_HELPER_2(name, L, 2 << SHIFT, F) #define SSE_HELPER_Q(name, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ - { \ - d->Q(0) = F(d->Q(0), s->Q(0)); \ - XMM_ONLY( \ - d->Q(1) = F(d->Q(1), s->Q(1)); \ - ) \ - } + SSE_HELPER_2(name, Q, 1 << SHIFT, F) #if SHIFT == 0 static inline int satub(int x) @@ -440,19 +373,22 @@ SSE_HELPER_W(helper_pavgw, FAVG) void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0); -#if SHIFT == 1 - d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2); -#endif + Reg *v = d; + int i; + + for (i = 0; i < (1 << SHIFT); i++) { + d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2); + } } void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { + Reg *v = d; int i; for (i = 0; i < (2 << SHIFT); i++) { - d->L(i) = (int16_t)s->W(2 * i) * (int16_t)d->W(2 * i) + - (int16_t)s->W(2 * i + 1) * (int16_t)d->W(2 * i + 1); + d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) + + (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1); } } @@ -466,34 +402,27 @@ static inline int abs1(int a) } } #endif + void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - unsigned int val; + Reg *v = d; + int i; - val = 0; - val += abs1(d->B(0) - s->B(0)); - val += abs1(d->B(1) - s->B(1)); - val += abs1(d->B(2) - s->B(2)); - val += abs1(d->B(3) - s->B(3)); - val += abs1(d->B(4) - s->B(4)); - val += abs1(d->B(5) - s->B(5)); - val += abs1(d->B(6) - s->B(6)); - val += abs1(d->B(7) - s->B(7)); - d->Q(0) = val; -#if SHIFT == 1 - val = 0; - val += abs1(d->B(8) - s->B(8)); - val += abs1(d->B(9) - s->B(9)); - val += abs1(d->B(10) - s->B(10)); - val += abs1(d->B(11) - s->B(11)); - val += abs1(d->B(12) - s->B(12)); - val += abs1(d->B(13) - s->B(13)); - val += abs1(d->B(14) - s->B(14)); - val += abs1(d->B(15) - s->B(15)); - d->Q(1) = val; -#endif + for (i = 0; i < (1 << SHIFT); i++) { + unsigned int val = 0; + val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0)); + val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1)); + val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2)); + val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3)); + val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4)); + val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5)); + val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6)); + val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7)); + d->Q(i) = val; + } } +#if SHIFT < 2 void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, target_ulong a0) { @@ -505,128 +434,161 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } } } +#endif void 
glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val) { + int i; + d->L(0) = val; d->L(1) = 0; -#if SHIFT == 1 - d->Q(1) = 0; -#endif + for (i = 1; i < (1 << SHIFT); i++) { + d->Q(i) = 0; + } } #ifdef TARGET_X86_64 void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val) { + int i; + d->Q(0) = val; -#if SHIFT == 1 - d->Q(1) = 0; -#endif + for (i = 1; i < (1 << SHIFT); i++) { + d->Q(i) = 0; + } } #endif +#define SHUFFLE4(F, a, b, offset) do { \ + r0 = a->F((order & 3) + offset); \ + r1 = a->F(((order >> 2) & 3) + offset); \ + r2 = b->F(((order >> 4) & 3) + offset); \ + r3 = b->F(((order >> 6) & 3) + offset); \ + d->F(offset) = r0; \ + d->F(offset + 1) = r1; \ + d->F(offset + 2) = r2; \ + d->F(offset + 3) = r3; \ + } while (0) + #if SHIFT == 0 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + uint16_t r0, r1, r2, r3; - r.W(0) = s->W(order & 3); - r.W(1) = s->W((order >> 2) & 3); - r.W(2) = s->W((order >> 4) & 3); - r.W(3) = s->W((order >> 6) & 3); - MOVE(*d, r); + SHUFFLE4(W, s, s, 0); } #else -void helper_shufps(Reg *d, Reg *s, int order) +void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + Reg *v = d; + uint32_t r0, r1, r2, r3; + int i; - r.L(0) = d->L(order & 3); - r.L(1) = d->L((order >> 2) & 3); - r.L(2) = s->L((order >> 4) & 3); - r.L(3) = s->L((order >> 6) & 3); - MOVE(*d, r); + for (i = 0; i < 2 << SHIFT; i += 4) { + SHUFFLE4(L, v, s, i); + } } -void helper_shufpd(Reg *d, Reg *s, int order) +void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + Reg *v = d; + uint64_t r0, r1; + int i; - r.Q(0) = d->Q(order & 1); - r.Q(1) = s->Q((order >> 1) & 1); - MOVE(*d, r); + for (i = 0; i < 1 << SHIFT; i += 2) { + r0 = v->Q(((order & 1) & 1) + i); + r1 = s->Q(((order >> 1) & 1) + i); + d->Q(i) = r0; + d->Q(i + 1) = r1; + order >>= 2; + } } void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + uint32_t r0, r1, r2, r3; + int i; - r.L(0) = s->L(order & 3); - r.L(1) = s->L((order >> 2) & 3); - r.L(2) = s->L((order >> 4) & 3); - r.L(3) = s->L((order >> 6) & 3); - MOVE(*d, r); + for (i = 0; i < 2 << SHIFT; i += 4) { + SHUFFLE4(L, s, s, i); + } } void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + uint16_t r0, r1, r2, r3; + int i, j; - r.W(0) = s->W(order & 3); - r.W(1) = s->W((order >> 2) & 3); - r.W(2) = s->W((order >> 4) & 3); - r.W(3) = s->W((order >> 6) & 3); - r.Q(1) = s->Q(1); - MOVE(*d, r); + for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) { + SHUFFLE4(W, s, s, i); + d->Q(j) = s->Q(j); + } } void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order) { - Reg r; + uint16_t r0, r1, r2, r3; + int i, j; - r.Q(0) = s->Q(0); - r.W(4) = s->W(4 + (order & 3)); - r.W(5) = s->W(4 + ((order >> 2) & 3)); - r.W(6) = s->W(4 + ((order >> 4) & 3)); - r.W(7) = s->W(4 + ((order >> 6) & 3)); - MOVE(*d, r); + for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) { + d->Q(j) = s->Q(j); + SHUFFLE4(W, s, s, i); + } } #endif -#if SHIFT == 1 +#if SHIFT >= 1 /* FPU ops */ /* XXX: not accurate */ -#define SSE_HELPER_S(name, F) \ - void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s) \ +#define SSE_HELPER_P(name, F) \ + void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *s) \ { \ - d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ - d->ZMM_S(1) = F(32, d->ZMM_S(1), s->ZMM_S(1)); \ - d->ZMM_S(2) = F(32, d->ZMM_S(2), s->ZMM_S(2)); \ - d->ZMM_S(3) = F(32, d->ZMM_S(3), s->ZMM_S(3)); \ + Reg *v = d; \ + int i; \ + for (i = 0; i < 2 << SHIFT; i++) { \ + d->ZMM_S(i) = F(32, v->ZMM_S(i), 
s->ZMM_S(i)); \ + } \ } \ \ - void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ + void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *s) \ { \ - d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ - } \ + Reg *v = d; \ + int i; \ + for (i = 0; i < 1 << SHIFT; i++) { \ + d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i)); \ + } \ + } + +#if SHIFT == 1 + +#define SSE_HELPER_S(name, F) \ + SSE_HELPER_P(name, F) \ \ - void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s) \ + void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)\ { \ - d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ - d->ZMM_D(1) = F(64, d->ZMM_D(1), s->ZMM_D(1)); \ + Reg *v = d; \ + d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0)); \ } \ \ - void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ + void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)\ { \ - d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ + Reg *v = d; \ + d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0)); \ } +#else + +#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F) + +#endif + #define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status) #define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status) #define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status) #define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status) -#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status) /* Note that the choice of comparison op here is important to get the * special cases right: for min and max Intel specifies that (-0,0), @@ -643,27 +605,56 @@ SSE_HELPER_S(mul, FPU_MUL) SSE_HELPER_S(div, FPU_DIV) SSE_HELPER_S(min, FPU_MIN) SSE_HELPER_S(max, FPU_MAX) -SSE_HELPER_S(sqrt, FPU_SQRT) +void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + int i; + for (i = 0; i < 2 << SHIFT; i++) { + d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status); + } +} -/* float to float conversions */ -void helper_cvtps2pd(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - float32 s0, s1; + int i; + for (i = 0; i < 1 << SHIFT; i++) { + d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status); + } +} - s0 = s->ZMM_S(0); - s1 = s->ZMM_S(1); - d->ZMM_D(0) = float32_to_float64(s0, &env->sse_status); - d->ZMM_D(1) = float32_to_float64(s1, &env->sse_status); +#if SHIFT == 1 +void helper_sqrtss(CPUX86State *env, Reg *d, Reg *s) +{ + d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status); } -void helper_cvtpd2ps(CPUX86State *env, Reg *d, Reg *s) +void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *s) { - d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status); - d->ZMM_S(1) = float64_to_float32(s->ZMM_D(1), &env->sse_status); - d->Q(1) = 0; + d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status); } +#endif +/* float to float conversions */ +void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + int i; + for (i = 1 << SHIFT; --i >= 0; ) { + d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status); + } +} + +void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + int i; + for (i = 0; i < 1 << SHIFT; i++) { + d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status); + } + for (i >>= 1; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } +} + +#if SHIFT == 1 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s) { d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status); @@ -673,26 +664,27 @@ void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s) { 
d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status); } +#endif /* integer to float */ -void helper_cvtdq2ps(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - d->ZMM_S(0) = int32_to_float32(s->ZMM_L(0), &env->sse_status); - d->ZMM_S(1) = int32_to_float32(s->ZMM_L(1), &env->sse_status); - d->ZMM_S(2) = int32_to_float32(s->ZMM_L(2), &env->sse_status); - d->ZMM_S(3) = int32_to_float32(s->ZMM_L(3), &env->sse_status); + int i; + for (i = 0; i < 2 << SHIFT; i++) { + d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status); + } } -void helper_cvtdq2pd(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - int32_t l0, l1; - - l0 = (int32_t)s->ZMM_L(0); - l1 = (int32_t)s->ZMM_L(1); - d->ZMM_D(0) = int32_to_float64(l0, &env->sse_status); - d->ZMM_D(1) = int32_to_float64(l1, &env->sse_status); + int i; + for (i = 1 << SHIFT; --i >= 0; ) { + int32_t l = s->ZMM_L(i); + d->ZMM_D(i) = int32_to_float64(l, &env->sse_status); + } } +#if SHIFT == 1 void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s) { d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status); @@ -727,8 +719,11 @@ void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val) } #endif +#endif + /* float to integer */ +#if SHIFT == 1 /* * x86 mandates that we return the indefinite integer value for the result * of any float-to-integer conversion that raises the 'invalid' exception. @@ -759,22 +754,28 @@ WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN) WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN) WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN) WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN) +#endif -void helper_cvtps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { - d->ZMM_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); - d->ZMM_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status); - d->ZMM_L(2) = x86_float32_to_int32(s->ZMM_S(2), &env->sse_status); - d->ZMM_L(3) = x86_float32_to_int32(s->ZMM_S(3), &env->sse_status); + int i; + for (i = 0; i < 2 << SHIFT; i++) { + d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status); + } } -void helper_cvtpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { - d->ZMM_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status); - d->ZMM_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status); - d->ZMM_Q(1) = 0; + int i; + for (i = 0; i < 1 << SHIFT; i++) { + d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status); + } + for (i >>= 1; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } } +#if SHIFT == 1 void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) { d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status); @@ -808,23 +809,31 @@ int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s) return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status); } #endif +#endif /* float to integer truncated */ -void helper_cvttps2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { - d->ZMM_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); - d->ZMM_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status); - d->ZMM_L(2) = x86_float32_to_int32_round_to_zero(s->ZMM_S(2), &env->sse_status); - d->ZMM_L(3) = 
x86_float32_to_int32_round_to_zero(s->ZMM_S(3), &env->sse_status); + int i; + for (i = 0; i < 2 << SHIFT; i++) { + d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i), + &env->sse_status); + } } -void helper_cvttpd2dq(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { - d->ZMM_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status); - d->ZMM_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status); - d->ZMM_Q(1) = 0; + int i; + for (i = 0; i < 1 << SHIFT; i++) { + d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i), + &env->sse_status); + } + for (i >>= 1; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } } +#if SHIFT == 1 void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s) { d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status); @@ -858,25 +867,21 @@ int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s) return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status); } #endif +#endif -void helper_rsqrtps(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); - d->ZMM_S(0) = float32_div(float32_one, - float32_sqrt(s->ZMM_S(0), &env->sse_status), - &env->sse_status); - d->ZMM_S(1) = float32_div(float32_one, - float32_sqrt(s->ZMM_S(1), &env->sse_status), - &env->sse_status); - d->ZMM_S(2) = float32_div(float32_one, - float32_sqrt(s->ZMM_S(2), &env->sse_status), - &env->sse_status); - d->ZMM_S(3) = float32_div(float32_one, - float32_sqrt(s->ZMM_S(3), &env->sse_status), - &env->sse_status); + int i; + for (i = 0; i < 2 << SHIFT; i++) { + d->ZMM_S(i) = float32_div(float32_one, + float32_sqrt(s->ZMM_S(i), &env->sse_status), + &env->sse_status); + } set_float_exception_flags(old_flags, &env->sse_status); } +#if SHIFT == 1 void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); @@ -885,24 +890,28 @@ void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s) &env->sse_status); set_float_exception_flags(old_flags, &env->sse_status); } +#endif -void helper_rcpps(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); - d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status); - d->ZMM_S(1) = float32_div(float32_one, s->ZMM_S(1), &env->sse_status); - d->ZMM_S(2) = float32_div(float32_one, s->ZMM_S(2), &env->sse_status); - d->ZMM_S(3) = float32_div(float32_one, s->ZMM_S(3), &env->sse_status); + int i; + for (i = 0; i < 2 << SHIFT; i++) { + d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status); + } set_float_exception_flags(old_flags, &env->sse_status); } +#if SHIFT == 1 void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status); set_float_exception_flags(old_flags, &env->sse_status); } +#endif +#if SHIFT == 1 static inline uint64_t helper_extrq(uint64_t src, int shift, int len) { uint64_t mask; @@ -946,113 +955,134 @@ void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length) { d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length); } +#endif -void helper_haddps(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_S(0) = 
float32_add(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); - r.ZMM_S(1) = float32_add(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); - r.ZMM_S(2) = float32_add(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); - r.ZMM_S(3) = float32_add(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); - MOVE(*d, r); -} - -void helper_haddpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_D(0) = float64_add(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); - r.ZMM_D(1) = float64_add(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); - MOVE(*d, r); -} - -void helper_hsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_S(0) = float32_sub(d->ZMM_S(0), d->ZMM_S(1), &env->sse_status); - r.ZMM_S(1) = float32_sub(d->ZMM_S(2), d->ZMM_S(3), &env->sse_status); - r.ZMM_S(2) = float32_sub(s->ZMM_S(0), s->ZMM_S(1), &env->sse_status); - r.ZMM_S(3) = float32_sub(s->ZMM_S(2), s->ZMM_S(3), &env->sse_status); - MOVE(*d, r); -} - -void helper_hsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - ZMMReg r; - - r.ZMM_D(0) = float64_sub(d->ZMM_D(0), d->ZMM_D(1), &env->sse_status); - r.ZMM_D(1) = float64_sub(s->ZMM_D(0), s->ZMM_D(1), &env->sse_status); - MOVE(*d, r); -} - -void helper_addsubps(CPUX86State *env, ZMMReg *d, ZMMReg *s) -{ - d->ZMM_S(0) = float32_sub(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status); - d->ZMM_S(1) = float32_add(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status); - d->ZMM_S(2) = float32_sub(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status); - d->ZMM_S(3) = float32_add(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status); +#define SSE_HELPER_HPS(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + float32 r[2 << SHIFT]; \ + int i, j, k; \ + for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ + for (i = j = 0; j < 4; i++, j += 2) { \ + r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \ + } \ + for (j = 0; j < 4; i++, j += 2) { \ + r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \ + } \ + } \ + for (i = 0; i < 2 << SHIFT; i++) { \ + d->ZMM_S(i) = r[i]; \ + } \ +} + +SSE_HELPER_HPS(haddps, float32_add) +SSE_HELPER_HPS(hsubps, float32_sub) + +#define SSE_HELPER_HPD(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + float64 r[1 << SHIFT]; \ + int i, j, k; \ + for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \ + for (i = j = 0; j < 2; i++, j += 2) { \ + r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \ + } \ + for (j = 0; j < 2; i++, j += 2) { \ + r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \ + } \ + } \ + for (i = 0; i < 1 << SHIFT; i++) { \ + d->ZMM_D(i) = r[i]; \ + } \ +} + +SSE_HELPER_HPD(haddpd, float64_add) +SSE_HELPER_HPD(hsubpd, float64_sub) + +void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + Reg *v = d; + int i; + for (i = 0; i < 2 << SHIFT; i += 2) { + d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); + d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status); + } } -void helper_addsubpd(CPUX86State *env, ZMMReg *d, ZMMReg *s) +void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - d->ZMM_D(0) = float64_sub(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status); - d->ZMM_D(1) = float64_add(d->ZMM_D(1), s->ZMM_D(1), &env->sse_status); + Reg *v = d; + int i; + for (i = 0; i < 1 << SHIFT; i += 2) { + d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status); + d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status); + 
} } -/* XXX: unordered */ -#define SSE_HELPER_CMP(name, F) \ - void helper_ ## name ## ps(CPUX86State *env, Reg *d, Reg *s) \ - { \ - d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ - d->ZMM_L(1) = F(32, d->ZMM_S(1), s->ZMM_S(1)); \ - d->ZMM_L(2) = F(32, d->ZMM_S(2), s->ZMM_S(2)); \ - d->ZMM_L(3) = F(32, d->ZMM_S(3), s->ZMM_S(3)); \ - } \ - \ - void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ +#define SSE_HELPER_CMP_P(name, F, C) \ + void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *s) \ { \ - d->ZMM_L(0) = F(32, d->ZMM_S(0), s->ZMM_S(0)); \ - } \ - \ - void helper_ ## name ## pd(CPUX86State *env, Reg *d, Reg *s) \ - { \ - d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ - d->ZMM_Q(1) = F(64, d->ZMM_D(1), s->ZMM_D(1)); \ + Reg *v = d; \ + int i; \ + for (i = 0; i < 2 << SHIFT; i++) { \ + d->ZMM_L(i) = C(F(32, v->ZMM_S(i), s->ZMM_S(i))) ? -1 : 0; \ + } \ } \ \ - void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ + void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *s) \ { \ - d->ZMM_Q(0) = F(64, d->ZMM_D(0), s->ZMM_D(0)); \ - } - -#define FPU_CMPEQ(size, a, b) \ - (float ## size ## _eq_quiet(a, b, &env->sse_status) ? -1 : 0) -#define FPU_CMPLT(size, a, b) \ - (float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0) -#define FPU_CMPLE(size, a, b) \ - (float ## size ## _le(a, b, &env->sse_status) ? -1 : 0) -#define FPU_CMPUNORD(size, a, b) \ - (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? -1 : 0) -#define FPU_CMPNEQ(size, a, b) \ - (float ## size ## _eq_quiet(a, b, &env->sse_status) ? 0 : -1) -#define FPU_CMPNLT(size, a, b) \ - (float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1) -#define FPU_CMPNLE(size, a, b) \ - (float ## size ## _le(a, b, &env->sse_status) ? 0 : -1) -#define FPU_CMPORD(size, a, b) \ - (float ## size ## _unordered_quiet(a, b, &env->sse_status) ? 0 : -1) - -SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) -SSE_HELPER_CMP(cmplt, FPU_CMPLT) -SSE_HELPER_CMP(cmple, FPU_CMPLE) -SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD) -SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ) -SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT) -SSE_HELPER_CMP(cmpnle, FPU_CMPNLE) -SSE_HELPER_CMP(cmpord, FPU_CMPORD) + Reg *v = d; \ + int i; \ + for (i = 0; i < 1 << SHIFT; i++) { \ + d->ZMM_Q(i) = C(F(64, v->ZMM_D(i), s->ZMM_D(i))) ? -1 : 0; \ + } \ + } + +#if SHIFT == 1 +#define SSE_HELPER_CMP(name, F, C) \ + SSE_HELPER_CMP_P(name, F, C) \ + void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \ + { \ + Reg *v = d; \ + d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0; \ + } \ + \ + void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \ + { \ + Reg *v = d; \ + d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? 
-1 : 0; \ + } + +#define FPU_EQ(x) (x == float_relation_equal) +#define FPU_LT(x) (x == float_relation_less) +#define FPU_LE(x) (x <= float_relation_equal) +#define FPU_UNORD(x) (x == float_relation_unordered) + +#define FPU_CMPQ(size, a, b) \ + float ## size ## _compare_quiet(a, b, &env->sse_status) +#define FPU_CMPS(size, a, b) \ + float ## size ## _compare(a, b, &env->sse_status) + +#else +#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C) +#endif +SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ) +SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT) +SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE) +SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD) +SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ) +SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT) +SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE) +SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD) + +#undef SSE_HELPER_CMP + +#if SHIFT == 1 static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s) @@ -1098,25 +1128,30 @@ void helper_comisd(CPUX86State *env, Reg *d, Reg *s) ret = float64_compare(d0, d1, &env->sse_status); CC_SRC = comis_eflags[ret + 1]; } +#endif -uint32_t helper_movmskps(CPUX86State *env, Reg *s) +uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s) { - int b0, b1, b2, b3; + uint32_t mask; + int i; - b0 = s->ZMM_L(0) >> 31; - b1 = s->ZMM_L(1) >> 31; - b2 = s->ZMM_L(2) >> 31; - b3 = s->ZMM_L(3) >> 31; - return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3); + mask = 0; + for (i = 0; i < 2 << SHIFT; i++) { + mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i); + } + return mask; } -uint32_t helper_movmskpd(CPUX86State *env, Reg *s) +uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s) { - int b0, b1; + uint32_t mask; + int i; - b0 = s->ZMM_L(1) >> 31; - b1 = s->ZMM_L(3) >> 31; - return b0 | (b1 << 1); + mask = 0; + for (i = 0; i < 1 << SHIFT; i++) { + mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i); + } + return mask; } #endif @@ -1124,179 +1159,150 @@ uint32_t helper_movmskpd(CPUX86State *env, Reg *s) uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s) { uint32_t val; + int i; val = 0; - val |= (s->B(0) >> 7); - val |= (s->B(1) >> 6) & 0x02; - val |= (s->B(2) >> 5) & 0x04; - val |= (s->B(3) >> 4) & 0x08; - val |= (s->B(4) >> 3) & 0x10; - val |= (s->B(5) >> 2) & 0x20; - val |= (s->B(6) >> 1) & 0x40; - val |= (s->B(7)) & 0x80; -#if SHIFT == 1 - val |= (s->B(8) << 1) & 0x0100; - val |= (s->B(9) << 2) & 0x0200; - val |= (s->B(10) << 3) & 0x0400; - val |= (s->B(11) << 4) & 0x0800; - val |= (s->B(12) << 5) & 0x1000; - val |= (s->B(13) << 6) & 0x2000; - val |= (s->B(14) << 7) & 0x4000; - val |= (s->B(15) << 8) & 0x8000; -#endif + for (i = 0; i < (1 << SHIFT); i++) { + uint8_t byte = 0; + byte |= (s->B(8 * i + 0) >> 7); + byte |= (s->B(8 * i + 1) >> 6) & 0x02; + byte |= (s->B(8 * i + 2) >> 5) & 0x04; + byte |= (s->B(8 * i + 3) >> 4) & 0x08; + byte |= (s->B(8 * i + 4) >> 3) & 0x10; + byte |= (s->B(8 * i + 5) >> 2) & 0x20; + byte |= (s->B(8 * i + 6) >> 1) & 0x40; + byte |= (s->B(8 * i + 7)) & 0x80; + val |= byte << (8 * i); + } return val; } -void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; - - r.B(0) = satsb((int16_t)d->W(0)); - r.B(1) = satsb((int16_t)d->W(1)); - r.B(2) = satsb((int16_t)d->W(2)); - r.B(3) = satsb((int16_t)d->W(3)); -#if SHIFT == 1 - r.B(4) = satsb((int16_t)d->W(4)); - r.B(5) = satsb((int16_t)d->W(5)); - r.B(6) = satsb((int16_t)d->W(6)); - r.B(7) = satsb((int16_t)d->W(7)); -#endif - r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0)); - r.B((4 << 
SHIFT) + 1) = satsb((int16_t)s->W(1)); - r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2)); - r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3)); -#if SHIFT == 1 - r.B(12) = satsb((int16_t)s->W(4)); - r.B(13) = satsb((int16_t)s->W(5)); - r.B(14) = satsb((int16_t)s->W(6)); - r.B(15) = satsb((int16_t)s->W(7)); -#endif - MOVE(*d, r); -} - -void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; - - r.B(0) = satub((int16_t)d->W(0)); - r.B(1) = satub((int16_t)d->W(1)); - r.B(2) = satub((int16_t)d->W(2)); - r.B(3) = satub((int16_t)d->W(3)); -#if SHIFT == 1 - r.B(4) = satub((int16_t)d->W(4)); - r.B(5) = satub((int16_t)d->W(5)); - r.B(6) = satub((int16_t)d->W(6)); - r.B(7) = satub((int16_t)d->W(7)); -#endif - r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0)); - r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1)); - r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2)); - r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3)); -#if SHIFT == 1 - r.B(12) = satub((int16_t)s->W(4)); - r.B(13) = satub((int16_t)s->W(5)); - r.B(14) = satub((int16_t)s->W(6)); - r.B(15) = satub((int16_t)s->W(7)); -#endif - MOVE(*d, r); -} +#define PACK_HELPER_B(name, F) \ +void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \ + Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + uint8_t r[PACK_WIDTH * 2]; \ + int j, k; \ + for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \ + for (k = 0; k < PACK_WIDTH; k++) { \ + r[k] = F((int16_t)v->W(j + k)); \ + } \ + for (k = 0; k < PACK_WIDTH; k++) { \ + r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \ + } \ + for (k = 0; k < PACK_WIDTH * 2; k++) { \ + d->B(2 * j + k) = r[k]; \ + } \ + } \ +} + +PACK_HELPER_B(sswb, satsb) +PACK_HELPER_B(uswb, satub) void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - Reg r; + Reg *v = d; + uint16_t r[PACK_WIDTH]; + int j, k; - r.W(0) = satsw(d->L(0)); - r.W(1) = satsw(d->L(1)); -#if SHIFT == 1 - r.W(2) = satsw(d->L(2)); - r.W(3) = satsw(d->L(3)); -#endif - r.W((2 << SHIFT) + 0) = satsw(s->L(0)); - r.W((2 << SHIFT) + 1) = satsw(s->L(1)); -#if SHIFT == 1 - r.W(6) = satsw(s->L(2)); - r.W(7) = satsw(s->L(3)); -#endif - MOVE(*d, r); + for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) { + for (k = 0; k < PACK_WIDTH / 2; k++) { + r[k] = satsw(v->L(j + k)); + } + for (k = 0; k < PACK_WIDTH / 2; k++) { + r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k)); + } + for (k = 0; k < PACK_WIDTH; k++) { + d->W(2 * j + k) = r[k]; + } + } } #define UNPCK_OP(base_name, base) \ \ void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *s) \ { \ - Reg r; \ + Reg *v = d; \ + uint8_t r[PACK_WIDTH * 2]; \ + int j, i; \ \ - r.B(0) = d->B((base << (SHIFT + 2)) + 0); \ - r.B(1) = s->B((base << (SHIFT + 2)) + 0); \ - r.B(2) = d->B((base << (SHIFT + 2)) + 1); \ - r.B(3) = s->B((base << (SHIFT + 2)) + 1); \ - r.B(4) = d->B((base << (SHIFT + 2)) + 2); \ - r.B(5) = s->B((base << (SHIFT + 2)) + 2); \ - r.B(6) = d->B((base << (SHIFT + 2)) + 3); \ - r.B(7) = s->B((base << (SHIFT + 2)) + 3); \ - XMM_ONLY( \ - r.B(8) = d->B((base << (SHIFT + 2)) + 4); \ - r.B(9) = s->B((base << (SHIFT + 2)) + 4); \ - r.B(10) = d->B((base << (SHIFT + 2)) + 5); \ - r.B(11) = s->B((base << (SHIFT + 2)) + 5); \ - r.B(12) = d->B((base << (SHIFT + 2)) + 6); \ - r.B(13) = s->B((base << (SHIFT + 2)) + 6); \ - r.B(14) = d->B((base << (SHIFT + 2)) + 7); \ - r.B(15) = s->B((base << (SHIFT + 2)) + 7); \ - ) \ - MOVE(*d, r); \ + for (j = 0; j < 8 << SHIFT; ) { \ + int k = j + base * PACK_WIDTH; \ + for (i = 0; i < PACK_WIDTH; i++) { \ + r[2 * i] = v->B(k + i); \ + r[2 
* i + 1] = s->B(k + i); \ + } \ + for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \ + d->B(j) = r[i]; \ + } \ + } \ } \ \ void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *s) \ { \ - Reg r; \ + Reg *v = d; \ + uint16_t r[PACK_WIDTH]; \ + int j, i; \ \ - r.W(0) = d->W((base << (SHIFT + 1)) + 0); \ - r.W(1) = s->W((base << (SHIFT + 1)) + 0); \ - r.W(2) = d->W((base << (SHIFT + 1)) + 1); \ - r.W(3) = s->W((base << (SHIFT + 1)) + 1); \ - XMM_ONLY( \ - r.W(4) = d->W((base << (SHIFT + 1)) + 2); \ - r.W(5) = s->W((base << (SHIFT + 1)) + 2); \ - r.W(6) = d->W((base << (SHIFT + 1)) + 3); \ - r.W(7) = s->W((base << (SHIFT + 1)) + 3); \ - ) \ - MOVE(*d, r); \ + for (j = 0; j < 4 << SHIFT; ) { \ + int k = j + base * PACK_WIDTH / 2; \ + for (i = 0; i < PACK_WIDTH / 2; i++) { \ + r[2 * i] = v->W(k + i); \ + r[2 * i + 1] = s->W(k + i); \ + } \ + for (i = 0; i < PACK_WIDTH; i++, j++) { \ + d->W(j) = r[i]; \ + } \ + } \ } \ \ void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\ - Reg *d, Reg *s) \ + Reg *d, Reg *s) \ { \ - Reg r; \ + Reg *v = d; \ + uint32_t r[PACK_WIDTH / 2]; \ + int j, i; \ \ - r.L(0) = d->L((base << SHIFT) + 0); \ - r.L(1) = s->L((base << SHIFT) + 0); \ - XMM_ONLY( \ - r.L(2) = d->L((base << SHIFT) + 1); \ - r.L(3) = s->L((base << SHIFT) + 1); \ - ) \ - MOVE(*d, r); \ + for (j = 0; j < 2 << SHIFT; ) { \ + int k = j + base * PACK_WIDTH / 4; \ + for (i = 0; i < PACK_WIDTH / 4; i++) { \ + r[2 * i] = v->L(k + i); \ + r[2 * i + 1] = s->L(k + i); \ + } \ + for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \ + d->L(j) = r[i]; \ + } \ + } \ } \ \ XMM_ONLY( \ - void glue(helper_punpck ## base_name ## qdq, SUFFIX)(CPUX86State \ - *env, \ - Reg *d, \ - Reg *s) \ + void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \ + CPUX86State *env, Reg *d, Reg *s) \ { \ - Reg r; \ + Reg *v = d; \ + uint64_t r[2]; \ + int i; \ \ - r.Q(0) = d->Q(base); \ - r.Q(1) = s->Q(base); \ - MOVE(*d, r); \ + for (i = 0; i < 1 << SHIFT; i += 2) { \ + r[0] = v->Q(base + i); \ + r[1] = s->Q(base + i); \ + d->Q(i) = r[0]; \ + d->Q(i + 1) = r[1]; \ + } \ } \ ) UNPCK_OP(l, 0) UNPCK_OP(h, 1) +#undef PACK_WIDTH +#undef PACK_HELPER_B +#undef UNPCK_OP + + /* 3DNow! 
float ops */ #if SHIFT == 0 void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s) @@ -1327,11 +1333,11 @@ void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s) void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s) { - MMXReg r; + float32 r; - r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); - r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); - MOVE(*d, r); + r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); + d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); + d->MMX_S(0) = r; } void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s) @@ -1392,20 +1398,20 @@ void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s) void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s) { - MMXReg r; + float32 r; - r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); - r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); - MOVE(*d, r); + r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); + d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); + d->MMX_S(0) = r; } void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s) { - MMXReg r; + float32 r; - r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); - r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); - MOVE(*d, r); + r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); + d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); + d->MMX_S(0) = r; } void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s) @@ -1438,132 +1444,105 @@ void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s) void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s) { - MMXReg r; + uint32_t r; - r.MMX_L(0) = s->MMX_L(1); - r.MMX_L(1) = s->MMX_L(0); - MOVE(*d, r); + r = s->MMX_L(0); + d->MMX_L(0) = s->MMX_L(1); + d->MMX_L(1) = r; } #endif /* SSSE3 op helpers */ void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { + Reg *v = d; int i; - Reg r; +#if SHIFT == 0 + uint8_t r[8]; - for (i = 0; i < (8 << SHIFT); i++) { - r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1))); + for (i = 0; i < 8; i++) { + r[i] = (s->B(i) & 0x80) ? 
0 : (v->B(s->B(i) & 7)); } + for (i = 0; i < 8; i++) { + d->B(i) = r[i]; + } +#else + uint8_t r[8 << SHIFT]; - MOVE(*d, r); -} - -void glue(helper_phaddw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - - Reg r; - - r.W(0) = (int16_t)d->W(0) + (int16_t)d->W(1); - r.W(1) = (int16_t)d->W(2) + (int16_t)d->W(3); - XMM_ONLY(r.W(2) = (int16_t)d->W(4) + (int16_t)d->W(5)); - XMM_ONLY(r.W(3) = (int16_t)d->W(6) + (int16_t)d->W(7)); - r.W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1); - r.W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3); - XMM_ONLY(r.W(6) = (int16_t)s->W(4) + (int16_t)s->W(5)); - XMM_ONLY(r.W(7) = (int16_t)s->W(6) + (int16_t)s->W(7)); - - MOVE(*d, r); -} - -void glue(helper_phaddd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; - - r.L(0) = (int32_t)d->L(0) + (int32_t)d->L(1); - XMM_ONLY(r.L(1) = (int32_t)d->L(2) + (int32_t)d->L(3)); - r.L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1); - XMM_ONLY(r.L(3) = (int32_t)s->L(2) + (int32_t)s->L(3)); - - MOVE(*d, r); -} - -void glue(helper_phaddsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - Reg r; - - r.W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1)); - r.W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3)); - XMM_ONLY(r.W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5))); - XMM_ONLY(r.W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7))); - r.W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1)); - r.W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3)); - XMM_ONLY(r.W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5))); - XMM_ONLY(r.W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7))); - - MOVE(*d, r); -} - -void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - d->W(0) = satsw((int8_t)s->B(0) * (uint8_t)d->B(0) + - (int8_t)s->B(1) * (uint8_t)d->B(1)); - d->W(1) = satsw((int8_t)s->B(2) * (uint8_t)d->B(2) + - (int8_t)s->B(3) * (uint8_t)d->B(3)); - d->W(2) = satsw((int8_t)s->B(4) * (uint8_t)d->B(4) + - (int8_t)s->B(5) * (uint8_t)d->B(5)); - d->W(3) = satsw((int8_t)s->B(6) * (uint8_t)d->B(6) + - (int8_t)s->B(7) * (uint8_t)d->B(7)); -#if SHIFT == 1 - d->W(4) = satsw((int8_t)s->B(8) * (uint8_t)d->B(8) + - (int8_t)s->B(9) * (uint8_t)d->B(9)); - d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) + - (int8_t)s->B(11) * (uint8_t)d->B(11)); - d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) + - (int8_t)s->B(13) * (uint8_t)d->B(13)); - d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) + - (int8_t)s->B(15) * (uint8_t)d->B(15)); + for (i = 0; i < 8 << SHIFT; i++) { + int j = i & ~0xf; + r[i] = (s->B(i) & 0x80) ? 
0 : v->B(j | (s->B(i) & 0xf)); + } + for (i = 0; i < 8 << SHIFT; i++) { + d->B(i) = r[i]; + } #endif } -void glue(helper_phsubw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1); - d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3); - XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5)); - XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7)); - d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1); - d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3); - XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5)); - XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7)); -} +#define SSE_HELPER_HW(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + uint16_t r[4 << SHIFT]; \ + int i, j, k; \ + for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \ + for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \ + r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \ + } \ + for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \ + r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \ + } \ + } \ + for (i = 0; i < 4 << SHIFT; i++) { \ + d->W(i) = r[i]; \ + } \ +} + +#define SSE_HELPER_HL(name, F) \ +void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ +{ \ + Reg *v = d; \ + uint32_t r[2 << SHIFT]; \ + int i, j, k; \ + for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \ + for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \ + r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \ + } \ + for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \ + r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \ + } \ + } \ + for (i = 0; i < 2 << SHIFT; i++) { \ + d->L(i) = r[i]; \ + } \ +} + +SSE_HELPER_HW(phaddw, FADD) +SSE_HELPER_HW(phsubw, FSUB) +SSE_HELPER_HW(phaddsw, FADDSW) +SSE_HELPER_HW(phsubsw, FSUBSW) +SSE_HELPER_HL(phaddd, FADD) +SSE_HELPER_HL(phsubd, FSUB) + +#undef SSE_HELPER_HW +#undef SSE_HELPER_HL -void glue(helper_phsubd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) -{ - d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1); - XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3)); - d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1); - XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3)); -} - -void glue(helper_phsubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1)); - d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3)); - XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5))); - XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7))); - d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1)); - d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3)); - XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5))); - XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7))); + Reg *v = d; + int i; + for (i = 0; i < 4 << SHIFT; i++) { + d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) + + (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1)); + } } -#define FABSB(_, x) (x > INT8_MAX ? -(int8_t)x : x) -#define FABSW(_, x) (x > INT16_MAX ? -(int16_t)x : x) -#define FABSL(_, x) (x > INT32_MAX ? -(int32_t)x : x) -SSE_HELPER_B(helper_pabsb, FABSB) -SSE_HELPER_W(helper_pabsw, FABSW) -SSE_HELPER_L(helper_pabsd, FABSL) +#define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x) +#define FABSW(x) (x > INT16_MAX ? -(int16_t)x : x) +#define FABSL(x) (x > INT32_MAX ? 
-(int32_t)x : x) +SSE_HELPER_1(helper_pabsb, B, 8 << SHIFT, FABSB) +SSE_HELPER_1(helper_pabsw, W, 4 << SHIFT, FABSW) +SSE_HELPER_1(helper_pabsd, L, 2 << SHIFT, FABSL) #define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15) SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) @@ -1578,147 +1557,117 @@ SSE_HELPER_L(helper_psignd, FSIGNL) void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, int32_t shift) { - Reg r; + Reg *v = d; + int i; /* XXX could be checked during translation */ - if (shift >= (16 << SHIFT)) { - r.Q(0) = 0; - XMM_ONLY(r.Q(1) = 0); + if (shift >= (SHIFT ? 32 : 16)) { + for (i = 0; i < (1 << SHIFT); i++) { + d->Q(i) = 0; + } } else { shift <<= 3; #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0) #if SHIFT == 0 - r.Q(0) = SHR(s->Q(0), shift - 0) | - SHR(d->Q(0), shift - 64); + d->Q(0) = SHR(s->Q(0), shift - 0) | + SHR(v->Q(0), shift - 64); #else - r.Q(0) = SHR(s->Q(0), shift - 0) | - SHR(s->Q(1), shift - 64) | - SHR(d->Q(0), shift - 128) | - SHR(d->Q(1), shift - 192); - r.Q(1) = SHR(s->Q(0), shift + 64) | - SHR(s->Q(1), shift - 0) | - SHR(d->Q(0), shift - 64) | - SHR(d->Q(1), shift - 128); + for (i = 0; i < (1 << SHIFT); i += 2) { + uint64_t r0, r1; + + r0 = SHR(s->Q(i), shift - 0) | + SHR(s->Q(i + 1), shift - 64) | + SHR(v->Q(i), shift - 128) | + SHR(v->Q(i + 1), shift - 192); + r1 = SHR(s->Q(i), shift + 64) | + SHR(s->Q(i + 1), shift - 0) | + SHR(v->Q(i), shift - 64) | + SHR(v->Q(i + 1), shift - 128); + d->Q(i) = r0; + d->Q(i + 1) = r1; + } #endif #undef SHR } - - MOVE(*d, r); } -#define XMM0 (env->xmm_regs[0]) +#if SHIFT >= 1 -#if SHIFT == 1 #define SSE_HELPER_V(name, elem, num, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ { \ - d->elem(0) = F(d->elem(0), s->elem(0), XMM0.elem(0)); \ - d->elem(1) = F(d->elem(1), s->elem(1), XMM0.elem(1)); \ - if (num > 2) { \ - d->elem(2) = F(d->elem(2), s->elem(2), XMM0.elem(2)); \ - d->elem(3) = F(d->elem(3), s->elem(3), XMM0.elem(3)); \ - if (num > 4) { \ - d->elem(4) = F(d->elem(4), s->elem(4), XMM0.elem(4)); \ - d->elem(5) = F(d->elem(5), s->elem(5), XMM0.elem(5)); \ - d->elem(6) = F(d->elem(6), s->elem(6), XMM0.elem(6)); \ - d->elem(7) = F(d->elem(7), s->elem(7), XMM0.elem(7)); \ - if (num > 8) { \ - d->elem(8) = F(d->elem(8), s->elem(8), XMM0.elem(8)); \ - d->elem(9) = F(d->elem(9), s->elem(9), XMM0.elem(9)); \ - d->elem(10) = F(d->elem(10), s->elem(10), XMM0.elem(10)); \ - d->elem(11) = F(d->elem(11), s->elem(11), XMM0.elem(11)); \ - d->elem(12) = F(d->elem(12), s->elem(12), XMM0.elem(12)); \ - d->elem(13) = F(d->elem(13), s->elem(13), XMM0.elem(13)); \ - d->elem(14) = F(d->elem(14), s->elem(14), XMM0.elem(14)); \ - d->elem(15) = F(d->elem(15), s->elem(15), XMM0.elem(15)); \ - } \ - } \ + Reg *v = d; \ + Reg *m = &env->xmm_regs[0]; \ + int i; \ + for (i = 0; i < num; i++) { \ + d->elem(i) = F(v->elem(i), s->elem(i), m->elem(i)); \ } \ } #define SSE_HELPER_I(name, elem, num, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t imm) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, \ + uint32_t imm) \ { \ - d->elem(0) = F(d->elem(0), s->elem(0), ((imm >> 0) & 1)); \ - d->elem(1) = F(d->elem(1), s->elem(1), ((imm >> 1) & 1)); \ - if (num > 2) { \ - d->elem(2) = F(d->elem(2), s->elem(2), ((imm >> 2) & 1)); \ - d->elem(3) = F(d->elem(3), s->elem(3), ((imm >> 3) & 1)); \ - if (num > 4) { \ - d->elem(4) = F(d->elem(4), s->elem(4), ((imm >> 4) & 1)); \ - d->elem(5) = F(d->elem(5), 
s->elem(5), ((imm >> 5) & 1)); \ - d->elem(6) = F(d->elem(6), s->elem(6), ((imm >> 6) & 1)); \ - d->elem(7) = F(d->elem(7), s->elem(7), ((imm >> 7) & 1)); \ - if (num > 8) { \ - d->elem(8) = F(d->elem(8), s->elem(8), ((imm >> 8) & 1)); \ - d->elem(9) = F(d->elem(9), s->elem(9), ((imm >> 9) & 1)); \ - d->elem(10) = F(d->elem(10), s->elem(10), \ - ((imm >> 10) & 1)); \ - d->elem(11) = F(d->elem(11), s->elem(11), \ - ((imm >> 11) & 1)); \ - d->elem(12) = F(d->elem(12), s->elem(12), \ - ((imm >> 12) & 1)); \ - d->elem(13) = F(d->elem(13), s->elem(13), \ - ((imm >> 13) & 1)); \ - d->elem(14) = F(d->elem(14), s->elem(14), \ - ((imm >> 14) & 1)); \ - d->elem(15) = F(d->elem(15), s->elem(15), \ - ((imm >> 15) & 1)); \ - } \ - } \ + Reg *v = d; \ + int i; \ + for (i = 0; i < num; i++) { \ + int j = i & 7; \ + d->elem(i) = F(v->elem(i), s->elem(i), (imm >> j) & 1); \ } \ } /* SSE4.1 op helpers */ -#define FBLENDVB(d, s, m) ((m & 0x80) ? s : d) -#define FBLENDVPS(d, s, m) ((m & 0x80000000) ? s : d) -#define FBLENDVPD(d, s, m) ((m & 0x8000000000000000LL) ? s : d) -SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB) -SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS) -SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD) +#define FBLENDVB(v, s, m) ((m & 0x80) ? s : v) +#define FBLENDVPS(v, s, m) ((m & 0x80000000) ? s : v) +#define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v) +SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB) +SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS) +SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD) void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); - uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); + uint64_t zf = 0, cf = 0; + int i; + for (i = 0; i < 1 << SHIFT; i++) { + zf |= (s->Q(i) & d->Q(i)); + cf |= (s->Q(i) & ~d->Q(i)); + } CC_SRC = (zf ? 0 : CC_Z) | (cf ? 
0 : CC_C); } -#define SSE_HELPER_F(name, elem, num, F) \ - void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ - { \ - if (num > 2) { \ - if (num > 4) { \ - d->elem(7) = F(7); \ - d->elem(6) = F(6); \ - d->elem(5) = F(5); \ - d->elem(4) = F(4); \ - } \ - d->elem(3) = F(3); \ - d->elem(2) = F(2); \ - } \ - d->elem(1) = F(1); \ - d->elem(0) = F(0); \ - } - -SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B) -SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B) -SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B) -SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W) -SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W) -SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L) -SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B) -SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B) -SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B) -SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W) -SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W) -SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L) +#define SSE_HELPER_F(name, elem, num, F) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \ + { \ + int n = num; \ + for (int i = n; --i >= 0; ) { \ + d->elem(i) = F(i); \ + } \ + } + +#if SHIFT > 0 +SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B) +SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B) +SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B) +SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W) +SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W) +SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L) +SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B) +SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B) +SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B) +SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W) +SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W) +SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L) +#endif void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - d->Q(0) = (int64_t)(int32_t) d->L(0) * (int32_t) s->L(0); - d->Q(1) = (int64_t)(int32_t) d->L(2) * (int32_t) s->L(2); + Reg *v = d; + int i; + + for (i = 0; i < 1 << SHIFT; i++) { + d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i); + } } #define FCMPEQQ(d, s) (d == s ? 
-1 : 0) @@ -1726,17 +1675,23 @@ SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { - Reg r; - - r.W(0) = satuw((int32_t) d->L(0)); - r.W(1) = satuw((int32_t) d->L(1)); - r.W(2) = satuw((int32_t) d->L(2)); - r.W(3) = satuw((int32_t) d->L(3)); - r.W(4) = satuw((int32_t) s->L(0)); - r.W(5) = satuw((int32_t) s->L(1)); - r.W(6) = satuw((int32_t) s->L(2)); - r.W(7) = satuw((int32_t) s->L(3)); - MOVE(*d, r); + Reg *v = d; + uint16_t r[8]; + int i, j, k; + + for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) { + r[0] = satuw(v->L(j)); + r[1] = satuw(v->L(j + 1)); + r[2] = satuw(v->L(j + 2)); + r[3] = satuw(v->L(j + 3)); + r[4] = satuw(s->L(j)); + r[5] = satuw(s->L(j + 1)); + r[6] = satuw(s->L(j + 2)); + r[7] = satuw(s->L(j + 3)); + for (k = 0; k < 8; k++) { + d->W(i + k) = r[k]; + } + } } #define FMINSB(d, s) MIN((int8_t)d, (int8_t)s) @@ -1755,6 +1710,7 @@ SSE_HELPER_L(helper_pmaxud, MAX) #define FMULLD(d, s) ((int32_t)d * (int32_t)s) SSE_HELPER_L(helper_pmulld, FMULLD) +#if SHIFT == 1 void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int idx = 0; @@ -1786,12 +1742,14 @@ void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) d->L(1) = 0; d->Q(1) = 0; } +#endif void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mode) { uint8_t old_flags = get_float_exception_flags(&env->sse_status); signed char prev_rounding_mode; + int i; prev_rounding_mode = env->sse_status.float_rounding_mode; if (!(mode & (1 << 2))) { @@ -1811,10 +1769,9 @@ void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } } - d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status); - d->ZMM_S(1) = float32_round_to_int(s->ZMM_S(1), &env->sse_status); - d->ZMM_S(2) = float32_round_to_int(s->ZMM_S(2), &env->sse_status); - d->ZMM_S(3) = float32_round_to_int(s->ZMM_S(3), &env->sse_status); + for (i = 0; i < 2 << SHIFT; i++) { + d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status); + } if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { set_float_exception_flags(get_float_exception_flags(&env->sse_status) & @@ -1829,6 +1786,7 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, { uint8_t old_flags = get_float_exception_flags(&env->sse_status); signed char prev_rounding_mode; + int i; prev_rounding_mode = env->sse_status.float_rounding_mode; if (!(mode & (1 << 2))) { @@ -1848,8 +1806,9 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } } - d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status); - d->ZMM_D(1) = float64_round_to_int(s->ZMM_D(1), &env->sse_status); + for (i = 0; i < 1 << SHIFT; i++) { + d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status); + } if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) { set_float_exception_flags(get_float_exception_flags(&env->sse_status) & @@ -1859,6 +1818,7 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, env->sse_status.float_rounding_mode = prev_rounding_mode; } +#if SHIFT == 1 void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mode) { @@ -1926,89 +1886,109 @@ void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, } env->sse_status.float_rounding_mode = prev_rounding_mode; } +#endif -#define FBLENDP(d, s, m) (m ? s : d) -SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) -SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) -SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) +#define FBLENDP(v, s, m) (m ? 
s : v) +SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP) +SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP) +SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP) -void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) +void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, + uint32_t mask) { - float32 iresult = float32_zero; + Reg *v = d; + float32 prod1, prod2, temp2, temp3, temp4; + int i; - if (mask & (1 << 4)) { - iresult = float32_add(iresult, - float32_mul(d->ZMM_S(0), s->ZMM_S(0), - &env->sse_status), - &env->sse_status); - } - if (mask & (1 << 5)) { - iresult = float32_add(iresult, - float32_mul(d->ZMM_S(1), s->ZMM_S(1), - &env->sse_status), - &env->sse_status); - } - if (mask & (1 << 6)) { - iresult = float32_add(iresult, - float32_mul(d->ZMM_S(2), s->ZMM_S(2), - &env->sse_status), - &env->sse_status); - } - if (mask & (1 << 7)) { - iresult = float32_add(iresult, - float32_mul(d->ZMM_S(3), s->ZMM_S(3), - &env->sse_status), - &env->sse_status); + for (i = 0; i < 2 << SHIFT; i += 4) { + /* + * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D + * to correctly round the intermediate results + */ + if (mask & (1 << 4)) { + prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status); + } else { + prod1 = float32_zero; + } + if (mask & (1 << 5)) { + prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status); + } else { + prod2 = float32_zero; + } + temp2 = float32_add(prod1, prod2, &env->sse_status); + if (mask & (1 << 6)) { + prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status); + } else { + prod1 = float32_zero; + } + if (mask & (1 << 7)) { + prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status); + } else { + prod2 = float32_zero; + } + temp3 = float32_add(prod1, prod2, &env->sse_status); + temp4 = float32_add(temp2, temp3, &env->sse_status); + + d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero; + d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero; + d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero; + d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero; } - d->ZMM_S(0) = (mask & (1 << 0)) ? iresult : float32_zero; - d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero; - d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero; - d->ZMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero; } -void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask) +#if SHIFT == 1 +/* Oddly, there is no ymm version of dppd */ +void glue(helper_dppd, SUFFIX)(CPUX86State *env, + Reg *d, Reg *s, uint32_t mask) { - float64 iresult = float64_zero; + Reg *v = d; + float64 prod1, prod2, temp2; if (mask & (1 << 4)) { - iresult = float64_add(iresult, - float64_mul(d->ZMM_D(0), s->ZMM_D(0), - &env->sse_status), - &env->sse_status); + prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status); + } else { + prod1 = float64_zero; } if (mask & (1 << 5)) { - iresult = float64_add(iresult, - float64_mul(d->ZMM_D(1), s->ZMM_D(1), - &env->sse_status), - &env->sse_status); + prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status); + } else { + prod2 = float64_zero; } - d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero; - d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero; + temp2 = float64_add(prod1, prod2, &env->sse_status); + d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero; + d->ZMM_D(1) = (mask & (1 << 1)) ? 
temp2 : float64_zero; } +#endif void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t offset) { - int s0 = (offset & 3) << 2; - int d0 = (offset & 4) << 0; - int i; - Reg r; - - for (i = 0; i < 8; i++, d0++) { - r.W(i) = 0; - r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); - r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); - r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); - r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); + Reg *v = d; + int i, j; + uint16_t r[8]; + + for (j = 0; j < 4 << SHIFT; ) { + int s0 = (j * 2) + ((offset & 3) << 2); + int d0 = (j * 2) + ((offset & 4) << 0); + for (i = 0; i < LANE_WIDTH / 2; i++, d0++) { + r[i] = 0; + r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0)); + r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1)); + r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2)); + r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3)); + } + for (i = 0; i < LANE_WIDTH / 2; i++, j++) { + d->W(j) = r[i]; + } + offset >>= 3; } - - MOVE(*d, r); } /* SSE4.2 op helpers */ #define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0) SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) +#if SHIFT == 1 static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl) { target_long val, limit; @@ -2229,14 +2209,16 @@ target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) return crc; } -void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, - uint32_t ctrl) +#endif + +#if SHIFT == 1 +static void clmulq(uint64_t *dest_l, uint64_t *dest_h, + uint64_t a, uint64_t b) { - uint64_t ah, al, b, resh, resl; + uint64_t al, ah, resh, resl; ah = 0; - al = d->Q((ctrl & 1) != 0); - b = s->Q((ctrl & 16) != 0); + al = a; resh = resl = 0; while (b) { @@ -2249,8 +2231,23 @@ void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, b >>= 1; } - d->Q(0) = resl; - d->Q(1) = resh; + *dest_l = resl; + *dest_h = resh; +} +#endif + +void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, + uint32_t ctrl) +{ + Reg *v = d; + uint64_t a, b; + int i; + + for (i = 0; i < 1 << SHIFT; i += 2) { + a = v->Q(((ctrl & 1) != 0) + i); + b = s->Q(((ctrl & 16) != 0) + i); + clmulq(&d->Q(i), &d->Q(i + 1), a, b); + } } void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) @@ -2259,11 +2256,12 @@ void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) Reg st = *d; Reg rk = *s; - for (i = 0 ; i < 4 ; i++) { - d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^ - AES_Td1[st.B(AES_ishifts[4*i+1])] ^ - AES_Td2[st.B(AES_ishifts[4*i+2])] ^ - AES_Td3[st.B(AES_ishifts[4*i+3])]); + for (i = 0 ; i < 2 << SHIFT ; i++) { + int j = i & 3; + d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^ + AES_Td1[st.B(AES_ishifts[4 * j + 1])] ^ + AES_Td2[st.B(AES_ishifts[4 * j + 2])] ^ + AES_Td3[st.B(AES_ishifts[4 * j + 3])]); } } @@ -2273,8 +2271,8 @@ void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) Reg st = *d; Reg rk = *s; - for (i = 0; i < 16; i++) { - d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]); + for (i = 0; i < 8 << SHIFT; i++) { + d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]); } } @@ -2284,11 +2282,12 @@ void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) Reg st = *d; Reg rk = *s; - for (i = 0 ; i < 4 ; i++) { - d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^ - AES_Te1[st.B(AES_shifts[4*i+1])] ^ - AES_Te2[st.B(AES_shifts[4*i+2])] ^ - AES_Te3[st.B(AES_shifts[4*i+3])]); + for (i = 0 ; i < 2 << SHIFT ; i++) { + int j = i & 3; + d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^ 
+ AES_Te1[st.B(AES_shifts[4 * j + 1])] ^ + AES_Te2[st.B(AES_shifts[4 * j + 2])] ^ + AES_Te3[st.B(AES_shifts[4 * j + 3])]); } } @@ -2298,22 +2297,22 @@ void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) Reg st = *d; Reg rk = *s; - for (i = 0; i < 16; i++) { - d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]); + for (i = 0; i < 8 << SHIFT; i++) { + d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]); } - } +#if SHIFT == 1 void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) { int i; Reg tmp = *s; for (i = 0 ; i < 4 ; i++) { - d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^ - AES_imc[tmp.B(4*i+1)][1] ^ - AES_imc[tmp.B(4*i+2)][2] ^ - AES_imc[tmp.B(4*i+3)][3]); + d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^ + AES_imc[tmp.B(4 * i + 1)][1] ^ + AES_imc[tmp.B(4 * i + 2)][2] ^ + AES_imc[tmp.B(4 * i + 3)][3]); } } @@ -2331,6 +2330,9 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl; } #endif +#endif + +#undef SSE_HELPER_S #undef SHIFT #undef XMM_ONLY @@ -2340,4 +2342,3 @@ void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, #undef L #undef Q #undef SUFFIX -#undef SIZE diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index cef28f2aae..d99464afb0 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -122,8 +122,8 @@ DEF_HELPER_2(glue(movq_mm_T0, SUFFIX), void, Reg, i64) #if SHIFT == 0 DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int) #else -DEF_HELPER_3(shufps, void, Reg, Reg, int) -DEF_HELPER_3(shufpd, void, Reg, Reg, int) +DEF_HELPER_3(glue(shufps, SUFFIX), void, Reg, Reg, int) +DEF_HELPER_3(glue(shufpd, SUFFIX), void, Reg, Reg, int) DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int) DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int) DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int) @@ -134,9 +134,9 @@ DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int) /* XXX: not accurate */ #define SSE_HELPER_S(name, F) \ - DEF_HELPER_3(name ## ps, void, env, Reg, Reg) \ + DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \ DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## pd, void, env, Reg, Reg) \ + DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) \ DEF_HELPER_3(name ## sd, void, env, Reg, Reg) SSE_HELPER_S(add, FPU_ADD) @@ -148,12 +148,12 @@ SSE_HELPER_S(max, FPU_MAX) SSE_HELPER_S(sqrt, FPU_SQRT) -DEF_HELPER_3(cvtps2pd, void, env, Reg, Reg) -DEF_HELPER_3(cvtpd2ps, void, env, Reg, Reg) +DEF_HELPER_3(glue(cvtps2pd, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(cvtpd2ps, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(cvtss2sd, void, env, Reg, Reg) DEF_HELPER_3(cvtsd2ss, void, env, Reg, Reg) -DEF_HELPER_3(cvtdq2ps, void, env, Reg, Reg) -DEF_HELPER_3(cvtdq2pd, void, env, Reg, Reg) +DEF_HELPER_3(glue(cvtdq2ps, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_3(glue(cvtdq2pd, SUFFIX), void, env, Reg, Reg) DEF_HELPER_3(cvtpi2ps, void, env, ZMMReg, MMXReg) DEF_HELPER_3(cvtpi2pd, void, env, ZMMReg, MMXReg) DEF_HELPER_3(cvtsi2ss, void, env, ZMMReg, i32) @@ -164,8 +164,8 @@ DEF_HELPER_3(cvtsq2ss, void, env, ZMMReg, i64) DEF_HELPER_3(cvtsq2sd, void, env, ZMMReg, i64) #endif -DEF_HELPER_3(cvtps2dq, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(cvtpd2dq, void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(cvtps2dq, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(cvtpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg) DEF_HELPER_3(cvtps2pi, void, env, MMXReg, ZMMReg) 
DEF_HELPER_3(cvtpd2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_2(cvtss2si, s32, env, ZMMReg) @@ -175,8 +175,8 @@ DEF_HELPER_2(cvtss2sq, s64, env, ZMMReg) DEF_HELPER_2(cvtsd2sq, s64, env, ZMMReg) #endif -DEF_HELPER_3(cvttps2dq, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(cvttpd2dq, void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(cvttps2dq, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(cvttpd2dq, SUFFIX), void, env, ZMMReg, ZMMReg) DEF_HELPER_3(cvttps2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_3(cvttpd2pi, void, env, MMXReg, ZMMReg) DEF_HELPER_2(cvttss2si, s32, env, ZMMReg) @@ -186,42 +186,42 @@ DEF_HELPER_2(cvttss2sq, s64, env, ZMMReg) DEF_HELPER_2(cvttsd2sq, s64, env, ZMMReg) #endif -DEF_HELPER_3(rsqrtps, void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(rsqrtps, SUFFIX), void, env, ZMMReg, ZMMReg) DEF_HELPER_3(rsqrtss, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(rcpps, void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(rcpps, SUFFIX), void, env, ZMMReg, ZMMReg) DEF_HELPER_3(rcpss, void, env, ZMMReg, ZMMReg) DEF_HELPER_3(extrq_r, void, env, ZMMReg, ZMMReg) DEF_HELPER_4(extrq_i, void, env, ZMMReg, int, int) DEF_HELPER_3(insertq_r, void, env, ZMMReg, ZMMReg) DEF_HELPER_4(insertq_i, void, env, ZMMReg, int, int) -DEF_HELPER_3(haddps, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(haddpd, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(hsubps, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(hsubpd, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(addsubps, void, env, ZMMReg, ZMMReg) -DEF_HELPER_3(addsubpd, void, env, ZMMReg, ZMMReg) - -#define SSE_HELPER_CMP(name, F) \ - DEF_HELPER_3(name ## ps, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ - DEF_HELPER_3(name ## pd, void, env, Reg, Reg) \ +DEF_HELPER_3(glue(haddps, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(haddpd, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(hsubps, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(hsubpd, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(addsubps, SUFFIX), void, env, ZMMReg, ZMMReg) +DEF_HELPER_3(glue(addsubpd, SUFFIX), void, env, ZMMReg, ZMMReg) + +#define SSE_HELPER_CMP(name, F, C) \ + DEF_HELPER_3(glue(name ## ps, SUFFIX), void, env, Reg, Reg) \ + DEF_HELPER_3(name ## ss, void, env, Reg, Reg) \ + DEF_HELPER_3(glue(name ## pd, SUFFIX), void, env, Reg, Reg) \ DEF_HELPER_3(name ## sd, void, env, Reg, Reg) -SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) -SSE_HELPER_CMP(cmplt, FPU_CMPLT) -SSE_HELPER_CMP(cmple, FPU_CMPLE) -SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD) -SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ) -SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT) -SSE_HELPER_CMP(cmpnle, FPU_CMPNLE) -SSE_HELPER_CMP(cmpord, FPU_CMPORD) +SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ) +SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT) +SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE) +SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD) +SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ) +SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT) +SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE) +SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD) DEF_HELPER_3(ucomiss, void, env, Reg, Reg) DEF_HELPER_3(comiss, void, env, Reg, Reg) DEF_HELPER_3(ucomisd, void, env, Reg, Reg) DEF_HELPER_3(comisd, void, env, Reg, Reg) -DEF_HELPER_2(movmskps, i32, env, Reg) -DEF_HELPER_2(movmskpd, i32, env, Reg) +DEF_HELPER_2(glue(movmskps, SUFFIX), i32, env, Reg) +DEF_HELPER_2(glue(movmskpd, SUFFIX), i32, env, Reg) #endif DEF_HELPER_2(glue(pmovmskb, SUFFIX), i32, env, Reg) diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index b7972f0ff5..fc081e6ad6 100644 --- a/target/i386/tcg/translate.c +++ 
b/target/i386/tcg/translate.c @@ -2777,157 +2777,209 @@ static inline void gen_op_movq_env_0(DisasContext *s, int d_offset) tcg_gen_st_i64(s->tmp1_i64, cpu_env, d_offset); } +#define ZMM_OFFSET(reg) offsetof(CPUX86State, xmm_regs[reg]) + typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg); typedef void (*SSEFunc_0_epi)(TCGv_ptr env, TCGv_ptr reg, TCGv_i32 val); typedef void (*SSEFunc_0_epl)(TCGv_ptr env, TCGv_ptr reg, TCGv_i64 val); typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b); +typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv_ptr reg_c); typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val); typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val); typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv val); -#define SSE_SPECIAL ((void *)1) -#define SSE_DUMMY ((void *)2) +#define SSE_OPF_CMP (1 << 1) /* does not write for first operand */ +#define SSE_OPF_SPECIAL (1 << 3) /* magic */ +#define SSE_OPF_3DNOW (1 << 4) /* 3DNow! instruction */ +#define SSE_OPF_MMX (1 << 5) /* MMX/integer/AVX2 instruction */ +#define SSE_OPF_SCALAR (1 << 6) /* Has SSE scalar variants */ +#define SSE_OPF_SHUF (1 << 9) /* pshufx/shufpx */ + +#define OP(op, flags, a, b, c, d) \ + {flags, {{.op = a}, {.op = b}, {.op = c}, {.op = d} } } + +#define MMX_OP(x) OP(op1, SSE_OPF_MMX, \ + gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm, NULL, NULL) + +#define SSE_FOP(name) OP(op1, SSE_OPF_SCALAR, \ + gen_helper_##name##ps##_xmm, gen_helper_##name##pd##_xmm, \ + gen_helper_##name##ss, gen_helper_##name##sd) +#define SSE_OP(sname, dname, op, flags) OP(op, flags, \ + gen_helper_##sname##_xmm, gen_helper_##dname##_xmm, NULL, NULL) + +typedef union SSEFuncs { + SSEFunc_0_epp op1; + SSEFunc_0_ppi op1i; + SSEFunc_0_eppt op1t; +} SSEFuncs; + +struct SSEOpHelper_table1 { + int flags; + SSEFuncs fn[4]; +}; -#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm } -#define SSE_FOP(x) { gen_helper_ ## x ## ps, gen_helper_ ## x ## pd, \ - gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, } +#define SSE_3DNOW { SSE_OPF_3DNOW } +#define SSE_SPECIAL { SSE_OPF_SPECIAL } -static const SSEFunc_0_epp sse_op_table1[256][4] = { +static const struct SSEOpHelper_table1 sse_op_table1[256] = { /* 3DNow! extensions */ - [0x0e] = { SSE_DUMMY }, /* femms */ - [0x0f] = { SSE_DUMMY }, /* pf... */ + [0x0e] = SSE_SPECIAL, /* femms */ + [0x0f] = SSE_3DNOW, /* pf... 
(sse_op_table5) */ /* pure SSE operations */ - [0x10] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */ - [0x11] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movups, movupd, movss, movsd */ - [0x12] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd, movsldup, movddup */ - [0x13] = { SSE_SPECIAL, SSE_SPECIAL }, /* movlps, movlpd */ - [0x14] = { gen_helper_punpckldq_xmm, gen_helper_punpcklqdq_xmm }, - [0x15] = { gen_helper_punpckhdq_xmm, gen_helper_punpckhqdq_xmm }, - [0x16] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd, movshdup */ - [0x17] = { SSE_SPECIAL, SSE_SPECIAL }, /* movhps, movhpd */ - - [0x28] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */ - [0x29] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */ - [0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */ - [0x2b] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movntps, movntpd, movntss, movntsd */ - [0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */ - [0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */ - [0x2e] = { gen_helper_ucomiss, gen_helper_ucomisd }, - [0x2f] = { gen_helper_comiss, gen_helper_comisd }, - [0x50] = { SSE_SPECIAL, SSE_SPECIAL }, /* movmskps, movmskpd */ - [0x51] = SSE_FOP(sqrt), - [0x52] = { gen_helper_rsqrtps, NULL, gen_helper_rsqrtss, NULL }, - [0x53] = { gen_helper_rcpps, NULL, gen_helper_rcpss, NULL }, - [0x54] = { gen_helper_pand_xmm, gen_helper_pand_xmm }, /* andps, andpd */ - [0x55] = { gen_helper_pandn_xmm, gen_helper_pandn_xmm }, /* andnps, andnpd */ - [0x56] = { gen_helper_por_xmm, gen_helper_por_xmm }, /* orps, orpd */ - [0x57] = { gen_helper_pxor_xmm, gen_helper_pxor_xmm }, /* xorps, xorpd */ + [0x10] = SSE_SPECIAL, /* movups, movupd, movss, movsd */ + [0x11] = SSE_SPECIAL, /* movups, movupd, movss, movsd */ + [0x12] = SSE_SPECIAL, /* movlps, movlpd, movsldup, movddup */ + [0x13] = SSE_SPECIAL, /* movlps, movlpd */ + [0x14] = SSE_OP(punpckldq, punpcklqdq, op1, 0), /* unpcklps, unpcklpd */ + [0x15] = SSE_OP(punpckhdq, punpckhqdq, op1, 0), /* unpckhps, unpckhpd */ + [0x16] = SSE_SPECIAL, /* movhps, movhpd, movshdup */ + [0x17] = SSE_SPECIAL, /* movhps, movhpd */ + + [0x28] = SSE_SPECIAL, /* movaps, movapd */ + [0x29] = SSE_SPECIAL, /* movaps, movapd */ + [0x2a] = SSE_SPECIAL, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */ + [0x2b] = SSE_SPECIAL, /* movntps, movntpd, movntss, movntsd */ + [0x2c] = SSE_SPECIAL, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */ + [0x2d] = SSE_SPECIAL, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */ + [0x2e] = OP(op1, SSE_OPF_CMP | SSE_OPF_SCALAR, + gen_helper_ucomiss, gen_helper_ucomisd, NULL, NULL), + [0x2f] = OP(op1, SSE_OPF_CMP | SSE_OPF_SCALAR, + gen_helper_comiss, gen_helper_comisd, NULL, NULL), + [0x50] = SSE_SPECIAL, /* movmskps, movmskpd */ + [0x51] = OP(op1, SSE_OPF_SCALAR, + gen_helper_sqrtps_xmm, gen_helper_sqrtpd_xmm, + gen_helper_sqrtss, gen_helper_sqrtsd), + [0x52] = OP(op1, SSE_OPF_SCALAR, + gen_helper_rsqrtps_xmm, NULL, gen_helper_rsqrtss, NULL), + [0x53] = OP(op1, SSE_OPF_SCALAR, + gen_helper_rcpps_xmm, NULL, gen_helper_rcpss, NULL), + [0x54] = SSE_OP(pand, pand, op1, 0), /* andps, andpd */ + [0x55] = SSE_OP(pandn, pandn, op1, 0), /* andnps, andnpd */ + [0x56] = SSE_OP(por, por, op1, 0), /* orps, orpd */ + [0x57] = SSE_OP(pxor, pxor, op1, 0), /* xorps, xorpd */ 
[0x58] = SSE_FOP(add), [0x59] = SSE_FOP(mul), - [0x5a] = { gen_helper_cvtps2pd, gen_helper_cvtpd2ps, - gen_helper_cvtss2sd, gen_helper_cvtsd2ss }, - [0x5b] = { gen_helper_cvtdq2ps, gen_helper_cvtps2dq, gen_helper_cvttps2dq }, + [0x5a] = OP(op1, SSE_OPF_SCALAR, + gen_helper_cvtps2pd_xmm, gen_helper_cvtpd2ps_xmm, + gen_helper_cvtss2sd, gen_helper_cvtsd2ss), + [0x5b] = OP(op1, 0, + gen_helper_cvtdq2ps_xmm, gen_helper_cvtps2dq_xmm, + gen_helper_cvttps2dq_xmm, NULL), [0x5c] = SSE_FOP(sub), [0x5d] = SSE_FOP(min), [0x5e] = SSE_FOP(div), [0x5f] = SSE_FOP(max), - [0xc2] = SSE_FOP(cmpeq), - [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps, - (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */ + [0xc2] = SSE_FOP(cmpeq), /* sse_op_table4 */ + [0xc6] = SSE_OP(shufps, shufpd, op1i, SSE_OPF_SHUF), /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX. */ - [0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, - [0x3a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, + [0x38] = SSE_SPECIAL, + [0x3a] = SSE_SPECIAL, /* MMX ops and their SSE extensions */ - [0x60] = MMX_OP2(punpcklbw), - [0x61] = MMX_OP2(punpcklwd), - [0x62] = MMX_OP2(punpckldq), - [0x63] = MMX_OP2(packsswb), - [0x64] = MMX_OP2(pcmpgtb), - [0x65] = MMX_OP2(pcmpgtw), - [0x66] = MMX_OP2(pcmpgtl), - [0x67] = MMX_OP2(packuswb), - [0x68] = MMX_OP2(punpckhbw), - [0x69] = MMX_OP2(punpckhwd), - [0x6a] = MMX_OP2(punpckhdq), - [0x6b] = MMX_OP2(packssdw), - [0x6c] = { NULL, gen_helper_punpcklqdq_xmm }, - [0x6d] = { NULL, gen_helper_punpckhqdq_xmm }, - [0x6e] = { SSE_SPECIAL, SSE_SPECIAL }, /* movd mm, ea */ - [0x6f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, , movqdu */ - [0x70] = { (SSEFunc_0_epp)gen_helper_pshufw_mmx, - (SSEFunc_0_epp)gen_helper_pshufd_xmm, - (SSEFunc_0_epp)gen_helper_pshufhw_xmm, - (SSEFunc_0_epp)gen_helper_pshuflw_xmm }, /* XXX: casts */ - [0x71] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftw */ - [0x72] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftd */ - [0x73] = { SSE_SPECIAL, SSE_SPECIAL }, /* shiftq */ - [0x74] = MMX_OP2(pcmpeqb), - [0x75] = MMX_OP2(pcmpeqw), - [0x76] = MMX_OP2(pcmpeql), - [0x77] = { SSE_DUMMY }, /* emms */ - [0x78] = { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, insertq_i */ - [0x79] = { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r }, - [0x7c] = { NULL, gen_helper_haddpd, NULL, gen_helper_haddps }, - [0x7d] = { NULL, gen_helper_hsubpd, NULL, gen_helper_hsubps }, - [0x7e] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movd, movd, , movq */ - [0x7f] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movq, movdqa, movdqu */ - [0xc4] = { SSE_SPECIAL, SSE_SPECIAL }, /* pinsrw */ - [0xc5] = { SSE_SPECIAL, SSE_SPECIAL }, /* pextrw */ - [0xd0] = { NULL, gen_helper_addsubpd, NULL, gen_helper_addsubps }, - [0xd1] = MMX_OP2(psrlw), - [0xd2] = MMX_OP2(psrld), - [0xd3] = MMX_OP2(psrlq), - [0xd4] = MMX_OP2(paddq), - [0xd5] = MMX_OP2(pmullw), - [0xd6] = { NULL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, - [0xd7] = { SSE_SPECIAL, SSE_SPECIAL }, /* pmovmskb */ - [0xd8] = MMX_OP2(psubusb), - [0xd9] = MMX_OP2(psubusw), - [0xda] = MMX_OP2(pminub), - [0xdb] = MMX_OP2(pand), - [0xdc] = MMX_OP2(paddusb), - [0xdd] = MMX_OP2(paddusw), - [0xde] = MMX_OP2(pmaxub), - [0xdf] = MMX_OP2(pandn), - [0xe0] = MMX_OP2(pavgb), - [0xe1] = MMX_OP2(psraw), - [0xe2] = MMX_OP2(psrad), - [0xe3] = MMX_OP2(pavgw), - [0xe4] = MMX_OP2(pmulhuw), - [0xe5] = MMX_OP2(pmulhw), - [0xe6] = { NULL, gen_helper_cvttpd2dq, gen_helper_cvtdq2pd, gen_helper_cvtpd2dq }, - [0xe7] = { SSE_SPECIAL , SSE_SPECIAL }, /* movntq, movntq */ - [0xe8] = 
MMX_OP2(psubsb), - [0xe9] = MMX_OP2(psubsw), - [0xea] = MMX_OP2(pminsw), - [0xeb] = MMX_OP2(por), - [0xec] = MMX_OP2(paddsb), - [0xed] = MMX_OP2(paddsw), - [0xee] = MMX_OP2(pmaxsw), - [0xef] = MMX_OP2(pxor), - [0xf0] = { NULL, NULL, NULL, SSE_SPECIAL }, /* lddqu */ - [0xf1] = MMX_OP2(psllw), - [0xf2] = MMX_OP2(pslld), - [0xf3] = MMX_OP2(psllq), - [0xf4] = MMX_OP2(pmuludq), - [0xf5] = MMX_OP2(pmaddwd), - [0xf6] = MMX_OP2(psadbw), - [0xf7] = { (SSEFunc_0_epp)gen_helper_maskmov_mmx, - (SSEFunc_0_epp)gen_helper_maskmov_xmm }, /* XXX: casts */ - [0xf8] = MMX_OP2(psubb), - [0xf9] = MMX_OP2(psubw), - [0xfa] = MMX_OP2(psubl), - [0xfb] = MMX_OP2(psubq), - [0xfc] = MMX_OP2(paddb), - [0xfd] = MMX_OP2(paddw), - [0xfe] = MMX_OP2(paddl), + [0x60] = MMX_OP(punpcklbw), + [0x61] = MMX_OP(punpcklwd), + [0x62] = MMX_OP(punpckldq), + [0x63] = MMX_OP(packsswb), + [0x64] = MMX_OP(pcmpgtb), + [0x65] = MMX_OP(pcmpgtw), + [0x66] = MMX_OP(pcmpgtl), + [0x67] = MMX_OP(packuswb), + [0x68] = MMX_OP(punpckhbw), + [0x69] = MMX_OP(punpckhwd), + [0x6a] = MMX_OP(punpckhdq), + [0x6b] = MMX_OP(packssdw), + [0x6c] = OP(op1, SSE_OPF_MMX, + NULL, gen_helper_punpcklqdq_xmm, NULL, NULL), + [0x6d] = OP(op1, SSE_OPF_MMX, + NULL, gen_helper_punpckhqdq_xmm, NULL, NULL), + [0x6e] = SSE_SPECIAL, /* movd mm, ea */ + [0x6f] = SSE_SPECIAL, /* movq, movdqa, , movqdu */ + [0x70] = OP(op1i, SSE_OPF_SHUF | SSE_OPF_MMX, + gen_helper_pshufw_mmx, gen_helper_pshufd_xmm, + gen_helper_pshufhw_xmm, gen_helper_pshuflw_xmm), + [0x71] = SSE_SPECIAL, /* shiftw */ + [0x72] = SSE_SPECIAL, /* shiftd */ + [0x73] = SSE_SPECIAL, /* shiftq */ + [0x74] = MMX_OP(pcmpeqb), + [0x75] = MMX_OP(pcmpeqw), + [0x76] = MMX_OP(pcmpeql), + [0x77] = SSE_SPECIAL, /* emms */ + [0x78] = SSE_SPECIAL, /* extrq_i, insertq_i (sse4a) */ + [0x79] = OP(op1, 0, + NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r), + [0x7c] = OP(op1, 0, + NULL, gen_helper_haddpd_xmm, NULL, gen_helper_haddps_xmm), + [0x7d] = OP(op1, 0, + NULL, gen_helper_hsubpd_xmm, NULL, gen_helper_hsubps_xmm), + [0x7e] = SSE_SPECIAL, /* movd, movd, , movq */ + [0x7f] = SSE_SPECIAL, /* movq, movdqa, movdqu */ + [0xc4] = SSE_SPECIAL, /* pinsrw */ + [0xc5] = SSE_SPECIAL, /* pextrw */ + [0xd0] = OP(op1, 0, + NULL, gen_helper_addsubpd_xmm, NULL, gen_helper_addsubps_xmm), + [0xd1] = MMX_OP(psrlw), + [0xd2] = MMX_OP(psrld), + [0xd3] = MMX_OP(psrlq), + [0xd4] = MMX_OP(paddq), + [0xd5] = MMX_OP(pmullw), + [0xd6] = SSE_SPECIAL, + [0xd7] = SSE_SPECIAL, /* pmovmskb */ + [0xd8] = MMX_OP(psubusb), + [0xd9] = MMX_OP(psubusw), + [0xda] = MMX_OP(pminub), + [0xdb] = MMX_OP(pand), + [0xdc] = MMX_OP(paddusb), + [0xdd] = MMX_OP(paddusw), + [0xde] = MMX_OP(pmaxub), + [0xdf] = MMX_OP(pandn), + [0xe0] = MMX_OP(pavgb), + [0xe1] = MMX_OP(psraw), + [0xe2] = MMX_OP(psrad), + [0xe3] = MMX_OP(pavgw), + [0xe4] = MMX_OP(pmulhuw), + [0xe5] = MMX_OP(pmulhw), + [0xe6] = OP(op1, 0, + NULL, gen_helper_cvttpd2dq_xmm, + gen_helper_cvtdq2pd_xmm, gen_helper_cvtpd2dq_xmm), + [0xe7] = SSE_SPECIAL, /* movntq, movntq */ + [0xe8] = MMX_OP(psubsb), + [0xe9] = MMX_OP(psubsw), + [0xea] = MMX_OP(pminsw), + [0xeb] = MMX_OP(por), + [0xec] = MMX_OP(paddsb), + [0xed] = MMX_OP(paddsw), + [0xee] = MMX_OP(pmaxsw), + [0xef] = MMX_OP(pxor), + [0xf0] = SSE_SPECIAL, /* lddqu */ + [0xf1] = MMX_OP(psllw), + [0xf2] = MMX_OP(pslld), + [0xf3] = MMX_OP(psllq), + [0xf4] = MMX_OP(pmuludq), + [0xf5] = MMX_OP(pmaddwd), + [0xf6] = MMX_OP(psadbw), + [0xf7] = OP(op1t, SSE_OPF_MMX, + gen_helper_maskmov_mmx, gen_helper_maskmov_xmm, NULL, NULL), + [0xf8] = MMX_OP(psubb), + [0xf9] = 
MMX_OP(psubw), + [0xfa] = MMX_OP(psubl), + [0xfb] = MMX_OP(psubq), + [0xfc] = MMX_OP(paddb), + [0xfd] = MMX_OP(paddw), + [0xfe] = MMX_OP(paddl), }; +#undef MMX_OP +#undef OP +#undef SSE_FOP +#undef SSE_OP +#undef SSE_SPECIAL + +#define MMX_OP2(x) { gen_helper_ ## x ## _mmx, gen_helper_ ## x ## _xmm } static const SSEFunc_0_epp sse_op_table2[3 * 8][2] = { [0 + 2] = MMX_OP2(psrlw), @@ -2970,16 +3022,20 @@ static const SSEFunc_l_ep sse_op_table3bq[] = { }; #endif +#define SSE_CMP(x) { \ + gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \ + gen_helper_ ## x ## ss, gen_helper_ ## x ## sd} static const SSEFunc_0_epp sse_op_table4[8][4] = { - SSE_FOP(cmpeq), - SSE_FOP(cmplt), - SSE_FOP(cmple), - SSE_FOP(cmpunord), - SSE_FOP(cmpneq), - SSE_FOP(cmpnlt), - SSE_FOP(cmpnle), - SSE_FOP(cmpord), + SSE_CMP(cmpeq), + SSE_CMP(cmplt), + SSE_CMP(cmple), + SSE_CMP(cmpunord), + SSE_CMP(cmpneq), + SSE_CMP(cmpnlt), + SSE_CMP(cmpnle), + SSE_CMP(cmpord), }; +#undef SSE_CMP static const SSEFunc_0_epp sse_op_table5[256] = { [0x0c] = gen_helper_pi2fw, @@ -3005,117 +3061,146 @@ static const SSEFunc_0_epp sse_op_table5[256] = { [0xb6] = gen_helper_movq, /* pfrcpit2 */ [0xb7] = gen_helper_pmulhrw_mmx, [0xbb] = gen_helper_pswapd, - [0xbf] = gen_helper_pavgb_mmx /* pavgusb */ + [0xbf] = gen_helper_pavgb_mmx, }; -struct SSEOpHelper_epp { - SSEFunc_0_epp op[2]; +struct SSEOpHelper_table6 { + SSEFuncs fn[2]; uint32_t ext_mask; + int flags; }; -struct SSEOpHelper_eppi { - SSEFunc_0_eppi op[2]; +struct SSEOpHelper_table7 { + union { + SSEFunc_0_eppi op1; + } fn[2]; uint32_t ext_mask; + int flags; }; -#define SSSE3_OP(x) { MMX_OP2(x), CPUID_EXT_SSSE3 } -#define SSE41_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE41 } -#define SSE42_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_SSE42 } -#define SSE41_SPECIAL { { NULL, SSE_SPECIAL }, CPUID_EXT_SSE41 } -#define PCLMULQDQ_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, \ - CPUID_EXT_PCLMULQDQ } -#define AESNI_OP(x) { { NULL, gen_helper_ ## x ## _xmm }, CPUID_EXT_AES } - -static const struct SSEOpHelper_epp sse_op_table6[256] = { - [0x00] = SSSE3_OP(pshufb), - [0x01] = SSSE3_OP(phaddw), - [0x02] = SSSE3_OP(phaddd), - [0x03] = SSSE3_OP(phaddsw), - [0x04] = SSSE3_OP(pmaddubsw), - [0x05] = SSSE3_OP(phsubw), - [0x06] = SSSE3_OP(phsubd), - [0x07] = SSSE3_OP(phsubsw), - [0x08] = SSSE3_OP(psignb), - [0x09] = SSSE3_OP(psignw), - [0x0a] = SSSE3_OP(psignd), - [0x0b] = SSSE3_OP(pmulhrsw), - [0x10] = SSE41_OP(pblendvb), - [0x14] = SSE41_OP(blendvps), - [0x15] = SSE41_OP(blendvpd), - [0x17] = SSE41_OP(ptest), - [0x1c] = SSSE3_OP(pabsb), - [0x1d] = SSSE3_OP(pabsw), - [0x1e] = SSSE3_OP(pabsd), - [0x20] = SSE41_OP(pmovsxbw), - [0x21] = SSE41_OP(pmovsxbd), - [0x22] = SSE41_OP(pmovsxbq), - [0x23] = SSE41_OP(pmovsxwd), - [0x24] = SSE41_OP(pmovsxwq), - [0x25] = SSE41_OP(pmovsxdq), - [0x28] = SSE41_OP(pmuldq), - [0x29] = SSE41_OP(pcmpeqq), - [0x2a] = SSE41_SPECIAL, /* movntqda */ - [0x2b] = SSE41_OP(packusdw), - [0x30] = SSE41_OP(pmovzxbw), - [0x31] = SSE41_OP(pmovzxbd), - [0x32] = SSE41_OP(pmovzxbq), - [0x33] = SSE41_OP(pmovzxwd), - [0x34] = SSE41_OP(pmovzxwq), - [0x35] = SSE41_OP(pmovzxdq), - [0x37] = SSE42_OP(pcmpgtq), - [0x38] = SSE41_OP(pminsb), - [0x39] = SSE41_OP(pminsd), - [0x3a] = SSE41_OP(pminuw), - [0x3b] = SSE41_OP(pminud), - [0x3c] = SSE41_OP(pmaxsb), - [0x3d] = SSE41_OP(pmaxsd), - [0x3e] = SSE41_OP(pmaxuw), - [0x3f] = SSE41_OP(pmaxud), - [0x40] = SSE41_OP(pmulld), - [0x41] = SSE41_OP(phminposuw), - [0xdb] = AESNI_OP(aesimc), - [0xdc] = AESNI_OP(aesenc), - 
[0xdd] = AESNI_OP(aesenclast), - [0xde] = AESNI_OP(aesdec), - [0xdf] = AESNI_OP(aesdeclast), +#define gen_helper_special_xmm NULL + +#define OP(name, op, flags, ext, mmx_name) \ + {{{.op = mmx_name}, {.op = gen_helper_ ## name ## _xmm} }, \ + CPUID_EXT_ ## ext, flags} +#define BINARY_OP_MMX(name, ext) \ + OP(name, op1, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx) +#define BINARY_OP(name, ext, flags) \ + OP(name, op1, flags, ext, NULL) +#define UNARY_OP_MMX(name, ext) \ + OP(name, op1, SSE_OPF_MMX, ext, gen_helper_ ## name ## _mmx) +#define UNARY_OP(name, ext, flags) \ + OP(name, op1, flags, ext, NULL) +#define BLENDV_OP(name, ext, flags) OP(name, op1, 0, ext, NULL) +#define CMP_OP(name, ext) OP(name, op1, SSE_OPF_CMP, ext, NULL) +#define SPECIAL_OP(ext) OP(special, op1, SSE_OPF_SPECIAL, ext, NULL) + +/* prefix [66] 0f 38 */ +static const struct SSEOpHelper_table6 sse_op_table6[256] = { + [0x00] = BINARY_OP_MMX(pshufb, SSSE3), + [0x01] = BINARY_OP_MMX(phaddw, SSSE3), + [0x02] = BINARY_OP_MMX(phaddd, SSSE3), + [0x03] = BINARY_OP_MMX(phaddsw, SSSE3), + [0x04] = BINARY_OP_MMX(pmaddubsw, SSSE3), + [0x05] = BINARY_OP_MMX(phsubw, SSSE3), + [0x06] = BINARY_OP_MMX(phsubd, SSSE3), + [0x07] = BINARY_OP_MMX(phsubsw, SSSE3), + [0x08] = BINARY_OP_MMX(psignb, SSSE3), + [0x09] = BINARY_OP_MMX(psignw, SSSE3), + [0x0a] = BINARY_OP_MMX(psignd, SSSE3), + [0x0b] = BINARY_OP_MMX(pmulhrsw, SSSE3), + [0x10] = BLENDV_OP(pblendvb, SSE41, SSE_OPF_MMX), + [0x14] = BLENDV_OP(blendvps, SSE41, 0), + [0x15] = BLENDV_OP(blendvpd, SSE41, 0), + [0x17] = CMP_OP(ptest, SSE41), + [0x1c] = UNARY_OP_MMX(pabsb, SSSE3), + [0x1d] = UNARY_OP_MMX(pabsw, SSSE3), + [0x1e] = UNARY_OP_MMX(pabsd, SSSE3), + [0x20] = UNARY_OP(pmovsxbw, SSE41, SSE_OPF_MMX), + [0x21] = UNARY_OP(pmovsxbd, SSE41, SSE_OPF_MMX), + [0x22] = UNARY_OP(pmovsxbq, SSE41, SSE_OPF_MMX), + [0x23] = UNARY_OP(pmovsxwd, SSE41, SSE_OPF_MMX), + [0x24] = UNARY_OP(pmovsxwq, SSE41, SSE_OPF_MMX), + [0x25] = UNARY_OP(pmovsxdq, SSE41, SSE_OPF_MMX), + [0x28] = BINARY_OP(pmuldq, SSE41, SSE_OPF_MMX), + [0x29] = BINARY_OP(pcmpeqq, SSE41, SSE_OPF_MMX), + [0x2a] = SPECIAL_OP(SSE41), /* movntqda */ + [0x2b] = BINARY_OP(packusdw, SSE41, SSE_OPF_MMX), + [0x30] = UNARY_OP(pmovzxbw, SSE41, SSE_OPF_MMX), + [0x31] = UNARY_OP(pmovzxbd, SSE41, SSE_OPF_MMX), + [0x32] = UNARY_OP(pmovzxbq, SSE41, SSE_OPF_MMX), + [0x33] = UNARY_OP(pmovzxwd, SSE41, SSE_OPF_MMX), + [0x34] = UNARY_OP(pmovzxwq, SSE41, SSE_OPF_MMX), + [0x35] = UNARY_OP(pmovzxdq, SSE41, SSE_OPF_MMX), + [0x37] = BINARY_OP(pcmpgtq, SSE41, SSE_OPF_MMX), + [0x38] = BINARY_OP(pminsb, SSE41, SSE_OPF_MMX), + [0x39] = BINARY_OP(pminsd, SSE41, SSE_OPF_MMX), + [0x3a] = BINARY_OP(pminuw, SSE41, SSE_OPF_MMX), + [0x3b] = BINARY_OP(pminud, SSE41, SSE_OPF_MMX), + [0x3c] = BINARY_OP(pmaxsb, SSE41, SSE_OPF_MMX), + [0x3d] = BINARY_OP(pmaxsd, SSE41, SSE_OPF_MMX), + [0x3e] = BINARY_OP(pmaxuw, SSE41, SSE_OPF_MMX), + [0x3f] = BINARY_OP(pmaxud, SSE41, SSE_OPF_MMX), + [0x40] = BINARY_OP(pmulld, SSE41, SSE_OPF_MMX), + [0x41] = UNARY_OP(phminposuw, SSE41, 0), + [0xdb] = UNARY_OP(aesimc, AES, 0), + [0xdc] = BINARY_OP(aesenc, AES, 0), + [0xdd] = BINARY_OP(aesenclast, AES, 0), + [0xde] = BINARY_OP(aesdec, AES, 0), + [0xdf] = BINARY_OP(aesdeclast, AES, 0), }; -static const struct SSEOpHelper_eppi sse_op_table7[256] = { - [0x08] = SSE41_OP(roundps), - [0x09] = SSE41_OP(roundpd), - [0x0a] = SSE41_OP(roundss), - [0x0b] = SSE41_OP(roundsd), - [0x0c] = SSE41_OP(blendps), - [0x0d] = SSE41_OP(blendpd), - [0x0e] = SSE41_OP(pblendw), - [0x0f] = SSSE3_OP(palignr), - [0x14] = 
SSE41_SPECIAL, /* pextrb */ - [0x15] = SSE41_SPECIAL, /* pextrw */ - [0x16] = SSE41_SPECIAL, /* pextrd/pextrq */ - [0x17] = SSE41_SPECIAL, /* extractps */ - [0x20] = SSE41_SPECIAL, /* pinsrb */ - [0x21] = SSE41_SPECIAL, /* insertps */ - [0x22] = SSE41_SPECIAL, /* pinsrd/pinsrq */ - [0x40] = SSE41_OP(dpps), - [0x41] = SSE41_OP(dppd), - [0x42] = SSE41_OP(mpsadbw), - [0x44] = PCLMULQDQ_OP(pclmulqdq), - [0x60] = SSE42_OP(pcmpestrm), - [0x61] = SSE42_OP(pcmpestri), - [0x62] = SSE42_OP(pcmpistrm), - [0x63] = SSE42_OP(pcmpistri), - [0xdf] = AESNI_OP(aeskeygenassist), +/* prefix [66] 0f 3a */ +static const struct SSEOpHelper_table7 sse_op_table7[256] = { + [0x08] = UNARY_OP(roundps, SSE41, 0), + [0x09] = UNARY_OP(roundpd, SSE41, 0), + [0x0a] = UNARY_OP(roundss, SSE41, SSE_OPF_SCALAR), + [0x0b] = UNARY_OP(roundsd, SSE41, SSE_OPF_SCALAR), + [0x0c] = BINARY_OP(blendps, SSE41, 0), + [0x0d] = BINARY_OP(blendpd, SSE41, 0), + [0x0e] = BINARY_OP(pblendw, SSE41, SSE_OPF_MMX), + [0x0f] = BINARY_OP_MMX(palignr, SSSE3), + [0x14] = SPECIAL_OP(SSE41), /* pextrb */ + [0x15] = SPECIAL_OP(SSE41), /* pextrw */ + [0x16] = SPECIAL_OP(SSE41), /* pextrd/pextrq */ + [0x17] = SPECIAL_OP(SSE41), /* extractps */ + [0x20] = SPECIAL_OP(SSE41), /* pinsrb */ + [0x21] = SPECIAL_OP(SSE41), /* insertps */ + [0x22] = SPECIAL_OP(SSE41), /* pinsrd/pinsrq */ + [0x40] = BINARY_OP(dpps, SSE41, 0), + [0x41] = BINARY_OP(dppd, SSE41, 0), + [0x42] = BINARY_OP(mpsadbw, SSE41, SSE_OPF_MMX), + [0x44] = BINARY_OP(pclmulqdq, PCLMULQDQ, 0), + [0x60] = CMP_OP(pcmpestrm, SSE42), + [0x61] = CMP_OP(pcmpestri, SSE42), + [0x62] = CMP_OP(pcmpistrm, SSE42), + [0x63] = CMP_OP(pcmpistri, SSE42), + [0xdf] = UNARY_OP(aeskeygenassist, AES, 0), }; +#undef OP +#undef BINARY_OP_MMX +#undef BINARY_OP +#undef UNARY_OP_MMX +#undef UNARY_OP +#undef BLENDV_OP +#undef SPECIAL_OP + +/* VEX prefix not allowed */ +#define CHECK_NO_VEX(s) do { \ + if (s->prefix & PREFIX_VEX) \ + goto illegal_op; \ + } while (0) + static void gen_sse(CPUX86State *env, DisasContext *s, int b, target_ulong pc_start) { int b1, op1_offset, op2_offset, is_xmm, val; int modrm, mod, rm, reg; - SSEFunc_0_epp sse_fn_epp; - SSEFunc_0_eppi sse_fn_eppi; - SSEFunc_0_ppi sse_fn_ppi; - SSEFunc_0_eppt sse_fn_eppt; + int sse_op_flags; + SSEFuncs sse_op_fn; + const struct SSEOpHelper_table6 *op6; + const struct SSEOpHelper_table7 *op7; MemOp ot; b &= 0xff; @@ -3127,8 +3212,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, b1 = 3; else b1 = 0; - sse_fn_epp = sse_op_table1[b][b1]; - if (!sse_fn_epp) { + sse_op_flags = sse_op_table1[b].flags; + sse_op_fn = sse_op_table1[b].fn[b1]; + if ((sse_op_flags & (SSE_OPF_SPECIAL | SSE_OPF_3DNOW)) == 0 + && !sse_op_fn.op1) { goto unknown_op; } if ((b <= 0x5f && b >= 0x10) || b == 0xc6 || b == 0xc2) { @@ -3141,6 +3228,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, is_xmm = 1; } } + if (sse_op_flags & SSE_OPF_3DNOW) { + if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) { + goto illegal_op; + } + } /* simple MMX/SSE operation */ if (s->flags & HF_TS_MASK) { gen_exception(s, EXCP07_PREX, pc_start - s->cs_base); @@ -3182,10 +3274,11 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, reg |= REX_R(s); } mod = (modrm >> 6) & 3; - if (sse_fn_epp == SSE_SPECIAL) { + if (sse_op_flags & SSE_OPF_SPECIAL) { b |= (b1 << 8); switch(b) { case 0x0e7: /* movntq */ + CHECK_NO_VEX(s); if (mod == 3) { goto illegal_op; } @@ -3198,13 +3291,13 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, if (mod == 3) goto illegal_op; 
gen_lea_modrm(env, s, modrm); - gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + gen_sto_env_A0(s, ZMM_OFFSET(reg)); break; case 0x3f0: /* lddqu */ if (mod == 3) goto illegal_op; gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + gen_ldo_env_A0(s, ZMM_OFFSET(reg)); break; case 0x22b: /* movntss */ case 0x32b: /* movntsd */ @@ -3221,6 +3314,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } break; case 0x6e: /* movd mm, ea */ + CHECK_NO_VEX(s); #ifdef TARGET_X86_64 if (s->dflag == MO_64) { gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); @@ -3240,20 +3334,19 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, #ifdef TARGET_X86_64 if (s->dflag == MO_64) { gen_ldst_modrm(env, s, modrm, MO_64, OR_TMP0, 0); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg])); + tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); gen_helper_movq_mm_T0_xmm(s->ptr0, s->T0); } else #endif { gen_ldst_modrm(env, s, modrm, MO_32, OR_TMP0, 0); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg])); + tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0); gen_helper_movl_mm_T0_xmm(s->ptr0, s->tmp2_i32); } break; case 0x6f: /* movq mm, ea */ + CHECK_NO_VEX(s); if (mod != 3) { gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); @@ -3273,11 +3366,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x26f: /* movdqu xmm, ea */ if (mod != 3) { gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + gen_ldo_env_A0(s, ZMM_OFFSET(reg)); } else { rm = (modrm & 7) | REX_B(s); - gen_op_movo(s, offsetof(CPUX86State, xmm_regs[reg]), - offsetof(CPUX86State,xmm_regs[rm])); + gen_op_movo(s, ZMM_OFFSET(reg), ZMM_OFFSET(rm)); } break; case 0x210: /* movss xmm, ea */ @@ -3295,8 +3387,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(3))); } else { rm = (modrm & 7) | REX_B(s); - gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_L(0))); + tcg_gen_ld_i32(s->tmp2_i32, cpu_env, + offsetof(CPUX86State, xmm_regs[rm].ZMM_L(0))); + tcg_gen_st_i32(s->tmp2_i32, cpu_env, + offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0))); } break; case 0x310: /* movsd xmm, ea */ @@ -3312,7 +3406,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } else { rm = (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), - offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0))); + offsetof(CPUX86State, xmm_regs[rm].ZMM_Q(0))); } break; case 0x012: /* movlps */ @@ -3331,7 +3425,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x212: /* movsldup */ if (mod != 3) { gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + gen_ldo_env_A0(s, ZMM_OFFSET(reg)); } else { rm = (modrm & 7) | REX_B(s); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(0)), @@ -3373,7 +3467,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x216: /* movshdup */ if (mod != 3) { gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + gen_ldo_env_A0(s, ZMM_OFFSET(reg)); } else { rm = (modrm & 7) | REX_B(s); gen_op_movl(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_L(1)), @@ -3388,6 +3482,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, break; case 0x178: case 0x378: + 
CHECK_NO_VEX(s); { int bit_index, field_length; @@ -3395,8 +3490,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, goto illegal_op; field_length = x86_ldub_code(env, s) & 0x3F; bit_index = x86_ldub_code(env, s) & 0x3F; - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[reg])); + tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(reg)); if (b1 == 1) gen_helper_extrq_i(cpu_env, s->ptr0, tcg_const_i32(bit_index), @@ -3408,6 +3502,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } break; case 0x7e: /* movd ea, mm */ + CHECK_NO_VEX(s); #ifdef TARGET_X86_64 if (s->dflag == MO_64) { tcg_gen_ld_i64(s->T0, cpu_env, @@ -3448,6 +3543,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1))); break; case 0x7f: /* movq ea, mm */ + CHECK_NO_VEX(s); if (mod != 3) { gen_lea_modrm(env, s, modrm); gen_stq_env_A0(s, offsetof(CPUX86State, fpregs[reg].mmx)); @@ -3465,11 +3561,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x27f: /* movdqu ea, xmm */ if (mod != 3) { gen_lea_modrm(env, s, modrm); - gen_sto_env_A0(s, offsetof(CPUX86State, xmm_regs[reg])); + gen_sto_env_A0(s, ZMM_OFFSET(reg)); } else { rm = (modrm & 7) | REX_B(s); - gen_op_movo(s, offsetof(CPUX86State, xmm_regs[rm]), - offsetof(CPUX86State,xmm_regs[reg])); + gen_op_movo(s, ZMM_OFFSET(rm), ZMM_OFFSET(reg)); } break; case 0x211: /* movss ea, xmm */ @@ -3531,6 +3626,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, offsetof(CPUX86State, xmm_t0.ZMM_L(1))); op1_offset = offsetof(CPUX86State,xmm_t0); } else { + CHECK_NO_VEX(s); tcg_gen_movi_tl(s->T0, val); tcg_gen_st32_tl(s->T0, cpu_env, offsetof(CPUX86State, mmx_t0.MMX_L(0))); @@ -3540,38 +3636,37 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, op1_offset = offsetof(CPUX86State,mmx_t0); } assert(b1 < 2); - sse_fn_epp = sse_op_table2[((b - 1) & 3) * 8 + + SSEFunc_0_epp fn = sse_op_table2[((b - 1) & 3) * 8 + (((modrm >> 3)) & 7)][b1]; - if (!sse_fn_epp) { + if (!fn) { goto unknown_op; } if (is_xmm) { rm = (modrm & 7) | REX_B(s); - op2_offset = offsetof(CPUX86State,xmm_regs[rm]); + op2_offset = ZMM_OFFSET(rm); } else { rm = (modrm & 7); op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); } tcg_gen_addi_ptr(s->ptr0, cpu_env, op2_offset); tcg_gen_addi_ptr(s->ptr1, cpu_env, op1_offset); - sse_fn_epp(cpu_env, s->ptr0, s->ptr1); + fn(cpu_env, s->ptr0, s->ptr1); break; case 0x050: /* movmskps */ rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[rm])); - gen_helper_movmskps(s->tmp2_i32, cpu_env, s->ptr0); + tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm)); + gen_helper_movmskps_xmm(s->tmp2_i32, cpu_env, s->ptr0); tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32); break; case 0x150: /* movmskpd */ rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State,xmm_regs[rm])); - gen_helper_movmskpd(s->tmp2_i32, cpu_env, s->ptr0); + tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm)); + gen_helper_movmskpd_xmm(s->tmp2_i32, cpu_env, s->ptr0); tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32); break; case 0x02a: /* cvtpi2ps */ case 0x12a: /* cvtpi2pd */ + CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); if (mod != 3) { gen_lea_modrm(env, s, modrm); @@ -3581,7 +3676,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, rm = (modrm & 7); op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); } - op1_offset = offsetof(CPUX86State,xmm_regs[reg]); + op1_offset = 
ZMM_OFFSET(reg); tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); switch(b >> 8) { @@ -3598,7 +3693,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x32a: /* cvtsi2sd */ ot = mo_64_32(s->dflag); gen_ldst_modrm(env, s, modrm, ot, OR_TMP0, 0); - op1_offset = offsetof(CPUX86State,xmm_regs[reg]); + op1_offset = ZMM_OFFSET(reg); tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); if (ot == MO_32) { SSEFunc_0_epi sse_fn_epi = sse_op_table3ai[(b >> 8) & 1]; @@ -3617,6 +3712,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x12c: /* cvttpd2pi */ case 0x02d: /* cvtps2pi */ case 0x12d: /* cvtpd2pi */ + CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); if (mod != 3) { gen_lea_modrm(env, s, modrm); @@ -3624,7 +3720,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_ldo_env_A0(s, op2_offset); } else { rm = (modrm & 7) | REX_B(s); - op2_offset = offsetof(CPUX86State,xmm_regs[rm]); + op2_offset = ZMM_OFFSET(rm); } op1_offset = offsetof(CPUX86State,fpregs[reg & 7].mmx); tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); @@ -3661,7 +3757,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, op2_offset = offsetof(CPUX86State,xmm_t0); } else { rm = (modrm & 7) | REX_B(s); - op2_offset = offsetof(CPUX86State,xmm_regs[rm]); + op2_offset = ZMM_OFFSET(rm); } tcg_gen_addi_ptr(s->ptr0, cpu_env, op2_offset); if (ot == MO_32) { @@ -3690,6 +3786,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, tcg_gen_st16_tl(s->T0, cpu_env, offsetof(CPUX86State,xmm_regs[reg].ZMM_W(val))); } else { + CHECK_NO_VEX(s); val &= 3; tcg_gen_st16_tl(s->T0, cpu_env, offsetof(CPUX86State,fpregs[reg].mmx.MMX_W(val))); @@ -3729,6 +3826,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, } break; case 0x2d6: /* movq2dq */ + CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); rm = (modrm & 7); gen_op_movq(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(0)), @@ -3736,6 +3834,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_op_movq_env_0(s, offsetof(CPUX86State, xmm_regs[reg].ZMM_Q(1))); break; case 0x3d6: /* movdq2q */ + CHECK_NO_VEX(s); gen_helper_enter_mmx(cpu_env); rm = (modrm & 7) | REX_B(s); gen_op_movq(s, offsetof(CPUX86State, fpregs[reg & 7].mmx), @@ -3747,10 +3846,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, goto illegal_op; if (b1) { rm = (modrm & 7) | REX_B(s); - tcg_gen_addi_ptr(s->ptr0, cpu_env, - offsetof(CPUX86State, xmm_regs[rm])); + tcg_gen_addi_ptr(s->ptr0, cpu_env, ZMM_OFFSET(rm)); gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr0); } else { + CHECK_NO_VEX(s); rm = (modrm & 7); tcg_gen_addi_ptr(s->ptr0, cpu_env, offsetof(CPUX86State, fpregs[rm].mmx)); @@ -3772,17 +3871,18 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, mod = (modrm >> 6) & 3; assert(b1 < 2); - sse_fn_epp = sse_op_table6[b].op[b1]; - if (!sse_fn_epp) { + op6 = &sse_op_table6[b]; + if (op6->ext_mask == 0) { goto unknown_op; } - if (!(s->cpuid_ext_features & sse_op_table6[b].ext_mask)) + if (!(s->cpuid_ext_features & op6->ext_mask)) { goto illegal_op; + } if (b1) { - op1_offset = offsetof(CPUX86State,xmm_regs[reg]); + op1_offset = ZMM_OFFSET(reg); if (mod == 3) { - op2_offset = offsetof(CPUX86State,xmm_regs[rm | REX_B(s)]); + op2_offset = ZMM_OFFSET(rm | REX_B(s)); } else { op2_offset = offsetof(CPUX86State,xmm_t0); gen_lea_modrm(env, s, modrm); @@ -3813,7 +3913,17 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_ldo_env_A0(s, op2_offset); 
} } + if (!op6->fn[b1].op1) { + goto illegal_op; + } + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + op6->fn[b1].op1(cpu_env, s->ptr0, s->ptr1); } else { + CHECK_NO_VEX(s); + if ((op6->flags & SSE_OPF_MMX) == 0) { + goto unknown_op; + } op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); if (mod == 3) { op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); @@ -3822,16 +3932,12 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, op2_offset); } - } - if (sse_fn_epp == SSE_SPECIAL) { - goto unknown_op; + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + op6->fn[0].op1(cpu_env, s->ptr0, s->ptr1); } - tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); - tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); - sse_fn_epp(cpu_env, s->ptr0, s->ptr1); - - if (b == 0x17) { + if (op6->flags & SSE_OPF_CMP) { set_cc_op(s, CC_OP_EFLAGS); } break; @@ -3848,6 +3954,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x3f0: /* crc32 Gd,Eb */ case 0x3f1: /* crc32 Gd,Ey */ do_crc32: + CHECK_NO_VEX(s); if (!(s->cpuid_ext_features & CPUID_EXT_SSE42)) { goto illegal_op; } @@ -3870,6 +3977,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x1f0: /* crc32 or movbe */ case 0x1f1: + CHECK_NO_VEX(s); /* For these insns, the f3 prefix is supposed to have priority over the 66 prefix, but that's not what we implement above setting b1. */ @@ -3879,6 +3987,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, /* FALLTHRU */ case 0x0f0: /* movbe Gy,My */ case 0x0f1: /* movbe My,Gy */ + CHECK_NO_VEX(s); if (!(s->cpuid_ext_features & CPUID_EXT_MOVBE)) { goto illegal_op; } @@ -4045,6 +4154,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, case 0x1f6: /* adcx Gy, Ey */ case 0x2f6: /* adox Gy, Ey */ + CHECK_NO_VEX(s); if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_ADX)) { goto illegal_op; } else { @@ -4200,16 +4310,21 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, mod = (modrm >> 6) & 3; assert(b1 < 2); - sse_fn_eppi = sse_op_table7[b].op[b1]; - if (!sse_fn_eppi) { + op7 = &sse_op_table7[b]; + if (op7->ext_mask == 0) { goto unknown_op; } - if (!(s->cpuid_ext_features & sse_op_table7[b].ext_mask)) + if (!(s->cpuid_ext_features & op7->ext_mask)) { goto illegal_op; + } s->rip_offset = 1; - if (sse_fn_eppi == SSE_SPECIAL) { + if (op7->flags & SSE_OPF_SPECIAL) { + /* None of the "special" ops are valid on mmx registers */ + if (b1 == 0) { + goto illegal_op; + } ot = mo_64_32(s->dflag); rm = (modrm & 7) | REX_B(s); if (mod != 3) @@ -4344,16 +4459,12 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, return; } - if (b1) { - op1_offset = offsetof(CPUX86State,xmm_regs[reg]); - if (mod == 3) { - op2_offset = offsetof(CPUX86State,xmm_regs[rm | REX_B(s)]); - } else { - op2_offset = offsetof(CPUX86State,xmm_t0); - gen_lea_modrm(env, s, modrm); - gen_ldo_env_A0(s, op2_offset); + if (b1 == 0) { + CHECK_NO_VEX(s); + /* MMX */ + if ((op7->flags & SSE_OPF_MMX) == 0) { + goto illegal_op; } - } else { op1_offset = offsetof(CPUX86State,fpregs[reg].mmx); if (mod == 3) { op2_offset = offsetof(CPUX86State,fpregs[rm].mmx); @@ -4362,9 +4473,29 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b, gen_lea_modrm(env, s, modrm); gen_ldq_env_A0(s, op2_offset); } + val = x86_ldub_code(env, s); + tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset); + tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset); + 
+            /* We only actually have one MMX instruction (palignr) */
+            assert(b == 0x0f);
+
+            op7->fn[0].op1(cpu_env, s->ptr0, s->ptr1,
+                           tcg_const_i32(val));
+            break;
         }
-        val = x86_ldub_code(env, s);
+        /* SSE */
+        op1_offset = ZMM_OFFSET(reg);
+        if (mod == 3) {
+            op2_offset = ZMM_OFFSET(rm | REX_B(s));
+        } else {
+            op2_offset = offsetof(CPUX86State, xmm_t0);
+            gen_lea_modrm(env, s, modrm);
+            gen_ldo_env_A0(s, op2_offset);
+        }
+
+        val = x86_ldub_code(env, s);
         if ((b & 0xfc) == 0x60) { /* pcmpXstrX */
             set_cc_op(s, CC_OP_EFLAGS);
@@ -4376,7 +4507,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
         tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
         tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-        sse_fn_eppi(cpu_env, s->ptr0, s->ptr1, tcg_const_i32(val));
+        op7->fn[b1].op1(cpu_env, s->ptr0, s->ptr1, tcg_const_i32(val));
+        if (op7->flags & SSE_OPF_CMP) {
+            set_cc_op(s, CC_OP_EFLAGS);
+        }
         break;
     case 0x33a:
@@ -4427,33 +4561,29 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
             break;
         }
         if (is_xmm) {
-            op1_offset = offsetof(CPUX86State,xmm_regs[reg]);
+            op1_offset = ZMM_OFFSET(reg);
             if (mod != 3) {
                 int sz = 4;
                 gen_lea_modrm(env, s, modrm);
-                op2_offset = offsetof(CPUX86State,xmm_t0);
-
-                switch (b) {
-                case 0x50 ... 0x5a:
-                case 0x5c ... 0x5f:
-                case 0xc2:
-                    /* Most sse scalar operations. */
-                    if (b1 == 2) {
-                        sz = 2;
-                    } else if (b1 == 3) {
-                        sz = 3;
-                    }
-                    break;
+                op2_offset = offsetof(CPUX86State, xmm_t0);
 
-                case 0x2e: /* ucomis[sd] */
-                case 0x2f: /* comis[sd] */
-                    if (b1 == 0) {
-                        sz = 2;
+                if (sse_op_flags & SSE_OPF_SCALAR) {
+                    if (sse_op_flags & SSE_OPF_CMP) {
+                        /* ucomis[sd], comis[sd] */
+                        if (b1 == 0) {
+                            sz = 2;
+                        } else {
+                            sz = 3;
+                        }
                     } else {
-                        sz = 3;
+                        /* Most sse scalar operations. */
+                        if (b1 == 2) {
+                            sz = 2;
+                        } else if (b1 == 3) {
+                            sz = 3;
+                        }
                     }
-                    break;
                 }
                 switch (sz) {
@@ -4461,7 +4591,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                     /* 32 bit access */
                     gen_op_ld_v(s, MO_32, s->T0, s->A0);
                     tcg_gen_st32_tl(s->T0, cpu_env,
-                                    offsetof(CPUX86State,xmm_t0.ZMM_L(0)));
+                                    offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
                     break;
                 case 3:
                     /* 64 bit access */
@@ -4474,9 +4604,10 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                 }
             } else {
                 rm = (modrm & 7) | REX_B(s);
-                op2_offset = offsetof(CPUX86State,xmm_regs[rm]);
+                op2_offset = ZMM_OFFSET(rm);
             }
         } else {
+            CHECK_NO_VEX(s);
             op1_offset = offsetof(CPUX86State,fpregs[reg].mmx);
             if (mod != 3) {
                 gen_lea_modrm(env, s, modrm);
@@ -4486,60 +4617,42 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                 rm = (modrm & 7);
                 op2_offset = offsetof(CPUX86State,fpregs[rm].mmx);
             }
-        }
-        switch(b) {
-        case 0x0f: /* 3DNow! data insns */
-            val = x86_ldub_code(env, s);
-            sse_fn_epp = sse_op_table5[val];
-            if (!sse_fn_epp) {
-                goto unknown_op;
-            }
-            if (!(s->cpuid_ext2_features & CPUID_EXT2_3DNOW)) {
-                goto illegal_op;
+            if (sse_op_flags & SSE_OPF_3DNOW) {
+                /* 3DNow! data insns */
+                val = x86_ldub_code(env, s);
+                SSEFunc_0_epp op_3dnow = sse_op_table5[val];
+                if (!op_3dnow) {
+                    goto unknown_op;
+                }
+                tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
+                tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
+                op_3dnow(cpu_env, s->ptr0, s->ptr1);
+                return;
             }
-            tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
-            tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-            sse_fn_epp(cpu_env, s->ptr0, s->ptr1);
-            break;
-        case 0x70: /* pshufx insn */
-        case 0xc6: /* pshufx insn */
+        }
+        tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
+        tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
+        if (sse_op_flags & SSE_OPF_SHUF) {
             val = x86_ldub_code(env, s);
-            tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
-            tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-            /* XXX: introduce a new table? */
-            sse_fn_ppi = (SSEFunc_0_ppi)sse_fn_epp;
-            sse_fn_ppi(s->ptr0, s->ptr1, tcg_const_i32(val));
-            break;
-        case 0xc2:
-            /* compare insns, bits 7:3 (7:5 for AVX) are ignored */
-            val = x86_ldub_code(env, s) & 7;
-            sse_fn_epp = sse_op_table4[val][b1];
-
-            tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
-            tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-            sse_fn_epp(cpu_env, s->ptr0, s->ptr1);
-            break;
-        case 0xf7:
+            sse_op_fn.op1i(s->ptr0, s->ptr1, tcg_const_i32(val));
+        } else if (b == 0xf7) {
             /* maskmov : we must prepare A0 */
-            if (mod != 3)
+            if (mod != 3) {
                 goto illegal_op;
+            }
             tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]);
             gen_extu(s->aflag, s->A0);
             gen_add_A0_ds_seg(s);
-
-            tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
-            tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-            /* XXX: introduce a new table? */
-            sse_fn_eppt = (SSEFunc_0_eppt)sse_fn_epp;
-            sse_fn_eppt(cpu_env, s->ptr0, s->ptr1, s->A0);
-            break;
-        default:
-            tcg_gen_addi_ptr(s->ptr0, cpu_env, op1_offset);
-            tcg_gen_addi_ptr(s->ptr1, cpu_env, op2_offset);
-            sse_fn_epp(cpu_env, s->ptr0, s->ptr1);
-            break;
+            sse_op_fn.op1t(cpu_env, s->ptr0, s->ptr1, s->A0);
+        } else if (b == 0xc2) {
+            /* compare insns, bits 7:3 (7:5 for AVX) are ignored */
+            val = x86_ldub_code(env, s) & 7;
+            sse_op_table4[val][b1](cpu_env, s->ptr0, s->ptr1);
+        } else {
+            sse_op_fn.op1(cpu_env, s->ptr0, s->ptr1);
         }
-        if (b == 0x2e || b == 0x2f) {
+
+        if (sse_op_flags & SSE_OPF_CMP) {
             set_cc_op(s, CC_OP_EFLAGS);
         }
     }
diff --git a/target/riscv/meson.build b/target/riscv/meson.build
index 2c1975e72c..6b9435d69a 100644
--- a/target/riscv/meson.build
+++ b/target/riscv/meson.build
@@ -1,6 +1,4 @@
 # FIXME extra_args should accept files()
-dir = meson.current_source_dir()
-
 gen = [
   decodetree.process('insn16.decode', extra_args: ['--static-decode=decode_insn16', '--insnwidth=16']),
   decodetree.process('insn32.decode', extra_args: '--static-decode=decode_insn32'),
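
Note on the pattern above: across these translate.c hunks, bare function-pointer tables
(sse_fn_epp, sse_fn_eppi, SSE_SPECIAL sentinels, and opcode tests such as
b == 0x2e || b == 0x2f or b == 0x17) give way to table entries that pair each helper
with a flags word (SSE_OPF_CMP, SSE_OPF_MMX, SSE_OPF_SHUF, SSE_OPF_SPECIAL,
SSE_OPF_3DNOW), so the shared emission path tests properties instead of opcode
numbers. The standalone C sketch below illustrates only that dispatch shape; every
name and table row in it is an invented stand-in, not the actual QEMU definitions.

/*
 * Minimal sketch of flag-plus-pointer dispatch, assuming invented
 * names (OPF_*, helper_*, op_table) rather than the QEMU ones.
 */
#include <stdint.h>
#include <stdio.h>

#define OPF_CMP (1 << 0)   /* comparison op: flags register is live after it */
#define OPF_MMX (1 << 1)   /* op also exists in the 64-bit MMX form */

typedef void (*helper_fn)(void);

struct op_entry {
    uint32_t flags;
    helper_fn fn;
};

static void helper_comiss(void) { puts("exec comiss"); }
static void helper_paddb(void)  { puts("exec paddb"); }

/* One row per opcode; a NULL fn marks an undefined encoding. */
static const struct op_entry op_table[] = {
    { OPF_CMP, helper_comiss },
    { OPF_MMX, helper_paddb  },
    { 0,       NULL          },
};

static void emit_op(unsigned idx, int is_mmx)
{
    const struct op_entry *e = &op_table[idx];

    if (!e->fn) {
        puts("unknown_op");              /* plays the role of goto unknown_op */
        return;
    }
    /* Property test replaces per-opcode checks like b == 0x2e || b == 0x2f. */
    if (is_mmx && !(e->flags & OPF_MMX)) {
        puts("illegal_op");              /* no MMX form of this op */
        return;
    }
    e->fn();
    if (e->flags & OPF_CMP) {
        puts("set_cc_op(CC_OP_EFLAGS)"); /* comparison ops update EFLAGS */
    }
}

int main(void)
{
    emit_op(0, 0);   /* comiss: runs, then marks EFLAGS as live */
    emit_op(1, 1);   /* paddb with MMX operands: allowed by OPF_MMX */
    emit_op(0, 1);   /* comiss has no MMX form: rejected by the flags test */
    emit_op(2, 0);   /* empty row: undefined encoding */
    return 0;
}

The payoff visible in the diff is that properties like "updates EFLAGS" or "has an
MMX form" become data: adding or auditing an instruction touches a table row and one
generic test rather than growing the opcode switch, which is what makes the AVX
extension work in the rest of this series tractable.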