diff options
author | Richard Henderson | 2021-02-25 00:05:32 +0100 |
---|---|---|
committer | Peter Maydell | 2021-03-05 16:17:34 +0100 |
commit | 519183d3fee58e52f7b51cf146c9dc9edc565059 (patch) | |
tree | a69f2eb4218c6fcf7fbd4908661eaf23d129d836 /target/arm/vec_helper.c | |
parent | hw/arm/xlnx-zynqmp: Remove obsolete 'has_rpu' property (diff) | |
download | qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.tar.gz qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.tar.xz qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.zip |
target/arm: Speed up aarch64 TBL/TBX
Always perform one call instead of two for 16-byte operands.
Use byte loads/stores directly into the vector register file
instead of extractions and deposits to a 64-bit local variable.
In order to easily receive pointers into the vector register file,
convert the helper to the gvec out-of-line signature. Move the
helper into vec_helper.c, where it can make use of H1 and clear_tail.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Tested-by: Alex Bennée <alex.bennee@linaro.org>
Message-id: 20210224230532.276878-1-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'target/arm/vec_helper.c')
-rw-r--r-- | target/arm/vec_helper.c | 48 |
1 files changed, 48 insertions, 0 deletions
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c index 7174030377..3fbeae87cb 100644 --- a/target/arm/vec_helper.c +++ b/target/arm/vec_helper.c @@ -1937,3 +1937,51 @@ DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t) DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t) #undef DO_VRINT_RMODE + +#ifdef TARGET_AARCH64 +void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc) +{ + const uint8_t *indices = vm; + CPUARMState *env = venv; + size_t oprsz = simd_oprsz(desc); + uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5); + bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); + uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6); + union { + uint8_t b[16]; + uint64_t d[2]; + } result; + + /* + * We must construct the final result in a temp, lest the output + * overlaps the input table. For TBL, begin with zero; for TBX, + * begin with the original register contents. Note that we always + * copy 16 bytes here to avoid an extra branch; clearing the high + * bits of the register for oprsz == 8 is handled below. + */ + if (is_tbx) { + memcpy(&result, vd, 16); + } else { + memset(&result, 0, 16); + } + + for (size_t i = 0; i < oprsz; ++i) { + uint32_t index = indices[H1(i)]; + + if (index < table_len) { + /* + * Convert index (a byte offset into the virtual table + * which is a series of 128-bit vectors concatenated) + * into the correct register element, bearing in mind + * that the table can wrap around from V31 to V0. + */ + const uint8_t *table = (const uint8_t *) + aa64_vfp_qreg(env, (rn + (index >> 4)) % 32); + result.b[H1(i)] = table[H1(index % 16)]; + } + } + + memcpy(vd, &result, 16); + clear_tail(vd, oprsz, simd_maxsz(desc)); +} +#endif |