target/arm: Speed up aarch64 TBL/TBX

Always perform one call instead of two for 16-byte operands. Use byte loads/stores directly into the vector register file instead of extractions and deposits to a 64-bit local variable. In order to easily receive pointers into the vector register file, convert the helper to the gvec out-of-line signature. Move the helper into vec_helper.c, where it can make use of H1 and clear_tail. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Tested-by: Alex Bennée <alex.bennee@linaro.org> Message-id: 20210224230532.276878-1-richard.henderson@linaro.org Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
author: Richard Henderson 2021-02-25 00:05:32 +0100
committer: Peter Maydell 2021-03-05 16:17:34 +0100
commit: 519183d3fee58e52f7b51cf146c9dc9edc565059 (patch)
tree: a69f2eb4218c6fcf7fbd4908661eaf23d129d836 /target/arm/vec_helper.c
parent: hw/arm/xlnx-zynqmp: Remove obsolete 'has_rpu' property (diff)
download: qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.tar.gz
qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.tar.xz
qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.zip
1 files changed, 48 insertions, 0 deletions
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 7174030377..3fbeae87cb 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -1937,3 +1937,51 @@ DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
 DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
 
 #undef DO_VRINT_RMODE
+
+#ifdef TARGET_AARCH64
+void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
+{   /* TBL/TBX: each index byte in vm selects one table byte for vd */
+    const uint8_t *indices = vm;    /* one table index per result byte */
+    CPUARMState *env = venv;        /* used only to locate table registers */
+    size_t oprsz = simd_oprsz(desc);    /* number of result bytes to produce */
+    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);     /* first table reg */
+    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1); /* else TBL */
+    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);    /* in bytes */
+    union {
+        uint8_t b[16];
+        uint64_t d[2];  /* not referenced below; presumably for alignment */
+    } result;           /* staging copy of the 16 output bytes */
+
+    /*
+     * We must construct the final result in a temp, lest the output
+     * overlap the input table.  For TBL, begin with zero; for TBX, begin
+     * with the original register contents; out-of-range indices keep
+     * that value.  We always copy 16 bytes here to avoid an extra branch;
+     * clearing the high bits for oprsz == 8 is handled below.
+     */
+    if (is_tbx) {
+        memcpy(&result, vd, 16);
+    } else {
+        memset(&result, 0, 16);
+    }
+
+    for (size_t i = 0; i < oprsz; ++i) {
+        uint32_t index = indices[H1(i)];
+
+        if (index < table_len) {    /* otherwise leave the initial byte */
+            /*
+             * Index is a byte offset into the virtual table, a series
+             * of 128-bit vectors concatenated: index >> 4 picks the
+             * register (mod 32, as the table can wrap around from V31
+             * to V0) and index % 16 picks the byte within it.
+             */
+            const uint8_t *table = (const uint8_t *)
+                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
+            result.b[H1(i)] = table[H1(index % 16)];
+        }
+    }
+
+    memcpy(vd, &result, 16);    /* write back only once all lookups are done */
+    clear_tail(vd, oprsz, simd_maxsz(desc));    /* see note on oprsz == 8 */
+}
+#endif /* TARGET_AARCH64 */
author	Richard Henderson	2021-02-25 00:05:32 +0100
committer	Peter Maydell	2021-03-05 16:17:34 +0100
commit	519183d3fee58e52f7b51cf146c9dc9edc565059 (patch)
tree	a69f2eb4218c6fcf7fbd4908661eaf23d129d836 /target/arm/vec_helper.c
parent	hw/arm/xlnx-zynqmp: Remove obsolete 'has_rpu' property (diff)
download	qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.tar.gz qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.tar.xz qemu-519183d3fee58e52f7b51cf146c9dc9edc565059.zip