summaryrefslogtreecommitdiffstats
path: root/target/arm/mve_helper.c
diff options
context:
space:
mode:
authorPeter Maydell2021-08-13 18:11:56 +0200
committerPeter Maydell2021-08-25 11:48:50 +0200
commitdc18628b1833157a50a424cb6b83b63eca560402 (patch)
tree50c4da0d00f8f22f52efe37ebf0b6d76977d6040 /target/arm/mve_helper.c
parenttarget/arm: Implement MVE VCTP (diff)
downloadqemu-dc18628b1833157a50a424cb6b83b63eca560402.tar.gz
qemu-dc18628b1833157a50a424cb6b83b63eca560402.tar.xz
qemu-dc18628b1833157a50a424cb6b83b63eca560402.zip
target/arm: Implement MVE scatter-gather insns
Implement the MVE gather-loads and scatter-stores which form the address by adding a base value from a scalar register to an offset in each element of a vector. Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'target/arm/mve_helper.c')
-rw-r--r--target/arm/mve_helper.c129
1 files changed, 129 insertions, 0 deletions
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index 1752555a21..2b882db1c3 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -207,6 +207,135 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
#undef DO_VSTR
/*
+ * Gather loads/scatter stores. Here each element of Qm specifies
+ * an offset to use from the base register Rm. In the _os_ versions
+ * that offset is scaled by the element size.
+ * For loads, predicated lanes are zeroed instead of retaining
+ * their previous values.
+ */
+#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+ uint32_t base) \
+ { \
+ TYPE *d = vd; \
+ OFFTYPE *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ unsigned e; \
+ uint32_t addr; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
+ if (!(eci_mask & 1)) { \
+ continue; \
+ } \
+ addr = ADDRFN(base, m[H##ESIZE(e)]); \
+ d[H##ESIZE(e)] = (mask & 1) ? \
+ cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/* We know here TYPE is unsigned so always the same as the offset type */
+#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+ uint32_t base) \
+ { \
+ TYPE *d = vd; \
+ TYPE *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ uint32_t addr; \
+ for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
+ addr = ADDRFN(base, m[H##ESIZE(e)]); \
+ if (mask & 1) { \
+ cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+/*
+ * 64-bit accesses are slightly different: they are done as two 32-bit
+ * accesses, controlled by the predicate mask for the relevant beat,
+ * and with a single 32-bit offset in the first of the two Qm elements.
+ * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little).
+ */
+#define DO_VLDR64_SG(OP, ADDRFN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+ uint32_t base) \
+ { \
+ uint32_t *d = vd; \
+ uint32_t *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ uint16_t eci_mask = mve_eci_mask(env); \
+ unsigned e; \
+ uint32_t addr; \
+ for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) { \
+ if (!(eci_mask & 1)) { \
+ continue; \
+ } \
+ addr = ADDRFN(base, m[H4(e & ~1)]); \
+ addr += 4 * (e & 1); \
+ d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define DO_VSTR64_SG(OP, ADDRFN) \
+ void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm, \
+ uint32_t base) \
+ { \
+ uint32_t *d = vd; \
+ uint32_t *m = vm; \
+ uint16_t mask = mve_element_mask(env); \
+ unsigned e; \
+ uint32_t addr; \
+ for (e = 0; e < 16 / 4; e++, mask >>= 4) { \
+ addr = ADDRFN(base, m[H4(e & ~1)]); \
+ addr += 4 * (e & 1); \
+ if (mask & 1) { \
+ cpu_stl_data_ra(env, addr, d[H4(e)], GETPC()); \
+ } \
+ } \
+ mve_advance_vpt(env); \
+ }
+
+#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET))
+#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1))
+#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
+#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
+
+DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD)
+DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD)
+DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD)
+
+DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD)
+DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD)
+DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD)
+DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD)
+DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD)
+DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD)
+DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD)
+
+DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH)
+DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH)
+DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH)
+DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW)
+DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD)
+
+DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD)
+DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD)
+DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD)
+DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD)
+DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD)
+DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD)
+DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD)
+
+DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH)
+DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH)
+DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW)
+DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD)
+
+/*
* The mergemask(D, R, M) macro performs the operation "*D = R" but
* storing only the bytes which correspond to 1 bits in M,
* leaving other bytes in *D unchanged. We use _Generic