author     Peter Maydell    2021-08-13 18:11:56 +0200
committer  Peter Maydell    2021-08-25 11:48:50 +0200
commit     dc18628b1833157a50a424cb6b83b63eca560402 (patch)
tree       50c4da0d00f8f22f52efe37ebf0b6d76977d6040 /target/arm/mve_helper.c
parent     target/arm: Implement MVE VCTP (diff)
target/arm: Implement MVE scatter-gather insns
Implement the MVE gather-loads and scatter-stores which
form the address by adding a base value from a scalar
register to an offset in each element of a vector.
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
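
As an illustration of the addressing described in the commit message, here is a minimal, self-contained C sketch of what a scaled-offset ("_os_") 32-bit gather load computes per lane. The function and parameter names below are purely illustrative and are not part of the patch; the real helpers follow in the diff.

```c
#include <stdint.h>
#include <string.h>

/*
 * Illustrative scalar model (not QEMU code) of a VLDRW-style gather load
 * with offsets scaled by the element size: each lane loads from
 * base + (offset << 2), and lanes whose predicate bit is clear are
 * written as zero, matching the load behaviour described in the patch.
 */
static void model_vldrw_sg_os(uint32_t dest[4], const uint32_t offsets[4],
                              const uint8_t *mem, uint32_t base,
                              uint8_t lane_predicate)
{
    for (unsigned e = 0; e < 4; e++) {
        uint32_t addr = base + (offsets[e] << 2);   /* cf. ADDR_ADD_OSW */
        uint32_t val = 0;
        if (lane_predicate & (1u << e)) {
            memcpy(&val, mem + addr, sizeof(val));  /* little-endian assumed */
        }
        dest[e] = val;
    }
}
```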
Diffstat (limited to 'target/arm/mve_helper.c')
-rw-r--r--  target/arm/mve_helper.c | 129
1 file changed, 129 insertions(+), 0 deletions(-)
```diff
diff --git a/target/arm/mve_helper.c b/target/arm/mve_helper.c
index 1752555a21..2b882db1c3 100644
--- a/target/arm/mve_helper.c
+++ b/target/arm/mve_helper.c
@@ -207,6 +207,135 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
 #undef DO_VSTR
 
 /*
+ * Gather loads/scatter stores. Here each element of Qm specifies
+ * an offset to use from the base register Rm. In the _os_ versions
+ * that offset is scaled by the element size.
+ * For loads, predicated lanes are zeroed instead of retaining
+ * their previous values.
+ */
+#define DO_VLDR_SG(OP, LDTYPE, ESIZE, TYPE, OFFTYPE, ADDRFN)           \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm,        \
+                          uint32_t base)                               \
+    {                                                                  \
+        TYPE *d = vd;                                                  \
+        OFFTYPE *m = vm;                                               \
+        uint16_t mask = mve_element_mask(env);                         \
+        uint16_t eci_mask = mve_eci_mask(env);                         \
+        unsigned e;                                                    \
+        uint32_t addr;                                                 \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE, eci_mask >>= ESIZE) { \
+            if (!(eci_mask & 1)) {                                     \
+                continue;                                              \
+            }                                                          \
+            addr = ADDRFN(base, m[H##ESIZE(e)]);                       \
+            d[H##ESIZE(e)] = (mask & 1) ?                              \
+                cpu_##LDTYPE##_data_ra(env, addr, GETPC()) : 0;        \
+        }                                                              \
+        mve_advance_vpt(env);                                          \
+    }
+
+/* We know here TYPE is unsigned so always the same as the offset type */
+#define DO_VSTR_SG(OP, STTYPE, ESIZE, TYPE, ADDRFN)                    \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm,        \
+                          uint32_t base)                               \
+    {                                                                  \
+        TYPE *d = vd;                                                  \
+        TYPE *m = vm;                                                  \
+        uint16_t mask = mve_element_mask(env);                         \
+        unsigned e;                                                    \
+        uint32_t addr;                                                 \
+        for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) {             \
+            addr = ADDRFN(base, m[H##ESIZE(e)]);                       \
+            if (mask & 1) {                                            \
+                cpu_##STTYPE##_data_ra(env, addr, d[H##ESIZE(e)], GETPC()); \
+            }                                                          \
+        }                                                              \
+        mve_advance_vpt(env);                                          \
+    }
+
+/*
+ * 64-bit accesses are slightly different: they are done as two 32-bit
+ * accesses, controlled by the predicate mask for the relevant beat,
+ * and with a single 32-bit offset in the first of the two Qm elements.
+ * Note that for QEMU our IMPDEF AIRCR.ENDIANNESS is always 0 (little).
+ */
+#define DO_VLDR64_SG(OP, ADDRFN)                                       \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm,        \
+                          uint32_t base)                               \
+    {                                                                  \
+        uint32_t *d = vd;                                              \
+        uint32_t *m = vm;                                              \
+        uint16_t mask = mve_element_mask(env);                         \
+        uint16_t eci_mask = mve_eci_mask(env);                         \
+        unsigned e;                                                    \
+        uint32_t addr;                                                 \
+        for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) {     \
+            if (!(eci_mask & 1)) {                                     \
+                continue;                                              \
+            }                                                          \
+            addr = ADDRFN(base, m[H4(e & ~1)]);                        \
+            addr += 4 * (e & 1);                                       \
+            d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0; \
+        }                                                              \
+        mve_advance_vpt(env);                                          \
+    }
+
+#define DO_VSTR64_SG(OP, ADDRFN)                                       \
+    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm,        \
+                          uint32_t base)                               \
+    {                                                                  \
+        uint32_t *d = vd;                                              \
+        uint32_t *m = vm;                                              \
+        uint16_t mask = mve_element_mask(env);                         \
+        unsigned e;                                                    \
+        uint32_t addr;                                                 \
+        for (e = 0; e < 16 / 4; e++, mask >>= 4) {                     \
+            addr = ADDRFN(base, m[H4(e & ~1)]);                        \
+            addr += 4 * (e & 1);                                       \
+            if (mask & 1) {                                            \
+                cpu_stl_data_ra(env, addr, d[H4(e)], GETPC());         \
+            }                                                          \
+        }                                                              \
+        mve_advance_vpt(env);                                          \
+    }
+
+#define ADDR_ADD(BASE, OFFSET) ((BASE) + (OFFSET))
+#define ADDR_ADD_OSH(BASE, OFFSET) ((BASE) + ((OFFSET) << 1))
+#define ADDR_ADD_OSW(BASE, OFFSET) ((BASE) + ((OFFSET) << 2))
+#define ADDR_ADD_OSD(BASE, OFFSET) ((BASE) + ((OFFSET) << 3))
+
+DO_VLDR_SG(vldrb_sg_sh, ldsb, 2, int16_t, uint16_t, ADDR_ADD)
+DO_VLDR_SG(vldrb_sg_sw, ldsb, 4, int32_t, uint32_t, ADDR_ADD)
+DO_VLDR_SG(vldrh_sg_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD)
+
+DO_VLDR_SG(vldrb_sg_ub, ldub, 1, uint8_t, uint8_t, ADDR_ADD)
+DO_VLDR_SG(vldrb_sg_uh, ldub, 2, uint16_t, uint16_t, ADDR_ADD)
+DO_VLDR_SG(vldrb_sg_uw, ldub, 4, uint32_t, uint32_t, ADDR_ADD)
+DO_VLDR_SG(vldrh_sg_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD)
+DO_VLDR_SG(vldrh_sg_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD)
+DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD)
+DO_VLDR64_SG(vldrd_sg_ud, ADDR_ADD)
+
+DO_VLDR_SG(vldrh_sg_os_sw, ldsw, 4, int32_t, uint32_t, ADDR_ADD_OSH)
+DO_VLDR_SG(vldrh_sg_os_uh, lduw, 2, uint16_t, uint16_t, ADDR_ADD_OSH)
+DO_VLDR_SG(vldrh_sg_os_uw, lduw, 4, uint32_t, uint32_t, ADDR_ADD_OSH)
+DO_VLDR_SG(vldrw_sg_os_uw, ldl, 4, uint32_t, uint32_t, ADDR_ADD_OSW)
+DO_VLDR64_SG(vldrd_sg_os_ud, ADDR_ADD_OSD)
+
+DO_VSTR_SG(vstrb_sg_ub, stb, 1, uint8_t, ADDR_ADD)
+DO_VSTR_SG(vstrb_sg_uh, stb, 2, uint16_t, ADDR_ADD)
+DO_VSTR_SG(vstrb_sg_uw, stb, 4, uint32_t, ADDR_ADD)
+DO_VSTR_SG(vstrh_sg_uh, stw, 2, uint16_t, ADDR_ADD)
+DO_VSTR_SG(vstrh_sg_uw, stw, 4, uint32_t, ADDR_ADD)
+DO_VSTR_SG(vstrw_sg_uw, stl, 4, uint32_t, ADDR_ADD)
+DO_VSTR64_SG(vstrd_sg_ud, ADDR_ADD)
+
+DO_VSTR_SG(vstrh_sg_os_uh, stw, 2, uint16_t, ADDR_ADD_OSH)
+DO_VSTR_SG(vstrh_sg_os_uw, stw, 4, uint32_t, ADDR_ADD_OSH)
+DO_VSTR_SG(vstrw_sg_os_uw, stl, 4, uint32_t, ADDR_ADD_OSW)
+DO_VSTR64_SG(vstrd_sg_os_ud, ADDR_ADD_OSD)
+
+/*
  * The mergemask(D, R, M) macro performs the operation "*D = R" but
  * storing only the bytes which correspond to 1 bits in M,
  * leaving other bytes in *D unchanged. We use _Generic
```
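
For reference, expanding the vldrw_sg_uw instantiation of DO_VLDR_SG above by hand gives roughly the following helper (whitespace and the comments are mine; it still depends on the usual QEMU MVE helper environment):

```c
/*
 * Hand expansion of DO_VLDR_SG(vldrw_sg_uw, ldl, 4, uint32_t, uint32_t,
 * ADDR_ADD) from the patch above, shown for illustration only.
 */
void HELPER(mve_vldrw_sg_uw)(CPUARMState *env, void *vd, void *vm,
                             uint32_t base)
{
    uint32_t *d = vd;
    uint32_t *m = vm;
    uint16_t mask = mve_element_mask(env);
    uint16_t eci_mask = mve_eci_mask(env);
    unsigned e;
    uint32_t addr;
    for (e = 0; e < 16 / 4; e++, mask >>= 4, eci_mask >>= 4) {
        if (!(eci_mask & 1)) {
            continue;                  /* beat not executed at all under ECI */
        }
        addr = base + m[H4(e)];        /* ADDR_ADD(base, m[H4(e)]) */
        /* predicated-off lanes load zero rather than keeping old data */
        d[H4(e)] = (mask & 1) ? cpu_ldl_data_ra(env, addr, GETPC()) : 0;
    }
    mve_advance_vpt(env);
}
```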