diff options
author | Richard Henderson | 2021-05-25 03:02:31 +0200 |
---|---|---|
committer | Peter Maydell | 2021-05-25 17:01:43 +0200 |
commit | 8b3f15b0a3b6d6ffbe7dfcb8dfbceb6d8da7ee6a (patch) | |
tree | 270d5ebbdd7b231ab39ac14eb46c43e50f89e2ce /target/arm/neon_helper.c | |
parent | target/arm: Implement SVE2 integer unary operations (predicated) (diff) | |
download | qemu-8b3f15b0a3b6d6ffbe7dfcb8dfbceb6d8da7ee6a.tar.gz qemu-8b3f15b0a3b6d6ffbe7dfcb8dfbceb6d8da7ee6a.tar.xz qemu-8b3f15b0a3b6d6ffbe7dfcb8dfbceb6d8da7ee6a.zip |
target/arm: Split out saturating/rounding shifts from neon
Split these operations out into a header that can be shared
between neon and sve. The "sat" pointer acts both as a boolean
for control of saturating behavior and controls the difference
in behavior between neon and sve -- QC bit or no QC bit.
Widen the shift operand in the new helpers, as the SVE2 insns treat
the whole input element as significant. For the neon uses, truncate
the shift to int8_t while passing the parameter.
Implement right-shift rounding as
tmp = src >> (shift - 1);
dst = (tmp >> 1) + (tmp & 1);
This is the same number of instructions as the current
tmp = 1 << (shift - 1);
dst = (src + tmp) >> shift;
without any possibility of intermediate overflow.
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210525010358.152808-6-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'target/arm/neon_helper.c')
-rw-r--r-- | target/arm/neon_helper.c | 519 |
1 files changed, 89 insertions, 430 deletions
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c index b637265691..338b9189d5 100644 --- a/target/arm/neon_helper.c +++ b/target/arm/neon_helper.c @@ -11,6 +11,7 @@ #include "cpu.h" #include "exec/helper-proto.h" #include "fpu/softfloat.h" +#include "vec_internal.h" #define SIGNBIT (uint32_t)0x80000000 #define SIGNBIT64 ((uint64_t)1 << 63) @@ -576,496 +577,154 @@ NEON_POP(pmax_s16, neon_s16, 2) NEON_POP(pmax_u16, neon_u16, 2) #undef NEON_FN -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp >= (ssize_t)sizeof(src1) * 8 || \ - tmp <= -(ssize_t)sizeof(src1) * 8) { \ - dest = 0; \ - } else if (tmp < 0) { \ - dest = src1 >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - }} while (0) +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) NEON_VOP(shl_u16, neon_u16, 2) #undef NEON_FN -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp >= (ssize_t)sizeof(src1) * 8) { \ - dest = 0; \ - } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ - dest = src1 >> (sizeof(src1) * 8 - 1); \ - } else if (tmp < 0) { \ - dest = src1 >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - }} while (0) +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL)) NEON_VOP(shl_s16, neon_s16, 2) #undef NEON_FN -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if ((tmp >= (ssize_t)sizeof(src1) * 8) \ - || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \ - dest = 0; \ - } else if (tmp < 0) { \ - dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - }} while (0) +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) NEON_VOP(rshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) NEON_VOP(rshl_s16, neon_s16, 2) #undef NEON_FN -/* The addition of the rounding constant may overflow, so we use an - * intermediate 64 bit accumulator. */ -uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop) +uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift) { - int32_t dest; - int32_t val = (int32_t)valop; - int8_t shift = (int8_t)shiftop; - if ((shift >= 32) || (shift <= -32)) { - dest = 0; - } else if (shift < 0) { - int64_t big_dest = ((int64_t)val + (1 << (-1 - shift))); - dest = big_dest >> -shift; - } else { - dest = val << shift; - } - return dest; + return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL); } -/* Handling addition overflow with 64 bit input values is more - * tricky than with 32 bit values. */ -uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) -{ - int8_t shift = (int8_t)shiftop; - int64_t val = valop; - if ((shift >= 64) || (shift <= -64)) { - val = 0; - } else if (shift < 0) { - val >>= (-shift - 1); - if (val == INT64_MAX) { - /* In this case, it means that the rounding constant is 1, - * and the addition would overflow. Return the actual - * result directly. */ - val = 0x4000000000000000LL; - } else { - val++; - val >>= 1; - } - } else { - val <<= shift; - } - return val; +uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, true, NULL); } -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp >= (ssize_t)sizeof(src1) * 8 || \ - tmp < -(ssize_t)sizeof(src1) * 8) { \ - dest = 0; \ - } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ - dest = src1 >> (-tmp - 1); \ - } else if (tmp < 0) { \ - dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - }} while (0) +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL)) NEON_VOP(rshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL)) NEON_VOP(rshl_u16, neon_u16, 2) #undef NEON_FN -/* The addition of the rounding constant may overflow, so we use an - * intermediate 64 bit accumulator. */ -uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop) +uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift) { - uint32_t dest; - int8_t shift = (int8_t)shiftop; - if (shift >= 32 || shift < -32) { - dest = 0; - } else if (shift == -32) { - dest = val >> 31; - } else if (shift < 0) { - uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift))); - dest = big_dest >> -shift; - } else { - dest = val << shift; - } - return dest; + return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL); } -/* Handling addition overflow with 64 bit input values is more - * tricky than with 32 bit values. */ -uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop) -{ - int8_t shift = (uint8_t)shiftop; - if (shift >= 64 || shift < -64) { - val = 0; - } else if (shift == -64) { - /* Rounding a 1-bit result just preserves that bit. */ - val >>= 63; - } else if (shift < 0) { - val >>= (-shift - 1); - if (val == UINT64_MAX) { - /* In this case, it means that the rounding constant is 1, - * and the addition would overflow. Return the actual - * result directly. */ - val = 0x8000000000000000ULL; - } else { - val++; - val >>= 1; - } - } else { - val <<= shift; - } - return val; +uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, true, NULL); } -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp >= (ssize_t)sizeof(src1) * 8) { \ - if (src1) { \ - SET_QC(); \ - dest = ~0; \ - } else { \ - dest = 0; \ - } \ - } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ - dest = 0; \ - } else if (tmp < 0) { \ - dest = src1 >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - if ((dest >> tmp) != src1) { \ - SET_QC(); \ - dest = ~0; \ - } \ - }} while (0) +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) NEON_VOP_ENV(qshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) NEON_VOP_ENV(qshl_u16, neon_u16, 2) -NEON_VOP_ENV(qshl_u32, neon_u32, 1) #undef NEON_FN -uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop) -{ - int8_t shift = (int8_t)shiftop; - if (shift >= 64) { - if (val) { - val = ~(uint64_t)0; - SET_QC(); - } - } else if (shift <= -64) { - val = 0; - } else if (shift < 0) { - val >>= -shift; - } else { - uint64_t tmp = val; - val <<= shift; - if ((val >> shift) != tmp) { - SET_QC(); - val = ~(uint64_t)0; - } - } - return val; +uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); } -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp >= (ssize_t)sizeof(src1) * 8) { \ - if (src1) { \ - SET_QC(); \ - dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \ - if (src1 > 0) { \ - dest--; \ - } \ - } else { \ - dest = src1; \ - } \ - } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ - dest = src1 >> 31; \ - } else if (tmp < 0) { \ - dest = src1 >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - if ((dest >> tmp) != src1) { \ - SET_QC(); \ - dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \ - if (src1 > 0) { \ - dest--; \ - } \ - } \ - }} while (0) +uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) NEON_VOP_ENV(qshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) NEON_VOP_ENV(qshl_s16, neon_s16, 2) -NEON_VOP_ENV(qshl_s32, neon_s32, 1) #undef NEON_FN -uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop) -{ - int8_t shift = (uint8_t)shiftop; - int64_t val = valop; - if (shift >= 64) { - if (val) { - SET_QC(); - val = (val >> 63) ^ ~SIGNBIT64; - } - } else if (shift <= -64) { - val >>= 63; - } else if (shift < 0) { - val >>= -shift; - } else { - int64_t tmp = val; - val <<= shift; - if ((val >> shift) != tmp) { - SET_QC(); - val = (tmp >> 63) ^ ~SIGNBIT64; - } - } - return val; +uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) +{ + return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); } -#define NEON_FN(dest, src1, src2) do { \ - if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \ - SET_QC(); \ - dest = 0; \ - } else { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp >= (ssize_t)sizeof(src1) * 8) { \ - if (src1) { \ - SET_QC(); \ - dest = ~0; \ - } else { \ - dest = 0; \ - } \ - } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ - dest = 0; \ - } else if (tmp < 0) { \ - dest = src1 >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - if ((dest >> tmp) != src1) { \ - SET_QC(); \ - dest = ~0; \ - } \ - } \ - }} while (0) -NEON_VOP_ENV(qshlu_s8, neon_u8, 4) -NEON_VOP_ENV(qshlu_s16, neon_u16, 2) +uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc); +} + +#define NEON_FN(dest, src1, src2) \ + (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc)) +NEON_VOP_ENV(qshlu_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc)) +NEON_VOP_ENV(qshlu_s16, neon_s16, 2) #undef NEON_FN -uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop) +uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift) { - if ((int32_t)valop < 0) { - SET_QC(); - return 0; - } - return helper_neon_qshl_u32(env, valop, shiftop); + return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc); } -uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop) +uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift) { - if ((int64_t)valop < 0) { - SET_QC(); - return 0; - } - return helper_neon_qshl_u64(env, valop, shiftop); + return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc); } -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp >= (ssize_t)sizeof(src1) * 8) { \ - if (src1) { \ - SET_QC(); \ - dest = ~0; \ - } else { \ - dest = 0; \ - } \ - } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \ - dest = 0; \ - } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ - dest = src1 >> (sizeof(src1) * 8 - 1); \ - } else if (tmp < 0) { \ - dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - if ((dest >> tmp) != src1) { \ - SET_QC(); \ - dest = ~0; \ - } \ - }} while (0) +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) NEON_VOP_ENV(qrshl_u8, neon_u8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) NEON_VOP_ENV(qrshl_u16, neon_u16, 2) #undef NEON_FN -/* The addition of the rounding constant may overflow, so we use an - * intermediate 64 bit accumulator. */ -uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop) +uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift) { - uint32_t dest; - int8_t shift = (int8_t)shiftop; - if (shift >= 32) { - if (val) { - SET_QC(); - dest = ~0; - } else { - dest = 0; - } - } else if (shift < -32) { - dest = 0; - } else if (shift == -32) { - dest = val >> 31; - } else if (shift < 0) { - uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift))); - dest = big_dest >> -shift; - } else { - dest = val << shift; - if ((dest >> shift) != val) { - SET_QC(); - dest = ~0; - } - } - return dest; + return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); } -/* Handling addition overflow with 64 bit input values is more - * tricky than with 32 bit values. */ -uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop) -{ - int8_t shift = (int8_t)shiftop; - if (shift >= 64) { - if (val) { - SET_QC(); - val = ~0; - } - } else if (shift < -64) { - val = 0; - } else if (shift == -64) { - val >>= 63; - } else if (shift < 0) { - val >>= (-shift - 1); - if (val == UINT64_MAX) { - /* In this case, it means that the rounding constant is 1, - * and the addition would overflow. Return the actual - * result directly. */ - val = 0x8000000000000000ULL; - } else { - val++; - val >>= 1; - } - } else { \ - uint64_t tmp = val; - val <<= shift; - if ((val >> shift) != tmp) { - SET_QC(); - val = ~0; - } - } - return val; +uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc); } -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp >= (ssize_t)sizeof(src1) * 8) { \ - if (src1) { \ - SET_QC(); \ - dest = (typeof(dest))(1 << (sizeof(src1) * 8 - 1)); \ - if (src1 > 0) { \ - dest--; \ - } \ - } else { \ - dest = 0; \ - } \ - } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \ - dest = 0; \ - } else if (tmp < 0) { \ - dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - if ((dest >> tmp) != src1) { \ - SET_QC(); \ - dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \ - if (src1 > 0) { \ - dest--; \ - } \ - } \ - }} while (0) +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc)) NEON_VOP_ENV(qrshl_s8, neon_s8, 4) +#undef NEON_FN + +#define NEON_FN(dest, src1, src2) \ + (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc)) NEON_VOP_ENV(qrshl_s16, neon_s16, 2) #undef NEON_FN -/* The addition of the rounding constant may overflow, so we use an - * intermediate 64 bit accumulator. */ -uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop) +uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift) { - int32_t dest; - int32_t val = (int32_t)valop; - int8_t shift = (int8_t)shiftop; - if (shift >= 32) { - if (val) { - SET_QC(); - dest = (val >> 31) ^ ~SIGNBIT; - } else { - dest = 0; - } - } else if (shift <= -32) { - dest = 0; - } else if (shift < 0) { - int64_t big_dest = ((int64_t)val + (1 << (-1 - shift))); - dest = big_dest >> -shift; - } else { - dest = val << shift; - if ((dest >> shift) != val) { - SET_QC(); - dest = (val >> 31) ^ ~SIGNBIT; - } - } - return dest; + return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc); } -/* Handling addition overflow with 64 bit input values is more - * tricky than with 32 bit values. */ -uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop) -{ - int8_t shift = (uint8_t)shiftop; - int64_t val = valop; - - if (shift >= 64) { - if (val) { - SET_QC(); - val = (val >> 63) ^ ~SIGNBIT64; - } - } else if (shift <= -64) { - val = 0; - } else if (shift < 0) { - val >>= (-shift - 1); - if (val == INT64_MAX) { - /* In this case, it means that the rounding constant is 1, - * and the addition would overflow. Return the actual - * result directly. */ - val = 0x4000000000000000ULL; - } else { - val++; - val >>= 1; - } - } else { - int64_t tmp = val; - val <<= shift; - if ((val >> shift) != tmp) { - SET_QC(); - val = (tmp >> 63) ^ ~SIGNBIT64; - } - } - return val; +uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift) +{ + return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc); } uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) |