summaryrefslogtreecommitdiffstats
path: root/target/arm/neon_helper.c
diff options
context:
space:
mode:
authorRichard Henderson2021-05-25 03:02:31 +0200
committerPeter Maydell2021-05-25 17:01:43 +0200
commit8b3f15b0a3b6d6ffbe7dfcb8dfbceb6d8da7ee6a (patch)
tree270d5ebbdd7b231ab39ac14eb46c43e50f89e2ce /target/arm/neon_helper.c
parenttarget/arm: Implement SVE2 integer unary operations (predicated) (diff)
downloadqemu-8b3f15b0a3b6d6ffbe7dfcb8dfbceb6d8da7ee6a.tar.gz
qemu-8b3f15b0a3b6d6ffbe7dfcb8dfbceb6d8da7ee6a.tar.xz
qemu-8b3f15b0a3b6d6ffbe7dfcb8dfbceb6d8da7ee6a.zip
target/arm: Split out saturating/rounding shifts from neon
Split these operations out into a header that can be shared between neon and sve. The "sat" pointer acts both as a boolean for control of saturating behavior and controls the difference in behavior between neon and sve -- QC bit or no QC bit. Widen the shift operand in the new helpers, as the SVE2 insns treat the whole input element as significant. For the neon uses, truncate the shift to int8_t while passing the parameter. Implement right-shift rounding as tmp = src >> (shift - 1); dst = (tmp >> 1) + (tmp & 1); This is the same number of instructions as the current tmp = 1 << (shift - 1); dst = (src + tmp) >> shift; without any possibility of intermediate overflow. Reviewed-by: Peter Maydell <peter.maydell@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> Message-id: 20210525010358.152808-6-richard.henderson@linaro.org Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Diffstat (limited to 'target/arm/neon_helper.c')
-rw-r--r--target/arm/neon_helper.c519
1 files changed, 89 insertions, 430 deletions
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
index b637265691..338b9189d5 100644
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -11,6 +11,7 @@
#include "cpu.h"
#include "exec/helper-proto.h"
#include "fpu/softfloat.h"
+#include "vec_internal.h"
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)
@@ -576,496 +577,154 @@ NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8 || \
- tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> (sizeof(src1) * 8 - 1); \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if ((tmp >= (ssize_t)sizeof(src1) * 8) \
- || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator. */
-uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
+uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
- int32_t dest;
- int32_t val = (int32_t)valop;
- int8_t shift = (int8_t)shiftop;
- if ((shift >= 32) || (shift <= -32)) {
- dest = 0;
- } else if (shift < 0) {
- int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
- dest = big_dest >> -shift;
- } else {
- dest = val << shift;
- }
- return dest;
+ return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values. */
-uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
-{
- int8_t shift = (int8_t)shiftop;
- int64_t val = valop;
- if ((shift >= 64) || (shift <= -64)) {
- val = 0;
- } else if (shift < 0) {
- val >>= (-shift - 1);
- if (val == INT64_MAX) {
- /* In this case, it means that the rounding constant is 1,
- * and the addition would overflow. Return the actual
- * result directly. */
- val = 0x4000000000000000LL;
- } else {
- val++;
- val >>= 1;
- }
- } else {
- val <<= shift;
- }
- return val;
+uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
+{
+ return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8 || \
- tmp < -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> (-tmp - 1); \
- } else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator. */
-uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
+uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
- uint32_t dest;
- int8_t shift = (int8_t)shiftop;
- if (shift >= 32 || shift < -32) {
- dest = 0;
- } else if (shift == -32) {
- dest = val >> 31;
- } else if (shift < 0) {
- uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
- dest = big_dest >> -shift;
- } else {
- dest = val << shift;
- }
- return dest;
+ return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values. */
-uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
-{
- int8_t shift = (uint8_t)shiftop;
- if (shift >= 64 || shift < -64) {
- val = 0;
- } else if (shift == -64) {
- /* Rounding a 1-bit result just preserves that bit. */
- val >>= 63;
- } else if (shift < 0) {
- val >>= (-shift - 1);
- if (val == UINT64_MAX) {
- /* In this case, it means that the rounding constant is 1,
- * and the addition would overflow. Return the actual
- * result directly. */
- val = 0x8000000000000000ULL;
- } else {
- val++;
- val >>= 1;
- }
- } else {
- val <<= shift;
- }
- return val;
+uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
+{
+ return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = ~0; \
- } else { \
- dest = 0; \
- } \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = ~0; \
- } \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
-NEON_VOP_ENV(qshl_u32, neon_u32, 1)
#undef NEON_FN
-uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
-{
- int8_t shift = (int8_t)shiftop;
- if (shift >= 64) {
- if (val) {
- val = ~(uint64_t)0;
- SET_QC();
- }
- } else if (shift <= -64) {
- val = 0;
- } else if (shift < 0) {
- val >>= -shift;
- } else {
- uint64_t tmp = val;
- val <<= shift;
- if ((val >> shift) != tmp) {
- SET_QC();
- val = ~(uint64_t)0;
- }
- }
- return val;
+uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
+{
+ return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
- if (src1 > 0) { \
- dest--; \
- } \
- } else { \
- dest = src1; \
- } \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> 31; \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
- if (src1 > 0) { \
- dest--; \
- } \
- } \
- }} while (0)
+uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
-NEON_VOP_ENV(qshl_s32, neon_s32, 1)
#undef NEON_FN
-uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
-{
- int8_t shift = (uint8_t)shiftop;
- int64_t val = valop;
- if (shift >= 64) {
- if (val) {
- SET_QC();
- val = (val >> 63) ^ ~SIGNBIT64;
- }
- } else if (shift <= -64) {
- val >>= 63;
- } else if (shift < 0) {
- val >>= -shift;
- } else {
- int64_t tmp = val;
- val <<= shift;
- if ((val >> shift) != tmp) {
- SET_QC();
- val = (tmp >> 63) ^ ~SIGNBIT64;
- }
- }
- return val;
+uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
+{
+ return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}
-#define NEON_FN(dest, src1, src2) do { \
- if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
- SET_QC(); \
- dest = 0; \
- } else { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = ~0; \
- } else { \
- dest = 0; \
- } \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = src1 >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = ~0; \
- } \
- } \
- }} while (0)
-NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
-NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
+uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
+}
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
+NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
+NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN
-uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
+uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
- if ((int32_t)valop < 0) {
- SET_QC();
- return 0;
- }
- return helper_neon_qshl_u32(env, valop, shiftop);
+ return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}
-uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
+uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
- if ((int64_t)valop < 0) {
- SET_QC();
- return 0;
- }
- return helper_neon_qshl_u64(env, valop, shiftop);
+ return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = ~0; \
- } else { \
- dest = 0; \
- } \
- } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
- dest = src1 >> (sizeof(src1) * 8 - 1); \
- } else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = ~0; \
- } \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator. */
-uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop)
+uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
- uint32_t dest;
- int8_t shift = (int8_t)shiftop;
- if (shift >= 32) {
- if (val) {
- SET_QC();
- dest = ~0;
- } else {
- dest = 0;
- }
- } else if (shift < -32) {
- dest = 0;
- } else if (shift == -32) {
- dest = val >> 31;
- } else if (shift < 0) {
- uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
- dest = big_dest >> -shift;
- } else {
- dest = val << shift;
- if ((dest >> shift) != val) {
- SET_QC();
- dest = ~0;
- }
- }
- return dest;
+ return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values. */
-uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
-{
- int8_t shift = (int8_t)shiftop;
- if (shift >= 64) {
- if (val) {
- SET_QC();
- val = ~0;
- }
- } else if (shift < -64) {
- val = 0;
- } else if (shift == -64) {
- val >>= 63;
- } else if (shift < 0) {
- val >>= (-shift - 1);
- if (val == UINT64_MAX) {
- /* In this case, it means that the rounding constant is 1,
- * and the addition would overflow. Return the actual
- * result directly. */
- val = 0x8000000000000000ULL;
- } else {
- val++;
- val >>= 1;
- }
- } else { \
- uint64_t tmp = val;
- val <<= shift;
- if ((val >> shift) != tmp) {
- SET_QC();
- val = ~0;
- }
- }
- return val;
+uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
-#define NEON_FN(dest, src1, src2) do { \
- int8_t tmp; \
- tmp = (int8_t)src2; \
- if (tmp >= (ssize_t)sizeof(src1) * 8) { \
- if (src1) { \
- SET_QC(); \
- dest = (typeof(dest))(1 << (sizeof(src1) * 8 - 1)); \
- if (src1 > 0) { \
- dest--; \
- } \
- } else { \
- dest = 0; \
- } \
- } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
- dest = 0; \
- } else if (tmp < 0) { \
- dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
- } else { \
- dest = src1 << tmp; \
- if ((dest >> tmp) != src1) { \
- SET_QC(); \
- dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
- if (src1 > 0) { \
- dest--; \
- } \
- } \
- }} while (0)
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+ (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN
-/* The addition of the rounding constant may overflow, so we use an
- * intermediate 64 bit accumulator. */
-uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
+uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
- int32_t dest;
- int32_t val = (int32_t)valop;
- int8_t shift = (int8_t)shiftop;
- if (shift >= 32) {
- if (val) {
- SET_QC();
- dest = (val >> 31) ^ ~SIGNBIT;
- } else {
- dest = 0;
- }
- } else if (shift <= -32) {
- dest = 0;
- } else if (shift < 0) {
- int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
- dest = big_dest >> -shift;
- } else {
- dest = val << shift;
- if ((dest >> shift) != val) {
- SET_QC();
- dest = (val >> 31) ^ ~SIGNBIT;
- }
- }
- return dest;
+ return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}
-/* Handling addition overflow with 64 bit input values is more
- * tricky than with 32 bit values. */
-uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
-{
- int8_t shift = (uint8_t)shiftop;
- int64_t val = valop;
-
- if (shift >= 64) {
- if (val) {
- SET_QC();
- val = (val >> 63) ^ ~SIGNBIT64;
- }
- } else if (shift <= -64) {
- val = 0;
- } else if (shift < 0) {
- val >>= (-shift - 1);
- if (val == INT64_MAX) {
- /* In this case, it means that the rounding constant is 1,
- * and the addition would overflow. Return the actual
- * result directly. */
- val = 0x4000000000000000ULL;
- } else {
- val++;
- val >>= 1;
- }
- } else {
- int64_t tmp = val;
- val <<= shift;
- if ((val >> shift) != tmp) {
- SET_QC();
- val = (tmp >> 63) ^ ~SIGNBIT64;
- }
- }
- return val;
+uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
+{
+ return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)