author     Peter Maydell  2020-10-09 16:47:12 +0200
committer  Peter Maydell  2020-10-20 17:12:00 +0200
commit     5288145d716338ace0f83e3ff05c4d07715bb4f4 (patch)
tree       f526149591e4c8b64c76778225df14c8cbc2109f  /target/arm/translate.c
parent     Merge remote-tracking branch 'remotes/aperard/tags/pull-xen-20201020' into st... (diff)
target/arm: Fix SMLAD incorrect setting of Q bit
The SMLAD instruction is supposed to:
 * signed multiply Rn[15:0] * Rm[15:0]
 * signed multiply Rn[31:16] * Rm[31:16]
 * perform a signed addition of the products and Ra
 * set Rd to the low 32 bits of the theoretical infinite-precision result
 * set the Q flag if the sign-extension of Rd would differ from the
   infinite-precision result (ie on overflow)

Our current implementation doesn't quite do this, though: it performs an
addition of the products setting Q on overflow, and then it adds Ra,
again possibly setting Q. This sometimes incorrectly sets Q when the
architecturally mandated only-check-for-overflow-once algorithm does
not. For instance:
 r1 = 0x80008000; r2 = 0x80008000; r3 = 0xffffffff
 smlad r0, r1, r2, r3
This is (-32768 * -32768) + (-32768 * -32768) - 1

The products are both 0x4000_0000, so when added together as 32-bit
signed numbers they overflow (and QEMU sets Q), but because the
addition of Ra == -1 brings the total back down to 0x7fff_ffff there
is no overflow for the complete operation and setting Q is incorrect.

Fix this edge case by resorting to 64-bit arithmetic for the case
where we need to add three values together.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20201009144712.11187-1-peter.maydell@linaro.org
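For illustration only (not part of the commit), a minimal C reference model of
the architecturally intended single-overflow-check behaviour could look like
the sketch below; the function name ref_smlad and its signature are
hypothetical, not QEMU code:

    #include <stdint.h>
    #include <stdbool.h>

    /* Hypothetical reference model of SMLAD: one overflow check on the
     * complete sum, as the architecture requires. */
    static uint32_t ref_smlad(uint32_t rn, uint32_t rm, uint32_t ra, bool *q)
    {
        int64_t p1 = (int64_t)(int16_t)rn * (int16_t)rm;                  /* low halfwords  */
        int64_t p2 = (int64_t)(int16_t)(rn >> 16) * (int16_t)(rm >> 16);  /* high halfwords */
        int64_t sum = p1 + p2 + (int32_t)ra;   /* exact: the three terms always fit in 64 bits */
        uint32_t rd = (uint32_t)sum;           /* Rd gets the low 32 bits */
        if ((int64_t)(int32_t)rd != sum) {     /* set Q only if the full result overflowed */
            *q = true;                         /* Q is sticky: never cleared here */
        }
        return rd;
    }

With r1 = r2 = 0x80008000 and r3 = 0xffffffff this returns 0x7fff_ffff and
leaves Q untouched, matching the example above, whereas the old two-step
32-bit implementation would have set Q on the intermediate
0x4000_0000 + 0x4000_0000 addition.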
Diffstat (limited to 'target/arm/translate.c')
-rw-r--r--  target/arm/translate.c  60
1 file changed, 49 insertions(+), 11 deletions(-)
diff --git a/target/arm/translate.c b/target/arm/translate.c
index d34c1d351a..d8729e42c4 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -7401,22 +7401,60 @@ static bool op_smlad(DisasContext *s, arg_rrrr *a, bool m_swap, bool sub)
     gen_smul_dual(t1, t2);
 
     if (sub) {
-        /* This subtraction cannot overflow. */
-        tcg_gen_sub_i32(t1, t1, t2);
-    } else {
         /*
-         * This addition cannot overflow 32 bits; however it may
-         * overflow considered as a signed operation, in which case
-         * we must set the Q flag.
+         * This subtraction cannot overflow, so we can do a simple
+         * 32-bit subtraction and then a possible 32-bit saturating
+         * addition of Ra.
          */
-        gen_helper_add_setq(t1, cpu_env, t1, t2);
-    }
-    tcg_temp_free_i32(t2);
+        tcg_gen_sub_i32(t1, t1, t2);
+        tcg_temp_free_i32(t2);
 
-    if (a->ra != 15) {
-        t2 = load_reg(s, a->ra);
+        if (a->ra != 15) {
+            t2 = load_reg(s, a->ra);
+            gen_helper_add_setq(t1, cpu_env, t1, t2);
+            tcg_temp_free_i32(t2);
+        }
+    } else if (a->ra == 15) {
+        /* Single saturation-checking addition */
         gen_helper_add_setq(t1, cpu_env, t1, t2);
         tcg_temp_free_i32(t2);
+    } else {
+        /*
+         * We need to add the products and Ra together and then
+         * determine whether the final result overflowed. Doing
+         * this as two separate add-and-check-overflow steps incorrectly
+         * sets Q for cases like (-32768 * -32768) + (-32768 * -32768) + -1.
+         * Do all the arithmetic at 64-bits and then check for overflow.
+         */
+        TCGv_i64 p64, q64;
+        TCGv_i32 t3, qf, one;
+
+        p64 = tcg_temp_new_i64();
+        q64 = tcg_temp_new_i64();
+        tcg_gen_ext_i32_i64(p64, t1);
+        tcg_gen_ext_i32_i64(q64, t2);
+        tcg_gen_add_i64(p64, p64, q64);
+        load_reg_var(s, t2, a->ra);
+        tcg_gen_ext_i32_i64(q64, t2);
+        tcg_gen_add_i64(p64, p64, q64);
+        tcg_temp_free_i64(q64);
+
+        tcg_gen_extr_i64_i32(t1, t2, p64);
+        tcg_temp_free_i64(p64);
+        /*
+         * t1 is the low half of the result which goes into Rd.
+         * We have overflow and must set Q if the high half (t2)
+         * is different from the sign-extension of t1.
+         */
+        t3 = tcg_temp_new_i32();
+        tcg_gen_sari_i32(t3, t1, 31);
+        qf = load_cpu_field(QF);
+        one = tcg_const_i32(1);
+        tcg_gen_movcond_i32(TCG_COND_NE, qf, t2, t3, one, qf);
+        store_cpu_field(qf, QF);
+        tcg_temp_free_i32(one);
+        tcg_temp_free_i32(t3);
+        tcg_temp_free_i32(t2);
     }
     store_reg(s, a->rd, t1);
     return true;
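As an aside, the Q-setting condition emitted above reduces to comparing the
high half of the 64-bit sum with the sign-extension of its low half. A hedged
plain-C sketch of that predicate, taking the low and high halves as the patch
holds them in t1 and t2 (the name smlad_sum_overflows is illustrative, not a
QEMU identifier):

    #include <stdint.h>
    #include <stdbool.h>

    /* Illustrative only: the check that the patch emits with
     * tcg_gen_sari_i32 + tcg_gen_movcond_i32. */
    static bool smlad_sum_overflows(int32_t lo, int32_t hi)
    {
        int32_t sign = lo < 0 ? -1 : 0;   /* what tcg_gen_sari_i32(t3, t1, 31) computes */
        return hi != sign;                /* the TCG_COND_NE comparison in the movcond */
    }

In the generated code the result of this comparison feeds the movcond, which
stores 1 into QF only when the halves differ and leaves QF unchanged
otherwise, so the Q flag stays sticky.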