From 1dd089d0eec060dcd8478735114d98421d414805 Mon Sep 17 00:00:00 2001 From: Emilio G. Cota Date: Mon, 27 Jun 2016 15:02:13 -0400 Subject: target-arm: emulate aarch64's LL/SC using cmpxchg helpers Emulating LL/SC with cmpxchg is not correct, since it can suffer from the ABA problem. Portable parallel code, however, is written assuming only cmpxchg--and not LL/SC--is available. This means that in practice emulating LL/SC with cmpxchg is a viable alternative. The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers. This works in both user and system mode. In usermode, it avoids pausing all other CPUs to perform the LL/SC pair. The subsequent performance and scalability improvement is significant, as the plots below show. They plot the throughput of atomic_add-bench compiled for ARM and executed on a 64-core x86 machine. Hi-res plots: http://imgur.com/a/JVc8Y atomic_add-bench: 1000000 ops/thread, [0,1] range 18 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + | 16 ++master +-H--+ ++ || | 14 ++ ++ | | | 12 ++| ++ | | | 10 ++++ ++ 8 ++E ++ |+++ | 6 ++ | ++ | | | 4 ++ | ++ | | | 2 +H++E+--- ++ + | +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E| 0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,2] range 18 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + | 16 ++master +-H--+ ++ | | | 14 ++E ++ | | | 12 ++| ++ |+++ | 10 ++ | ++ 8 ++ | ++ | | | 6 ++ | ++ | | | 4 ++ | ++ | +E+--- | 2 +H+ +E+-----+++ +++ +++ ---+E+-----+E+------+++ +++ + +E+---+--+E+----++E+------+E+--- ++++ +++ + +E| 0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,128] range 70 ++---------+----------+---------+----------+----------+----------+---++ +cmpxchg +-E--+ + + + + + | 60 ++master +-H--+ +++ ---+E+-----+E+------+E+ | +E+------E-------+E+--- | | --- +++ | 50 ++ +++--- ++ | -+E+ | 40 ++ +++---- ++ | E- | | --| | 30 ++ -- +++ ++ | +E+ | 20 ++E+ ++ |E+ | | | 10 ++ ++ + + + + + + + | 0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads atomic_add-bench: 1000000 ops/thread, [0,1024] range 160 ++---------+---------+----------+---------+----------+----------+---++ +cmpxchg +-E--+ + + + + + | 140 ++master +-H--+ +++ +++ | -+E+-----+E+-------E| 120 ++ +++ ---- +++ | +++ ----E-- | 100 ++ --E--- +++ ++ | +++ ---- +++ | 80 ++ --E-- ++ | ---- +++ | | -+E+ | 60 ++ ---- +++ ++ | +E+- | 40 ++ -- ++ | +E+ | 20 +EE+ ++ +++ + + + + + + | 0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++ 0 10 20 30 40 50 60 Number of threads [rth: Rearrange 128-bit cmpxchg helper. Enforce alignment on LL.] Signed-off-by: Emilio G. Cota Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org> Signed-off-by: Richard Henderson --- target-arm/helper-a64.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'target-arm/helper-a64.h') diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h index 1d3d10fffb..dd32000e63 100644 --- a/target-arm/helper-a64.h +++ b/target-arm/helper-a64.h @@ -46,3 +46,5 @@ DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr) DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env) DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32) DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32) +DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64) +DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64) -- cgit v1.2.3-55-g7522