2013-12-17 23:42:32 +04:00
|
|
|
/*
|
|
|
|
* AArch64 specific helper definitions
|
|
|
|
*
|
|
|
|
* Copyright (c) 2013 Alexander Graf <agraf@suse.de>
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
2013-12-17 23:42:34 +04:00
|
|
|
DEF_HELPER_FLAGS_2(udiv64, TCG_CALL_NO_RWG_SE, i64, i64, i64)
|
|
|
|
DEF_HELPER_FLAGS_2(sdiv64, TCG_CALL_NO_RWG_SE, s64, s64, s64)
|
2013-12-17 23:42:35 +04:00
|
|
|
DEF_HELPER_FLAGS_1(rbit64, TCG_CALL_NO_RWG_SE, i64, i64)
|
2014-01-05 02:15:50 +04:00
|
|
|
DEF_HELPER_3(vfp_cmps_a64, i64, f32, f32, ptr)
|
|
|
|
DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr)
|
|
|
|
DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr)
|
|
|
|
DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr)
|
2014-01-31 18:47:31 +04:00
|
|
|
DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32)
|
2014-02-20 14:35:48 +04:00
|
|
|
DEF_HELPER_FLAGS_3(vfp_mulxs, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
|
|
|
|
DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
|
2014-02-20 14:35:49 +04:00
|
|
|
DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
|
|
|
|
DEF_HELPER_FLAGS_3(neon_cge_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
|
|
|
|
DEF_HELPER_FLAGS_3(neon_cgt_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
|
2018-03-01 14:05:50 +03:00
|
|
|
DEF_HELPER_FLAGS_3(recpsf_f16, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
2014-02-20 14:35:50 +04:00
|
|
|
DEF_HELPER_FLAGS_3(recpsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
|
|
|
|
DEF_HELPER_FLAGS_3(recpsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
|
2018-03-01 14:05:50 +03:00
|
|
|
DEF_HELPER_FLAGS_3(rsqrtsf_f16, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
2014-02-20 14:35:50 +04:00
|
|
|
DEF_HELPER_FLAGS_3(rsqrtsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
|
|
|
|
DEF_HELPER_FLAGS_3(rsqrtsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
|
2014-03-17 20:31:48 +04:00
|
|
|
DEF_HELPER_FLAGS_1(neon_addlp_s8, TCG_CALL_NO_RWG_SE, i64, i64)
|
|
|
|
DEF_HELPER_FLAGS_1(neon_addlp_u8, TCG_CALL_NO_RWG_SE, i64, i64)
|
|
|
|
DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64)
|
|
|
|
DEF_HELPER_FLAGS_1(neon_addlp_u16, TCG_CALL_NO_RWG_SE, i64, i64)
|
2014-03-17 20:31:50 +04:00
|
|
|
DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
|
|
|
|
DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
|
2018-03-01 14:05:55 +03:00
|
|
|
DEF_HELPER_FLAGS_2(frecpx_f16, TCG_CALL_NO_RWG, f16, f16, ptr)
|
2014-03-17 20:31:53 +04:00
|
|
|
DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
|
2014-06-09 18:43:25 +04:00
|
|
|
DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
|
|
|
|
DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
|
target-arm: emulate aarch64's LL/SC using cmpxchg helpers
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.
The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The subsequent
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.
Hi-res plots: http://imgur.com/a/JVc8Y
atomic_add-bench: 1000000 ops/thread, [0,1] range
18 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
16 ++master +-H--+ ++
|| |
14 ++ ++
| | |
12 ++| ++
| | |
10 ++++ ++
8 ++E ++
|+++ |
6 ++ | ++
| | |
4 ++ | ++
| | |
2 +H++E+--- ++
+ | +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E|
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,2] range
18 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
16 ++master +-H--+ ++
| | |
14 ++E ++
| | |
12 ++| ++
|+++ |
10 ++ | ++
8 ++ | ++
| | |
6 ++ | ++
| | |
4 ++ | ++
| +E+--- |
2 +H+ +E+-----+++ +++ +++ ---+E+-----+E+------+++
+++ + +E+---+--+E+----++E+------+E+--- ++++ +++ + +E|
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,128] range
70 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
60 ++master +-H--+ +++ ---+E+-----+E+------+E+
| +E+------E-------+E+--- |
| --- +++ |
50 ++ +++--- ++
| -+E+ |
40 ++ +++---- ++
| E- |
| --| |
30 ++ -- +++ ++
| +E+ |
20 ++E+ ++
|E+ |
| |
10 ++ ++
+ + + + + + + |
0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,1024] range
160 ++---------+---------+----------+---------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
140 ++master +-H--+ +++ +++
| -+E+-----+E+-------E|
120 ++ +++ ---- +++
| +++ ----E-- |
100 ++ --E--- +++ ++
| +++ ---- +++ |
80 ++ --E-- ++
| ---- +++ |
| -+E+ |
60 ++ ---- +++ ++
| +E+- |
40 ++ -- ++
| +E+ |
20 +EE+ ++
+++ + + + + + + |
0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
[rth: Rearrange 128-bit cmpxchg helper. Enforce alignment on LL.]
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-06-27 22:02:13 +03:00
|
|
|
DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
|
2017-07-15 01:20:49 +03:00
|
|
|
DEF_HELPER_FLAGS_4(paired_cmpxchg64_le_parallel, TCG_CALL_NO_WG,
|
|
|
|
i64, env, i64, i64, i64)
|
target-arm: emulate aarch64's LL/SC using cmpxchg helpers
Emulating LL/SC with cmpxchg is not correct, since it can
suffer from the ABA problem. Portable parallel code, however,
is written assuming only cmpxchg--and not LL/SC--is available.
This means that in practice emulating LL/SC with cmpxchg is
a viable alternative.
The appended emulates LL/SC pairs in aarch64 with cmpxchg helpers.
This works in both user and system mode. In usermode, it avoids
pausing all other CPUs to perform the LL/SC pair. The subsequent
performance and scalability improvement is significant, as the
plots below show. They plot the throughput of atomic_add-bench
compiled for ARM and executed on a 64-core x86 machine.
Hi-res plots: http://imgur.com/a/JVc8Y
atomic_add-bench: 1000000 ops/thread, [0,1] range
18 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
16 ++master +-H--+ ++
|| |
14 ++ ++
| | |
12 ++| ++
| | |
10 ++++ ++
8 ++E ++
|+++ |
6 ++ | ++
| | |
4 ++ | ++
| | |
2 +H++E+--- ++
+ | +E++----+E+---+--+E+----++E+------+E+------+E++----+E+---+--+E|
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,2] range
18 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
16 ++master +-H--+ ++
| | |
14 ++E ++
| | |
12 ++| ++
|+++ |
10 ++ | ++
8 ++ | ++
| | |
6 ++ | ++
| | |
4 ++ | ++
| +E+--- |
2 +H+ +E+-----+++ +++ +++ ---+E+-----+E+------+++
+++ + +E+---+--+E+----++E+------+E+--- ++++ +++ + +E|
0 ++H-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,128] range
70 ++---------+----------+---------+----------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
60 ++master +-H--+ +++ ---+E+-----+E+------+E+
| +E+------E-------+E+--- |
| --- +++ |
50 ++ +++--- ++
| -+E+ |
40 ++ +++---- ++
| E- |
| --| |
30 ++ -- +++ ++
| +E+ |
20 ++E+ ++
|E+ |
| |
10 ++ ++
+ + + + + + + |
0 +HH-H----H-+-----H----+---------+----------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
atomic_add-bench: 1000000 ops/thread, [0,1024] range
160 ++---------+---------+----------+---------+----------+----------+---++
+cmpxchg +-E--+ + + + + + |
140 ++master +-H--+ +++ +++
| -+E+-----+E+-------E|
120 ++ +++ ---- +++
| +++ ----E-- |
100 ++ --E--- +++ ++
| +++ ---- +++ |
80 ++ --E-- ++
| ---- +++ |
| -+E+ |
60 ++ ---- +++ ++
| +E+- |
40 ++ -- ++
| +E+ |
20 +EE+ ++
+++ + + + + + + |
0 +HH-H---H--+-----H---+----------+---------+----------+----------+---++
0 10 20 30 40 50 60
Number of threads
[rth: Rearrange 128-bit cmpxchg helper. Enforce alignment on LL.]
Signed-off-by: Emilio G. Cota <cota@braap.org>
Message-Id: <1467054136-10430-28-git-send-email-cota@braap.org>
Signed-off-by: Richard Henderson <rth@twiddle.net>
2016-06-27 22:02:13 +03:00
|
|
|
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
|
2017-07-15 01:20:49 +03:00
|
|
|
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be_parallel, TCG_CALL_NO_WG,
|
|
|
|
i64, env, i64, i64, i64)
|
2018-03-01 14:05:48 +03:00
|
|
|
DEF_HELPER_FLAGS_3(advsimd_maxh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
|
|
|
DEF_HELPER_FLAGS_3(advsimd_minh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
|
|
|
DEF_HELPER_FLAGS_3(advsimd_maxnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
|
|
|
DEF_HELPER_FLAGS_3(advsimd_minnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
2018-03-01 14:05:49 +03:00
|
|
|
DEF_HELPER_3(advsimd_addh, f16, f16, f16, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_subh, f16, f16, f16, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_mulh, f16, f16, f16, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_divh, f16, f16, f16, ptr)
|
2018-03-01 14:05:49 +03:00
|
|
|
DEF_HELPER_3(advsimd_ceq_f16, i32, f16, f16, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_cge_f16, i32, f16, f16, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_cgt_f16, i32, f16, f16, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_acge_f16, i32, f16, f16, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_acgt_f16, i32, f16, f16, ptr)
|
2018-03-01 14:05:50 +03:00
|
|
|
DEF_HELPER_3(advsimd_mulxh, f16, f16, f16, ptr)
|
|
|
|
DEF_HELPER_4(advsimd_muladdh, f16, f16, f16, f16, ptr)
|
2018-03-01 14:05:52 +03:00
|
|
|
DEF_HELPER_3(advsimd_add2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_sub2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_mul2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_div2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_max2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_min2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_maxnum2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_minnum2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_3(advsimd_mulx2h, i32, i32, i32, ptr)
|
|
|
|
DEF_HELPER_4(advsimd_muladd2h, i32, i32, i32, i32, ptr)
|
2018-03-01 14:05:53 +03:00
|
|
|
DEF_HELPER_2(advsimd_rinth_exact, f16, f16, ptr)
|
|
|
|
DEF_HELPER_2(advsimd_rinth, f16, f16, ptr)
|
2018-03-01 14:05:53 +03:00
|
|
|
DEF_HELPER_2(advsimd_f16tosinth, i32, f16, ptr)
|
|
|
|
DEF_HELPER_2(advsimd_f16touinth, i32, f16, ptr)
|
2018-03-01 14:05:55 +03:00
|
|
|
DEF_HELPER_2(sqrt_f16, f16, f16, ptr)
|