target/arm: Implement MVE saturating doubling multiply accumulates
Implement the MVE saturating doubling multiply accumulate insns
VQDMLAH, VQRDMLAH, VQDMLASH and VQRDMLASH. These perform a multiply,
double, add the accumulator shifted by the element size, possibly
round, saturate to twice the element size, then take the high half of
the result. The *MLAH insns do vector * scalar + vector, and the
*MLASH insns do vector * vector + scalar.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
parent
c69e34c6de
commit
8be9a25058
@ -375,6 +375,22 @@ DEF_HELPER_FLAGS_4(mve_vmlasb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vmlash, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vmlasw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
|
||||
/*
 * Helpers for the MVE saturating doubling multiply accumulate insns
 * (VQDMLAH, VQRDMLAH, VQDMLASH, VQRDMLASH), one per element size
 * (b, h, w suffix). Each is declared as returning void and taking
 * env, two pointers and one 32-bit value.
 */
DEF_HELPER_FLAGS_4(mve_vqdmlahb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vqdmlahh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vqdmlahw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(mve_vqrdmlahb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vqrdmlahh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vqrdmlahw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(mve_vqdmlashb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vqdmlashh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vqdmlashw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(mve_vqrdmlashb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vqrdmlashh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(mve_vqrdmlashw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
|
||||
DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
|
||||
DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
|
||||
|
@ -416,6 +416,11 @@ VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
|
||||
VMLA 111- 1110 0 . .. ... 1 ... 0 1110 . 100 .... @2scalar
|
||||
VMLAS 111- 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar
|
||||
|
||||
# Saturating doubling multiply accumulate (rounding and non-rounding,
# *MLAH and *MLASH forms); all four use the @2scalar format.
VQRDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 100 .... @2scalar
|
||||
VQRDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 100 .... @2scalar
|
||||
VQDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 110 .... @2scalar
|
||||
VQDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 110 .... @2scalar
|
||||
|
||||
# Vector add across vector
|
||||
{
|
||||
VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo
|
||||
|
@ -964,6 +964,28 @@ DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
|
||||
mve_advance_vpt(env); \
|
||||
}
|
||||
|
||||
/*
 * Define the helper mve_<OP> for a saturating "vector, scalar,
 * accumulate into destination" operation.
 *
 * OP:    helper name suffix
 * ESIZE: element size in bytes; 16 / ESIZE elements are processed
 * TYPE:  C type of one element
 * FN:    FN(d, n, m, &sat) computes the new element from the old
 *        destination element, the vn element and the scalar rm
 *        (converted to TYPE), setting sat if the result saturated.
 *
 * Each result is handed to mergemask() together with the current
 * element mask. A saturation is only recorded when the low mask bit
 * for that element is set; if any was recorded, env->vfp.qc[0] is
 * set. mve_advance_vpt() is always called on exit.
 */
#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN)                      \
    void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn,   \
                                uint32_t rm)                            \
    {                                                                   \
        TYPE *dst = vd;                                                 \
        TYPE *src = vn;                                                 \
        TYPE scalar = rm;                                               \
        uint16_t pred = mve_element_mask(env);                          \
        bool any_sat = false;                                           \
        unsigned idx;                                                   \
        for (idx = 0; idx < 16 / ESIZE; idx++) {                        \
            bool elt_sat = false;                                       \
            TYPE res = FN(dst[H##ESIZE(idx)], src[H##ESIZE(idx)],       \
                          scalar, &elt_sat);                            \
            mergemask(&dst[H##ESIZE(idx)], res, pred);                  \
            if (elt_sat && (pred & 1)) {                                \
                any_sat = true;                                         \
            }                                                           \
            pred >>= ESIZE;                                             \
        }                                                               \
        if (any_sat) {                                                  \
            env->vfp.qc[0] = any_sat;                                   \
        }                                                               \
        mve_advance_vpt(env);                                           \
    }
|
||||
|
||||
/* provide unsigned 2-op scalar helpers for all sizes */
|
||||
#define DO_2OP_SCALAR_U(OP, FN) \
|
||||
DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \
|
||||
@ -1008,6 +1030,79 @@ DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
|
||||
DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
|
||||
DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
|
||||
|
||||
/*
 * Byte-element saturating doubling multiply accumulate.
 *
 * Forms 2 * a * b + (c << 8) + (round << 7) in 64 bits, passes it to
 * do_sat_bhw() with the int16_t bounds and @sat, and returns the top
 * 8 bits of that 16-bit result.
 */
static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat)
{
    int64_t doubled_product = (int64_t)a * b * 2;
    int64_t accumulator = (int64_t)c << 8;
    int64_t wide = doubled_product + accumulator + (round << 7);

    return do_sat_bhw(wide, INT16_MIN, INT16_MAX, sat) >> 8;
}
|
||||
|
||||
/*
 * Halfword-element saturating doubling multiply accumulate.
 *
 * Forms 2 * a * b + (c << 16) + (round << 15) in 64 bits, passes it
 * to do_sat_bhw() with the int32_t bounds and @sat, and returns the
 * top 16 bits of that 32-bit result.
 */
static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c,
                            int round, bool *sat)
{
    int64_t doubled_product = (int64_t)a * b * 2;
    int64_t accumulator = (int64_t)c << 16;
    int64_t wide = doubled_product + accumulator + (round << 15);

    return do_sat_bhw(wide, INT32_MIN, INT32_MAX, sat) >> 16;
}
|
||||
|
||||
/*
 * Word-element saturating doubling multiply accumulate.
 *
 * Architecturally the whole "multiply, double, add accumulator, round"
 * sum is formed first and only then checked for saturation. There is
 * no wider type to hold that sum here, so it is built from three
 * overflow-checked 64-bit additions, and the order matters.
 *
 * If a * b plus the accumulator term already overflows, doubling and
 * rounding cannot bring it back into range. But if the doubling were
 * done before the rounding constant was added, a negative intermediate
 * could dip below INT64_MIN on the doubling and then be pulled back
 * into range by the rounding constant. To avoid that false saturation,
 * half of "c << 32" (c << 31) and half of the rounding constant
 * (round << 30) are added first, and the doubling is done last.
 *
 * On overflow *sat is set and the saturated value is returned; the
 * sign test on r is inverted because r then holds the wrapped sum.
 * Otherwise the high 32 bits of the 64-bit sum are returned.
 */
static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c,
                            int round, bool *sat)
{
    int64_t product = (int64_t)a * b;
    int64_t half_acc = (int64_t)c << 31;
    int64_t r;
    bool overflowed;

    /* Short-circuit: stop at the first addition that overflows. */
    overflowed = sadd64_overflow(product, half_acc, &r) ||
                 sadd64_overflow(r, (round << 30), &r) ||
                 sadd64_overflow(r, r, &r);

    if (!overflowed) {
        return r >> 32;
    }

    *sat = true;
    return r < 0 ? INT32_MAX : INT32_MIN;
}
|
||||
|
||||
/*
|
||||
* The *MLAH insns are vector * scalar + vector;
|
||||
* the *MLASH insns are vector * vector + scalar
|
||||
*/
|
||||
/*
 * MLAH forms: multiply N by M and accumulate into D, i.e. the old
 * destination element is the accumulator. The fourth argument to
 * do_vqdmlah_*() is the rounding flag: 0 for VQDMLAH, 1 for VQRDMLAH.
 */
#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S)
|
||||
#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S)
|
||||
#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S)
|
||||
#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S)
|
||||
#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S)
|
||||
#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S)
|
||||
|
||||
/*
 * MLASH forms: multiply N by D and accumulate M, i.e. the old
 * destination element is a multiplicand and M is the accumulator.
 * Rounding flag: 0 for VQDMLASH, 1 for VQRDMLASH.
 */
#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S)
|
||||
#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S)
|
||||
#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S)
|
||||
#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S)
|
||||
#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S)
|
||||
#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S)
|
||||
|
||||
/*
 * Instantiate the helpers: one per insn and per element size
 * (1, 2 or 4 bytes, as int8_t, int16_t or int32_t elements).
 */
DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W)
|
||||
|
||||
DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H)
|
||||
DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W)
|
||||
|
||||
/* Vector by scalar plus vector */
|
||||
#define DO_VMLA(D, N, M) ((N) * (M) + (D))
|
||||
|
||||
|
@ -622,6 +622,10 @@ DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar)
|
||||
DO_2OP_SCALAR(VBRSR, vbrsr)
|
||||
DO_2OP_SCALAR(VMLA, vmla)
|
||||
DO_2OP_SCALAR(VMLAS, vmlas)
|
||||
/*
 * Saturating doubling multiply accumulate insns. The first argument
 * matches the decode pattern name and the second the helper name stem
 * (mve_<stem>{b,h,w}); presumably DO_2OP_SCALAR emits the trans_*
 * function for each -- its definition is not in view here.
 */
DO_2OP_SCALAR(VQDMLAH, vqdmlah)
|
||||
DO_2OP_SCALAR(VQRDMLAH, vqrdmlah)
|
||||
DO_2OP_SCALAR(VQDMLASH, vqdmlash)
|
||||
DO_2OP_SCALAR(VQRDMLASH, vqrdmlash)
|
||||
|
||||
static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user