target/arm: Implement MVE saturating doubling multiply accumulates

Implement the MVE saturating doubling multiply accumulate insns
VQDMLAH, VQRDMLAH, VQDMLASH and VQRDMLASH.  These perform a multiply,
double, add the accumulator shifted by the element size, possibly
round, saturate to twice the element size, then take the high half of
the result.  The *MLAH insns do vector * scalar + vector, and the
*MLASH insns do vector * vector + scalar.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Peter Maydell 2021-08-13 17:11:54 +01:00
parent c69e34c6de
commit 8be9a25058
4 changed files with 120 additions and 0 deletions

View File

@ -375,6 +375,22 @@ DEF_HELPER_FLAGS_4(mve_vmlasb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vmlash, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vmlasw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqdmlahb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqdmlahh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqdmlahw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqrdmlahb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqrdmlahh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqrdmlahw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqdmlashb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqdmlashh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqdmlashw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqrdmlashb, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqrdmlashh, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vqrdmlashw, TCG_CALL_NO_WG, void, env, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(mve_vmlaldavsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
DEF_HELPER_FLAGS_4(mve_vmlaldavsw, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)
DEF_HELPER_FLAGS_4(mve_vmlaldavxsh, TCG_CALL_NO_WG, i64, env, ptr, ptr, i64)

View File

@ -416,6 +416,11 @@ VQRDMULH_scalar 1111 1110 0 . .. ... 1 ... 0 1110 . 110 .... @2scalar
VMLA 111- 1110 0 . .. ... 1 ... 0 1110 . 100 .... @2scalar
VMLAS 111- 1110 0 . .. ... 1 ... 1 1110 . 100 .... @2scalar
VQRDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 100 .... @2scalar
VQRDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 100 .... @2scalar
VQDMLAH 1110 1110 0 . .. ... 0 ... 0 1110 . 110 .... @2scalar
VQDMLASH 1110 1110 0 . .. ... 0 ... 1 1110 . 110 .... @2scalar
# Vector add across vector
{
VADDV 111 u:1 1110 1111 size:2 01 ... 0 1111 0 0 a:1 0 qm:3 0 rda=%rdalo

View File

@ -964,6 +964,28 @@ DO_VQDMLADH_OP(vqrdmlsdhxw, 4, int32_t, 1, 1, do_vqdmlsdh_w)
mve_advance_vpt(env); \
}
#define DO_2OP_SAT_ACC_SCALAR(OP, ESIZE, TYPE, FN) \
void HELPER(glue(mve_, OP))(CPUARMState *env, void *vd, void *vn, \
uint32_t rm) \
{ \
TYPE *d = vd, *n = vn; \
TYPE m = rm; \
uint16_t mask = mve_element_mask(env); \
unsigned e; \
bool qc = false; \
for (e = 0; e < 16 / ESIZE; e++, mask >>= ESIZE) { \
bool sat = false; \
mergemask(&d[H##ESIZE(e)], \
FN(d[H##ESIZE(e)], n[H##ESIZE(e)], m, &sat), \
mask); \
qc |= sat & mask & 1; \
} \
if (qc) { \
env->vfp.qc[0] = qc; \
} \
mve_advance_vpt(env); \
}
/* provide unsigned 2-op scalar helpers for all sizes */
#define DO_2OP_SCALAR_U(OP, FN) \
DO_2OP_SCALAR(OP##b, 1, uint8_t, FN) \
@ -1008,6 +1030,79 @@ DO_2OP_SAT_SCALAR(vqrdmulh_scalarb, 1, int8_t, DO_QRDMULH_B)
DO_2OP_SAT_SCALAR(vqrdmulh_scalarh, 2, int16_t, DO_QRDMULH_H)
DO_2OP_SAT_SCALAR(vqrdmulh_scalarw, 4, int32_t, DO_QRDMULH_W)
static int8_t do_vqdmlah_b(int8_t a, int8_t b, int8_t c, int round, bool *sat)
{
int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 8) + (round << 7);
return do_sat_bhw(r, INT16_MIN, INT16_MAX, sat) >> 8;
}
static int16_t do_vqdmlah_h(int16_t a, int16_t b, int16_t c,
int round, bool *sat)
{
int64_t r = (int64_t)a * b * 2 + ((int64_t)c << 16) + (round << 15);
return do_sat_bhw(r, INT32_MIN, INT32_MAX, sat) >> 16;
}
static int32_t do_vqdmlah_w(int32_t a, int32_t b, int32_t c,
int round, bool *sat)
{
/*
* Architecturally we should do the entire add, double, round
* and then check for saturation. We do three saturating adds,
* but we need to be careful about the order. If the first
* m1 + m2 saturates then it's impossible for the *2+rc to
* bring it back into the non-saturated range. However, if
* m1 + m2 is negative then it's possible that doing the doubling
* would take the intermediate result below INT64_MAX and the
* addition of the rounding constant then brings it back in range.
* So we add half the rounding constant and half the "c << esize"
* before doubling rather than adding the rounding constant after
* the doubling.
*/
int64_t m1 = (int64_t)a * b;
int64_t m2 = (int64_t)c << 31;
int64_t r;
if (sadd64_overflow(m1, m2, &r) ||
sadd64_overflow(r, (round << 30), &r) ||
sadd64_overflow(r, r, &r)) {
*sat = true;
return r < 0 ? INT32_MAX : INT32_MIN;
}
return r >> 32;
}
/*
* The *MLAH insns are vector * scalar + vector;
* the *MLASH insns are vector * vector + scalar
*/
#define DO_VQDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 0, S)
#define DO_VQDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 0, S)
#define DO_VQDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 0, S)
#define DO_VQRDMLAH_B(D, N, M, S) do_vqdmlah_b(N, M, D, 1, S)
#define DO_VQRDMLAH_H(D, N, M, S) do_vqdmlah_h(N, M, D, 1, S)
#define DO_VQRDMLAH_W(D, N, M, S) do_vqdmlah_w(N, M, D, 1, S)
#define DO_VQDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 0, S)
#define DO_VQDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 0, S)
#define DO_VQDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 0, S)
#define DO_VQRDMLASH_B(D, N, M, S) do_vqdmlah_b(N, D, M, 1, S)
#define DO_VQRDMLASH_H(D, N, M, S) do_vqdmlah_h(N, D, M, 1, S)
#define DO_VQRDMLASH_W(D, N, M, S) do_vqdmlah_w(N, D, M, 1, S)
DO_2OP_SAT_ACC_SCALAR(vqdmlahb, 1, int8_t, DO_VQDMLAH_B)
DO_2OP_SAT_ACC_SCALAR(vqdmlahh, 2, int16_t, DO_VQDMLAH_H)
DO_2OP_SAT_ACC_SCALAR(vqdmlahw, 4, int32_t, DO_VQDMLAH_W)
DO_2OP_SAT_ACC_SCALAR(vqrdmlahb, 1, int8_t, DO_VQRDMLAH_B)
DO_2OP_SAT_ACC_SCALAR(vqrdmlahh, 2, int16_t, DO_VQRDMLAH_H)
DO_2OP_SAT_ACC_SCALAR(vqrdmlahw, 4, int32_t, DO_VQRDMLAH_W)
DO_2OP_SAT_ACC_SCALAR(vqdmlashb, 1, int8_t, DO_VQDMLASH_B)
DO_2OP_SAT_ACC_SCALAR(vqdmlashh, 2, int16_t, DO_VQDMLASH_H)
DO_2OP_SAT_ACC_SCALAR(vqdmlashw, 4, int32_t, DO_VQDMLASH_W)
DO_2OP_SAT_ACC_SCALAR(vqrdmlashb, 1, int8_t, DO_VQRDMLASH_B)
DO_2OP_SAT_ACC_SCALAR(vqrdmlashh, 2, int16_t, DO_VQRDMLASH_H)
DO_2OP_SAT_ACC_SCALAR(vqrdmlashw, 4, int32_t, DO_VQRDMLASH_W)
/* Vector by scalar plus vector */
#define DO_VMLA(D, N, M) ((N) * (M) + (D))

View File

@ -622,6 +622,10 @@ DO_2OP_SCALAR(VQRDMULH_scalar, vqrdmulh_scalar)
DO_2OP_SCALAR(VBRSR, vbrsr)
DO_2OP_SCALAR(VMLA, vmla)
DO_2OP_SCALAR(VMLAS, vmlas)
DO_2OP_SCALAR(VQDMLAH, vqdmlah)
DO_2OP_SCALAR(VQRDMLAH, vqrdmlah)
DO_2OP_SCALAR(VQDMLASH, vqdmlash)
DO_2OP_SCALAR(VQRDMLASH, vqrdmlash)
static bool trans_VQDMULLB_scalar(DisasContext *s, arg_2scalar *a)
{