target/arm: Implement MVE VCLZ

Implement the MVE VCLZ insn (and the necessary machinery
for MVE 1-input vector ops).

Note that for non-load instructions predication is always performed
at byte-level granularity regardless of element size (R_ZLSJ),
and so the masking logic here differs from that used in the VLDR
and VSTR helpers.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20210617121628.20116-4-peter.maydell@linaro.org
This commit is contained in:
Peter Maydell 2021-06-17 13:15:47 +01:00
parent 2fc6b7510c
commit 0f0f2bd548
4 changed files with 132 additions and 0 deletions

View File

@ -32,3 +32,7 @@ DEF_HELPER_FLAGS_3(mve_vldrh_uw, TCG_CALL_NO_WG, void, env, ptr, i32)
DEF_HELPER_FLAGS_3(mve_vstrb_h, TCG_CALL_NO_WG, void, env, ptr, i32)
DEF_HELPER_FLAGS_3(mve_vstrb_w, TCG_CALL_NO_WG, void, env, ptr, i32)
DEF_HELPER_FLAGS_3(mve_vstrh_w, TCG_CALL_NO_WG, void, env, ptr, i32)
/*
 * MVE VCLZ helpers, one per element size (b/h/w = 8/16/32-bit elements).
 * Each takes (env, pointer to destination Q register, pointer to source
 * Q register) and returns nothing.
 */
DEF_HELPER_FLAGS_3(mve_vclzb, TCG_CALL_NO_WG, void, env, ptr, ptr)
DEF_HELPER_FLAGS_3(mve_vclzh, TCG_CALL_NO_WG, void, env, ptr, ptr)
DEF_HELPER_FLAGS_3(mve_vclzw, TCG_CALL_NO_WG, void, env, ptr, ptr)

View File

@ -20,13 +20,17 @@
#
# Q register numbers are split fields: the single bit at position 22
# (for qd) or 5 (for qm) is concatenated above the 3-bit field at
# position 13 (for qd) or 1 (for qm).
%qd 22:1 13:3
%qm 5:1 1:3
&vldr_vstr rn qd imm p a w size l u
# Argument set for 1-input vector operations: destination register,
# source register and element size.
&1op qd qm size
@vldr_vstr ....... . . . . l:1 rn:4 ... ...... imm:7 &vldr_vstr qd=%qd u=0
# Note that both Rn and Qd are 3 bits only (no D bit)
@vldst_wn ... u:1 ... . . . . l:1 . rn:3 qd:3 . ... .. imm:7 &vldr_vstr
# Format for 1-input vector operations: size is taken from bits [19:18],
# qd and qm from the split fields %qd and %qm.
@1op .... .... .... size:2 .. .... .... .... .... &1op qd=%qd qm=%qm
# Vector loads and stores
# Widening loads and narrowing stores:
@ -61,3 +65,7 @@ VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111101 ....... @vldr_vstr \
size=1 p=1
VLDR_VSTR 1110110 1 a:1 . w:1 . .... ... 111110 ....... @vldr_vstr \
size=2 p=1
# Vector miscellaneous
# VCLZ uses the 1-input format. The size field (bits [19:18]) is not
# constrained by this pattern; size == 3 has a NULL entry in the
# translator's per-size function table and is rejected there.
VCLZ 1111 1111 1 . 11 .. 00 ... 0 0100 11 . 0 ... 0 @1op

View File

@ -181,3 +181,85 @@ DO_VSTR(vstrh_w, 2, stw, 4, int32_t)
#undef DO_VLDR
#undef DO_VSTR
/*
* The mergemask(D, R, M) macro performs the operation "*D = R" but
* storing only the bytes which correspond to 1 bits in M,
* leaving other bytes in *D unchanged. We use _Generic
* to select the correct implementation based on the type of D.
*/
/*
 * Predicated store of one unsigned byte: write @r to *@d only when
 * bit 0 of @mask is set, otherwise leave *@d untouched. Bits of @mask
 * above bit 0 are ignored.
 */
static void mergemask_ub(uint8_t *d, uint8_t r, uint16_t mask)
{
    if (!(mask & 1)) {
        return;
    }
    *d = r;
}
/*
 * Signed-byte variant of the predicated store: forwards to
 * mergemask_ub() on the same storage, so the predication rule is
 * identical.
 */
static void mergemask_sb(int8_t *d, int8_t r, uint16_t mask)
{
    uint8_t *bytes = (uint8_t *)d;

    mergemask_ub(bytes, r, mask);
}
/*
 * Predicated store of an unsigned 16-bit element: the low two bits of
 * @mask are looked up in expand_pred_b_data[] to obtain a bit mask
 * (presumably one byte of ones per set predicate bit -- the table is
 * defined elsewhere). Bits of @r selected by that mask replace the
 * corresponding bits of *@d; the rest of *@d is preserved.
 */
static void mergemask_uh(uint16_t *d, uint16_t r, uint16_t mask)
{
    uint16_t sel = expand_pred_b_data[mask & 3];
    uint16_t old = *d;

    *d = (r & sel) | (old & ~sel);
}
/*
 * Signed 16-bit variant of the predicated store: forwards to
 * mergemask_uh() on the same storage.
 */
static void mergemask_sh(int16_t *d, int16_t r, uint16_t mask)
{
    uint16_t *halfword = (uint16_t *)d;

    mergemask_uh(halfword, r, mask);
}
/*
 * Predicated store of an unsigned 32-bit element: the low four bits of
 * @mask are looked up in expand_pred_b_data[] to obtain a bit mask
 * (presumably one byte of ones per set predicate bit -- the table is
 * defined elsewhere). Bits of @r selected by that mask replace the
 * corresponding bits of *@d; the rest of *@d is preserved.
 */
static void mergemask_uw(uint32_t *d, uint32_t r, uint16_t mask)
{
    uint32_t sel = expand_pred_b_data[mask & 0xf];
    uint32_t old = *d;

    *d = (r & sel) | (old & ~sel);
}
/*
 * Signed 32-bit variant of the predicated store: forwards to
 * mergemask_uw() on the same storage.
 */
static void mergemask_sw(int32_t *d, int32_t r, uint16_t mask)
{
    uint32_t *word = (uint32_t *)d;

    mergemask_uw(word, r, mask);
}
/*
 * Predicated store of an unsigned 64-bit element: the low eight bits
 * of @mask are looked up in expand_pred_b_data[] to obtain a bit mask
 * (presumably one byte of ones per set predicate bit -- the table is
 * defined elsewhere). Bits of @r selected by that mask replace the
 * corresponding bits of *@d; the rest of *@d is preserved.
 */
static void mergemask_uq(uint64_t *d, uint64_t r, uint16_t mask)
{
    uint64_t sel = expand_pred_b_data[mask & 0xff];
    uint64_t old = *d;

    *d = (r & sel) | (old & ~sel);
}
/*
 * Signed 64-bit variant of the predicated store: forwards to
 * mergemask_uq() on the same storage.
 */
static void mergemask_sq(int64_t *d, int64_t r, uint16_t mask)
{
    uint64_t *dword = (uint64_t *)d;

    mergemask_uq(dword, r, mask);
}
/*
 * mergemask(D, R, M): predicated "*D = R". _Generic picks the
 * mergemask_* implementation from the pointer type of D. There is no
 * default association, so a D of any other type fails to compile.
 */
#define mergemask(D, R, M)                      \
    _Generic(D,                                 \
             int8_t *: mergemask_sb,            \
             int16_t *: mergemask_sh,           \
             int32_t *: mergemask_sw,           \
             int64_t *: mergemask_sq,           \
             uint8_t *: mergemask_ub,           \
             uint16_t *: mergemask_uh,          \
             uint32_t *: mergemask_uw,          \
             uint64_t *: mergemask_uq)(D, R, M)
/*
 * Define the helper mve_<OP> for a 1-input vector operation.
 *
 * OP:    helper name suffix
 * ESIZE: element size in bytes (also pasted onto "H" to pick the
 *        H<ESIZE>() element-index macro, defined elsewhere)
 * TYPE:  C type of one element
 * FN:    function or macro applied to each source element
 *
 * The helper walks the 16 / ESIZE elements of the source vector,
 * computes FN(element) and merges the result into the destination via
 * mergemask(). The predicate mask from mve_element_mask() is shifted
 * right by ESIZE after each element, so each element sees its own
 * per-byte predicate bits in the low bits of the mask.
 * mve_advance_vpt() is called once all elements have been processed.
 */
#define DO_1OP(OP, ESIZE, TYPE, FN)                                     \
    void HELPER(mve_##OP)(CPUARMState *env, void *vd, void *vm)         \
    {                                                                   \
        TYPE *dst = vd;                                                 \
        TYPE *src = vm;                                                 \
        uint16_t pred = mve_element_mask(env);                          \
        unsigned idx = 0;                                               \
        while (idx < 16 / ESIZE) {                                      \
            mergemask(&dst[H##ESIZE(idx)],                              \
                      FN(src[H##ESIZE(idx)]), pred);                    \
            idx++;                                                      \
            pred >>= ESIZE;                                             \
        }                                                               \
        mve_advance_vpt(env);                                           \
    }
/*
 * Count-leading-zeros for 8-bit and 16-bit elements, built on clz32()
 * (defined elsewhere; assumed to count leading zeros of a 32-bit
 * value). The element is widened to 32 bits, so the extra high-order
 * zero bits (32 minus the element width) are subtracted again.
 */
#define DO_CLZ_B(N) (clz32(N) - (32 - 8))
#define DO_CLZ_H(N) (clz32(N) - (32 - 16))

/* VCLZ helpers for byte, halfword and word elements. */
DO_1OP(vclzb, 1, uint8_t, DO_CLZ_B)
DO_1OP(vclzh, 2, uint16_t, DO_CLZ_H)
DO_1OP(vclzw, 4, uint32_t, clz32)

View File

@ -29,6 +29,7 @@
#include "decode-mve.c.inc"
/*
 * Function type for the load/store helper generators; presumably the
 * arguments follow the helper signature (env, pointer, i32) -- confirm
 * against the callers.
 */
typedef void MVEGenLdStFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
/*
 * Function type for 1-input vector op helper generators; do_1op()
 * calls these as fn(cpu_env, qd, qm).
 */
typedef void MVEGenOneOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
/* Return the offset of a Qn register (same semantics as aa32_vfp_qreg()) */
static inline long mve_qreg_offset(unsigned reg)
@ -160,3 +161,40 @@ static bool trans_VLDR_VSTR(DisasContext *s, arg_VLDR_VSTR *a)
DO_VLDST_WIDE_NARROW(VLDSTB_H, vldrb_sh, vldrb_uh, vstrb_h)
DO_VLDST_WIDE_NARROW(VLDSTB_W, vldrb_sw, vldrb_uw, vstrb_w)
DO_VLDST_WIDE_NARROW(VLDSTH_W, vldrh_sw, vldrh_uw, vstrh_w)
/*
 * Common translation code for MVE 1-input vector operations.
 *
 * @s:  disassembly context
 * @a:  decoded arguments (qd, qm, size)
 * @fn: helper generator for the selected element size, or NULL if the
 *      size is not supported
 *
 * Returns false when MVE is not present, when the Q register check
 * fails or when @fn is NULL (by the usual decoder convention the insn
 * is then treated as not handled -- confirm against the caller).
 * Returns true otherwise; if the ECI or VFP access checks fail, no
 * helper call is emitted.
 */
static bool do_1op(DisasContext *s, arg_1op *a, MVEGenOneOpFn fn)
{
    TCGv_ptr dest_ptr, src_ptr;

    if (!dc_isar_feature(aa32_mve, s)) {
        return false;
    }
    if (!mve_check_qreg_bank(s, a->qd | a->qm)) {
        return false;
    }
    if (!fn) {
        return false;
    }

    if (!(mve_eci_check(s) && vfp_access_check(s))) {
        return true;
    }

    dest_ptr = mve_qreg_ptr(a->qd);
    src_ptr = mve_qreg_ptr(a->qm);
    fn(cpu_env, dest_ptr, src_ptr);
    tcg_temp_free_ptr(dest_ptr);
    tcg_temp_free_ptr(src_ptr);
    mve_update_eci(s);
    return true;
}
/*
 * Define trans_<INSN> for a 1-input vector operation whose helpers are
 * named gen_helper_mve_<FN>{b,h,w}. The helper generator is selected
 * by a->size; size 3 maps to NULL, which do_1op() rejects by returning
 * false.
 */
#define DO_1OP(INSN, FN)                                        \
    static bool trans_##INSN(DisasContext *s, arg_1op *a)       \
    {                                                           \
        static MVEGenOneOpFn * const by_size[] = {              \
            [0] = gen_helper_mve_##FN##b,                       \
            [1] = gen_helper_mve_##FN##h,                       \
            [2] = gen_helper_mve_##FN##w,                       \
            [3] = NULL,                                         \
        };                                                      \
        MVEGenOneOpFn *gen = by_size[a->size];                  \
        return do_1op(s, a, gen);                               \
    }

DO_1OP(VCLZ, vclz)