target/arm: Convert SQRDMLAH, SQRDMLSH to decodetree
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20240625183536.1672454-5-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
parent
7619129f0d
commit
f698e45270
@ -979,6 +979,16 @@ DEF_HELPER_FLAGS_5(neon_sqrdmulh_idx_h, TCG_CALL_NO_RWG,
|
||||
DEF_HELPER_FLAGS_5(neon_sqrdmulh_idx_s, TCG_CALL_NO_RWG,
|
||||
void, ptr, ptr, ptr, ptr, i32)
|
||||
|
||||
/*
 * Declarations for the indexed-element SQRDMLAH / SQRDMLSH helpers, in
 * 16-bit (_h) and 32-bit (_s) variants.  Each returns void and takes four
 * pointers followed by a 32-bit value; the matching HELPER() definitions
 * name these arguments vd, vn, vm, vq and desc.
 */
DEF_HELPER_FLAGS_5(neon_sqrdmlah_idx_h, TCG_CALL_NO_RWG,
|
||||
void, ptr, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_5(neon_sqrdmlah_idx_s, TCG_CALL_NO_RWG,
|
||||
void, ptr, ptr, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_5(neon_sqrdmlsh_idx_h, TCG_CALL_NO_RWG,
|
||||
void, ptr, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_5(neon_sqrdmlsh_idx_s, TCG_CALL_NO_RWG,
|
||||
void, ptr, ptr, ptr, ptr, i32)
|
||||
|
||||
DEF_HELPER_FLAGS_4(sve2_sqdmulh_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(sve2_sqdmulh_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
DEF_HELPER_FLAGS_4(sve2_sqdmulh_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
|
||||
|
@ -781,6 +781,8 @@ CMEQ_s 0111 1110 111 ..... 10001 1 ..... ..... @rrr_d
|
||||
|
||||
SQDMULH_s 0101 1110 ..1 ..... 10110 1 ..... ..... @rrr_e
|
||||
SQRDMULH_s 0111 1110 ..1 ..... 10110 1 ..... ..... @rrr_e
|
||||
# SQRDMLAH / SQRDMLSH, scalar three-register form.  The two patterns differ
# only in the 10000 / 10001 field.  The translator registers both behind the
# aa64_rdm feature test and rejects element sizes other than 16 and 32 bits.
SQRDMLAH_s 0111 1110 ..0 ..... 10000 1 ..... ..... @rrr_e
|
||||
SQRDMLSH_s 0111 1110 ..0 ..... 10001 1 ..... ..... @rrr_e
|
||||
|
||||
### Advanced SIMD scalar pairwise
|
||||
|
||||
@ -941,6 +943,8 @@ MLS_v 0.10 1110 ..1 ..... 10010 1 ..... ..... @qrrr_e
|
||||
|
||||
SQDMULH_v 0.00 1110 ..1 ..... 10110 1 ..... ..... @qrrr_e
|
||||
SQRDMULH_v 0.10 1110 ..1 ..... 10110 1 ..... ..... @qrrr_e
|
||||
# SQRDMLAH / SQRDMLSH, vector three-register form.  The two patterns differ
# only in the 10000 / 10001 field; both are registered in the translator
# behind the aa64_rdm feature test.
SQRDMLAH_v 0.10 1110 ..0 ..... 10000 1 ..... ..... @qrrr_e
|
||||
SQRDMLSH_v 0.10 1110 ..0 ..... 10001 1 ..... ..... @qrrr_e
|
||||
|
||||
### Advanced SIMD scalar x indexed element
|
||||
|
||||
@ -966,6 +970,12 @@ SQDMULH_si 0101 1111 10 .. .... 1100 . 0 ..... ..... @rrx_s
|
||||
SQRDMULH_si 0101 1111 01 .. .... 1101 . 0 ..... ..... @rrx_h
|
||||
SQRDMULH_si 0101 1111 10 . ..... 1101 . 0 ..... ..... @rrx_s
|
||||
|
||||
# SQRDMLAH / SQRDMLSH, scalar x indexed element.  Each mnemonic has one
# pattern using @rrx_h and one using @rrx_s; the MLAH and MLSH patterns
# differ only in the 1101 / 1111 field.  The translator registers both
# mnemonics behind the aa64_rdm feature test.
SQRDMLAH_si 0111 1111 01 .. .... 1101 . 0 ..... ..... @rrx_h
|
||||
SQRDMLAH_si 0111 1111 10 .. .... 1101 . 0 ..... ..... @rrx_s
|
||||
|
||||
SQRDMLSH_si 0111 1111 01 .. .... 1111 . 0 ..... ..... @rrx_h
|
||||
SQRDMLSH_si 0111 1111 10 .. .... 1111 . 0 ..... ..... @rrx_s
|
||||
|
||||
### Advanced SIMD vector x indexed element
|
||||
|
||||
FMUL_vi 0.00 1111 00 .. .... 1001 . 0 ..... ..... @qrrx_h
|
||||
@ -1004,6 +1014,12 @@ SQDMULH_vi 0.00 1111 10 . ..... 1100 . 0 ..... ..... @qrrx_s
|
||||
SQRDMULH_vi 0.00 1111 01 .. .... 1101 . 0 ..... ..... @qrrx_h
|
||||
SQRDMULH_vi 0.00 1111 10 . ..... 1101 . 0 ..... ..... @qrrx_s
|
||||
|
||||
# SQRDMLAH / SQRDMLSH, vector x indexed element.  Each mnemonic has one
# pattern using @qrrx_h and one using @qrrx_s; the MLAH and MLSH patterns
# differ only in the 1101 / 1111 field.  The translator registers both
# mnemonics behind the aa64_rdm feature test.
SQRDMLAH_vi 0.10 1111 01 .. .... 1101 . 0 ..... ..... @qrrx_h
|
||||
SQRDMLAH_vi 0.10 1111 10 .. .... 1101 . 0 ..... ..... @qrrx_s
|
||||
|
||||
SQRDMLSH_vi 0.10 1111 01 .. .... 1111 . 0 ..... ..... @qrrx_h
|
||||
SQRDMLSH_vi 0.10 1111 10 .. .... 1111 . 0 ..... ..... @qrrx_s
|
||||
|
||||
# Floating-point conditional select
|
||||
|
||||
FCSEL 0001 1110 .. 1 rm:5 cond:4 11 rn:5 rd:5 esz=%esz_hsd
|
||||
|
@ -5235,6 +5235,43 @@ static const ENVScalar2 f_scalar_sqrdmulh = {
|
||||
};
|
||||
TRANS(SQRDMULH_s, do_env_scalar2_hs, a, &f_scalar_sqrdmulh)
|
||||
|
||||
/*
 * Generator table for a scalar three-operand operation whose helper also
 * takes tcg_env.  Users in this file index gen_hs[] with (esz - 1) after
 * restricting esz to MO_16 / MO_32, and the initialisers list the _s16
 * generator first, then the _s32 one.
 */
typedef struct ENVScalar3 {
|
||||
NeonGenThreeOpEnvFn *gen_hs[2];
|
||||
} ENVScalar3;
|
||||
|
||||
static bool do_env_scalar3_hs(DisasContext *s, arg_rrr_e *a,
|
||||
const ENVScalar3 *f)
|
||||
{
|
||||
TCGv_i32 t0, t1, t2;
|
||||
|
||||
if (a->esz != MO_16 && a->esz != MO_32) {
|
||||
return false;
|
||||
}
|
||||
if (!fp_access_check(s)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
t0 = tcg_temp_new_i32();
|
||||
t1 = tcg_temp_new_i32();
|
||||
t2 = tcg_temp_new_i32();
|
||||
read_vec_element_i32(s, t0, a->rn, 0, a->esz);
|
||||
read_vec_element_i32(s, t1, a->rm, 0, a->esz);
|
||||
read_vec_element_i32(s, t2, a->rd, 0, a->esz);
|
||||
f->gen_hs[a->esz - 1](t0, tcg_env, t0, t1, t2);
|
||||
write_fp_sreg(s, a->rd, t0);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
 * Scalar SQRDMLAH: 16-bit and 32-bit generators, in that order, fed to
 * do_env_scalar3_hs().  The translator entry is gated on aa64_rdm.
 */
static const ENVScalar3 f_scalar_sqrdmlah = {
|
||||
{ gen_helper_neon_qrdmlah_s16, gen_helper_neon_qrdmlah_s32 }
|
||||
};
|
||||
TRANS_FEAT(SQRDMLAH_s, aa64_rdm, do_env_scalar3_hs, a, &f_scalar_sqrdmlah)
|
||||
|
||||
/*
 * Scalar SQRDMLSH: 16-bit and 32-bit generators, in that order, fed to
 * do_env_scalar3_hs().  The translator entry is gated on aa64_rdm.
 */
static const ENVScalar3 f_scalar_sqrdmlsh = {
|
||||
{ gen_helper_neon_qrdmlsh_s16, gen_helper_neon_qrdmlsh_s32 }
|
||||
};
|
||||
TRANS_FEAT(SQRDMLSH_s, aa64_rdm, do_env_scalar3_hs, a, &f_scalar_sqrdmlsh)
|
||||
|
||||
static bool do_cmop_d(DisasContext *s, arg_rrr_e *a, TCGCond cond)
|
||||
{
|
||||
if (fp_access_check(s)) {
|
||||
@ -5552,6 +5589,8 @@ TRANS(CMTST_v, do_gvec_fn3, a, gen_gvec_cmtst)
|
||||
|
||||
TRANS(SQDMULH_v, do_gvec_fn3_no8_no64, a, gen_gvec_sqdmulh_qc)
|
||||
TRANS(SQRDMULH_v, do_gvec_fn3_no8_no64, a, gen_gvec_sqrdmulh_qc)
|
||||
/*
 * Vector SQRDMLAH / SQRDMLSH: gated on aa64_rdm and expanded through
 * do_gvec_fn3_no8_no64() with the corresponding *_qc gvec expander.
 */
TRANS_FEAT(SQRDMLAH_v, aa64_rdm, do_gvec_fn3_no8_no64, a, gen_gvec_sqrdmlah_qc)
|
||||
TRANS_FEAT(SQRDMLSH_v, aa64_rdm, do_gvec_fn3_no8_no64, a, gen_gvec_sqrdmlsh_qc)
|
||||
|
||||
/*
|
||||
* Advanced SIMD scalar/vector x indexed element
|
||||
@ -5681,6 +5720,29 @@ static bool do_env_scalar2_idx_hs(DisasContext *s, arg_rrx_e *a,
|
||||
TRANS(SQDMULH_si, do_env_scalar2_idx_hs, a, &f_scalar_sqdmulh)
|
||||
TRANS(SQRDMULH_si, do_env_scalar2_idx_hs, a, &f_scalar_sqrdmulh)
|
||||
|
||||
static bool do_env_scalar3_idx_hs(DisasContext *s, arg_rrx_e *a,
|
||||
const ENVScalar3 *f)
|
||||
{
|
||||
if (a->esz < MO_16 || a->esz > MO_32) {
|
||||
return false;
|
||||
}
|
||||
if (fp_access_check(s)) {
|
||||
TCGv_i32 t0 = tcg_temp_new_i32();
|
||||
TCGv_i32 t1 = tcg_temp_new_i32();
|
||||
TCGv_i32 t2 = tcg_temp_new_i32();
|
||||
|
||||
read_vec_element_i32(s, t0, a->rn, 0, a->esz);
|
||||
read_vec_element_i32(s, t1, a->rm, a->idx, a->esz);
|
||||
read_vec_element_i32(s, t2, a->rd, 0, a->esz);
|
||||
f->gen_hs[a->esz - 1](t0, tcg_env, t0, t1, t2);
|
||||
write_fp_sreg(s, a->rd, t0);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
 * Scalar x indexed element SQRDMLAH / SQRDMLSH: gated on aa64_rdm and
 * reusing the same generator tables as the non-indexed scalar forms.
 */
TRANS_FEAT(SQRDMLAH_si, aa64_rdm, do_env_scalar3_idx_hs, a, &f_scalar_sqrdmlah)
|
||||
TRANS_FEAT(SQRDMLSH_si, aa64_rdm, do_env_scalar3_idx_hs, a, &f_scalar_sqrdmlsh)
|
||||
|
||||
static bool do_fp3_vector_idx(DisasContext *s, arg_qrrx_e *a,
|
||||
gen_helper_gvec_3_ptr * const fns[3])
|
||||
{
|
||||
@ -5838,6 +5900,20 @@ static gen_helper_gvec_4 * const f_vector_idx_sqrdmulh[2] = {
|
||||
};
|
||||
TRANS(SQRDMULH_vi, do_int3_qc_vector_idx, a, f_vector_idx_sqrdmulh)
|
||||
|
||||
/*
 * Vector x indexed element SQRDMLAH: out-of-line helpers for the _h and
 * _s variants, in that order, passed to do_int3_qc_vector_idx().  The
 * translator entry is gated on aa64_rdm.
 */
static gen_helper_gvec_4 * const f_vector_idx_sqrdmlah[2] = {
|
||||
gen_helper_neon_sqrdmlah_idx_h,
|
||||
gen_helper_neon_sqrdmlah_idx_s,
|
||||
};
|
||||
TRANS_FEAT(SQRDMLAH_vi, aa64_rdm, do_int3_qc_vector_idx, a,
|
||||
f_vector_idx_sqrdmlah)
|
||||
|
||||
/*
 * Vector x indexed element SQRDMLSH: out-of-line helpers for the _h and
 * _s variants, in that order, passed to do_int3_qc_vector_idx().  The
 * translator entry is gated on aa64_rdm.
 */
static gen_helper_gvec_4 * const f_vector_idx_sqrdmlsh[2] = {
|
||||
gen_helper_neon_sqrdmlsh_idx_h,
|
||||
gen_helper_neon_sqrdmlsh_idx_s,
|
||||
};
|
||||
TRANS_FEAT(SQRDMLSH_vi, aa64_rdm, do_int3_qc_vector_idx, a,
|
||||
f_vector_idx_sqrdmlsh)
|
||||
|
||||
/*
|
||||
* Advanced SIMD scalar pairwise
|
||||
*/
|
||||
@ -9536,84 +9612,6 @@ static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
|
||||
}
|
||||
}
|
||||
|
||||
/* AdvSIMD scalar three same extra
|
||||
* 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0
|
||||
* +-----+---+-----------+------+---+------+---+--------+---+----+----+
|
||||
* | 0 1 | U | 1 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd |
|
||||
* +-----+---+-----------+------+---+------+---+--------+---+----+----+
|
||||
*/
|
||||
static void disas_simd_scalar_three_reg_same_extra(DisasContext *s,
|
||||
uint32_t insn)
|
||||
{
|
||||
int rd = extract32(insn, 0, 5);
|
||||
int rn = extract32(insn, 5, 5);
|
||||
int opcode = extract32(insn, 11, 4);
|
||||
int rm = extract32(insn, 16, 5);
|
||||
int size = extract32(insn, 22, 2);
|
||||
bool u = extract32(insn, 29, 1);
|
||||
TCGv_i32 ele1, ele2, ele3;
|
||||
TCGv_i64 res;
|
||||
bool feature;
|
||||
|
||||
switch (u * 16 + opcode) {
|
||||
case 0x10: /* SQRDMLAH (vector) */
|
||||
case 0x11: /* SQRDMLSH (vector) */
|
||||
if (size != 1 && size != 2) {
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
}
|
||||
feature = dc_isar_feature(aa64_rdm, s);
|
||||
break;
|
||||
default:
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
}
|
||||
if (!feature) {
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
}
|
||||
if (!fp_access_check(s)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* Do a single operation on the lowest element in the vector.
|
||||
* We use the standard Neon helpers and rely on 0 OP 0 == 0
|
||||
* with no side effects for all these operations.
|
||||
* OPTME: special-purpose helpers would avoid doing some
|
||||
* unnecessary work in the helper for the 16 bit cases.
|
||||
*/
|
||||
ele1 = tcg_temp_new_i32();
|
||||
ele2 = tcg_temp_new_i32();
|
||||
ele3 = tcg_temp_new_i32();
|
||||
|
||||
read_vec_element_i32(s, ele1, rn, 0, size);
|
||||
read_vec_element_i32(s, ele2, rm, 0, size);
|
||||
read_vec_element_i32(s, ele3, rd, 0, size);
|
||||
|
||||
switch (opcode) {
|
||||
case 0x0: /* SQRDMLAH */
|
||||
if (size == 1) {
|
||||
gen_helper_neon_qrdmlah_s16(ele3, tcg_env, ele1, ele2, ele3);
|
||||
} else {
|
||||
gen_helper_neon_qrdmlah_s32(ele3, tcg_env, ele1, ele2, ele3);
|
||||
}
|
||||
break;
|
||||
case 0x1: /* SQRDMLSH */
|
||||
if (size == 1) {
|
||||
gen_helper_neon_qrdmlsh_s16(ele3, tcg_env, ele1, ele2, ele3);
|
||||
} else {
|
||||
gen_helper_neon_qrdmlsh_s32(ele3, tcg_env, ele1, ele2, ele3);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
|
||||
res = tcg_temp_new_i64();
|
||||
tcg_gen_extu_i32_i64(res, ele3);
|
||||
write_fp_dreg(s, rd, res);
|
||||
}
|
||||
|
||||
static void handle_2misc_64(DisasContext *s, int opcode, bool u,
|
||||
TCGv_i64 tcg_rd, TCGv_i64 tcg_rn,
|
||||
TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus)
|
||||
@ -10892,14 +10890,6 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
|
||||
int rot;
|
||||
|
||||
switch (u * 16 + opcode) {
|
||||
case 0x10: /* SQRDMLAH (vector) */
|
||||
case 0x11: /* SQRDMLSH (vector) */
|
||||
if (size != 1 && size != 2) {
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
}
|
||||
feature = dc_isar_feature(aa64_rdm, s);
|
||||
break;
|
||||
case 0x02: /* SDOT (vector) */
|
||||
case 0x12: /* UDOT (vector) */
|
||||
if (size != MO_32) {
|
||||
@ -10957,6 +10947,8 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
|
||||
}
|
||||
break;
|
||||
default:
|
||||
case 0x10: /* SQRDMLAH (vector) */
|
||||
case 0x11: /* SQRDMLSH (vector) */
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
}
|
||||
@ -10969,14 +10961,6 @@ static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
|
||||
}
|
||||
|
||||
switch (opcode) {
|
||||
case 0x0: /* SQRDMLAH (vector) */
|
||||
gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlah_qc, size);
|
||||
return;
|
||||
|
||||
case 0x1: /* SQRDMLSH (vector) */
|
||||
gen_gvec_fn3(s, is_q, rd, rn, rm, gen_gvec_sqrdmlsh_qc, size);
|
||||
return;
|
||||
|
||||
case 0x2: /* SDOT / UDOT */
|
||||
gen_gvec_op4_ool(s, is_q, rd, rn, rm, rd, 0,
|
||||
u ? gen_helper_gvec_udot_b : gen_helper_gvec_sdot_b);
|
||||
@ -12059,13 +12043,6 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
|
||||
case 0x0b: /* SQDMULL, SQDMULL2 */
|
||||
is_long = true;
|
||||
break;
|
||||
case 0x1d: /* SQRDMLAH */
|
||||
case 0x1f: /* SQRDMLSH */
|
||||
if (!dc_isar_feature(aa64_rdm, s)) {
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
}
|
||||
break;
|
||||
case 0x0e: /* SDOT */
|
||||
case 0x1e: /* UDOT */
|
||||
if (is_scalar || size != MO_32 || !dc_isar_feature(aa64_dp, s)) {
|
||||
@ -12127,6 +12104,8 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
|
||||
case 0x18: /* FMLAL2 */
|
||||
case 0x19: /* FMULX */
|
||||
case 0x1c: /* FMLSL2 */
|
||||
case 0x1d: /* SQRDMLAH */
|
||||
case 0x1f: /* SQRDMLSH */
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
}
|
||||
@ -12320,33 +12299,13 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
|
||||
tcg_op, tcg_idx);
|
||||
}
|
||||
break;
|
||||
case 0x1d: /* SQRDMLAH */
|
||||
read_vec_element_i32(s, tcg_res, rd, pass,
|
||||
is_scalar ? size : MO_32);
|
||||
if (size == 1) {
|
||||
gen_helper_neon_qrdmlah_s16(tcg_res, tcg_env,
|
||||
tcg_op, tcg_idx, tcg_res);
|
||||
} else {
|
||||
gen_helper_neon_qrdmlah_s32(tcg_res, tcg_env,
|
||||
tcg_op, tcg_idx, tcg_res);
|
||||
}
|
||||
break;
|
||||
case 0x1f: /* SQRDMLSH */
|
||||
read_vec_element_i32(s, tcg_res, rd, pass,
|
||||
is_scalar ? size : MO_32);
|
||||
if (size == 1) {
|
||||
gen_helper_neon_qrdmlsh_s16(tcg_res, tcg_env,
|
||||
tcg_op, tcg_idx, tcg_res);
|
||||
} else {
|
||||
gen_helper_neon_qrdmlsh_s32(tcg_res, tcg_env,
|
||||
tcg_op, tcg_idx, tcg_res);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
case 0x01: /* FMLA */
|
||||
case 0x05: /* FMLS */
|
||||
case 0x09: /* FMUL */
|
||||
case 0x19: /* FMULX */
|
||||
case 0x1d: /* SQRDMLAH */
|
||||
case 0x1f: /* SQRDMLSH */
|
||||
g_assert_not_reached();
|
||||
}
|
||||
|
||||
@ -12538,7 +12497,6 @@ static const AArch64DecodeTable data_proc_simd[] = {
|
||||
{ 0x0e000000, 0xbf208c00, disas_simd_tb },
|
||||
{ 0x0e000800, 0xbf208c00, disas_simd_zip_trn },
|
||||
{ 0x2e000000, 0xbf208400, disas_simd_ext },
|
||||
{ 0x5e008400, 0xdf208400, disas_simd_scalar_three_reg_same_extra },
|
||||
{ 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff },
|
||||
{ 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
|
||||
{ 0x5f000000, 0xdf000400, disas_simd_indexed }, /* scalar indexed */
|
||||
|
@ -347,6 +347,42 @@ void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
|
||||
clear_tail(d, opr_sz, simd_maxsz(desc));
|
||||
}
|
||||
|
||||
void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
|
||||
void *vq, uint32_t desc)
|
||||
{
|
||||
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
||||
int idx = simd_data(desc);
|
||||
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
|
||||
intptr_t elements = opr_sz / 2;
|
||||
intptr_t eltspersegment = MIN(16 / 2, elements);
|
||||
|
||||
for (i = 0; i < elements; i += 16 / 2) {
|
||||
int16_t mm = m[i];
|
||||
for (j = 0; j < eltspersegment; ++j) {
|
||||
d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
|
||||
}
|
||||
}
|
||||
clear_tail(d, opr_sz, simd_maxsz(desc));
|
||||
}
|
||||
|
||||
void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
|
||||
void *vq, uint32_t desc)
|
||||
{
|
||||
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
||||
int idx = simd_data(desc);
|
||||
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
|
||||
intptr_t elements = opr_sz / 2;
|
||||
intptr_t eltspersegment = MIN(16 / 2, elements);
|
||||
|
||||
for (i = 0; i < elements; i += 16 / 2) {
|
||||
int16_t mm = m[i];
|
||||
for (j = 0; j < eltspersegment; ++j) {
|
||||
d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
|
||||
}
|
||||
}
|
||||
clear_tail(d, opr_sz, simd_maxsz(desc));
|
||||
}
|
||||
|
||||
void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
|
||||
void *va, uint32_t desc)
|
||||
{
|
||||
@ -546,6 +582,42 @@ void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
|
||||
clear_tail(d, opr_sz, simd_maxsz(desc));
|
||||
}
|
||||
|
||||
void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
|
||||
void *vq, uint32_t desc)
|
||||
{
|
||||
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
||||
int idx = simd_data(desc);
|
||||
int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
|
||||
intptr_t elements = opr_sz / 4;
|
||||
intptr_t eltspersegment = MIN(16 / 4, elements);
|
||||
|
||||
for (i = 0; i < elements; i += 16 / 4) {
|
||||
int32_t mm = m[i];
|
||||
for (j = 0; j < eltspersegment; ++j) {
|
||||
d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
|
||||
}
|
||||
}
|
||||
clear_tail(d, opr_sz, simd_maxsz(desc));
|
||||
}
|
||||
|
||||
void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
|
||||
void *vq, uint32_t desc)
|
||||
{
|
||||
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
||||
int idx = simd_data(desc);
|
||||
int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
|
||||
intptr_t elements = opr_sz / 4;
|
||||
intptr_t eltspersegment = MIN(16 / 4, elements);
|
||||
|
||||
for (i = 0; i < elements; i += 16 / 4) {
|
||||
int32_t mm = m[i];
|
||||
for (j = 0; j < eltspersegment; ++j) {
|
||||
d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
|
||||
}
|
||||
}
|
||||
clear_tail(d, opr_sz, simd_maxsz(desc));
|
||||
}
|
||||
|
||||
void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
|
||||
void *va, uint32_t desc)
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user