target/arm: Inline scalar SQADD, UQADD, SQSUB, UQSUB

This eliminates the last uses of these neon helpers.
Incorporate the MO_64 expanders as an option to the vector expander.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20240528203044.612851-7-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Richard Henderson 2024-05-28 13:30:17 -07:00 committed by Peter Maydell
parent 1217edace8
commit f4fa83d614
5 changed files with 169 additions and 208 deletions

View File

@ -268,23 +268,6 @@ DEF_HELPER_FLAGS_2(fjcvtzs, TCG_CALL_NO_RWG, i64, f64, ptr)
DEF_HELPER_FLAGS_3(check_hcr_el2_trap, TCG_CALL_NO_WG, void, env, i32, i32)
/* neon_helper.c */
DEF_HELPER_FLAGS_3(neon_qadd_u8, TCG_CALL_NO_RWG, i32, env, i32, i32)
DEF_HELPER_FLAGS_3(neon_qadd_s8, TCG_CALL_NO_RWG, i32, env, i32, i32)
DEF_HELPER_FLAGS_3(neon_qadd_u16, TCG_CALL_NO_RWG, i32, env, i32, i32)
DEF_HELPER_FLAGS_3(neon_qadd_s16, TCG_CALL_NO_RWG, i32, env, i32, i32)
DEF_HELPER_FLAGS_3(neon_qadd_u32, TCG_CALL_NO_RWG, i32, env, i32, i32)
DEF_HELPER_FLAGS_3(neon_qadd_s32, TCG_CALL_NO_RWG, i32, env, i32, i32)
DEF_HELPER_3(neon_qsub_u8, i32, env, i32, i32)
DEF_HELPER_3(neon_qsub_s8, i32, env, i32, i32)
DEF_HELPER_3(neon_qsub_u16, i32, env, i32, i32)
DEF_HELPER_3(neon_qsub_s16, i32, env, i32, i32)
DEF_HELPER_3(neon_qsub_u32, i32, env, i32, i32)
DEF_HELPER_3(neon_qsub_s32, i32, env, i32, i32)
DEF_HELPER_3(neon_qadd_u64, i64, env, i64, i64)
DEF_HELPER_3(neon_qadd_s64, i64, env, i64, i64)
DEF_HELPER_3(neon_qsub_u64, i64, env, i64, i64)
DEF_HELPER_3(neon_qsub_s64, i64, env, i64, i64)
DEF_HELPER_2(neon_hadd_s8, i32, i32, i32)
DEF_HELPER_2(neon_hadd_u8, i32, i32, i32)
DEF_HELPER_2(neon_hadd_s16, i32, i32, i32)

View File

@ -1218,6 +1218,28 @@ void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
TCGv_i64 tmp = tcg_temp_new_i64();
tcg_gen_add_i64(tmp, a, b);
tcg_gen_umin_i64(res, tmp, tcg_constant_i64(max));
tcg_gen_xor_i64(tmp, tmp, res);
tcg_gen_or_i64(qc, qc, tmp);
}
void gen_uqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
TCGv_i64 t = tcg_temp_new_i64();
tcg_gen_add_i64(t, a, b);
tcg_gen_movcond_i64(TCG_COND_LTU, res, t, a,
tcg_constant_i64(UINT64_MAX), t);
tcg_gen_xor_i64(t, t, res);
tcg_gen_or_i64(qc, qc, t);
}
static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
TCGv_vec a, TCGv_vec b)
{
@ -1251,6 +1273,7 @@ void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
.opt_opc = vecop_list,
.vece = MO_32 },
{ .fniv = gen_uqadd_vec,
.fni8 = gen_uqadd_d,
.fno = gen_helper_gvec_uqadd_d,
.write_aofs = true,
.opt_opc = vecop_list,
@ -1262,6 +1285,41 @@ void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
int64_t min = -1ll - max;
TCGv_i64 tmp = tcg_temp_new_i64();
tcg_gen_add_i64(tmp, a, b);
tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
tcg_gen_xor_i64(tmp, tmp, res);
tcg_gen_or_i64(qc, qc, tmp);
}
void gen_sqadd_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
TCGv_i64 t0 = tcg_temp_new_i64();
TCGv_i64 t1 = tcg_temp_new_i64();
TCGv_i64 t2 = tcg_temp_new_i64();
tcg_gen_add_i64(t0, a, b);
/* Compute signed overflow indication into T1 */
tcg_gen_xor_i64(t1, a, b);
tcg_gen_xor_i64(t2, t0, a);
tcg_gen_andc_i64(t1, t2, t1);
/* Compute saturated value into T2 */
tcg_gen_sari_i64(t2, a, 63);
tcg_gen_xori_i64(t2, t2, INT64_MAX);
tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
tcg_gen_xor_i64(t0, t0, res);
tcg_gen_or_i64(qc, qc, t0);
}
static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
TCGv_vec a, TCGv_vec b)
{
@ -1295,6 +1353,7 @@ void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
.write_aofs = true,
.vece = MO_32 },
{ .fniv = gen_sqadd_vec,
.fni8 = gen_sqadd_d,
.fno = gen_helper_gvec_sqadd_d,
.opt_opc = vecop_list,
.write_aofs = true,
@ -1306,6 +1365,26 @@ void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
TCGv_i64 tmp = tcg_temp_new_i64();
tcg_gen_sub_i64(tmp, a, b);
tcg_gen_smax_i64(res, tmp, tcg_constant_i64(0));
tcg_gen_xor_i64(tmp, tmp, res);
tcg_gen_or_i64(qc, qc, tmp);
}
void gen_uqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
TCGv_i64 t = tcg_temp_new_i64();
tcg_gen_sub_i64(t, a, b);
tcg_gen_movcond_i64(TCG_COND_LTU, res, a, b, tcg_constant_i64(0), t);
tcg_gen_xor_i64(t, t, res);
tcg_gen_or_i64(qc, qc, t);
}
static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
TCGv_vec a, TCGv_vec b)
{
@ -1339,6 +1418,7 @@ void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
.write_aofs = true,
.vece = MO_32 },
{ .fniv = gen_uqsub_vec,
.fni8 = gen_uqsub_d,
.fno = gen_helper_gvec_uqsub_d,
.opt_opc = vecop_list,
.write_aofs = true,
@ -1350,6 +1430,41 @@ void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
rn_ofs, rm_ofs, opr_sz, max_sz, &ops[vece]);
}
void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
{
int64_t max = MAKE_64BIT_MASK(0, (8 << esz) - 1);
int64_t min = -1ll - max;
TCGv_i64 tmp = tcg_temp_new_i64();
tcg_gen_sub_i64(tmp, a, b);
tcg_gen_smin_i64(res, tmp, tcg_constant_i64(max));
tcg_gen_smax_i64(res, res, tcg_constant_i64(min));
tcg_gen_xor_i64(tmp, tmp, res);
tcg_gen_or_i64(qc, qc, tmp);
}
void gen_sqsub_d(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b)
{
TCGv_i64 t0 = tcg_temp_new_i64();
TCGv_i64 t1 = tcg_temp_new_i64();
TCGv_i64 t2 = tcg_temp_new_i64();
tcg_gen_sub_i64(t0, a, b);
/* Compute signed overflow indication into T1 */
tcg_gen_xor_i64(t1, a, b);
tcg_gen_xor_i64(t2, t0, a);
tcg_gen_and_i64(t1, t1, t2);
/* Compute saturated value into T2 */
tcg_gen_sari_i64(t2, a, 63);
tcg_gen_xori_i64(t2, t2, INT64_MAX);
tcg_gen_movcond_i64(TCG_COND_LT, res, t1, tcg_constant_i64(0), t2, t0);
tcg_gen_xor_i64(t0, t0, res);
tcg_gen_or_i64(qc, qc, t0);
}
static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec qc,
TCGv_vec a, TCGv_vec b)
{
@ -1383,6 +1498,7 @@ void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
.write_aofs = true,
.vece = MO_32 },
{ .fniv = gen_sqsub_vec,
.fni8 = gen_sqsub_d,
.fno = gen_helper_gvec_sqsub_d,
.opt_opc = vecop_list,
.write_aofs = true,

View File

@ -155,168 +155,6 @@ uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
return arg; \
}
#define NEON_USAT(dest, src1, src2, type) do { \
uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
if (tmp != (type)tmp) { \
SET_QC(); \
dest = ~0; \
} else { \
dest = tmp; \
}} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT
uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
uint32_t res = a + b;
if (res < a) {
SET_QC();
res = ~0;
}
return res;
}
uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
uint64_t res;
res = src1 + src2;
if (res < src1) {
SET_QC();
res = ~(uint64_t)0;
}
return res;
}
#define NEON_SSAT(dest, src1, src2, type) do { \
int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
if (tmp != (type)tmp) { \
SET_QC(); \
if (src2 > 0) { \
tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
} else { \
tmp = 1 << (sizeof(type) * 8 - 1); \
} \
} \
dest = tmp; \
} while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT
uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
uint32_t res = a + b;
if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
SET_QC();
res = ~(((int32_t)a >> 31) ^ SIGNBIT);
}
return res;
}
uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
uint64_t res;
res = src1 + src2;
if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
SET_QC();
res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
}
return res;
}
#define NEON_USAT(dest, src1, src2, type) do { \
uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
if (tmp != (type)tmp) { \
SET_QC(); \
dest = 0; \
} else { \
dest = tmp; \
}} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT
uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
uint32_t res = a - b;
if (res > a) {
SET_QC();
res = 0;
}
return res;
}
uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
uint64_t res;
if (src1 < src2) {
SET_QC();
res = 0;
} else {
res = src1 - src2;
}
return res;
}
#define NEON_SSAT(dest, src1, src2, type) do { \
int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
if (tmp != (type)tmp) { \
SET_QC(); \
if (src2 < 0) { \
tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
} else { \
tmp = 1 << (sizeof(type) * 8 - 1); \
} \
} \
dest = tmp; \
} while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT
uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
uint32_t res = a - b;
if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
SET_QC();
res = ~(((int32_t)a >> 31) ^ SIGNBIT);
}
return res;
}
uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
uint64_t res;
res = src1 - src2;
if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
SET_QC();
res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
}
return res;
}
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)

View File

@ -9291,21 +9291,28 @@ static void handle_3same_64(DisasContext *s, int opcode, bool u,
* or scalar-three-reg-same groups.
*/
TCGCond cond;
TCGv_i64 qc;
switch (opcode) {
case 0x1: /* SQADD */
qc = tcg_temp_new_i64();
tcg_gen_ld_i64(qc, tcg_env, offsetof(CPUARMState, vfp.qc));
if (u) {
gen_helper_neon_qadd_u64(tcg_rd, tcg_env, tcg_rn, tcg_rm);
gen_uqadd_d(tcg_rd, qc, tcg_rn, tcg_rm);
} else {
gen_helper_neon_qadd_s64(tcg_rd, tcg_env, tcg_rn, tcg_rm);
gen_sqadd_d(tcg_rd, qc, tcg_rn, tcg_rm);
}
tcg_gen_st_i64(qc, tcg_env, offsetof(CPUARMState, vfp.qc));
break;
case 0x5: /* SQSUB */
qc = tcg_temp_new_i64();
tcg_gen_ld_i64(qc, tcg_env, offsetof(CPUARMState, vfp.qc));
if (u) {
gen_helper_neon_qsub_u64(tcg_rd, tcg_env, tcg_rn, tcg_rm);
gen_uqsub_d(tcg_rd, qc, tcg_rn, tcg_rm);
} else {
gen_helper_neon_qsub_s64(tcg_rd, tcg_env, tcg_rn, tcg_rm);
gen_sqsub_d(tcg_rd, qc, tcg_rn, tcg_rm);
}
tcg_gen_st_i64(qc, tcg_env, offsetof(CPUARMState, vfp.qc));
break;
case 0x6: /* CMGT, CMHI */
cond = u ? TCG_COND_GTU : TCG_COND_GT;
@ -9425,35 +9432,16 @@ static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
* OPTME: special-purpose helpers would avoid doing some
* unnecessary work in the helper for the 8 and 16 bit cases.
*/
NeonGenTwoOpEnvFn *genenvfn;
TCGv_i32 tcg_rn = tcg_temp_new_i32();
TCGv_i32 tcg_rm = tcg_temp_new_i32();
TCGv_i32 tcg_rd32 = tcg_temp_new_i32();
read_vec_element_i32(s, tcg_rn, rn, 0, size);
read_vec_element_i32(s, tcg_rm, rm, 0, size);
NeonGenTwoOpEnvFn *genenvfn = NULL;
void (*genfn)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64, MemOp) = NULL;
switch (opcode) {
case 0x1: /* SQADD, UQADD */
{
static NeonGenTwoOpEnvFn * const fns[3][2] = {
{ gen_helper_neon_qadd_s8, gen_helper_neon_qadd_u8 },
{ gen_helper_neon_qadd_s16, gen_helper_neon_qadd_u16 },
{ gen_helper_neon_qadd_s32, gen_helper_neon_qadd_u32 },
};
genenvfn = fns[size][u];
genfn = u ? gen_uqadd_bhs : gen_sqadd_bhs;
break;
}
case 0x5: /* SQSUB, UQSUB */
{
static NeonGenTwoOpEnvFn * const fns[3][2] = {
{ gen_helper_neon_qsub_s8, gen_helper_neon_qsub_u8 },
{ gen_helper_neon_qsub_s16, gen_helper_neon_qsub_u16 },
{ gen_helper_neon_qsub_s32, gen_helper_neon_qsub_u32 },
};
genenvfn = fns[size][u];
genfn = u ? gen_uqsub_bhs : gen_sqsub_bhs;
break;
}
case 0x9: /* SQSHL, UQSHL */
{
static NeonGenTwoOpEnvFn * const fns[3][2] = {
@ -9488,8 +9476,29 @@ static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
g_assert_not_reached();
}
genenvfn(tcg_rd32, tcg_env, tcg_rn, tcg_rm);
tcg_gen_extu_i32_i64(tcg_rd, tcg_rd32);
if (genenvfn) {
TCGv_i32 tcg_rn = tcg_temp_new_i32();
TCGv_i32 tcg_rm = tcg_temp_new_i32();
read_vec_element_i32(s, tcg_rn, rn, 0, size);
read_vec_element_i32(s, tcg_rm, rm, 0, size);
genenvfn(tcg_rn, tcg_env, tcg_rn, tcg_rm);
tcg_gen_extu_i32_i64(tcg_rd, tcg_rn);
} else {
TCGv_i64 tcg_rn = tcg_temp_new_i64();
TCGv_i64 tcg_rm = tcg_temp_new_i64();
TCGv_i64 qc = tcg_temp_new_i64();
read_vec_element(s, tcg_rn, rn, 0, size | (u ? 0 : MO_SIGN));
read_vec_element(s, tcg_rm, rm, 0, size | (u ? 0 : MO_SIGN));
tcg_gen_ld_i64(qc, tcg_env, offsetof(CPUARMState, vfp.qc));
genfn(tcg_rd, qc, tcg_rn, tcg_rm, size);
tcg_gen_st_i64(qc, tcg_env, offsetof(CPUARMState, vfp.qc));
if (!u) {
/* Truncate signed 64-bit result for writeback. */
tcg_gen_ext_i64(tcg_rd, tcg_rd, size);
}
}
}
write_fp_dreg(s, rd, tcg_rd);

View File

@ -466,12 +466,27 @@ void gen_sshl_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b);
void gen_ushl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void gen_sshl_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc,
TCGv_i64 a, TCGv_i64 b, MemOp esz);
void gen_uqadd_d(TCGv_i64 d, TCGv_i64 q, TCGv_i64 a, TCGv_i64 b);
void gen_gvec_uqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
void gen_sqadd_bhs(TCGv_i64 res, TCGv_i64 qc,
TCGv_i64 a, TCGv_i64 b, MemOp esz);
void gen_sqadd_d(TCGv_i64 d, TCGv_i64 q, TCGv_i64 a, TCGv_i64 b);
void gen_gvec_sqadd_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
void gen_uqsub_bhs(TCGv_i64 res, TCGv_i64 qc,
TCGv_i64 a, TCGv_i64 b, MemOp esz);
void gen_uqsub_d(TCGv_i64 d, TCGv_i64 q, TCGv_i64 a, TCGv_i64 b);
void gen_gvec_uqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
void gen_sqsub_bhs(TCGv_i64 res, TCGv_i64 qc,
TCGv_i64 a, TCGv_i64 b, MemOp esz);
void gen_sqsub_d(TCGv_i64 d, TCGv_i64 q, TCGv_i64 a, TCGv_i64 b);
void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);