tcg/i386: Implement cmpsel_vec with avx512 insns

The avx512 vpblendm* instructions exactly implement cmpsel,
using a predicate input.  Of course this matches nicely with
the avx512 predicate comparison instructions.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Richard Henderson 2024-09-07 17:24:57 -07:00
parent c044ec0d85
commit d589674902

View File

@ -413,6 +413,10 @@ static bool tcg_target_const_match(int64_t val, int ct,
#define OPC_UD2 (0x0b | P_EXT) #define OPC_UD2 (0x0b | P_EXT)
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
/*
 * EVEX-encoded vpblendm{b,w,d,q} opcodes, one per element size.
 * The byte/word forms share opcode 0x66 and the dword/qword forms share
 * 0x64; within each pair the wider element is selected by P_VEXW.
 */
#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX) #define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX) #define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) #define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
@ -738,6 +742,16 @@ static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
tcg_out_vex_modrm(s, opc, r, v, rm); tcg_out_vex_modrm(s, opc, r, v, rm);
} }
/*
 * Emit an EVEX-prefixed opcode followed by a ModRM byte built from
 * R and RM, widening the operation when TYPE is a 256-bit vector.
 *
 * AAA and Z are forwarded unchanged to tcg_out_evex_opc(); callers in
 * this file pass the mask register number and the zero-masking flag.
 */
static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
                                    int rm, int aaa, bool z, TCGType type)
{
    /* Only V256 needs the vector-length bit; other types use OPC as given. */
    int sized_opc = (type == TCG_TYPE_V256) ? (opc | P_VEXL) : opc;

    tcg_out_evex_opc(s, sized_opc, r, v, rm, 0, aaa, z);

    /* ModRM: 0xc0 in the top bits, R in bits 5:3, RM in bits 2:0. */
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode. /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
We handle either RM and INDEX missing with a negative value. In 64-bit We handle either RM and INDEX missing with a negative value. In 64-bit
mode for absolute addresses, ~RM is the size of the immediate operand mode for absolute addresses, ~RM is the size of the immediate operand
@ -3183,11 +3197,39 @@ static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
} }
} }
/*
 * Emit a compare-and-select using an AVX-512 mask register: first a
 * comparison of C1 against C2 under COND via tcg_out_cmp_vec_k1(), then
 * a vpblendm* of the matching element size (VECE) into V0, predicated
 * on mask register k1.
 *
 * A V3 of 0 is treated as the constant-zero operand (NOTE(review): this
 * relies on 0 never being a real vector register number -- confirm
 * against the register enumeration).  In that case the condition is
 * inverted, V4 is used for both blend sources, and zero-masking is
 * enabled in place of reading V3.
 */
static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
                                  TCGReg v0, TCGReg c1, TCGReg c2,
                                  TCGReg v3, TCGReg v4, TCGCond cond)
{
    /* Indexed by VECE: byte, word, dword, qword. */
    static const int blend_opc[] = {
        OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ
    };
    const bool zero_mask = !v3;
    TCGReg blend_rm = v3;

    if (zero_mask) {
        /* Let zero-masking stand in for the zero operand. */
        blend_rm = v4;
        cond = tcg_invert_cond(cond);
    }

    /* The comparison must be emitted before the blend that consumes it. */
    tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond);
    tcg_out_evex_modrm_type(s, blend_opc[vece], v0, v4, blend_rm,
                            /* k1 */1, zero_mask, type);
}
static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece, static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg v0, TCGReg c1, TCGReg c2, TCGReg v0, TCGReg c1, TCGReg c2,
TCGReg v3, TCGReg v4, TCGCond cond) TCGReg v3, TCGReg v4, TCGCond cond)
{ {
bool inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond); bool inv;
if (vece <= MO_16 ? have_avx512bw : have_avx512vl) {
tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond);
return;
}
inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);
/* /*
* Since XMM0 is 16, the only way we get 0 into V3 * Since XMM0 is 16, the only way we get 0 into V3