tcg/i386: Implement cmpsel_vec with avx512 insns
The avx512 vpblendm* instructions exactly implement cmpsel, using a predicate input. Of course, this matches nicely with the avx512 predicate comparison instructions.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
parent
c044ec0d85
commit
d589674902
@ -413,6 +413,10 @@ static bool tcg_target_const_match(int64_t val, int ct,
|
|||||||
#define OPC_UD2 (0x0b | P_EXT)
|
#define OPC_UD2 (0x0b | P_EXT)
|
||||||
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
|
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
|
||||||
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
|
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
|
||||||
|
/*
 * EVEX-encoded VPBLENDM{B,W,D,Q} opcodes.  The byte and word forms share
 * opcode byte 0x66 and the dword and qword forms share 0x64; within each
 * pair the wider element size is selected by adding P_VEXW.  These four
 * values populate the vpblendm_insn[] table in tcg_out_cmpsel_vec_k1().
 */
#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
|
||||||
|
#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
|
||||||
|
#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
|
||||||
|
#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
|
||||||
#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
|
#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
|
||||||
#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
|
#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
|
||||||
#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
|
#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
|
||||||
@ -738,6 +742,16 @@ static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
|
|||||||
tcg_out_vex_modrm(s, opc, r, v, rm);
|
tcg_out_vex_modrm(s, opc, r, v, rm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Emit an EVEX-prefixed opcode followed by a ModRM byte built from the
 * low bits of R and RM, adding P_VEXL to the opcode when TYPE is
 * TCG_TYPE_V256.
 *
 * R, V, RM, AAA and Z are forwarded unchanged to tcg_out_evex_opc().
 * The caller visible in this file passes the k-register number (1, for
 * k1) as AAA and its zero-masking flag as Z.
 */
static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
                                    int rm, int aaa, bool z, TCGType type)
{
    /* Only the 256-bit vector type requests the P_VEXL opcode flag. */
    const int vexl = (type == TCG_TYPE_V256) ? P_VEXL : 0;

    /*
     * ModRM byte: 0xc0 sets the two high (mod) bits, R lands in
     * bits 5:3 and RM in bits 2:0.
     */
    const int modrm = 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm);

    /*
     * NOTE(review): the literal 0 is presumably the "index" register
     * argument of tcg_out_evex_opc() -- confirm against its definition.
     */
    tcg_out_evex_opc(s, opc | vexl, r, v, rm, 0, aaa, z);
    tcg_out8(s, modrm);
}
|
||||||
|
|
||||||
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
|
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
|
||||||
We handle either RM and INDEX missing with a negative value. In 64-bit
|
We handle either RM and INDEX missing with a negative value. In 64-bit
|
||||||
mode for absolute addresses, ~RM is the size of the immediate operand
|
mode for absolute addresses, ~RM is the size of the immediate operand
|
||||||
@ -3183,11 +3197,39 @@ static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Emit a compare-and-select sequence that goes through mask register k1:
 * first a compare of C1 against C2 under COND via tcg_out_cmp_vec_k1(),
 * then the VPBLENDM* instruction matching VECE, with destination V0 and
 * mask k1.
 *
 * When V3 is 0 it stands for the constant operand.  In that case the
 * condition is inverted and V4 is used for both blend sources, with
 * zero-masking enabled, so that the constant ends up on the V4 side.
 */
static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
                                  TCGReg v0, TCGReg c1, TCGReg c2,
                                  TCGReg v3, TCGReg v4, TCGCond cond)
{
    /* Blend opcode per element size; indexed directly by VECE. */
    static const int blend_opc[] = {
        OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ
    };

    /* Swap to place constant in V4 to take advantage of zero-masking. */
    const bool zero_masking = !v3;
    const TCGReg blend_rm = zero_masking ? v4 : v3;
    const TCGCond cmp_cond = zero_masking ? tcg_invert_cond(cond) : cond;

    /* The compare is emitted first; the blend below then uses k1. */
    tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cmp_cond);
    tcg_out_evex_modrm_type(s, blend_opc[vece], v0, v4, blend_rm,
                            /* k1 */ 1, zero_masking, type);
}
|
||||||
|
|
||||||
static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
|
static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
|
||||||
TCGReg v0, TCGReg c1, TCGReg c2,
|
TCGReg v0, TCGReg c1, TCGReg c2,
|
||||||
TCGReg v3, TCGReg v4, TCGCond cond)
|
TCGReg v3, TCGReg v4, TCGCond cond)
|
||||||
{
|
{
|
||||||
bool inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);
|
bool inv;
|
||||||
|
|
||||||
|
if (vece <= MO_16 ? have_avx512bw : have_avx512vl) {
|
||||||
|
tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Since XMM0 is 16, the only way we get 0 into V3
|
* Since XMM0 is 16, the only way we get 0 into V3
|
||||||
|
Loading…
Reference in New Issue
Block a user