add support for AVX encoded VNNI INT8 extensions

This commit is contained in:
Stanislav Shwartsman 2022-10-02 23:00:46 +03:00
parent 3a20495db8
commit a56144833a
7 changed files with 121 additions and 2 deletions

View File

@ -49,6 +49,13 @@ AVX_3OP_VNNI(VPDPBUSDS_VdqHdqWdqR, xmm_pdpbusds)
AVX_3OP_VNNI(VPDPWSSD_VdqHdqWdqR, xmm_pdpwssd)
AVX_3OP_VNNI(VPDPWSSDS_VdqHdqWdqR, xmm_pdpwssds)
// AVX encoded VNNI INT8 handlers: each line pairs a VPDPB* handler name with
// the xmm_pdpb* per-128-bit helper of the same flavour (ss/su/uu, with a
// trailing 's' on both names for the saturating variants).
// NOTE(review): AVX_3OP_VNNI is defined outside this view; presumably it
// expands to the handler body that applies the helper - confirm.
AVX_3OP_VNNI(VPDPBSSD_VdqHdqWdqR, xmm_pdpbssd)
AVX_3OP_VNNI(VPDPBSSDS_VdqHdqWdqR, xmm_pdpbssds)
AVX_3OP_VNNI(VPDPBSUD_VdqHdqWdqR, xmm_pdpbsud)
AVX_3OP_VNNI(VPDPBSUDS_VdqHdqWdqR, xmm_pdpbsuds)
AVX_3OP_VNNI(VPDPBUUD_VdqHdqWdqR, xmm_pdpbuud)
AVX_3OP_VNNI(VPDPBUUDS_VdqHdqWdqR, xmm_pdpbuuds)
#endif
#if BX_SUPPORT_EVEX

View File

@ -3091,6 +3091,14 @@ public: // for now...
BX_SMF void VPDPWSSD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
BX_SMF void VPDPWSSDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
/* AVX encoded VNNI INT8 instructions */
BX_SMF void VPDPBSSD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
BX_SMF void VPDPBSSDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
BX_SMF void VPDPBSUD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
BX_SMF void VPDPBSUDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
BX_SMF void VPDPBUUD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
BX_SMF void VPDPBUUDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
// AVX512 OPMASK instructions (VEX encoded)
BX_SMF void KADDB_KGbKHbKEbR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
BX_SMF void KANDB_KGbKHbKEbR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);

View File

@ -114,6 +114,7 @@ static const char *cpu_feature_name[] =
"avx512vp2intersect", // BX_ISA_AVX512_VP2INTERSECT
"avx_vnni", // BX_ISA_AVX_VNNI
"avx_ifma", // BX_ISA_AVX_IFMA
"avx_vnni_int8", // BX_ISA_AVX_VNNI_INT8
"xapic", // BX_ISA_XAPIC
"x2apic", // BX_ISA_X2APIC
"xapicext", // BX_ISA_XAPICEXT

View File

@ -109,6 +109,7 @@ enum x86_feature_name {
BX_ISA_AVX512_VP2INTERSECT, /* AVX-512 VP2INTERSECT Instructions */
BX_ISA_AVX_VNNI, /* AVX encoded VNNI Instructions */
BX_ISA_AVX_IFMA, /* AVX encoded IFMA Instructions */
BX_ISA_AVX_VNNI_INT8, /* AVX encoded VNNI-INT8 Instructions */
BX_ISA_XAPIC, /* XAPIC support */
BX_ISA_X2APIC, /* X2APIC support */
BX_ISA_XAPIC_EXT, /* XAPIC Extensions support */

View File

@ -994,8 +994,20 @@ static const Bit64u BxOpcodeGroup_VEX_0F3847[] = {
last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W1, BX_IA_VPSLLVQ_VdqHdqWdq)
};
static const Bit64u BxOpcodeGroup_VEX_0F3850[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSD_VdqHdqWdq) };
static const Bit64u BxOpcodeGroup_VEX_0F3851[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSDS_VdqHdqWdq) };
// VEX.0F38 opcode 0x50, W0: the SIMD prefix selects the operand signedness
// of the byte dot-product (Intel ISE reference, VPDPB[SU,UU,SS]D[,S]):
//   no prefix -> VPDPBUUD (unsigned x unsigned)
//   66        -> VPDPBUSD (unsigned x signed, original AVX-VNNI)
//   F3        -> VPDPBSUD (signed   x unsigned)
//   F2        -> VPDPBSSD (signed   x signed)
// The F3/F2 rows were previously swapped, so VPDPBSUD and VPDPBSSD decoded
// as each other.
static const Bit64u BxOpcodeGroup_VEX_0F3850[] = {
  form_opcode(ATTR_NO_SSE_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUD_VdqHdqWdq),
  form_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSD_VdqHdqWdq),
  form_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0, BX_IA_VPDPBSUD_VdqHdqWdq),
  last_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSSD_VdqHdqWdq)
};
// VEX.0F38 opcode 0x51, W0: saturating byte dot-products. The SIMD prefix
// selects the operand signedness (Intel ISE reference, VPDPB[SU,UU,SS]D[,S]):
//   no prefix -> VPDPBUUDS (unsigned x unsigned)
//   66        -> VPDPBUSDS (unsigned x signed, original AVX-VNNI)
//   F3        -> VPDPBSUDS (signed   x unsigned)
//   F2        -> VPDPBSSDS (signed   x signed)
// Each prefix must appear exactly once: the table previously listed F2 twice
// and F3 never, leaving one instruction unreachable and the F3 encoding
// undecodable.
static const Bit64u BxOpcodeGroup_VEX_0F3851[] = {
  form_opcode(ATTR_NO_SSE_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUDS_VdqHdqWdq),
  form_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSDS_VdqHdqWdq),
  form_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0, BX_IA_VPDPBSUDS_VdqHdqWdq),
  last_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSSDS_VdqHdqWdq)
};
static const Bit64u BxOpcodeGroup_VEX_0F3852[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPWSSD_VdqHdqWdq) };
static const Bit64u BxOpcodeGroup_VEX_0F3853[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPWSSDS_VdqHdqWdq) };

View File

@ -2421,6 +2421,15 @@ bx_define_opcode(BX_IA_VPMADD52LUQ_VdqHdqWdq, "vpmadd52luq", "vpmadd52luq", &BX_
bx_define_opcode(BX_IA_VPMADD52HUQ_VdqHdqWdq, "vpmadd52huq", "vpmadd52huq", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR, BX_ISA_AVX_IFMA, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
// AVX IFMA
// AVX VNNI INT8
bx_define_opcode(BX_IA_VPDPBSSD_VdqHdqWdq, "vpdpbssd", "vpdpbssd", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSSD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
bx_define_opcode(BX_IA_VPDPBSSDS_VdqHdqWdq, "vpdpbssds", "vpdpbssds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSSDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
bx_define_opcode(BX_IA_VPDPBSUD_VdqHdqWdq, "vpdpbsud", "vpdpbsud", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSUD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
bx_define_opcode(BX_IA_VPDPBSUDS_VdqHdqWdq, "vpdpbsuds", "vpdpbsuds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSUDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
bx_define_opcode(BX_IA_VPDPBUUD_VdqHdqWdq, "vpdpbuud", "vpdpbuud", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBUUD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
bx_define_opcode(BX_IA_VPDPBUUDS_VdqHdqWdq, "vpdpbuuds", "vpdpbuuds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBUUDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
// AVX VNNI INT8
// BMI1 - VexW64 aliased
bx_define_opcode(BX_IA_ANDN_GdBdEd, "andn", "andnl", &BX_CPU_C::LOAD_Ed, &BX_CPU_C::ANDN_GdBdEdR, BX_ISA_BMI1, OP_Gd, OP_Bd, OP_Ed, OP_NONE, 0)
bx_define_opcode(BX_IA_ANDN_GqBqEq, "andn", "andnq", &BX_CPU_C::LOAD_Eq, &BX_CPU_C::ANDN_GqBqEqR, BX_ISA_BMI1, OP_Gq, OP_Bq, OP_Eq, OP_NONE, 0)

View File

@ -1775,6 +1775,87 @@ BX_CPP_INLINE void xmm_pdpbusds(BxPackedXmmRegister *dst, BxPackedXmmRegister *o
}
}
// Byte dot-product accumulate, signed x signed, no saturation.
// For each of the four 32-bit lanes of dst: multiply the lane's four byte
// pairs (op1 and op2 both read via xmmsbyte, widened to Bit32s), sum the
// products and add the sum to the lane with a plain '+='.
BX_CPP_INLINE void xmm_pdpbssd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    const unsigned base = lane * 4; // index of the first byte of this lane
    Bit32s dot = 0;
    for (unsigned k = 0; k < 4; k++) {
      const Bit32s prod = (Bit32s) op1->xmmsbyte(base + k) * (Bit32s) op2->xmmsbyte(base + k);
      dot += prod;
    }
    dst->xmm32s(lane) += dot;
  }
}
// Byte dot-product accumulate, signed x signed, saturating variant.
// For each of the four 32-bit lanes of dst: multiply the lane's four byte
// pairs (op1 and op2 both read via xmmsbyte, widened to Bit32s), sum the
// products, add the sum to the lane in 64-bit arithmetic and store the
// result of SaturateQwordSToDwordS() back into the lane.
BX_CPP_INLINE void xmm_pdpbssds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    const unsigned base = lane * 4; // index of the first byte of this lane
    Bit32s dot = 0;
    for (unsigned k = 0; k < 4; k++) {
      const Bit32s prod = (Bit32s) op1->xmmsbyte(base + k) * (Bit32s) op2->xmmsbyte(base + k);
      dot += prod;
    }
    // Widen before adding so the accumulate itself cannot wrap.
    const Bit64s wide = (Bit64s) dst->xmm32s(lane) + dot;
    dst->xmm32s(lane) = SaturateQwordSToDwordS(wide);
  }
}
// Byte dot-product accumulate, signed (op1) x unsigned (op2), no saturation.
// For each of the four 32-bit lanes of dst: multiply the lane's four byte
// pairs (op1 read via xmmsbyte, op2 read via xmmubyte), store each product
// as Bit32s, sum the products and add the sum to the lane with a plain '+='.
BX_CPP_INLINE void xmm_pdpbsud(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    const unsigned base = lane * 4; // index of the first byte of this lane
    Bit32s dot = 0;
    for (unsigned k = 0; k < 4; k++) {
      // Operand casts kept exactly as in the mixed signed/unsigned multiply.
      const Bit32s prod = (Bit32s) op1->xmmsbyte(base + k) * (Bit32u) op2->xmmubyte(base + k);
      dot += prod;
    }
    dst->xmm32s(lane) += dot;
  }
}
// Byte dot-product accumulate, signed (op1) x unsigned (op2), saturating
// variant.
// For each of the four 32-bit lanes of dst: multiply the lane's four byte
// pairs (op1 read via xmmsbyte, op2 read via xmmubyte), store each product
// as Bit32s, sum the products, add the sum to the lane in 64-bit arithmetic
// and store the result of SaturateQwordSToDwordS() back into the lane.
BX_CPP_INLINE void xmm_pdpbsuds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    const unsigned base = lane * 4; // index of the first byte of this lane
    Bit32s dot = 0;
    for (unsigned k = 0; k < 4; k++) {
      // Operand casts kept exactly as in the mixed signed/unsigned multiply.
      const Bit32s prod = (Bit32s) op1->xmmsbyte(base + k) * (Bit32u) op2->xmmubyte(base + k);
      dot += prod;
    }
    // Widen before adding so the accumulate itself cannot wrap.
    const Bit64s wide = (Bit64s) dst->xmm32s(lane) + dot;
    dst->xmm32s(lane) = SaturateQwordSToDwordS(wide);
  }
}
// Byte dot-product accumulate, unsigned x unsigned, no saturation.
// For each of the four 32-bit lanes of dst: multiply the lane's four byte
// pairs (op1 and op2 both read via xmmubyte, widened to Bit32u), sum the
// products and add the sum to the unsigned lane with a plain '+='.
BX_CPP_INLINE void xmm_pdpbuud(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    const unsigned base = lane * 4; // index of the first byte of this lane
    Bit32u dot = 0;
    for (unsigned k = 0; k < 4; k++) {
      const Bit32u prod = (Bit32u) op1->xmmubyte(base + k) * (Bit32u) op2->xmmubyte(base + k);
      dot += prod;
    }
    dst->xmm32u(lane) += dot;
  }
}
// Byte dot-product accumulate, unsigned x unsigned, saturating variant.
// For each of the four 32-bit lanes of dst: multiply the lane's four byte
// pairs (op1 and op2 both read via xmmubyte, widened to Bit32u), sum the
// products, add the sum to the unsigned lane in 64-bit arithmetic and store
// the result of SaturateQwordUToDwordU() back into the lane.
BX_CPP_INLINE void xmm_pdpbuuds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    const unsigned base = lane * 4; // index of the first byte of this lane
    Bit32u dot = 0;
    for (unsigned k = 0; k < 4; k++) {
      const Bit32u prod = (Bit32u) op1->xmmubyte(base + k) * (Bit32u) op2->xmmubyte(base + k);
      dot += prod;
    }
    // Widen before adding so the accumulate itself cannot wrap.
    const Bit64u wide = (Bit64u) dst->xmm32u(lane) + dot;
    dst->xmm32u(lane) = SaturateQwordUToDwordU(wide);
  }
}
BX_CPP_INLINE void xmm_pdpwssd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
for(unsigned n=0; n<4; n++)