From a56144833a00c70eaa497c91a691fd7a7776623c Mon Sep 17 00:00:00 2001
From: Stanislav Shwartsman
Date: Sun, 2 Oct 2022 23:00:46 +0300
Subject: [PATCH 1/5] add support for AVX encoded VNNI INT8 extensions

---
 bochs/cpu/avx/vnni.cc               |  7 +++
 bochs/cpu/cpu.h                     |  8 +++
 bochs/cpu/cpuid.cc                  |  1 +
 bochs/cpu/decoder/decoder.h         |  1 +
 bochs/cpu/decoder/fetchdecode_avx.h | 16 +++++-
 bochs/cpu/decoder/ia_opcodes.def    |  9 ++++
 bochs/cpu/simd_int.h                | 81 +++++++++++++++++++++++++++++
 7 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/bochs/cpu/avx/vnni.cc b/bochs/cpu/avx/vnni.cc
index d6e18acff..54ca46927 100644
--- a/bochs/cpu/avx/vnni.cc
+++ b/bochs/cpu/avx/vnni.cc
@@ -49,6 +49,13 @@ AVX_3OP_VNNI(VPDPBUSDS_VdqHdqWdqR, xmm_pdpbusds)
 AVX_3OP_VNNI(VPDPWSSD_VdqHdqWdqR, xmm_pdpwssd)
 AVX_3OP_VNNI(VPDPWSSDS_VdqHdqWdqR, xmm_pdpwssds)
 
+AVX_3OP_VNNI(VPDPBSSD_VdqHdqWdqR, xmm_pdpbssd) // signed x signed byte dot product
+AVX_3OP_VNNI(VPDPBSSDS_VdqHdqWdqR, xmm_pdpbssds) // signed x signed, saturating accumulate
+AVX_3OP_VNNI(VPDPBSUD_VdqHdqWdqR, xmm_pdpbsud) // signed x unsigned byte dot product
+AVX_3OP_VNNI(VPDPBSUDS_VdqHdqWdqR, xmm_pdpbsuds) // signed x unsigned, saturating accumulate
+AVX_3OP_VNNI(VPDPBUUD_VdqHdqWdqR, xmm_pdpbuud) // unsigned x unsigned byte dot product
+AVX_3OP_VNNI(VPDPBUUDS_VdqHdqWdqR, xmm_pdpbuuds) // unsigned x unsigned, saturating accumulate
+
 #endif
 
 #if BX_SUPPORT_EVEX
diff --git a/bochs/cpu/cpu.h b/bochs/cpu/cpu.h
index 63be62a55..5463f81ec 100644
--- a/bochs/cpu/cpu.h
+++ b/bochs/cpu/cpu.h
@@ -3091,6 +3091,14 @@ public: // for now...
   BX_SMF void VPDPWSSD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
   BX_SMF void VPDPWSSDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
 
+  /* AVX encoded VNNI INT8 instructions */
+  BX_SMF void VPDPBSSD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
+  BX_SMF void VPDPBSSDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
+  BX_SMF void VPDPBSUD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
+  BX_SMF void VPDPBSUDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
+  BX_SMF void VPDPBUUD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
+  BX_SMF void VPDPBUUDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
+
   // AVX512 OPMASK instructions (VEX encoded)
   BX_SMF void KADDB_KGbKHbKEbR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
   BX_SMF void KANDB_KGbKHbKEbR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
diff --git a/bochs/cpu/cpuid.cc b/bochs/cpu/cpuid.cc
index e3e386fe9..ae03a6340 100644
--- a/bochs/cpu/cpuid.cc
+++ b/bochs/cpu/cpuid.cc
@@ -114,6 +114,7 @@ static const char *cpu_feature_name[] =
   "avx512vp2intersect", // BX_ISA_AVX512_VP2INTERSECT
   "avx_vnni", // BX_ISA_AVX_VNNI
   "avx_ifma", // BX_ISA_AVX_IFMA
+  "avx_vnni_int8", // BX_ISA_AVX_VNNI_INT8
   "xapic", // BX_ISA_XAPIC
   "x2apic", // BX_ISA_X2APIC
   "xapicext", // BX_ISA_XAPICEXT
diff --git a/bochs/cpu/decoder/decoder.h b/bochs/cpu/decoder/decoder.h
index 0e3cf0734..3ce99ab61 100644
--- a/bochs/cpu/decoder/decoder.h
+++ b/bochs/cpu/decoder/decoder.h
@@ -109,6 +109,7 @@ enum x86_feature_name {
   BX_ISA_AVX512_VP2INTERSECT, /* AVX-512 VP2INTERSECT Instructions */
   BX_ISA_AVX_VNNI, /* AVX encoded VNNI Instructions */
   BX_ISA_AVX_IFMA, /* AVX encoded IFMA Instructions */
+  BX_ISA_AVX_VNNI_INT8, /* AVX encoded VNNI-INT8 Instructions */
   BX_ISA_XAPIC, /* XAPIC support */
   BX_ISA_X2APIC, /* X2APIC support */
   BX_ISA_XAPIC_EXT, /* XAPIC Extensions support */
diff --git a/bochs/cpu/decoder/fetchdecode_avx.h b/bochs/cpu/decoder/fetchdecode_avx.h
index 5573c0b2f..5f1f52d34 100644
--- a/bochs/cpu/decoder/fetchdecode_avx.h
+++ b/bochs/cpu/decoder/fetchdecode_avx.h
@@ -994,8 +994,20 @@ static const Bit64u BxOpcodeGroup_VEX_0F3847[] = {
   last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W1, BX_IA_VPSLLVQ_VdqHdqWdq)
 };
 
-static const Bit64u BxOpcodeGroup_VEX_0F3850[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSD_VdqHdqWdq) };
-static const Bit64u BxOpcodeGroup_VEX_0F3851[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSDS_VdqHdqWdq) };
+static const Bit64u BxOpcodeGroup_VEX_0F3850[] = {
+  form_opcode(ATTR_NO_SSE_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUD_VdqHdqWdq),
+  form_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSD_VdqHdqWdq),
+  form_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0, BX_IA_VPDPBSSD_VdqHdqWdq),
+  last_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSUD_VdqHdqWdq)
+};
+
+static const Bit64u BxOpcodeGroup_VEX_0F3851[] = {
+  form_opcode(ATTR_NO_SSE_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUDS_VdqHdqWdq),
+  form_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSDS_VdqHdqWdq),
+  form_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSSDS_VdqHdqWdq),
+  last_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSUDS_VdqHdqWdq)
+};
+
 static const Bit64u BxOpcodeGroup_VEX_0F3852[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPWSSD_VdqHdqWdq) };
 static const Bit64u BxOpcodeGroup_VEX_0F3853[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPWSSDS_VdqHdqWdq) };
 
diff --git a/bochs/cpu/decoder/ia_opcodes.def b/bochs/cpu/decoder/ia_opcodes.def
index 273746c8f..bd8ea7de6 100644
--- a/bochs/cpu/decoder/ia_opcodes.def
+++ b/bochs/cpu/decoder/ia_opcodes.def
@@ -2421,6 +2421,15 @@ bx_define_opcode(BX_IA_VPMADD52LUQ_VdqHdqWdq, "vpmadd52luq", "vpmadd52luq", &BX_
 bx_define_opcode(BX_IA_VPMADD52HUQ_VdqHdqWdq, "vpmadd52huq", "vpmadd52huq", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR, BX_ISA_AVX_IFMA, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
 // AVX IFMA
 
+// AVX VNNI INT8
+bx_define_opcode(BX_IA_VPDPBSSD_VdqHdqWdq, "vpdpbssd", "vpdpbssd", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSSD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
+bx_define_opcode(BX_IA_VPDPBSSDS_VdqHdqWdq, "vpdpbssds", "vpdpbssds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSSDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
+bx_define_opcode(BX_IA_VPDPBSUD_VdqHdqWdq, "vpdpbsud", "vpdpbsud", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSUD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
+bx_define_opcode(BX_IA_VPDPBSUDS_VdqHdqWdq, "vpdpbsuds", "vpdpbsuds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSUDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
+bx_define_opcode(BX_IA_VPDPBUUD_VdqHdqWdq, "vpdpbuud", "vpdpbuud", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBUUD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
+bx_define_opcode(BX_IA_VPDPBUUDS_VdqHdqWdq, "vpdpbuuds", "vpdpbuuds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBUUDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
+// AVX VNNI INT8
+
 // BMI1 - VexW64 aliased
 bx_define_opcode(BX_IA_ANDN_GdBdEd, "andn", "andnl", &BX_CPU_C::LOAD_Ed, &BX_CPU_C::ANDN_GdBdEdR, BX_ISA_BMI1, OP_Gd, OP_Bd, OP_Ed, OP_NONE, 0)
 bx_define_opcode(BX_IA_ANDN_GqBqEq, "andn", "andnq", &BX_CPU_C::LOAD_Eq, &BX_CPU_C::ANDN_GqBqEqR, BX_ISA_BMI1, OP_Gq, OP_Bq, OP_Eq, OP_NONE, 0)
diff --git a/bochs/cpu/simd_int.h b/bochs/cpu/simd_int.h
index fb97611cf..6fda7deea 100644
--- a/bochs/cpu/simd_int.h
+++ b/bochs/cpu/simd_int.h
@@ -1775,6 +1775,87 @@ BX_CPP_INLINE void xmm_pdpbusds(BxPackedXmmRegister *dst, BxPackedXmmRegister *o
   }
 }
 
+// VPDPBSSD: dst.dword[n] += sum of the four signed x signed byte products of op1/op2 (wraps modulo 2^32)
+BX_CPP_INLINE void xmm_pdpbssd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
+{
+  for(unsigned n=0; n<4; n++)
+  {
+    Bit32s p1word = (Bit32s) op1->xmmsbyte(n*4)   * (Bit32s) op2->xmmsbyte(n*4);
+    Bit32s p2word = (Bit32s) op1->xmmsbyte(n*4+1) * (Bit32s) op2->xmmsbyte(n*4+1);
+    Bit32s p3word = (Bit32s) op1->xmmsbyte(n*4+2) * (Bit32s) op2->xmmsbyte(n*4+2);
+    Bit32s p4word = (Bit32s) op1->xmmsbyte(n*4+3) * (Bit32s) op2->xmmsbyte(n*4+3);
+    dst->xmm32u(n) += (Bit32u) (p1word + p2word + p3word + p4word); // unsigned add: wrapping a signed accumulator would be undefined behavior
+  }
+}
+
+// VPDPBSSDS: same products as xmm_pdpbssd; the 64-bit sum is passed through SaturateQwordSToDwordS (signed saturation)
+BX_CPP_INLINE void xmm_pdpbssds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
+{
+  for(unsigned n=0; n<4; n++)
+  {
+    Bit32s p1word = (Bit32s) op1->xmmsbyte(n*4)   * (Bit32s) op2->xmmsbyte(n*4);
+    Bit32s p2word = (Bit32s) op1->xmmsbyte(n*4+1) * (Bit32s) op2->xmmsbyte(n*4+1);
+    Bit32s p3word = (Bit32s) op1->xmmsbyte(n*4+2) * (Bit32s) op2->xmmsbyte(n*4+2);
+    Bit32s p4word = (Bit32s) op1->xmmsbyte(n*4+3) * (Bit32s) op2->xmmsbyte(n*4+3);
+    Bit64s result = (Bit64s) dst->xmm32s(n) + (p1word + p2word + p3word + p4word);
+    dst->xmm32s(n) = SaturateQwordSToDwordS(result);
+  }
+}
+
+// VPDPBSUD: dst.dword[n] += sum of four (signed op1 byte) x (unsigned op2 byte) products (wraps modulo 2^32)
+BX_CPP_INLINE void xmm_pdpbsud(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
+{
+  for(unsigned n=0; n<4; n++)
+  {
+    Bit32s p1word = (Bit32s) op1->xmmsbyte(n*4)   * (Bit32s) op2->xmmubyte(n*4); // both operands widened to signed: 0..255 fits in Bit32s
+    Bit32s p2word = (Bit32s) op1->xmmsbyte(n*4+1) * (Bit32s) op2->xmmubyte(n*4+1);
+    Bit32s p3word = (Bit32s) op1->xmmsbyte(n*4+2) * (Bit32s) op2->xmmubyte(n*4+2);
+    Bit32s p4word = (Bit32s) op1->xmmsbyte(n*4+3) * (Bit32s) op2->xmmubyte(n*4+3);
+    dst->xmm32u(n) += (Bit32u) (p1word + p2word + p3word + p4word); // unsigned add: wrapping a signed accumulator would be undefined behavior
+  }
+}
+
+// VPDPBSUDS: same products as xmm_pdpbsud; the 64-bit sum is passed through SaturateQwordSToDwordS (signed saturation)
+BX_CPP_INLINE void xmm_pdpbsuds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
+{
+  for(unsigned n=0; n<4; n++)
+  {
+    Bit32s p1word = (Bit32s) op1->xmmsbyte(n*4)   * (Bit32s) op2->xmmubyte(n*4); // both operands widened to signed: 0..255 fits in Bit32s
+    Bit32s p2word = (Bit32s) op1->xmmsbyte(n*4+1) * (Bit32s) op2->xmmubyte(n*4+1);
+    Bit32s p3word = (Bit32s) op1->xmmsbyte(n*4+2) * (Bit32s) op2->xmmubyte(n*4+2);
+    Bit32s p4word = (Bit32s) op1->xmmsbyte(n*4+3) * (Bit32s) op2->xmmubyte(n*4+3);
+    Bit64s result = (Bit64s) dst->xmm32s(n) + (p1word + p2word + p3word + p4word);
+    dst->xmm32s(n) = SaturateQwordSToDwordS(result);
+  }
+}
+
+// VPDPBUUD: dst.dword[n] += sum of the four unsigned x unsigned byte products of op1/op2 (wraps modulo 2^32)
+BX_CPP_INLINE void xmm_pdpbuud(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
+{
+  for(unsigned n=0; n<4; n++)
+  {
+    Bit32u p1word = (Bit32u) op1->xmmubyte(n*4)   * (Bit32u) op2->xmmubyte(n*4);
+    Bit32u p2word = (Bit32u) op1->xmmubyte(n*4+1) * (Bit32u) op2->xmmubyte(n*4+1);
+    Bit32u p3word = (Bit32u) op1->xmmubyte(n*4+2) * (Bit32u) op2->xmmubyte(n*4+2);
+    Bit32u p4word = (Bit32u) op1->xmmubyte(n*4+3) * (Bit32u) op2->xmmubyte(n*4+3);
+    dst->xmm32u(n) += (p1word + p2word + p3word + p4word);
+  }
+}
+
+// VPDPBUUDS: same products as xmm_pdpbuud; the 64-bit sum is passed through SaturateQwordUToDwordU (unsigned saturation)
+BX_CPP_INLINE void xmm_pdpbuuds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
+{
+  for(unsigned n=0; n<4; n++)
+  {
+    Bit32u p1word = (Bit32u) op1->xmmubyte(n*4)   * (Bit32u) op2->xmmubyte(n*4);
+    Bit32u p2word = (Bit32u) op1->xmmubyte(n*4+1) * (Bit32u) op2->xmmubyte(n*4+1);
+    Bit32u p3word = (Bit32u) op1->xmmubyte(n*4+2) * (Bit32u) op2->xmmubyte(n*4+2);
+    Bit32u p4word = (Bit32u) op1->xmmubyte(n*4+3) * (Bit32u) op2->xmmubyte(n*4+3);
+    Bit64u result = (Bit64u) dst->xmm32u(n) + (p1word + p2word + p3word + p4word);
+    dst->xmm32u(n) = SaturateQwordUToDwordU(result);
+  }
+}
+
 BX_CPP_INLINE void xmm_pdpwssd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
 {
   for(unsigned n=0; n<4; n++)
From 4aed72e0ef022815c7dcdd4c462e44f1634b0b2e Mon Sep 17 00:00:00 2001
From: Stanislav Shwartsman
Date: Sun, 2 Oct 2022 23:07:05 +0300
Subject: [PATCH 2/5] fix issue with AVX IFMA when EVEX is not compiled in

---
 bochs/cpu/cpu.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bochs/cpu/cpu.h b/bochs/cpu/cpu.h
index 5463f81ec..55e3b8130 100644
--- a/bochs/cpu/cpu.h
+++ b/bochs/cpu/cpu.h
@@ -3091,6 +3091,10 @@ public: // for now...
BX_SMF void VPDPWSSD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); BX_SMF void VPDPWSSDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); + /* AVX encoded IFMA instructions */ + BX_SMF void VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); + BX_SMF void VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); + /* AVX encoded VNNI INT8 instructions */ BX_SMF void VPDPBSSD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); BX_SMF void VPDPBSSDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); @@ -3756,9 +3760,7 @@ public: // for now... BX_SMF void VPMOVD2M_KGwWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); BX_SMF void VPMOVQ2M_KGbWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); - BX_SMF void VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); BX_SMF void VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); - BX_SMF void VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); BX_SMF void VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); BX_SMF void VPMULTISHIFTQB_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1); From 63ed4477178c6fc8a5ab328f30b9bea5573769c9 Mon Sep 17 00:00:00 2001 From: Stanislav Shwartsman Date: Sun, 2 Oct 2022 23:09:41 +0300 Subject: [PATCH 3/5] fixed compilation --- bochs/cpu/decoder/fetchdecode_avx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bochs/cpu/decoder/fetchdecode_avx.h b/bochs/cpu/decoder/fetchdecode_avx.h index 5f1f52d34..d2c42c400 100644 --- a/bochs/cpu/decoder/fetchdecode_avx.h +++ b/bochs/cpu/decoder/fetchdecode_avx.h @@ -995,14 +995,14 @@ static const Bit64u BxOpcodeGroup_VEX_0F3847[] = { }; static const Bit64u BxOpcodeGroup_VEX_0F3850[] = { - form_opcode(ATTR_NO_SSE_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUD_VdqHdqWdq), + form_opcode(ATTR_SSE_NO_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUD_VdqHdqWdq), form_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSD_VdqHdqWdq), form_opcode(ATTR_SSE_PREFIX_F3 | 
ATTR_VEX_W0, BX_IA_VPDPBSSD_VdqHdqWdq), last_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSUD_VdqHdqWdq) }; static const Bit64u BxOpcodeGroup_VEX_0F3851[] = { - form_opcode(ATTR_NO_SSE_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUDS_VdqHdqWdq), + form_opcode(ATTR_SSE_NO_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUDS_VdqHdqWdq), form_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSDS_VdqHdqWdq), form_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSSDS_VdqHdqWdq), last_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSUDS_VdqHdqWdq) From c47b5ff5cb7ad5b3aef17b2b3106a1dddc692157 Mon Sep 17 00:00:00 2001 From: Stanislav Shwartsman Date: Sun, 2 Oct 2022 23:24:00 +0300 Subject: [PATCH 4/5] extract IFMA52 code to separate file --- bochs/cpu/avx/Makefile.in | 11 +++- bochs/cpu/avx/avx512.cc | 84 ------------------------- bochs/cpu/avx/avx_ifma52.cc | 120 ++++++++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 85 deletions(-) create mode 100644 bochs/cpu/avx/avx_ifma52.cc diff --git a/bochs/cpu/avx/Makefile.in b/bochs/cpu/avx/Makefile.in index 03edf0f42..c0153f367 100644 --- a/bochs/cpu/avx/Makefile.in +++ b/bochs/cpu/avx/Makefile.in @@ -46,6 +46,7 @@ AVX_OBJS = \ avx_pfp.o \ avx_cvt.o \ avx_fma.o \ + avx_ifma52.o \ avx2.o \ avx512.o \ avx512_broadcast.o \ @@ -117,7 +118,7 @@ avx512.o: avx512.@CPP_SUFFIX@ ../../bochs.h ../../config.h ../../osdep.h \ ../fpu/status_w.h ../fpu/control_w.h ../crregs.h ../descriptor.h \ ../decoder/instr.h ../lazy_flags.h ../tlb.h ../icache.h ../apic.h \ ../xmm.h ../vmx.h ../svm.h ../cpuid.h ../stack.h ../access.h \ - ../simd_int.h ../simd_compare.h ../wide_int.h + ../simd_int.h ../simd_compare.h avx512_bitalg.o: avx512_bitalg.@CPP_SUFFIX@ ../../bochs.h ../../config.h \ ../../osdep.h ../../bx_debug/debug.h ../../config.h ../../osdep.h \ ../../cpu/decoder/decoder.h ../../gui/paramtree.h ../../logio.h \ @@ -242,6 +243,14 @@ avx_fma.o: avx_fma.@CPP_SUFFIX@ ../../bochs.h ../../config.h ../../osdep.h \ ../decoder/instr.h 
../lazy_flags.h ../tlb.h ../icache.h ../apic.h \ ../xmm.h ../vmx.h ../svm.h ../cpuid.h ../stack.h ../access.h \ ../simd_pfp.h +avx_ifma52.o: avx_ifma52.@CPP_SUFFIX@ ../../bochs.h ../../config.h ../../osdep.h \ + ../../bx_debug/debug.h ../../config.h ../../osdep.h \ + ../../cpu/decoder/decoder.h ../../gui/paramtree.h ../../logio.h \ + ../../instrument/stubs/instrument.h ../cpu.h \ + ../decoder/decoder.h ../i387.h ../fpu/softfloat.h ../fpu/tag_w.h \ + ../fpu/status_w.h ../fpu/control_w.h ../crregs.h ../descriptor.h \ + ../decoder/instr.h ../lazy_flags.h ../tlb.h ../icache.h ../apic.h \ + ../xmm.h ../vmx.h ../svm.h ../cpuid.h ../stack.h ../access.h ../wide_int.h avx_pfp.o: avx_pfp.@CPP_SUFFIX@ ../../bochs.h ../../config.h ../../osdep.h \ ../../bx_debug/debug.h ../../config.h ../../osdep.h \ ../../cpu/decoder/decoder.h ../../gui/paramtree.h ../../logio.h \ diff --git a/bochs/cpu/avx/avx512.cc b/bochs/cpu/avx/avx512.cc index 38b1ce22c..41b34068d 100644 --- a/bochs/cpu/avx/avx512.cc +++ b/bochs/cpu/avx/avx512.cc @@ -30,7 +30,6 @@ #include "simd_int.h" #include "simd_compare.h" -#include "wide_int.h" // compare @@ -2271,89 +2270,6 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMULTISHIFTQB_MASK_VdqHdqWdqR(bxInstructi BX_NEXT_INSTR(i); } -// 52-bit integer FMA - -BX_CPP_INLINE Bit64u pmadd52luq_scalar(Bit64u dst, Bit64u op1, Bit64u op2) -{ - op1 &= BX_CONST64(0x000fffffffffffff); - op2 &= BX_CONST64(0x000fffffffffffff); - - return dst + ((op1 * op2) & BX_CONST64(0x000fffffffffffff)); -} - -BX_CPP_INLINE Bit64u pmadd52huq_scalar(Bit64u dst, Bit64u op1, Bit64u op2) -{ - op1 &= BX_CONST64(0x000fffffffffffff); - op2 &= BX_CONST64(0x000fffffffffffff); - - Bit128u product_128; - long_mul(&product_128, op1, op2); - - Bit64u temp = (product_128.lo >> 52) | ((product_128.hi & BX_CONST64(0x000000ffffffffff)) << 12); - - return dst + temp; -} - -void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *i) -{ - BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 
= BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst()); - unsigned len = i->getVL(); - - for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) { - dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n)); - } - - BX_WRITE_AVX_REGZ(i->dst(), dst, len); - BX_NEXT_INSTR(i); -} - -void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c *i) -{ - BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst()); - Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask()); - unsigned len = i->getVL(); - - for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) { - if (tmp_mask & 0x1) - dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n)); - else if (i->isZeroMasking()) - dst.vmm64u(n) = 0; - } - - BX_WRITE_AVX_REGZ(i->dst(), dst, len); - BX_NEXT_INSTR(i); -} - -void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *i) -{ - BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst()); - unsigned len = i->getVL(); - - for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) { - dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n)); - } - - BX_WRITE_AVX_REGZ(i->dst(), dst, len); - BX_NEXT_INSTR(i); -} - -void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c *i) -{ - BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst()); - Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask()); - unsigned len = i->getVL(); - - for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) { - if (tmp_mask & 0x1) - dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n)); - else if (i->isZeroMasking()) - dst.vmm64u(n) = 0; - } - - BX_WRITE_AVX_REGZ(i->dst(), dst, len); - BX_NEXT_INSTR(i); -} - void BX_CPP_AttrRegparmN(1) 
BX_CPU_C::VP2INTERSECTD_KGqHdqWdqR(bxInstruction_c *i) { BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()); diff --git a/bochs/cpu/avx/avx_ifma52.cc b/bochs/cpu/avx/avx_ifma52.cc new file mode 100644 index 000000000..7878f7bd9 --- /dev/null +++ b/bochs/cpu/avx/avx_ifma52.cc @@ -0,0 +1,120 @@ +///////////////////////////////////////////////////////////////////////// +// $Id$ +///////////////////////////////////////////////////////////////////////// +// +// Copyright (c) 2011-2018 Stanislav Shwartsman +// Written by Stanislav Shwartsman [sshwarts at sourceforge net] +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+//
+/////////////////////////////////////////////////////////////////////////
+
+#define NEED_CPU_REG_SHORTCUTS 1
+#include "bochs.h"
+#include "cpu.h"
+#define LOG_THIS BX_CPU_THIS_PTR
+
+#if BX_SUPPORT_AVX || BX_SUPPORT_EVEX
+
+#include "wide_int.h"
+
+// 52-bit integer FMA (VPMADD52LUQ / VPMADD52HUQ): scalar per-qword helpers shared by the unmasked and masked handlers below
+
+BX_CPP_INLINE Bit64u pmadd52luq_scalar(Bit64u dst, Bit64u op1, Bit64u op2) // returns dst + low 52 bits of (op1[51:0] * op2[51:0])
+{
+  op1 &= BX_CONST64(0x000fffffffffffff); // keep only the low 52 bits of each multiplicand
+  op2 &= BX_CONST64(0x000fffffffffffff);
+
+  return dst + ((op1 * op2) & BX_CONST64(0x000fffffffffffff)); // the 64-bit multiply truncates, but bits 51:0 of the product are still exact
+}
+
+BX_CPP_INLINE Bit64u pmadd52huq_scalar(Bit64u dst, Bit64u op1, Bit64u op2) // returns dst + bits 103:52 of (op1[51:0] * op2[51:0])
+{
+  op1 &= BX_CONST64(0x000fffffffffffff); // keep only the low 52 bits of each multiplicand
+  op2 &= BX_CONST64(0x000fffffffffffff);
+
+  Bit128u product_128;
+  long_mul(&product_128, op1, op2); // presumably the full 64x64 -> 128-bit product (helper not visible here; confirm in wide_int.h)
+
+  Bit64u temp = (product_128.lo >> 52) | ((product_128.hi & BX_CONST64(0x000000ffffffffff)) << 12); // lo[63:52] lands in bits 11:0, hi[39:0] in bits 51:12
+
+  return dst + temp;
+}
+
+void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *i) // unmasked form: every qword element is updated
+{
+  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst()); // dst is read too: it is the accumulator
+  unsigned len = i->getVL();
+
+  for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
+    dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
+  }
+
+  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
+  BX_NEXT_INSTR(i);
+}
+
+void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *i) // unmasked form: every qword element is updated
+{
+  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst()); // dst is read too: it is the accumulator
+  unsigned len = i->getVL();
+
+  for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
+    dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
+  }
+
+  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
+  BX_NEXT_INSTR(i);
+}
+
+#endif // BX_SUPPORT_AVX || BX_SUPPORT_EVEX
+
+#if BX_SUPPORT_EVEX
+
+void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c *i) // opmask-controlled form
+{
+  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
+  Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
+  unsigned len = i->getVL();
+
+  for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) { // bit 0 of tmp_mask is the mask bit for element n
+    if (tmp_mask & 0x1)
+      dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
+    else if (i->isZeroMasking())
+      dst.vmm64u(n) = 0; // masked-off element: zeroed under zero-masking, otherwise the old dst value is kept
+  }
+
+  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
+  BX_NEXT_INSTR(i);
+}
+
+void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c *i) // opmask-controlled form
+{
+  BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
+  Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
+  unsigned len = i->getVL();
+
+  for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) { // bit 0 of tmp_mask is the mask bit for element n
+    if (tmp_mask & 0x1)
+      dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
+    else if (i->isZeroMasking())
+      dst.vmm64u(n) = 0; // masked-off element: zeroed under zero-masking, otherwise the old dst value is kept
+  }
+
+  BX_WRITE_AVX_REGZ(i->dst(), dst, len);
+  BX_NEXT_INSTR(i);
+}
+
+#endif // BX_SUPPORT_EVEX
From aa84121edeec88e75069b9946cc9296f2aab4b5c Mon Sep 17 00:00:00 2001
From: Stanislav Shwartsman
Date: Sun, 2 Oct 2022 23:26:57 +0300
Subject: [PATCH 5/5] changes (c) date

---
 bochs/cpu/avx/avx_ifma52.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bochs/cpu/avx/avx_ifma52.cc b/bochs/cpu/avx/avx_ifma52.cc
index 7878f7bd9..faf8f1e11 100644
--- a/bochs/cpu/avx/avx_ifma52.cc
+++ b/bochs/cpu/avx/avx_ifma52.cc
@@ -2,7 +2,7 @@
 // $Id$
 /////////////////////////////////////////////////////////////////////////
 //
-// Copyright (c) 2011-2018 Stanislav Shwartsman
+// Copyright (c) 2022 Stanislav Shwartsman
 // Written by Stanislav Shwartsman [sshwarts at sourceforge net]
 //
 // This library is free software; you can redistribute it and/or