implemented AMX_FP16 and AMX_COMPLEX, fixes for daz handling in AVX_NE_CONVERT FB16

updated CHANGES
This commit is contained in:
Shwartsman 2024-01-12 12:03:11 +02:00
parent 167942816a
commit 6f4f217a08
9 changed files with 234 additions and 22 deletions

View File

@ -4,14 +4,15 @@ The Bochs source tree is transitioning from SVN to GIT hosted on github (https:/
We welcome every new contributor !
Brief summary :
- Bugfixes for CPU emulation correctness (MONITOR/MWAIT, VMX/SVM, AVX-512, CET, SHA, GFNI fixes)
- Bugfixes for CPU emulation correctness (MONITOR/MWAIT, VMX/SVM, x87, AVX-512, CET, SHA, GFNI fixes)
! Implemented VMX MBE (Mode Based Execution Control) emulation required for Windows 11 guest
! Implemented Posted-Interrupt Processing VMX extension emulation
! Implemented Linear Address Separation (LASS) extension
! Implemented 57-bit Linear Address and 5-Level Paging support
! Implemented User-Level Interrupt (UINTR) extension
! Implemented Intel AMX extensions (AMX, AMX_INT8, AMX_BF16, AMX_FP16, AMX_COMPLEX)
! Implemented Intel instruction sets:
- MOVDIRI/MOVDIR64B, AMX, AVX512 BF16, AVX IFMA52, AVX-VNNI/VNNI-INT8/VNNI-INT16, AVX-NE-CONVERT, CMPCCXADD, SM3/SM4, SHA512, WRMSRNS, MSRLIST, WAITPKG, SERIALIZE
- MOVDIRI/MOVDIR64B, AVX512 BF16, AVX IFMA52, AVX-VNNI/VNNI-INT8/VNNI-INT16, AVX-NE-CONVERT, CMPCCXADD, SM3/SM4, SHA512, WRMSRNS, MSRLIST, WAITPKG, SERIALIZE
! CPUID: Added Xeon Sapphire Rapids CPU definition
- Improved 64-bit guest support in Bochs internal debugger, added new internal debugger commands
- Bochs debugger enhanced with new commands (setpmem, loadmem, deref, ...)
@ -33,17 +34,18 @@ Detailed change log :
- Improved parsing bochsrc options passed on the command line.
- CPU/CPUDB
- Bugfixes for CPU emulation correctness (MONITOR/MWAIT, VMX/SVM, AVX-512, CET)
- Bugfixes for CPU emulation correctness (MONITOR/MWAIT, VMX/SVM, x87, AVX-512, CET)
- Critical CPU emulation bugfixes for SHA and GFNI instructions
- Implemented VMX MBE (Mode Based Execution Control) emulation required for Windows 11 guest
- Implemented Posted-Interrupt Processing VMX extension emulation
- Implemented Linear Address Separation (LASS) extension
- Implemented 57-bit Linear Address and 5-Level Paging support
- Implemented User-Level Interrupt (UINTR) extension
- Implemented Intel AMX extensions (AMX, AMX_INT8, AMX_BF16, AMX_FP16, AMX_COMPLEX)
- Implemented Intel instruction sets:
- MOVDIRI/MOVDIR64B, AMX, AVX512 BF16, AVX IFMA52, AVX-VNNI/VNNI-INT8/VNNI-INT16, AVX-NE-CONVERT, CMPCCXADD, SM3/SM4, SHA512, WRMSRNS, MSRLIST, WAITPKG, SERIALIZE
- MOVDIRI/MOVDIR64B, AVX512 BF16, AVX IFMA52, AVX-VNNI/VNNI-INT8/VNNI-INT16, AVX-NE-CONVERT, CMPCCXADD, SM3/SM4, SHA512, WRMSRNS, MSRLIST, WAITPKG, SERIALIZE
- CPUID: Added Xeon Sapphire Rapids CPU definition
- Features PKS, WAITPKG, UINTR, AVX-VNNI, AVX512_BF16, MOVDIRI/MOVDIR64, LA57, SERIALIZE and more
- Features AMX/AMX_INT8/AMX_BF16, PKS, WAITPKG, UINTR, AVX-VNNI, AVX512_BF16, MOVDIRI/MOVDIR64, LA57, SERIALIZE and more
Not yet supported but will be added in future: AVX512_FP16, VMX Extensions (HLAT, IPI Virtualization)
- Bochs Debugger and Instrumentation

View File

@ -204,6 +204,28 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILESTORED_MdqTnnn(bxInstruction_c *i)
BX_NEXT_INSTR(i);
}
// TILEZERO handler.
// Reads the destination tile index from the instruction. If the index is not
// below BX_TILE_REGISTERS, or amx->tile_valid() rejects it, an error is logged
// and exception(BX_UD_EXCEPTION, 0) is invoked. Otherwise clear_tile_used() and
// the tile's own clear() are called for that tile, followed by amx->restart().
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILEZERO_Tnnn(bxInstruction_c *i)
{
  const unsigned tile = i->dst();

  // The range test comes first so tile_valid() is only queried for in-range indices.
  const bool tile_ok = (tile < BX_TILE_REGISTERS) && BX_CPU_THIS_PTR amx->tile_valid(tile);
  if (! tile_ok) {
    BX_ERROR(("TILEZERO: invalid tile %d", tile));
    exception(BX_UD_EXCEPTION, 0);
  }

  BX_CPU_THIS_PTR amx->clear_tile_used(tile);
  BX_CPU_THIS_PTR amx->tile[tile].clear();

  BX_CPU_THIS_PTR amx->restart();

  BX_NEXT_INSTR(i);
}
// TILERELEASE handler: calls clear() on the CPU's AMX state object and then
// continues with the next instruction.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILERELEASE(bxInstruction_c *i)
{
  BX_CPU_THIS_PTR amx->clear();

  BX_NEXT_INSTR(i);
}
void BX_CPU_C::check_tiles(bxInstruction_c *i, unsigned tile_dst, unsigned tile_src1, unsigned tile_src2)
{
// #UD if srcdest == src1 OR src1 == src2 OR srcdest == src2
@ -276,6 +298,8 @@ void BX_CPU_C::check_tiles(bxInstruction_c *i, unsigned tile_dst, unsigned tile_
}
}
// AMX-INT8 //
BX_CPP_INLINE Bit32u DPBDSS(Bit32u x, Bit32u y)
{
const Bit8u xbyte[4] = { Bit8u(x & 0xff), Bit8u((x >> 8) & 0xff), Bit8u((x >> 16) & 0xff), Bit8u(x >> 24) };
@ -460,6 +484,8 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::TDPBUUD_TnnnTrmTreg(bxInstruction_c *i)
BX_NEXT_INSTR(i);
}
// AMX-BF16 //
#include "bf16.h"
extern float_status_t prepare_ne_softfloat_status_helper();
@ -481,7 +507,10 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::TDPBF16PS_TnnnTrmTreg(bxInstruction_c *i)
AMX::TILE *tsrc1 = &(BX_CPU_THIS_PTR amx->tile[tile_src1]);
AMX::TILE *tsrc2 = &(BX_CPU_THIS_PTR amx->tile[tile_src2]);
// "round to nearest even" rounding mode is used when doing each accumulation of the FMA.
// output denormals are always flushed to zero and input denormals are always treated as zero.
float_status_t status = prepare_ne_softfloat_status_helper();
status.denormals_are_zeros = true;
for (unsigned m=0; m < max_m; m++) {
float32 tmp[32]; // new empty array
@ -512,25 +541,166 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::TDPBF16PS_TnnnTrmTreg(bxInstruction_c *i)
BX_NEXT_INSTR(i);
}
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILEZERO_Tnnn(bxInstruction_c *i)
{
unsigned tile = i->dst();
// AMX-FP16 //
if (tile >= BX_TILE_REGISTERS || ! BX_CPU_THIS_PTR amx->tile_valid(tile)) {
BX_ERROR(("TILEZERO: invalid tile %d", tile));
exception(BX_UD_EXCEPTION, 0);
extern float32 convert_ne_fp16_to_fp32(float16 op);
// TDPFP16PS handler.
// For every row m of the destination tile and every 32-bit column n, the two
// 16-bit halves of tsrc1 element [m][k] are multiplied with the matching halves
// of tsrc2 element [k][n] (each half converted with convert_ne_fp16_to_fp32),
// the products are accumulated over k in two separate float32 accumulators,
// and the sum of both accumulators is added to the destination element [m][n].
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TDPFP16PS_TnnnTrmTreg(bxInstruction_c *i)
{
unsigned tile_dst = i->dst(), tile_src1 = i->src1(), tile_src2 = i->src2();
// validates the tile operand combination (defined elsewhere in this file)
check_tiles(i, tile_dst, tile_src1, tile_src2);
// R C
// A = m x k (tsrc1)
// B = k x n (tsrc2)
// C = m x n (tsrcdest)
// max_n counts 32-bit destination elements per row (bytes per row / 4)
unsigned max_n = BX_CPU_THIS_PTR amx->tile_bytes_per_row(tile_dst) / 4;
unsigned max_m = BX_CPU_THIS_PTR amx->tile_num_rows(tile_dst);
// the k dimension is taken from the row count of the second source tile
unsigned max_k = BX_CPU_THIS_PTR amx->tile_num_rows(tile_src2);
AMX::TILE *tdst = &(BX_CPU_THIS_PTR amx->tile[tile_dst]);
AMX::TILE *tsrc1 = &(BX_CPU_THIS_PTR amx->tile[tile_src1]);
AMX::TILE *tsrc2 = &(BX_CPU_THIS_PTR amx->tile[tile_src2]);
// "round to nearest even" rounding mode is used when doing each accumulation of the FMA.
// output FP32 denormals are always flushed to zero and input denormals are always treated as zero.
float_status_t status = prepare_ne_softfloat_status_helper();
// local override: this status copy treats denormal inputs as zero
status.denormals_are_zeros = true;
for (unsigned m=0; m < max_m; m++) {
// per-row accumulators: tmp[2*n] collects the even-half products and
// tmp[2*n+1] the odd-half products for destination column n
float32 tmp[32]; // new empty array
for (unsigned n=0; n < 32; n++) tmp[n] = 0;
for (unsigned k=0; k < max_k; k++) {
for (unsigned n=0; n < max_n; n++) {
tmp[2*n] = float32_fmadd(convert_ne_fp16_to_fp32(tsrc1->row[m].vmm16u(2*k)),
convert_ne_fp16_to_fp32(tsrc2->row[k].vmm16u(2*n)), tmp[2*n], status);
tmp[2*n+1] = float32_fmadd(convert_ne_fp16_to_fp32(tsrc1->row[m].vmm16u(2*k+1)),
convert_ne_fp16_to_fp32(tsrc2->row[k].vmm16u(2*n+1)), tmp[2*n+1], status);
}
}
// fold both accumulators of each column and add them to the destination element
for (unsigned n=0; n < max_n; n++) {
float32 tmpf32 = float32_add(tmp[2*n], tmp[2*n+1], status);
tdst->row[m].vmm32u(n) = float32_add(tdst->row[m].vmm32u(n), tmpf32, status);
}
tdst->zero_upper_row_data32(m, max_n);
}
// NOTE(review): the next two statements use `tile`, which is not declared in
// this function; the same two statements appear in TILEZERO_Tnnn, so they look
// like removed-line residue of the diff rendering - confirm before relying on them.
BX_CPU_THIS_PTR amx->clear_tile_used(tile);
BX_CPU_THIS_PTR amx->tile[tile].clear();
BX_CPU_THIS_PTR amx->set_tile_used(tile_dst);
BX_CPU_THIS_PTR amx->tile[tile_dst].clear_upper_rows(max_m);
BX_CPU_THIS_PTR amx->restart();
BX_NEXT_INSTR(i);
}
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILERELEASE(bxInstruction_c *i)
// AMX-COMPLEX //
// TCMMRLFP16PS handler.
// Each source dword is read as a pair of 16-bit values (even index = real,
// odd index = imaginary) that are converted with convert_ne_fp16_to_fp32.
// For every destination element [m][n] the handler accumulates over k the
// real*real products and, with the product negated, the imaginary*imaginary
// products, then adds the sum of both accumulators to the destination element.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TCMMRLFP16PS_TnnnTrmTreg(bxInstruction_c *i)
{
// NOTE(review): this statement matches the body of TILERELEASE and precedes the
// operand decode; it looks like removed-line residue of the diff rendering -
// confirm before relying on it.
BX_CPU_THIS_PTR amx->clear();
unsigned tile_dst = i->dst(), tile_src1 = i->src1(), tile_src2 = i->src2();
// validates the tile operand combination (defined elsewhere in this file)
check_tiles(i, tile_dst, tile_src1, tile_src2);
// R C
// A = m x k (tsrc1)
// B = k x n (tsrc2)
// C = m x n (tsrcdest)
// max_n counts 32-bit destination elements per row (bytes per row / 4)
unsigned max_n = BX_CPU_THIS_PTR amx->tile_bytes_per_row(tile_dst) / 4;
unsigned max_m = BX_CPU_THIS_PTR amx->tile_num_rows(tile_dst);
// the k dimension is taken from the row count of the second source tile
unsigned max_k = BX_CPU_THIS_PTR amx->tile_num_rows(tile_src2);
AMX::TILE *tdst = &(BX_CPU_THIS_PTR amx->tile[tile_dst]);
AMX::TILE *tsrc1 = &(BX_CPU_THIS_PTR amx->tile[tile_src1]);
AMX::TILE *tsrc2 = &(BX_CPU_THIS_PTR amx->tile[tile_src2]);
// "round to nearest even" rounding mode is used when doing each accumulation of the FMA.
// output FP32 denormals are always flushed to zero and input denormals are always treated as zero.
float_status_t status = prepare_ne_softfloat_status_helper();
// local override: this status copy treats denormal inputs as zero
status.denormals_are_zeros = true;
for (unsigned m=0; m < max_m; m++) {
// per-row accumulators: tmp[2*n] collects real*real products and
// tmp[2*n+1] the negated imaginary*imaginary products for column n
float32 tmp[32]; // new empty array
for (unsigned n=0; n < 32; n++) tmp[n] = 0;
for (unsigned k=0; k < max_k; k++) {
for (unsigned n=0; n < max_n; n++) {
float32 s1r = convert_ne_fp16_to_fp32(tsrc1->row[m].vmm16u(2*k)); // real
float32 s2r = convert_ne_fp16_to_fp32(tsrc2->row[k].vmm16u(2*n)); // real
float32 s1i = convert_ne_fp16_to_fp32(tsrc1->row[m].vmm16u(2*k+1)); // imaginary
float32 s2i = convert_ne_fp16_to_fp32(tsrc2->row[k].vmm16u(2*n+1)); // imaginary
tmp[2*n] = float32_muladd(s1r, s2r, tmp[2*n], 0, status); // real
tmp[2*n+1] = float32_muladd(s1i, s2i, tmp[2*n+1], float_muladd_negate_product, status); // imaginary, negate for i^2 = -1
}
}
// fold both accumulators of each column and add them to the destination element
for (unsigned n=0; n < max_n; n++) {
float32 tmpf32 = float32_add(tmp[2*n], tmp[2*n+1], status);
tdst->row[m].vmm32u(n) = float32_add(tdst->row[m].vmm32u(n), tmpf32, status);
}
tdst->zero_upper_row_data32(m, max_n);
}
BX_CPU_THIS_PTR amx->set_tile_used(tile_dst);
BX_CPU_THIS_PTR amx->tile[tile_dst].clear_upper_rows(max_m);
BX_CPU_THIS_PTR amx->restart();
BX_NEXT_INSTR(i);
}
// TCMMIMFP16PS handler.
// Every source dword is read as a pair of 16-bit values (even index = real,
// odd index = imaginary), each converted with convert_ne_fp16_to_fp32. For
// destination element [m][n] the handler accumulates over k the products
// imag(A[m][k]) * real(B[k][n]) and real(A[m][k]) * imag(B[k][n]) in two
// float32 accumulators, then adds the sum of both to the destination element.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TCMMIMFP16PS_TnnnTrmTreg(bxInstruction_c *i)
{
  const unsigned tile_dst = i->dst();
  const unsigned tile_src1 = i->src1();
  const unsigned tile_src2 = i->src2();

  check_tiles(i, tile_dst, tile_src1, tile_src2);

  // Matrix shapes (rows x columns):
  //   A = m x k (tsrc1), B = k x n (tsrc2), C = m x n (destination)
  const unsigned cols_n = BX_CPU_THIS_PTR amx->tile_bytes_per_row(tile_dst) / 4;
  const unsigned rows_m = BX_CPU_THIS_PTR amx->tile_num_rows(tile_dst);
  const unsigned depth_k = BX_CPU_THIS_PTR amx->tile_num_rows(tile_src2);

  AMX::TILE *acc = &(BX_CPU_THIS_PTR amx->tile[tile_dst]);
  AMX::TILE *mat_a = &(BX_CPU_THIS_PTR amx->tile[tile_src1]);
  AMX::TILE *mat_b = &(BX_CPU_THIS_PTR amx->tile[tile_src2]);

  // Start from the shared "nearest even" status and additionally treat
  // denormal inputs as zero for the float32 arithmetic done here.
  float_status_t status = prepare_ne_softfloat_status_helper();
  status.denormals_are_zeros = true;

  for (unsigned m = 0; m < rows_m; ++m) {
    // Two accumulators per destination column: slot 2*n and slot 2*n+1.
    float32 partial[32];
    for (unsigned slot = 0; slot < 32; ++slot)
      partial[slot] = 0;

    for (unsigned k = 0; k < depth_k; ++k) {
      for (unsigned n = 0; n < cols_n; ++n) {
        const unsigned even = 2*n;
        const unsigned odd = even + 1;

        float32 a_re = convert_ne_fp16_to_fp32(mat_a->row[m].vmm16u(2*k));
        float32 b_re = convert_ne_fp16_to_fp32(mat_b->row[k].vmm16u(even));
        float32 a_im = convert_ne_fp16_to_fp32(mat_a->row[m].vmm16u(2*k+1));
        float32 b_im = convert_ne_fp16_to_fp32(mat_b->row[k].vmm16u(odd));

        partial[even] = float32_muladd(a_im, b_re, partial[even], 0, status);
        partial[odd] = float32_muladd(a_re, b_im, partial[odd], 0, status);
      }
    }

    // Combine both accumulators of a column and add them to the destination.
    for (unsigned n = 0; n < cols_n; ++n) {
      const float32 pair_sum = float32_add(partial[2*n], partial[2*n+1], status);
      const float32 updated = float32_add(acc->row[m].vmm32u(n), pair_sum, status);
      acc->row[m].vmm32u(n) = updated;
    }

    acc->zero_upper_row_data32(m, cols_n);
  }

  BX_CPU_THIS_PTR amx->set_tile_used(tile_dst);
  acc->clear_upper_rows(rows_m);
  BX_CPU_THIS_PTR amx->restart();

  BX_NEXT_INSTR(i);
}

View File

@ -86,7 +86,10 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VDPBF16PS_MASK_VpsHdqWdqR(bxInstruction_c
unsigned len = i->getVL();
Bit32u mask = (i->opmask() != 0) ? BX_READ_16BIT_OPMASK(i->opmask()) : 0xffff;
static float_status_t status = prepare_ne_softfloat_status_helper();
// "round to nearest even" rounding mode is used when doing each accumulation of the FMA.
// output denormals are always flushed to zero and input denormals are always treated as zero.
float_status_t status = prepare_ne_softfloat_status_helper();
status.denormals_are_zeros = true;
for (unsigned n=0, tmp_mask = mask; n < DWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
if (tmp_mask & 0x1) {

View File

@ -29,8 +29,8 @@
#if BX_SUPPORT_AVX
// FP32: s|eeeeeeee|mmmmmmmmmmmmmmmmmmmmmmm
// BF16: s|eeeeeeee|mmmmmmmm
// F16: s|eeeee|mmmmmmmmmmm
// BF16: s|eeeeeeee|mmmmmmm
// F16: s|eeeee|mmmmmmmmmm
float_status_t prepare_ne_softfloat_status_helper()
{
@ -42,12 +42,13 @@ float_status_t prepare_ne_softfloat_status_helper()
status.float_suppress_exception = float_all_exceptions_mask;
status.float_nan_handling_mode = float_first_operand_nan;
status.flush_underflow_to_zero = true;
status.denormals_are_zeros = true;
// input denormals not converted to zero and handled normally
status.denormals_are_zeros = false;
return status;
}
static float32 convert_ne_fp16_to_fp32(float16 op)
float32 convert_ne_fp16_to_fp32(float16 op)
{
static float_status_t status = prepare_ne_softfloat_status_helper();
return float16_to_float32(op, status);

View File

@ -3601,6 +3601,9 @@ public: // for now...
BX_SMF void TDPBUSD_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
BX_SMF void TDPBUUD_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
BX_SMF void TDPBF16PS_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
BX_SMF void TDPFP16PS_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
BX_SMF void TCMMRLFP16PS_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
BX_SMF void TCMMIMFP16PS_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
BX_SMF void TILEZERO_Tnnn(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
BX_SMF void TILERELEASE(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
#endif

View File

@ -1166,7 +1166,15 @@ Bit32u bx_cpuid_t::get_std_cpuid_leaf_7_subleaf_1_eax(Bit32u extra) const
eax |= BX_CPUID_STD7_SUBLEAF1_EAX_WRMSRNS;
// [20:20] reserved
// [21:21] AMX-FB16 support
// [21:21] AMX-FP16 support
#if BX_SUPPORT_AMX
if (is_cpu_extension_supported(BX_ISA_AMX)) {
if (is_cpu_extension_supported(BX_ISA_AMX_FP16))
eax |= BX_CPUID_STD7_SUBLEAF1_EAX_AMX_FP16;
}
#endif
// [22:22] HRESET and CPUID leaf 0x20 support
// [23:23] AVX IFMA support
@ -1207,7 +1215,15 @@ Bit32u bx_cpuid_t::get_std_cpuid_leaf_7_subleaf_1_edx(Bit32u extra) const
#endif
// [7:6] reserved
// [8:8] AMX-COMPLEX instructions
#if BX_SUPPORT_AMX
if (is_cpu_extension_supported(BX_ISA_AMX)) {
if (is_cpu_extension_supported(BX_ISA_AMX_COMPLEX))
edx |= BX_CPUID_STD7_SUBLEAF1_EDX_AMX_COMPLEX;
}
#endif
// [9:9] reserved
// [10:10] AVX-VNNI-INT16 instructions

View File

@ -119,6 +119,8 @@ x86_feature(BX_ISA_AVX_NE_CONVERT, "avx_ne_convert") /* AVX-N
x86_feature(BX_ISA_AMX, "amx") /* AMX Instructions */
x86_feature(BX_ISA_AMX_INT8, "amx_int8") /* AMX-INT8 Instructions */
x86_feature(BX_ISA_AMX_BF16, "amx_bf16") /* AMX-BF16 Instructions */
x86_feature(BX_ISA_AMX_FP16, "amx_fp16") /* AMX-FP16 Instructions */
x86_feature(BX_ISA_AMX_COMPLEX, "amx_complex") /* AMX-COMPLEX Instructions */
#endif
#endif
x86_feature(BX_ISA_XAPIC, "xapic") /* XAPIC support */

View File

@ -1031,7 +1031,10 @@ static const Bit64u BxOpcodeGroup_VEX_0F3859[] = { last_opcode(ATTR_SSE_PREFIX_6
static const Bit64u BxOpcodeGroup_VEX_0F385A[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0 | ATTR_VL256 | ATTR_MOD_MEM, BX_IA_V256_VBROADCASTI128_VdqMdq) };
#if BX_SUPPORT_AMX
static const Bit64u BxOpcodeGroup_VEX_0F385C[] = { last_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBF16PS_TnnnTrmTreg) };
// Opcode group BxOpcodeGroup_VEX_0F385C (presumably VEX map 0F38, opcode byte 5C,
// going by the name - confirm against BxOpcodeTableVEX).
// Two entries, distinguished only by the SSE prefix attribute:
//   ATTR_SSE_PREFIX_F2 -> BX_IA_TDPFP16PS_TnnnTrmTreg
//   ATTR_SSE_PREFIX_F3 -> BX_IA_TDPBF16PS_TnnnTrmTreg
// Both carry ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64; the final entry
// is built with last_opcode() instead of form_opcode().
static const Bit64u BxOpcodeGroup_VEX_0F385C[] = {
form_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPFP16PS_TnnnTrmTreg),
last_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBF16PS_TnnnTrmTreg)
};
static const Bit64u BxOpcodeGroup_VEX_0F385E[] = {
form_opcode(ATTR_SSE_NO_PREFIX | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBUUD_TnnnTrmTreg),
@ -1039,6 +1042,11 @@ static const Bit64u BxOpcodeGroup_VEX_0F385E[] = {
form_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBSSD_TnnnTrmTreg),
last_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBSUD_TnnnTrmTreg)
};
// Opcode group BxOpcodeGroup_VEX_0F386C, installed in the 6C slot of BxOpcodeTableVEX
// when BX_SUPPORT_AMX is enabled.
// Two entries, distinguished only by the SSE prefix attribute:
//   ATTR_SSE_NO_PREFIX -> BX_IA_TCMMRLFP16PS_TnnnTrmTreg
//   ATTR_SSE_PREFIX_66 -> BX_IA_TCMMIMFP16PS_TnnnTrmTreg
// Both carry ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64; the final entry
// is built with last_opcode() instead of form_opcode().
static const Bit64u BxOpcodeGroup_VEX_0F386C[] = {
form_opcode(ATTR_SSE_NO_PREFIX | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TCMMRLFP16PS_TnnnTrmTreg),
last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TCMMIMFP16PS_TnnnTrmTreg)
};
#endif
static const Bit64u BxOpcodeGroup_VEX_0F3872[] = { last_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0, BX_IA_VCVTNEPS2BF16_Vbf16Wps) };
@ -1968,7 +1976,11 @@ static const Bit64u *BxOpcodeTableVEX[256*3] = {
/* 69 */ ( BxOpcodeGroup_ERR ),
/* 6A */ ( BxOpcodeGroup_ERR ),
/* 6B */ ( BxOpcodeGroup_ERR ),
#if BX_SUPPORT_AMX
/* 6C */ ( BxOpcodeGroup_VEX_0F386C ),
#else
/* 6C */ ( BxOpcodeGroup_ERR ),
#endif
/* 6D */ ( BxOpcodeGroup_ERR ),
/* 6E */ ( BxOpcodeGroup_ERR ),
/* 6F */ ( BxOpcodeGroup_ERR ),

View File

@ -2773,6 +2773,9 @@ bx_define_opcode(BX_IA_TDPBSUD_TnnnTrmTreg, "tdpbsud", "tdpbsud", NULL, &BX_CPU_
bx_define_opcode(BX_IA_TDPBUSD_TnnnTrmTreg, "tdpbusd", "tdpbusd", NULL, &BX_CPU_C::TDPBUSD_TnnnTrmTreg, BX_ISA_AMX_INT8, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
bx_define_opcode(BX_IA_TDPBUUD_TnnnTrmTreg, "tdpbuud", "tdpbuud", NULL, &BX_CPU_C::TDPBUUD_TnnnTrmTreg, BX_ISA_AMX_INT8, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
bx_define_opcode(BX_IA_TDPBF16PS_TnnnTrmTreg, "tdpbf16ps", "tdpbf16ps", NULL, &BX_CPU_C::TDPBF16PS_TnnnTrmTreg, BX_ISA_AMX_BF16, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
bx_define_opcode(BX_IA_TDPFP16PS_TnnnTrmTreg, "tdpfp16ps", "tdpfp16ps", NULL, &BX_CPU_C::TDPFP16PS_TnnnTrmTreg, BX_ISA_AMX_FP16, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
bx_define_opcode(BX_IA_TCMMRLFP16PS_TnnnTrmTreg, "tcmmrlfp16ps", "tcmmrlfp16ps", NULL, &BX_CPU_C::TCMMRLFP16PS_TnnnTrmTreg, BX_ISA_AMX_COMPLEX, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
bx_define_opcode(BX_IA_TCMMIMFP16PS_TnnnTrmTreg, "tcmmimfp16ps", "tcmmimfp16ps", NULL, &BX_CPU_C::TCMMIMFP16PS_TnnnTrmTreg, BX_ISA_AMX_COMPLEX, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
#endif
#if BX_SUPPORT_AVX