implemented AMX_FP16 and AMX_COMPLEX, fixes for DAZ handling in AVX_NE_CONVERT FP16
updated CHANGES
This commit is contained in:
parent
167942816a
commit
6f4f217a08
@ -4,14 +4,15 @@ The Bochs source tree is transitioning from SVN to GIT hosted on github (https:/
|
||||
We welcome every new contributor !
|
||||
|
||||
Brief summary :
|
||||
- Bugfixes for CPU emulation correctness (MONITOR/MWAIT, VMX/SVM, AVX-512, CET, SHA, GFNI fixes)
|
||||
- Bugfixes for CPU emulation correctness (MONITOR/MWAIT, VMX/SVM, x87, AVX-512, CET, SHA, GFNI fixes)
|
||||
! Implemented VMX MBE (Mode Based Execution Control) emulation required for Windows 11 guest
|
||||
! Implemented Posted-Interrupt Processing VMX extension emulation
|
||||
! Implemented Linear Address Separation (LASS) extension
|
||||
! Implemented 57-bit Linear Address and 5-Level Paging support
|
||||
! Implemented User-Level Interrupt (UINTR) extension
|
||||
! Implemented Intel AMX extensions (AMX, AMX_INT8, AMX_BF16, AMX_FP16, AMX_COMPLEX)
|
||||
! Implemented Intel instruction sets:
|
||||
- MOVDIRI/MOVDIR64B, AMX, AVX512 BF16, AVX IFMA52, AVX-VNNI/VNNI-INT8/VNNI-INT16, AVX-NE-CONVERT, CMPCCXADD, SM3/SM4, SHA512, WRMSRNS, MSRLIST, WAITPKG, SERIALIZE
|
||||
- MOVDIRI/MOVDIR64B, AVX512 BF16, AVX IFMA52, AVX-VNNI/VNNI-INT8/VNNI-INT16, AVX-NE-CONVERT, CMPCCXADD, SM3/SM4, SHA512, WRMSRNS, MSRLIST, WAITPKG, SERIALIZE
|
||||
! CPUID: Added Xeon Sapphire Rapids CPU definition
|
||||
- Improved 64-bit guest support in Bochs internal debugger, added new internal debugger commands
|
||||
- Bochs debugger enhanced with new commands (setpmem, loadmem, deref, ...)
|
||||
@ -33,17 +34,18 @@ Detailed change log :
|
||||
- Improved parsing bochsrc options passed on the command line.
|
||||
|
||||
- CPU/CPUDB
|
||||
- Bugfixes for CPU emulation correctness (MONITOR/MWAIT, VMX/SVM, AVX-512, CET)
|
||||
- Bugfixes for CPU emulation correctness (MONITOR/MWAIT, VMX/SVM, x87, AVX-512, CET)
|
||||
- Critical CPU emulation bugfixes for SHA and GFNI instructions
|
||||
- Implemented VMX MBE (Mode Based Execution Control) emulation required for Windows 11 guest
|
||||
- Implemented Posted-Interrupt Processing VMX extension emulation
|
||||
- Implemented Linear Address Separation (LASS) extension
|
||||
- Implemented 57-bit Linear Address and 5-Level Paging support
|
||||
- Implemented User-Level Interrupt (UINTR) extension
|
||||
- Implemented Intel AMX extensions (AMX, AMX_INT8, AMX_BF16, AMX_FP16, AMX_COMPLEX)
|
||||
- Implemented Intel instruction sets:
|
||||
- MOVDIRI/MOVDIR64B, AMX, AVX512 BF16, AVX IFMA52, AVX-VNNI/VNNI-INT8/VNNI-INT16, AVX-NE-CONVERT, CMPCCXADD, SM3/SM4, SHA512, WRMSRNS, MSRLIST, WAITPKG, SERIALIZE
|
||||
- MOVDIRI/MOVDIR64B, AVX512 BF16, AVX IFMA52, AVX-VNNI/VNNI-INT8/VNNI-INT16, AVX-NE-CONVERT, CMPCCXADD, SM3/SM4, SHA512, WRMSRNS, MSRLIST, WAITPKG, SERIALIZE
|
||||
- CPUID: Added Xeon Sapphire Rapids CPU definition
|
||||
- Features PKS, WAITPKG, UINTR, AVX-VNNI, AVX512_BF16, MOVDIRI/MOVDIR64, LA57, SERIALIZE and more
|
||||
- Features AMX/AMX_INT8/AMX_BF16, PKS, WAITPKG, UINTR, AVX-VNNI, AVX512_BF16, MOVDIRI/MOVDIR64, LA57, SERIALIZE and more
|
||||
Not yet supported but will be added in future: AVX512_FP16, VMX Extensions (HLAT, IPI Virtualization)
|
||||
|
||||
- Bochs Debugger and Instrumentation
|
||||
|
@ -204,6 +204,28 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILESTORED_MdqTnnn(bxInstruction_c *i)
|
||||
BX_NEXT_INSTR(i);
|
||||
}
|
||||
|
||||
// TILEZERO: clears the tile register named by the instruction's destination
// operand. Logs an error and raises BX_UD_EXCEPTION when the index is not
// below BX_TILE_REGISTERS or the AMX state does not report the tile as valid.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILEZERO_Tnnn(bxInstruction_c *i)
{
  const unsigned tile = i->dst();

  // The range check comes first so tile_valid() is only queried for an
  // in-range index (same short-circuit order as the rejecting condition).
  const bool tile_ok = (tile < BX_TILE_REGISTERS) && BX_CPU_THIS_PTR amx->tile_valid(tile);
  if (! tile_ok) {
    BX_ERROR(("TILEZERO: invalid tile %d", tile));
    exception(BX_UD_EXCEPTION, 0);
  }

  // Drop the "used" mark, wipe the tile contents, then restart the AMX state.
  BX_CPU_THIS_PTR amx->clear_tile_used(tile);
  BX_CPU_THIS_PTR amx->tile[tile].clear();
  BX_CPU_THIS_PTR amx->restart();

  BX_NEXT_INSTR(i);
}
|
||||
|
||||
// TILERELEASE: invokes clear() on the CPU's AMX state object and then
// continues with the next instruction. Takes no operands from the instruction.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILERELEASE(bxInstruction_c *i)
{
  BX_CPU_THIS_PTR amx->clear();

  BX_NEXT_INSTR(i);
}
|
||||
|
||||
void BX_CPU_C::check_tiles(bxInstruction_c *i, unsigned tile_dst, unsigned tile_src1, unsigned tile_src2)
|
||||
{
|
||||
// #UD if srcdest == src1 OR src1 == src2 OR srcdest == src2
|
||||
@ -276,6 +298,8 @@ void BX_CPU_C::check_tiles(bxInstruction_c *i, unsigned tile_dst, unsigned tile_
|
||||
}
|
||||
}
|
||||
|
||||
// AMX-INT8 //
|
||||
|
||||
BX_CPP_INLINE Bit32u DPBDSS(Bit32u x, Bit32u y)
|
||||
{
|
||||
const Bit8u xbyte[4] = { Bit8u(x & 0xff), Bit8u((x >> 8) & 0xff), Bit8u((x >> 16) & 0xff), Bit8u(x >> 24) };
|
||||
@ -460,6 +484,8 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::TDPBUUD_TnnnTrmTreg(bxInstruction_c *i)
|
||||
BX_NEXT_INSTR(i);
|
||||
}
|
||||
|
||||
// AMX-BF16 //
|
||||
|
||||
#include "bf16.h"
|
||||
|
||||
extern float_status_t prepare_ne_softfloat_status_helper();
|
||||
@ -481,7 +507,10 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::TDPBF16PS_TnnnTrmTreg(bxInstruction_c *i)
|
||||
AMX::TILE *tsrc1 = &(BX_CPU_THIS_PTR amx->tile[tile_src1]);
|
||||
AMX::TILE *tsrc2 = &(BX_CPU_THIS_PTR amx->tile[tile_src2]);
|
||||
|
||||
// "round to nearest even" rounding mode is used when doing each accumulation of the FMA.
|
||||
// output denormals are always flushed to zero and input denormals are always treated as zero.
|
||||
float_status_t status = prepare_ne_softfloat_status_helper();
|
||||
status.denormals_are_zeros = true;
|
||||
|
||||
for (unsigned m=0; m < max_m; m++) {
|
||||
float32 tmp[32]; // new empty array
|
||||
@ -512,25 +541,166 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::TDPBF16PS_TnnnTrmTreg(bxInstruction_c *i)
|
||||
BX_NEXT_INSTR(i);
|
||||
}
|
||||
|
||||
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILEZERO_Tnnn(bxInstruction_c *i)
|
||||
{
|
||||
unsigned tile = i->dst();
|
||||
// AMX-FP16 //
|
||||
|
||||
if (tile >= BX_TILE_REGISTERS || ! BX_CPU_THIS_PTR amx->tile_valid(tile)) {
|
||||
BX_ERROR(("TILEZERO: invalid tile %d", tile));
|
||||
exception(BX_UD_EXCEPTION, 0);
|
||||
extern float32 convert_ne_fp16_to_fp32(float16 op);
|
||||
|
||||
// TDPFP16PS (AMX-FP16): for every destination element C[m][n], accumulates the
// dot product of FP16 pairs taken from row m of tsrc1 and column n of tsrc2,
// computed in FP32, and adds it to the existing FP32 value in the destination.
//
// Tile operands are validated by check_tiles() before any data is touched.
// Only the destination tile is modified; no other tile is cleared here.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TDPFP16PS_TnnnTrmTreg(bxInstruction_c *i)
{
  unsigned tile_dst = i->dst(), tile_src1 = i->src1(), tile_src2 = i->src2();
  check_tiles(i, tile_dst, tile_src1, tile_src2);

  //       R   C
  // A = m x k (tsrc1)
  // B = k x n (tsrc2)
  // C = m x n (tsrcdest)
  unsigned max_n = BX_CPU_THIS_PTR amx->tile_bytes_per_row(tile_dst) / 4; // FP32 elements per destination row
  unsigned max_m = BX_CPU_THIS_PTR amx->tile_num_rows(tile_dst);
  unsigned max_k = BX_CPU_THIS_PTR amx->tile_num_rows(tile_src2);

  AMX::TILE *tdst = &(BX_CPU_THIS_PTR amx->tile[tile_dst]);
  AMX::TILE *tsrc1 = &(BX_CPU_THIS_PTR amx->tile[tile_src1]);
  AMX::TILE *tsrc2 = &(BX_CPU_THIS_PTR amx->tile[tile_src2]);

  // "round to nearest even" rounding mode is used when doing each accumulation of the FMA.
  // output FP32 denormals are always flushed to zero and input denormals are always treated as zero.
  float_status_t status = prepare_ne_softfloat_status_helper();
  status.denormals_are_zeros = true;

  for (unsigned m=0; m < max_m; m++) {
    // Two partial sums per destination element: even slot for the low FP16 of
    // each pair, odd slot for the high FP16 of each pair.
    float32 tmp[32];
    for (unsigned n=0; n < 32; n++) tmp[n] = 0;

    for (unsigned k=0; k < max_k; k++) {
      for (unsigned n=0; n < max_n; n++) {
        tmp[2*n] = float32_fmadd(convert_ne_fp16_to_fp32(tsrc1->row[m].vmm16u(2*k)),
                                 convert_ne_fp16_to_fp32(tsrc2->row[k].vmm16u(2*n)), tmp[2*n], status);

        tmp[2*n+1] = float32_fmadd(convert_ne_fp16_to_fp32(tsrc1->row[m].vmm16u(2*k+1)),
                                   convert_ne_fp16_to_fp32(tsrc2->row[k].vmm16u(2*n+1)), tmp[2*n+1], status);
      }
    }

    // Fold the two partial sums together, then into the destination element.
    for (unsigned n=0; n < max_n; n++) {
      float32 tmpf32 = float32_add(tmp[2*n], tmp[2*n+1], status);
      tdst->row[m].vmm32u(n) = float32_add(tdst->row[m].vmm32u(n), tmpf32, status);
    }

    tdst->zero_upper_row_data32(m, max_n);
  }

  BX_CPU_THIS_PTR amx->set_tile_used(tile_dst);
  BX_CPU_THIS_PTR amx->tile[tile_dst].clear_upper_rows(max_m);
  BX_CPU_THIS_PTR amx->restart();

  BX_NEXT_INSTR(i);
}
|
||||
|
||||
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TILERELEASE(bxInstruction_c *i)
|
||||
// AMX-COMPLEX //
|
||||
|
||||
// TCMMRLFP16PS (AMX-COMPLEX): treats each FP16 pair in the source tiles as a
// complex number (even element = real, odd element = imaginary) and
// accumulates the REAL part of the complex matrix product into the FP32
// elements of the destination tile: Re(a*b) = a.re*b.re - a.im*b.im.
//
// Tile operands are validated by check_tiles() before any data is touched.
// The AMX state is not cleared on entry; only the destination tile is written.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TCMMRLFP16PS_TnnnTrmTreg(bxInstruction_c *i)
{
  unsigned tile_dst = i->dst(), tile_src1 = i->src1(), tile_src2 = i->src2();
  check_tiles(i, tile_dst, tile_src1, tile_src2);

  //       R   C
  // A = m x k (tsrc1)
  // B = k x n (tsrc2)
  // C = m x n (tsrcdest)
  unsigned max_n = BX_CPU_THIS_PTR amx->tile_bytes_per_row(tile_dst) / 4; // FP32 elements per destination row
  unsigned max_m = BX_CPU_THIS_PTR amx->tile_num_rows(tile_dst);
  unsigned max_k = BX_CPU_THIS_PTR amx->tile_num_rows(tile_src2);

  AMX::TILE *tdst = &(BX_CPU_THIS_PTR amx->tile[tile_dst]);
  AMX::TILE *tsrc1 = &(BX_CPU_THIS_PTR amx->tile[tile_src1]);
  AMX::TILE *tsrc2 = &(BX_CPU_THIS_PTR amx->tile[tile_src2]);

  // "round to nearest even" rounding mode is used when doing each accumulation of the FMA.
  // output FP32 denormals are always flushed to zero and input denormals are always treated as zero.
  float_status_t status = prepare_ne_softfloat_status_helper();
  status.denormals_are_zeros = true;

  for (unsigned m=0; m < max_m; m++) {
    // Even slots collect re*re products, odd slots collect the negated im*im products.
    float32 tmp[32];
    for (unsigned n=0; n < 32; n++) tmp[n] = 0;

    for (unsigned k=0; k < max_k; k++) {
      for (unsigned n=0; n < max_n; n++) {
        float32 s1r = convert_ne_fp16_to_fp32(tsrc1->row[m].vmm16u(2*k));   // real
        float32 s2r = convert_ne_fp16_to_fp32(tsrc2->row[k].vmm16u(2*n));   // real
        float32 s1i = convert_ne_fp16_to_fp32(tsrc1->row[m].vmm16u(2*k+1)); // imaginary
        float32 s2i = convert_ne_fp16_to_fp32(tsrc2->row[k].vmm16u(2*n+1)); // imaginary

        tmp[2*n]   = float32_muladd(s1r, s2r, tmp[2*n], 0, status); // real
        tmp[2*n+1] = float32_muladd(s1i, s2i, tmp[2*n+1], float_muladd_negate_product, status); // imaginary, negate for i^2 = -1
      }
    }

    // Combine both partial sums and accumulate into the destination element.
    for (unsigned n=0; n < max_n; n++) {
      float32 tmpf32 = float32_add(tmp[2*n], tmp[2*n+1], status);
      tdst->row[m].vmm32u(n) = float32_add(tdst->row[m].vmm32u(n), tmpf32, status);
    }

    tdst->zero_upper_row_data32(m, max_n);
  }

  BX_CPU_THIS_PTR amx->set_tile_used(tile_dst);
  BX_CPU_THIS_PTR amx->tile[tile_dst].clear_upper_rows(max_m);
  BX_CPU_THIS_PTR amx->restart();

  BX_NEXT_INSTR(i);
}
|
||||
|
||||
// TCMMIMFP16PS (AMX-COMPLEX): treats each FP16 pair in the source tiles as a
// complex number (even element = real, odd element = imaginary) and
// accumulates the IMAGINARY part of the complex matrix product into the FP32
// elements of the destination tile: Im(a*b) = a.im*b.re + a.re*b.im.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::TCMMIMFP16PS_TnnnTrmTreg(bxInstruction_c *i)
{
  const unsigned dst_idx = i->dst();
  const unsigned src1_idx = i->src1();
  const unsigned src2_idx = i->src2();
  check_tiles(i, dst_idx, src1_idx, src2_idx);

  // Matrix shapes (rows x columns):
  //   A = m x k (first source tile)
  //   B = k x n (second source tile)
  //   C = m x n (destination / accumulator tile)
  const unsigned cols  = BX_CPU_THIS_PTR amx->tile_bytes_per_row(dst_idx) / 4; // n: FP32 elements per row
  const unsigned rows  = BX_CPU_THIS_PTR amx->tile_num_rows(dst_idx);          // m
  const unsigned depth = BX_CPU_THIS_PTR amx->tile_num_rows(src2_idx);         // k

  AMX::TILE *c_tile = &(BX_CPU_THIS_PTR amx->tile[dst_idx]);
  AMX::TILE *a_tile = &(BX_CPU_THIS_PTR amx->tile[src1_idx]);
  AMX::TILE *b_tile = &(BX_CPU_THIS_PTR amx->tile[src2_idx]);

  // Each FMA accumulation step rounds to nearest even; FP32 results that
  // underflow are flushed to zero and denormal inputs are treated as zero.
  float_status_t status = prepare_ne_softfloat_status_helper();
  status.denormals_are_zeros = true;

  for (unsigned m = 0; m < rows; m++) {
    // Per-row partial sums: slot 2*n holds a.im*b.re, slot 2*n+1 holds a.re*b.im.
    float32 acc[32];
    for (unsigned slot = 0; slot < 32; slot++)
      acc[slot] = 0;

    for (unsigned k = 0; k < depth; k++) {
      for (unsigned n = 0; n < cols; n++) {
        const unsigned even = 2*n;
        const unsigned odd  = 2*n+1;

        float32 a_re = convert_ne_fp16_to_fp32(a_tile->row[m].vmm16u(2*k));   // real
        float32 b_re = convert_ne_fp16_to_fp32(b_tile->row[k].vmm16u(even));  // real
        float32 a_im = convert_ne_fp16_to_fp32(a_tile->row[m].vmm16u(2*k+1)); // imaginary
        float32 b_im = convert_ne_fp16_to_fp32(b_tile->row[k].vmm16u(odd));   // imaginary

        acc[even] = float32_muladd(a_im, b_re, acc[even], 0, status);
        acc[odd]  = float32_muladd(a_re, b_im, acc[odd], 0, status);
      }
    }

    // Merge the two partial sums, then add the result to the existing destination value.
    for (unsigned n = 0; n < cols; n++) {
      float32 pair_sum = float32_add(acc[2*n], acc[2*n+1], status);
      c_tile->row[m].vmm32u(n) = float32_add(c_tile->row[m].vmm32u(n), pair_sum, status);
    }

    c_tile->zero_upper_row_data32(m, cols);
  }

  BX_CPU_THIS_PTR amx->set_tile_used(dst_idx);
  c_tile->clear_upper_rows(rows);
  BX_CPU_THIS_PTR amx->restart();

  BX_NEXT_INSTR(i);
}
|
||||
|
||||
|
@ -86,7 +86,10 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VDPBF16PS_MASK_VpsHdqWdqR(bxInstruction_c
|
||||
unsigned len = i->getVL();
|
||||
Bit32u mask = (i->opmask() != 0) ? BX_READ_16BIT_OPMASK(i->opmask()) : 0xffff;
|
||||
|
||||
static float_status_t status = prepare_ne_softfloat_status_helper();
|
||||
// "round to nearest even" rounding mode is used when doing each accumulation of the FMA.
|
||||
// output denormals are always flushed to zero and input denormals are always treated as zero.
|
||||
float_status_t status = prepare_ne_softfloat_status_helper();
|
||||
status.denormals_are_zeros = true;
|
||||
|
||||
for (unsigned n=0, tmp_mask = mask; n < DWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
|
||||
if (tmp_mask & 0x1) {
|
||||
|
@ -29,8 +29,8 @@
|
||||
#if BX_SUPPORT_AVX
|
||||
|
||||
// FP32: s|eeeeeeee|mmmmmmmmmmmmmmmmmmmmmmm
|
||||
// BF16: s|eeeeeeee|mmmmmmmm
|
||||
// F16: s|eeeee|mmmmmmmmmmm
|
||||
// BF16: s|eeeeeeee|mmmmmmm
|
||||
// F16: s|eeeee|mmmmmmmmmm
|
||||
|
||||
float_status_t prepare_ne_softfloat_status_helper()
|
||||
{
|
||||
@ -42,12 +42,13 @@ float_status_t prepare_ne_softfloat_status_helper()
|
||||
status.float_suppress_exception = float_all_exceptions_mask;
|
||||
status.float_nan_handling_mode = float_first_operand_nan;
|
||||
status.flush_underflow_to_zero = true;
|
||||
status.denormals_are_zeros = true;
|
||||
// input denormals not converted to zero and handled normally
|
||||
status.denormals_are_zeros = false;
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static float32 convert_ne_fp16_to_fp32(float16 op)
|
||||
float32 convert_ne_fp16_to_fp32(float16 op)
|
||||
{
|
||||
static float_status_t status = prepare_ne_softfloat_status_helper();
|
||||
return float16_to_float32(op, status);
|
||||
|
@ -3601,6 +3601,9 @@ public: // for now...
|
||||
BX_SMF void TDPBUSD_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void TDPBUUD_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void TDPBF16PS_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void TDPFP16PS_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void TCMMRLFP16PS_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void TCMMIMFP16PS_TnnnTrmTreg(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void TILEZERO_Tnnn(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void TILERELEASE(bxInstruction_c *i) BX_CPP_AttrRegparmN(1);
|
||||
#endif
|
||||
|
@ -1166,7 +1166,15 @@ Bit32u bx_cpuid_t::get_std_cpuid_leaf_7_subleaf_1_eax(Bit32u extra) const
|
||||
eax |= BX_CPUID_STD7_SUBLEAF1_EAX_WRMSRNS;
|
||||
|
||||
// [20:20] reserved
|
||||
// [21:21] AMX-FB16 support
|
||||
|
||||
// [21:21] AMX-FP16 support
|
||||
#if BX_SUPPORT_AMX
|
||||
if (is_cpu_extension_supported(BX_ISA_AMX)) {
|
||||
if (is_cpu_extension_supported(BX_ISA_AMX_FP16))
|
||||
eax |= BX_CPUID_STD7_SUBLEAF1_EAX_AMX_FP16;
|
||||
}
|
||||
#endif
|
||||
|
||||
// [22:22] HRESET and CPUID leaf 0x20 support
|
||||
|
||||
// [23:23] AVX IFMA support
|
||||
@ -1207,7 +1215,15 @@ Bit32u bx_cpuid_t::get_std_cpuid_leaf_7_subleaf_1_edx(Bit32u extra) const
|
||||
#endif
|
||||
|
||||
// [7:6] reserved
|
||||
|
||||
// [8:8] AMX-COMPLEX instructions
|
||||
#if BX_SUPPORT_AMX
|
||||
if (is_cpu_extension_supported(BX_ISA_AMX)) {
|
||||
if (is_cpu_extension_supported(BX_ISA_AMX_COMPLEX))
|
||||
edx |= BX_CPUID_STD7_SUBLEAF1_EDX_AMX_COMPLEX;
|
||||
}
|
||||
#endif
|
||||
|
||||
// [9:9] reserved
|
||||
|
||||
// [10:10] AVX-VNNI-INT16 instructions
|
||||
|
@ -119,6 +119,8 @@ x86_feature(BX_ISA_AVX_NE_CONVERT, "avx_ne_convert") /* AVX-N
|
||||
x86_feature(BX_ISA_AMX, "amx") /* AMX Instructions */
|
||||
x86_feature(BX_ISA_AMX_INT8, "amx_int8") /* AMX-INT8 Instructions */
|
||||
x86_feature(BX_ISA_AMX_BF16, "amx_bf16") /* AMX-BF16 Instructions */
|
||||
x86_feature(BX_ISA_AMX_FP16, "amx_fp16") /* AMX-FP16 Instructions */
|
||||
x86_feature(BX_ISA_AMX_COMPLEX, "amx_complex") /* AMX-COMPLEX Instructions */
|
||||
#endif
|
||||
#endif
|
||||
x86_feature(BX_ISA_XAPIC, "xapic") /* XAPIC support */
|
||||
|
@ -1031,7 +1031,10 @@ static const Bit64u BxOpcodeGroup_VEX_0F3859[] = { last_opcode(ATTR_SSE_PREFIX_6
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F385A[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0 | ATTR_VL256 | ATTR_MOD_MEM, BX_IA_V256_VBROADCASTI128_VdqMdq) };
|
||||
|
||||
#if BX_SUPPORT_AMX
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F385C[] = { last_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBF16PS_TnnnTrmTreg) };
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F385C[] = {
|
||||
form_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPFP16PS_TnnnTrmTreg),
|
||||
last_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBF16PS_TnnnTrmTreg)
|
||||
};
|
||||
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F385E[] = {
|
||||
form_opcode(ATTR_SSE_NO_PREFIX | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBUUD_TnnnTrmTreg),
|
||||
@ -1039,6 +1042,11 @@ static const Bit64u BxOpcodeGroup_VEX_0F385E[] = {
|
||||
form_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBSSD_TnnnTrmTreg),
|
||||
last_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TDPBSUD_TnnnTrmTreg)
|
||||
};
|
||||
|
||||
// Decoder entries for VEX opcode 0F 38 6C (AMX-COMPLEX tile instructions):
//   no SSE prefix -> TCMMRLFP16PS
//   66 prefix     -> TCMMIMFP16PS
// Both entries carry the same attribute set as the other tile-compute opcode
// groups (VEX.W0, VL128, MODC0, IS64); last_opcode() marks the final entry.
static const Bit64u BxOpcodeGroup_VEX_0F386C[] = {
|
||||
form_opcode(ATTR_SSE_NO_PREFIX | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TCMMRLFP16PS_TnnnTrmTreg),
|
||||
last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0 | ATTR_VL128 | ATTR_MODC0 | ATTR_IS64, BX_IA_TCMMIMFP16PS_TnnnTrmTreg)
|
||||
};
|
||||
#endif
|
||||
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F3872[] = { last_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0, BX_IA_VCVTNEPS2BF16_Vbf16Wps) };
|
||||
@ -1968,7 +1976,11 @@ static const Bit64u *BxOpcodeTableVEX[256*3] = {
|
||||
/* 69 */ ( BxOpcodeGroup_ERR ),
|
||||
/* 6A */ ( BxOpcodeGroup_ERR ),
|
||||
/* 6B */ ( BxOpcodeGroup_ERR ),
|
||||
#if BX_SUPPORT_AMX
|
||||
/* 6C */ ( BxOpcodeGroup_VEX_0F386C ),
|
||||
#else
|
||||
/* 6C */ ( BxOpcodeGroup_ERR ),
|
||||
#endif
|
||||
/* 6D */ ( BxOpcodeGroup_ERR ),
|
||||
/* 6E */ ( BxOpcodeGroup_ERR ),
|
||||
/* 6F */ ( BxOpcodeGroup_ERR ),
|
||||
|
@ -2773,6 +2773,9 @@ bx_define_opcode(BX_IA_TDPBSUD_TnnnTrmTreg, "tdpbsud", "tdpbsud", NULL, &BX_CPU_
|
||||
bx_define_opcode(BX_IA_TDPBUSD_TnnnTrmTreg, "tdpbusd", "tdpbusd", NULL, &BX_CPU_C::TDPBUSD_TnnnTrmTreg, BX_ISA_AMX_INT8, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
|
||||
bx_define_opcode(BX_IA_TDPBUUD_TnnnTrmTreg, "tdpbuud", "tdpbuud", NULL, &BX_CPU_C::TDPBUUD_TnnnTrmTreg, BX_ISA_AMX_INT8, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
|
||||
bx_define_opcode(BX_IA_TDPBF16PS_TnnnTrmTreg, "tdpbf16ps", "tdpbf16ps", NULL, &BX_CPU_C::TDPBF16PS_TnnnTrmTreg, BX_ISA_AMX_BF16, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
|
||||
bx_define_opcode(BX_IA_TDPFP16PS_TnnnTrmTreg, "tdpfp16ps", "tdpfp16ps", NULL, &BX_CPU_C::TDPFP16PS_TnnnTrmTreg, BX_ISA_AMX_FP16, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
|
||||
bx_define_opcode(BX_IA_TCMMRLFP16PS_TnnnTrmTreg, "tcmmrlfp16ps", "tcmmrlfp16ps", NULL, &BX_CPU_C::TCMMRLFP16PS_TnnnTrmTreg, BX_ISA_AMX_COMPLEX, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
|
||||
bx_define_opcode(BX_IA_TCMMIMFP16PS_TnnnTrmTreg, "tcmmimfp16ps", "tcmmimfp16ps", NULL, &BX_CPU_C::TCMMIMFP16PS_TnnnTrmTreg, BX_ISA_AMX_COMPLEX, OP_Tnnn, OP_Trm, OP_Treg, OP_NONE, BX_PREPARE_AMX)
|
||||
#endif
|
||||
|
||||
#if BX_SUPPORT_AVX
|
||||
|
Loading…
Reference in New Issue
Block a user