Merge pull request #14 from bochs-emu/cpu/avx-vnni-int8
Add support for AVX-encoded VNNI INT8 extensions; extract IFMA52 code into a separate file
This commit is contained in:
commit
cdc48128be
@ -46,6 +46,7 @@ AVX_OBJS = \
|
||||
avx_pfp.o \
|
||||
avx_cvt.o \
|
||||
avx_fma.o \
|
||||
avx_ifma52.o \
|
||||
avx2.o \
|
||||
avx512.o \
|
||||
avx512_broadcast.o \
|
||||
@ -117,7 +118,7 @@ avx512.o: avx512.@CPP_SUFFIX@ ../../bochs.h ../../config.h ../../osdep.h \
|
||||
../fpu/status_w.h ../fpu/control_w.h ../crregs.h ../descriptor.h \
|
||||
../decoder/instr.h ../lazy_flags.h ../tlb.h ../icache.h ../apic.h \
|
||||
../xmm.h ../vmx.h ../svm.h ../cpuid.h ../stack.h ../access.h \
|
||||
../simd_int.h ../simd_compare.h ../wide_int.h
|
||||
../simd_int.h ../simd_compare.h
|
||||
avx512_bitalg.o: avx512_bitalg.@CPP_SUFFIX@ ../../bochs.h ../../config.h \
|
||||
../../osdep.h ../../bx_debug/debug.h ../../config.h ../../osdep.h \
|
||||
../../cpu/decoder/decoder.h ../../gui/paramtree.h ../../logio.h \
|
||||
@ -242,6 +243,14 @@ avx_fma.o: avx_fma.@CPP_SUFFIX@ ../../bochs.h ../../config.h ../../osdep.h \
|
||||
../decoder/instr.h ../lazy_flags.h ../tlb.h ../icache.h ../apic.h \
|
||||
../xmm.h ../vmx.h ../svm.h ../cpuid.h ../stack.h ../access.h \
|
||||
../simd_pfp.h
|
||||
avx_ifma52.o: avx_ifma52.@CPP_SUFFIX@ ../../bochs.h ../../config.h ../../osdep.h \
|
||||
../../bx_debug/debug.h ../../config.h ../../osdep.h \
|
||||
../../cpu/decoder/decoder.h ../../gui/paramtree.h ../../logio.h \
|
||||
../../instrument/stubs/instrument.h ../cpu.h \
|
||||
../decoder/decoder.h ../i387.h ../fpu/softfloat.h ../fpu/tag_w.h \
|
||||
../fpu/status_w.h ../fpu/control_w.h ../crregs.h ../descriptor.h \
|
||||
../decoder/instr.h ../lazy_flags.h ../tlb.h ../icache.h ../apic.h \
|
||||
../xmm.h ../vmx.h ../svm.h ../cpuid.h ../stack.h ../access.h ../wide_int.h
|
||||
avx_pfp.o: avx_pfp.@CPP_SUFFIX@ ../../bochs.h ../../config.h ../../osdep.h \
|
||||
../../bx_debug/debug.h ../../config.h ../../osdep.h \
|
||||
../../cpu/decoder/decoder.h ../../gui/paramtree.h ../../logio.h \
|
||||
|
@ -30,7 +30,6 @@
|
||||
|
||||
#include "simd_int.h"
|
||||
#include "simd_compare.h"
|
||||
#include "wide_int.h"
|
||||
|
||||
// compare
|
||||
|
||||
@ -2271,89 +2270,6 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMULTISHIFTQB_MASK_VdqHdqWdqR(bxInstructi
|
||||
BX_NEXT_INSTR(i);
|
||||
}
|
||||
|
||||
// 52-bit integer FMA
|
||||
|
||||
BX_CPP_INLINE Bit64u pmadd52luq_scalar(Bit64u dst, Bit64u op1, Bit64u op2)
|
||||
{
|
||||
op1 &= BX_CONST64(0x000fffffffffffff);
|
||||
op2 &= BX_CONST64(0x000fffffffffffff);
|
||||
|
||||
return dst + ((op1 * op2) & BX_CONST64(0x000fffffffffffff));
|
||||
}
|
||||
|
||||
BX_CPP_INLINE Bit64u pmadd52huq_scalar(Bit64u dst, Bit64u op1, Bit64u op2)
|
||||
{
|
||||
op1 &= BX_CONST64(0x000fffffffffffff);
|
||||
op2 &= BX_CONST64(0x000fffffffffffff);
|
||||
|
||||
Bit128u product_128;
|
||||
long_mul(&product_128, op1, op2);
|
||||
|
||||
Bit64u temp = (product_128.lo >> 52) | ((product_128.hi & BX_CONST64(0x000000ffffffffff)) << 12);
|
||||
|
||||
return dst + temp;
|
||||
}
|
||||
|
||||
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *i)
|
||||
{
|
||||
BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
|
||||
unsigned len = i->getVL();
|
||||
|
||||
for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
|
||||
dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
|
||||
}
|
||||
|
||||
BX_WRITE_AVX_REGZ(i->dst(), dst, len);
|
||||
BX_NEXT_INSTR(i);
|
||||
}
|
||||
|
||||
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
|
||||
{
|
||||
BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
|
||||
Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
|
||||
unsigned len = i->getVL();
|
||||
|
||||
for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
|
||||
if (tmp_mask & 0x1)
|
||||
dst.vmm64u(n) = pmadd52luq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
|
||||
else if (i->isZeroMasking())
|
||||
dst.vmm64u(n) = 0;
|
||||
}
|
||||
|
||||
BX_WRITE_AVX_REGZ(i->dst(), dst, len);
|
||||
BX_NEXT_INSTR(i);
|
||||
}
|
||||
|
||||
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *i)
|
||||
{
|
||||
BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
|
||||
unsigned len = i->getVL();
|
||||
|
||||
for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
|
||||
dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
|
||||
}
|
||||
|
||||
BX_WRITE_AVX_REGZ(i->dst(), dst, len);
|
||||
BX_NEXT_INSTR(i);
|
||||
}
|
||||
|
||||
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
|
||||
{
|
||||
BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2()), dst = BX_READ_AVX_REG(i->dst());
|
||||
Bit32u mask = BX_READ_8BIT_OPMASK(i->opmask());
|
||||
unsigned len = i->getVL();
|
||||
|
||||
for (unsigned n=0, tmp_mask = mask; n < QWORD_ELEMENTS(len); n++, tmp_mask >>= 1) {
|
||||
if (tmp_mask & 0x1)
|
||||
dst.vmm64u(n) = pmadd52huq_scalar(dst.vmm64u(n), op1.vmm64u(n), op2.vmm64u(n));
|
||||
else if (i->isZeroMasking())
|
||||
dst.vmm64u(n) = 0;
|
||||
}
|
||||
|
||||
BX_WRITE_AVX_REGZ(i->dst(), dst, len);
|
||||
BX_NEXT_INSTR(i);
|
||||
}
|
||||
|
||||
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VP2INTERSECTD_KGqHdqWdqR(bxInstruction_c *i)
|
||||
{
|
||||
BxPackedAvxRegister op1 = BX_READ_AVX_REG(i->src1()), op2 = BX_READ_AVX_REG(i->src2());
|
||||
|
120
bochs/cpu/avx/avx_ifma52.cc
Normal file
120
bochs/cpu/avx/avx_ifma52.cc
Normal file
@ -0,0 +1,120 @@
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
// $Id$
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Copyright (c) 2022 Stanislav Shwartsman
|
||||
// Written by Stanislav Shwartsman [sshwarts at sourceforge net]
|
||||
//
|
||||
// This library is free software; you can redistribute it and/or
|
||||
// modify it under the terms of the GNU Lesser General Public
|
||||
// License as published by the Free Software Foundation; either
|
||||
// version 2 of the License, or (at your option) any later version.
|
||||
//
|
||||
// This library is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
// Lesser General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Lesser General Public
|
||||
// License along with this library; if not, write to the Free Software
|
||||
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
//
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define NEED_CPU_REG_SHORTCUTS 1
|
||||
#include "bochs.h"
|
||||
#include "cpu.h"
|
||||
#define LOG_THIS BX_CPU_THIS_PTR
|
||||
|
||||
#if BX_SUPPORT_AVX || BX_SUPPORT_EVEX
|
||||
|
||||
#include "wide_int.h"
|
||||
|
||||
// 52-bit integer FMA
|
||||
|
||||
// Low half of the 52-bit integer multiply-accumulate:
// returns dst + low 52 bits of (op1[51:0] * op2[51:0]).
BX_CPP_INLINE Bit64u pmadd52luq_scalar(Bit64u dst, Bit64u op1, Bit64u op2)
{
  const Bit64u mask52 = BX_CONST64(0x000fffffffffffff);
  Bit64u product_lo = (op1 & mask52) * (op2 & mask52);
  return dst + (product_lo & mask52);
}
|
||||
|
||||
// High half of the 52-bit integer multiply-accumulate:
// returns dst + bits [103:52] of the 104-bit product op1[51:0] * op2[51:0].
BX_CPP_INLINE Bit64u pmadd52huq_scalar(Bit64u dst, Bit64u op1, Bit64u op2)
{
  const Bit64u mask52 = BX_CONST64(0x000fffffffffffff);

  Bit128u product_128;
  long_mul(&product_128, op1 & mask52, op2 & mask52);

  // bits [103:52]: 12 bits from the top of product.lo, 40 bits from the bottom of product.hi
  Bit64u hi52 = (product_128.lo >> 52) | ((product_128.hi & BX_CONST64(0x000000ffffffffff)) << 12);

  return dst + hi52;
}
|
||||
|
||||
// VPMADD52LUQ: per 64-bit lane, accumulate the low 52 bits of the
// 52x52-bit product of src1 and src2 into the destination register.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister src1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister src2 = BX_READ_AVX_REG(i->src2());
  BxPackedAvxRegister acc  = BX_READ_AVX_REG(i->dst());
  unsigned vl = i->getVL();

  for (unsigned elem = 0; elem < QWORD_ELEMENTS(vl); elem++)
    acc.vmm64u(elem) = pmadd52luq_scalar(acc.vmm64u(elem), src1.vmm64u(elem), src2.vmm64u(elem));

  BX_WRITE_AVX_REGZ(i->dst(), acc, vl);
  BX_NEXT_INSTR(i);
}
|
||||
|
||||
// VPMADD52HUQ: per 64-bit lane, accumulate the high 52 bits of the
// 52x52-bit product of src1 and src2 into the destination register.
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister src1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister src2 = BX_READ_AVX_REG(i->src2());
  BxPackedAvxRegister acc  = BX_READ_AVX_REG(i->dst());
  unsigned vl = i->getVL();

  for (unsigned elem = 0; elem < QWORD_ELEMENTS(vl); elem++)
    acc.vmm64u(elem) = pmadd52huq_scalar(acc.vmm64u(elem), src1.vmm64u(elem), src2.vmm64u(elem));

  BX_WRITE_AVX_REGZ(i->dst(), acc, vl);
  BX_NEXT_INSTR(i);
}
|
||||
|
||||
#endif
|
||||
|
||||
#if BX_SUPPORT_EVEX
|
||||
|
||||
// EVEX-masked VPMADD52LUQ: lanes whose opmask bit is set are updated;
// unset lanes are either zeroed (zero-masking) or left unchanged (merge-masking).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister src1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister src2 = BX_READ_AVX_REG(i->src2());
  BxPackedAvxRegister acc  = BX_READ_AVX_REG(i->dst());
  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());
  unsigned vl = i->getVL();

  for (unsigned elem = 0; elem < QWORD_ELEMENTS(vl); elem++) {
    if (opmask & (1 << elem))
      acc.vmm64u(elem) = pmadd52luq_scalar(acc.vmm64u(elem), src1.vmm64u(elem), src2.vmm64u(elem));
    else if (i->isZeroMasking())
      acc.vmm64u(elem) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), acc, vl);
  BX_NEXT_INSTR(i);
}
|
||||
|
||||
// EVEX-masked VPMADD52HUQ: lanes whose opmask bit is set are updated;
// unset lanes are either zeroed (zero-masking) or left unchanged (merge-masking).
void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c *i)
{
  BxPackedAvxRegister src1 = BX_READ_AVX_REG(i->src1());
  BxPackedAvxRegister src2 = BX_READ_AVX_REG(i->src2());
  BxPackedAvxRegister acc  = BX_READ_AVX_REG(i->dst());
  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());
  unsigned vl = i->getVL();

  for (unsigned elem = 0; elem < QWORD_ELEMENTS(vl); elem++) {
    if (opmask & (1 << elem))
      acc.vmm64u(elem) = pmadd52huq_scalar(acc.vmm64u(elem), src1.vmm64u(elem), src2.vmm64u(elem));
    else if (i->isZeroMasking())
      acc.vmm64u(elem) = 0;
  }

  BX_WRITE_AVX_REGZ(i->dst(), acc, vl);
  BX_NEXT_INSTR(i);
}
|
||||
|
||||
#endif
|
@ -49,6 +49,13 @@ AVX_3OP_VNNI(VPDPBUSDS_VdqHdqWdqR, xmm_pdpbusds)
|
||||
AVX_3OP_VNNI(VPDPWSSD_VdqHdqWdqR, xmm_pdpwssd)
|
||||
AVX_3OP_VNNI(VPDPWSSDS_VdqHdqWdqR, xmm_pdpwssds)
|
||||
|
||||
AVX_3OP_VNNI(VPDPBSSD_VdqHdqWdqR, xmm_pdpbssd)
|
||||
AVX_3OP_VNNI(VPDPBSSDS_VdqHdqWdqR, xmm_pdpbssds)
|
||||
AVX_3OP_VNNI(VPDPBSUD_VdqHdqWdqR, xmm_pdpbsud)
|
||||
AVX_3OP_VNNI(VPDPBSUDS_VdqHdqWdqR, xmm_pdpbsuds)
|
||||
AVX_3OP_VNNI(VPDPBUUD_VdqHdqWdqR, xmm_pdpbuud)
|
||||
AVX_3OP_VNNI(VPDPBUUDS_VdqHdqWdqR, xmm_pdpbuuds)
|
||||
|
||||
#endif
|
||||
|
||||
#if BX_SUPPORT_EVEX
|
||||
|
@ -3091,6 +3091,18 @@ public: // for now...
|
||||
BX_SMF void VPDPWSSD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPDPWSSDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
|
||||
/* AVX encoded IFMA instructions */
|
||||
BX_SMF void VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
|
||||
/* AVX encoded VNNI INT8 instructions */
|
||||
BX_SMF void VPDPBSSD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPDPBSSDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPDPBSUD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPDPBSUDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPDPBUUD_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPDPBUUDS_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
|
||||
// AVX512 OPMASK instructions (VEX encoded)
|
||||
BX_SMF void KADDB_KGbKHbKEbR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void KANDB_KGbKHbKEbR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
@ -3748,9 +3760,7 @@ public: // for now...
|
||||
BX_SMF void VPMOVD2M_KGwWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPMOVQ2M_KGbWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
|
||||
BX_SMF void VPMADD52LUQ_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPMADD52LUQ_MASK_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPMADD52HUQ_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
BX_SMF void VPMADD52HUQ_MASK_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
|
||||
BX_SMF void VPMULTISHIFTQB_VdqHdqWdqR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
|
||||
|
@ -114,6 +114,7 @@ static const char *cpu_feature_name[] =
|
||||
"avx512vp2intersect", // BX_ISA_AVX512_VP2INTERSECT
|
||||
"avx_vnni", // BX_ISA_AVX_VNNI
|
||||
"avx_ifma", // BX_ISA_AVX_IFMA
|
||||
"avx_vnni_int8", // BX_ISA_AVX_VNNI_INT8
|
||||
"xapic", // BX_ISA_XAPIC
|
||||
"x2apic", // BX_ISA_X2APIC
|
||||
"xapicext", // BX_ISA_XAPICEXT
|
||||
|
@ -109,6 +109,7 @@ enum x86_feature_name {
|
||||
BX_ISA_AVX512_VP2INTERSECT, /* AVX-512 VP2INTERSECT Instructions */
|
||||
BX_ISA_AVX_VNNI, /* AVX encoded VNNI Instructions */
|
||||
BX_ISA_AVX_IFMA, /* AVX encoded IFMA Instructions */
|
||||
BX_ISA_AVX_VNNI_INT8, /* AVX encoded VNNI-INT8 Instructions */
|
||||
BX_ISA_XAPIC, /* XAPIC support */
|
||||
BX_ISA_X2APIC, /* X2APIC support */
|
||||
BX_ISA_XAPIC_EXT, /* XAPIC Extensions support */
|
||||
|
@ -994,8 +994,20 @@ static const Bit64u BxOpcodeGroup_VEX_0F3847[] = {
|
||||
last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W1, BX_IA_VPSLLVQ_VdqHdqWdq)
|
||||
};
|
||||
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F3850[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSD_VdqHdqWdq) };
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F3851[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSDS_VdqHdqWdq) };
|
||||
// VEX 0F 38 /50: byte dot-product dword accumulate, selected by SSE prefix
static const Bit64u BxOpcodeGroup_VEX_0F3850[] = {
  form_opcode(ATTR_SSE_NO_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUD_VdqHdqWdq),  // NP: unsigned x unsigned
  form_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSD_VdqHdqWdq),  // 66: unsigned x signed
  form_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0, BX_IA_VPDPBSSD_VdqHdqWdq),  // F3: signed x signed
  last_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSUD_VdqHdqWdq)   // F2: signed x unsigned
};
|
||||
|
||||
// VEX 0F 38 /51: saturating byte dot-product dword accumulate.
// Bugfix: VPDPBSSDS is encoded with the F3 prefix (the table listed F2
// twice, which made VPDPBSSDS unreachable and aliased it onto the F2
// slot that belongs to VPDPBSUDS). Mirrors the 0F 38 /50 group above.
static const Bit64u BxOpcodeGroup_VEX_0F3851[] = {
  form_opcode(ATTR_SSE_NO_PREFIX | ATTR_VEX_W0, BX_IA_VPDPBUUDS_VdqHdqWdq),  // NP: unsigned x unsigned
  form_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPBUSDS_VdqHdqWdq),  // 66: unsigned x signed
  form_opcode(ATTR_SSE_PREFIX_F3 | ATTR_VEX_W0, BX_IA_VPDPBSSDS_VdqHdqWdq),  // F3: signed x signed
  last_opcode(ATTR_SSE_PREFIX_F2 | ATTR_VEX_W0, BX_IA_VPDPBSUDS_VdqHdqWdq)   // F2: signed x unsigned
};
|
||||
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F3852[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPWSSD_VdqHdqWdq) };
|
||||
static const Bit64u BxOpcodeGroup_VEX_0F3853[] = { last_opcode(ATTR_SSE_PREFIX_66 | ATTR_VEX_W0, BX_IA_VPDPWSSDS_VdqHdqWdq) };
|
||||
|
||||
|
@ -2421,6 +2421,15 @@ bx_define_opcode(BX_IA_VPMADD52LUQ_VdqHdqWdq, "vpmadd52luq", "vpmadd52luq", &BX_
|
||||
bx_define_opcode(BX_IA_VPMADD52HUQ_VdqHdqWdq, "vpmadd52huq", "vpmadd52huq", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPMADD52HUQ_VdqHdqWdqR, BX_ISA_AVX_IFMA, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
|
||||
// AVX IFMA
|
||||
|
||||
// AVX VNNI INT8
|
||||
bx_define_opcode(BX_IA_VPDPBSSD_VdqHdqWdq, "vpdpbssd", "vpdpbssd", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSSD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
|
||||
bx_define_opcode(BX_IA_VPDPBSSDS_VdqHdqWdq, "vpdpbssds", "vpdpbssds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSSDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
|
||||
bx_define_opcode(BX_IA_VPDPBSUD_VdqHdqWdq, "vpdpbsud", "vpdpbsud", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSUD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
|
||||
bx_define_opcode(BX_IA_VPDPBSUDS_VdqHdqWdq, "vpdpbsuds", "vpdpbsuds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBSUDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
|
||||
bx_define_opcode(BX_IA_VPDPBUUD_VdqHdqWdq, "vpdpbuud", "vpdpbuud", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBUUD_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
|
||||
bx_define_opcode(BX_IA_VPDPBUUDS_VdqHdqWdq, "vpdpbuuds", "vpdpbuuds", &BX_CPU_C::LOAD_Vector, &BX_CPU_C::VPDPBUUDS_VdqHdqWdqR, BX_ISA_AVX_VNNI_INT8, OP_Vdq, OP_Hdq, OP_Wdq, OP_NONE, BX_PREPARE_AVX)
|
||||
// AVX VNNI INT8
|
||||
|
||||
// BMI1 - VexW64 aliased
|
||||
bx_define_opcode(BX_IA_ANDN_GdBdEd, "andn", "andnl", &BX_CPU_C::LOAD_Ed, &BX_CPU_C::ANDN_GdBdEdR, BX_ISA_BMI1, OP_Gd, OP_Bd, OP_Ed, OP_NONE, 0)
|
||||
bx_define_opcode(BX_IA_ANDN_GqBqEq, "andn", "andnq", &BX_CPU_C::LOAD_Eq, &BX_CPU_C::ANDN_GqBqEqR, BX_ISA_BMI1, OP_Gq, OP_Bq, OP_Eq, OP_NONE, 0)
|
||||
|
@ -1775,6 +1775,87 @@ BX_CPP_INLINE void xmm_pdpbusds(BxPackedXmmRegister *dst, BxPackedXmmRegister *o
|
||||
}
|
||||
}
|
||||
|
||||
// VPDPBSSD kernel: for each of the 4 dword lanes, add the dot product of
// four signed bytes of op1 with four signed bytes of op2 (no saturation).
BX_CPP_INLINE void xmm_pdpbssd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    Bit32s dot = 0;
    for (unsigned b = 0; b < 4; b++)
      dot += (Bit32s) op1->xmmsbyte(lane*4 + b) * (Bit32s) op2->xmmsbyte(lane*4 + b);

    dst->xmm32s(lane) += dot;
  }
}
|
||||
|
||||
// VPDPBSSDS kernel: signed x signed byte dot product per dword lane,
// accumulated in 64 bits and saturated to a signed dword.
BX_CPP_INLINE void xmm_pdpbssds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    Bit32s dot = 0;
    for (unsigned b = 0; b < 4; b++)
      dot += (Bit32s) op1->xmmsbyte(lane*4 + b) * (Bit32s) op2->xmmsbyte(lane*4 + b);

    dst->xmm32s(lane) = SaturateQwordSToDwordS((Bit64s) dst->xmm32s(lane) + dot);
  }
}
|
||||
|
||||
// VPDPBSUD kernel: signed bytes of op1 times unsigned bytes of op2,
// dot product added to each dword lane (no saturation).
BX_CPP_INLINE void xmm_pdpbsud(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    Bit32s dot = 0;
    for (unsigned b = 0; b < 4; b++)
      dot += (Bit32s) op1->xmmsbyte(lane*4 + b) * (Bit32u) op2->xmmubyte(lane*4 + b);

    dst->xmm32s(lane) += dot;
  }
}
|
||||
|
||||
// VPDPBSUDS kernel: signed x unsigned byte dot product per dword lane,
// accumulated in 64 bits and saturated to a signed dword.
BX_CPP_INLINE void xmm_pdpbsuds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    Bit32s dot = 0;
    for (unsigned b = 0; b < 4; b++)
      dot += (Bit32s) op1->xmmsbyte(lane*4 + b) * (Bit32u) op2->xmmubyte(lane*4 + b);

    dst->xmm32s(lane) = SaturateQwordSToDwordS((Bit64s) dst->xmm32s(lane) + dot);
  }
}
|
||||
|
||||
// VPDPBUUD kernel: unsigned x unsigned byte dot product added to each
// dword lane (no saturation).
BX_CPP_INLINE void xmm_pdpbuud(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    Bit32u dot = 0;
    for (unsigned b = 0; b < 4; b++)
      dot += (Bit32u) op1->xmmubyte(lane*4 + b) * (Bit32u) op2->xmmubyte(lane*4 + b);

    dst->xmm32u(lane) += dot;
  }
}
|
||||
|
||||
// VPDPBUUDS kernel: unsigned x unsigned byte dot product per dword lane,
// accumulated in 64 bits and saturated to an unsigned dword.
BX_CPP_INLINE void xmm_pdpbuuds(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
{
  for (unsigned lane = 0; lane < 4; lane++) {
    Bit32u dot = 0;
    for (unsigned b = 0; b < 4; b++)
      dot += (Bit32u) op1->xmmubyte(lane*4 + b) * (Bit32u) op2->xmmubyte(lane*4 + b);

    dst->xmm32u(lane) = SaturateQwordUToDwordU((Bit64u) dst->xmm32u(lane) + dot);
  }
}
|
||||
|
||||
BX_CPP_INLINE void xmm_pdpwssd(BxPackedXmmRegister *dst, BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2)
|
||||
{
|
||||
for(unsigned n=0; n<4; n++)
|
||||
|
Loading…
Reference in New Issue
Block a user