code refactor towards fixing issue #2 (#112)

Next step: new methods for VEXPANDPS_MASK_VpsWpsR and
VEXPANDPD_MASK_VpdWpdR with a mask and a memory operand will be
introduced, with fault-suppression support.
The legacy load methods aren't suitable here because these loads
actually read only POPCNT(mask) elements from the memory source.
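
For context on the POPCNT(mask) note above: an expand operation consumes its
source contiguously while scattering into destination elements selected by the
mask, so a memory-sourced expand reads only the first POPCNT(mask) elements of
the source. Below is a minimal scalar sketch of that semantic; the names are
illustrative and this is not the Bochs implementation.

#include <cstdint>

// Hypothetical scalar model of a masked expand of 32-bit elements.
// Destination element n takes the next unconsumed source element whenever
// mask bit n is set, so at most popcount(mask) elements of 'src' are read.
// With a memory source this is why a full-width legacy load is wrong and
// why faults past the last consumed element must be suppressed.
static void expand_f32(float *dst, const float *src,
                       uint16_t mask, bool zero_masking,
                       unsigned num_elements)
{
  unsigned k = 0; // next source element to consume
  for (unsigned n = 0; n < num_elements; n++, mask >>= 1) {
    if (mask & 0x1)
      dst[n] = src[k++];   // only src[0..popcount(mask)-1] is touched
    else if (zero_masking)
      dst[n] = 0.0f;       // merge-masking leaves dst[n] unchanged
  }
}

The refactored register-source handlers below follow the same shape, and once
the remaining mask bits are all zero they can stop early, since no further
destination element will be modified.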
Stanislav Shwartsman authored 2023-10-28 22:21:42 +03:00, committed by GitHub
parent 1a420ddf8b
commit 96b887a9bb
2 changed files with 73 additions and 108 deletions


@@ -1859,120 +1859,96 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VBLENDMPD_MASK_VpdHpdWpdR(bxInstruction_c
 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPEXPANDB_MASK_VdqWdqR(bxInstruction_c *i)
 {
   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
+  if (i->isZeroMasking())
+    result.clear();
+  else
+    result = BX_READ_AVX_REG(i->dst());
 
-  Bit64u opmask = BX_READ_OPMASK(i->opmask()), mask = opmask;
-  unsigned len = i->getVL(), n = 0, k = 0;
+  Bit64u opmask = BX_READ_OPMASK(i->opmask());
+  unsigned len = i->getVL(), k = 0;
 
-  for (; n < len*16; n++, mask >>= 1) {
-    if (mask & 0x1) {
+  for (unsigned n = 0; n < len*16; n++, opmask >>= 1) {
+    if (! opmask) break;
+    if (opmask & 0x1) {
       result.vmmubyte(n) = op.vmmubyte(k);
       k++;
     }
-    else {
-      result.vmmubyte(n) = 0;
-    }
   }
 
-  if (i->isZeroMasking()) {
-    BX_WRITE_AVX_REGZ(i->dst(), result, len);
-  }
-  else {
-    for (unsigned n=0; n < len; n++, opmask >>= 16)
-      xmm_pblendb(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);
-
-    BX_CLEAR_AVX_REGZ(i->dst(), len);
-  }
+  BX_WRITE_AVX_REGZ(i->dst(), result, len);
 
   BX_NEXT_INSTR(i);
 }
 
 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPEXPANDW_MASK_VdqWdqR(bxInstruction_c *i)
 {
   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
+  if (i->isZeroMasking())
+    result.clear();
+  else
+    result = BX_READ_AVX_REG(i->dst());
 
-  Bit32u opmask = BX_READ_32BIT_OPMASK(i->opmask()), mask = opmask;
-  unsigned len = i->getVL(), n = 0, k = 0;
+  Bit32u opmask = BX_READ_32BIT_OPMASK(i->opmask());
+  unsigned len = i->getVL(), k = 0;
 
-  for (; n < len*8; n++, mask >>= 1) {
-    if (mask & 0x1) {
+  for (unsigned n = 0; n < len*8; n++, opmask >>= 1) {
+    if (! opmask) break;
+    if (opmask & 0x1) {
       result.vmm16u(n) = op.vmm16u(k);
       k++;
     }
-    else {
-      result.vmm16u(n) = 0;
-    }
   }
 
-  if (i->isZeroMasking()) {
-    BX_WRITE_AVX_REGZ(i->dst(), result, len);
-  }
-  else {
-    for (unsigned n=0; n < len; n++, opmask >>= 8)
-      xmm_pblendw(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);
-
-    BX_CLEAR_AVX_REGZ(i->dst(), len);
-  }
+  BX_WRITE_AVX_REGZ(i->dst(), result, len);
 
   BX_NEXT_INSTR(i);
 }
 
 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXPANDPS_MASK_VpsWpsR(bxInstruction_c *i)
 {
   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
+  if (i->isZeroMasking())
+    result.clear();
+  else
+    result = BX_READ_AVX_REG(i->dst());
 
-  Bit32u opmask = BX_READ_16BIT_OPMASK(i->opmask()), mask = opmask;
-  unsigned len = i->getVL(), n = 0, k = 0;
+  Bit32u opmask = BX_READ_16BIT_OPMASK(i->opmask());
+  unsigned len = i->getVL(), k = 0;
 
-  for (; n < len*4; n++, mask >>= 1) {
-    if (mask & 0x1) {
+  for (unsigned n = 0; n < len*4; n++, opmask >>= 1) {
+    if (! opmask) break;
+    if (opmask & 0x1) {
       result.vmm32u(n) = op.vmm32u(k);
       k++;
     }
-    else {
-      result.vmm32u(n) = 0;
-    }
   }
 
-  if (i->isZeroMasking()) {
-    BX_WRITE_AVX_REGZ(i->dst(), result, len);
-  }
-  else {
-    for (unsigned n=0; n < len; n++, opmask >>= 4)
-      xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);
-
-    BX_CLEAR_AVX_REGZ(i->dst(), len);
-  }
+  BX_WRITE_AVX_REGZ(i->dst(), result, len);
 
   BX_NEXT_INSTR(i);
 }
 
 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VEXPANDPD_MASK_VpdWpdR(bxInstruction_c *i)
 {
   BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()), result;
+  if (i->isZeroMasking())
+    result.clear();
+  else
+    result = BX_READ_AVX_REG(i->dst());
 
-  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask()), mask = opmask;
-  unsigned len = i->getVL(), n = 0, k = 0;
+  Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());
+  unsigned len = i->getVL(), k = 0;
 
-  for (; n < len*2; n++, mask >>= 1) {
-    if (mask & 0x1) {
+  for (unsigned n = 0; n < len*2; n++, opmask >>= 1) {
+    if (! opmask) break;
+    if (opmask & 0x1) {
       result.vmm64u(n) = op.vmm64u(k);
       k++;
     }
-    else {
-      result.vmm64u(n) = 0;
-    }
   }
 
-  if (i->isZeroMasking()) {
-    BX_WRITE_AVX_REGZ(i->dst(), result, len);
-  }
-  else {
-    for (unsigned n=0; n < len; n++, opmask >>= 2)
-      xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);
-
-    BX_CLEAR_AVX_REGZ(i->dst(), len);
-  }
+  BX_WRITE_AVX_REGZ(i->dst(), result, len);
 
   BX_NEXT_INSTR(i);
 }
 
@@ -1984,11 +1960,12 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMPRESSB_MASK_WdqVdq(bxInstruction_c *i
   unsigned len = i->getVL(), n = 0, k = 0;
 
   for (; n < len*16; n++, opmask >>= 1) {
+    if (! opmask) break;
+
     if (opmask & 0x1) {
       result.vmmubyte(k) = op.vmmubyte(n);
       k++;
     }
-    if (! opmask) break;
   }
 
   Bit64u writemask = (BX_CONST64(1) << k) - 1;
 
@@ -2012,11 +1989,12 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMPRESSW_MASK_WdqVdq(bxInstruction_c *i
   unsigned len = i->getVL(), n = 0, k = 0;
 
   for (; n < len*8; n++, opmask >>= 1) {
+    if (! opmask) break;
+
     if (opmask & 0x1) {
       result.vmm16u(k) = op.vmm16u(n);
       k++;
     }
-    if (! opmask) break;
   }
 
   Bit32u writemask = (1 << k) - 1;
 
@@ -2040,11 +2018,12 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCOMPRESSPS_MASK_WpsVps(bxInstruction_c *i
   unsigned len = i->getVL(), n = 0, k = 0;
 
   for (; n < len*4; n++, opmask >>= 1) {
+    if (! opmask) break;
+
     if (opmask & 0x1) {
       result.vmm32u(k) = op.vmm32u(n);
       k++;
     }
-    if (! opmask) break;
   }
 
   Bit32u writemask = (1 << k) - 1;
 
@@ -2068,11 +2047,12 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCOMPRESSPD_MASK_WpdVpd(bxInstruction_c *i
   unsigned len = i->getVL(), n = 0, k = 0;
 
   for (; n < len*2; n++, opmask >>= 1) {
+    if (! opmask) break;
+
     if (opmask & 0x1) {
       result.vmm64u(k) = op.vmm64u(n);
       k++;
     }
-    if (! opmask) break;
   }
 
   Bit32u writemask = (1 << k) - 1;


@@ -395,54 +395,44 @@ AVX512_CVT32_TO_64(VCVTTPS2UQQ_MASK_VdqWpsR, float32_to_uint64_round_to_zero)
 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCVTUDQ2PD_MASK_VpdWdqR(bxInstruction_c *i)
 {
   BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
-  BxPackedAvxRegister result;
   unsigned len = i->getVL();
+  BxPackedAvxRegister result;
+  if (i->isZeroMasking())
+    result.clear();
+  else
+    result = BX_READ_AVX_REG(i->dst());
 
   Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());
 
   for (unsigned n=0, mask = 0x1; n < QWORD_ELEMENTS(len); n++, mask <<= 1) {
     if (opmask & mask)
       result.vmm64u(n) = uint32_to_float64(op.ymm32u(n));
-    else
-      result.vmm64u(n) = 0;
   }
 
-  if (! i->isZeroMasking()) {
-    for (unsigned n=0; n < len; n++, opmask >>= 2)
-      xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);
-
-    BX_CLEAR_AVX_REGZ(i->dst(), len);
-  }
-  else {
-    BX_WRITE_AVX_REGZ(i->dst(), result, len);
-  }
+  BX_WRITE_AVX_REGZ(i->dst(), result, len);
 
   BX_NEXT_INSTR(i);
 }
 
 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCVTDQ2PD_MASK_VpdWdqR(bxInstruction_c *i)
 {
   BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
-  BxPackedAvxRegister result;
   unsigned len = i->getVL();
+  BxPackedAvxRegister result;
+  if (i->isZeroMasking())
+    result.clear();
+  else
+    result = BX_READ_AVX_REG(i->dst());
 
   Bit32u opmask = BX_READ_8BIT_OPMASK(i->opmask());
 
   for (unsigned n=0, mask = 0x1; n < QWORD_ELEMENTS(len); n++, mask <<= 1) {
     if (opmask & mask)
       result.vmm64u(n) = int32_to_float64(op.ymm32s(n));
-    else
-      result.vmm64u(n) = 0;
   }
 
-  if (! i->isZeroMasking()) {
-    for (unsigned n=0; n < len; n++, opmask >>= 2)
-      xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);
-
-    BX_CLEAR_AVX_REGZ(i->dst(), len);
-  }
-  else {
-    BX_WRITE_AVX_REGZ(i->dst(), result, len);
-  }
+  BX_WRITE_AVX_REGZ(i->dst(), result, len);
 
   BX_NEXT_INSTR(i);
 }
 
@@ -800,10 +790,15 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCVTUDQ2PD_VpdWdqR(bxInstruction_c *i)
 void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCVTPH2PS_MASK_VpsWpsR(bxInstruction_c *i)
 {
-  BxPackedAvxRegister result;
   BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
   unsigned len = i->getVL();
+  BxPackedAvxRegister result;
+  if (i->isZeroMasking())
+    result.clear();
+  else
+    result = BX_READ_AVX_REG(i->dst());
 
   float_status_t status = mxcsr_to_softfloat_status_word(MXCSR);
   status.denormals_are_zeros = 0; // ignore MXCSR.DAZ
 
   // no denormal exception is reported on MXCSR
@@ -814,21 +809,11 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCVTPH2PS_MASK_VpsWpsR(bxInstruction_c *i)
   for (unsigned n=0, mask = 0x1; n < DWORD_ELEMENTS(len); n++, mask <<= 1) {
     if (opmask & mask)
       result.vmm32u(n) = float16_to_float32(op.ymm16u(n), status);
-    else
-      result.vmm32u(n) = 0;
   }
 
   check_exceptionsSSE(get_exception_flags(status));
 
-  if (! i->isZeroMasking()) {
-    for (unsigned n=0; n < len; n++, opmask >>= 4)
-      xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &result.vmm128(n), opmask);
-
-    BX_CLEAR_AVX_REGZ(i->dst(), len);
-  }
-  else {
-    BX_WRITE_AVX_REGZ(i->dst(), result, len);
-  }
+  BX_WRITE_AVX_REGZ(i->dst(), result, len);
 
   BX_NEXT_INSTR(i);
 }