Bochs/bochs/cpu/avx/xop.cc
Volker Ruppert 59eac1f196 Moved AVX/EVEX stuff to a new cpu subfolder and updated build system
TODO: update MVSC workspace files
2014-07-25 08:35:06 +00:00

945 lines
29 KiB
C++

/////////////////////////////////////////////////////////////////////////
// $Id$
/////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2011-2014 Stanislav Shwartsman
// Written by Stanislav Shwartsman [sshwarts at sourceforge net]
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA B 02110-1301 USA
//
/////////////////////////////////////////////////////////////////////////
#define NEED_CPU_REG_SHORTCUTS 1
#include "bochs.h"
#include "cpu.h"
#define LOG_THIS BX_CPU_THIS_PTR
#if BX_SUPPORT_AVX
extern void mxcsr_to_softfloat_status_word(float_status_t &status, bx_mxcsr_t mxcsr);
#include "simd_int.h"
#include "simd_compare.h"
typedef void (*simd_compare_method)(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2);
// comparison predicate for PCOMB
static simd_compare_method xop_compare8[8] = {
xmm_pcmpltb,
xmm_pcmpleb,
xmm_pcmpgtb,
xmm_pcmpgeb,
xmm_pcmpeqb,
xmm_pcmpneb,
xmm_pcmpfalse,
xmm_pcmptrue
};
// comparison predicate for PCOMUB
static simd_compare_method xop_compare8u[8] = {
xmm_pcmpltub,
xmm_pcmpleub,
xmm_pcmpgtub,
xmm_pcmpgeub,
xmm_pcmpeqb,
xmm_pcmpneb,
xmm_pcmpfalse,
xmm_pcmptrue
};
// comparison predicate for PCOMW
static simd_compare_method xop_compare16[8] = {
xmm_pcmpltw,
xmm_pcmplew,
xmm_pcmpgtw,
xmm_pcmpgew,
xmm_pcmpeqw,
xmm_pcmpnew,
xmm_pcmpfalse,
xmm_pcmptrue
};
// comparison predicate for PCOMUW
static simd_compare_method xop_compare16u[8] = {
xmm_pcmpltuw,
xmm_pcmpleuw,
xmm_pcmpgtuw,
xmm_pcmpgeuw,
xmm_pcmpeqw,
xmm_pcmpnew,
xmm_pcmpfalse,
xmm_pcmptrue
};
// comparison predicate for PCOMD
static simd_compare_method xop_compare32[8] = {
xmm_pcmpltd,
xmm_pcmpled,
xmm_pcmpgtd,
xmm_pcmpged,
xmm_pcmpeqd,
xmm_pcmpned,
xmm_pcmpfalse,
xmm_pcmptrue
};
// comparison predicate for PCOMUD
static simd_compare_method xop_compare32u[8] = {
xmm_pcmpltud,
xmm_pcmpleud,
xmm_pcmpgtud,
xmm_pcmpgeud,
xmm_pcmpeqd,
xmm_pcmpned,
xmm_pcmpfalse,
xmm_pcmptrue
};
// comparison predicate for PCOMQ
static simd_compare_method xop_compare64[8] = {
xmm_pcmpltq,
xmm_pcmpleq,
xmm_pcmpgtq,
xmm_pcmpgeq,
xmm_pcmpeqq,
xmm_pcmpneq,
xmm_pcmpfalse,
xmm_pcmptrue
};
// comparison predicate for PCOMUQ
static simd_compare_method xop_compare64u[8] = {
xmm_pcmpltuq,
xmm_pcmpleuq,
xmm_pcmpgtuq,
xmm_pcmpgeuq,
xmm_pcmpeqq,
xmm_pcmpneq,
xmm_pcmpfalse,
xmm_pcmptrue
};
typedef Bit8u (*vpperm_operation)(Bit8u byte);
BX_CPP_INLINE Bit8u vpperm_bit_reverse(Bit8u v8)
{
return (v8 >> 7) |
((v8 >> 5) & 0x02) |
((v8 >> 3) & 0x04) |
((v8 >> 1) & 0x08) |
((v8 << 1) & 0x10) |
((v8 << 3) & 0x20) |
((v8 << 5) & 0x40) |
(v8 << 7);
}
BX_CPP_INLINE Bit8u vpperm_noop(Bit8u v8) { return v8; }
BX_CPP_INLINE Bit8u vpperm_invert(Bit8u v8) { return ~v8; }
BX_CPP_INLINE Bit8u vpperm_invert_bit_reverse(Bit8u v8) { return vpperm_bit_reverse(~v8); }
BX_CPP_INLINE Bit8u vpperm_zeros(Bit8u v8) { return 0; }
BX_CPP_INLINE Bit8u vpperm_ones(Bit8u v8) { return 0xff; }
BX_CPP_INLINE Bit8u vpperm_replicate_msb(Bit8u v8) { return (((Bit8s) v8) >> 7); }
BX_CPP_INLINE Bit8u vpperm_invert_replicate_msb(Bit8u v8) { return vpperm_replicate_msb(~v8); }
// logical operation for VPPERM
static vpperm_operation vpperm_op[8] = {
vpperm_noop,
vpperm_invert,
vpperm_bit_reverse,
vpperm_invert_bit_reverse,
vpperm_zeros,
vpperm_ones,
vpperm_replicate_msb,
vpperm_invert_replicate_msb
};
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMOV_VdqHdqWdqVIb(bxInstruction_c *i)
{
BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1());
BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2());
BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3());
unsigned len = i->getVL();
for (unsigned n=0; n < len; n++) {
xmm_pselect(&op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n));
}
BX_WRITE_YMM_REGZ_VLEN(i->dst(), op1, len);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPPERM_VdqHdqWdqVIb(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()), dst;
for (unsigned n=0;n<16;n++) {
unsigned control = op3.xmmubyte(n);
if (control & 0x10)
dst.xmmubyte(n) = op1.xmmubyte(control & 0xf);
else
dst.xmmubyte(n) = op2.xmmubyte(control & 0xf);
dst.xmmubyte(n) = vpperm_op[control >> 5](dst.xmmubyte(n));
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), dst);
BX_NEXT_INSTR(i);
}
#define XOP_SHIFT_ROTATE(HANDLER, func) \
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \
{ \
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); \
\
(func)(&op1, &op2); \
\
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); \
\
BX_NEXT_INSTR(i); \
}
XOP_SHIFT_ROTATE(VPSHAB_VdqWdqHdq, xmm_pshab);
XOP_SHIFT_ROTATE(VPSHAW_VdqWdqHdq, xmm_pshaw);
XOP_SHIFT_ROTATE(VPSHAD_VdqWdqHdq, xmm_pshad);
XOP_SHIFT_ROTATE(VPSHAQ_VdqWdqHdq, xmm_pshaq);
XOP_SHIFT_ROTATE(VPSHLB_VdqWdqHdq, xmm_pshlb);
XOP_SHIFT_ROTATE(VPSHLW_VdqWdqHdq, xmm_pshlw);
XOP_SHIFT_ROTATE(VPSHLD_VdqWdqHdq, xmm_pshld);
XOP_SHIFT_ROTATE(VPSHLQ_VdqWdqHdq, xmm_pshlq);
XOP_SHIFT_ROTATE(VPROTB_VdqWdqHdq, xmm_protb);
XOP_SHIFT_ROTATE(VPROTW_VdqWdqHdq, xmm_protw);
XOP_SHIFT_ROTATE(VPROTD_VdqWdqHdq, xmm_protd);
XOP_SHIFT_ROTATE(VPROTQ_VdqWdqHdq, xmm_protq);
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSWW_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
for(unsigned n=0;n<8;n++) {
op1.xmm16s(n) = SaturateDwordSToWordS(((Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n)) + (Bit32s) op3.xmm16s(n));
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
op1.xmm32s(0) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(1) * (Bit32s) op2.xmm16s(1)) + (Bit64s) op3.xmm32s(0));
op1.xmm32s(1) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(3) * (Bit32s) op2.xmm16s(3)) + (Bit64s) op3.xmm32s(1));
op1.xmm32s(2) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(5) * (Bit32s) op2.xmm16s(5)) + (Bit64s) op3.xmm32s(2));
op1.xmm32s(3) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(7) * (Bit32s) op2.xmm16s(7)) + (Bit64s) op3.xmm32s(3));
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_CPP_INLINE Bit64s add_saturate64(Bit64s a, Bit64s b)
{
Bit64s r = a + b;
Bit64u overflow = GET_ADD_OVERFLOW(a, b, r, BX_CONST64(0x8000000000000000));
if (! overflow) return r;
// signed overflow detected, saturate
if (a > 0) overflow--;
return (Bit64s) overflow;
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDQL_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
Bit64s product1 = (Bit64s) op1.xmm32s(0) * (Bit64s) op2.xmm32s(0);
Bit64s product2 = (Bit64s) op1.xmm32s(2) * (Bit64s) op2.xmm32s(2);
op1.xmm64s(0) = add_saturate64(product1, op3.xmm64s(0));
op1.xmm64s(1) = add_saturate64(product2, op3.xmm64s(1));
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
for(unsigned n=0;n<4;n++) {
op1.xmm32s(n) = SaturateQwordSToDwordS(((Bit64s) op1.xmm32s(n) * (Bit64s) op2.xmm32s(n)) + (Bit64s) op3.xmm32s(n));
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDQH_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
Bit64s product1 = (Bit64s) op1.xmm32s(1) * (Bit64s) op2.xmm32s(1);
Bit64s product2 = (Bit64s) op1.xmm32s(3) * (Bit64s) op2.xmm32s(3);
op1.xmm64s(0) = add_saturate64(product1, op3.xmm64s(0));
op1.xmm64s(1) = add_saturate64(product2, op3.xmm64s(1));
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSWW_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
for(unsigned n=0;n<8;n++) {
op1.xmm16s(n) = ((Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n)) + (Bit32s) op3.xmm16s(n);
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
op1.xmm32s(0) = ((Bit32s) op1.xmm16s(1) * (Bit32s) op2.xmm16s(1)) + (Bit64s) op3.xmm32s(0);
op1.xmm32s(1) = ((Bit32s) op1.xmm16s(3) * (Bit32s) op2.xmm16s(3)) + (Bit64s) op3.xmm32s(1);
op1.xmm32s(2) = ((Bit32s) op1.xmm16s(5) * (Bit32s) op2.xmm16s(5)) + (Bit64s) op3.xmm32s(2);
op1.xmm32s(3) = ((Bit32s) op1.xmm16s(7) * (Bit32s) op2.xmm16s(7)) + (Bit64s) op3.xmm32s(3);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDQL_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
Bit64s product1 = (Bit64s) op1.xmm32s(0) * (Bit64s) op2.xmm32s(0);
Bit64s product2 = (Bit64s) op1.xmm32s(2) * (Bit64s) op2.xmm32s(2);
op1.xmm64s(0) = product1 + op3.xmm64s(0);
op1.xmm64s(1) = product2 + op3.xmm64s(1);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
for(unsigned n=0;n<4;n++) {
op1.xmm32s(n) = ((Bit64s) op1.xmm32s(n) * (Bit64s) op2.xmm32s(n)) + (Bit64s) op3.xmm32s(n);
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDQH_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
Bit64s product1 = (Bit64s) op1.xmm32s(1) * (Bit64s) op2.xmm32s(1);
Bit64s product2 = (Bit64s) op1.xmm32s(3) * (Bit64s) op2.xmm32s(3);
op1.xmm64s(0) = product1 + op3.xmm64s(0);
op1.xmm64s(1) = product2 + op3.xmm64s(1);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADCSSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
Bit32s product[8];
for(unsigned n=0;n < 8;n++)
product[n] = (Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n);
op1.xmm32s(0) = SaturateQwordSToDwordS((Bit64s) product[0] + (Bit64s) product[1] + (Bit64s) op3.xmm32s(0));
op1.xmm32s(1) = SaturateQwordSToDwordS((Bit64s) product[2] + (Bit64s) product[3] + (Bit64s) op3.xmm32s(1));
op1.xmm32s(2) = SaturateQwordSToDwordS((Bit64s) product[4] + (Bit64s) product[5] + (Bit64s) op3.xmm32s(2));
op1.xmm32s(3) = SaturateQwordSToDwordS((Bit64s) product[6] + (Bit64s) product[7] + (Bit64s) op3.xmm32s(3));
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADCSWD_VdqHdqWdqVIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1());
BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2());
BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3());
Bit32s product[8];
for(unsigned n=0;n < 8;n++)
product[n] = (Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n);
op1.xmm32s(0) = product[0] + product[1] + op3.xmm32s(0);
op1.xmm32s(1) = product[2] + product[3] + op3.xmm32s(1);
op1.xmm32s(2) = product[4] + product[5] + op3.xmm32s(2);
op1.xmm32s(3) = product[6] + product[7] + op3.xmm32s(3);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTB_VdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
int count = i->Ib();
if (count > 0) {
// rotate left
xmm_prolb(&op, count);
}
else if (count < 0) {
// rotate right
xmm_prorb(&op, -count);
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTW_VdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
int count = i->Ib();
if (count > 0) {
// rotate left
xmm_prolw(&op, count);
}
else if (count < 0) {
// rotate right
xmm_prorw(&op, -count);
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTD_VdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
int count = i->Ib();
if (count > 0) {
// rotate left
xmm_prold(&op, count);
}
else if (count < 0) {
// rotate right
xmm_prord(&op, -count);
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTQ_VdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
int count = i->Ib();
if (count > 0) {
// rotate left
xmm_prolq(&op, count);
}
else if (count < 0) {
// rotate right
xmm_prorq(&op, -count);
}
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMB_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
xop_compare8[i->Ib() & 7](&op1, &op2);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMW_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
xop_compare16[i->Ib() & 7](&op1, &op2);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMD_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
xop_compare32[i->Ib() & 7](&op1, &op2);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMQ_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
xop_compare64[i->Ib() & 7](&op1, &op2);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUB_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
xop_compare8u[i->Ib() & 7](&op1, &op2);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUW_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
xop_compare16u[i->Ib() & 7](&op1, &op2);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUD_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
xop_compare32u[i->Ib() & 7](&op1, &op2);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUQ_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2());
xop_compare64u[i->Ib() & 7](&op1, &op2);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZPS_VpsWpsR(bxInstruction_c *i)
{
BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
unsigned len = i->getVL();
float_status_t status;
mxcsr_to_softfloat_status_word(status, MXCSR);
for (unsigned n=0; n < DWORD_ELEMENTS(len); n++) {
op.ymm32u(n) = float32_frc(op.ymm32u(n), status);
}
check_exceptionsSSE(get_exception_flags(status));
BX_WRITE_YMM_REGZ_VLEN(i->dst(), op, len);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZPD_VpdWpdR(bxInstruction_c *i)
{
BxPackedYmmRegister op = BX_READ_YMM_REG(i->src());
unsigned len = i->getVL();
float_status_t status;
mxcsr_to_softfloat_status_word(status, MXCSR);
for (unsigned n=0; n < QWORD_ELEMENTS(len); n++) {
op.ymm64u(n) = float64_frc(op.ymm64u(n), status);
}
check_exceptionsSSE(get_exception_flags(status));
BX_WRITE_YMM_REGZ_VLEN(i->dst(), op, len);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZSS_VssWssR(bxInstruction_c *i)
{
float32 op = BX_READ_XMM_REG_LO_DWORD(i->src());
BxPackedXmmRegister r;
float_status_t status;
mxcsr_to_softfloat_status_word(status, MXCSR);
r.xmm64u(0) = (Bit64u) float32_frc(op, status);
r.xmm64u(1) = 0;
check_exceptionsSSE(get_exception_flags(status));
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), r);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZSD_VsdWsdR(bxInstruction_c *i)
{
float64 op = BX_READ_XMM_REG_LO_QWORD(i->src());
BxPackedXmmRegister r;
float_status_t status;
mxcsr_to_softfloat_status_word(status, MXCSR);
r.xmm64u(0) = float64_frc(op, status);
r.xmm64u(1) = 0;
check_exceptionsSSE(get_exception_flags(status));
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), r);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBW_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm16s(0) = (Bit16s) op.xmmsbyte(0x0) + (Bit16s) op.xmmsbyte(0x1);
op.xmm16s(1) = (Bit16s) op.xmmsbyte(0x2) + (Bit16s) op.xmmsbyte(0x3);
op.xmm16s(2) = (Bit16s) op.xmmsbyte(0x4) + (Bit16s) op.xmmsbyte(0x5);
op.xmm16s(3) = (Bit16s) op.xmmsbyte(0x6) + (Bit16s) op.xmmsbyte(0x7);
op.xmm16s(4) = (Bit16s) op.xmmsbyte(0x8) + (Bit16s) op.xmmsbyte(0x9);
op.xmm16s(5) = (Bit16s) op.xmmsbyte(0xA) + (Bit16s) op.xmmsbyte(0xB);
op.xmm16s(6) = (Bit16s) op.xmmsbyte(0xC) + (Bit16s) op.xmmsbyte(0xD);
op.xmm16s(7) = (Bit16s) op.xmmsbyte(0xE) + (Bit16s) op.xmmsbyte(0xF);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBD_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm32s(0) = (Bit32s) op.xmmsbyte(0x0) + (Bit32s) op.xmmsbyte(0x1) +
(Bit32s) op.xmmsbyte(0x2) + (Bit32s) op.xmmsbyte(0x3);
op.xmm32s(1) = (Bit32s) op.xmmsbyte(0x4) + (Bit32s) op.xmmsbyte(0x5) +
(Bit32s) op.xmmsbyte(0x6) + (Bit32s) op.xmmsbyte(0x7);
op.xmm32s(2) = (Bit32s) op.xmmsbyte(0x8) + (Bit32s) op.xmmsbyte(0x9) +
(Bit32s) op.xmmsbyte(0xA) + (Bit32s) op.xmmsbyte(0xB);
op.xmm32s(3) = (Bit32s) op.xmmsbyte(0xC) + (Bit32s) op.xmmsbyte(0xD) +
(Bit32s) op.xmmsbyte(0xE) + (Bit32s) op.xmmsbyte(0xF);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBQ_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm64s(0) = (Bit32s) op.xmmsbyte(0x0) + (Bit32s) op.xmmsbyte(0x1) +
(Bit32s) op.xmmsbyte(0x2) + (Bit32s) op.xmmsbyte(0x3) +
(Bit32s) op.xmmsbyte(0x4) + (Bit32s) op.xmmsbyte(0x5) +
(Bit32s) op.xmmsbyte(0x6) + (Bit32s) op.xmmsbyte(0x7);
op.xmm64s(1) = (Bit32s) op.xmmsbyte(0x8) + (Bit32s) op.xmmsbyte(0x9) +
(Bit32s) op.xmmsbyte(0xA) + (Bit32s) op.xmmsbyte(0xB) +
(Bit32s) op.xmmsbyte(0xC) + (Bit32s) op.xmmsbyte(0xD) +
(Bit32s) op.xmmsbyte(0xE) + (Bit32s) op.xmmsbyte(0xF);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDWD_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm32s(0) = (Bit32s) op.xmm16s(0) + (Bit32s) op.xmm16s(1);
op.xmm32s(1) = (Bit32s) op.xmm16s(2) + (Bit32s) op.xmm16s(3);
op.xmm32s(2) = (Bit32s) op.xmm16s(4) + (Bit32s) op.xmm16s(5);
op.xmm32s(3) = (Bit32s) op.xmm16s(6) + (Bit32s) op.xmm16s(7);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDWQ_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm64s(0) = (Bit32s) op.xmm16s(0) + (Bit32s) op.xmm16s(1) +
(Bit32s) op.xmm16s(2) + (Bit32s) op.xmm16s(3);
op.xmm64s(1) = (Bit32s) op.xmm16s(4) + (Bit32s) op.xmm16s(5) +
(Bit32s) op.xmm16s(6) + (Bit32s) op.xmm16s(7);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDDQ_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm64s(0) = (Bit64s) op.xmm32s(0) + (Bit64s) op.xmm32s(1);
op.xmm64s(1) = (Bit64s) op.xmm32s(2) + (Bit64s) op.xmm32s(3);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBW_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm16u(0) = (Bit16u) op.xmmubyte(0x0) + (Bit16u) op.xmmubyte(0x1);
op.xmm16u(1) = (Bit16u) op.xmmubyte(0x2) + (Bit16u) op.xmmubyte(0x3);
op.xmm16u(2) = (Bit16u) op.xmmubyte(0x4) + (Bit16u) op.xmmubyte(0x5);
op.xmm16u(3) = (Bit16u) op.xmmubyte(0x6) + (Bit16u) op.xmmubyte(0x7);
op.xmm16u(4) = (Bit16u) op.xmmubyte(0x8) + (Bit16u) op.xmmubyte(0x9);
op.xmm16u(5) = (Bit16u) op.xmmubyte(0xA) + (Bit16u) op.xmmubyte(0xB);
op.xmm16u(6) = (Bit16u) op.xmmubyte(0xC) + (Bit16u) op.xmmubyte(0xD);
op.xmm16u(7) = (Bit16u) op.xmmubyte(0xE) + (Bit16u) op.xmmubyte(0xF);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBD_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm32u(0) = (Bit32u) op.xmmubyte(0x0) + (Bit32s) op.xmmubyte(0x1) +
(Bit32u) op.xmmubyte(0x2) + (Bit32s) op.xmmubyte(0x3);
op.xmm32u(1) = (Bit32u) op.xmmubyte(0x4) + (Bit32s) op.xmmubyte(0x5) +
(Bit32u) op.xmmubyte(0x6) + (Bit32s) op.xmmubyte(0x7);
op.xmm32u(2) = (Bit32u) op.xmmubyte(0x8) + (Bit32s) op.xmmubyte(0x9) +
(Bit32u) op.xmmubyte(0xA) + (Bit32s) op.xmmubyte(0xB);
op.xmm32u(3) = (Bit32u) op.xmmubyte(0xC) + (Bit32s) op.xmmubyte(0xD) +
(Bit32u) op.xmmubyte(0xE) + (Bit32s) op.xmmubyte(0xF);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBQ_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm64u(0) = (Bit32u) op.xmmubyte(0x0) + (Bit32u) op.xmmubyte(0x1) +
(Bit32u) op.xmmubyte(0x2) + (Bit32u) op.xmmubyte(0x3) +
(Bit32u) op.xmmubyte(0x4) + (Bit32u) op.xmmubyte(0x5) +
(Bit32u) op.xmmubyte(0x6) + (Bit32u) op.xmmubyte(0x7);
op.xmm64u(1) = (Bit32u) op.xmmubyte(0x8) + (Bit32u) op.xmmubyte(0x9) +
(Bit32u) op.xmmubyte(0xA) + (Bit32u) op.xmmubyte(0xB) +
(Bit32u) op.xmmubyte(0xC) + (Bit32u) op.xmmubyte(0xD) +
(Bit32u) op.xmmubyte(0xE) + (Bit32u) op.xmmubyte(0xF);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUWD_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm32u(0) = (Bit32u) op.xmm16u(0) + (Bit32u) op.xmm16u(1);
op.xmm32u(1) = (Bit32u) op.xmm16u(2) + (Bit32u) op.xmm16u(3);
op.xmm32u(2) = (Bit32u) op.xmm16u(4) + (Bit32u) op.xmm16u(5);
op.xmm32u(3) = (Bit32u) op.xmm16u(6) + (Bit32u) op.xmm16u(7);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUWQ_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm64u(0) = (Bit32u) op.xmm16u(0) + (Bit32u) op.xmm16u(1) +
(Bit32u) op.xmm16u(2) + (Bit32u) op.xmm16u(3);
op.xmm64u(1) = (Bit32u) op.xmm16u(4) + (Bit32u) op.xmm16u(5) +
(Bit32u) op.xmm16u(6) + (Bit32u) op.xmm16u(7);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUDQ_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm64u(0) = (Bit64u) op.xmm32u(0) + (Bit64u) op.xmm32u(1);
op.xmm64u(1) = (Bit64u) op.xmm32u(2) + (Bit64u) op.xmm32u(3);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBBW_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm16s(0) = (Bit16s) op.xmmsbyte(0x0) - (Bit16s) op.xmmsbyte(0x1);
op.xmm16s(1) = (Bit16s) op.xmmsbyte(0x2) - (Bit16s) op.xmmsbyte(0x3);
op.xmm16s(2) = (Bit16s) op.xmmsbyte(0x4) - (Bit16s) op.xmmsbyte(0x5);
op.xmm16s(3) = (Bit16s) op.xmmsbyte(0x6) - (Bit16s) op.xmmsbyte(0x7);
op.xmm16s(4) = (Bit16s) op.xmmsbyte(0x8) - (Bit16s) op.xmmsbyte(0x9);
op.xmm16s(5) = (Bit16s) op.xmmsbyte(0xA) - (Bit16s) op.xmmsbyte(0xB);
op.xmm16s(6) = (Bit16s) op.xmmsbyte(0xC) - (Bit16s) op.xmmsbyte(0xD);
op.xmm16s(7) = (Bit16s) op.xmmsbyte(0xE) - (Bit16s) op.xmmsbyte(0xF);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBWD_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm32s(0) = (Bit32s) op.xmm16s(0) - (Bit32s) op.xmm16s(1);
op.xmm32s(1) = (Bit32s) op.xmm16s(2) - (Bit32s) op.xmm16s(3);
op.xmm32s(2) = (Bit32s) op.xmm16s(4) - (Bit32s) op.xmm16s(5);
op.xmm32s(3) = (Bit32s) op.xmm16s(6) - (Bit32s) op.xmm16s(7);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBDQ_VdqWdqR(bxInstruction_c *i)
{
BxPackedXmmRegister op = BX_READ_XMM_REG(i->src());
op.xmm64s(0) = (Bit64s) op.xmm32s(0) - (Bit64s) op.xmm32s(1);
op.xmm64s(1) = (Bit64s) op.xmm32s(2) - (Bit64s) op.xmm32s(3);
BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMIL2PS_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1());
BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2());
BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3()), result;
unsigned len = i->getVL();
for (unsigned n=0; n < len; n++) {
xmm_permil2ps(&result.ymm128(n), &op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n), i->Ib() & 3);
}
BX_WRITE_YMM_REGZ_VLEN(i->dst(), result, len);
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMIL2PD_VdqHdqWdqIbR(bxInstruction_c *i)
{
BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1());
BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2());
BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3()), result;
unsigned len = i->getVL();
for (unsigned n=0; n < len; n++) {
xmm_permil2pd(&result.ymm128(n), &op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n), i->Ib() & 3);
}
BX_WRITE_YMM_REGZ_VLEN(i->dst(), result, len);
BX_NEXT_INSTR(i);
}
#endif