///////////////////////////////////////////////////////////////////////// // $Id$ ///////////////////////////////////////////////////////////////////////// // // Copyright (c) 2011-2012 Stanislav Shwartsman // Written by Stanislav Shwartsman [sshwarts at sourceforge net] // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA B 02110-1301 USA // ///////////////////////////////////////////////////////////////////////// #define NEED_CPU_REG_SHORTCUTS 1 #include "bochs.h" #include "cpu.h" #define LOG_THIS BX_CPU_THIS_PTR #if BX_SUPPORT_AVX extern void mxcsr_to_softfloat_status_word(float_status_t &status, bx_mxcsr_t mxcsr); #include "simd_int.h" #include "simd_compare.h" typedef void (*simd_compare_method)(BxPackedXmmRegister *op1, const BxPackedXmmRegister *op2); // comparison predicate for PCOMB static simd_compare_method compare8[8] = { sse_pcmpltb, sse_pcmpleb, sse_pcmpgtb, sse_pcmpgeb, sse_pcmpeqb, sse_pcmpneb, sse_pcmpfalse, sse_pcmptrue }; // comparison predicate for PCOMUB static simd_compare_method compare8u[8] = { sse_pcmpltub, sse_pcmpleub, sse_pcmpgtub, sse_pcmpgeub, sse_pcmpeqb, sse_pcmpneb, sse_pcmpfalse, sse_pcmptrue }; // comparison predicate for PCOMW static simd_compare_method compare16[8] = { sse_pcmpltw, sse_pcmplew, sse_pcmpgtw, sse_pcmpgew, sse_pcmpeqw, sse_pcmpnew, sse_pcmpfalse, sse_pcmptrue }; // comparison predicate for PCOMUW static simd_compare_method compare16u[8] = { sse_pcmpltuw, sse_pcmpleuw, sse_pcmpgtuw, sse_pcmpgeuw, sse_pcmpeqw, sse_pcmpnew, sse_pcmpfalse, sse_pcmptrue }; // comparison predicate for PCOMD static simd_compare_method compare32[8] = { sse_pcmpltd, sse_pcmpled, sse_pcmpgtd, sse_pcmpged, sse_pcmpeqd, sse_pcmpned, sse_pcmpfalse, sse_pcmptrue }; // comparison predicate for PCOMUD static simd_compare_method compare32u[8] = { sse_pcmpltud, sse_pcmpleud, sse_pcmpgtud, sse_pcmpgeud, sse_pcmpeqd, sse_pcmpned, sse_pcmpfalse, sse_pcmptrue }; // comparison predicate for PCOMQ static simd_compare_method compare64[8] = { sse_pcmpltq, sse_pcmpleq, sse_pcmpgtq, sse_pcmpgeq, sse_pcmpeqq, sse_pcmpneq, sse_pcmpfalse, sse_pcmptrue }; // comparison predicate for PCOMUQ static simd_compare_method compare64u[8] = { sse_pcmpltuq, sse_pcmpleuq, sse_pcmpgtuq, sse_pcmpgeuq, sse_pcmpeqq, sse_pcmpneq, sse_pcmpfalse, sse_pcmptrue }; typedef Bit8u (*vpperm_operation)(Bit8u byte); BX_CPP_INLINE Bit8u vpperm_bit_reverse(Bit8u v8) { return (v8 >> 7) | ((v8 >> 5) & 0x02) | ((v8 >> 3) & 0x04) | ((v8 >> 1) & 0x08) | ((v8 << 1) & 0x10) | ((v8 << 3) & 0x20) | ((v8 << 5) & 0x40) | (v8 << 7); } BX_CPP_INLINE Bit8u vpperm_noop(Bit8u v8) { return v8; } BX_CPP_INLINE Bit8u vpperm_invert(Bit8u v8) { return ~v8; } BX_CPP_INLINE Bit8u vpperm_invert_bit_reverse(Bit8u v8) { return vpperm_bit_reverse(~v8); } BX_CPP_INLINE Bit8u vpperm_zeros(Bit8u v8) { return 0; } BX_CPP_INLINE Bit8u vpperm_ones(Bit8u v8) { return 0xff; } BX_CPP_INLINE Bit8u vpperm_replicate_msb(Bit8u v8) { return (((Bit8s) v8) >> 7); } BX_CPP_INLINE Bit8u vpperm_invert_replicate_msb(Bit8u v8) { return vpperm_replicate_msb(~v8); } // logical operation for VPPERM static vpperm_operation vpperm_op[8] = { vpperm_noop, vpperm_invert, vpperm_bit_reverse, vpperm_invert_bit_reverse, vpperm_zeros, vpperm_ones, vpperm_replicate_msb, vpperm_invert_replicate_msb }; BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCMOV_VdqHdqWdqVIb(bxInstruction_c *i) { BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1()); BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2()); BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3()); unsigned len = i->getVL(); for (unsigned n=0; n < len; n++) { sse_pselect(&op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n)); } BX_WRITE_YMM_REGZ_VLEN(i->dst(), op1, len); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPPERM_VdqHdqWdqVIb(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()), dst; for (unsigned n=0;n<16;n++) { unsigned control = op3.xmmubyte(n); if (control & 0x10) dst.xmmubyte(n) = op1.xmmubyte(control & 0xf); else dst.xmmubyte(n) = op2.xmmubyte(control & 0xf); dst.xmmubyte(n) = vpperm_op[control >> 5](dst.xmmubyte(n)); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), dst); BX_NEXT_INSTR(i); } #define XOP_SHIFT_ROTATE(HANDLER, func) \ BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C:: HANDLER (bxInstruction_c *i) \ { \ BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); \ \ (func)(&op1, &op2); \ \ BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); \ \ BX_NEXT_INSTR(i); \ } XOP_SHIFT_ROTATE(VPSHAB_VdqWdqHdq, sse_pshab); XOP_SHIFT_ROTATE(VPSHAW_VdqWdqHdq, sse_pshaw); XOP_SHIFT_ROTATE(VPSHAD_VdqWdqHdq, sse_pshad); XOP_SHIFT_ROTATE(VPSHAQ_VdqWdqHdq, sse_pshaq); XOP_SHIFT_ROTATE(VPSHLB_VdqWdqHdq, sse_pshlb); XOP_SHIFT_ROTATE(VPSHLW_VdqWdqHdq, sse_pshlw); XOP_SHIFT_ROTATE(VPSHLD_VdqWdqHdq, sse_pshld); XOP_SHIFT_ROTATE(VPSHLQ_VdqWdqHdq, sse_pshlq); XOP_SHIFT_ROTATE(VPROTB_VdqWdqHdq, sse_protb); XOP_SHIFT_ROTATE(VPROTW_VdqWdqHdq, sse_protw); XOP_SHIFT_ROTATE(VPROTD_VdqWdqHdq, sse_protd); XOP_SHIFT_ROTATE(VPROTQ_VdqWdqHdq, sse_protq); BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSWW_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); for(unsigned n=0;n<8;n++) { op1.xmm16s(n) = SaturateDwordSToWordS(((Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n)) + (Bit32s) op3.xmm16s(n)); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSWD_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); op1.xmm32s(0) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(1) * (Bit32s) op2.xmm16s(1)) + (Bit64s) op3.xmm32s(0)); op1.xmm32s(1) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(3) * (Bit32s) op2.xmm16s(3)) + (Bit64s) op3.xmm32s(1)); op1.xmm32s(2) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(5) * (Bit32s) op2.xmm16s(5)) + (Bit64s) op3.xmm32s(2)); op1.xmm32s(3) = SaturateQwordSToDwordS(((Bit32s) op1.xmm16s(7) * (Bit32s) op2.xmm16s(7)) + (Bit64s) op3.xmm32s(3)); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_CPP_INLINE Bit64s add_saturate64(Bit64s a, Bit64s b) { Bit64s r = a + b; Bit64u overflow = GET_ADD_OVERFLOW(a, b, r, BX_CONST64(0x8000000000000000)); if (! overflow) return r; // signed overflow detected, saturate if (a > 0) overflow--; return (Bit64s) overflow; } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDQL_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); Bit64s product1 = (Bit64s) op1.xmm32s(0) * (Bit64s) op2.xmm32s(0); Bit64s product2 = (Bit64s) op1.xmm32s(2) * (Bit64s) op2.xmm32s(2); op1.xmm64s(0) = add_saturate64(product1, op3.xmm64s(0)); op1.xmm64s(1) = add_saturate64(product2, op3.xmm64s(1)); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDD_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); for(unsigned n=0;n<4;n++) { op1.xmm32s(n) = SaturateQwordSToDwordS(((Bit64s) op1.xmm32s(n) * (Bit64s) op2.xmm32s(n)) + (Bit64s) op3.xmm32s(n)); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSSDQH_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); Bit64s product1 = (Bit64s) op1.xmm32s(1) * (Bit64s) op2.xmm32s(1); Bit64s product2 = (Bit64s) op1.xmm32s(3) * (Bit64s) op2.xmm32s(3); op1.xmm64s(0) = add_saturate64(product1, op3.xmm64s(0)); op1.xmm64s(1) = add_saturate64(product2, op3.xmm64s(1)); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSWW_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); for(unsigned n=0;n<8;n++) { op1.xmm16s(n) = ((Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n)) + (Bit32s) op3.xmm16s(n); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSWD_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); op1.xmm32s(0) = ((Bit32s) op1.xmm16s(1) * (Bit32s) op2.xmm16s(1)) + (Bit64s) op3.xmm32s(0); op1.xmm32s(1) = ((Bit32s) op1.xmm16s(3) * (Bit32s) op2.xmm16s(3)) + (Bit64s) op3.xmm32s(1); op1.xmm32s(2) = ((Bit32s) op1.xmm16s(5) * (Bit32s) op2.xmm16s(5)) + (Bit64s) op3.xmm32s(2); op1.xmm32s(3) = ((Bit32s) op1.xmm16s(7) * (Bit32s) op2.xmm16s(7)) + (Bit64s) op3.xmm32s(3); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDQL_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); Bit64s product1 = (Bit64s) op1.xmm32s(0) * (Bit64s) op2.xmm32s(0); Bit64s product2 = (Bit64s) op1.xmm32s(2) * (Bit64s) op2.xmm32s(2); op1.xmm64s(0) = product1 + op3.xmm64s(0); op1.xmm64s(1) = product2 + op3.xmm64s(1); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDD_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); for(unsigned n=0;n<4;n++) { op1.xmm32s(n) = ((Bit64s) op1.xmm32s(n) * (Bit64s) op2.xmm32s(n)) + (Bit64s) op3.xmm32s(n); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMACSDQH_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); Bit64s product1 = (Bit64s) op1.xmm32s(1) * (Bit64s) op2.xmm32s(1); Bit64s product2 = (Bit64s) op1.xmm32s(3) * (Bit64s) op2.xmm32s(3); op1.xmm64s(0) = product1 + op3.xmm64s(0); op1.xmm64s(1) = product2 + op3.xmm64s(1); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADCSSWD_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); Bit32s product[8]; for(unsigned n=0;n < 8;n++) product[n] = (Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n); op1.xmm32s(0) = SaturateQwordSToDwordS((Bit64s) product[0] + (Bit64s) product[1] + (Bit64s) op3.xmm32s(0)); op1.xmm32s(1) = SaturateQwordSToDwordS((Bit64s) product[2] + (Bit64s) product[3] + (Bit64s) op3.xmm32s(1)); op1.xmm32s(2) = SaturateQwordSToDwordS((Bit64s) product[4] + (Bit64s) product[5] + (Bit64s) op3.xmm32s(2)); op1.xmm32s(3) = SaturateQwordSToDwordS((Bit64s) product[6] + (Bit64s) product[7] + (Bit64s) op3.xmm32s(3)); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPMADCSWD_VdqHdqWdqVIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); BxPackedXmmRegister op2 = BX_READ_XMM_REG(i->src2()); BxPackedXmmRegister op3 = BX_READ_XMM_REG(i->src3()); Bit32s product[8]; for(unsigned n=0;n < 8;n++) product[n] = (Bit32s) op1.xmm16s(n) * (Bit32s) op2.xmm16s(n); op1.xmm32s(0) = product[0] + product[1] + op3.xmm32s(0); op1.xmm32s(1) = product[2] + product[3] + op3.xmm32s(1); op1.xmm32s(2) = product[4] + product[5] + op3.xmm32s(2); op1.xmm32s(3) = product[6] + product[7] + op3.xmm32s(3); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTB_VdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); int count = i->Ib(); if (count > 0) { // rotate left sse_prolb(&op, count); } else if (count < 0) { // rotate right sse_prorb(&op, -count); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTW_VdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); int count = i->Ib(); if (count > 0) { // rotate left sse_prolw(&op, count); } else if (count < 0) { // rotate right sse_prorw(&op, -count); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTD_VdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); int count = i->Ib(); if (count > 0) { // rotate left sse_prold(&op, count); } else if (count < 0) { // rotate right sse_prord(&op, -count); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPROTQ_VdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); int count = i->Ib(); if (count > 0) { // rotate left sse_prolq(&op, count); } else if (count < 0) { // rotate right sse_prorq(&op, -count); } BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMB_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); compare8[i->Ib() & 7](&op1, &op2); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMW_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); compare16[i->Ib() & 7](&op1, &op2); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMD_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); compare32[i->Ib() & 7](&op1, &op2); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMQ_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); compare64[i->Ib() & 7](&op1, &op2); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUB_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); compare8u[i->Ib() & 7](&op1, &op2); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUW_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); compare16u[i->Ib() & 7](&op1, &op2); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUD_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); compare32u[i->Ib() & 7](&op1, &op2); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPCOMUQ_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()), op2 = BX_READ_XMM_REG(i->src2()); compare64u[i->Ib() & 7](&op1, &op2); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZPS_VpsWpsR(bxInstruction_c *i) { BxPackedYmmRegister op = BX_READ_YMM_REG(i->src()); unsigned len = i->getVL(); float_status_t status; mxcsr_to_softfloat_status_word(status, MXCSR); for (unsigned n=0; n < (4*len); n++) { op.ymm32u(n) = float32_frc(op.ymm32u(n), status); } check_exceptionsSSE(status.float_exception_flags); BX_WRITE_YMM_REGZ_VLEN(i->dst(), op, len); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZPD_VpdWpdR(bxInstruction_c *i) { BxPackedYmmRegister op = BX_READ_YMM_REG(i->src()); unsigned len = i->getVL(); float_status_t status; mxcsr_to_softfloat_status_word(status, MXCSR); for (unsigned n=0; n < (2*len); n++) { op.ymm64u(n) = float64_frc(op.ymm64u(n), status); } check_exceptionsSSE(status.float_exception_flags); BX_WRITE_YMM_REGZ_VLEN(i->dst(), op, len); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZSS_VssWssR(bxInstruction_c *i) { float32 op = BX_READ_XMM_REG_LO_DWORD(i->src()); BxPackedXmmRegister r; float_status_t status; mxcsr_to_softfloat_status_word(status, MXCSR); r.xmm64u(0) = (Bit64u) float32_frc(op, status); r.xmm64u(1) = 0; check_exceptionsSSE(status.float_exception_flags); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), r); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VFRCZSD_VsdWsdR(bxInstruction_c *i) { float64 op = BX_READ_XMM_REG_LO_QWORD(i->src()); BxPackedXmmRegister r; float_status_t status; mxcsr_to_softfloat_status_word(status, MXCSR); r.xmm64u(0) = float64_frc(op, status); r.xmm64u(1) = 0; check_exceptionsSSE(status.float_exception_flags); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), r); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBW_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm16s(0) = (Bit16s) op.xmmsbyte(0x0) + (Bit16s) op.xmmsbyte(0x1); op.xmm16s(1) = (Bit16s) op.xmmsbyte(0x2) + (Bit16s) op.xmmsbyte(0x3); op.xmm16s(2) = (Bit16s) op.xmmsbyte(0x4) + (Bit16s) op.xmmsbyte(0x5); op.xmm16s(3) = (Bit16s) op.xmmsbyte(0x6) + (Bit16s) op.xmmsbyte(0x7); op.xmm16s(4) = (Bit16s) op.xmmsbyte(0x8) + (Bit16s) op.xmmsbyte(0x9); op.xmm16s(5) = (Bit16s) op.xmmsbyte(0xA) + (Bit16s) op.xmmsbyte(0xB); op.xmm16s(6) = (Bit16s) op.xmmsbyte(0xC) + (Bit16s) op.xmmsbyte(0xD); op.xmm16s(7) = (Bit16s) op.xmmsbyte(0xE) + (Bit16s) op.xmmsbyte(0xF); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBD_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm32s(0) = (Bit32s) op.xmmsbyte(0x0) + (Bit32s) op.xmmsbyte(0x1) + (Bit32s) op.xmmsbyte(0x2) + (Bit32s) op.xmmsbyte(0x3); op.xmm32s(1) = (Bit32s) op.xmmsbyte(0x4) + (Bit32s) op.xmmsbyte(0x5) + (Bit32s) op.xmmsbyte(0x6) + (Bit32s) op.xmmsbyte(0x7); op.xmm32s(2) = (Bit32s) op.xmmsbyte(0x8) + (Bit32s) op.xmmsbyte(0x9) + (Bit32s) op.xmmsbyte(0xA) + (Bit32s) op.xmmsbyte(0xB); op.xmm32s(3) = (Bit32s) op.xmmsbyte(0xC) + (Bit32s) op.xmmsbyte(0xD) + (Bit32s) op.xmmsbyte(0xE) + (Bit32s) op.xmmsbyte(0xF); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDBQ_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm64s(0) = (Bit32s) op.xmmsbyte(0x0) + (Bit32s) op.xmmsbyte(0x1) + (Bit32s) op.xmmsbyte(0x2) + (Bit32s) op.xmmsbyte(0x3) + (Bit32s) op.xmmsbyte(0x4) + (Bit32s) op.xmmsbyte(0x5) + (Bit32s) op.xmmsbyte(0x6) + (Bit32s) op.xmmsbyte(0x7); op.xmm64s(1) = (Bit32s) op.xmmsbyte(0x8) + (Bit32s) op.xmmsbyte(0x9) + (Bit32s) op.xmmsbyte(0xA) + (Bit32s) op.xmmsbyte(0xB) + (Bit32s) op.xmmsbyte(0xC) + (Bit32s) op.xmmsbyte(0xD) + (Bit32s) op.xmmsbyte(0xE) + (Bit32s) op.xmmsbyte(0xF); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDWD_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm32s(0) = (Bit32s) op.xmm16s(0) + (Bit32s) op.xmm16s(1); op.xmm32s(1) = (Bit32s) op.xmm16s(2) + (Bit32s) op.xmm16s(3); op.xmm32s(2) = (Bit32s) op.xmm16s(4) + (Bit32s) op.xmm16s(5); op.xmm32s(3) = (Bit32s) op.xmm16s(6) + (Bit32s) op.xmm16s(7); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDWQ_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm64s(0) = (Bit32s) op.xmm16s(0) + (Bit32s) op.xmm16s(1) + (Bit32s) op.xmm16s(2) + (Bit32s) op.xmm16s(3); op.xmm64s(1) = (Bit32s) op.xmm16s(4) + (Bit32s) op.xmm16s(5) + (Bit32s) op.xmm16s(6) + (Bit32s) op.xmm16s(7); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDDQ_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm64s(0) = (Bit64s) op.xmm32s(0) + (Bit64s) op.xmm32s(1); op.xmm64s(1) = (Bit64s) op.xmm32s(2) + (Bit64s) op.xmm32s(3); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBW_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm16u(0) = (Bit16u) op.xmmubyte(0x0) + (Bit16u) op.xmmubyte(0x1); op.xmm16u(1) = (Bit16u) op.xmmubyte(0x2) + (Bit16u) op.xmmubyte(0x3); op.xmm16u(2) = (Bit16u) op.xmmubyte(0x4) + (Bit16u) op.xmmubyte(0x5); op.xmm16u(3) = (Bit16u) op.xmmubyte(0x6) + (Bit16u) op.xmmubyte(0x7); op.xmm16u(4) = (Bit16u) op.xmmubyte(0x8) + (Bit16u) op.xmmubyte(0x9); op.xmm16u(5) = (Bit16u) op.xmmubyte(0xA) + (Bit16u) op.xmmubyte(0xB); op.xmm16u(6) = (Bit16u) op.xmmubyte(0xC) + (Bit16u) op.xmmubyte(0xD); op.xmm16u(7) = (Bit16u) op.xmmubyte(0xE) + (Bit16u) op.xmmubyte(0xF); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBD_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm32u(0) = (Bit32u) op.xmmubyte(0x0) + (Bit32s) op.xmmubyte(0x1) + (Bit32u) op.xmmubyte(0x2) + (Bit32s) op.xmmubyte(0x3); op.xmm32u(1) = (Bit32u) op.xmmubyte(0x4) + (Bit32s) op.xmmubyte(0x5) + (Bit32u) op.xmmubyte(0x6) + (Bit32s) op.xmmubyte(0x7); op.xmm32u(2) = (Bit32u) op.xmmubyte(0x8) + (Bit32s) op.xmmubyte(0x9) + (Bit32u) op.xmmubyte(0xA) + (Bit32s) op.xmmubyte(0xB); op.xmm32u(3) = (Bit32u) op.xmmubyte(0xC) + (Bit32s) op.xmmubyte(0xD) + (Bit32u) op.xmmubyte(0xE) + (Bit32s) op.xmmubyte(0xF); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUBQ_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm64u(0) = (Bit32u) op.xmmubyte(0x0) + (Bit32u) op.xmmubyte(0x1) + (Bit32u) op.xmmubyte(0x2) + (Bit32u) op.xmmubyte(0x3) + (Bit32u) op.xmmubyte(0x4) + (Bit32u) op.xmmubyte(0x5) + (Bit32u) op.xmmubyte(0x6) + (Bit32u) op.xmmubyte(0x7); op.xmm64u(1) = (Bit32u) op.xmmubyte(0x8) + (Bit32u) op.xmmubyte(0x9) + (Bit32u) op.xmmubyte(0xA) + (Bit32u) op.xmmubyte(0xB) + (Bit32u) op.xmmubyte(0xC) + (Bit32u) op.xmmubyte(0xD) + (Bit32u) op.xmmubyte(0xE) + (Bit32u) op.xmmubyte(0xF); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUWD_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm32u(0) = (Bit32u) op.xmm16u(0) + (Bit32u) op.xmm16u(1); op.xmm32u(1) = (Bit32u) op.xmm16u(2) + (Bit32u) op.xmm16u(3); op.xmm32u(2) = (Bit32u) op.xmm16u(4) + (Bit32u) op.xmm16u(5); op.xmm32u(3) = (Bit32u) op.xmm16u(6) + (Bit32u) op.xmm16u(7); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUWQ_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm64u(0) = (Bit32u) op.xmm16u(0) + (Bit32u) op.xmm16u(1) + (Bit32u) op.xmm16u(2) + (Bit32u) op.xmm16u(3); op.xmm64u(1) = (Bit32u) op.xmm16u(4) + (Bit32u) op.xmm16u(5) + (Bit32u) op.xmm16u(6) + (Bit32u) op.xmm16u(7); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHADDUDQ_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm64u(0) = (Bit64u) op.xmm32u(0) + (Bit64u) op.xmm32u(1); op.xmm64u(1) = (Bit64u) op.xmm32u(2) + (Bit64u) op.xmm32u(3); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBBW_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm16s(0) = (Bit16s) op.xmmsbyte(0x0) - (Bit16s) op.xmmsbyte(0x1); op.xmm16s(1) = (Bit16s) op.xmmsbyte(0x2) - (Bit16s) op.xmmsbyte(0x3); op.xmm16s(2) = (Bit16s) op.xmmsbyte(0x4) - (Bit16s) op.xmmsbyte(0x5); op.xmm16s(3) = (Bit16s) op.xmmsbyte(0x6) - (Bit16s) op.xmmsbyte(0x7); op.xmm16s(4) = (Bit16s) op.xmmsbyte(0x8) - (Bit16s) op.xmmsbyte(0x9); op.xmm16s(5) = (Bit16s) op.xmmsbyte(0xA) - (Bit16s) op.xmmsbyte(0xB); op.xmm16s(6) = (Bit16s) op.xmmsbyte(0xC) - (Bit16s) op.xmmsbyte(0xD); op.xmm16s(7) = (Bit16s) op.xmmsbyte(0xE) - (Bit16s) op.xmmsbyte(0xF); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBWD_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm32s(0) = (Bit32s) op.xmm16s(0) - (Bit32s) op.xmm16s(1); op.xmm32s(1) = (Bit32s) op.xmm16s(2) - (Bit32s) op.xmm16s(3); op.xmm32s(2) = (Bit32s) op.xmm16s(4) - (Bit32s) op.xmm16s(5); op.xmm32s(3) = (Bit32s) op.xmm16s(6) - (Bit32s) op.xmm16s(7); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPHSUBDQ_VdqWdqR(bxInstruction_c *i) { BxPackedXmmRegister op = BX_READ_XMM_REG(i->src()); op.xmm64s(0) = (Bit64s) op.xmm32s(0) - (Bit64s) op.xmm32s(1); op.xmm64s(1) = (Bit64s) op.xmm32s(2) - (Bit64s) op.xmm32s(3); BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMIL2PS_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1()); BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2()); BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3()), result; unsigned len = i->getVL(); for (unsigned n=0; n < len; n++) { sse_permil2ps(&result.ymm128(n), &op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n), i->Ib() & 3); } BX_WRITE_YMM_REGZ_VLEN(i->dst(), result, len); BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VPERMIL2PD_VdqHdqWdqIbR(bxInstruction_c *i) { BxPackedYmmRegister op1 = BX_READ_YMM_REG(i->src1()); BxPackedYmmRegister op2 = BX_READ_YMM_REG(i->src2()); BxPackedYmmRegister op3 = BX_READ_YMM_REG(i->src3()), result; unsigned len = i->getVL(); for (unsigned n=0; n < len; n++) { sse_permil2pd(&result.ymm128(n), &op1.ymm128(n), &op2.ymm128(n), &op3.ymm128(n), i->Ib() & 3); } BX_WRITE_YMM_REGZ_VLEN(i->dst(), result, len); BX_NEXT_INSTR(i); } #endif