From 695d245116c710220703d91048b2d7eda0975516 Mon Sep 17 00:00:00 2001 From: Stanislav Shwartsman Date: Thu, 27 Feb 2014 18:27:57 +0000 Subject: [PATCH] Implemented VRNDSCALE AVX-512 instructions. The only AVX-512 opcodes still missing are: 512.66.0F38.W0 2C VSCALEFPS 512.66.0F38.W1 2C VSCALEFPD NDS.LIG.66.0F38.W0 2D VSCALEFSS NDS.LIG.66.0F38.W1 2D VSCALEFSD --- bochs/cpu/avx512_pfp.cc | 146 ++++++++++++++++++++++++++++++++++--- bochs/cpu/fpu/softfloat.cc | 46 +++++++----- bochs/cpu/fpu/softfloat.h | 12 +++ 3 files changed, 175 insertions(+), 29 deletions(-) diff --git a/bochs/cpu/avx512_pfp.cc b/bochs/cpu/avx512_pfp.cc index 497e14751..b5c3392fe 100644 --- a/bochs/cpu/avx512_pfp.cc +++ b/bochs/cpu/avx512_pfp.cc @@ -960,34 +960,160 @@ BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VGETMANTPD_MASK_VpdWpdIbR(bxInstru BX_NEXT_INSTR(i); } +// rndscale + BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALEPS_MASK_VpsWpsIbR(bxInstruction_c *i) { - BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort())); + BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()); + Bit32u opmask = i->opmask() ? 
BX_READ_16BIT_OPMASK(i->opmask()) : (Bit32u) -1; + unsigned len = i->getVL(); - BX_NEXT_INSTR(i); -} + float_status_t status; + mxcsr_to_softfloat_status_word(status, MXCSR); + softfloat_status_word_rc_override(status, i); -BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALEPD_MASK_VpdWpdIbR(bxInstruction_c *i) -{ - BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort())); + Bit8u control = i->Ib(), scale = control >> 4; + + // override MXCSR rounding mode with control coming from imm8 + if ((control & 0x4) == 0) + status.float_rounding_mode = control & 0x3; + // ignore precision exception result + if (control & 0x8) + status.float_suppress_exception |= float_flag_inexact; + + for (unsigned n=0, mask = 0x1; n < DWORD_ELEMENTS(len); n++, mask <<= 1) { + if (opmask & mask) + op.vmm32u(n) = float32_round_to_int(op.vmm32u(n), scale, status); + else + op.vmm32u(n) = 0; + } + + check_exceptionsSSE(get_exception_flags(status)); + + if (! i->isZeroMasking()) { + for (unsigned n=0; n < len; n++, opmask >>= 4) + xmm_blendps(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), opmask); + BX_CLEAR_AVX_REGZ(i->dst(), len); + } + else { + BX_WRITE_AVX_REGZ(i->dst(), op, len); + } BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALESS_MASK_VssHpsWssIbR(bxInstruction_c *i) { - BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort())); + BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); + + if (! 
i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) { + float32 op2 = BX_READ_XMM_REG_LO_DWORD(i->src2()); + + Bit8u control = i->Ib(), scale = control >> 4; + + float_status_t status; + mxcsr_to_softfloat_status_word(status, MXCSR); + softfloat_status_word_rc_override(status, i); + + // override MXCSR rounding mode with control coming from imm8 + if ((control & 0x4) == 0) + status.float_rounding_mode = control & 0x3; + // ignore precision exception result + if (control & 0x8) + status.float_suppress_exception |= float_flag_inexact; + + op1.xmm32u(0) = float32_round_to_int(op2, scale, status); + + check_exceptionsSSE(get_exception_flags(status)); + } + else { + if (i->isZeroMasking()) + op1.xmm32u(0) = 0; + else + op1.xmm32u(0) = BX_READ_XMM_REG_LO_DWORD(i->dst()); + } + + BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); + BX_NEXT_INSTR(i); +} + +BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALEPD_MASK_VpdWpdIbR(bxInstruction_c *i) +{ + BxPackedAvxRegister op = BX_READ_AVX_REG(i->src()); + Bit32u opmask = i->opmask() ? BX_READ_8BIT_OPMASK(i->opmask()) : (Bit32u) -1; + unsigned len = i->getVL(); + + float_status_t status; + mxcsr_to_softfloat_status_word(status, MXCSR); + softfloat_status_word_rc_override(status, i); + + Bit8u control = i->Ib(), scale = control >> 4; + + // override MXCSR rounding mode with control coming from imm8 + if ((control & 0x4) == 0) + status.float_rounding_mode = control & 0x3; + // ignore precision exception result + if (control & 0x8) + status.float_suppress_exception |= float_flag_inexact; + + for (unsigned n=0, mask = 0x1; n < QWORD_ELEMENTS(len); n++, mask <<= 1) { + if (opmask & mask) + op.vmm64u(n) = float64_round_to_int(op.vmm64u(n), scale, status); + else + op.vmm64u(n) = 0; + } + + check_exceptionsSSE(get_exception_flags(status)); + + if (! 
i->isZeroMasking()) { + for (unsigned n=0; n < len; n++, opmask >>= 2) + xmm_blendpd(&BX_READ_AVX_REG_LANE(i->dst(), n), &op.vmm128(n), opmask); + BX_CLEAR_AVX_REGZ(i->dst(), len); + } + else { + BX_WRITE_AVX_REGZ(i->dst(), op, len); + } BX_NEXT_INSTR(i); } BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRNDSCALESD_MASK_VsdHpdWsdIbR(bxInstruction_c *i) { - BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort())); + BxPackedXmmRegister op1 = BX_READ_XMM_REG(i->src1()); + if (! i->opmask() || BX_SCALAR_ELEMENT_MASK(i->opmask())) { + float64 op2 = BX_READ_XMM_REG_LO_QWORD(i->src2()); + + Bit8u control = i->Ib(), scale = control >> 4; + + float_status_t status; + mxcsr_to_softfloat_status_word(status, MXCSR); + softfloat_status_word_rc_override(status, i); + + // override MXCSR rounding mode with control coming from imm8 + if ((control & 0x4) == 0) + status.float_rounding_mode = control & 0x3; + // ignore precision exception result + if (control & 0x8) + status.float_suppress_exception |= float_flag_inexact; + + op1.xmm64u(0) = float64_round_to_int(op2, scale, status); + + check_exceptionsSSE(get_exception_flags(status)); + } + else { + if (i->isZeroMasking()) + op1.xmm64u(0) = 0; + else + op1.xmm64u(0) = BX_READ_XMM_REG_LO_QWORD(i->dst()); + } + + BX_WRITE_XMM_REG_CLEAR_HIGH(i->dst(), op1); BX_NEXT_INSTR(i); } +// scale + BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFPS_MASK_VpsWpsR(bxInstruction_c *i) { BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort())); @@ -995,14 +1121,14 @@ BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFPS_MASK_VpsWpsR(bxInstructi BX_NEXT_INSTR(i); } -BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFPD_MASK_VpdWpdR(bxInstruction_c *i) +BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFSS_MASK_VssHpsWssR(bxInstruction_c *i) { BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort())); BX_NEXT_INSTR(i); } -BX_INSF_TYPE 
BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFSS_MASK_VssHpsWssR(bxInstruction_c *i) +BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VSCALEFPD_MASK_VpdWpdR(bxInstruction_c *i) { BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort())); diff --git a/bochs/cpu/fpu/softfloat.cc b/bochs/cpu/fpu/softfloat.cc index a82ddadd6..57a45cb36 100644 --- a/bochs/cpu/fpu/softfloat.cc +++ b/bochs/cpu/fpu/softfloat.cc @@ -508,16 +508,20 @@ float64 float32_to_float64(float32 a, float_status_t &status) | Floating-Point Arithmetic. *----------------------------------------------------------------------------*/ -float32 float32_round_to_int(float32 a, float_status_t &status) +float32 float32_round_to_int(float32 a, Bit8u scale, float_status_t &status) { Bit32u lastBitMask, roundBitsMask; int roundingMode = get_float_rounding_mode(status); - Bit16s aExp = extractFloat32Exp(a); + scale &= 0xf; + + if ((aExp == 0xFF) && extractFloat32Frac(a)) { + return propagateFloat32NaN(a, status); + } + + aExp += scale; // scale the exponent + if (0x96 <= aExp) { - if ((aExp == 0xFF) && extractFloat32Frac(a)) { - return propagateFloat32NaN(a, status); - } return a; } @@ -532,16 +536,17 @@ float32 float32_round_to_int(float32 a, float_status_t &status) switch (roundingMode) { case float_round_nearest_even: if ((aExp == 0x7E) && extractFloat32Frac(a)) { - return packFloat32(aSign, 0x7F, 0); + return packFloat32(aSign, 0x7F - scale, 0); } break; case float_round_down: - return aSign ? float32_negative_one : 0; + return aSign ? packFloat32(1, 0x7F - scale, 0) : float32_positive_zero; case float_round_up: - return aSign ? float32_negative_zero : float32_positive_one; + return aSign ? float32_negative_zero : packFloat32(0, 0x7F - scale, 0); } return packFloat32(aSign, 0, 0); } + lastBitMask = 1; lastBitMask <<= 0x96 - aExp; roundBitsMask = lastBitMask - 1; @@ -1610,18 +1615,20 @@ float32 float64_to_float32(float64 a, float_status_t &status) | Floating-Point Arithmetic. 
*----------------------------------------------------------------------------*/ -float64 float64_round_to_int(float64 a, float_status_t &status) +float64 float64_round_to_int(float64 a, Bit8u scale, float_status_t &status) { - Bit16s aExp; Bit64u lastBitMask, roundBitsMask; int roundingMode = get_float_rounding_mode(status); - float64 z; + Bit16s aExp = extractFloat64Exp(a); + scale &= 0xf; + + if ((aExp == 0x7FF) && extractFloat64Frac(a)) { + return propagateFloat64NaN(a, status); + } + + aExp += scale; // scale the exponent - aExp = extractFloat64Exp(a); if (0x433 <= aExp) { - if ((aExp == 0x7FF) && extractFloat64Frac(a)) { - return propagateFloat64NaN(a, status); - } return a; } @@ -1636,20 +1643,21 @@ float64 float64_round_to_int(float64 a, float_status_t &status) switch (roundingMode) { case float_round_nearest_even: if ((aExp == 0x3FE) && extractFloat64Frac(a)) { - return packFloat64(aSign, 0x3FF, 0); + return packFloat64(aSign, 0x3FF - scale, 0); } break; case float_round_down: - return aSign ? float64_negative_one : 0; + return aSign ? packFloat64(1, 0x3FF - scale, 0) : float64_positive_zero; case float_round_up: - return aSign ? float64_negative_zero : float64_positive_one; + return aSign ? float64_negative_zero : packFloat64(0, 0x3FF - scale, 0); } return packFloat64(aSign, 0, 0); } + lastBitMask = 1; lastBitMask <<= 0x433 - aExp; roundBitsMask = lastBitMask - 1; - z = a; + float64 z = a; if (roundingMode == float_round_nearest_even) { z += lastBitMask>>1; if ((z & roundBitsMask) == 0) z &= ~lastBitMask; diff --git a/bochs/cpu/fpu/softfloat.h b/bochs/cpu/fpu/softfloat.h index 1d4f2e40e..ddb9c1d6f 100644 --- a/bochs/cpu/fpu/softfloat.h +++ b/bochs/cpu/fpu/softfloat.h @@ -256,6 +256,7 @@ float64 float32_to_float64(float32, float_status_t &status); | Software IEC/IEEE single-precision operations. 
*----------------------------------------------------------------------------*/ float32 float32_round_to_int(float32, float_status_t &status); +float32 float32_round_to_int(float32, Bit8u scale, float_status_t &status); float32 float32_add(float32, float32, float_status_t &status); float32 float32_sub(float32, float32, float_status_t &status); float32 float32_mul(float32, float32, float_status_t &status); @@ -264,6 +265,11 @@ float32 float32_sqrt(float32, float_status_t &status); float32 float32_frc(float32, float_status_t &status); float32 float32_muladd(float32, float32, float32, int flags, float_status_t &status); +BX_CPP_INLINE float32 float32_round_to_int(float32 a, float_status_t &status) +{ + return float32_round_to_int(a, 0, status); +} + BX_CPP_INLINE float32 float32_fmadd(float32 a, float32 b, float32 c, float_status_t &status) { return float32_muladd(a, b, c, 0, status); @@ -315,6 +321,7 @@ float32 float64_to_float32(float64, float_status_t &status); | Software IEC/IEEE double-precision operations. *----------------------------------------------------------------------------*/ float64 float64_round_to_int(float64, float_status_t &status); +float64 float64_round_to_int(float64, Bit8u scale, float_status_t &status); float64 float64_add(float64, float64, float_status_t &status); float64 float64_sub(float64, float64, float_status_t &status); float64 float64_mul(float64, float64, float_status_t &status); @@ -323,6 +330,11 @@ float64 float64_sqrt(float64, float_status_t &status); float64 float64_frc(float64, float_status_t &status); float64 float64_muladd(float64, float64, float64, int flags, float_status_t &status); +BX_CPP_INLINE float64 float64_round_to_int(float64 a, float_status_t &status) +{ + return float64_round_to_int(a, 0, status); +} + BX_CPP_INLINE float64 float64_fmadd(float64 a, float64 b, float64 c, float_status_t &status) { return float64_muladd(a, b, c, 0, status);