Implemented VRSQRT14 AVX-512 instructions & optimized legacy SSE RSQRTSS/PS instructions handling

//
// The table lookup was reverse-engineered from the VRSQRT14SS instruction implementation available
// in the Intel Software Development Emulator rev6.20 (released February 13, 2014)
// http://software.intel.com/en-us/articles/intel-software-development-emulator/
//

// TODO: find a better way to emulate these instructions; I am sure the HW doesn't have 64K-entry lookup tables

The only AVX-512 opcodes still missing now are:

512.66.0F38.W0 2C VSCALEFPS
512.66.0F38.W1 2C VSCALEFPD
NDS.LIG.66.0F38.W0 2D VSCALEFSS
NDS.LIG.66.0F38.W1 2D VSCALEFSD

512.66.0F3A.W0 08 VRNDSCALEPS
512.66.0F3A.W1 09 VRNDSCALEPD
NDS.LIG.66.0F3A.W0 0A VRNDSCALESS
NDS.LIG.66.0F3A.W1 0B VRNDSCALESD
This commit is contained in:
Stanislav Shwartsman 2014-02-25 18:57:49 +00:00
parent 47b56a2174
commit 01af7f5346
4 changed files with 8455 additions and 58 deletions

View File

@ -144,6 +144,7 @@ AVX_OBJS = \
avx512_move.o \
avx512_pfp.o \
avx512_rcp14.o \
avx512_rsqrt14.o \
avx512_cvt.o \
avx512_fma.o \
avx512_mask16.o \
@ -335,6 +336,14 @@ avx512_rcp14.o: avx512_rcp14.@CPP_SUFFIX@ ../bochs.h ../config.h ../osdep.h \
fpu/softfloat.h fpu/tag_w.h fpu/status_w.h fpu/control_w.h xmm.h vmx.h \
svm.h stack.h fpu/softfloat-compare.h fpu/softfloat.h simd_int.h \
fpu/softfloat-specialize.h fpu/softfloat-round-pack.h
avx512_rsqrt14.o: avx512_rsqrt14.@CPP_SUFFIX@ ../bochs.h ../config.h ../osdep.h \
../bx_debug/debug.h ../config.h ../osdep.h ../gui/siminterface.h \
../cpudb.h ../gui/paramtree.h ../memory/memory.h ../pc_system.h \
../gui/gui.h ../instrument/stubs/instrument.h cpu.h cpuid.h crregs.h \
descriptor.h instr.h ia_opcodes.h lazy_flags.h icache.h apic.h i387.h \
fpu/softfloat.h fpu/tag_w.h fpu/status_w.h fpu/control_w.h xmm.h vmx.h \
svm.h stack.h fpu/softfloat-compare.h fpu/softfloat.h simd_int.h \
fpu/softfloat-specialize.h fpu/softfloat-round-pack.h
bcd.o: bcd.@CPP_SUFFIX@ ../bochs.h ../config.h ../osdep.h ../bx_debug/debug.h \
../config.h ../osdep.h ../gui/siminterface.h ../cpudb.h \
../gui/paramtree.h ../memory/memory.h ../pc_system.h ../gui/gui.h \

View File

@ -8273,8 +8273,7 @@ float32 approximate_rcp14(float32 op, const float_status_t &status)
Bit32u fraction = float32_fraction(op);
Bit16s exp = float32_exp(op);
switch(op_class)
{
switch(op_class) {
case float_zero:
return packFloat32(sign, 0xFF, 0);
@ -8325,8 +8324,7 @@ float64 approximate_rcp14(float64 op, const float_status_t &status)
Bit64u fraction = float64_fraction(op);
Bit16s exp = float64_exp(op);
switch(op_class)
{
switch(op_class) {
case float_zero:
return packFloat64(sign, 0x7FF, 0);
@ -8474,32 +8472,4 @@ BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRCP14SD_MASK_VsdHpdWsdR(bxInstruc
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRSQRT14PS_MASK_VpsWpsR(bxInstruction_c *i)
{
BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort()));
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRSQRT14PD_MASK_VpdWpdR(bxInstruction_c *i)
{
BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort()));
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRSQRT14SS_MASK_VssHpsWssR(bxInstruction_c *i)
{
BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort()));
BX_NEXT_INSTR(i);
}
BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VRSQRT14SD_MASK_VsdHpdWsdR(bxInstruction_c *i)
{
BX_PANIC(("%s: AVX-512 instruction still not implemented", i->getIaOpcodeNameShort()));
BX_NEXT_INSTR(i);
}
#endif

8426
bochs/cpu/avx512_rsqrt14.cc Executable file

File diff suppressed because it is too large Load Diff

View File

@ -295,21 +295,20 @@ float32 approximate_rcp(float32 op)
float_class_t op_class = float32_class(op);
int sign = float32_sign(op);
switch(op_class)
{
switch(op_class) {
case float_zero:
case float_denormal:
return packFloat32(sign, 0xFF, 0);
return packFloat32(sign, 0xFF, 0);
case float_negative_inf:
case float_positive_inf:
return packFloat32(sign, 0x00, 0);
return packFloat32(sign, 0, 0);
case float_NaN:
return convert_to_QNaN(op);
return convert_to_QNaN(op);
case float_normalized:
break;
break;
}
Bit32u fraction = float32_fraction(op);
@ -327,7 +326,7 @@ float32 approximate_rcp(float32 op)
/* check for underflow */
if (exp <= 0)
return packFloat32(sign, 0x00, 0);
return packFloat32(sign, 0, 0);
return packFloat32(sign, exp, (Bit32u)(rcp_table[fraction >> 12]) << 8);
}
@ -373,8 +372,7 @@ BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::RCPSS_VssWssR(bxInstruction_c *i)
#if BX_CPU_LEVEL >= 6
Bit16u rsqrt_table0[1024] =
{
static const Bit16u rsqrt_table0[1024] = {
0x34f8, 0x34e0, 0x34d0, 0x34b8, 0x34a0, 0x3488, 0x3470, 0x3460,
0x3448, 0x3430, 0x3418, 0x3400, 0x33f0, 0x33d8, 0x33c0, 0x33a8,
0x3398, 0x3380, 0x3368, 0x3350, 0x3338, 0x3328, 0x3310, 0x32f8,
@ -505,8 +503,7 @@ Bit16u rsqrt_table0[1024] =
0x0040, 0x0038, 0x0030, 0x0028, 0x0020, 0x0018, 0x0010, 0x0008
};
Bit16u rsqrt_table1[1024] =
{
static const Bit16u rsqrt_table1[1024] = {
0x7ff0, 0x7fd0, 0x7fb0, 0x7f90, 0x7f70, 0x7f50, 0x7f30, 0x7f10,
0x7ef0, 0x7ed0, 0x7eb0, 0x7e90, 0x7e70, 0x7e58, 0x7e38, 0x7e18,
0x7df8, 0x7dd8, 0x7db8, 0x7d98, 0x7d78, 0x7d58, 0x7d38, 0x7d20,
@ -637,34 +634,32 @@ Bit16u rsqrt_table1[1024] =
0x3558, 0x3550, 0x3540, 0x3538, 0x3530, 0x3520, 0x3518, 0x3508
};
// approximate reciprocal sqrt of scalar single precision FP
float32 approximate_rsqrt(float32 op)
{
float_class_t op_class = float32_class(op);
int sign = float32_sign(op);
switch(op_class)
{
switch(op_class) {
case float_zero:
case float_denormal:
return packFloat32(sign, 0xFF, 0);
return packFloat32(sign, 0xFF, 0);
case float_positive_inf:
return 0;
return 0;
case float_negative_inf:
return float32_default_nan;
return float32_default_nan;
case float_NaN:
return convert_to_QNaN(op);
return convert_to_QNaN(op);
case float_normalized:
break;
break;
};
if (sign == 1)
return float32_default_nan;
return float32_default_nan;
Bit32u fraction = float32_fraction(op);
Bit16s exp = float32_exp(op);
@ -677,14 +672,11 @@ float32 approximate_rsqrt(float32 op)
* Using two precalculated 1024-entry tables.
*/
Bit16u *rsqrt_table = (exp & 1) ? rsqrt_table1 : rsqrt_table0;
const Bit16u *rsqrt_table = (exp & 1) ? rsqrt_table1 : rsqrt_table0;
exp = 126 - ((exp - 127) >> 1);
/* check for underflow */
if (exp <= 0)
return packFloat32(sign, 0x00, 0);
exp = 0x7E - ((exp - 0x7F) >> 1);
return packFloat32(sign, exp, (Bit32u)(rsqrt_table[fraction >> 13]) << 8);
return packFloat32(0, exp, (Bit32u)(rsqrt_table[fraction >> 13]) << 8);
}
#endif