From 55e1d53a48103d7a71ec1d5084c30baa125aa8de Mon Sep 17 00:00:00 2001 From: Stanislav Shwartsman Date: Sun, 2 Feb 2014 18:57:25 +0000 Subject: [PATCH] implement DPPS/DPPD ops using existing primitives; added some missing defs --- bochs/cpu/avx_pfp.cc | 31 ++++++++---------- bochs/cpu/cpu.h | 3 ++ bochs/cpu/fetchdecode.h | 5 +-- bochs/cpu/ia_opcodes.h | 2 +- bochs/cpu/sse_pfp.cc | 72 +++++++++++++++++++---------------------- 5 files changed, 55 insertions(+), 58 deletions(-) diff --git a/bochs/cpu/avx_pfp.cc b/bochs/cpu/avx_pfp.cc index 747da7f52..f42340ae9 100644 --- a/bochs/cpu/avx_pfp.cc +++ b/bochs/cpu/avx_pfp.cc @@ -35,6 +35,7 @@ extern float32 approximate_rcp(float32 op); #include "fpu/softfloat-compare.h" #include "simd_pfp.h" +#include "simd_int.h" void BX_CPU_C::print_state_AVX(void) { @@ -1095,32 +1096,28 @@ BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::VDPPS_VpsHpsWpsIbR(bxInstruction_c for (unsigned n=0; n < len; n++) { + // op1: [A, B, C, D] + // op2: [E, F, G, H] + + // after multiplication: op1 = [AE, BF, CG, DH] xmm_mulps_mask(&op1.ymm128(n), &op2.ymm128(n), status, mask >> 4); - float32 tmp1 = float32_add(op1.ymm32u(n*4+0), op1.ymm32u(n*4+1), status); - float32 tmp2 = float32_add(op1.ymm32u(n*4+2), op1.ymm32u(n*4+3), status); + // shuffle op2 = [BF, AE, DH, CG] + xmm_shufps(&op2.ymm128(n), &op1.ymm128(n), &op1.ymm128(n), 0xb1); -#ifdef BX_DPPS_DPPD_NAN_MATCHING_HARDWARE - float32 r1 = float32_add(tmp1, tmp2, status); - float32 r2 = float32_add(tmp2, tmp1, status); + // op2 = [(BF+AE), (AE+BF), (DH+CG), (CG+DH)] + xmm_addps(&op2.ymm128(n), &op1.ymm128(n), status); - op1.ymm32u(n*4+0) = (mask & 0x01) ? r1 : 0; - op1.ymm32u(n*4+1) = (mask & 0x02) ? r1 : 0; - op1.ymm32u(n*4+2) = (mask & 0x04) ? r2 : 0; - op1.ymm32u(n*4+3) = (mask & 0x08) ? 
r2 : 0; -#else - float32 r = float32_add(tmp1, tmp2, status); + // shuffle op1 = [(DH+CG), (CG+DH), (BF+AE), (AE+BF)] + xmm_shufpd(&op1.ymm128(n), &op2.ymm128(n), &op2.ymm128(n), 0x1); - op1.ymm32u(n*4+0) = (mask & 0x01) ? r : 0; - op1.ymm32u(n*4+1) = (mask & 0x02) ? r : 0; - op1.ymm32u(n*4+2) = (mask & 0x04) ? r : 0; - op1.ymm32u(n*4+3) = (mask & 0x08) ? r : 0; -#endif + // op2 = [(BF+AE)+(DH+CG), (AE+BF)+(CG+DH), (DH+CG)+(BF+AE), (CG+DH)+(AE+BF)] + xmm_addps_mask(&op2.ymm128(n), &op1.ymm128(n), status, mask); } check_exceptionsSSE(get_exception_flags(status)); - BX_WRITE_YMM_REGZ_VLEN(i->dst(), op1, len); + BX_WRITE_YMM_REGZ_VLEN(i->dst(), op2, len); BX_NEXT_INSTR(i); } diff --git a/bochs/cpu/cpu.h b/bochs/cpu/cpu.h index 085d4b43e..8e14d5e12 100644 --- a/bochs/cpu/cpu.h +++ b/bochs/cpu/cpu.h @@ -435,6 +435,9 @@ struct BxExceptionInfo { #define BX_MSR_TSC_DEADLINE 0x6E0 +/* Intel MPX supervisor bound configuration register */ +#define BX_MSR_BNDCFGS 0xd90 + #define BX_MSR_MAX_INDEX 0x1000 enum { diff --git a/bochs/cpu/fetchdecode.h b/bochs/cpu/fetchdecode.h index cf4e89ab0..2b0bae7cc 100644 --- a/bochs/cpu/fetchdecode.h +++ b/bochs/cpu/fetchdecode.h @@ -2,7 +2,7 @@ // $Id$ ///////////////////////////////////////////////////////////////////////// // -// Copyright (c) 2013 Stanislav Shwartsman +// Copyright (c) 2013-2014 Stanislav Shwartsman // Written by Stanislav Shwartsman [sshwarts at sourceforge net] // // This library is free software; you can redistribute it and/or @@ -99,7 +99,8 @@ enum { BX_KMASK_REG = 0xA, BX_SEGREG = 0xB, BX_CREG = 0xC, - BX_DREG = 0xD + BX_DREG = 0xD, + BX_BOUNDS_REG = 0xE }; enum { diff --git a/bochs/cpu/ia_opcodes.h b/bochs/cpu/ia_opcodes.h index 87e0a36bd..662223aba 100644 --- a/bochs/cpu/ia_opcodes.h +++ b/bochs/cpu/ia_opcodes.h @@ -1360,7 +1360,7 @@ bx_define_opcode(BX_IA_TEST_EqId, &BX_CPU_C::TEST_EqIdM, &BX_CPU_C::TEST_EqIdR, bx_define_opcode(BX_IA_CMP_EqId, &BX_CPU_C::CMP_EqIdM, &BX_CPU_C::CMP_EqIdR, 0, OP_Eq, OP_sId, OP_NONE, 
OP_NONE, 0) bx_define_opcode(BX_IA_XCHG_EqGq, &BX_CPU_C::XCHG_EqGqM, &BX_CPU_C::XCHG_EqGqR, 0, OP_Eq, OP_Gq, OP_NONE, OP_NONE, 0) -bx_define_opcode(BX_IA_LEA_GqM, &BX_CPU_C::LEA_GqM, &BX_CPU_C::BxError, 0, OP_Gq, OP_Mq, OP_NONE, OP_NONE, 0) +bx_define_opcode(BX_IA_LEA_GqM, &BX_CPU_C::LEA_GqM, &BX_CPU_C::BxError, 0, OP_Gq, OP_M, OP_NONE, OP_NONE, 0) bx_define_opcode(BX_IA_MOV_Op64_GdEd, &BX_CPU_C::MOV64_GdEdM, &BX_CPU_C::MOV_GdEdR, 0, OP_Gd, OP_Ed, OP_NONE, OP_NONE, 0) bx_define_opcode(BX_IA_MOV_Op64_EdGd, &BX_CPU_C::MOV64_EdGdM, &BX_CPU_C::MOV_GdEdR, 0, OP_Ed, OP_Gd, OP_NONE, OP_NONE, 0) bx_define_opcode(BX_IA_MOV_GqEq, &BX_CPU_C::MOV_GqEqM, &BX_CPU_C::MOV_GqEqR, 0, OP_Gq, OP_Eq, OP_NONE, OP_NONE, 0) diff --git a/bochs/cpu/sse_pfp.cc b/bochs/cpu/sse_pfp.cc index 07c3a0be0..c932d4fcb 100644 --- a/bochs/cpu/sse_pfp.cc +++ b/bochs/cpu/sse_pfp.cc @@ -31,6 +31,7 @@ #include "fpu/softfloat-compare.h" #include "simd_pfp.h" +#include "simd_int.h" void BX_CPU_C::check_exceptionsSSE(int exceptions_flags) { @@ -1833,32 +1834,28 @@ BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::DPPS_VpsWpsIbR(bxInstruction_c *i) float_status_t status; mxcsr_to_softfloat_status_word(status, MXCSR); + // op1: [A, B, C, D] + // op2: [E, F, G, H] + + // after multiplication: op1 = [AE, BF, CG, DH] xmm_mulps_mask(&op1, &op2, status, mask >> 4); - - float32 tmp1 = float32_add(op1.xmm32u(0), op1.xmm32u(1), status); - float32 tmp2 = float32_add(op1.xmm32u(2), op1.xmm32u(3), status); - - op1.clear(); - -#ifdef BX_DPPS_DPPD_NAN_MATCHING_HARDWARE - float32 r1 = float32_add(tmp1, tmp2, status); - float32 r2 = float32_add(tmp2, tmp1, status); - - if (mask & 0x01) op1.xmm32u(0) = r1; - if (mask & 0x02) op1.xmm32u(1) = r1; - if (mask & 0x04) op1.xmm32u(2) = r2; - if (mask & 0x08) op1.xmm32u(3) = r2; -#else - float32 r = float32_add(tmp1, tmp2, status); - - if (mask & 0x01) op1.xmm32u(0) = r; - if (mask & 0x02) op1.xmm32u(1) = r; - if (mask & 0x04) op1.xmm32u(2) = r; - if (mask & 0x08) op1.xmm32u(3) = r; 
-#endif - check_exceptionsSSE(get_exception_flags(status)); - BX_WRITE_XMM_REG(i->dst(), op1); + + // shuffle op2 = [BF, AE, DH, CG] + xmm_shufps(&op2, &op1, &op1, 0xb1); + + // op2 = [(BF+AE), (AE+BF), (DH+CG), (CG+DH)] + xmm_addps(&op2, &op1, status); + check_exceptionsSSE(get_exception_flags(status)); + + // shuffle op1 = [(DH+CG), (CG+DH), (BF+AE), (AE+BF)] + xmm_shufpd(&op1, &op2, &op2, 0x1); + + // op2 = [(BF+AE)+(DH+CG), (AE+BF)+(CG+DH), (DH+CG)+(BF+AE), (CG+DH)+(AE+BF)] + xmm_addps_mask(&op2, &op1, status, mask); + check_exceptionsSSE(get_exception_flags(status)); + + BX_WRITE_XMM_REG(i->dst(), op2); BX_NEXT_INSTR(i); } @@ -1877,22 +1874,21 @@ BX_INSF_TYPE BX_CPP_AttrRegparmN(1) BX_CPU_C::DPPD_VpdHpdWpdIbR(bxInstruction_c float_status_t status; mxcsr_to_softfloat_status_word(status, MXCSR); + // op1: [A, B] + // op2: [C, D] + + // after multiplication: op1 = [AC, BD] xmm_mulpd_mask(&op1, &op2, status, mask >> 4); - - op2.clear(); - -#ifdef BX_DPPS_DPPD_NAN_MATCHING_HARDWARE - if (mask & 0x01) op2.xmm64u(0) = float64_add(op1.xmm64u(0), op1.xmm64u(1), status); - if (mask & 0x02) op2.xmm64u(1) = float64_add(op1.xmm64u(1), op1.xmm64u(0), status); -#else - float64 result = float64_add(op1.xmm64u(0), op1.xmm64u(1), status); - - if (mask & 0x01) op2.xmm64u(0) = result; - if (mask & 0x02) op2.xmm64u(1) = result; -#endif - check_exceptionsSSE(get_exception_flags(status)); - BX_WRITE_XMM_REGZ(i->dst(), op2, i->getVL()); + + // shuffle op2 = [BD, AC] + xmm_shufpd(&op2, &op1, &op1, 0x1); + + // op1 = [AC+BD, BD+AC] + xmm_addpd_mask(&op1, &op2, status, mask); + check_exceptionsSSE(get_exception_flags(status)); + + BX_WRITE_XMM_REGZ(i->dst(), op1, i->getVL()); BX_NEXT_INSTR(i); }