implemented AVX float16 convert instructions

2011-06-11 13:12:32 +00:00 · 2011-06-11 13:12:32 +00:00 · 8399dee24c
commit 8399dee24c
parent d7f19bcfd4
19 changed files with 464 additions and 44 deletions
--- a/bochs/.bochsrc
+++ b/bochs/.bochsrc
@ -179,6 +179,10 @@ cpu: count=1, ips=50000000, reset_on_triple_fault=1, ignore_bad_msrs=1, msrs="ms
 #  Select AVX instruction set support.
 #  This option exists only if Bochs compiled with --enable-avx option.
 #
+#  AVX_F16C:
+#  Select AVX float16 convert instructions support.
+#  This option exists only if Bochs compiled with --enable-avx option.
+#
 #  1G_PAGES:
 #  Enable 1G page size support in long mode.
 #  This option exists only if Bochs compiled with x86-64 support.
--- a/bochs/CHANGES
+++ b/bochs/CHANGES
@ -10,6 +10,8 @@ Bochs repository moved to the SVN version control !
  - Added support for AVX instruction set emulation, to enable configure with
        --enable-avx option.
    When compiled in, AVX still could be disabled using .bochsrc CPUID option.
+  - Added emulation of AVX float16 convert instructions, the feature can be
+    enabled using .bochsrc CPUID option.
  - Updated/Fixed instrumentation callbacks.
  - Bugfixes for CPU emulation correctness and stability.

--- a/bochs/PARAM_TREE.txt
+++ b/bochs/PARAM_TREE.txt
@ -40,6 +40,7 @@ cpuid
  xsave
  xsaveopt
  avx
+  avx_f16c
  apic
  1g_pages
  pcid
--- a/bochs/config.cc
+++ b/bochs/config.cc
@ -419,6 +419,10 @@ void bx_init_options()
      "avx", "Support for AVX instruction set",
      "Support for AVX instruction set",
      0);
+  new bx_param_bool_c(cpuid_param,
+      "avx_f16c", "Support for AVX F16 convert instructions",
+      "Support for AVX F16 convert instructions",
+      0);
 #endif
 #if BX_SUPPORT_X86_64
  new bx_param_bool_c(cpuid_param,
@ -2714,6 +2718,10 @@ static int parse_line_formatted(const char *context, int num_params, char *param
        if (parse_param_bool(params[i], 4, BXPN_CPUID_AVX) < 0) {
          PARSE_ERR(("%s: cpuid directive malformed.", context));
        }
+      } else if (!strncmp(params[i], "avx_f16c=", 9)) {
+        if (parse_param_bool(params[i], 9, BXPN_CPUID_AVX_F16CVT) < 0) {
+          PARSE_ERR(("%s: cpuid directive malformed.", context));
+        }
 #endif
 #if BX_SUPPORT_X86_64
      } else if (!strncmp(params[i], "1g_pages=", 9)) {
@ -3951,7 +3959,9 @@ int bx_write_configuration(const char *rc, int overwrite)
    SIM->get_param_bool(BXPN_CPUID_MOVBE)->get(),
    SIM->get_param_bool(BXPN_CPUID_SMEP)->get());
 #if BX_SUPPORT_AVX
-  fprintf(fp, ", avx=%d", SIM->get_param_bool(BXPN_CPUID_AVX)->get());
+  fprintf(fp, ", avx=%d, avx_f16c=%d", 
+    SIM->get_param_bool(BXPN_CPUID_AVX)->get(),
+    SIM->get_param_bool(BXPN_CPUID_AVX_F16CVT)->get());
 #endif
 #if BX_SUPPORT_X86_64
  fprintf(fp, ", 1g_pages=%d, pcid=%d, fsgsbase=%d",
--- a/bochs/cpu/avx_pfp.cc
+++ b/bochs/cpu/avx_pfp.cc
@ -1643,4 +1643,68 @@ void BX_CPP_AttrRegparmN(1) BX_CPU_C::VDPPS_VpsWpsIbR(bxInstruction_c *i)
 #endif
 }

+/* Opcode: VEX.66.0F.3A.13 (VEX.W=0) */
+/* NOTE(review): the Intel SDM lists VCVTPH2PS as VEX.66.0F38.W0 13 /r with no
+ * imm8 byte - confirm the opcode map and immediate used by the decode table. */
+/*
+ * VCVTPH2PS (register-source form): widens the low 4*VL float16 elements of
+ * the source XMM register (i->rm()) to float32 and writes them to the AVX
+ * destination register (i->nnn()) through BX_WRITE_AVX_REGZ.
+ * SIMD floating-point exceptions collected during the conversion are
+ * delivered through check_exceptionsSSE() before the destination is written.
+ */
+void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCVTPH2PS_VpsWpsIbR(bxInstruction_c *i)
+{
+  BxPackedAvxRegister result;
+  BxPackedXmmRegister op = BX_READ_XMM_REG(i->rm());
+  unsigned len = i->getVL();
+
+  float_status_t status_word;
+  mxcsr_to_softfloat_status_word(status_word, MXCSR);
+
+  // MXCSR.DAZ is intentionally NOT applied here: for VCVTPH2PS the Intel SDM
+  // specifies that DAZ is ignored and a denormal float16 input is converted
+  // to the exact (normal) float32 value.
+  for (unsigned n=0; n < (4*len); n++) {
+    result.avx32u(n) = float16_to_float32(op.xmm16u(n), status_word);
+  }
+
+  // The SDM also states that no denormal exception is reported for this
+  // instruction, so drop that flag before checking for unmasked exceptions.
+  check_exceptionsSSE(status_word.float_exception_flags & ~float_flag_denormal);
+
+  BX_WRITE_AVX_REGZ(i->nnn(), result, len);
+}
+
+/* Opcode: VEX.66.0F.3A.1D (VEX.W=0) */
+/*
+ * VCVTPS2PH: narrows 4*VL float32 elements of the source AVX register
+ * (i->nnn()) to float16 and stores them either into the XMM register i->rm()
+ * (register form, upper part cleared) or to memory (memory form).
+ *
+ * imm8 bit 2 selects the rounding source: when it is clear, imm8[1:0]
+ * overrides the rounding mode taken from MXCSR; when it is set, the MXCSR
+ * rounding mode is used unchanged.
+ */
+void BX_CPP_AttrRegparmN(1) BX_CPU_C::VCVTPS2PH_WpsVpsIb(bxInstruction_c *i)
+{
+  BxPackedAvxRegister op = BX_READ_AVX_REG(i->nnn());
+  BxPackedXmmRegister result;
+
+  result.xmm64u(1) = 0; /* clear upper part of the result for case of VL128 */
+
+  float_status_t status_word;
+  mxcsr_to_softfloat_status_word(status_word, MXCSR);
+  unsigned len = i->getVL();
+
+  // MXCSR.FTZ is ignored by VCVTPS2PH (Intel SDM): tiny results must be
+  // delivered as float16 denormals instead of being flushed to zero.
+  status_word.flush_underflow_to_zero = 0;
+
+  Bit8u control = i->Ib();
+
+  // override MXCSR rounding mode with control coming from imm8
+  if ((control & 0x4) == 0)
+    status_word.float_rounding_mode = control & 0x3;
+
+  for (unsigned n=0; n < (4*len); n++) {
+
+    // MXCSR.DAZ does apply to the float32 source elements
+    if (MXCSR.get_DAZ())
+      op.avx32u(n) = float32_denormal_to_zero(op.avx32u(n));
+
+    result.xmm16u(n) = float32_to_float16(op.avx32u(n), status_word);
+  }
+
+  // deliver exceptions before any architectural state is modified
+  check_exceptionsSSE(status_word.float_exception_flags);
+
+  if (i->modC0()) {
+    BX_WRITE_XMM_REG_CLEAR_HIGH(i->rm(), result);
+  }
+  else {
+    bx_address eaddr = BX_CPU_CALL_METHODR(i->ResolveModrm, (i));
+
+    // VL256 produces 8 float16 values (128 bits); otherwise only the low
+    // 64 bits of the result are stored
+    if (len == BX_VL256)
+      write_virtual_dqword(i->seg(), eaddr, &result);
+    else
+      write_virtual_qword(i->seg(), eaddr, result.xmm64u(0));
+  }
+}
+
 #endif
--- a/bochs/cpu/cpu.h
+++ b/bochs/cpu/cpu.h
@ -2500,6 +2500,9 @@ public: // for now...
  BX_SMF void VMASKMOVPD_VpdMpd(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
  BX_SMF void VMASKMOVPS_MpsVps(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
  BX_SMF void VMASKMOVPD_MpdVpd(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
+
+  BX_SMF void VCVTPH2PS_VpsWpsIbR(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
+  BX_SMF void VCVTPS2PH_WpsVpsIb(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
 #endif

  BX_SMF void CMPXCHG_XBTS(bxInstruction_c *) BX_CPP_AttrRegparmN(1);
--- a/bochs/cpu/cpuid.cc
+++ b/bochs/cpu/cpuid.cc
@ -98,7 +98,7 @@ Bit32u BX_CPU_C::get_extended_cpuid_features(void)
  // [26:26] XSAVE extensions support
  // [27:27] OSXSAVE support
  // [28:28] AVX extensions support
-  // [29:29] F16C - Float16 conversion support
+  // [29:29] AVX F16C - Float16 conversion support
  // [30:30] RDRAND instruction
  // [31:31] reserved

@ -154,6 +154,9 @@ Bit32u BX_CPU_C::get_extended_cpuid_features(void)
 #if BX_SUPPORT_AVX
  if (BX_CPUID_SUPPORT_ISA_EXTENSION(BX_CPU_AVX))
    features |= BX_CPUID_EXT_AVX;
+
+  if (BX_CPUID_SUPPORT_ISA_EXTENSION(BX_CPU_AVX_F16C))
+    features |= BX_CPUID_EXT_AVX_F16C;
 #endif

  return features;
@ -1138,6 +1141,16 @@ void BX_CPU_C::init_isa_features_bitmask(void)
      return;
    }
  }
+
+  static bx_bool avx_f16c_enabled = SIM->get_param_bool(BXPN_CPUID_AVX_F16CVT)->get();
+  if (avx_f16c_enabled) {
+    if (! avx_enabled) {
+      BX_PANIC(("PANIC: Float16 convert emulation requires AVX support !"));
+      return;
+    }
+
+    features_bitmask |= BX_CPU_AVX_F16C;
+  }
 #endif

 #if BX_SUPPORT_VMX
--- a/bochs/cpu/cpuid.h
+++ b/bochs/cpu/cpuid.h
@ -132,7 +132,7 @@ struct cpuid_function_t {
 // [26:26] XSAVE extensions support
 // [27:27] OSXSAVE support
 // [28:28] AVX extensions support
-// [29:29] F16C - Float16 conversion support
+// [29:29] AVX F16C - Float16 conversion support
 // [30:30] RDRAND instruction
 // [31:31] reserved

@ -165,7 +165,7 @@ struct cpuid_function_t {
 #define BX_CPUID_EXT_XSAVE                   (1 << 26)
 #define BX_CPUID_EXT_OSXSAVE                 (1 << 27)
 #define BX_CPUID_EXT_AVX                     (1 << 28)
-#define BX_CPUID_EXT_F16C                    (1 << 29)
+#define BX_CPUID_EXT_AVX_F16C                (1 << 29)
 #define BX_CPUID_EXT_RDRAND                  (1 << 30)
 #define BX_CPUID_EXT_RESERVED31              (1 << 31)

--- a/bochs/cpu/fetchdecode_avx.h
+++ b/bochs/cpu/fetchdecode_avx.h
@ -936,7 +936,7 @@ static const BxOpcodeInfo_t BxOpcodeTableAVX[256*3] = {
  /* 10 */ { 0, BX_IA_ERROR },
  /* 11 */ { 0, BX_IA_ERROR },
  /* 12 */ { 0, BX_IA_ERROR },
-  /* 13 */ { 0, BX_IA_ERROR },
+  /* 13 */ { BxPrefixSSE66 | BxVexW0 | BxImmediate_Ib, BX_IA_VCVTPH2PS_VpsWpsIb },
  /* 14 */ { BxPrefixSSE66 | BxImmediate_Ib, BX_IA_VPEXTRB_EbdVdqIb },
  /* 15 */ { BxPrefixSSE66 | BxImmediate_Ib, BX_IA_VPEXTRW_EwdVdqIb },
  /* 16 */ { BxSplitVexW | BxImmediate_Ib, BX_IA_ERROR, BxOpcodeInfoAVX_VexW_0f3a16 },
@ -946,7 +946,7 @@ static const BxOpcodeInfo_t BxOpcodeTableAVX[256*3] = {
  /* 1A */ { 0, BX_IA_ERROR },
  /* 1B */ { 0, BX_IA_ERROR },
  /* 1C */ { 0, BX_IA_ERROR },
-  /* 1D */ { 0, BX_IA_ERROR },
+  /* 1D */ { BxPrefixSSE66 | BxVexW0 | BxImmediate_Ib, BX_IA_VCVTPS2PH_WpsVpsIb },
  /* 1E */ { 0, BX_IA_ERROR },
  /* 1F */ { 0, BX_IA_ERROR },
  /* 20 */ { BxPrefixSSE66 | BxImmediate_Ib, BX_IA_VPINSRB_VdqEbIb },
--- a/bochs/cpu/ia_opcodes.h
+++ b/bochs/cpu/ia_opcodes.h
@ -1645,5 +1645,8 @@ bx_define_opcode(BX_IA_VPEXTRQ_EqVdqIb, &BX_CPU_C::PEXTRD_EdVdqIbM, &BX_CPU_C::P
 bx_define_opcode(BX_IA_VPINSRQ_VdqEqIb, &BX_CPU_C::PINSRD_VdqEdIbM, &BX_CPU_C::PINSRD_VdqEdIbR, BX_CPU_AVX | BX_CPU_X86_64, BX_PREPARE_AVX | BX_VEX_L128)
 bx_define_opcode(BX_IA_VMOVQ_VdqEq, &BX_CPU_C::MOVQ_VqWqM, &BX_CPU_C::MOVQ_VdqEqR, BX_CPU_AVX | BX_CPU_X86_64, BX_PREPARE_AVX | BX_VEX_L128)
 bx_define_opcode(BX_IA_VMOVQ_EqVq, &BX_CPU_C::MOVLPS_MqVps, &BX_CPU_C::MOVQ_EqVqR, BX_CPU_AVX | BX_CPU_X86_64, BX_PREPARE_AVX | BX_VEX_L128)
+
+bx_define_opcode(BX_IA_VCVTPH2PS_VpsWpsIb, &BX_CPU_C::LOAD_VectorQ, &BX_CPU_C::VCVTPH2PS_VpsWpsIbR, BX_CPU_AVX_F16C, BX_PREPARE_AVX | BX_VEX_NO_VVV | BX_VEX_L128 | BX_VEX_L256)
+bx_define_opcode(BX_IA_VCVTPS2PH_WpsVpsIb, &BX_CPU_C::VCVTPS2PH_WpsVpsIb, &BX_CPU_C::VCVTPS2PH_WpsVpsIb, BX_CPU_AVX_F16C, BX_PREPARE_AVX | BX_VEX_NO_VVV | BX_VEX_L128 | BX_VEX_L256)
 #endif
 // AVX
--- a/bochs/doc/docbook/user/user.dbk
+++ b/bochs/doc/docbook/user/user.dbk
@ -3097,6 +3097,11 @@ This option exists only if Bochs compiled with BX_CPU_LEVEL >= 6.
 Select AVX instruction set support.
 This option exists only if Bochs compiled with --enable-avx option.
 </para>
+<para><command>avx_f16c</command></para>
+<para>
+Select AVX float16 convert instructions support.
+This option exists only if Bochs compiled with --enable-avx option.
+</para>
 <para><command>1g_pages</command></para>
 <para>
 Enable 1G page size support in long mode.
--- a/bochs/doc/man/bochsrc.5
+++ b/bochs/doc/man/bochsrc.5
@ -234,6 +234,11 @@ avx:
 Select AVX instruction set support.
 This option exists only if Bochs compiled with --enable-avx option.

+avx_f16c:
+
+Select AVX float16 convert instructions support.
+This option exists only if Bochs compiled with --enable-avx option.
+
 1g_pages:

 Enable 1G page size support in long mode.
--- a/bochs/fpu/Makefile.in
+++ b/bochs/fpu/Makefile.in
@ -44,7 +44,7 @@ BX_INCDIRS = -I.. -I$(srcdir)/.. -I../@INSTRUMENT_DIR@ -I$(srcdir)/../@INSTRUMEN
 OBJS = ferr.o fpu.o fpu_arith.o fpu_compare.o fpu_const.o \
               fpu_load_store.o fpu_misc.o fpu_trans.o fpu_tags.o \
               fprem.o fsincos.o f2xm1.o fyl2x.o fpatan.o \
-               softfloat.o softfloatx80.o softfloat-specialize.o \
+               softfloat.o softfloatx80.o softfloat16.o softfloat-specialize.o \
               softfloat-round-pack.o poly.o

 all: libfpu.a
@ -176,3 +176,5 @@ softfloat-specialize.o: softfloat-specialize.@CPP_SUFFIX@ softfloat.h ../config.
  softfloat-specialize.h softfloat-macros.h
 softfloatx80.o: softfloatx80.@CPP_SUFFIX@ softfloatx80.h softfloat.h ../config.h \
  softfloat-specialize.h softfloat-round-pack.h softfloat-macros.h
+softfloat16.o: softfloat16.@CPP_SUFFIX@ softfloat.h ../config.h \
+  softfloat-specialize.h softfloat-round-pack.h softfloat-macros.h
--- a/bochs/fpu/softfloat-macros.h
+++ b/bochs/fpu/softfloat-macros.h
@ -35,13 +35,37 @@ these four paragraphs for those parts of this code that are retained.
 #ifndef _SOFTFLOAT_MACROS_H_
 #define _SOFTFLOAT_MACROS_H_

+/*----------------------------------------------------------------------------
+| Shifts `a' right by the number of bits given in `count'.  If any nonzero
+| bits are shifted off, they are ``jammed'' into the least significant bit of
+| the result by setting the least significant bit to 1.  The value of `count'
+| can be arbitrarily large; in particular, if `count' is 16 or greater, the
+| result will be either 0 or 1, depending on whether `a' is zero or nonzero.
+| `count' is expected to be non-negative.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit16u shift16RightJamming(Bit16u a, int count)
+{
+    Bit16u z;
+
+    if (count == 0) {
+        z = a;
+    }
+    else if (count < 16) {
+        /* `a' is promoted to int before the left shift, so the shifted value
+           must be truncated back to 16 bits; otherwise the bits that were NOT
+           shifted off would also make the sticky test nonzero. */
+        Bit16u shiftedOff = (Bit16u) (a<<((-count) & 15));
+        z = (a>>count) | (shiftedOff != 0);
+    }
+    else {
+        z = (a != 0);
+    }
+
+    return z;
+}
+
 /*----------------------------------------------------------------------------
 | Shifts `a' right by the number of bits given in `count'.  If any nonzero
 | bits are shifted off, they are ``jammed'' into the least significant bit of
 | the result by setting the least significant bit to 1.  The value of `count'
 | can be arbitrarily large; in particular, if `count' is greater than 32, the
 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
-| The result is stored in the location pointed to by `zPtr'.
 *----------------------------------------------------------------------------*/

 BX_CPP_INLINE Bit32u shift32RightJamming(Bit32u a, int count)
@ -67,7 +91,6 @@ BX_CPP_INLINE Bit32u shift32RightJamming(Bit32u a, int count)
 | the result by setting the least significant bit to 1.  The value of `count'
 | can be arbitrarily large; in particular, if `count' is greater than 64, the
 | result will be either 0 or 1, depending on whether `a' is zero or nonzero.
-| The result is stored in the location pointed to by `zPtr'.
 *----------------------------------------------------------------------------*/

 BX_CPP_INLINE Bit64u shift64RightJamming(Bit64u a, int count)
@ -104,9 +127,7 @@ BX_CPP_INLINE Bit64u shift64RightJamming(Bit64u a, int count)
 | described above, and is returned at the location pointed to by `z1Ptr'.)
 *----------------------------------------------------------------------------*/

-BX_CPP_INLINE void
- shift64ExtraRightJamming(
-     Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
+BX_CPP_INLINE void shift64ExtraRightJamming(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
 {
    Bit64u z0, z1;
    int negCount = (-count) & 63;
@ -139,8 +160,7 @@ BX_CPP_INLINE void
 | are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 *----------------------------------------------------------------------------*/

-BX_CPP_INLINE void
- add128(Bit64u a0, Bit64u a1, Bit64u b0, Bit64u b1, Bit64u *z0Ptr, Bit64u *z1Ptr)
+BX_CPP_INLINE void add128(Bit64u a0, Bit64u a1, Bit64u b0, Bit64u b1, Bit64u *z0Ptr, Bit64u *z1Ptr)
 {
    Bit64u z1 = a1 + b1;
    *z1Ptr = z1;
@ -261,31 +281,52 @@ static Bit32u estimateSqrt32(Bit16s aExp, Bit32u a)
 }
 #endif

+/*----------------------------------------------------------------------------
+| Lookup table indexed by an 8-bit value: entry `v' holds the number of
+| leading 0 bits before the most-significant 1 bit of `v' (8 for v == 0).
+| Used by the countLeadingZeros helpers in this file on the top byte.
+*----------------------------------------------------------------------------*/
+static const int countLeadingZeros8[] = {
+  8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+#ifdef FLOAT16
+
+/*----------------------------------------------------------------------------
+| Counts the 0 bits above the most-significant 1 bit of the 16-bit value
+| `a'.  A zero input yields 16.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int countLeadingZeros16(Bit16u a)
+{
+    /* upper byte is nonzero: its leading zeros are the whole answer */
+    if (a >= 0x100)
+        return countLeadingZeros8[a>>8];
+
+    /* upper byte is all zeros: 8 plus the leading zeros of the lower byte */
+    return 8 + countLeadingZeros8[a];
+}
+
+#endif
+
 /*----------------------------------------------------------------------------
 | Returns the number of leading 0 bits before the most-significant 1 bit of
 | `a'.  If `a' is zero, 32 is returned.
 *----------------------------------------------------------------------------*/

-static int countLeadingZeros32(Bit32u a)
+BX_CPP_INLINE int countLeadingZeros32(Bit32u a)
 {
-    static const int countLeadingZerosHigh[] = {
-        8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
-        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-    };
    int shiftCount = 0;
    if (a < 0x10000) {
        shiftCount += 16;
@ -295,7 +336,7 @@ static int countLeadingZeros32(Bit32u a)
        shiftCount += 8;
        a <<= 8;
    }
-    shiftCount += countLeadingZerosHigh[ a>>24 ];
+    shiftCount += countLeadingZeros8[a>>24];
    return shiftCount;
 }

@ -307,13 +348,13 @@ static int countLeadingZeros32(Bit32u a)
 BX_CPP_INLINE int countLeadingZeros64(Bit64u a)
 {
    int shiftCount = 0;
-    if (a < ((Bit64u) 1)<<32) {
+    if (a < BX_CONST64(0x100000000)) {
        shiftCount += 32;
    }
    else {
        a >>= 32;
    }
-    shiftCount += countLeadingZeros32((int)(a));
+    shiftCount += countLeadingZeros32((Bit32u)(a));
    return shiftCount;
 }

@ -327,8 +368,7 @@ BX_CPP_INLINE int countLeadingZeros64(Bit64u a)
 | which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 *----------------------------------------------------------------------------*/

-BX_CPP_INLINE void
- shift128Right(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
+BX_CPP_INLINE void shift128Right(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
 {
    Bit64u z0, z1;
    int negCount = (-count) & 63;
@ -360,9 +400,7 @@ BX_CPP_INLINE void
 | the locations pointed to by `z0Ptr' and `z1Ptr'.
 *----------------------------------------------------------------------------*/

-BX_CPP_INLINE void
- shift128RightJamming(
-     Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
+BX_CPP_INLINE void shift128RightJamming(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
 {
    Bit64u z0, z1;
    int negCount = (-count) & 63;
@ -398,9 +436,7 @@ BX_CPP_INLINE void
 | pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'.
 *----------------------------------------------------------------------------*/

-BX_CPP_INLINE void
- shortShift128Left(
-     Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
+BX_CPP_INLINE void shortShift128Left(Bit64u a0, Bit64u a1, int count, Bit64u *z0Ptr, Bit64u *z1Ptr)
 {
    *z1Ptr = a1<<count;
    *z0Ptr = (count == 0) ? a0 : (a0<<count) | (a1>>((-count) & 63));
--- a/bochs/fpu/softfloat-round-pack.cc
+++ b/bochs/fpu/softfloat-round-pack.cc
@ -148,6 +148,102 @@ Bit64s roundAndPackInt64(int zSign, Bit64u absZ0, Bit64u absZ1, float_status_t &
    return z;
 }

+#ifdef FLOAT16
+
+/*----------------------------------------------------------------------------
+| Normalizes a subnormal half-precision significand.  `aSig' is shifted left
+| until its most-significant 1 bit sits at bit 10; the shifted significand is
+| written to `*zSigPtr' and the matching exponent (1 minus the shift distance)
+| is written to `*zExpPtr'.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat16Subnormal(Bit16u aSig, Bit16s *zExpPtr, Bit16u *zSigPtr)
+{
+    /* a value with bit 10 as its top set bit has exactly 5 leading zeros */
+    const int leftShift = countLeadingZeros16(aSig) - 5;
+
+    *zExpPtr = 1 - leftShift;
+    *zSigPtr = aSig<<leftShift;
+}
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper half-precision floating-
+| point value corresponding to the abstract input.  Ordinarily, the abstract
+| value is simply rounded and packed into the half-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal half-
+| precision floating-point number.
+|     The input significand `zSig' has its binary point between bits 14
+| and 13, which is 4 bits to the left of the usual location.  This shifted
+| significand must be normalized or smaller.  If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding.  In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float16 roundAndPackFloat16(int zSign, Bit16s zExp, Bit16u zSig, float_status_t &status)
+{
+    Bit16s roundIncrement, roundBits, roundMask;
+
+    int roundingMode = get_float_rounding_mode(status);
+    int roundNearestEven = (roundingMode == float_round_nearest_even);
+    /* the low 4 bits of zSig are rounding bits; 8 is exactly one half ULP */
+    roundIncrement = 8;
+    roundMask = 0xF;
+
+    if (! roundNearestEven) {
+        if (roundingMode == float_round_to_zero) roundIncrement = 0;
+        else {
+            /* directed rounding: add almost one ULP when rounding away from
+               zero, nothing when rounding toward zero for this sign */
+            roundIncrement = roundMask;
+            if (zSign) {
+                if (roundingMode == float_round_up) roundIncrement = 0;
+            }
+            else {
+                if (roundingMode == float_round_down) roundIncrement = 0;
+            }
+        }
+    }
+    roundBits = zSig & roundMask;
+    /* the unsigned compare catches both too-large and negative exponents */
+    if (0x1D <= (Bit16u) zExp) {
+        if ((0x1D < zExp)
+             || ((zExp == 0x1D)
+                  && ((Bit16s) (zSig + roundIncrement) < 0)))
+        {
+            float_raise(status, float_flag_overflow);
+            if (roundBits || float_exception_masked(status, float_flag_overflow)) {
+                float_raise(status, float_flag_inexact);
+            }
+            /* infinity, or the largest finite value when the rounding mode
+               does not round away from zero for this sign */
+            return packFloat16(zSign, 0x1F, 0) - (roundIncrement == 0);
+        }
+        if (zExp < 0) {
+            int isTiny = (zExp < -1) || (zSig + roundIncrement < 0x8000);
+            zSig = shift16RightJamming(zSig, -zExp);
+            zExp = 0;
+            roundBits = zSig & roundMask;
+            if (isTiny) {
+                if(get_flush_underflow_to_zero(status)) {
+                    float_raise(status, float_flag_underflow | float_flag_inexact);
+                    return packFloat16(zSign, 0, 0);
+                }
+                if (roundBits || !float_exception_masked(status, float_flag_underflow)) {
+                    float_raise(status, float_flag_underflow);
+                }
+            }
+        }
+    }
+    if (roundBits) float_raise(status, float_flag_inexact);
+    Bit16u zSigRound = ((zSig + roundIncrement) & ~roundMask) >> 4;
+    /* exact tie (rounding bits == half ULP == 8) in round-to-nearest-even
+       mode: clear the LSB so the result is even */
+    zSigRound &= ~(((roundBits ^ 8) == 0) & roundNearestEven);
+    if (zSigRound == 0) zExp = 0;
+    return packFloat16(zSign, zExp, zSigRound);
+}
+
+#endif
+
 /*----------------------------------------------------------------------------
 | Normalizes the subnormal single-precision floating-point value represented
 | by the denormalized significand `aSig'.  The normalized exponent and
--- a/bochs/fpu/softfloat-round-pack.h
+++ b/bochs/fpu/softfloat-round-pack.h
@ -63,6 +63,43 @@ Bit32s roundAndPackInt32(int zSign, Bit64u absZ, float_status_t &status);

 Bit64s roundAndPackInt64(int zSign, Bit64u absZ0, Bit64u absZ1, float_status_t &status);

+#ifdef FLOAT16
+
+/*----------------------------------------------------------------------------
+| Normalizes the subnormal half-precision floating-point value represented
+| by the denormalized significand `aSig'.  The normalized exponent and
+| significand are stored at the locations pointed to by `zExpPtr' and
+| `zSigPtr', respectively.
+*----------------------------------------------------------------------------*/
+
+void normalizeFloat16Subnormal(Bit16u aSig, Bit16s *zExpPtr, Bit16u *zSigPtr);
+
+/*----------------------------------------------------------------------------
+| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
+| and significand `zSig', and returns the proper half-precision floating-
+| point value corresponding to the abstract input.  Ordinarily, the abstract
+| value is simply rounded and packed into the half-precision format, with
+| the inexact exception raised if the abstract input cannot be represented
+| exactly.  However, if the abstract value is too large, the overflow and
+| inexact exceptions are raised and an infinity or maximal finite value is
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal half-
+| precision floating-point number.
+|     The input significand `zSig' has its binary point between bits 14
+| and 13, which is 4 bits to the left of the usual location.  This shifted
+| significand must be normalized or smaller.  If `zSig' is not normalized,
+| `zExp' must be 0; in that case, the result returned is a subnormal number,
+| and it must not require rounding.  In the usual case that `zSig' is
+| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
+| The handling of underflow and overflow follows the IEC/IEEE Standard for
+| Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float16 roundAndPackFloat16(int zSign, Bit16s zExp, Bit16u zSig, float_status_t &status);
+
+#endif
+
 /*----------------------------------------------------------------------------
 | Normalizes the subnormal single-precision floating-point value represented
 | by the denormalized significand `aSig'.  The normalized exponent and
--- a/bochs/fpu/softfloat-specialize.h
+++ b/bochs/fpu/softfloat-specialize.h
@ -50,6 +50,128 @@ typedef struct {
    Bit64u hi, lo;
 } commonNaNT;

+#ifdef FLOAT16
+
+/*----------------------------------------------------------------------------
+| The pattern for a default generated half-precision NaN.
+*----------------------------------------------------------------------------*/
+#define float16_default_nan 0xFE00
+
+#define float16_fraction extractFloat16Frac
+#define float16_exp extractFloat16Exp
+#define float16_sign extractFloat16Sign
+
+/*----------------------------------------------------------------------------
+| Extracts the fraction field (the low 10 bits) of the half-precision
+| floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit16u extractFloat16Frac(float16 a)
+{
+    const Bit16u fractionMask = 0x3FF;
+    return (Bit16u) (a & fractionMask);
+}
+
+/*----------------------------------------------------------------------------
+| Extracts the 5-bit exponent field (bits 14..10) of the half-precision
+| floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE Bit16s extractFloat16Exp(float16 a)
+{
+    const Bit16s exponentField = (Bit16s) ((a >> 10) & 0x1F);
+    return exponentField;
+}
+
+/*----------------------------------------------------------------------------
+| Extracts the sign bit (bit 15) of the half-precision floating-point value
+| `a': 1 when the bit is set, 0 otherwise.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int extractFloat16Sign(float16 a)
+{
+    return (a & 0x8000) ? 1 : 0;
+}
+
+/*----------------------------------------------------------------------------
+| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
+| half-precision floating-point value, returning the result.  After being
+| shifted into the proper positions, the three fields are simply added
+| together to form the result.  This means that any integer portion of `zSig'
+| will be added into the exponent.  Since a properly normalized significand
+| will have an integer portion equal to 1, the `zExp' input should be 1 less
+| than the desired result exponent whenever `zSig' is a complete, normalized
+| significand.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float16 packFloat16(int zSign, int zExp, Bit16u zSig)
+{
+    return (((Bit16u) zSign)<<15) + (((Bit16u) zExp)<<10) + zSig;
+}
+
+/*----------------------------------------------------------------------------
+| Tests whether the half-precision floating-point value `a' is a NaN, i.e.
+| its magnitude bits (sign ignored) are above the infinity pattern 0x7C00.
+| Returns 1 for a NaN and 0 otherwise.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float16_is_nan(float16 a)
+{
+    const Bit16u magnitude = (Bit16u) (a & 0x7FFF);
+    return magnitude > 0x7C00;
+}
+
+/*----------------------------------------------------------------------------
+| Tests whether the half-precision floating-point value `a' is a signaling
+| NaN: exponent field all ones, top fraction bit (bit 9) clear, and at least
+| one of the remaining fraction bits set.  Returns 1 if so, 0 otherwise.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float16_is_signaling_nan(float16 a)
+{
+    /* bits 14..9: exponent all ones with bit 9 clear */
+    if ((a & 0x7E00) != 0x7C00) return 0;
+
+    /* the low 9 fraction bits must not all be zero (that would be infinity) */
+    return (a & 0x1FF) != 0;
+}
+
+/*----------------------------------------------------------------------------
+| Tests whether the half-precision floating-point value `a' is denormal:
+| exponent field zero with a nonzero fraction.  Returns 1 if so, 0 otherwise.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE int float16_is_denormal(float16 a)
+{
+    const int exponentIsZero = ((a & 0x7C00) == 0);
+    const int fractionIsNonzero = ((a & 0x03FF) != 0);
+    return exponentIsZero && fractionIsNonzero;
+}
+
+/*----------------------------------------------------------------------------
+| Replaces a denormal half-precision value by a zero of the same sign; any
+| other value is returned unchanged.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float16 float16_denormal_to_zero(float16 a)
+{
+  /* keep only the sign bit when flushing */
+  return float16_is_denormal(a) ? (float16) (a & 0x8000) : a;
+}
+
+/*----------------------------------------------------------------------------
+| Converts the half-precision floating-point NaN `a' to the canonical NaN
+| format.  The invalid exception is raised when `a' is a signaling NaN.
+| The 10 fraction bits of `a' end up in the top bits of `hi' (the shift by
+| 54 pushes the sign and exponent bits out of the 64-bit word).
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE commonNaNT float16ToCommonNaN(float16 a, float_status_t &status)
+{
+    if (float16_is_signaling_nan(a)) {
+        float_raise(status, float_flag_invalid);
+    }
+
+    commonNaNT result;
+    result.hi = ((Bit64u) a)<<54;
+    result.lo = 0;
+    result.sign = a>>15;
+    return result;
+}
+
+/*----------------------------------------------------------------------------
+| Converts the canonical NaN `a' to the half-precision floating-point
+| format.  The result always has the exponent field all ones and the top
+| fraction bit set (0x7E00), combined with the sign and the top 10 bits of
+| `a.hi'.
+*----------------------------------------------------------------------------*/
+
+BX_CPP_INLINE float16 commonNaNToFloat16(commonNaNT a)
+{
+    const Bit16u signBit = (Bit16u) (((Bit16u) a.sign)<<15);
+    const Bit16u payload = (Bit16u) (a.hi>>54);
+
+    return signBit | 0x7E00 | payload;
+}
+
+#endif
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated single-precision NaN.
 *----------------------------------------------------------------------------*/
--- a/bochs/fpu/softfloat.h
+++ b/bochs/fpu/softfloat.h
@ -37,11 +37,15 @@ these four paragraphs for those parts of this code that are retained.
 #ifndef _SOFTFLOAT_H_
 #define _SOFTFLOAT_H_

+#define FLOAT16
 #define FLOATX80

 /*----------------------------------------------------------------------------
 | Software IEC/IEEE floating-point types.
 *----------------------------------------------------------------------------*/
+#ifdef FLOAT16
+typedef Bit16u float16;
+#endif
 typedef Bit32u float32;
 typedef Bit64u float64;

@ -220,6 +224,7 @@ int float32_compare_quiet(float32, float32, float_status_t &status);
 float_class_t float32_class(float32);
 int float32_is_signaling_nan(float32);
 int float32_is_nan(float32);
+int float32_is_denormal(float32);

 /*----------------------------------------------------------------------------
 | Software IEC/IEEE double-precision conversion routines.
@ -246,6 +251,17 @@ int float64_compare_quiet(float64, float64, float_status_t &status);
 float_class_t float64_class(float64);
 int float64_is_signaling_nan(float64);
 int float64_is_nan(float64);
+int float64_is_denormal(float64);
+
+#ifdef FLOAT16
+float32 float16_to_float32(float16, float_status_t &status);
+float16 float32_to_float16(float32, float_status_t &status);
+
+float_class_t float16_class(float16);
+int float16_is_signaling_nan(float16);
+int float16_is_nan(float16);
+int float16_is_denormal(float16);
+#endif

 #ifdef FLOATX80

--- a/bochs/param_names.h
+++ b/bochs/param_names.h
@ -54,6 +54,7 @@
 #define BXPN_CPUID_XSAVE                 "cpuid.xsave"
 #define BXPN_CPUID_XSAVEOPT              "cpuid.xsaveopt"
 #define BXPN_CPUID_AVX                   "cpuid.avx"
+#define BXPN_CPUID_AVX_F16CVT            "cpuid.avx_f16c"
 #define BXPN_CPUID_APIC                  "cpuid.apic"
 #define BXPN_CPUID_MWAIT                 "cpuid.mwait"
 #define BXPN_CPUID_MWAIT_IS_NOP          "cpuid.mwait_is_nop"