target-arm queue:

* update MAINTAINERS for Alistair's new email address * add Arm v8.2 FP16 arithmetic extension for linux-user * implement display connector emulation for vexpress board * xilinx_spips: Enable only two slaves when reading/writing with stripe * xilinx_spips: Use 8 dummy cycles with the QIOR/QIOR4 commands * hw: register: Run post_write hook on reset -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQIcBAABCAAGBQJal+KGAAoJEDwlJe0UNgzeYkgP/jgaMPdRG1nSRL12SXhQi9yO O95PDRmnoGmtAzb1hOZmQATrcFmRoLnv1irCFVycGrGtfwnxXC7kuJVKI9QJ+T+0 r0jSg/TpKGchRFvIuu+JLHNttuonQln890dPJiR860TVclBjnD+PFvzEX1gI2Lhw gOnB+EL5UTMcs8Zj/HNqtAQfwQdW8yq/dDZW4/B0dQaBC0+/Qy+pRHCAp4nSbELI QLM/tIu2mz6++GlMbjN3Radkl3gdIuYWzcf9R5gZ4xlwaUwihDOOmSJ0x+41eyVy FUGfza5KEEBlRjU9ZzaJ/fIq4DMStIEugaEujr1UpKmwQB/kJoBX2iX6tP6ndgLf Pt3dxdOcJI4RyZzUZwEBUi0M4tnBZVCpOMb4zTw/IwS4ELhGiIGOeZD+j9UihTVr /Ply5G9/fC0mv4jVEQcug9FciRR6n59RNm1GRDKfElkUyU4AVSom3Up9UuWPofbx I0RjYrHcoOyaPy7t3gwllijfsy01RICxsAQbnCYWFtN+XeGfeNFQasyzualj/7fK Xe8nLywHoYMqfkIeogO9LTBUsRmO9Mk05QEVAAGdM9o3JoHZVR+u1Sc05CvCHPp/ wMiIYUOWzmLzpdhdWq9OqzIVr4fAhnrpI9Iz8gcfljCA7DQp9kboQRSPocJM6KRB mvM0AiNenrcEBLExUmjC =Gu1v -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20180301' into staging target-arm queue: * update MAINTAINERS for Alistair's new email address * add Arm v8.2 FP16 arithmetic extension for linux-user * implement display connector emulation for vexpress board * xilinx_spips: Enable only two slaves when reading/writing with stripe * xilinx_spips: Use 8 dummy cycles with the QIOR/QIOR4 commands * hw: register: Run post_write hook on reset # gpg: Signature made Thu 01 Mar 2018 11:22:46 GMT # gpg: using RSA key 3C2525ED14360CDE # gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" # gpg: aka "Peter Maydell <pmaydell@gmail.com>" # gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" # Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE * 
remotes/pmaydell/tags/pull-target-arm-20180301: (42 commits) MAINTAINERS: Update my email address linux-user: Report AArch64 FP16 support via hwcap bits target/arm: Enable ARM_V8_FP16 feature bit for the AArch64 "any" CPU arm/translate-a64: add all single op FP16 to handle_fp_1src_half arm/translate-a64: implement simd_scalar_three_reg_same_fp16 arm/translate-a64: add all FP16 ops in simd_scalar_pairwise arm/translate-a64: add FP16 FMOV to simd_mod_imm arm/translate-a64: add FP16 FRSQRTE to simd_two_reg_misc_fp16 arm/helper.c: re-factor rsqrte and add rsqrte_f16 arm/translate-a64: add FP16 FSQRT to simd_two_reg_misc_fp16 arm/translate-a64: add FP16 FRCPX to simd_two_reg_misc_fp16 arm/translate-a64: add FP16 FRECPE arm/helper.c: re-factor recpe and add recepe_f16 arm/translate-a64: add FP16 FNEG/FABS to simd_two_reg_misc_fp16 arm/translate-a64: add FP16 SCVTF/UCVFT to simd_two_reg_misc_fp16 arm/translate-a64: add FP16 FCMxx (zero) to simd_two_reg_misc_fp16 arm/translate-a64: add FCVTxx to simd_two_reg_misc_fp16 arm/translate-a64: add FP16 FPRINTx to simd_two_reg_misc_fp16 arm/translate-a64: initial decode for simd_two_reg_misc_fp16 arm/translate-a64: add FP16 x2 ops for simd_indexed ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2018-03-01 15:37:31 +00:00 · 2018-03-01 15:37:31 +00:00 · 9db0855e85
commit 9db0855e85
parent 8cb340c613 c22e580c2a
23 changed files with 1981 additions and 471 deletions
--- a/12
+++ b/12
@ -556,7 +556,7 @@ F: hw/misc/arm_sysctl.c
 Xilinx Zynq
 M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 L: qemu-arm@nongnu.org
 S: Maintained
 F: hw/*/xilinx_*
@ -566,7 +566,7 @@ F: include/hw/misc/zynq*
 X: hw/ssi/xilinx_*
 Xilinx ZynqMP
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 M: Edgar E. Iglesias <edgar.iglesias@gmail.com>
 L: qemu-arm@nongnu.org
 S: Maintained
@ -1075,7 +1075,7 @@ T: git git://github.com/bonzini/qemu.git scsi-next
 SSI
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 S: Maintained
 F: hw/ssi/*
 F: hw/block/m25p80.c
@ -1084,7 +1084,7 @@ X: hw/ssi/xilinx_*
 F: tests/m25p80-test.c
 Xilinx SPI
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 M: Peter Crosthwaite <crosthwaite.peter@gmail.com>
 S: Maintained
 F: hw/ssi/xilinx_*
@ -1254,7 +1254,7 @@ S: Maintained
 F: hw/net/eepro100.c
 Generic Loader
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 S: Maintained
 F: hw/core/generic-loader.c
 F: include/hw/core/generic-loader.h
@ -1600,7 +1600,7 @@ F: tests/qmp-test.c
 T: git git://repo.or.cz/qemu/armbru.git qapi-next
 Register API
-M: Alistair Francis <alistair.francis@xilinx.com>
+M: Alistair Francis <alistair@alistair23.me>
 S: Maintained
 F: hw/core/register.c
 F: include/hw/register.h
--- a/default-configs/arm-softmmu.mak
+++ b/default-configs/arm-softmmu.mak
@ -21,6 +21,8 @@ CONFIG_STELLARIS_INPUT=y
 CONFIG_STELLARIS_ENET=y
 CONFIG_SSD0303=y
 CONFIG_SSD0323=y
 CONFIG_DDC=y
 CONFIG_SII9022=y
 CONFIG_ADS7846=y
 CONFIG_MAX111X=y
 CONFIG_SSI=y
--- a/hw/arm/vexpress.c
+++ b/hw/arm/vexpress.c
@ -29,6 +29,7 @@
 #include "hw/arm/arm.h"
 #include "hw/arm/primecell.h"
 #include "hw/devices.h"
 #include "hw/i2c/i2c.h"
 #include "net/net.h"
 #include "sysemu/sysemu.h"
 #include "hw/boards.h"
@ -537,6 +538,7 @@ static void vexpress_common_init(MachineState *machine)
    uint32_t sys_id;
    DriveInfo *dinfo;
    pflash_t *pflash0;
    I2CBus *i2c;
    ram_addr_t vram_size, sram_size;
    MemoryRegion *sysmem = get_system_memory();
    MemoryRegion *vram = g_new(MemoryRegion, 1);
@ -628,7 +630,9 @@ static void vexpress_common_init(MachineState *machine)
    sysbus_create_simple("sp804", map[VE_TIMER01], pic[2]);
    sysbus_create_simple("sp804", map[VE_TIMER23], pic[3]);
-    /* VE_SERIALDVI: not modelled */
+    dev = sysbus_create_simple("versatile_i2c", map[VE_SERIALDVI], NULL);
    i2c = (I2CBus *)qdev_get_child_bus(dev, "i2c");
    i2c_create_slave(i2c, "sii9022", 0x39);
    sysbus_create_simple("pl031", map[VE_RTC], pic[4]); /* RTC */
--- a/hw/core/register.c
+++ b/hw/core/register.c
@ -159,13 +159,21 @@ uint64_t register_read(RegisterInfo *reg, uint64_t re, const char* prefix,
 /*
 * Reset @reg: store the reset value from its access info and then run the
 * post_write hook, if one is set, with that same value. A register that
 * has no backing data or no access info is left untouched.
 */
 void register_reset(RegisterInfo *reg)
 {
    const RegisterAccessInfo *ac;

    g_assert(reg);

    ac = reg->access;
    if (!reg->data || !ac) {
        return;
    }

    register_write_val(reg, ac->reset);
    if (ac->post_write) {
        ac->post_write(reg, ac->reset);
    }
 }
 void register_init(RegisterInfo *reg)
--- a/hw/display/Makefile.objs
+++ b/hw/display/Makefile.objs
@ -3,6 +3,7 @@ common-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o
 common-obj-$(CONFIG_G364FB) += g364fb.o
 common-obj-$(CONFIG_JAZZ_LED) += jazz_led.o
 common-obj-$(CONFIG_PL110) += pl110.o
 common-obj-$(CONFIG_SII9022) += sii9022.o
 common-obj-$(CONFIG_SSD0303) += ssd0303.o
 common-obj-$(CONFIG_SSD0323) += ssd0323.o
 common-obj-$(CONFIG_XEN) += xenfb.o
--- a/hw/display/sii9022.c
+++ b/hw/display/sii9022.c
@ -0,0 +1,191 @@
 /*
 * Silicon Image SiI9022
 *
 * This is a pretty hollow emulation: all we do is acknowledge that we
 * exist (chip ID) and confirm that we get switched over into DDC mode
 * so the emulated host can proceed to read out EDID data. All subsequent
 * set-up of connectors etc will be acknowledged and ignored.
 *
 * Copyright (C) 2018 Linus Walleij
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 * SPDX-License-Identifier: GPL-2.0-or-later
 */
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "hw/i2c/i2c.h"
 #include "hw/i2c/i2c-ddc.h"
 #include "trace.h"
 #define SII9022_SYS_CTRL_DATA 0x1a
 #define SII9022_SYS_CTRL_PWR_DWN 0x10
 #define SII9022_SYS_CTRL_AV_MUTE 0x08
 #define SII9022_SYS_CTRL_DDC_BUS_REQ 0x04
 #define SII9022_SYS_CTRL_DDC_BUS_GRTD 0x02
 #define SII9022_SYS_CTRL_OUTPUT_MODE 0x01
 #define SII9022_SYS_CTRL_OUTPUT_HDMI 1
 #define SII9022_SYS_CTRL_OUTPUT_DVI 0
 #define SII9022_REG_CHIPID 0x1b
 #define SII9022_INT_ENABLE 0x3c
 #define SII9022_INT_STATUS 0x3d
 /*
 * Interrupt status flag values. PLUGGED is what sii9022_rx() reports when
 * SII9022_INT_STATUS is read. The definitions deliberately carry no
 * trailing ';' so that they can be used inside expressions.
 */
 #define SII9022_INT_STATUS_HOTPLUG 0x01
 #define SII9022_INT_STATUS_PLUGGED 0x04
 #define TYPE_SII9022 "sii9022"
 #define SII9022(obj) OBJECT_CHECK(sii9022_state, (obj), TYPE_SII9022)
 /* Per-device state of the emulated SiI9022. */
 typedef struct sii9022_state {
    I2CSlave parent_obj;
    /* Current register pointer; loaded by the address byte, bumped per access */
    uint8_t ptr;
    /* True after I2C_START_SEND until the next transmitted byte is consumed */
    bool addr_byte;
    /* Set while the host has DDC_BUS_REQ written to SYS_CTRL_DATA */
    bool ddc_req;
    /* Set when switching to DDC; NOTE(review): not read in the visible code */
    bool ddc_skip_finish;
    /* Set once the host has written both DDC_BUS_REQ and DDC_BUS_GRTD */
    bool ddc;
 } sii9022_state;
 /*
 * State description installed as the device class's vmsd in
 * sii9022_class_init(); it lists every field of sii9022_state.
 */
 static const VMStateDescription vmstate_sii9022 = {
    .name = "sii9022",
    .version_id = 1,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_I2C_SLAVE(parent_obj, sii9022_state),
        VMSTATE_UINT8(ptr, sii9022_state),
        VMSTATE_BOOL(addr_byte, sii9022_state),
        VMSTATE_BOOL(ddc_req, sii9022_state),
        VMSTATE_BOOL(ddc_skip_finish, sii9022_state),
        VMSTATE_BOOL(ddc, sii9022_state),
        VMSTATE_END_OF_LIST()
    }
 };
 /*
 * I2C bus event hook. The start of a write transfer flags the next
 * transmitted byte as the register address; every other event is ignored.
 * Always returns 0.
 */
 static int sii9022_event(I2CSlave *i2c, enum i2c_event event)
 {
    sii9022_state *s = SII9022(i2c);

    if (event == I2C_START_SEND) {
        s->addr_byte = true;
    }
    return 0;
 }
 /*
 * I2C read hook: returns the value of the register selected by s->ptr and
 * then advances the pointer. Registers that are not modelled read as 0.
 */
 static int sii9022_rx(I2CSlave *i2c)
 {
    sii9022_state *s = SII9022(i2c);
    uint8_t val = 0x00;

    if (s->ptr == SII9022_SYS_CTRL_DATA) {
        if (s->ddc_req) {
            /* Acknowledge DDC bus request */
            val = SII9022_SYS_CTRL_DDC_BUS_GRTD | SII9022_SYS_CTRL_DDC_BUS_REQ;
        }
    } else if (s->ptr == SII9022_REG_CHIPID) {
        val = 0xb0;
    } else if (s->ptr == SII9022_INT_STATUS) {
        /* Something is cold-plugged in, no interrupts */
        val = SII9022_INT_STATUS_PLUGGED;
    }

    trace_sii9022_read_reg(s->ptr, val);
    s->ptr++;

    return val;
 }
 /*
 * I2C write hook. The first byte after a start condition selects the
 * register; later bytes are written to the selected register, after which
 * the pointer advances. Only SII9022_SYS_CTRL_DATA has an effect: it
 * tracks the DDC bus request/grant handshake. Always returns 0.
 */
 static int sii9022_tx(I2CSlave *i2c, uint8_t data)
 {
    sii9022_state *s = SII9022(i2c);

    if (s->addr_byte) {
        /* This byte is the register address, not register data */
        s->ptr = data;
        s->addr_byte = false;
        return 0;
    }

    if (s->ptr == SII9022_SYS_CTRL_DATA) {
        if (!(data & SII9022_SYS_CTRL_DDC_BUS_REQ)) {
            s->ddc_req = false;
            s->ddc = false;
            trace_sii9022_switch_mode("normal");
        } else {
            s->ddc_req = true;
            if (data & SII9022_SYS_CTRL_DDC_BUS_GRTD) {
                s->ddc = true;
                /* Skip this finish since we just switched to DDC */
                s->ddc_skip_finish = true;
                trace_sii9022_switch_mode("DDC");
            }
        }
    }

    trace_sii9022_write_reg(s->ptr, data);
    s->ptr++;

    return 0;
 }
 /* Device reset hook: clears the register pointer and every mode flag. */
 static void sii9022_reset(DeviceState *dev)
 {
    sii9022_state *s = SII9022(dev);

    /* DDC handshake state */
    s->ddc = false;
    s->ddc_req = false;
    s->ddc_skip_finish = false;

    /* Register addressing state */
    s->addr_byte = false;
    s->ptr = 0;
 }
 /*
 * Realize hook: creates a TYPE_I2CDDC slave at address 0x50 on the same
 * I2C bus this device is attached to.
 */
 static void sii9022_realize(DeviceState *dev, Error **errp)
 {
    I2CBus *bus = I2C_BUS(qdev_get_parent_bus(dev));

    i2c_create_slave(bus, TYPE_I2CDDC, 0x50);
 }
 /* Class init: installs the device hooks and the I2C slave callbacks. */
 static void sii9022_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
    I2CSlaveClass *k = I2C_SLAVE_CLASS(klass);

    /* Generic device hooks */
    dc->realize = sii9022_realize;
    dc->reset = sii9022_reset;
    dc->vmsd = &vmstate_sii9022;

    /* I2C slave callbacks */
    k->send = sii9022_tx;
    k->recv = sii9022_rx;
    k->event = sii9022_event;
 }
 /* Type description: an I2C slave whose instances are sii9022_state. */
 static const TypeInfo sii9022_info = {
    .name          = TYPE_SII9022,
    .parent        = TYPE_I2C_SLAVE,
    .instance_size = sizeof(sii9022_state),
    .class_init    = sii9022_class_init,
 };
 /* Registers sii9022_info; hooked up through type_init() below. */
 static void sii9022_register_types(void)
 {
    type_register_static(&sii9022_info);
 }
 type_init(sii9022_register_types)
--- a/hw/display/trace-events
+++ b/hw/display/trace-events
@ -132,3 +132,8 @@ vga_cirrus_read_io(uint32_t addr, uint32_t val) "addr 0x%x, val 0x%x"
 vga_cirrus_write_io(uint32_t addr, uint32_t val) "addr 0x%x, val 0x%x"
 vga_cirrus_read_blt(uint32_t offset, uint32_t val) "offset 0x%x, val 0x%x"
 vga_cirrus_write_blt(uint32_t offset, uint32_t val) "offset 0x%x, val 0x%x"
 # hw/display/sii9022.c
 sii9022_read_reg(uint8_t addr, uint8_t val) "addr 0x%02x, val 0x%02x"
 sii9022_write_reg(uint8_t addr, uint8_t val) "addr 0x%02x, val 0x%02x"
 sii9022_switch_mode(const char *mode) "mode: %s"
--- a/hw/i2c/core.c
+++ b/hw/i2c/core.c
@ -10,31 +10,13 @@
 #include "qemu/osdep.h"
 #include "hw/i2c/i2c.h"
 typedef struct I2CNode I2CNode;
 struct I2CNode {
    I2CSlave *elt;
    QLIST_ENTRY(I2CNode) next;
 };
 #define I2C_BROADCAST 0x00
 struct I2CBus
 {
    BusState qbus;
    QLIST_HEAD(, I2CNode) current_devs;
    uint8_t saved_address;
    bool broadcast;
 };
 static Property i2c_props[] = {
    DEFINE_PROP_UINT8("address", struct I2CSlave, address, 0),
    DEFINE_PROP_END_OF_LIST(),
 };
 #define TYPE_I2C_BUS "i2c-bus"
 #define I2C_BUS(obj) OBJECT_CHECK(I2CBus, (obj), TYPE_I2C_BUS)
 static const TypeInfo i2c_bus_info = {
    .name = TYPE_I2C_BUS,
    .parent = TYPE_BUS,
--- a/hw/i2c/i2c-ddc.c
+++ b/hw/i2c/i2c-ddc.c
@ -259,12 +259,12 @@ static int i2c_ddc_tx(I2CSlave *i2c, uint8_t data)
        s->reg = data;
        s->firstbyte = false;
        DPRINTF("[EDID] Written new pointer: %u\n", data);
-        return 1;
+        return 0;
    }
    /* Ignore all writes */
    s->reg++;
-    return 1;
+    return 0;
 }
 static void i2c_ddc_init(Object *obj)
--- a/hw/ssi/xilinx_spips.c
+++ b/hw/ssi/xilinx_spips.c
@ -223,7 +223,7 @@ static void xilinx_spips_update_cs(XilinxSPIPS *s, int field)
 {
    int i;
-    for (i = 0; i < s->num_cs; i++) {
+    for (i = 0; i < s->num_cs * s->num_busses; i++) {
        bool old_state = s->cs_lines_state[i];
        bool new_state = field & (1 << i);
@ -234,7 +234,7 @@ static void xilinx_spips_update_cs(XilinxSPIPS *s, int field)
        }
        qemu_set_irq(s->cs_lines[i], !new_state);
    }
-    if (!(field & ((1 << s->num_cs) - 1))) {
+    if (!(field & ((1 << (s->num_cs * s->num_busses)) - 1))) {
        s->snoop_state = SNOOP_CHECKING;
        s->cmd_dummies = 0;
        s->link_state = 1;
@ -248,7 +248,40 @@ static void xlnx_zynqmp_qspips_update_cs_lines(XlnxZynqMPQSPIPS *s)
 {
    if (s->regs[R_GQSPI_GF_SNAPSHOT]) {
        int field = ARRAY_FIELD_EX32(s->regs, GQSPI_GF_SNAPSHOT, CHIP_SELECT);
-        xilinx_spips_update_cs(XILINX_SPIPS(s), field);
+        bool upper_cs_sel = field & (1 << 1);
        bool lower_cs_sel = field & 1;
        bool bus0_enabled;
        bool bus1_enabled;
        uint8_t buses;
        int cs = 0;
        buses = ARRAY_FIELD_EX32(s->regs, GQSPI_GF_SNAPSHOT, DATA_BUS_SELECT);
        bus0_enabled = buses & 1;
        bus1_enabled = buses & (1 << 1);
        if (bus0_enabled && bus1_enabled) {
            if (lower_cs_sel) {
                cs |= 1;
            }
            if (upper_cs_sel) {
                cs |= 1 << 3;
            }
        } else if (bus0_enabled) {
            if (lower_cs_sel) {
                cs |= 1;
            }
            if (upper_cs_sel) {
                cs |= 1 << 1;
            }
        } else if (bus1_enabled) {
            if (lower_cs_sel) {
                cs |= 1 << 2;
            }
            if (upper_cs_sel) {
                cs |= 1 << 3;
            }
        }
        xilinx_spips_update_cs(XILINX_SPIPS(s), cs);
    }
 }
@ -260,7 +293,7 @@ static void xilinx_spips_update_cs_lines(XilinxSPIPS *s)
    if (num_effective_busses(s) == 2) {
        /* Single bit chip-select for qspi */
        field &= 0x1;
-        field |= field << 1;
+        field |= field << 3;
    /* Dual stack U-Page */
    } else if (s->regs[R_LQSPI_CFG] & LQSPI_CFG_TWO_MEM &&
               s->regs[R_LQSPI_STS] & LQSPI_CFG_U_PAGE) {
@ -544,7 +577,7 @@ static int xilinx_spips_num_dummies(XilinxQSPIPS *qs, uint8_t command)
        return 2;
    case QIOR:
    case QIOR_4:
-        return 5;
+        return 4;
    default:
        return -1;
    }
--- a/include/exec/helper-head.h
+++ b/include/exec/helper-head.h
@ -26,6 +26,7 @@
 #define dh_alias_int i32
 #define dh_alias_i64 i64
 #define dh_alias_s64 i64
 #define dh_alias_f16 i32
 #define dh_alias_f32 i32
 #define dh_alias_f64 i64
 #define dh_alias_ptr ptr
@ -38,6 +39,7 @@
 #define dh_ctype_int int
 #define dh_ctype_i64 uint64_t
 #define dh_ctype_s64 int64_t
 #define dh_ctype_f16 float16
 #define dh_ctype_f32 float32
 #define dh_ctype_f64 float64
 #define dh_ctype_ptr void *
@ -94,6 +96,7 @@
 #define dh_is_signed_s32 1
 #define dh_is_signed_i64 0
 #define dh_is_signed_s64 1
 #define dh_is_signed_f16 0
 #define dh_is_signed_f32 0
 #define dh_is_signed_f64 0
 #define dh_is_signed_tl  0
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@ -306,8 +306,11 @@ static inline float16 float16_set_sign(float16 a, int sign)
 }
 #define float16_zero make_float16(0)
 #define float16_one make_float16(0x3c00)
 #define float16_half make_float16(0x3800)
 #define float16_one make_float16(0x3c00)
 #define float16_one_point_five make_float16(0x3e00)
 #define float16_two make_float16(0x4000)
 #define float16_three make_float16(0x4200)
 #define float16_infinity make_float16(0x7c00)
 /*----------------------------------------------------------------------------
@ -415,11 +418,13 @@ static inline float32 float32_set_sign(float32 a, int sign)
 }
 #define float32_zero make_float32(0)
 #define float32_one make_float32(0x3f800000)
 #define float32_half make_float32(0x3f000000)
 #define float32_one make_float32(0x3f800000)
 #define float32_one_point_five make_float32(0x3fc00000)
 #define float32_two make_float32(0x40000000)
 #define float32_three make_float32(0x40400000)
 #define float32_infinity make_float32(0x7f800000)
 /*----------------------------------------------------------------------------
 | The pattern for a default generated single-precision NaN.
 *----------------------------------------------------------------------------*/
@ -526,9 +531,12 @@ static inline float64 float64_set_sign(float64 a, int sign)
 }
 #define float64_zero make_float64(0)
 #define float64_one make_float64(0x3ff0000000000000LL)
 #define float64_ln2 make_float64(0x3fe62e42fefa39efLL)
 #define float64_half make_float64(0x3fe0000000000000LL)
 #define float64_one make_float64(0x3ff0000000000000LL)
 #define float64_one_point_five make_float64(0x3FF8000000000000ULL)
 #define float64_two make_float64(0x4000000000000000ULL)
 #define float64_three make_float64(0x4008000000000000ULL)
 #define float64_ln2 make_float64(0x3fe62e42fefa39efLL)
 #define float64_infinity make_float64(0x7ff0000000000000LL)
 /*----------------------------------------------------------------------------
--- a/include/hw/i2c/i2c.h
+++ b/include/hw/i2c/i2c.h
@ -25,8 +25,7 @@ typedef struct I2CSlave I2CSlave;
 #define I2C_SLAVE_GET_CLASS(obj) \
     OBJECT_GET_CLASS(I2CSlaveClass, (obj), TYPE_I2C_SLAVE)
-typedef struct I2CSlaveClass
+typedef struct I2CSlaveClass {
 {
    DeviceClass parent_class;
    /* Callbacks provided by the device.  */
@ -50,14 +49,30 @@ typedef struct I2CSlaveClass
    int (*event)(I2CSlave *s, enum i2c_event event);
 } I2CSlaveClass;
-struct I2CSlave
+struct I2CSlave {
 {
    DeviceState qdev;
    /* Remaining fields for internal use by the I2C code.  */
    uint8_t address;
 };
 #define TYPE_I2C_BUS "i2c-bus"
 #define I2C_BUS(obj) OBJECT_CHECK(I2CBus, (obj), TYPE_I2C_BUS)
 typedef struct I2CNode I2CNode;
 struct I2CNode {
    I2CSlave *elt;
    QLIST_ENTRY(I2CNode) next;
 };
 struct I2CBus {
    BusState qbus;
    QLIST_HEAD(, I2CNode) current_devs;
    uint8_t saved_address;
    bool broadcast;
 };
 I2CBus *i2c_init_bus(DeviceState *parent, const char *name);
 void i2c_set_slave_address(I2CSlave *dev, uint8_t address);
 int i2c_bus_busy(I2CBus *bus);
--- a/include/hw/register.h
+++ b/include/hw/register.h
@ -34,7 +34,7 @@ typedef struct RegisterInfoArray RegisterInfoArray;
 * immediately before the actual write. The returned value is what is written,
 * giving the handler a chance to modify the written value.
 * @post_write: Post write callback. Passed the written value. Most write side
- * effects should be implemented here.
+ * effects should be implemented here. This is called during device reset.
 *
 * @post_read: Post read callback. Passes the value that is about to be returned
 * for a read. The return value from this function is what is ultimately read,
@ -135,8 +135,8 @@ uint64_t register_read(RegisterInfo *reg, uint64_t re, const char* prefix,
                       bool debug);
 /**
- * reset a register
+ * Resets a register. This will also call the post_write hook if it exists.
- * @reg: register to reset
+ * @reg: The register to reset.
 */
 void register_reset(RegisterInfo *reg);
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@ -551,6 +551,8 @@ static uint32_t get_elf_hwcap(void)
    GET_FEATURE(ARM_FEATURE_V8_SM3, ARM_HWCAP_A64_SM3);
    GET_FEATURE(ARM_FEATURE_V8_SM4, ARM_HWCAP_A64_SM4);
    GET_FEATURE(ARM_FEATURE_V8_SHA512, ARM_HWCAP_A64_SHA512);
    GET_FEATURE(ARM_FEATURE_V8_FP16,
                ARM_HWCAP_A64_FPHP | ARM_HWCAP_A64_ASIMDHP);
 #undef GET_FEATURE
    return hwcaps;
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@ -168,6 +168,7 @@ typedef struct {
 *  Qn = regs[n].d[1]:regs[n].d[0]
 *  Dn = regs[n].d[0]
 *  Sn = regs[n].d[0] bits 31..0
 *  Hn = regs[n].d[0] bits 15..0
 *
 * This corresponds to the architecturally defined mapping between
 * the two execution states, and means we do not need to explicitly
@ -537,19 +538,29 @@ typedef struct CPUARMState {
        /* scratch space when Tn are not sufficient.  */
        uint32_t scratch[8];
-        /* fp_status is the "normal" fp status. standard_fp_status retains
+        /* There are a number of distinct float control structures:
-         * values corresponding to the ARM "Standard FPSCR Value", ie
+         *
-         * default-NaN, flush-to-zero, round-to-nearest and is used by
+         *  fp_status: is the "normal" fp status.
-         * any operations (generally Neon) which the architecture defines
+         *  fp_status_f16: used for half-precision calculations
-         * as controlled by the standard FPSCR value rather than the FPSCR.
+         *  standard_fp_status : the ARM "Standard FPSCR Value"
         *
         * Half-precision operations are governed by a separate
         * flush-to-zero control bit in FPSCR:FZ16. We pass a separate
         * status structure to control this.
         *
         * The "Standard FPSCR Value", ie default-NaN, flush-to-zero and
         * round-to-nearest, is used by any operations (generally
         * Neon) which the architecture defines as controlled by the
         * standard FPSCR value rather than the FPSCR.
         *
         * To avoid having to transfer exception bits around, we simply
         * say that the FPSCR cumulative exception flags are the logical
-         * OR of the flags in the two fp statuses. This relies on the
+         * OR of the flags in the three fp statuses. This relies on the
         * only thing which needs to read the exception flags being
         * an explicit FPSCR read.
         */
        float_status fp_status;
        float_status fp_status_f16;
        float_status standard_fp_status;
        /* ZCR_EL[1-3] */
@ -1189,12 +1200,20 @@ static inline void xpsr_write(CPUARMState *env, uint32_t val, uint32_t mask)
 uint32_t vfp_get_fpscr(CPUARMState *env);
 void vfp_set_fpscr(CPUARMState *env, uint32_t val);
-/* For A64 the FPSCR is split into two logically distinct registers,
+/* FPCR, Floating Point Control Register
 * FPSR, Floating Point Status Register
 *
 * For A64 the FPSCR is split into two logically distinct registers,
 * FPCR and FPSR. However since they still use non-overlapping bits
 * we store the underlying state in fpscr and just mask on read/write.
 */
 #define FPSR_MASK 0xf800009f
 #define FPCR_MASK 0x07f79f00
 #define FPCR_FZ16   (1 << 19)   /* ARMv8.2+, FP16 flush-to-zero */
 #define FPCR_FZ     (1 << 24)   /* Flush-to-zero enable bit */
 #define FPCR_DN     (1 << 25)   /* Default NaN enable bit */
 static inline uint32_t vfp_get_fpsr(CPUARMState *env)
 {
    return vfp_get_fpscr(env) & FPSR_MASK;
@ -1408,6 +1427,7 @@ enum arm_features {
    ARM_FEATURE_V8_SHA3, /* implements SHA3 part of v8 Crypto Extensions */
    ARM_FEATURE_V8_SM3, /* implements SM3 part of v8 Crypto Extensions */
    ARM_FEATURE_V8_SM4, /* implements SM4 part of v8 Crypto Extensions */
    ARM_FEATURE_V8_FP16, /* implements v8.2 half-precision float */
 };
 static inline int arm_feature(CPUARMState *env, int feature)
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@ -230,6 +230,7 @@ static void aarch64_any_initfn(Object *obj)
    set_feature(&cpu->env, ARM_FEATURE_V8_SM4);
    set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
    set_feature(&cpu->env, ARM_FEATURE_CRC);
    set_feature(&cpu->env, ARM_FEATURE_V8_FP16);
    cpu->ctr = 0x80038003; /* 32 byte I and D cacheline size, VIPT icache */
    cpu->dcz_blocksize = 7; /*  512 bytes */
 }
--- a/target/arm/helper-a64.c
+++ b/target/arm/helper-a64.c
@ -192,6 +192,10 @@ uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
 * versions, these do a fully fused multiply-add or
 * multiply-add-and-halve.
 */
 #define float16_two make_float16(0x4000)
 #define float16_three make_float16(0x4200)
 #define float16_one_point_five make_float16(0x3e00)
 #define float32_two make_float32(0x40000000)
 #define float32_three make_float32(0x40400000)
 #define float32_one_point_five make_float32(0x3fc00000)
@ -200,6 +204,21 @@ uint64_t HELPER(neon_cgt_f64)(float64 a, float64 b, void *fpstp)
 #define float64_three make_float64(0x4008000000000000ULL)
 #define float64_one_point_five make_float64(0x3FF8000000000000ULL)
 /*
 * Half-precision reciprocal step: negates @a and returns a * b + 2.0 via
 * float16_muladd(). Input denormals are squashed first. If one operand is
 * an infinity and the other a zero, 2.0 is returned directly.
 */
 float16 HELPER(recpsf_f16)(float16 a, float16 b, void *fpstp)
 {
    float_status *fpst = fpstp;

    a = float16_chs(float16_squash_input_denormal(a, fpst));
    b = float16_squash_input_denormal(b, fpst);

    if (float16_is_infinity(a) && float16_is_zero(b)) {
        return float16_two;
    }
    if (float16_is_infinity(b) && float16_is_zero(a)) {
        return float16_two;
    }

    return float16_muladd(a, b, float16_two, 0, fpst);
 }
 float32 HELPER(recpsf_f32)(float32 a, float32 b, void *fpstp)
 {
    float_status *fpst = fpstp;
@ -230,6 +249,21 @@ float64 HELPER(recpsf_f64)(float64 a, float64 b, void *fpstp)
    return float64_muladd(a, b, float64_two, 0, fpst);
 }
 /*
 * Half-precision reciprocal square root step: negates @a and evaluates
 * a * b + 3.0 through float16_muladd() with the halve-result flag. Input
 * denormals are squashed first. If one operand is an infinity and the
 * other a zero, 1.5 is returned directly.
 */
 float16 HELPER(rsqrtsf_f16)(float16 a, float16 b, void *fpstp)
 {
    float_status *fpst = fpstp;

    a = float16_chs(float16_squash_input_denormal(a, fpst));
    b = float16_squash_input_denormal(b, fpst);

    if (float16_is_infinity(a) && float16_is_zero(b)) {
        return float16_one_point_five;
    }
    if (float16_is_infinity(b) && float16_is_zero(a)) {
        return float16_one_point_five;
    }

    return float16_muladd(a, b, float16_three, float_muladd_halve_result, fpst);
 }
 float32 HELPER(rsqrtsf_f32)(float32 a, float32 b, void *fpstp)
 {
    float_status *fpst = fpstp;
@ -322,6 +356,35 @@ uint64_t HELPER(neon_addlp_u16)(uint64_t a)
 }
 /* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */
 /*
 * Half-precision reciprocal exponent. NaN inputs are returned (silenced
 * with Invalid raised if signaling, or replaced by the default NaN in
 * default-NaN mode). Otherwise the result keeps the sign of @a, has a
 * zero fraction, and its 5-bit exponent field is the complement of the
 * input's exponent field, or 0x1e when that field is zero.
 */
 float16 HELPER(frecpx_f16)(float16 a, void *fpstp)
 {
    float_status *fpst = fpstp;
    uint16_t bits, sign;
    int16_t exp;

    if (float16_is_any_nan(a)) {
        float16 result = a;

        if (float16_is_signaling_nan(a, fpst)) {
            float_raise(float_flag_invalid, fpst);
            result = float16_maybe_silence_nan(a, fpst);
        }
        return fpst->default_nan_mode ? float16_default_nan(fpst) : result;
    }

    bits = float16_val(a);
    sign = bits & 0x8000;
    exp = extract32(bits, 10, 5);

    if (exp != 0) {
        return make_float16(deposit32(sign, 10, 5, ~exp));
    }
    return make_float16(deposit32(sign, 10, 5, 0x1e));
 }
 float32 HELPER(frecpx_f32)(float32 a, void *fpstp)
 {
    float_status *fpst = fpstp;
@ -572,3 +635,209 @@ uint64_t HELPER(paired_cmpxchg64_be_parallel)(CPUARMState *env, uint64_t addr,
 {
    return do_paired_cmpxchg64_be(env, addr, new_lo, new_hi, true, GETPC());
 }
 /*
 * AdvSIMD half-precision
 */
 /* Builds the helper name advsimd_<name><suffix> and wraps it in HELPER(). */
 #define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
 /*
 * Defines the helper advsimd_<name>h: a two-operand half-precision
 * operation that simply forwards to float16_<name>(a, b, fpst).
 */
 #define ADVSIMD_HALFOP(name) \
 float16 ADVSIMD_HELPER(name, h)(float16 a, float16 b, void *fpstp) \
 { \
    float_status *fpst = fpstp; \
    return float16_ ## name(a, b, fpst);    \
 }
 /* One scalar helper per float16 binary operation */
 ADVSIMD_HALFOP(add)
 ADVSIMD_HALFOP(sub)
 ADVSIMD_HALFOP(mul)
 ADVSIMD_HALFOP(div)
 ADVSIMD_HALFOP(min)
 ADVSIMD_HALFOP(max)
 ADVSIMD_HALFOP(minnum)
 ADVSIMD_HALFOP(maxnum)
 /*
 * Defines the helper advsimd_<name>2h, which applies float16_<name> to two
 * half-precision lanes packed in each 32-bit operand: bits 15..0 form the
 * first lane and bits 31..16 the second. The two results are packed back
 * the same way (first result in the low half, second deposited at bit 16).
 */
 #define ADVSIMD_TWOHALFOP(name)                                         \
 uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \
 { \
    float16  a1, a2, b1, b2;                        \
    uint32_t r1, r2;                                \
    float_status *fpst = fpstp;                     \
    a1 = extract32(two_a, 0, 16);                   \
    a2 = extract32(two_a, 16, 16);                  \
    b1 = extract32(two_b, 0, 16);                   \
    b2 = extract32(two_b, 16, 16);                  \
    r1 = float16_ ## name(a1, b1, fpst);            \
    r2 = float16_ ## name(a2, b2, fpst);            \
    return deposit32(r1, 16, 16, r2);               \
 }
 /* One paired helper per float16 binary operation */
 ADVSIMD_TWOHALFOP(add)
 ADVSIMD_TWOHALFOP(sub)
 ADVSIMD_TWOHALFOP(mul)
 ADVSIMD_TWOHALFOP(div)
 ADVSIMD_TWOHALFOP(min)
 ADVSIMD_TWOHALFOP(max)
 ADVSIMD_TWOHALFOP(minnum)
 ADVSIMD_TWOHALFOP(maxnum)
 /* Data processing - scalar floating-point and advanced SIMD */
 /*
 * Half-precision multiply with a special case: after squashing input
 * denormals, zero times infinity (in either order) yields 2.0 carrying
 * the XOR of the operand signs; everything else is float16_mul().
 */
 static float16 float16_mulx(float16 a, float16 b, void *fpstp)
 {
    float_status *fpst = fpstp;

    a = float16_squash_input_denormal(a, fpst);
    b = float16_squash_input_denormal(b, fpst);

    if ((float16_is_infinity(a) && float16_is_zero(b)) ||
        (float16_is_zero(a) && float16_is_infinity(b))) {
        /* 2.0 with the sign bit set to sign(A) XOR sign(B) */
        return make_float16(((float16_val(a) ^ float16_val(b)) & (1U << 15)) |
                            (1U << 14));
    }

    return float16_mul(a, b, fpst);
 }
 ADVSIMD_HALFOP(mulx)
 ADVSIMD_TWOHALFOP(mulx)
 /* fused multiply-accumulate */
 /* Scalar half-precision multiply-add: float16_muladd(a, b, c) with no flags. */
 float16 HELPER(advsimd_muladdh)(float16 a, float16 b, float16 c, void *fpstp)
 {
    return float16_muladd(a, b, c, 0, (float_status *)fpstp);
 }
 /*
 * Paired half-precision multiply-add. Each 32-bit operand carries two
 * lanes (bits 15..0 and bits 31..16); float16_muladd() is applied per
 * lane and the two results are packed back in the same layout.
 */
 uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b,
                                   uint32_t two_c, void *fpstp)
 {
    float_status *fpst = fpstp;
    float16 lo_a = extract32(two_a, 0, 16);
    float16 hi_a = extract32(two_a, 16, 16);
    float16 lo_b = extract32(two_b, 0, 16);
    float16 hi_b = extract32(two_b, 16, 16);
    float16 lo_c = extract32(two_c, 0, 16);
    float16 hi_c = extract32(two_c, 16, 16);
    uint32_t lo_res = float16_muladd(lo_a, lo_b, lo_c, 0, fpst);
    uint32_t hi_res = float16_muladd(hi_a, hi_b, hi_c, 0, fpst);

    return deposit32(lo_res, 16, 16, hi_res);
 }
 /*
 * Floating point comparisons produce an integer result. Softfloat
 * routines return float_relation types which we convert to the 0/-1
 * Neon requires (for a 16-bit element the "-1" pattern is 0xffff).
 *
 * The expansion is fully parenthesized so that the macro keeps its
 * meaning when it is used inside a larger expression.
 */
 #define ADVSIMD_CMPRES(test) ((test) ? 0xffff : 0)
 /* Quiet compare: 0xffff when a equals b, otherwise 0. */
 uint32_t HELPER(advsimd_ceq_f16)(float16 a, float16 b, void *fpstp)
 {
    float_status *fpst = fpstp;
    int rel;

    rel = float16_compare_quiet(a, b, fpst);

    return ADVSIMD_CMPRES(rel == float_relation_equal);
 }
 /* 0xffff when a is greater than or equal to b, otherwise 0. */
 uint32_t HELPER(advsimd_cge_f16)(float16 a, float16 b, void *fpstp)
 {
    float_status *fpst = fpstp;
    int rel;

    rel = float16_compare(a, b, fpst);

    return ADVSIMD_CMPRES(rel == float_relation_equal ||
                          rel == float_relation_greater);
 }
 /* 0xffff when a is strictly greater than b, otherwise 0. */
 uint32_t HELPER(advsimd_cgt_f16)(float16 a, float16 b, void *fpstp)
 {
    float_status *fpst = fpstp;
    int rel;

    rel = float16_compare(a, b, fpst);

    return ADVSIMD_CMPRES(rel == float_relation_greater);
 }
 /* Absolute compare: 0xffff when |a| is greater than or equal to |b|, else 0. */
 uint32_t HELPER(advsimd_acge_f16)(float16 a, float16 b, void *fpstp)
 {
    float_status *fpst = fpstp;
    int rel = float16_compare(float16_abs(a), float16_abs(b), fpst);

    return ADVSIMD_CMPRES(rel == float_relation_equal ||
                          rel == float_relation_greater);
 }
 /* Absolute compare: 0xffff when |a| is strictly greater than |b|, else 0. */
 uint32_t HELPER(advsimd_acgt_f16)(float16 a, float16 b, void *fpstp)
 {
    float_status *fpst = fpstp;
    int rel = float16_compare(float16_abs(a), float16_abs(b), fpst);

    return ADVSIMD_CMPRES(rel == float_relation_greater);
 }
 /* round to integral */
 /* Round to integral, leaving whatever exception flags the rounding sets. */
 float16 HELPER(advsimd_rinth_exact)(float16 x, void *fp_status)
 {
    float16 rounded = float16_round_to_int(x, fp_status);

    return rounded;
 }
 /*
 * Round to integral without reporting Inexact: if the inexact flag was
 * clear before the rounding, it is cleared again afterwards.
 */
 float16 HELPER(advsimd_rinth)(float16 x, void *fp_status)
 {
    int flags_before = get_float_exception_flags(fp_status);
    float16 rounded = float16_round_to_int(x, fp_status);

    if (flags_before & float_flag_inexact) {
        /* Inexact was already pending; nothing to suppress */
        return rounded;
    }

    /* Suppress any inexact exceptions the conversion produced */
    set_float_exception_flags(get_float_exception_flags(fp_status) &
                              ~float_flag_inexact, fp_status);
    return rounded;
 }
 /*
 * Half-precision floating point conversion functions
 *
 * There are a multitude of conversion functions with various
 * different rounding modes. This is dealt with by the calling code
 * setting the mode appropriately before calling the helper.
 */
 /*
 * Half-precision to signed 16-bit integer. A NaN input raises Invalid and
 * gives 0; anything else goes through float16_to_int16().
 */
 uint32_t HELPER(advsimd_f16tosinth)(float16 a, void *fpstp)
 {
    float_status *fpst = fpstp;

    if (!float16_is_any_nan(a)) {
        return float16_to_int16(a, fpst);
    }

    /* Invalid if we are passed a NaN */
    float_raise(float_flag_invalid, fpst);
    return 0;
 }
 /*
 * Half-precision to unsigned 16-bit integer. A NaN input raises Invalid
 * and gives 0; anything else goes through float16_to_uint16().
 */
 uint32_t HELPER(advsimd_f16touinth)(float16 a, void *fpstp)
 {
    float_status *fpst = fpstp;

    if (!float16_is_any_nan(a)) {
        return float16_to_uint16(a, fpst);
    }

    /* Invalid if we are passed a NaN */
    float_raise(float_flag_invalid, fpst);
    return 0;
 }
 /*
 * Square Root and Reciprocal square root
 */
 /* Half-precision square root; thin wrapper around float16_sqrt(). */
 float16 HELPER(sqrt_f16)(float16 a, void *fpstp)
 {
    return float16_sqrt(a, (float_status *)fpstp);
 }
--- a/target/arm/helper-a64.h
+++ b/target/arm/helper-a64.h
@ -29,8 +29,10 @@ DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
 DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
 DEF_HELPER_FLAGS_3(neon_cge_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
 DEF_HELPER_FLAGS_3(neon_cgt_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr)
 DEF_HELPER_FLAGS_3(recpsf_f16, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
 DEF_HELPER_FLAGS_3(recpsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
 DEF_HELPER_FLAGS_3(recpsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
 DEF_HELPER_FLAGS_3(rsqrtsf_f16, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
 DEF_HELPER_FLAGS_3(rsqrtsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr)
 DEF_HELPER_FLAGS_3(rsqrtsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr)
 DEF_HELPER_FLAGS_1(neon_addlp_s8, TCG_CALL_NO_RWG_SE, i64, i64)
@ -39,6 +41,7 @@ DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(neon_addlp_u16, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(frecpx_f16, TCG_CALL_NO_RWG, f16, f16, ptr)
 DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env)
 DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
 DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
@ -48,3 +51,33 @@ DEF_HELPER_FLAGS_4(paired_cmpxchg64_le_parallel, TCG_CALL_NO_WG,
 DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
 DEF_HELPER_FLAGS_4(paired_cmpxchg64_be_parallel, TCG_CALL_NO_WG,
                   i64, env, i64, i64, i64)
 /* Half-precision max/min family: two f16 operands plus a ptr, f16 result */
 DEF_HELPER_FLAGS_3(advsimd_maxh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
 DEF_HELPER_FLAGS_3(advsimd_minh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
 DEF_HELPER_FLAGS_3(advsimd_maxnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
 DEF_HELPER_FLAGS_3(advsimd_minnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
 /* Half-precision arithmetic: (f16, f16, ptr) -> f16 */
 DEF_HELPER_3(advsimd_addh, f16, f16, f16, ptr)
 DEF_HELPER_3(advsimd_subh, f16, f16, f16, ptr)
 DEF_HELPER_3(advsimd_mulh, f16, f16, f16, ptr)
 DEF_HELPER_3(advsimd_divh, f16, f16, f16, ptr)
 /* Half-precision comparisons: (f16, f16, ptr) -> i32 */
 DEF_HELPER_3(advsimd_ceq_f16, i32, f16, f16, ptr)
 DEF_HELPER_3(advsimd_cge_f16, i32, f16, f16, ptr)
 DEF_HELPER_3(advsimd_cgt_f16, i32, f16, f16, ptr)
 DEF_HELPER_3(advsimd_acge_f16, i32, f16, f16, ptr)
 DEF_HELPER_3(advsimd_acgt_f16, i32, f16, f16, ptr)
 /* Multiply-extended and three-operand multiply-add on f16 values */
 DEF_HELPER_3(advsimd_mulxh, f16, f16, f16, ptr)
 DEF_HELPER_4(advsimd_muladdh, f16, f16, f16, f16, ptr)
 /*
 * "2h" variants: same operations with i32 operands and results.
 * NOTE(review): presumably each i32 carries two packed f16 lanes - confirm
 * against the definitions.
 */
 DEF_HELPER_3(advsimd_add2h, i32, i32, i32, ptr)
 DEF_HELPER_3(advsimd_sub2h, i32, i32, i32, ptr)
 DEF_HELPER_3(advsimd_mul2h, i32, i32, i32, ptr)
 DEF_HELPER_3(advsimd_div2h, i32, i32, i32, ptr)
 DEF_HELPER_3(advsimd_max2h, i32, i32, i32, ptr)
 DEF_HELPER_3(advsimd_min2h, i32, i32, i32, ptr)
 DEF_HELPER_3(advsimd_maxnum2h, i32, i32, i32, ptr)
 DEF_HELPER_3(advsimd_minnum2h, i32, i32, i32, ptr)
 DEF_HELPER_3(advsimd_mulx2h, i32, i32, i32, ptr)
 DEF_HELPER_4(advsimd_muladd2h, i32, i32, i32, i32, ptr)
 /* Round-to-integral, f16 -> 16-bit integer conversions, and square root */
 DEF_HELPER_2(advsimd_rinth_exact, f16, f16, ptr)
 DEF_HELPER_2(advsimd_rinth, f16, f16, ptr)
 DEF_HELPER_2(advsimd_f16tosinth, i32, f16, ptr)
 DEF_HELPER_2(advsimd_f16touinth, i32, f16, ptr)
 DEF_HELPER_2(sqrt_f16, f16, f16, ptr)
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@ -11103,6 +11103,7 @@ uint32_t HELPER(vfp_get_fpscr)(CPUARMState *env)
            | (env->vfp.vec_stride << 20);
    i = get_float_exception_flags(&env->vfp.fp_status);
    i |= get_float_exception_flags(&env->vfp.standard_fp_status);
    i |= get_float_exception_flags(&env->vfp.fp_status_f16);
    fpscr |= vfp_exceptbits_from_host(i);
    return fpscr;
 }
@ -11160,16 +11161,31 @@ void HELPER(vfp_set_fpscr)(CPUARMState *env, uint32_t val)
            break;
        }
        set_float_rounding_mode(i, &env->vfp.fp_status);
        set_float_rounding_mode(i, &env->vfp.fp_status_f16);
    }
-    if (changed & (1 << 24)) {
+    if (changed & FPCR_FZ16) {
-        set_flush_to_zero((val & (1 << 24)) != 0, &env->vfp.fp_status);
+        bool ftz_enabled = val & FPCR_FZ16;
-        set_flush_inputs_to_zero((val & (1 << 24)) != 0, &env->vfp.fp_status);
+        set_flush_to_zero(ftz_enabled, &env->vfp.fp_status_f16);
        set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status_f16);
    }
    if (changed & FPCR_FZ) {
        bool ftz_enabled = val & FPCR_FZ;
        set_flush_to_zero(ftz_enabled, &env->vfp.fp_status);
        set_flush_inputs_to_zero(ftz_enabled, &env->vfp.fp_status);
    }
    if (changed & FPCR_DN) {
        bool dnan_enabled = val & FPCR_DN;
        set_default_nan_mode(dnan_enabled, &env->vfp.fp_status);
        set_default_nan_mode(dnan_enabled, &env->vfp.fp_status_f16);
    }
    if (changed & (1 << 25))
        set_default_nan_mode((val & (1 << 25)) != 0, &env->vfp.fp_status);
    /* The exception flags are ORed together when we read fpscr so we
     * only need to preserve the current state in one of our
     * float_status values.
     */
    i = vfp_exceptbits_to_host(val);
    set_float_exception_flags(i, &env->vfp.fp_status);
    set_float_exception_flags(0, &env->vfp.fp_status_f16);
    set_float_exception_flags(0, &env->vfp.standard_fp_status);
 }
@ -11286,8 +11302,10 @@ CONV_ITOF(vfp_##name##to##p, fsz, sign) \
 CONV_FTOI(vfp_to##name##p, fsz, sign, ) \
 CONV_FTOI(vfp_to##name##z##p, fsz, sign, _round_to_zero)
 FLOAT_CONVS(si, h, 16, )
 FLOAT_CONVS(si, s, 32, )
 FLOAT_CONVS(si, d, 64, )
 FLOAT_CONVS(ui, h, 16, u)
 FLOAT_CONVS(ui, s, 32, u)
 FLOAT_CONVS(ui, d, 64, u)
@ -11370,6 +11388,8 @@ VFP_CONV_FIX_A64(sq, s, 32, 64, int64)
 VFP_CONV_FIX(uh, s, 32, 32, uint16)
 VFP_CONV_FIX(ul, s, 32, 32, uint32)
 VFP_CONV_FIX_A64(uq, s, 32, 64, uint64)
 VFP_CONV_FIX_A64(sl, h, 16, 32, int32)
 VFP_CONV_FIX_A64(ul, h, 16, 32, uint32)
 #undef VFP_CONV_FIX
 #undef VFP_CONV_FIX_FLOAT
 #undef VFP_CONV_FLOAT_FIX_ROUND
@ -11377,9 +11397,9 @@ VFP_CONV_FIX_A64(uq, s, 32, 64, uint64)
 /* Set the current fp rounding mode and return the old one.
 * The argument is a softfloat float_round_ value.
 */
-uint32_t HELPER(set_rmode)(uint32_t rmode, CPUARMState *env)
+uint32_t HELPER(set_rmode)(uint32_t rmode, void *fpstp)
 {
-    float_status *fp_status = &env->vfp.fp_status;
+    float_status *fp_status = fpstp;
    uint32_t prev_rmode = get_float_rounding_mode(fp_status);
    set_float_rounding_mode(rmode, fp_status);
@ -11503,80 +11523,75 @@ float32 HELPER(rsqrts_f32)(float32 a, float32 b, CPUARMState *env)
 * int->float conversions at run-time.  */
 #define float64_256 make_float64(0x4070000000000000LL)
 #define float64_512 make_float64(0x4080000000000000LL)
 #define float16_maxnorm make_float16(0x7bff)
 #define float32_maxnorm make_float32(0x7f7fffff)
 #define float64_maxnorm make_float64(0x7fefffffffffffffLL)
 /* Reciprocal functions
 *
 * The algorithm that must be used to calculate the estimate
- * is specified by the ARM ARM, see FPRecipEstimate()
+ * is specified by the ARM ARM, see FPRecipEstimate()/RecipEstimate
 */
-static float64 recip_estimate(float64 a, float_status *real_fp_status)
+/* See RecipEstimate()
 *
 * input is a 9 bit fixed point number
 * input range 256 .. 511 for a number from 0.5 <= x < 1.0.
 * result range 256 .. 511 for a number from 1.0 to 511/256.
 */
 static int recip_estimate(int input)
 {
-    /* These calculations mustn't set any fp exception flags,
+    int a, b, r;
-     * so we use a local copy of the fp_status.
+    assert(256 <= input && input < 512);
-     */
+    a = (input * 2) + 1;
-    float_status dummy_status = *real_fp_status;
+    b = (1 << 19) / a;
-    float_status *s = &dummy_status;
+    r = (b + 1) >> 1;
-    /* q = (int)(a * 512.0) */
+    assert(256 <= r && r < 512);
-    float64 q = float64_mul(float64_512, a, s);
+    return r;
    int64_t q_int = float64_to_int64_round_to_zero(q, s);
    /* r = 1.0 / (((double)q + 0.5) / 512.0) */
    q = int64_to_float64(q_int, s);
    q = float64_add(q, float64_half, s);
    q = float64_div(q, float64_512, s);
    q = float64_div(float64_one, q, s);
    /* s = (int)(256.0 * r + 0.5) */
    q = float64_mul(q, float64_256, s);
    q = float64_add(q, float64_half, s);
    q_int = float64_to_int64_round_to_zero(q, s);
    /* return (double)s / 256.0 */
    return float64_div(int64_to_float64(q_int, s), float64_256, s);
 }
-/* Common wrapper to call recip_estimate */
+/*
-static float64 call_recip_estimate(float64 num, int off, float_status *fpst)
+ * Common wrapper to call recip_estimate
-{
+ *
-    uint64_t val64 = float64_val(num);
+ * The parameters are exponent and 64 bit fraction (without implicit
-    uint64_t frac = extract64(val64, 0, 52);
+ * bit) where the binary point is nominally at bit 52. Returns a
-    int64_t exp = extract64(val64, 52, 11);
+ * float64 which can then be rounded to the appropriate size by the
-    uint64_t sbit;
+ * caller.
-    float64 scaled, estimate;
+ */
-    /* Generate the scaled number for the estimate function */
+static uint64_t call_recip_estimate(int *exp, int exp_off, uint64_t frac)
-    if (exp == 0) {
+{
    uint32_t scaled, estimate;
    uint64_t result_frac;
    int result_exp;
    /* Handle sub-normals */
    if (*exp == 0) {
        if (extract64(frac, 51, 1) == 0) {
-            exp = -1;
+            *exp = -1;
-            frac = extract64(frac, 0, 50) << 2;
+            frac <<= 2;
        } else {
-            frac = extract64(frac, 0, 51) << 1;
+            frac <<= 1;
        }
    }
-    /* scaled = '0' : '01111111110' : fraction<51:44> : Zeros(44); */
+    /* scaled = UInt('1':fraction<51:44>) */
-    scaled = make_float64((0x3feULL << 52)
+    scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
-                          | extract64(frac, 44, 8) << 44);
+    estimate = recip_estimate(scaled);
-    estimate = recip_estimate(scaled, fpst);
+    result_exp = exp_off - *exp;
-
+    result_frac = deposit64(0, 44, 8, estimate);
-    /* Build new result */
+    if (result_exp == 0) {
-    val64 = float64_val(estimate);
+        result_frac = deposit64(result_frac >> 1, 51, 1, 1);
-    sbit = 0x8000000000000000ULL & val64;
+    } else if (result_exp == -1) {
-    exp = off - exp;
+        result_frac = deposit64(result_frac >> 2, 50, 2, 1);
-    frac = extract64(val64, 0, 52);
+        result_exp = 0;
    if (exp == 0) {
        frac = 1ULL << 51 | extract64(frac, 1, 51);
    } else if (exp == -1) {
        frac = 1ULL << 50 | extract64(frac, 2, 50);
        exp = 0;
    }
-    return make_float64(sbit | (exp << 52) | frac);
+    *exp = result_exp;
    return result_frac;
 }
 static bool round_to_inf(float_status *fpst, bool sign_bit)
@ -11595,18 +11610,63 @@ static bool round_to_inf(float_status *fpst, bool sign_bit)
    g_assert_not_reached();
 }
 /*
 * Half-precision reciprocal estimate.
 *
 * The input is first passed through float16_squash_input_denormal().
 * NaNs, infinities, zeros, tiny inputs whose reciprocal overflows, and
 * results that are flushed to zero are answered directly; every other
 * value is handed to call_recip_estimate() and the exponent/fraction it
 * produces are packed back into a float16.
 */
 float16 HELPER(recpe_f16)(float16 input, void *fpstp)
 {
    float_status *fpst = fpstp;
    float16 f16 = float16_squash_input_denormal(input, fpst);
    uint32_t bits = float16_val(f16);
    uint32_t sign = float16_is_neg(f16);
    int exp = extract32(bits, 10, 5);
    uint32_t frac = extract32(bits, 0, 10);
    uint64_t est_frac;
    if (float16_is_any_nan(f16)) {
        float16 nan = f16;
        if (float16_is_signaling_nan(f16, fpst)) {
            float_raise(float_flag_invalid, fpst);
            nan = float16_maybe_silence_nan(f16, fpst);
        }
        /* Default-NaN mode overrides whatever NaN we were given */
        return fpst->default_nan_mode ? float16_default_nan(fpst) : nan;
    }
    if (float16_is_infinity(f16)) {
        return float16_set_sign(float16_zero, sign);
    }
    if (float16_is_zero(f16)) {
        float_raise(float_flag_divbyzero, fpst);
        return float16_set_sign(float16_infinity, sign);
    }
    if (float16_abs(f16) < (1 << 8)) {
        /* Abs(value) < 2.0^-16 */
        float_raise(float_flag_overflow | float_flag_inexact, fpst);
        if (round_to_inf(fpst, sign)) {
            return float16_set_sign(float16_infinity, sign);
        }
        return float16_set_sign(float16_maxnorm, sign);
    }
    if (exp >= 29 && fpst->flush_to_zero) {
        float_raise(float_flag_underflow, fpst);
        return float16_set_sign(float16_zero, sign);
    }
    /* Move the 10-bit fraction up so that its top bit sits at bit 51 */
    est_frac = call_recip_estimate(&exp, 29, ((uint64_t) frac) << (52 - 10));
    /* result = sign : result_exp<4:0> : fraction<51:42> */
    bits = deposit32(0, 15, 1, sign);
    bits = deposit32(bits, 10, 5, exp);
    bits = deposit32(bits, 0, 10, extract64(est_frac, 52 - 10, 10));
    return make_float16(bits);
 }
 float32 HELPER(recpe_f32)(float32 input, void *fpstp)
 {
    float_status *fpst = fpstp;
    float32 f32 = float32_squash_input_denormal(input, fpst);
    uint32_t f32_val = float32_val(f32);
-    uint32_t f32_sbit = 0x80000000ULL & f32_val;
+    bool f32_sign = float32_is_neg(f32);
-    int32_t f32_exp = extract32(f32_val, 23, 8);
+    int f32_exp = extract32(f32_val, 23, 8);
    uint32_t f32_frac = extract32(f32_val, 0, 23);
-    float64 f64, r64;
+    uint64_t f64_frac;
    uint64_t r64_val;
    int64_t r64_exp;
    uint64_t r64_frac;
    if (float32_is_any_nan(f32)) {
        float32 nan = f32;
@ -11623,30 +11683,27 @@ float32 HELPER(recpe_f32)(float32 input, void *fpstp)
    } else if (float32_is_zero(f32)) {
        float_raise(float_flag_divbyzero, fpst);
        return float32_set_sign(float32_infinity, float32_is_neg(f32));
-    } else if ((f32_val & ~(1ULL << 31)) < (1ULL << 21)) {
+    } else if (float32_abs(f32) < (1ULL << 21)) {
        /* Abs(value) < 2.0^-128 */
        float_raise(float_flag_overflow | float_flag_inexact, fpst);
-        if (round_to_inf(fpst, f32_sbit)) {
+        if (round_to_inf(fpst, f32_sign)) {
-            return float32_set_sign(float32_infinity, float32_is_neg(f32));
+            return float32_set_sign(float32_infinity, f32_sign);
        } else {
-            return float32_set_sign(float32_maxnorm, float32_is_neg(f32));
+            return float32_set_sign(float32_maxnorm, f32_sign);
        }
    } else if (f32_exp >= 253 && fpst->flush_to_zero) {
        float_raise(float_flag_underflow, fpst);
        return float32_set_sign(float32_zero, float32_is_neg(f32));
    }
    f64_frac = call_recip_estimate(&f32_exp, 253,
                                   ((uint64_t) f32_frac) << (52 - 23));
-    f64 = make_float64(((int64_t)(f32_exp) << 52) | (int64_t)(f32_frac) << 29);
+    /* result = sign : result_exp<7:0> : fraction<51:29> */
-    r64 = call_recip_estimate(f64, 253, fpst);
+    f32_val = deposit32(0, 31, 1, f32_sign);
-    r64_val = float64_val(r64);
+    f32_val = deposit32(f32_val, 23, 8, f32_exp);
-    r64_exp = extract64(r64_val, 52, 11);
+    f32_val = deposit32(f32_val, 0, 23, extract64(f64_frac, 52 - 23, 23));
-    r64_frac = extract64(r64_val, 0, 52);
+    return make_float32(f32_val);
    /* result = sign : result_exp<7:0> : fraction<51:29>; */
    return make_float32(f32_sbit |
                        (r64_exp & 0xff) << 23 |
                        extract64(r64_frac, 29, 24));
 }
 float64 HELPER(recpe_f64)(float64 input, void *fpstp)
@ -11654,12 +11711,9 @@ float64 HELPER(recpe_f64)(float64 input, void *fpstp)
    float_status *fpst = fpstp;
    float64 f64 = float64_squash_input_denormal(input, fpst);
    uint64_t f64_val = float64_val(f64);
-    uint64_t f64_sbit = 0x8000000000000000ULL & f64_val;
+    bool f64_sign = float64_is_neg(f64);
-    int64_t f64_exp = extract64(f64_val, 52, 11);
+    int f64_exp = extract64(f64_val, 52, 11);
-    float64 r64;
+    uint64_t f64_frac = extract64(f64_val, 0, 52);
    uint64_t r64_val;
    int64_t r64_exp;
    uint64_t r64_frac;
    /* Deal with any special cases */
    if (float64_is_any_nan(f64)) {
@ -11680,80 +11734,119 @@ float64 HELPER(recpe_f64)(float64 input, void *fpstp)
    } else if ((f64_val & ~(1ULL << 63)) < (1ULL << 50)) {
        /* Abs(value) < 2.0^-1024 */
        float_raise(float_flag_overflow | float_flag_inexact, fpst);
-        if (round_to_inf(fpst, f64_sbit)) {
+        if (round_to_inf(fpst, f64_sign)) {
-            return float64_set_sign(float64_infinity, float64_is_neg(f64));
+            return float64_set_sign(float64_infinity, f64_sign);
        } else {
-            return float64_set_sign(float64_maxnorm, float64_is_neg(f64));
+            return float64_set_sign(float64_maxnorm, f64_sign);
        }
    } else if (f64_exp >= 2045 && fpst->flush_to_zero) {
        float_raise(float_flag_underflow, fpst);
        return float64_set_sign(float64_zero, float64_is_neg(f64));
    }
-    r64 = call_recip_estimate(f64, 2045, fpst);
+    f64_frac = call_recip_estimate(&f64_exp, 2045, f64_frac);
    r64_val = float64_val(r64);
    r64_exp = extract64(r64_val, 52, 11);
    r64_frac = extract64(r64_val, 0, 52);
-    /* result = sign : result_exp<10:0> : fraction<51:0> */
+    /* result = sign : result_exp<10:0> : fraction<51:0>; */
-    return make_float64(f64_sbit |
+    f64_val = deposit64(0, 63, 1, f64_sign);
-                        ((r64_exp & 0x7ff) << 52) |
+    f64_val = deposit64(f64_val, 52, 11, f64_exp);
-                        r64_frac);
+    f64_val = deposit64(f64_val, 0, 52, f64_frac);
    return make_float64(f64_val);
 }
 /* The algorithm that must be used to calculate the estimate
 * is specified by the ARM ARM.
 */
-static float64 recip_sqrt_estimate(float64 a, float_status *real_fp_status)
+
 static int do_recip_sqrt_estimate(int a)
 {
-    /* These calculations mustn't set any fp exception flags,
+    int b, estimate;
     * so we use a local copy of the fp_status.
     */
    float_status dummy_status = *real_fp_status;
    float_status *s = &dummy_status;
    float64 q;
    int64_t q_int;
-    if (float64_lt(a, float64_half, s)) {
+    assert(128 <= a && a < 512);
-        /* range 0.25 <= a < 0.5 */
+    if (a < 256) {
-
+        a = a * 2 + 1;
        /* a in units of 1/512 rounded down */
        /* q0 = (int)(a * 512.0);  */
        q = float64_mul(float64_512, a, s);
        q_int = float64_to_int64_round_to_zero(q, s);
        /* reciprocal root r */
        /* r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);  */
        q = int64_to_float64(q_int, s);
        q = float64_add(q, float64_half, s);
        q = float64_div(q, float64_512, s);
        q = float64_sqrt(q, s);
        q = float64_div(float64_one, q, s);
    } else {
-        /* range 0.5 <= a < 1.0 */
+        a = (a >> 1) << 1;
-
+        a = (a + 1) * 2;
        /* a in units of 1/256 rounded down */
        /* q1 = (int)(a * 256.0); */
        q = float64_mul(float64_256, a, s);
        int64_t q_int = float64_to_int64_round_to_zero(q, s);
        /* reciprocal root r */
        /* r = 1.0 /sqrt(((double)q1 + 0.5) / 256); */
        q = int64_to_float64(q_int, s);
        q = float64_add(q, float64_half, s);
        q = float64_div(q, float64_256, s);
        q = float64_sqrt(q, s);
        q = float64_div(float64_one, q, s);
    }
-    /* r in units of 1/256 rounded to nearest */
+    b = 512;
-    /* s = (int)(256.0 * r + 0.5); */
+    while (a * (b + 1) * (b + 1) < (1 << 28)) {
        b += 1;
    }
    estimate = (b + 1) / 2;
    assert(256 <= estimate && estimate < 512);
-    q = float64_mul(q, float64_256,s );
+    return estimate;
-    q = float64_add(q, float64_half, s);
+}
    q_int = float64_to_int64_round_to_zero(q, s);
-    /* return (double)s / 256.0;*/
+
-    return float64_div(int64_to_float64(q_int, s), float64_256, s);
 /*
 * Shared core of the rsqrte_f16/f32/f64 helpers.
 *
 * On entry *exp is the raw exponent field and frac the fraction (without
 * implicit bit) positioned so that its top bit is bit 51. On return *exp
 * holds (exp_off - exp) / 2 and the return value carries the low 8 bits
 * of the estimate in bits [51:44].
 */
+static uint64_t recip_sqrt_estimate(int *exp , int exp_off, uint64_t frac)
 {
    int estimate;
    uint32_t scaled;
    if (*exp == 0) {
        /* Zero exponent field: shift the fraction up until bit 51 is set,
         * lowering the exponent once per shift, then shift that bit out.
         * NOTE(review): this loop cannot terminate for frac == 0; the
         * visible caller rsqrte_f16 returns early for zero inputs -
         * confirm the other callers do the same.
         */
        while (extract64(frac, 51, 1) == 0) {
            frac = frac << 1;
            *exp -= 1;
        }
        frac = extract64(frac, 0, 51) << 1;
    }
    /* The exponent's low bit selects which 9-bit value is fed to
     * do_recip_sqrt_estimate(): 128..255 when set, 256..511 when clear.
     */
    if (*exp & 1) {
        /* scaled = UInt('01':fraction<51:45>) */
        scaled = deposit32(1 << 7, 0, 7, extract64(frac, 45, 7));
    } else {
        /* scaled = UInt('1':fraction<51:44>) */
        scaled = deposit32(1 << 8, 0, 8, extract64(frac, 44, 8));
    }
    estimate = do_recip_sqrt_estimate(scaled);
    *exp = (exp_off - *exp) / 2;
    /* Only estimate<7:0> is returned, placed at fraction bits [51:44] */
    return extract64(estimate, 0, 8) << 44;
 }
 /*
 * Half-precision reciprocal square root estimate.
 *
 * The input is first passed through float16_squash_input_denormal().
 * NaNs, zeros, negative values and +infinity are answered directly;
 * every remaining input goes through recip_sqrt_estimate() and the
 * 8-bit estimate is placed at the top of the result fraction.
 */
 float16 HELPER(rsqrte_f16)(float16 input, void *fpstp)
 {
    float_status *fpst = fpstp;
    float16 f16 = float16_squash_input_denormal(input, fpst);
    uint16_t bits = float16_val(f16);
    bool negative = float16_is_neg(f16);
    int exp = extract32(bits, 10, 5);
    uint16_t frac = extract32(bits, 0, 10);
    uint64_t est_frac;
    if (float16_is_any_nan(f16)) {
        float16 nan = f16;
        if (float16_is_signaling_nan(f16, fpst)) {
            float_raise(float_flag_invalid, fpst);
            nan = float16_maybe_silence_nan(f16, fpst);
        }
        /* Default-NaN mode overrides whatever NaN we were given */
        return fpst->default_nan_mode ? float16_default_nan(fpst) : nan;
    }
    if (float16_is_zero(f16)) {
        float_raise(float_flag_divbyzero, fpst);
        return float16_set_sign(float16_infinity, negative);
    }
    if (negative) {
        float_raise(float_flag_invalid, fpst);
        return float16_default_nan(fpst);
    }
    if (float16_is_infinity(f16)) {
        return float16_zero;
    }
    /* Move the 10-bit fraction up so that its top bit sits at bit 51;
     * recip_sqrt_estimate() normalizes and keeps the exponent parity.  */
    est_frac = recip_sqrt_estimate(&exp, 44, ((uint64_t) frac) << (52 - 10));
    /* result = sign : result_exp<4:0> : estimate<7:0> : Zeros(2) */
    bits = deposit32(0, 15, 1, negative);
    bits = deposit32(bits, 10, 5, exp);
    bits = deposit32(bits, 2, 8, extract64(est_frac, 52 - 8, 8));
    return make_float16(bits);
 }
 float32 HELPER(rsqrte_f32)(float32 input, void *fpstp)
@ -11761,13 +11854,10 @@ float32 HELPER(rsqrte_f32)(float32 input, void *fpstp)
    float_status *s = fpstp;
    float32 f32 = float32_squash_input_denormal(input, s);
    uint32_t val = float32_val(f32);
-    uint32_t f32_sbit = 0x80000000 & val;
+    uint32_t f32_sign = float32_is_neg(f32);
-    int32_t f32_exp = extract32(val, 23, 8);
+    int f32_exp = extract32(val, 23, 8);
    uint32_t f32_frac = extract32(val, 0, 23);
    uint64_t f64_frac;
    uint64_t val64;
    int result_exp;
    float64 f64;
    if (float32_is_any_nan(f32)) {
        float32 nan = f32;
@ -11793,32 +11883,13 @@ float32 HELPER(rsqrte_f32)(float32 input, void *fpstp)
     * preserving the parity of the exponent.  */
    f64_frac = ((uint64_t) f32_frac) << 29;
    if (f32_exp == 0) {
        while (extract64(f64_frac, 51, 1) == 0) {
            f64_frac = f64_frac << 1;
            f32_exp = f32_exp-1;
        }
        f64_frac = extract64(f64_frac, 0, 51) << 1;
    }
-    if (extract64(f32_exp, 0, 1) == 0) {
+    f64_frac = recip_sqrt_estimate(&f32_exp, 380, f64_frac);
        f64 = make_float64(((uint64_t) f32_sbit) << 32
                           | (0x3feULL << 52)
                           | f64_frac);
    } else {
        f64 = make_float64(((uint64_t) f32_sbit) << 32
                           | (0x3fdULL << 52)
                           | f64_frac);
    }
-    result_exp = (380 - f32_exp) / 2;
+    /* result = sign : result_exp<7:0> : estimate<7:0> : Zeros(15) */
-
+    val = deposit32(0, 31, 1, f32_sign);
-    f64 = recip_sqrt_estimate(f64, s);
+    val = deposit32(val, 23, 8, f32_exp);
-
+    val = deposit32(val, 15, 8, extract64(f64_frac, 52 - 8, 8));
    val64 = float64_val(f64);
    val = ((result_exp & 0xff) << 23)
        | ((val64 >> 29)  & 0x7fffff);
    return make_float32(val);
 }
@ -11827,11 +11898,9 @@ float64 HELPER(rsqrte_f64)(float64 input, void *fpstp)
    float_status *s = fpstp;
    float64 f64 = float64_squash_input_denormal(input, s);
    uint64_t val = float64_val(f64);
-    uint64_t f64_sbit = 0x8000000000000000ULL & val;
+    bool f64_sign = float64_is_neg(f64);
-    int64_t f64_exp = extract64(val, 52, 11);
+    int f64_exp = extract64(val, 52, 11);
    uint64_t f64_frac = extract64(val, 0, 52);
    int64_t result_exp;
    uint64_t result_frac;
    if (float64_is_any_nan(f64)) {
        float64 nan = f64;
@ -11853,75 +11922,41 @@ float64 HELPER(rsqrte_f64)(float64 input, void *fpstp)
        return float64_zero;
    }
-    /* Scale and normalize to a double-precision value between 0.25 and 1.0,
+    f64_frac = recip_sqrt_estimate(&f64_exp, 3068, f64_frac);
     * preserving the parity of the exponent.  */
-    if (f64_exp == 0) {
+    /* result = sign : result_exp<10:0> : estimate<7:0> : Zeros(44) */
-        while (extract64(f64_frac, 51, 1) == 0) {
+    val = deposit64(0, 61, 1, f64_sign);
-            f64_frac = f64_frac << 1;
+    val = deposit64(val, 52, 11, f64_exp);
-            f64_exp = f64_exp - 1;
+    val = deposit64(val, 44, 8, extract64(f64_frac, 52 - 8, 8));
-        }
+    return make_float64(val);
        f64_frac = extract64(f64_frac, 0, 51) << 1;
    }
    if (extract64(f64_exp, 0, 1) == 0) {
        f64 = make_float64(f64_sbit
                           | (0x3feULL << 52)
                           | f64_frac);
    } else {
        f64 = make_float64(f64_sbit
                           | (0x3fdULL << 52)
                           | f64_frac);
    }
    result_exp = (3068 - f64_exp) / 2;
    f64 = recip_sqrt_estimate(f64, s);
    result_frac = extract64(float64_val(f64), 0, 52);
    return make_float64(f64_sbit |
                        ((result_exp & 0x7ff) << 52) |
                        result_frac);
 }
 uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp)
 {
-    float_status *s = fpstp;
+    /* float_status *s = fpstp; */
-    float64 f64;
+    int input, estimate;
    if ((a & 0x80000000) == 0) {
        return 0xffffffff;
    }
-    f64 = make_float64((0x3feULL << 52)
+    input = extract32(a, 23, 9);
-                       | ((int64_t)(a & 0x7fffffff) << 21));
+    estimate = recip_estimate(input);
-    f64 = recip_estimate(f64, s);
+    return deposit32(0, (32 - 9), 9, estimate);
    return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
 uint32_t HELPER(rsqrte_u32)(uint32_t a, void *fpstp)
 {
-    float_status *fpst = fpstp;
+    int estimate;
    float64 f64;
    if ((a & 0xc0000000) == 0) {
        return 0xffffffff;
    }
-    if (a & 0x80000000) {
+    estimate = do_recip_sqrt_estimate(extract32(a, 23, 9));
        f64 = make_float64((0x3feULL << 52)
                           | ((uint64_t)(a & 0x7fffffff) << 21));
    } else { /* bits 31-30 == '01' */
        f64 = make_float64((0x3fdULL << 52)
                           | ((uint64_t)(a & 0x3fffffff) << 22));
    }
-    f64 = recip_sqrt_estimate(f64, fpst);
+    return deposit32(0, 23, 9, estimate);
    return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
 /* VFPv4 fused multiply-accumulate */
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@ -120,17 +120,23 @@ DEF_HELPER_3(vfp_cmped, void, f64, f64, env)
 DEF_HELPER_2(vfp_fcvtds, f64, f32, env)
 DEF_HELPER_2(vfp_fcvtsd, f32, f64, env)
 DEF_HELPER_2(vfp_uitoh, f16, i32, ptr)
 DEF_HELPER_2(vfp_uitos, f32, i32, ptr)
 DEF_HELPER_2(vfp_uitod, f64, i32, ptr)
 DEF_HELPER_2(vfp_sitoh, f16, i32, ptr)
 DEF_HELPER_2(vfp_sitos, f32, i32, ptr)
 DEF_HELPER_2(vfp_sitod, f64, i32, ptr)
 DEF_HELPER_2(vfp_touih, i32, f16, ptr)
 DEF_HELPER_2(vfp_touis, i32, f32, ptr)
 DEF_HELPER_2(vfp_touid, i32, f64, ptr)
 DEF_HELPER_2(vfp_touizh, i32, f16, ptr)
 DEF_HELPER_2(vfp_touizs, i32, f32, ptr)
 DEF_HELPER_2(vfp_touizd, i32, f64, ptr)
 DEF_HELPER_2(vfp_tosih, i32, f16, ptr)
 DEF_HELPER_2(vfp_tosis, i32, f32, ptr)
 DEF_HELPER_2(vfp_tosid, i32, f64, ptr)
 DEF_HELPER_2(vfp_tosizh, i32, f16, ptr)
 DEF_HELPER_2(vfp_tosizs, i32, f32, ptr)
 DEF_HELPER_2(vfp_tosizd, i32, f64, ptr)
@ -142,6 +148,8 @@ DEF_HELPER_3(vfp_toshd_round_to_zero, i64, f64, i32, ptr)
 DEF_HELPER_3(vfp_tosld_round_to_zero, i64, f64, i32, ptr)
 DEF_HELPER_3(vfp_touhd_round_to_zero, i64, f64, i32, ptr)
 DEF_HELPER_3(vfp_tould_round_to_zero, i64, f64, i32, ptr)
 DEF_HELPER_3(vfp_toulh, i32, f16, i32, ptr)
 DEF_HELPER_3(vfp_toslh, i32, f16, i32, ptr)
 DEF_HELPER_3(vfp_toshs, i32, f32, i32, ptr)
 DEF_HELPER_3(vfp_tosls, i32, f32, i32, ptr)
 DEF_HELPER_3(vfp_tosqs, i64, f32, i32, ptr)
@ -166,8 +174,10 @@ DEF_HELPER_3(vfp_sqtod, f64, i64, i32, ptr)
 DEF_HELPER_3(vfp_uhtod, f64, i64, i32, ptr)
 DEF_HELPER_3(vfp_ultod, f64, i64, i32, ptr)
 DEF_HELPER_3(vfp_uqtod, f64, i64, i32, ptr)
 DEF_HELPER_3(vfp_sltoh, f16, i32, i32, ptr)
 DEF_HELPER_3(vfp_ultoh, f16, i32, i32, ptr)
-DEF_HELPER_FLAGS_2(set_rmode, TCG_CALL_NO_RWG, i32, i32, env)
+DEF_HELPER_FLAGS_2(set_rmode, TCG_CALL_NO_RWG, i32, i32, ptr)
 DEF_HELPER_FLAGS_2(set_neon_rmode, TCG_CALL_NO_RWG, i32, i32, env)
 DEF_HELPER_2(vfp_fcvt_f16_to_f32, f32, i32, env)
@ -182,8 +192,10 @@ DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
 DEF_HELPER_3(recps_f32, f32, f32, f32, env)
 DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env)
 DEF_HELPER_FLAGS_2(recpe_f16, TCG_CALL_NO_RWG, f16, f16, ptr)
 DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_FLAGS_2(rsqrte_f16, TCG_CALL_NO_RWG, f16, f16, ptr)
 DEF_HELPER_FLAGS_2(rsqrte_f32, TCG_CALL_NO_RWG, f32, f32, ptr)
 DEF_HELPER_FLAGS_2(rsqrte_f64, TCG_CALL_NO_RWG, f64, f64, ptr)
 DEF_HELPER_2(recpe_u32, i32, i32, ptr)
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@ -3143,7 +3143,7 @@ static int handle_vrint(uint32_t insn, uint32_t rd, uint32_t rm, uint32_t dp,
    TCGv_i32 tcg_rmode;
    tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
-    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
    if (dp) {
        TCGv_i64 tcg_op;
@ -3167,7 +3167,7 @@ static int handle_vrint(uint32_t insn, uint32_t rd, uint32_t rm, uint32_t dp,
        tcg_temp_free_i32(tcg_res);
    }
-    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
    tcg_temp_free_i32(tcg_rmode);
    tcg_temp_free_ptr(fpst);
@ -3184,7 +3184,7 @@ static int handle_vcvt(uint32_t insn, uint32_t rd, uint32_t rm, uint32_t dp,
    tcg_shift = tcg_const_i32(0);
    tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rounding));
-    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
    if (dp) {
        TCGv_i64 tcg_double, tcg_res;
@ -3222,7 +3222,7 @@ static int handle_vcvt(uint32_t insn, uint32_t rd, uint32_t rm, uint32_t dp,
        tcg_temp_free_i32(tcg_single);
    }
-    gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+    gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
    tcg_temp_free_i32(tcg_rmode);
    tcg_temp_free_i32(tcg_shift);
@ -3892,13 +3892,13 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
                        TCGv_ptr fpst = get_fpstatus_ptr(0);
                        TCGv_i32 tcg_rmode;
                        tcg_rmode = tcg_const_i32(float_round_to_zero);
-                        gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+                        gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
                        if (dp) {
                            gen_helper_rintd(cpu_F0d, cpu_F0d, fpst);
                        } else {
                            gen_helper_rints(cpu_F0s, cpu_F0s, fpst);
                        }
-                        gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env);
+                        gen_helper_set_rmode(tcg_rmode, tcg_rmode, fpst);
                        tcg_temp_free_i32(tcg_rmode);
                        tcg_temp_free_ptr(fpst);
                        break;