commit 01dc65a3bc

Merge tag 'pull-target-arm-20240919' of https://git.linaro.org/people/pmaydell/qemu-arm into staging

target-arm queue:
 * target/arm: Correct ID_AA64ISAR1_EL1 value for neoverse-v1
 * target/arm: More conversions to decodetree of A64 SIMD insns
 * hw/char/stm32l4x5_usart.c: Enable USART ACK bit response
 * tests: update aarch64/sbsa-ref tests
 * kvm: minor Coverity nit fixes
 * docs/devel: Remove nested-papr.txt

# gpg: Signature made Thu 19 Sep 2024 14:08:42 BST
# gpg:                using RSA key E1A5C593CD419DE28E8315CF3C2525ED14360CDE
# gpg:                issuer "peter.maydell@linaro.org"
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>" [ultimate]
# gpg:                 aka "Peter Maydell <pmaydell@gmail.com>" [ultimate]
# gpg:                 aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>" [ultimate]
# gpg:                 aka "Peter Maydell <peter@archaic.org.uk>" [ultimate]
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE

* tag 'pull-target-arm-20240919' of https://git.linaro.org/people/pmaydell/qemu-arm: (38 commits)
  docs/devel: Remove nested-papr.txt
  target/arm: Correct ID_AA64ISAR1_EL1 value for neoverse-v1
  kvm: Remove unreachable code in kvm_dirty_ring_reaper_thread()
  kvm: Make 'mmap_size' be 'int' in kvm_init_vcpu(), do_kvm_destroy_vcpu()
  tests: drop OpenBSD tests for aarch64/sbsa-ref
  tests: expand timeout information for aarch64/sbsa-ref
  tests: add FreeBSD tests for aarch64/sbsa-ref
  tests: use default cpu for aarch64/sbsa-ref
  hw/char/stm32l4x5_usart.c: Enable USART ACK bit response
  target/arm: Convert scalar [US]QSHRN, [US]QRSHRN, SQSHRUN to decodetree
  target/arm: Convert vector [US]QSHRN, [US]QRSHRN, SQSHRUN to decodetree
  target/arm: Convert SQSHL, UQSHL, SQSHLU (immediate) to decodetree
  target/arm: Widen NeonGenNarrowEnvFn return to 64 bits
  target/arm: Convert VQSHL, VQSHLU to gvec
  target/arm: Convert handle_scalar_simd_shli to decodetree
  target/arm: Convert handle_scalar_simd_shri to decodetree
  target/arm: Convert SHRN, RSHRN to decodetree
  target/arm: Split out subroutines of handle_shri_with_rndacc
  target/arm: Push tcg_rnd into handle_shri_with_rndacc
  target/arm: Convert SSHLL, USHLL to decodetree
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
@@ -414,7 +414,7 @@ int kvm_create_and_park_vcpu(CPUState *cpu)
 static int do_kvm_destroy_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
-    long mmap_size;
+    int mmap_size;
     int ret = 0;
 
     trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
@@ -459,7 +459,7 @@ void kvm_destroy_vcpu(CPUState *cpu)
 int kvm_init_vcpu(CPUState *cpu, Error **errp)
 {
     KVMState *s = kvm_state;
-    long mmap_size;
+    int mmap_size;
     int ret;
 
     trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
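Both functions store the result of the KVM_GET_VCPU_MMAP_SIZE ioctl, which returns an int, so 'long' gave the variable a wider type than the value it ever holds (the Coverity nit). A minimal sketch of the underlying pattern; the wrapper name and error handling here are illustrative, not the exact QEMU code:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>
    #include <errno.h>

    /* Sketch: KVM_GET_VCPU_MMAP_SIZE is issued on the /dev/kvm fd and
     * returns an int, so an 'int' local is the natural type. */
    int get_vcpu_mmap_size(int kvm_fd)
    {
        int mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        if (mmap_size < 0) {
            return -errno;   /* ioctl failed */
        }
        return mmap_size;
    }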
@@ -1525,11 +1525,7 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
         r->reaper_iteration++;
     }
 
-    trace_kvm_dirty_ring_reaper("exit");
-
-    rcu_unregister_thread();
-
-    return NULL;
+    g_assert_not_reached();
 }
 
 static void kvm_dirty_ring_reaper_init(KVMState *s)
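The reaper loop above never terminates, so everything after it was dead code; replacing the tail with g_assert_not_reached() documents that invariant and quiets the unreachable-code finding. The idiom as a standalone sketch (not the QEMU function itself):

    #include <glib.h>

    /* Sketch: a worker that loops forever.  The tail is unreachable and
     * says so explicitly instead of pretending to return. */
    static void *worker_thread(void *data)
    {
        for (;;) {
            /* ... wait for work, process it ... */
        }
        g_assert_not_reached();
    }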
@@ -1,119 +0,0 @@
-Nested PAPR API (aka KVM on PowerVM)
-====================================
-
-This API aims at providing support to enable nested virtualization with
-KVM on PowerVM. While the existing support for nested KVM on PowerNV was
-introduced with cap-nested-hv option, however, with a slight design change,
-to enable this on papr/pseries, a new cap-nested-papr option is added. eg:
-
-  qemu-system-ppc64 -cpu POWER10 -machine pseries,cap-nested-papr=true ...
-
-Work by:
-        Michael Neuling <mikey@neuling.org>
-        Vaibhav Jain <vaibhav@linux.ibm.com>
-        Jordan Niethe <jniethe5@gmail.com>
-        Harsh Prateek Bora <harshpb@linux.ibm.com>
-        Shivaprasad G Bhat <sbhat@linux.ibm.com>
-        Kautuk Consul <kconsul@linux.vnet.ibm.com>
-
-Below taken from the kernel documentation:
-
-Introduction
-============
-
-This document explains how a guest operating system can act as a
-hypervisor and run nested guests through the use of hypercalls, if the
-hypervisor has implemented them. The terms L0, L1, and L2 are used to
-refer to different software entities. L0 is the hypervisor mode entity
-that would normally be called the "host" or "hypervisor". L1 is a
-guest virtual machine that is directly run under L0 and is initiated
-and controlled by L0. L2 is a guest virtual machine that is initiated
-and controlled by L1 acting as a hypervisor. A significant design change
-wrt existing API is that now the entire L2 state is maintained within L0.
-
-Existing Nested-HV API
-======================
-
-Linux/KVM has had support for Nesting as an L0 or L1 since 2018
-
-The L0 code was added::
-
-   commit 8e3f5fc1045dc49fd175b978c5457f5f51e7a2ce
-   Author: Paul Mackerras <paulus@ozlabs.org>
-   Date:   Mon Oct 8 16:31:03 2018 +1100
-   KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization
-
-The L1 code was added::
-
-   commit 360cae313702cdd0b90f82c261a8302fecef030a
-   Author: Paul Mackerras <paulus@ozlabs.org>
-   Date:   Mon Oct 8 16:31:04 2018 +1100
-   KVM: PPC: Book3S HV: Nested guest entry via hypercall
-
-This API works primarily using a signal hcall h_enter_nested(). This
-call made by the L1 to tell the L0 to start an L2 vCPU with the given
-state. The L0 then starts this L2 and runs until an L2 exit condition
-is reached. Once the L2 exits, the state of the L2 is given back to
-the L1 by the L0. The full L2 vCPU state is always transferred from
-and to L1 when the L2 is run. The L0 doesn't keep any state on the L2
-vCPU (except in the short sequence in the L0 on L1 -> L2 entry and L2
--> L1 exit).
-
-The only state kept by the L0 is the partition table. The L1 registers
-it's partition table using the h_set_partition_table() hcall. All
-other state held by the L0 about the L2s is cached state (such as
-shadow page tables).
-
-The L1 may run any L2 or vCPU without first informing the L0. It
-simply starts the vCPU using h_enter_nested(). The creation of L2s and
-vCPUs is done implicitly whenever h_enter_nested() is called.
-
-In this document, we call this existing API the v1 API.
-
-New PAPR API
-===============
-
-The new PAPR API changes from the v1 API such that the creating L2 and
-associated vCPUs is explicit. In this document, we call this the v2
-API.
-
-h_enter_nested() is replaced with H_GUEST_VCPU_RUN(). Before this can
-be called the L1 must explicitly create the L2 using h_guest_create()
-and any associated vCPUs() created with h_guest_create_vCPU(). Getting
-and setting vCPU state can also be performed using h_guest_{g|s}et
-hcall.
-
-The basic execution flow is for an L1 to create an L2, run it, and
-delete it is:
-
-- L1 and L0 negotiate capabilities with H_GUEST_{G,S}ET_CAPABILITIES()
-  (normally at L1 boot time).
-
-- L1 requests the L0 to create an L2 with H_GUEST_CREATE() and receives a token
-
-- L1 requests the L0 to create an L2 vCPU with H_GUEST_CREATE_VCPU()
-
-- L1 and L0 communicate the vCPU state using the H_GUEST_{G,S}ET() hcall
-
-- L1 requests the L0 to run the vCPU using H_GUEST_RUN_VCPU() hcall
-
-- L1 deletes L2 with H_GUEST_DELETE()
-
-For more details, please refer:
-
-[1] Linux Kernel documentation (upstream documentation commit):
-
-commit 476652297f94a2e5e5ef29e734b0da37ade94110
-Author: Michael Neuling <mikey@neuling.org>
-Date:   Thu Sep 14 13:06:00 2023 +1000
-
-    docs: powerpc: Document nested KVM on POWER
-
-    Document support for nested KVM on POWER using the existing API as well
-    as the new PAPR API. This includes the new HCALL interface and how it
-    used by KVM.
-
-    Signed-off-by: Michael Neuling <mikey@neuling.org>
-    Signed-off-by: Jordan Niethe <jniethe5@gmail.com>
-    Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-    Link: https://msgid.link/20230914030600.16993-12-jniethe5@gmail.com
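The v2 flow in the removed document maps onto a simple guest-side call sequence. A hypothetical L1-side sketch; the h_guest_* C wrappers below are invented for illustration and are not the kernel's actual API:

    /* Hypothetical wrappers around the hcalls named above. */
    extern unsigned long h_guest_get_capabilities(void);
    extern void h_guest_set_capabilities(unsigned long caps);
    extern unsigned long h_guest_create(void);
    extern void h_guest_create_vcpu(unsigned long id, unsigned long vcpu);
    extern void h_guest_set(unsigned long id, unsigned long vcpu, const void *state);
    extern void h_guest_get(unsigned long id, unsigned long vcpu, void *state);
    extern long h_guest_run_vcpu(unsigned long id, unsigned long vcpu);
    extern void h_guest_delete(unsigned long id);

    long run_l2_once(void *state)
    {
        unsigned long caps = h_guest_get_capabilities(); /* negotiate at boot */
        h_guest_set_capabilities(caps);
        unsigned long id = h_guest_create();             /* explicit L2 create */
        h_guest_create_vcpu(id, 0);                      /* explicit vCPU create */
        h_guest_set(id, 0, state);                       /* upload vCPU state */
        long exit_reason = h_guest_run_vcpu(id, 0);      /* run until L2 exit */
        h_guest_get(id, 0, state);                       /* read back state */
        h_guest_delete(id);
        return exit_reason;
    }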
@@ -154,6 +154,21 @@ REG32(RDR, 0x24)
 REG32(TDR, 0x28)
     FIELD(TDR, TDR, 0, 9)
 
+static void stm32l4x5_update_isr(Stm32l4x5UsartBaseState *s)
+{
+    if (s->cr1 & R_CR1_TE_MASK) {
+        s->isr |= R_ISR_TEACK_MASK;
+    } else {
+        s->isr &= ~R_ISR_TEACK_MASK;
+    }
+
+    if (s->cr1 & R_CR1_RE_MASK) {
+        s->isr |= R_ISR_REACK_MASK;
+    } else {
+        s->isr &= ~R_ISR_REACK_MASK;
+    }
+}
+
 static void stm32l4x5_update_irq(Stm32l4x5UsartBaseState *s)
 {
     if (((s->isr & R_ISR_WUF_MASK) && (s->cr3 & R_CR3_WUFIE_MASK)) ||
@@ -456,6 +471,7 @@ static void stm32l4x5_usart_base_write(void *opaque, hwaddr addr,
     case A_CR1:
         s->cr1 = value;
         stm32l4x5_update_params(s);
+        stm32l4x5_update_isr(s);
        stm32l4x5_update_irq(s);
         return;
     case A_CR2:
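With TEACK/REACK modelled, guests that poll ISR after setting CR1.TE or CR1.RE (as bare-metal drivers commonly do) no longer spin forever. An illustrative guest-side loop; the base address, offsets and bit positions are the STM32L4 ones but should be checked against the reference manual:

    #include <stdint.h>

    #define USART1_BASE 0x40013800u
    #define USART_CR1   (*(volatile uint32_t *)(USART1_BASE + 0x00))
    #define USART_ISR   (*(volatile uint32_t *)(USART1_BASE + 0x1C))
    #define CR1_TE      (1u << 3)
    #define ISR_TEACK   (1u << 21)

    static void enable_transmitter(void)
    {
        USART_CR1 |= CR1_TE;
        while (!(USART_ISR & ISR_TEACK)) {
            /* wait for the peripheral to acknowledge TE */
        }
    }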
@@ -324,6 +324,18 @@ DEF_HELPER_FLAGS_5(neon_uqrshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(neon_uqrshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(neon_uqrshl_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(neon_uqrshl_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_uqshli_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(neon_sqshlui_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(gvec_srshl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_srshl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -363,17 +375,17 @@ DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32)
 DEF_HELPER_4(neon_qrdmlah_s32, i32, env, s32, s32, s32)
 DEF_HELPER_4(neon_qrdmlsh_s32, i32, env, s32, s32, s32)
 
-DEF_HELPER_1(neon_narrow_u8, i32, i64)
-DEF_HELPER_1(neon_narrow_u16, i32, i64)
-DEF_HELPER_2(neon_unarrow_sat8, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u8, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s8, i32, env, i64)
-DEF_HELPER_2(neon_unarrow_sat16, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u16, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s16, i32, env, i64)
-DEF_HELPER_2(neon_unarrow_sat32, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_u32, i32, env, i64)
-DEF_HELPER_2(neon_narrow_sat_s32, i32, env, i64)
+DEF_HELPER_1(neon_narrow_u8, i64, i64)
+DEF_HELPER_1(neon_narrow_u16, i64, i64)
+DEF_HELPER_2(neon_unarrow_sat8, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u8, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s8, i64, env, i64)
+DEF_HELPER_2(neon_unarrow_sat16, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u16, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s16, i64, env, i64)
+DEF_HELPER_2(neon_unarrow_sat32, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_u32, i64, env, i64)
+DEF_HELPER_2(neon_narrow_sat_s32, i64, env, i64)
 DEF_HELPER_1(neon_narrow_high_u8, i32, i64)
 DEF_HELPER_1(neon_narrow_high_u16, i32, i64)
 DEF_HELPER_1(neon_narrow_round_high_u8, i32, i64)
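The narrow helpers now return i64 even though only the low 32 bits are significant, which lets the translator keep results in 64-bit temporaries throughout. For reference, a scalar model of what neon_narrow_sat_u8 computes (a sketch, not the helper itself; the real helper sets QC via the CPU state):

    #include <stdint.h>

    /* Narrow four u16 lanes in x to four saturated u8 lanes; only the
     * low 32 bits of the result are significant. */
    static uint64_t narrow_sat_u8(uint64_t x, int *qc)
    {
        uint64_t res = 0;
        for (int i = 0; i < 4; i++) {
            uint16_t s = x >> (16 * i);
            uint8_t d = s;
            if (s > 0xff) {   /* saturate and record it */
                d = 0xff;
                *qc = 1;
            }
            res |= (uint64_t)d << (8 * i);
        }
        return res;
    }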
@@ -30,10 +30,12 @@
 &rri_sf rd rn imm sf
 &i imm
 &rr_e rd rn esz
+&rri_e rd rn imm esz
 &rrr_e rd rn rm esz
 &rrx_e rd rn rm idx esz
 &rrrr_e rd rn rm ra esz
 &qrr_e q rd rn esz
+&qrri_e q rd rn imm esz
 &qrrr_e q rd rn rm esz
 &qrrx_e q rd rn rm idx esz
 &qrrrr_e q rd rn rm ra esz
@@ -54,11 +56,15 @@
 @rrx_d ........ .. . rm:5 .... idx:1 . rn:5 rd:5 &rrx_e esz=3
 
 @rr_q1e0 ........ ........ ...... rn:5 rd:5 &qrr_e q=1 esz=0
+@rr_q1e2 ........ ........ ...... rn:5 rd:5 &qrr_e q=1 esz=2
 @r2r_q1e0 ........ ........ ...... rm:5 rd:5 &qrrr_e rn=%rd q=1 esz=0
 @rrr_q1e0 ........ ... rm:5 ...... rn:5 rd:5 &qrrr_e q=1 esz=0
 @rrr_q1e3 ........ ... rm:5 ...... rn:5 rd:5 &qrrr_e q=1 esz=3
 @rrrr_q1e3 ........ ... rm:5 . ra:5 rn:5 rd:5 &qrrrr_e q=1 esz=3
 
+@qrr_h . q:1 ...... .. ...... ...... rn:5 rd:5 &qrr_e esz=1
+@qrr_e . q:1 ...... esz:2 ...... ...... rn:5 rd:5 &qrr_e
+
 @qrrr_b . q:1 ...... ... rm:5 ...... rn:5 rd:5 &qrrr_e esz=0
 @qrrr_h . q:1 ...... ... rm:5 ...... rn:5 rd:5 &qrrr_e esz=1
 @qrrr_s . q:1 ...... ... rm:5 ...... rn:5 rd:5 &qrrr_e esz=2
@@ -1136,3 +1142,254 @@ FMADD 0001 1111 .. 0 ..... 0 ..... ..... ..... @rrrr_hsd
 FMSUB 0001 1111 .. 0 ..... 1 ..... ..... ..... @rrrr_hsd
 FNMADD 0001 1111 .. 1 ..... 0 ..... ..... ..... @rrrr_hsd
 FNMSUB 0001 1111 .. 1 ..... 1 ..... ..... ..... @rrrr_hsd
+
+# Advanced SIMD Extract
+
+EXT_d 0010 1110 00 0 rm:5 00 imm:3 0 rn:5 rd:5
+EXT_q 0110 1110 00 0 rm:5 0 imm:4 0 rn:5 rd:5
+
+# Advanced SIMD Table Lookup
+
+TBL_TBX 0 q:1 00 1110 000 rm:5 0 len:2 tbx:1 00 rn:5 rd:5
+
+# Advanced SIMD Permute
+
+UZP1 0.00 1110 .. 0 ..... 0 001 10 ..... ..... @qrrr_e
+UZP2 0.00 1110 .. 0 ..... 0 101 10 ..... ..... @qrrr_e
+TRN1 0.00 1110 .. 0 ..... 0 010 10 ..... ..... @qrrr_e
+TRN2 0.00 1110 .. 0 ..... 0 110 10 ..... ..... @qrrr_e
+ZIP1 0.00 1110 .. 0 ..... 0 011 10 ..... ..... @qrrr_e
+ZIP2 0.00 1110 .. 0 ..... 0 111 10 ..... ..... @qrrr_e
+
+# Advanced SIMD Across Lanes
+
+ADDV 0.00 1110 .. 11000 11011 10 ..... ..... @qrr_e
+SADDLV 0.00 1110 .. 11000 00011 10 ..... ..... @qrr_e
+UADDLV 0.10 1110 .. 11000 00011 10 ..... ..... @qrr_e
+SMAXV 0.00 1110 .. 11000 01010 10 ..... ..... @qrr_e
+UMAXV 0.10 1110 .. 11000 01010 10 ..... ..... @qrr_e
+SMINV 0.00 1110 .. 11000 11010 10 ..... ..... @qrr_e
+UMINV 0.10 1110 .. 11000 11010 10 ..... ..... @qrr_e
+
+FMAXNMV_h 0.00 1110 00 11000 01100 10 ..... ..... @qrr_h
+FMAXNMV_s 0110 1110 00 11000 01100 10 ..... ..... @rr_q1e2
+
+FMINNMV_h 0.00 1110 10 11000 01100 10 ..... ..... @qrr_h
+FMINNMV_s 0110 1110 10 11000 01100 10 ..... ..... @rr_q1e2
+
+FMAXV_h 0.00 1110 00 11000 01111 10 ..... ..... @qrr_h
+FMAXV_s 0110 1110 00 11000 01111 10 ..... ..... @rr_q1e2
+
+FMINV_h 0.00 1110 10 11000 01111 10 ..... ..... @qrr_h
+FMINV_s 0110 1110 10 11000 01111 10 ..... ..... @rr_q1e2
+
+# Floating-point Immediate
+
+FMOVI_s 0001 1110 .. 1 imm:8 100 00000 rd:5 esz=%esz_hsd
+
+# Advanced SIMD Modified Immediate / Shift by Immediate
+
+%abcdefgh 16:3 5:5
+
+# Right shifts are encoded as N - shift, where N is the element size in bits.
+%neon_rshift_i6 16:6 !function=rsub_64
+%neon_rshift_i5 16:5 !function=rsub_32
+%neon_rshift_i4 16:4 !function=rsub_16
+%neon_rshift_i3 16:3 !function=rsub_8
+
+@q_shri_b . q:1 .. ..... 0001 ... ..... . rn:5 rd:5 \
+           &qrri_e esz=0 imm=%neon_rshift_i3
+@q_shri_h . q:1 .. ..... 001 .... ..... . rn:5 rd:5 \
+           &qrri_e esz=1 imm=%neon_rshift_i4
+@q_shri_s . q:1 .. ..... 01 ..... ..... . rn:5 rd:5 \
+           &qrri_e esz=2 imm=%neon_rshift_i5
+@q_shri_d . 1 .. ..... 1 ...... ..... . rn:5 rd:5 \
+           &qrri_e esz=3 imm=%neon_rshift_i6 q=1
+
+@q_shli_b . q:1 .. ..... 0001 imm:3 ..... . rn:5 rd:5 &qrri_e esz=0
+@q_shli_h . q:1 .. ..... 001 imm:4 ..... . rn:5 rd:5 &qrri_e esz=1
+@q_shli_s . q:1 .. ..... 01 imm:5 ..... . rn:5 rd:5 &qrri_e esz=2
+@q_shli_d . 1 .. ..... 1 imm:6 ..... . rn:5 rd:5 &qrri_e esz=3 q=1
+
+FMOVI_v_h 0 q:1 00 1111 00000 ... 1111 11 ..... rd:5 %abcdefgh
+
+# MOVI, MVNI, ORR, BIC, FMOV are all intermixed via cmode.
+Vimm 0 q:1 op:1 0 1111 00000 ... cmode:4 01 ..... rd:5 %abcdefgh
+
+SSHR_v 0.00 11110 .... ... 00000 1 ..... ..... @q_shri_b
+SSHR_v 0.00 11110 .... ... 00000 1 ..... ..... @q_shri_h
+SSHR_v 0.00 11110 .... ... 00000 1 ..... ..... @q_shri_s
+SSHR_v 0.00 11110 .... ... 00000 1 ..... ..... @q_shri_d
+
+USHR_v 0.10 11110 .... ... 00000 1 ..... ..... @q_shri_b
+USHR_v 0.10 11110 .... ... 00000 1 ..... ..... @q_shri_h
+USHR_v 0.10 11110 .... ... 00000 1 ..... ..... @q_shri_s
+USHR_v 0.10 11110 .... ... 00000 1 ..... ..... @q_shri_d
+
+SSRA_v 0.00 11110 .... ... 00010 1 ..... ..... @q_shri_b
+SSRA_v 0.00 11110 .... ... 00010 1 ..... ..... @q_shri_h
+SSRA_v 0.00 11110 .... ... 00010 1 ..... ..... @q_shri_s
+SSRA_v 0.00 11110 .... ... 00010 1 ..... ..... @q_shri_d
+
+USRA_v 0.10 11110 .... ... 00010 1 ..... ..... @q_shri_b
+USRA_v 0.10 11110 .... ... 00010 1 ..... ..... @q_shri_h
+USRA_v 0.10 11110 .... ... 00010 1 ..... ..... @q_shri_s
+USRA_v 0.10 11110 .... ... 00010 1 ..... ..... @q_shri_d
+
+SRSHR_v 0.00 11110 .... ... 00100 1 ..... ..... @q_shri_b
+SRSHR_v 0.00 11110 .... ... 00100 1 ..... ..... @q_shri_h
+SRSHR_v 0.00 11110 .... ... 00100 1 ..... ..... @q_shri_s
+SRSHR_v 0.00 11110 .... ... 00100 1 ..... ..... @q_shri_d
+
+URSHR_v 0.10 11110 .... ... 00100 1 ..... ..... @q_shri_b
+URSHR_v 0.10 11110 .... ... 00100 1 ..... ..... @q_shri_h
+URSHR_v 0.10 11110 .... ... 00100 1 ..... ..... @q_shri_s
+URSHR_v 0.10 11110 .... ... 00100 1 ..... ..... @q_shri_d
+
+SRSRA_v 0.00 11110 .... ... 00110 1 ..... ..... @q_shri_b
+SRSRA_v 0.00 11110 .... ... 00110 1 ..... ..... @q_shri_h
+SRSRA_v 0.00 11110 .... ... 00110 1 ..... ..... @q_shri_s
+SRSRA_v 0.00 11110 .... ... 00110 1 ..... ..... @q_shri_d
+
+URSRA_v 0.10 11110 .... ... 00110 1 ..... ..... @q_shri_b
+URSRA_v 0.10 11110 .... ... 00110 1 ..... ..... @q_shri_h
+URSRA_v 0.10 11110 .... ... 00110 1 ..... ..... @q_shri_s
+URSRA_v 0.10 11110 .... ... 00110 1 ..... ..... @q_shri_d
+
+SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_b
+SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_h
+SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_s
+SRI_v 0.10 11110 .... ... 01000 1 ..... ..... @q_shri_d
+
+SHL_v 0.00 11110 .... ... 01010 1 ..... ..... @q_shli_b
+SHL_v 0.00 11110 .... ... 01010 1 ..... ..... @q_shli_h
+SHL_v 0.00 11110 .... ... 01010 1 ..... ..... @q_shli_s
+SHL_v 0.00 11110 .... ... 01010 1 ..... ..... @q_shli_d
+
+SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_b
+SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_h
+SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_s
+SLI_v 0.10 11110 .... ... 01010 1 ..... ..... @q_shli_d
+
+SSHLL_v 0.00 11110 .... ... 10100 1 ..... ..... @q_shli_b
+SSHLL_v 0.00 11110 .... ... 10100 1 ..... ..... @q_shli_h
+SSHLL_v 0.00 11110 .... ... 10100 1 ..... ..... @q_shli_s
+
+USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_b
+USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_h
+USHLL_v 0.10 11110 .... ... 10100 1 ..... ..... @q_shli_s
+
+SHRN_v 0.00 11110 .... ... 10000 1 ..... ..... @q_shri_b
+SHRN_v 0.00 11110 .... ... 10000 1 ..... ..... @q_shri_h
+SHRN_v 0.00 11110 .... ... 10000 1 ..... ..... @q_shri_s
+
+RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_b
+RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_h
+RSHRN_v 0.00 11110 .... ... 10001 1 ..... ..... @q_shri_s
+
+SQSHL_vi 0.00 11110 .... ... 01110 1 ..... ..... @q_shli_b
+SQSHL_vi 0.00 11110 .... ... 01110 1 ..... ..... @q_shli_h
+SQSHL_vi 0.00 11110 .... ... 01110 1 ..... ..... @q_shli_s
+SQSHL_vi 0.00 11110 .... ... 01110 1 ..... ..... @q_shli_d
+
+UQSHL_vi 0.10 11110 .... ... 01110 1 ..... ..... @q_shli_b
+UQSHL_vi 0.10 11110 .... ... 01110 1 ..... ..... @q_shli_h
+UQSHL_vi 0.10 11110 .... ... 01110 1 ..... ..... @q_shli_s
+UQSHL_vi 0.10 11110 .... ... 01110 1 ..... ..... @q_shli_d
+
+SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_b
+SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_h
+SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_s
+SQSHLU_vi 0.10 11110 .... ... 01100 1 ..... ..... @q_shli_d
+
+SQSHRN_v 0.00 11110 .... ... 10010 1 ..... ..... @q_shri_b
+SQSHRN_v 0.00 11110 .... ... 10010 1 ..... ..... @q_shri_h
+SQSHRN_v 0.00 11110 .... ... 10010 1 ..... ..... @q_shri_s
+
+UQSHRN_v 0.10 11110 .... ... 10010 1 ..... ..... @q_shri_b
+UQSHRN_v 0.10 11110 .... ... 10010 1 ..... ..... @q_shri_h
+UQSHRN_v 0.10 11110 .... ... 10010 1 ..... ..... @q_shri_s
+
+SQSHRUN_v 0.10 11110 .... ... 10000 1 ..... ..... @q_shri_b
+SQSHRUN_v 0.10 11110 .... ... 10000 1 ..... ..... @q_shri_h
+SQSHRUN_v 0.10 11110 .... ... 10000 1 ..... ..... @q_shri_s
+
+SQRSHRN_v 0.00 11110 .... ... 10011 1 ..... ..... @q_shri_b
+SQRSHRN_v 0.00 11110 .... ... 10011 1 ..... ..... @q_shri_h
+SQRSHRN_v 0.00 11110 .... ... 10011 1 ..... ..... @q_shri_s
+
+UQRSHRN_v 0.10 11110 .... ... 10011 1 ..... ..... @q_shri_b
+UQRSHRN_v 0.10 11110 .... ... 10011 1 ..... ..... @q_shri_h
+UQRSHRN_v 0.10 11110 .... ... 10011 1 ..... ..... @q_shri_s
+
+SQRSHRUN_v 0.10 11110 .... ... 10001 1 ..... ..... @q_shri_b
+SQRSHRUN_v 0.10 11110 .... ... 10001 1 ..... ..... @q_shri_h
+SQRSHRUN_v 0.10 11110 .... ... 10001 1 ..... ..... @q_shri_s
+
+# Advanced SIMD scalar shift by immediate
+
+@shri_b .... ..... 0001 ... ..... . rn:5 rd:5 \
+         &rri_e esz=0 imm=%neon_rshift_i3
+@shri_h .... ..... 001 .... ..... . rn:5 rd:5 \
+         &rri_e esz=1 imm=%neon_rshift_i4
+@shri_s .... ..... 01 ..... ..... . rn:5 rd:5 \
+         &rri_e esz=2 imm=%neon_rshift_i5
+@shri_d .... ..... 1 ...... ..... . rn:5 rd:5 \
+         &rri_e esz=3 imm=%neon_rshift_i6
+
+@shli_b .... ..... 0001 imm:3 ..... . rn:5 rd:5 &rri_e esz=0
+@shli_h .... ..... 001 imm:4 ..... . rn:5 rd:5 &rri_e esz=1
+@shli_s .... ..... 01 imm:5 ..... . rn:5 rd:5 &rri_e esz=2
+@shli_d .... ..... 1 imm:6 ..... . rn:5 rd:5 &rri_e esz=3
+
+SSHR_s 0101 11110 .... ... 00000 1 ..... ..... @shri_d
+USHR_s 0111 11110 .... ... 00000 1 ..... ..... @shri_d
+SSRA_s 0101 11110 .... ... 00010 1 ..... ..... @shri_d
+USRA_s 0111 11110 .... ... 00010 1 ..... ..... @shri_d
+SRSHR_s 0101 11110 .... ... 00100 1 ..... ..... @shri_d
+URSHR_s 0111 11110 .... ... 00100 1 ..... ..... @shri_d
+SRSRA_s 0101 11110 .... ... 00110 1 ..... ..... @shri_d
+URSRA_s 0111 11110 .... ... 00110 1 ..... ..... @shri_d
+SRI_s 0111 11110 .... ... 01000 1 ..... ..... @shri_d
+
+SHL_s 0101 11110 .... ... 01010 1 ..... ..... @shli_d
+SLI_s 0111 11110 .... ... 01010 1 ..... ..... @shli_d
+
+SQSHL_si 0101 11110 .... ... 01110 1 ..... ..... @shli_b
+SQSHL_si 0101 11110 .... ... 01110 1 ..... ..... @shli_h
+SQSHL_si 0101 11110 .... ... 01110 1 ..... ..... @shli_s
+SQSHL_si 0101 11110 .... ... 01110 1 ..... ..... @shli_d
+
+UQSHL_si 0111 11110 .... ... 01110 1 ..... ..... @shli_b
+UQSHL_si 0111 11110 .... ... 01110 1 ..... ..... @shli_h
+UQSHL_si 0111 11110 .... ... 01110 1 ..... ..... @shli_s
+UQSHL_si 0111 11110 .... ... 01110 1 ..... ..... @shli_d
+
+SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_b
+SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_h
+SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_s
+SQSHLU_si 0111 11110 .... ... 01100 1 ..... ..... @shli_d
+
+SQSHRN_si 0101 11110 .... ... 10010 1 ..... ..... @shri_b
+SQSHRN_si 0101 11110 .... ... 10010 1 ..... ..... @shri_h
+SQSHRN_si 0101 11110 .... ... 10010 1 ..... ..... @shri_s
+
+UQSHRN_si 0111 11110 .... ... 10010 1 ..... ..... @shri_b
+UQSHRN_si 0111 11110 .... ... 10010 1 ..... ..... @shri_h
+UQSHRN_si 0111 11110 .... ... 10010 1 ..... ..... @shri_s
+
+SQSHRUN_si 0111 11110 .... ... 10000 1 ..... ..... @shri_b
+SQSHRUN_si 0111 11110 .... ... 10000 1 ..... ..... @shri_h
+SQSHRUN_si 0111 11110 .... ... 10000 1 ..... ..... @shri_s
+
+SQRSHRN_si 0101 11110 .... ... 10011 1 ..... ..... @shri_b
+SQRSHRN_si 0101 11110 .... ... 10011 1 ..... ..... @shri_h
+SQRSHRN_si 0101 11110 .... ... 10011 1 ..... ..... @shri_s
+
+UQRSHRN_si 0111 11110 .... ... 10011 1 ..... ..... @shri_b
+UQRSHRN_si 0111 11110 .... ... 10011 1 ..... ..... @shri_h
+UQRSHRN_si 0111 11110 .... ... 10011 1 ..... ..... @shri_s
+
+SQRSHRUN_si 0111 11110 .... ... 10001 1 ..... ..... @shri_b
+SQRSHRUN_si 0111 11110 .... ... 10001 1 ..... ..... @shri_h
+SQRSHRUN_si 0111 11110 .... ... 10001 1 ..... ..... @shri_s
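The %neon_rshift_iN extractors undo the N - shift encoding through small C callbacks named by !function=. A sketch of the rsub_* family as it appears in the translator:

    /* Right shifts are encoded as N - shift, where N is the element size;
     * decodetree calls these to turn the field back into a shift count. */
    static int rsub_64(DisasContext *s, int x) { return 64 - x; }
    static int rsub_32(DisasContext *s, int x) { return 32 - x; }
    static int rsub_16(DisasContext *s, int x) { return 16 - x; }
    static int rsub_8(DisasContext *s, int x)  { return 8 - x; }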
@@ -677,7 +677,7 @@ static void aarch64_neoverse_v1_initfn(Object *obj)
     cpu->isar.id_aa64dfr0 = 0x000001f210305519ull;
     cpu->isar.id_aa64dfr1 = 0x00000000;
     cpu->isar.id_aa64isar0 = 0x1011111110212120ull; /* with FEAT_RNG */
-    cpu->isar.id_aa64isar1 = 0x0111000001211032ull;
+    cpu->isar.id_aa64isar1 = 0x0011100001211032ull;
     cpu->isar.id_aa64mmfr0 = 0x0000000000101125ull;
     cpu->isar.id_aa64mmfr1 = 0x0000000010212122ull;
     cpu->isar.id_aa64mmfr2 = 0x0220011102101011ull;
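The corrected value appears to flip two nibbles: BF16 (bits [47:44]) becomes 1 and XS (bits [59:56]) becomes 0. A quick self-check using the FIELD_EX64 accessor and the ID_AA64ISAR1 field names from target/arm/cpu.h (a sketch; field layout per the Arm ARM):

    static void check_neoverse_v1_isar1(void)
    {
        uint64_t isar1 = 0x0011100001211032ull;
        /* FEAT_BF16 present, FEAT_XS absent, on the corrected value. */
        g_assert(FIELD_EX64(isar1, ID_AA64ISAR1, BF16) == 1);
        g_assert(FIELD_EX64(isar1, ID_AA64ISAR1, XS) == 0);
    }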
@@ -88,6 +88,25 @@ GEN_CMP0(gen_gvec_cgt0, TCG_COND_GT)
 
 #undef GEN_CMP0
 
+void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+    /* Signed shift out of range results in all-sign-bits */
+    shift = MIN(shift, (8 << vece) - 1);
+    tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
+}
+
+void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                   int64_t shift, uint32_t opr_sz, uint32_t max_sz)
+{
+    /* Unsigned shift out of range results in all-zero-bits */
+    if (shift >= (8 << vece)) {
+        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
+    } else {
+        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
+    }
+}
+
 static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
 {
     tcg_gen_vec_sar8i_i64(a, a, shift);
@@ -285,7 +304,7 @@ void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
     tcg_gen_add_i32(d, d, t);
 }
 
-void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
+void gen_srshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
 {
     TCGv_i64 t = tcg_temp_new_i64();
 
@@ -297,10 +316,9 @@ void gen_srshr32_i32(TCGv_i32 d, TCGv_i32 a, int32_t sh)
 static void gen_srshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
-    TCGv_vec ones = tcg_temp_new_vec_matching(d);
+    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
 
     tcg_gen_shri_vec(vece, t, a, sh - 1);
-    tcg_gen_dupi_vec(vece, ones, 1);
     tcg_gen_and_vec(vece, t, t, ones);
     tcg_gen_sari_vec(vece, d, a, sh);
     tcg_gen_add_vec(vece, d, d, t);
@@ -492,10 +510,9 @@ void gen_urshr64_i64(TCGv_i64 d, TCGv_i64 a, int64_t sh)
 static void gen_urshr_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t shift)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
-    TCGv_vec ones = tcg_temp_new_vec_matching(d);
+    TCGv_vec ones = tcg_constant_vec_matching(d, vece, 1);
 
     tcg_gen_shri_vec(vece, t, a, shift - 1);
-    tcg_gen_dupi_vec(vece, ones, 1);
     tcg_gen_and_vec(vece, t, t, ones);
     tcg_gen_shri_vec(vece, d, a, shift);
     tcg_gen_add_vec(vece, d, d, t);
@@ -685,9 +702,9 @@ static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
 static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
-    TCGv_vec m = tcg_temp_new_vec_matching(d);
+    int64_t mi = MAKE_64BIT_MASK((8 << vece) - sh, sh);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, mi);
 
-    tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK((8 << vece) - sh, sh));
     tcg_gen_shri_vec(vece, t, a, sh);
     tcg_gen_and_vec(vece, d, d, m);
     tcg_gen_or_vec(vece, d, d, t);
@@ -773,10 +790,9 @@ static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
 static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
 {
     TCGv_vec t = tcg_temp_new_vec_matching(d);
-    TCGv_vec m = tcg_temp_new_vec_matching(d);
+    TCGv_vec m = tcg_constant_vec_matching(d, vece, MAKE_64BIT_MASK(0, sh));
 
     tcg_gen_shli_vec(vece, t, a, sh);
-    tcg_gen_dupi_vec(vece, m, MAKE_64BIT_MASK(0, sh));
     tcg_gen_and_vec(vece, d, d, m);
     tcg_gen_or_vec(vece, d, d, t);
 }
@@ -1044,14 +1060,13 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
-    TCGv_vec msk, max;
+    TCGv_vec max, zero;
 
     tcg_gen_neg_vec(vece, rsh, shift);
     if (vece == MO_8) {
         tcg_gen_mov_vec(lsh, shift);
     } else {
-        msk = tcg_temp_new_vec_matching(dst);
-        tcg_gen_dupi_vec(vece, msk, 0xff);
+        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
         tcg_gen_and_vec(vece, lsh, shift, msk);
         tcg_gen_and_vec(vece, rsh, rsh, msk);
     }
@@ -1064,26 +1079,21 @@ static void gen_ushl_vec(unsigned vece, TCGv_vec dst,
     tcg_gen_shlv_vec(vece, lval, src, lsh);
     tcg_gen_shrv_vec(vece, rval, src, rsh);
 
-    max = tcg_temp_new_vec_matching(dst);
-    tcg_gen_dupi_vec(vece, max, 8 << vece);
-
     /*
-     * The choice of LT (signed) and GEU (unsigned) are biased toward
+     * The choice of GE (signed) and GEU (unsigned) are biased toward
      * the instructions of the x86_64 host.  For MO_8, the whole byte
     * is significant so we must use an unsigned compare; otherwise we
     * have already masked to a byte and so a signed compare works.
     * Other tcg hosts have a full set of comparisons and do not care.
     */
+    zero = tcg_constant_vec_matching(dst, vece, 0);
+    max = tcg_constant_vec_matching(dst, vece, 8 << vece);
     if (vece == MO_8) {
-        tcg_gen_cmp_vec(TCG_COND_GEU, vece, lsh, lsh, max);
-        tcg_gen_cmp_vec(TCG_COND_GEU, vece, rsh, rsh, max);
-        tcg_gen_andc_vec(vece, lval, lval, lsh);
-        tcg_gen_andc_vec(vece, rval, rval, rsh);
+        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, lval, lsh, max, zero, lval);
+        tcg_gen_cmpsel_vec(TCG_COND_GEU, vece, rval, rsh, max, zero, rval);
     } else {
-        tcg_gen_cmp_vec(TCG_COND_LT, vece, lsh, lsh, max);
-        tcg_gen_cmp_vec(TCG_COND_LT, vece, rsh, rsh, max);
-        tcg_gen_and_vec(vece, lval, lval, lsh);
-        tcg_gen_and_vec(vece, rval, rval, rsh);
+        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, lval, lsh, max, zero, lval);
+        tcg_gen_cmpsel_vec(TCG_COND_GE, vece, rval, rsh, max, zero, rval);
     }
     tcg_gen_or_vec(vece, dst, lval, rval);
 }
@@ -1093,7 +1103,7 @@ void gen_gvec_ushl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
 {
     static const TCGOpcode vecop_list[] = {
         INDEX_op_neg_vec, INDEX_op_shlv_vec,
-        INDEX_op_shrv_vec, INDEX_op_cmp_vec, 0
+        INDEX_op_shrv_vec, INDEX_op_cmpsel_vec, 0
     };
     static const GVecGen3 ops[4] = {
         { .fniv = gen_ushl_vec,
@@ -1169,7 +1179,7 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
     TCGv_vec rval = tcg_temp_new_vec_matching(dst);
     TCGv_vec lsh = tcg_temp_new_vec_matching(dst);
     TCGv_vec rsh = tcg_temp_new_vec_matching(dst);
-    TCGv_vec tmp = tcg_temp_new_vec_matching(dst);
+    TCGv_vec max, zero;
 
     /*
      * Rely on the TCG guarantee that out of range shifts produce
@@ -1180,29 +1190,28 @@ static void gen_sshl_vec(unsigned vece, TCGv_vec dst,
     if (vece == MO_8) {
         tcg_gen_mov_vec(lsh, shift);
     } else {
-        tcg_gen_dupi_vec(vece, tmp, 0xff);
-        tcg_gen_and_vec(vece, lsh, shift, tmp);
-        tcg_gen_and_vec(vece, rsh, rsh, tmp);
+        TCGv_vec msk = tcg_constant_vec_matching(dst, vece, 0xff);
+        tcg_gen_and_vec(vece, lsh, shift, msk);
+        tcg_gen_and_vec(vece, rsh, rsh, msk);
     }
 
     /* Bound rsh so out of bound right shift gets -1. */
-    tcg_gen_dupi_vec(vece, tmp, (8 << vece) - 1);
-    tcg_gen_umin_vec(vece, rsh, rsh, tmp);
-    tcg_gen_cmp_vec(TCG_COND_GT, vece, tmp, lsh, tmp);
+    max = tcg_constant_vec_matching(dst, vece, (8 << vece) - 1);
+    tcg_gen_umin_vec(vece, rsh, rsh, max);
 
     tcg_gen_shlv_vec(vece, lval, src, lsh);
     tcg_gen_sarv_vec(vece, rval, src, rsh);
 
     /* Select in-bound left shift. */
-    tcg_gen_andc_vec(vece, lval, lval, tmp);
+    zero = tcg_constant_vec_matching(dst, vece, 0);
+    tcg_gen_cmpsel_vec(TCG_COND_GT, vece, lval, lsh, max, zero, lval);
 
     /* Select between left and right shift. */
     if (vece == MO_8) {
-        tcg_gen_dupi_vec(vece, tmp, 0);
-        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, rval, lval);
+        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, zero, rval, lval);
     } else {
-        tcg_gen_dupi_vec(vece, tmp, 0x80);
-        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, tmp, lval, rval);
+        TCGv_vec sgn = tcg_constant_vec_matching(dst, vece, 0x80);
+        tcg_gen_cmpsel_vec(TCG_COND_LT, vece, dst, lsh, sgn, lval, rval);
     }
 }
@@ -1211,7 +1220,7 @@ void gen_gvec_sshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
 {
     static const TCGOpcode vecop_list[] = {
         INDEX_op_neg_vec, INDEX_op_umin_vec, INDEX_op_shlv_vec,
-        INDEX_op_sarv_vec, INDEX_op_cmp_vec, INDEX_op_cmpsel_vec, 0
+        INDEX_op_sarv_vec, INDEX_op_cmpsel_vec, 0
     };
     static const GVecGen3 ops[4] = {
         { .fniv = gen_sshl_vec,
@@ -1304,6 +1313,42 @@ void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                            opr_sz, max_sz, 0, fns[vece]);
 }
 
+void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                     int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+    static gen_helper_gvec_2_ptr * const fns[] = {
+        gen_helper_neon_sqshli_b, gen_helper_neon_sqshli_h,
+        gen_helper_neon_sqshli_s, gen_helper_neon_sqshli_d,
+    };
+    tcg_debug_assert(vece <= MO_64);
+    tcg_debug_assert(c >= 0 && c <= (8 << vece));
+    tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
+void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                     int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+    static gen_helper_gvec_2_ptr * const fns[] = {
+        gen_helper_neon_uqshli_b, gen_helper_neon_uqshli_h,
+        gen_helper_neon_uqshli_s, gen_helper_neon_uqshli_d,
+    };
+    tcg_debug_assert(vece <= MO_64);
+    tcg_debug_assert(c >= 0 && c <= (8 << vece));
+    tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
+void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                      int64_t c, uint32_t opr_sz, uint32_t max_sz)
+{
+    static gen_helper_gvec_2_ptr * const fns[] = {
+        gen_helper_neon_sqshlui_b, gen_helper_neon_sqshlui_h,
+        gen_helper_neon_sqshlui_s, gen_helper_neon_sqshlui_d,
+    };
+    tcg_debug_assert(vece <= MO_64);
+    tcg_debug_assert(c >= 0 && c <= (8 << vece));
+    tcg_gen_gvec_2_ptr(rd_ofs, rn_ofs, tcg_env, opr_sz, max_sz, c, fns[vece]);
+}
+
 void gen_uqadd_bhs(TCGv_i64 res, TCGv_i64 qc, TCGv_i64 a, TCGv_i64 b, MemOp esz)
 {
     uint64_t max = MAKE_64BIT_MASK(0, 8 << esz);
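gen_gvec_sshr and gen_gvec_ushr fold the architectural out-of-range behaviour into the immediate before emitting a plain vector shift. A scalar model of the per-lane semantics for 8-bit elements, as a sketch for reference only:

    #include <stdint.h>

    /* Unsigned shifts of >= the element width give all-zeros; signed
     * ones give all-sign-bits (clamp the count to width - 1). */
    static inline uint8_t ushr8(uint8_t x, unsigned sh)
    {
        return sh >= 8 ? 0 : x >> sh;
    }

    static inline int8_t sshr8(int8_t x, unsigned sh)
    {
        return x >> (sh >= 8 ? 7 : sh);
    }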
@@ -291,17 +291,17 @@ VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_s
 VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_h
 VSLI_2sh 1111 001 1 1 . ...... .... 0101 . . . 1 .... @2reg_shl_b
 
-VQSHLU_64_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
+VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_d
 VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_s
 VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_h
 VQSHLU_2sh 1111 001 1 1 . ...... .... 0110 . . . 1 .... @2reg_shl_b
 
-VQSHL_S_64_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
 VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
 VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
 VQSHL_S_2sh 1111 001 0 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
 
-VQSHL_U_64_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
+VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_d
 VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_s
 VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_h
 VQSHL_U_2sh 1111 001 1 1 . ...... .... 0111 . . . 1 .... @2reg_shl_b
@@ -141,6 +141,19 @@ void HELPER(name)(void *vd, void *vn, void *vm, void *venv, uint32_t desc) \
     clear_tail(d, opr_sz, simd_maxsz(desc)); \
 }
 
+#define NEON_GVEC_VOP2i_ENV(name, vtype) \
+void HELPER(name)(void *vd, void *vn, void *venv, uint32_t desc) \
+{ \
+    intptr_t i, opr_sz = simd_oprsz(desc); \
+    int imm = simd_data(desc); \
+    vtype *d = vd, *n = vn; \
+    CPUARMState *env = venv; \
+    for (i = 0; i < opr_sz / sizeof(vtype); i++) { \
+        NEON_FN(d[i], n[i], imm); \
+    } \
+    clear_tail(d, opr_sz, simd_maxsz(desc)); \
+}
+
 /* Pairwise operations.  */
 /* For 32-bit elements each segment only contains a single element, so
    the elementwise and pairwise operations are the same.  */
@@ -271,22 +284,26 @@ uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
 NEON_GVEC_VOP2_ENV(neon_uqshl_b, uint8_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
 NEON_GVEC_VOP2_ENV(neon_uqshl_h, uint16_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_h, uint16_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
 NEON_GVEC_VOP2_ENV(neon_uqshl_s, uint32_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_s, uint32_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_uqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
 NEON_GVEC_VOP2_ENV(neon_uqshl_d, uint64_t)
+NEON_GVEC_VOP2i_ENV(neon_uqshli_d, uint64_t)
 #undef NEON_FN
 
 uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -303,22 +320,26 @@ uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
 NEON_GVEC_VOP2_ENV(neon_sqshl_b, int8_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_b, int8_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
 NEON_GVEC_VOP2_ENV(neon_sqshl_h, int16_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_h, int16_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_sqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
 NEON_GVEC_VOP2_ENV(neon_sqshl_s, int32_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_s, int32_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_sqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
 NEON_GVEC_VOP2_ENV(neon_sqshl_d, int64_t)
+NEON_GVEC_VOP2i_ENV(neon_sqshli_d, int64_t)
 #undef NEON_FN
 
 uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -334,11 +355,13 @@ uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
 #define NEON_FN(dest, src1, src2) \
     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
 NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_b, int8_t)
 #undef NEON_FN
 
 #define NEON_FN(dest, src1, src2) \
     (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
 NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_h, int16_t)
 #undef NEON_FN
 
 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
@@ -351,6 +374,16 @@ uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
     return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
 }
 
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 32, false, env->vfp.qc))
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_s, int32_t)
+#undef NEON_FN
+
+#define NEON_FN(dest, src1, src2) \
+    (dest = do_suqrshl_d(src1, (int8_t)src2, false, env->vfp.qc))
+NEON_GVEC_VOP2i_ENV(neon_sqshlui_d, int64_t)
+#undef NEON_FN
+
 #define NEON_FN(dest, src1, src2) \
     (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
@@ -565,13 +598,15 @@ NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
 #undef NEON_FN
 #undef NEON_QDMULH32
 
-uint32_t HELPER(neon_narrow_u8)(uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_u8)(uint64_t x)
 {
     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
            | ((x >> 24) & 0xff000000u);
 }
 
-uint32_t HELPER(neon_narrow_u16)(uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_u16)(uint64_t x)
 {
     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
 }
@@ -602,7 +637,8 @@ uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
 }
 
-uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
 {
     uint16_t s;
     uint8_t d;
@@ -629,7 +665,8 @@ uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
 {
     uint16_t s;
     uint8_t d;
@@ -652,7 +689,8 @@ uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
 {
     int16_t s;
     uint8_t d;
@@ -675,7 +713,8 @@ uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
     return res;
 }
 
-uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
 {
     uint32_t high;
     uint32_t low;
@@ -695,10 +734,11 @@ uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
         high = 0xffff;
         SET_QC();
     }
-    return low | (high << 16);
+    return deposit32(low, 16, 16, high);
 }
 
-uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
 {
     uint32_t high;
     uint32_t low;
@@ -712,10 +752,11 @@ uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
         high = 0xffff;
         SET_QC();
     }
-    return low | (high << 16);
+    return deposit32(low, 16, 16, high);
 }
 
-uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
 {
     int32_t low;
     int32_t high;
@@ -729,10 +770,11 @@ uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
         high = (high >> 31) ^ 0x7fff;
         SET_QC();
     }
-    return (uint16_t)low | (high << 16);
+    return deposit32(low, 16, 16, high);
 }
 
-uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
 {
     if (x & 0x8000000000000000ull) {
         SET_QC();
@@ -745,7 +787,8 @@ uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
     return x;
 }
 
-uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
 {
     if (x > 0xffffffffu) {
         SET_QC();
@@ -754,13 +797,14 @@ uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
     return x;
 }
 
-uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
+/* Only the low 32-bits of output are significant. */
+uint64_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
 {
     if ((int64_t)x != (int32_t)x) {
         SET_QC();
-        return ((int64_t)x >> 63) ^ 0x7fffffff;
+        return (uint32_t)((int64_t)x >> 63) ^ 0x7fffffff;
     }
-    return x;
+    return (uint32_t)x;
 }
 
 uint64_t HELPER(neon_widen_u8)(uint32_t x)
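Expanding NEON_GVEC_VOP2i_ENV(neon_uqshli_b, uint8_t) under the corresponding NEON_FN gives roughly the following; a hand expansion to show where the immediate and the QC flag travel (a sketch, modulo the HELPER() name mangling):

    void HELPER(neon_uqshli_b)(void *vd, void *vn, void *venv, uint32_t desc)
    {
        intptr_t i, opr_sz = simd_oprsz(desc);
        int imm = simd_data(desc);          /* the shift immediate */
        uint8_t *d = vd, *n = vn;
        CPUARMState *env = venv;

        for (i = 0; i < opr_sz / sizeof(uint8_t); i++) {
            /* saturating shift per lane; saturation sets QC */
            d[i] = do_uqrshl_bhs(n[i], (int8_t)imm, 8, false, env->vfp.qc);
        }
        clear_tail(d, opr_sz, simd_maxsz(desc));
    }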
[File diff suppressed because it is too large]
@ -1099,144 +1099,18 @@ DO_2SH(VRSHR_S, gen_gvec_srshr)
|
||||
DO_2SH(VRSHR_U, gen_gvec_urshr)
|
||||
DO_2SH(VRSRA_S, gen_gvec_srsra)
|
||||
DO_2SH(VRSRA_U, gen_gvec_ursra)
|
||||
|
||||
static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
|
||||
{
|
||||
/* Signed shift out of range results in all-sign-bits */
|
||||
a->shift = MIN(a->shift, (8 << a->size) - 1);
|
||||
return do_vector_2sh(s, a, tcg_gen_gvec_sari);
|
||||
}
|
||||
|
||||
static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
|
||||
int64_t shift, uint32_t oprsz, uint32_t maxsz)
|
||||
{
|
||||
tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
|
||||
}
|
||||
|
||||
static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
|
||||
{
|
||||
/* Shift out of range is architecturally valid and results in zero. */
|
||||
if (a->shift >= (8 << a->size)) {
|
||||
return do_vector_2sh(s, a, gen_zero_rd_2sh);
|
||||
} else {
|
||||
return do_vector_2sh(s, a, tcg_gen_gvec_shri);
|
||||
}
|
||||
}
|
||||
|
||||
static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
|
||||
NeonGenTwo64OpEnvFn *fn)
|
||||
{
|
||||
/*
|
||||
* 2-reg-and-shift operations, size == 3 case, where the
|
||||
* function needs to be passed tcg_env.
|
||||
*/
|
||||
TCGv_i64 constimm;
|
||||
int pass;
|
||||
|
||||
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* UNDEF accesses to D16-D31 if they don't exist. */
|
||||
if (!dc_isar_feature(aa32_simd_r32, s) &&
|
||||
((a->vd | a->vm) & 0x10)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if ((a->vm | a->vd) & a->q) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!vfp_access_check(s)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* To avoid excessive duplication of ops we implement shift
|
||||
* by immediate using the variable shift operations.
|
||||
*/
|
||||
constimm = tcg_constant_i64(dup_const(a->size, a->shift));
|
||||
|
||||
for (pass = 0; pass < a->q + 1; pass++) {
|
||||
TCGv_i64 tmp = tcg_temp_new_i64();
|
||||
|
||||
read_neon_element64(tmp, a->vm, pass, MO_64);
|
||||
fn(tmp, tcg_env, tmp, constimm);
|
||||
write_neon_element64(tmp, a->vd, pass, MO_64);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
|
||||
NeonGenTwoOpEnvFn *fn)
|
||||
{
|
||||
/*
|
||||
* 2-reg-and-shift operations, size < 3 case, where the
|
||||
* helper needs to be passed tcg_env.
|
||||
*/
|
||||
TCGv_i32 constimm, tmp;
|
||||
int pass;
|
||||
|
||||
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* UNDEF accesses to D16-D31 if they don't exist. */
|
||||
if (!dc_isar_feature(aa32_simd_r32, s) &&
|
||||
((a->vd | a->vm) & 0x10)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if ((a->vm | a->vd) & a->q) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!vfp_access_check(s)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* To avoid excessive duplication of ops we implement shift
|
||||
* by immediate using the variable shift operations.
|
||||
*/
|
||||
constimm = tcg_constant_i32(dup_const(a->size, a->shift));
|
||||
tmp = tcg_temp_new_i32();
|
||||
|
||||
for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
|
||||
read_neon_element32(tmp, a->vm, pass, MO_32);
|
||||
fn(tmp, tcg_env, tmp, constimm);
|
||||
write_neon_element32(tmp, a->vd, pass, MO_32);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#define DO_2SHIFT_ENV(INSN, FUNC) \
|
||||
static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
|
||||
{ \
|
||||
return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \
|
||||
} \
|
||||
static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
|
||||
{ \
|
||||
static NeonGenTwoOpEnvFn * const fns[] = { \
|
||||
gen_helper_neon_##FUNC##8, \
|
||||
gen_helper_neon_##FUNC##16, \
|
||||
gen_helper_neon_##FUNC##32, \
|
||||
}; \
|
||||
assert(a->size < ARRAY_SIZE(fns)); \
|
||||
return do_2shift_env_32(s, a, fns[a->size]); \
|
||||
}
|
||||
|
||||
DO_2SHIFT_ENV(VQSHLU, qshlu_s)
|
||||
DO_2SHIFT_ENV(VQSHL_U, qshl_u)
|
||||
DO_2SHIFT_ENV(VQSHL_S, qshl_s)
|
||||
DO_2SH(VSHR_S, gen_gvec_sshr)
|
||||
DO_2SH(VSHR_U, gen_gvec_ushr)
|
||||
DO_2SH(VQSHLU, gen_neon_sqshlui)
|
||||
DO_2SH(VQSHL_U, gen_neon_uqshli)
|
||||
DO_2SH(VQSHL_S, gen_neon_sqshli)
|
||||
|
||||
static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
|
||||
NeonGenTwo64OpFn *shiftfn,
|
||||
NeonGenNarrowEnvFn *narrowfn)
|
||||
NeonGenOne64OpEnvFn *narrowfn)
|
||||
{
|
||||
/* 2-reg-and-shift narrowing-shift operations, size == 3 case */
|
||||
TCGv_i64 constimm, rm1, rm2;
|
||||
TCGv_i32 rd;
|
||||
TCGv_i64 constimm, rm1, rm2, rd;
|
||||
|
||||
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
|
||||
return false;
|
||||
@ -1263,7 +1137,7 @@ static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
|
||||
constimm = tcg_constant_i64(-a->shift);
|
||||
rm1 = tcg_temp_new_i64();
|
||||
rm2 = tcg_temp_new_i64();
|
||||
rd = tcg_temp_new_i32();
|
||||
rd = tcg_temp_new_i64();
|
||||
|
||||
/* Load both inputs first to avoid potential overwrite if rm == rd */
|
||||
read_neon_element64(rm1, a->vm, 0, MO_64);
|
||||
@ -1271,18 +1145,18 @@ static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
|
||||
|
||||
shiftfn(rm1, rm1, constimm);
|
||||
narrowfn(rd, tcg_env, rm1);
|
||||
write_neon_element32(rd, a->vd, 0, MO_32);
|
||||
write_neon_element64(rd, a->vd, 0, MO_32);
|
||||
|
||||
shiftfn(rm2, rm2, constimm);
|
||||
narrowfn(rd, tcg_env, rm2);
|
||||
write_neon_element32(rd, a->vd, 1, MO_32);
|
||||
write_neon_element64(rd, a->vd, 1, MO_32);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
                                 NeonGenTwoOpFn *shiftfn,
-                                NeonGenNarrowEnvFn *narrowfn)
+                                NeonGenOne64OpEnvFn *narrowfn)
 {
     /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
     TCGv_i32 constimm, rm1, rm2, rm3, rm4;
@@ -1337,16 +1211,16 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,

     tcg_gen_concat_i32_i64(rtmp, rm1, rm2);

-    narrowfn(rm1, tcg_env, rtmp);
-    write_neon_element32(rm1, a->vd, 0, MO_32);
+    narrowfn(rtmp, tcg_env, rtmp);
+    write_neon_element64(rtmp, a->vd, 0, MO_32);

     shiftfn(rm3, rm3, constimm);
     shiftfn(rm4, rm4, constimm);

     tcg_gen_concat_i32_i64(rtmp, rm3, rm4);

-    narrowfn(rm3, tcg_env, rtmp);
-    write_neon_element32(rm3, a->vd, 1, MO_32);
+    narrowfn(rtmp, tcg_env, rtmp);
+    write_neon_element64(rtmp, a->vd, 1, MO_32);
     return true;
 }
@@ -1361,17 +1235,17 @@ static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
         return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC);            \
     }

-static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u32(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
 {
-    tcg_gen_extrl_i64_i32(dest, src);
+    tcg_gen_ext32u_i64(dest, src);
 }

-static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u16(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
 {
     gen_helper_neon_narrow_u16(dest, src);
 }

-static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
+static void gen_neon_narrow_u8(TCGv_i64 dest, TCGv_ptr env, TCGv_i64 src)
 {
     gen_helper_neon_narrow_u8(dest, src);
 }
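The signature changes above all track the narrowfn type swap from NeonGenNarrowEnvFn to NeonGenOne64OpEnvFn; both typedefs appear verbatim in the translate.h hunk later in this diff:

/* Old: narrowing helpers returned their result in a 32-bit temp. */
typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64);

/*
 * New: the (still 32-bit significant) result lives in the low half of
 * a 64-bit temp; write_neon_element64(..., MO_32) is then expected to
 * store only that low half into the destination element.
 */
typedef void NeonGenOne64OpEnvFn(TCGv_i64, TCGv_env, TCGv_i64);

This lets a single i64 value flow from the shift through the narrowing helper to the store with no intermediate 64-to-32 extraction.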
@@ -2962,10 +2836,9 @@ static bool trans_VZIP(DisasContext *s, arg_2misc *a)
 }

 static bool do_vmovn(DisasContext *s, arg_2misc *a,
-                     NeonGenNarrowEnvFn *narrowfn)
+                     NeonGenOne64OpEnvFn *narrowfn)
 {
-    TCGv_i64 rm;
-    TCGv_i32 rd0, rd1;
+    TCGv_i64 rm, rd0, rd1;

     if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
         return false;
@@ -2990,22 +2863,22 @@ static bool do_vmovn(DisasContext *s, arg_2misc *a,
     }

     rm = tcg_temp_new_i64();
-    rd0 = tcg_temp_new_i32();
-    rd1 = tcg_temp_new_i32();
+    rd0 = tcg_temp_new_i64();
+    rd1 = tcg_temp_new_i64();

     read_neon_element64(rm, a->vm, 0, MO_64);
     narrowfn(rd0, tcg_env, rm);
     read_neon_element64(rm, a->vm, 1, MO_64);
     narrowfn(rd1, tcg_env, rm);
-    write_neon_element32(rd0, a->vd, 0, MO_32);
-    write_neon_element32(rd1, a->vd, 1, MO_32);
+    write_neon_element64(rd0, a->vd, 0, MO_32);
+    write_neon_element64(rd1, a->vd, 1, MO_32);
     return true;
 }

 #define DO_VMOVN(INSN, FUNC)                                   \
     static bool trans_##INSN(DisasContext *s, arg_2misc *a)    \
     {                                                          \
-        static NeonGenNarrowEnvFn * const narrowfn[] = {       \
+        static NeonGenOne64OpEnvFn * const narrowfn[] = {      \
             FUNC##8,                                           \
             FUNC##16,                                          \
             FUNC##32,                                          \

@@ -6081,9 +6081,9 @@ static void gen_sshll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm)

     if (top) {
         if (shl == halfbits) {
-            TCGv_vec t = tcg_temp_new_vec_matching(d);
-            tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits));
-            tcg_gen_and_vec(vece, d, n, t);
+            tcg_gen_and_vec(vece, d, n,
+                            tcg_constant_vec_matching(d, vece,
+                                MAKE_64BIT_MASK(halfbits, halfbits)));
         } else {
             tcg_gen_sari_vec(vece, d, n, halfbits);
             tcg_gen_shli_vec(vece, d, d, shl);
@@ -6138,18 +6138,18 @@ static void gen_ushll_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t imm)

     if (top) {
         if (shl == halfbits) {
-            TCGv_vec t = tcg_temp_new_vec_matching(d);
-            tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(halfbits, halfbits));
-            tcg_gen_and_vec(vece, d, n, t);
+            tcg_gen_and_vec(vece, d, n,
+                            tcg_constant_vec_matching(d, vece,
+                                MAKE_64BIT_MASK(halfbits, halfbits)));
         } else {
             tcg_gen_shri_vec(vece, d, n, halfbits);
             tcg_gen_shli_vec(vece, d, d, shl);
         }
     } else {
         if (shl == 0) {
-            TCGv_vec t = tcg_temp_new_vec_matching(d);
-            tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-            tcg_gen_and_vec(vece, d, n, t);
+            tcg_gen_and_vec(vece, d, n,
+                            tcg_constant_vec_matching(d, vece,
+                                MAKE_64BIT_MASK(0, halfbits)));
         } else {
             tcg_gen_shli_vec(vece, d, n, halfbits);
             tcg_gen_shri_vec(vece, d, d, halfbits - shl);
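Every gen_*_vec change in this file has the same shape: a mutable scratch vector loaded via tcg_gen_dupi_vec becomes a direct tcg_constant_vec_matching operand. In outline (fragments taken from the hunks above):

/* Before: broadcast the mask into a freshly allocated temp. */
TCGv_vec t = tcg_temp_new_vec_matching(d);
tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
tcg_gen_and_vec(vece, d, n, t);

/* After: use a read-only constant vector; no temp, no runtime dup. */
tcg_gen_and_vec(vece, d, n,
                tcg_constant_vec_matching(d, vece,
                                          MAKE_64BIT_MASK(0, halfbits)));

Constant vectors must never be written, so the rewrites below keep all arithmetic in n or d and, where one bound is used twice, hoist it into a local such as maxv.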
@@ -6317,18 +6317,14 @@ static const TCGOpcode sqxtn_list[] = {

 static void gen_sqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t mask = (1ull << halfbits) - 1;
     int64_t min = -1ull << (halfbits - 1);
     int64_t max = -min - 1;

-    tcg_gen_dupi_vec(vece, t, min);
-    tcg_gen_smax_vec(vece, d, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_smin_vec(vece, d, d, t);
-    tcg_gen_dupi_vec(vece, t, mask);
-    tcg_gen_and_vec(vece, d, d, t);
+    tcg_gen_smax_vec(vece, d, n, tcg_constant_vec_matching(d, vece, min));
+    tcg_gen_smin_vec(vece, d, d, tcg_constant_vec_matching(d, vece, max));
+    tcg_gen_and_vec(vece, d, d, tcg_constant_vec_matching(d, vece, mask));
 }

 static const GVecGen2 sqxtnb_ops[3] = {
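The smax/smin/and sequence in gen_sqxtnb_vec is signed saturation into the low half of each lane. Working the constants through for vece = MO_16:

/*
 * vece = MO_16  =>  halfbits = 4 << 1 = 8
 *   mask = (1ull << 8) - 1 = 0xff
 *   min  = -1ull << 7      = -128
 *   max  = -min - 1        =  127
 *
 * smax(d, n, min) then smin(d, d, max) clamps every 16-bit lane to
 * [-128, 127]; the final and with 0xff keeps just the saturated
 * 8-bit result in the bottom half of each lane.
 */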
@@ -6349,19 +6345,15 @@ TRANS_FEAT(SQXTNB, aa64_sve2, do_narrow_extract, a, sqxtnb_ops)

 static void gen_sqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t mask = (1ull << halfbits) - 1;
     int64_t min = -1ull << (halfbits - 1);
     int64_t max = -min - 1;

-    tcg_gen_dupi_vec(vece, t, min);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_smin_vec(vece, n, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+    tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_dupi_vec(vece, t, mask);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
 }

 static const GVecGen2 sqxtnt_ops[3] = {
@@ -6389,12 +6381,10 @@ static const TCGOpcode uqxtn_list[] = {

 static void gen_uqxtnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = (1ull << halfbits) - 1;

-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_umin_vec(vece, d, n, t);
+    tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
 }

 static const GVecGen2 uqxtnb_ops[3] = {
@@ -6415,14 +6405,13 @@ TRANS_FEAT(UQXTNB, aa64_sve2, do_narrow_extract, a, uqxtnb_ops)

 static void gen_uqxtnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = (1ull << halfbits) - 1;
+    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);

-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_umin_vec(vece, n, n, t);
+    tcg_gen_umin_vec(vece, n, n, maxv);
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, maxv, d, n);
 }

 static const GVecGen2 uqxtnt_ops[3] = {
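The *t ("top") variants place the narrowed value in the high half of each lane: after saturating, the value is shifted left by halfbits and merged with the bottom halves already sitting in d. gen_uqxtnt_vec above also shows why a twice-used bound is hoisted into maxv rather than rebuilt:

/*
 * tcg_gen_bitsel_vec(vece, d, maxv, d, n) computes
 *     d = (d & maxv) | (n & ~maxv)
 * maxv is the low-half mask, so the previously written bottom halves
 * of d survive and the shifted result in n supplies the top halves.
 * Reusing maxv for both the umin clamp and the selector is safe
 * because constant vectors are never modified.
 */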
@@ -6450,14 +6439,11 @@ static const TCGOpcode sqxtun_list[] = {

 static void gen_sqxtunb_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = (1ull << halfbits) - 1;

-    tcg_gen_dupi_vec(vece, t, 0);
-    tcg_gen_smax_vec(vece, d, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_umin_vec(vece, d, d, t);
+    tcg_gen_smax_vec(vece, d, n, tcg_constant_vec_matching(d, vece, 0));
+    tcg_gen_umin_vec(vece, d, d, tcg_constant_vec_matching(d, vece, max));
 }

 static const GVecGen2 sqxtunb_ops[3] = {
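For the unsigned-from-signed narrows the clamp is one-sided, and no trailing mask is needed:

/*
 * gen_sqxtunb_vec with vece = MO_16 (halfbits = 8, max = 0xff):
 *   smax(d, n, 0)    -- negative lanes saturate to 0
 *   umin(d, d, max)  -- lanes above 255 saturate to 255
 * The result already fits in the low 8 bits of each lane.
 */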
@@ -6478,16 +6464,14 @@ TRANS_FEAT(SQXTUNB, aa64_sve2, do_narrow_extract, a, sqxtunb_ops)

 static void gen_sqxtunt_vec(unsigned vece, TCGv_vec d, TCGv_vec n)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = (1ull << halfbits) - 1;
+    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);

-    tcg_gen_dupi_vec(vece, t, 0);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_umin_vec(vece, n, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+    tcg_gen_umin_vec(vece, n, n, maxv);
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, maxv, d, n);
 }

 static const GVecGen2 sqxtunt_ops[3] = {
@@ -6551,13 +6535,11 @@ static void gen_shrnb64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)

 static void gen_shrnb_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     uint64_t mask = MAKE_64BIT_MASK(0, halfbits);

     tcg_gen_shri_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, mask);
-    tcg_gen_and_vec(vece, d, n, t);
+    tcg_gen_and_vec(vece, d, n, tcg_constant_vec_matching(d, vece, mask));
 }

 static const TCGOpcode shrnb_vec_list[] = { INDEX_op_shri_vec, 0 };
@@ -6609,13 +6591,11 @@ static void gen_shrnt64_i64(TCGv_i64 d, TCGv_i64 n, int64_t shr)

 static void gen_shrnt_vec(unsigned vece, TCGv_vec d, TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     uint64_t mask = MAKE_64BIT_MASK(0, halfbits);

     tcg_gen_shli_vec(vece, n, n, halfbits - shr);
-    tcg_gen_dupi_vec(vece, t, mask);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
 }

 static const TCGOpcode shrnt_vec_list[] = { INDEX_op_shli_vec, 0 };
@@ -6658,14 +6638,12 @@ TRANS_FEAT(RSHRNT, aa64_sve2, do_shr_narrow, a, rshrnt_ops)
 static void gen_sqshrunb_vec(unsigned vece, TCGv_vec d,
                              TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
+    uint64_t max = MAKE_64BIT_MASK(0, halfbits);

     tcg_gen_sari_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, 0);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_umin_vec(vece, d, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+    tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
 }

 static const TCGOpcode sqshrunb_vec_list[] = {
@@ -6690,16 +6668,15 @@ TRANS_FEAT(SQSHRUNB, aa64_sve2, do_shr_narrow, a, sqshrunb_ops)
 static void gen_sqshrunt_vec(unsigned vece, TCGv_vec d,
                              TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
+    uint64_t max = MAKE_64BIT_MASK(0, halfbits);
+    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);

     tcg_gen_sari_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, 0);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_umin_vec(vece, n, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, 0));
+    tcg_gen_umin_vec(vece, n, n, maxv);
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, maxv, d, n);
 }

 static const TCGOpcode sqshrunt_vec_list[] = {
@@ -6742,18 +6719,15 @@ TRANS_FEAT(SQRSHRUNT, aa64_sve2, do_shr_narrow, a, sqrshrunt_ops)
 static void gen_sqshrnb_vec(unsigned vece, TCGv_vec d,
                             TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = MAKE_64BIT_MASK(0, halfbits - 1);
     int64_t min = -max - 1;
+    int64_t mask = MAKE_64BIT_MASK(0, halfbits);

     tcg_gen_sari_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, min);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_smin_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_and_vec(vece, d, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+    tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
+    tcg_gen_and_vec(vece, d, n, tcg_constant_vec_matching(d, vece, mask));
 }

 static const TCGOpcode sqshrnb_vec_list[] = {
@@ -6778,19 +6752,16 @@ TRANS_FEAT(SQSHRNB, aa64_sve2, do_shr_narrow, a, sqshrnb_ops)
 static void gen_sqshrnt_vec(unsigned vece, TCGv_vec d,
                             TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
     int64_t max = MAKE_64BIT_MASK(0, halfbits - 1);
     int64_t min = -max - 1;
+    int64_t mask = MAKE_64BIT_MASK(0, halfbits);

     tcg_gen_sari_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, min);
-    tcg_gen_smax_vec(vece, n, n, t);
-    tcg_gen_dupi_vec(vece, t, max);
-    tcg_gen_smin_vec(vece, n, n, t);
+    tcg_gen_smax_vec(vece, n, n, tcg_constant_vec_matching(d, vece, min));
+    tcg_gen_smin_vec(vece, n, n, tcg_constant_vec_matching(d, vece, max));
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, tcg_constant_vec_matching(d, vece, mask), d, n);
 }

 static const TCGOpcode sqshrnt_vec_list[] = {
@@ -6833,12 +6804,11 @@ TRANS_FEAT(SQRSHRNT, aa64_sve2, do_shr_narrow, a, sqrshrnt_ops)
 static void gen_uqshrnb_vec(unsigned vece, TCGv_vec d,
                             TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
+    int64_t max = MAKE_64BIT_MASK(0, halfbits);

     tcg_gen_shri_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_umin_vec(vece, d, n, t);
+    tcg_gen_umin_vec(vece, d, n, tcg_constant_vec_matching(d, vece, max));
 }

 static const TCGOpcode uqshrnb_vec_list[] = {
@@ -6863,14 +6833,14 @@ TRANS_FEAT(UQSHRNB, aa64_sve2, do_shr_narrow, a, uqshrnb_ops)
 static void gen_uqshrnt_vec(unsigned vece, TCGv_vec d,
                             TCGv_vec n, int64_t shr)
 {
-    TCGv_vec t = tcg_temp_new_vec_matching(d);
     int halfbits = 4 << vece;
+    int64_t max = MAKE_64BIT_MASK(0, halfbits);
+    TCGv_vec maxv = tcg_constant_vec_matching(d, vece, max);

     tcg_gen_shri_vec(vece, n, n, shr);
-    tcg_gen_dupi_vec(vece, t, MAKE_64BIT_MASK(0, halfbits));
-    tcg_gen_umin_vec(vece, n, n, t);
+    tcg_gen_umin_vec(vece, n, n, maxv);
     tcg_gen_shli_vec(vece, n, n, halfbits);
-    tcg_gen_bitsel_vec(vece, d, t, d, n);
+    tcg_gen_bitsel_vec(vece, d, maxv, d, n);
 }

 static const TCGOpcode uqshrnt_vec_list[] = {

@@ -471,6 +471,13 @@ void gen_neon_sqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
 void gen_neon_uqrshl(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                      uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);

+void gen_neon_sqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                     int64_t c, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_uqshli(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                     int64_t c, uint32_t opr_sz, uint32_t max_sz);
+void gen_neon_sqshlui(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
+                      int64_t c, uint32_t opr_sz, uint32_t max_sz);
+
 void gen_gvec_shadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                     uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);
 void gen_gvec_uhadd(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
@@ -514,6 +521,11 @@ void gen_sqsub_d(TCGv_i64 d, TCGv_i64 q, TCGv_i64 a, TCGv_i64 b);
 void gen_gvec_sqsub_qc(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs,
                        uint32_t rm_ofs, uint32_t opr_sz, uint32_t max_sz);

+void gen_gvec_sshr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                   int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+void gen_gvec_ushr(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
+                   int64_t shift, uint32_t opr_sz, uint32_t max_sz);
+
 void gen_gvec_ssra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                    int64_t shift, uint32_t opr_sz, uint32_t max_sz);
 void gen_gvec_usra(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
@@ -593,13 +605,13 @@ typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32,
 typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64);
 typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64);
 typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64);
-typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64);
 typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32);
 typedef void NeonGenTwoOpWidenFn(TCGv_i64, TCGv_i32, TCGv_i32);
 typedef void NeonGenOneSingleOpFn(TCGv_i32, TCGv_i32, TCGv_ptr);
 typedef void NeonGenTwoSingleOpFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr);
 typedef void NeonGenTwoDoubleOpFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr);
 typedef void NeonGenOne64OpFn(TCGv_i64, TCGv_i64);
+typedef void NeonGenOne64OpEnvFn(TCGv_i64, TCGv_env, TCGv_i64);
 typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr);
 typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
 typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);

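gen_gvec_sshr and gen_gvec_ushr back the new DO_2SH(VSHR_*) lines in translate-neon.c. Unlike the raw tcg_gen_gvec_sari/shri expanders, the Neon immediate may equal the element width, which architecturally yields all sign bits (signed) or zero (unsigned). A hedged sketch of the special-casing such wrappers need, using MIN() from osdep.h -- the real bodies live elsewhere in this series and may well differ:

static void gvec_ushr_sketch(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                             int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    if (shift == (8 << vece)) {
        /* Unsigned shift by the full element width yields zero. */
        tcg_gen_gvec_dup_imm(vece, rd_ofs, opr_sz, max_sz, 0);
    } else {
        tcg_gen_gvec_shri(vece, rd_ofs, rm_ofs, shift, opr_sz, max_sz);
    }
}

static void gvec_sshr_sketch(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
                             int64_t shift, uint32_t opr_sz, uint32_t max_sz)
{
    /*
     * A signed shift by width - 1 already replicates the sign bit
     * across the lane, matching the architectural out-of-range result.
     */
    tcg_gen_gvec_sari(vece, rd_ofs, rm_ofs, MIN(shift, (8 << vece) - 1),
                      opr_sz, max_sz);
}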
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 #
-# Functional test that boots a Linux kernel and checks the console
+# Functional test that boots a kernel and checks the console
 #
 # SPDX-FileCopyrightText: 2023-2024 Linaro Ltd.
 # SPDX-FileContributor: Philippe Mathieu-Daudé <philmd@linaro.org>
@@ -110,16 +110,17 @@ class Aarch64SbsarefMachine(QemuSystemTest):
     # This tests the whole boot chain from EFI to Userspace
     # We only boot a whole OS for the current top level CPU and GIC
     # Other test profiles should use more minimal boots
-    def boot_alpine_linux(self, cpu):
+    def boot_alpine_linux(self, cpu=None):
         self.fetch_firmware()

         iso_path = self.ASSET_ALPINE_ISO.fetch()

         self.vm.set_console()
         self.vm.add_args(
-            "-cpu", cpu,
             "-drive", f"file={iso_path},media=cdrom,format=raw",
         )
+        if cpu:
+            self.vm.add_args("-cpu", cpu)

         self.vm.launch()
         wait_for_console_pattern(self, "Welcome to Alpine Linux 3.17")
@@ -127,8 +128,8 @@ class Aarch64SbsarefMachine(QemuSystemTest):
     def test_sbsaref_alpine_linux_cortex_a57(self):
         self.boot_alpine_linux("cortex-a57")

-    def test_sbsaref_alpine_linux_neoverse_n1(self):
-        self.boot_alpine_linux("neoverse-n1")
+    def test_sbsaref_alpine_linux_default_cpu(self):
+        self.boot_alpine_linux()

     def test_sbsaref_alpine_linux_max_pauth_off(self):
         self.boot_alpine_linux("max,pauth=off")
@@ -136,50 +137,53 @@ class Aarch64SbsarefMachine(QemuSystemTest):
     def test_sbsaref_alpine_linux_max_pauth_impdef(self):
         self.boot_alpine_linux("max,pauth-impdef=on")

-    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'), 'Test might timeout')
+    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'),
+                'Test might timeout due to PAuth emulation')
     def test_sbsaref_alpine_linux_max(self):
         self.boot_alpine_linux("max")


-    ASSET_OPENBSD_ISO = Asset(
-        ('https://cdn.openbsd.org/pub/OpenBSD/7.3/arm64/miniroot73.img'),
-        '7fc2c75401d6f01fbfa25f4953f72ad7d7c18650056d30755c44b9c129b707e5')
+    ASSET_FREEBSD_ISO = Asset(
+        ('https://download.freebsd.org/releases/arm64/aarch64/ISO-IMAGES/'
+         '14.1/FreeBSD-14.1-RELEASE-arm64-aarch64-bootonly.iso'),
+        '44cdbae275ef1bb6dab1d5fbb59473d4f741e1c8ea8a80fd9e906b531d6ad461')

     # This tests the whole boot chain from EFI to Userspace
     # We only boot a whole OS for the current top level CPU and GIC
     # Other test profiles should use more minimal boots
-    def boot_openbsd73(self, cpu):
+    def boot_freebsd14(self, cpu=None):
         self.fetch_firmware()

-        img_path = self.ASSET_OPENBSD_ISO.fetch()
+        img_path = self.ASSET_FREEBSD_ISO.fetch()

         self.vm.set_console()
         self.vm.add_args(
-            "-cpu", cpu,
             "-drive", f"file={img_path},format=raw,snapshot=on",
         )
+        if cpu:
+            self.vm.add_args("-cpu", cpu)

         self.vm.launch()
-        wait_for_console_pattern(self,
-                                 "Welcome to the OpenBSD/arm64"
-                                 " 7.3 installation program.")
+        wait_for_console_pattern(self, 'Welcome to FreeBSD!')

-    def test_sbsaref_openbsd73_cortex_a57(self):
-        self.boot_openbsd73("cortex-a57")
+    def test_sbsaref_freebsd14_cortex_a57(self):
+        self.boot_freebsd14("cortex-a57")

-    def test_sbsaref_openbsd73_neoverse_n1(self):
-        self.boot_openbsd73("neoverse-n1")
+    def test_sbsaref_freebsd14_default_cpu(self):
+        self.boot_freebsd14()

-    def test_sbsaref_openbsd73_max_pauth_off(self):
-        self.boot_openbsd73("max,pauth=off")
+    def test_sbsaref_freebsd14_max_pauth_off(self):
+        self.boot_freebsd14("max,pauth=off")

-    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'), 'Test might timeout')
-    def test_sbsaref_openbsd73_max_pauth_impdef(self):
-        self.boot_openbsd73("max,pauth-impdef=on")
+    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'),
+                'Test might timeout due to PAuth emulation')
+    def test_sbsaref_freebsd14_max_pauth_impdef(self):
+        self.boot_freebsd14("max,pauth-impdef=on")

-    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'), 'Test might timeout')
-    def test_sbsaref_openbsd73_max(self):
-        self.boot_openbsd73("max")
+    @skipUnless(os.getenv('QEMU_TEST_TIMEOUT_EXPECTED'),
+                'Test might timeout due to PAuth emulation')
+    def test_sbsaref_freebsd14_max(self):
+        self.boot_freebsd14("max")


 if __name__ == '__main__':

@@ -36,6 +36,8 @@ REG32(GTPR, 0x10)
 REG32(RTOR, 0x14)
 REG32(RQR, 0x18)
 REG32(ISR, 0x1C)
+FIELD(ISR, REACK, 22, 1)
+FIELD(ISR, TEACK, 21, 1)
 FIELD(ISR, TXE, 7, 1)
 FIELD(ISR, RXNE, 5, 1)
 FIELD(ISR, ORE, 3, 1)
@@ -191,7 +193,7 @@ static void init_uart(QTestState *qts)

     /* Enable the transmitter, the receiver and the USART. */
     qtest_writel(qts, (USART1_BASE_ADDR + A_CR1),
-                 R_CR1_UE_MASK | R_CR1_RE_MASK | R_CR1_TE_MASK);
+                 cr1 | R_CR1_UE_MASK | R_CR1_RE_MASK | R_CR1_TE_MASK);
 }

 static void test_write_read(void)
@@ -298,6 +300,37 @@ static void test_send_str(void)
     qtest_quit(qts);
 }

+static void test_ack(void)
+{
+    uint32_t cr1;
+    uint32_t isr;
+    QTestState *qts = qtest_init("-M b-l475e-iot01a");
+
+    init_uart(qts);
+
+    cr1 = qtest_readl(qts, (USART1_BASE_ADDR + A_CR1));
+
+    /* Disable the transmitter and receiver. */
+    qtest_writel(qts, (USART1_BASE_ADDR + A_CR1),
+                 cr1 & ~(R_CR1_RE_MASK | R_CR1_TE_MASK));
+
+    /* Test ISR ACK for transmitter and receiver disabled */
+    isr = qtest_readl(qts, (USART1_BASE_ADDR + A_ISR));
+    g_assert_false(isr & R_ISR_TEACK_MASK);
+    g_assert_false(isr & R_ISR_REACK_MASK);
+
+    /* Enable the transmitter and receiver. */
+    qtest_writel(qts, (USART1_BASE_ADDR + A_CR1),
+                 cr1 | (R_CR1_RE_MASK | R_CR1_TE_MASK));
+
+    /* Test ISR ACK for transmitter and receiver enabled */
+    isr = qtest_readl(qts, (USART1_BASE_ADDR + A_ISR));
+    g_assert_true(isr & R_ISR_TEACK_MASK);
+    g_assert_true(isr & R_ISR_REACK_MASK);
+
+    qtest_quit(qts);
+}
+
 int main(int argc, char **argv)
 {
     int ret;
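test_ack exercises the ACK behaviour added to the stm32l4x5 USART model. A hedged sketch of the device-side rule the assertions encode (the model's code in hw/char/stm32l4x5_usart.c is not part of this hunk): TEACK and REACK in ISR simply mirror the TE and RE enables last written to CR1.

/* Sketch only, using the masks generated by the FIELD() macros above. */
static uint32_t usart_isr_ack_sketch(uint32_t cr1, uint32_t isr)
{
    isr &= ~(R_ISR_TEACK_MASK | R_ISR_REACK_MASK);
    if (cr1 & R_CR1_TE_MASK) {
        isr |= R_ISR_TEACK_MASK;    /* transmitter enable acknowledged */
    }
    if (cr1 & R_CR1_RE_MASK) {
        isr |= R_ISR_REACK_MASK;    /* receiver enable acknowledged */
    }
    return isr;
}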
@@ -310,6 +343,7 @@ int main(int argc, char **argv)
     qtest_add_func("stm32l4x5/usart/send_char", test_send_char);
     qtest_add_func("stm32l4x5/usart/receive_str", test_receive_str);
     qtest_add_func("stm32l4x5/usart/send_str", test_send_str);
+    qtest_add_func("stm32l4x5/usart/ack", test_ack);
     ret = g_test_run();

     return ret;