tcg: Add generic vector expanders

Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
2017-09-15 14:11:45 -07:00 · 2017-09-15 14:11:45 -07:00 · db432672dc
commit db432672dc
parent 474b2e8f0f
13 changed files with 2024 additions and 18 deletions
--- a/Makefile.target
+++ b/Makefile.target
@ -93,7 +93,7 @@ all: $(PROGS) stap
 # cpu emulator library
 obj-y += exec.o
 obj-y += accel/
-obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o
+obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o tcg/tcg-op-gvec.o
 obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o
 obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
 obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
--- a/accel/tcg/Makefile.objs
+++ b/accel/tcg/Makefile.objs
@ -1,6 +1,6 @@
 obj-$(CONFIG_SOFTMMU) += tcg-all.o
 obj-$(CONFIG_SOFTMMU) += cputlb.o
-obj-y += tcg-runtime.o
+obj-y += tcg-runtime.o tcg-runtime-gvec.o
 obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
 obj-y += translator.o
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@ -0,0 +1,325 @@
 /*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
 #include "qemu/osdep.h"
 #include "qemu/host-utils.h"
 #include "cpu.h"
 #include "exec/helper-proto.h"
 #include "tcg-gvec-desc.h"
 /* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
 /* With CONFIG_VECTOR16 each vecN/svecN type is a 16-byte GCC generic
 * vector of unsigned/signed N-bit elements, and DUPn(X) is a brace
 * initializer that repeats X n times (one entry per element).
 */
 #ifdef CONFIG_VECTOR16
 typedef uint8_t vec8 __attribute__((vector_size(16)));
 typedef uint16_t vec16 __attribute__((vector_size(16)));
 typedef uint32_t vec32 __attribute__((vector_size(16)));
 typedef uint64_t vec64 __attribute__((vector_size(16)));
 typedef int8_t svec8 __attribute__((vector_size(16)));
 typedef int16_t svec16 __attribute__((vector_size(16)));
 typedef int32_t svec32 __attribute__((vector_size(16)));
 typedef int64_t svec64 __attribute__((vector_size(16)));
 #define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
 #define DUP8(X)   { X, X, X, X, X, X, X, X }
 #define DUP4(X)   { X, X, X, X }
 #define DUP2(X)   { X, X }
 #else
 /* Scalar fallback: every "vector" type is a single element of the base
 * type, so sizeof(vecN) is the element size, the helper loops advance one
 * element at a time, and DUPn(X) is simply X.
 */
 typedef uint8_t vec8;
 typedef uint16_t vec16;
 typedef uint32_t vec32;
 typedef uint64_t vec64;
 typedef int8_t svec8;
 typedef int16_t svec16;
 typedef int32_t svec32;
 typedef int64_t svec64;
 #define DUP16(X)  X
 #define DUP8(X)   X
 #define DUP4(X)   X
 #define DUP2(X)   X
 #endif /* CONFIG_VECTOR16 */
 /* Zero the tail of the destination: every 64-bit word from byte offset
 * OPRSZ up to (but not including) the maximum size encoded in DESC.
 * The branch is hinted as the uncommon case.
 */
 static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
 {
    const intptr_t limit = simd_maxsz(desc);
    if (unlikely(limit > oprsz)) {
        intptr_t ofs = oprsz;
        /* limit > oprsz here, so at least one word is always cleared.  */
        do {
            *(uint64_t *)(d + ofs) = 0;
            ofs += sizeof(uint64_t);
        } while (ofs < limit);
    }
 }
 void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
 }
 /* Copy the first OPRSZ bytes of a into d, then zero d from OPRSZ up to the
 * descriptor's maximum size.
 *
 * The gvec operand rules allow d and a to overlap completely (d == a).
 * memcpy has undefined behaviour for overlapping objects, so memmove is
 * used instead; it is well defined for any overlap and gives the same
 * result when the operands are distinct.
 */
 void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    memmove(d, a, oprsz);
    clear_high(d, oprsz, desc);
 }
 /* Fill the first OPRSZ bytes of d with the 64-bit value c and zero the
 * remainder up to the descriptor's maximum size.
 */
 void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
 {
    intptr_t filled = simd_oprsz(desc);
    if (c != 0) {
        intptr_t ofs;
        for (ofs = 0; ofs < filled; ofs += sizeof(uint64_t)) {
            *(uint64_t *)(d + ofs) = c;
        }
    } else {
        /* All-zero pattern: store nothing here and let clear_high zero
           the whole destination starting from offset 0.  */
        filled = 0;
    }
    clear_high(d, filled, desc);
 }
 /* Fill the first OPRSZ bytes of d with the 32-bit value c and zero the
 * remainder up to the descriptor's maximum size.
 */
 void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
 {
    intptr_t filled = simd_oprsz(desc);
    if (c != 0) {
        intptr_t ofs;
        for (ofs = 0; ofs < filled; ofs += sizeof(uint32_t)) {
            *(uint32_t *)(d + ofs) = c;
        }
    } else {
        /* All-zero pattern: store nothing here and let clear_high zero
           the whole destination starting from offset 0.  */
        filled = 0;
    }
    clear_high(d, filled, desc);
 }
 /* Replicate the low 16 bits of c into both halves of a 32-bit word and
 * defer to the 32-bit duplicate helper.
 */
 void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
 {
    uint32_t half = c & 0xffff;
    /* half | (half << 16) equals 0x00010001 * half: the halves never overlap. */
    uint32_t word = half | (half << 16);
    HELPER(gvec_dup32)(d, desc, word);
 }
 /* Replicate the low 8 bits of c into all four bytes of a 32-bit word and
 * defer to the 32-bit duplicate helper.
 */
 void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
 {
    uint32_t byte = c & 0xff;
    /* Doubling twice by shift-and-or equals 0x01010101 * byte. */
    uint32_t pair = byte | (byte << 8);
    uint32_t word = pair | (pair << 16);
    HELPER(gvec_dup32)(d, desc, word);
 }
 void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
 void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
 {
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;
    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
 }
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@ -134,3 +134,32 @@ GEN_ATOMIC_HELPERS(xor_fetch)
 GEN_ATOMIC_HELPERS(xchg)
 #undef GEN_ATOMIC_HELPERS
 DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_dup8, TCG_CALL_NO_RWG, void, ptr, i32, i32)
 DEF_HELPER_FLAGS_3(gvec_dup16, TCG_CALL_NO_RWG, void, ptr, i32, i32)
 DEF_HELPER_FLAGS_3(gvec_dup32, TCG_CALL_NO_RWG, void, ptr, i32, i32)
 DEF_HELPER_FLAGS_3(gvec_dup64, TCG_CALL_NO_RWG, void, ptr, i32, i64)
 DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
--- a/48
+++ b/48
@ -5000,6 +5000,50 @@ if compile_prog "" "" ; then
  atomic64=yes
 fi
 ########################################
 # See if 16-byte vector operations are supported.
 # Even without a vector unit the compiler may expand these.
 # There is a bug in old GCC for PPC that crashes here.
 # Unfortunately it's the system compiler for Centos 7.
 cat > $TMPC << EOF
 typedef unsigned char U1 __attribute__((vector_size(16)));
 typedef unsigned short U2 __attribute__((vector_size(16)));
 typedef unsigned int U4 __attribute__((vector_size(16)));
 typedef unsigned long long U8 __attribute__((vector_size(16)));
 typedef signed char S1 __attribute__((vector_size(16)));
 typedef signed short S2 __attribute__((vector_size(16)));
 typedef signed int S4 __attribute__((vector_size(16)));
 typedef signed long long S8 __attribute__((vector_size(16)));
 static U1 a1, b1;
 static U2 a2, b2;
 static U4 a4, b4;
 static U8 a8, b8;
 static S1 c1;
 static S2 c2;
 static S4 c4;
 static S8 c8;
 static int i;
 int main(void)
 {
  a1 += b1; a2 += b2; a4 += b4; a8 += b8;
  a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8;
  a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8;
  a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8;
  a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8;
  a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8;
  a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i;
  a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i;
  c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i;
  return 0;
 }
 EOF
 vector16=no
 if compile_prog "" "" ; then
  vector16=yes
 fi
 ########################################
 # check if getauxval is available.
@ -6329,6 +6373,10 @@ if test "$atomic64" = "yes" ; then
  echo "CONFIG_ATOMIC64=y" >> $config_host_mak
 fi
 if test "$vector16" = "yes" ; then
  echo "CONFIG_VECTOR16=y" >> $config_host_mak
 fi
 if test "$getauxval" = "yes" ; then
  echo "CONFIG_GETAUXVAL=y" >> $config_host_mak
 fi
--- a/tcg/tcg-gvec-desc.h
+++ b/tcg/tcg-gvec-desc.h
@ -0,0 +1,49 @@
 /*
 * Generic vector operation descriptor
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
 /* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
 #define SIMD_OPRSZ_SHIFT   0
 #define SIMD_OPRSZ_BITS    5
 #define SIMD_MAXSZ_SHIFT   (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
 #define SIMD_MAXSZ_BITS    5
 #define SIMD_DATA_SHIFT    (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
 #define SIMD_DATA_BITS     (32 - SIMD_DATA_SHIFT)
 /* Create a descriptor from components.  */
 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
 /* Extract the operation size from a descriptor.  */
 static inline intptr_t simd_oprsz(uint32_t desc)
 {
    /* The field is biased: the stored value v decodes to (v + 1) * 8 bytes,
       so a SIMD_OPRSZ_BITS-wide field of 5 bits covers 8..256 bytes.  */
    uint32_t field = extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS);
    return (field + 1) * 8;
 }
 /* Extract the max vector size from a descriptor.  */
 static inline intptr_t simd_maxsz(uint32_t desc)
 {
    /* Same biased encoding as the operation size: the stored value v
       decodes to (v + 1) * 8 bytes.  */
    uint32_t field = extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS);
    return (field + 1) * 8;
 }
 /* Extract the operation-specific data from a descriptor.  */
 static inline int32_t simd_data(uint32_t desc)
 {
    /* The data field occupies every bit from SIMD_DATA_SHIFT up to bit 31;
       it is read back as a signed value via sextract32.  */
    int32_t payload = sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
    return payload;
 }
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@ -0,0 +1,198 @@
 /*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
 /*
 * "Generic" vectors.  All operands are given as offsets from ENV,
 * and therefore cannot also be allocated via tcg_global_mem_new_*.
 * OPRSZ is the byte size of the vector upon which the operation is performed.
 * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
 *
 * All sizes must be 8 or any multiple of 16.
 * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
 * Operands may completely, but not partially, overlap.
 */
 /* Expand a call to a gvec-style helper, with pointers to two vector
   operands, and a descriptor (see tcg-gvec-desc.h).  */
 typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn);
 /* Similarly, passing an extra pointer (e.g. env or float_status).  */
 typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn);
 /* Similarly, with three vector operands.  */
 typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn);
 /* Similarly, with four vector operands.  */
 typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                               TCGv_ptr, TCGv_i32);
 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn);
 /* Similarly, with five vector operands.  */
 typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
                               TCGv_ptr, TCGv_i32);
 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);
 typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                                   TCGv_ptr, TCGv_i32);
 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn);
 typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn);
 /* Expand a gvec operation.  Either inline or out-of-line depending on
   the actual vector size and the operations supported by the host.  */
 typedef struct {
    /* Describes how to expand one gvec operation whose callbacks take two
       operands; passed by pointer to tcg_gen_gvec_2.  */
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.
       NOTE(review): the leading unsigned argument is presumably the
       element size (vece) -- confirm against tcg-op-gvec.c.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_2 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.
       NOTE(review): presumably a log2 (MO_*) encoding -- confirm.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
 } GVecGen2;
 typedef struct {
    /* Describes how to expand one gvec operation whose callbacks take
       three operands; passed by pointer to tcg_gen_gvec_3.  */
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.
       NOTE(review): the leading unsigned argument is presumably the
       element size (vece) -- confirm against tcg-op-gvec.c.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_3 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.
       NOTE(review): presumably a log2 (MO_*) encoding -- confirm.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
    /* Load dest as a 3rd source operand.  */
    bool load_dest;
 } GVecGen3;
 typedef struct {
    /* Describes how to expand one gvec operation whose callbacks take four
       operands; passed by pointer to tcg_gen_gvec_4.  */
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.
       NOTE(review): the leading unsigned argument is presumably the
       element size (vece) -- confirm against tcg-op-gvec.c.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_4 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.
       NOTE(review): presumably a log2 (MO_*) encoding -- confirm.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
 } GVecGen4;
 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);
 /* Expand a specific vector operation.  */
 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t s, uint32_t m);
 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
                          uint32_t m, TCGv_i32);
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
                          uint32_t m, TCGv_i64);
 void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x);
 void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x);
 void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, uint32_t m, uint32_t x);
 void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x);
 /*
 * 64-bit vector operations.  Use these when the register has been allocated
 * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
 * OPRSZ = MAXSZ = 8.
 */
 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@ -73,7 +73,8 @@ static void vec_gen_op2(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a)
    TCGTemp *at = tcgv_vec_temp(a);
    TCGType type = rt->base_type;
-    tcg_debug_assert(at->base_type == type);
+    /* Must have enough inputs for the output.  */
    tcg_debug_assert(at->base_type >= type);
    vec_gen_2(opc, type, vece, temp_arg(rt), temp_arg(at));
 }
@ -85,8 +86,9 @@ static void vec_gen_op3(TCGOpcode opc, unsigned vece,
    TCGTemp *bt = tcgv_vec_temp(b);
    TCGType type = rt->base_type;
-    tcg_debug_assert(at->base_type == type);
+    /* Must have enough inputs for the output.  */
-    tcg_debug_assert(bt->base_type == type);
+    tcg_debug_assert(at->base_type >= type);
    tcg_debug_assert(bt->base_type >= type);
    vec_gen_3(opc, type, vece, temp_arg(rt), temp_arg(at), temp_arg(bt));
 }
@ -99,7 +101,7 @@ void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
 #define MO_REG  (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
-static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
+static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
 {
    TCGTemp *rt = tcgv_vec_temp(r);
    vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);
@ -108,14 +110,14 @@ static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
 TCGv_vec tcg_const_zeros_vec(TCGType type)
 {
    TCGv_vec ret = tcg_temp_new_vec(type);
-    tcg_gen_dupi_vec(ret, MO_REG, 0);
+    do_dupi_vec(ret, MO_REG, 0);
    return ret;
 }
 TCGv_vec tcg_const_ones_vec(TCGType type)
 {
    TCGv_vec ret = tcg_temp_new_vec(type);
-    tcg_gen_dupi_vec(ret, MO_REG, -1);
+    do_dupi_vec(ret, MO_REG, -1);
    return ret;
 }
@ -134,9 +136,9 @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
 void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 {
    if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) {
-        tcg_gen_dupi_vec(r, MO_32, a);
+        do_dupi_vec(r, MO_32, a);
    } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) {
-        tcg_gen_dupi_vec(r, MO_64, a);
+        do_dupi_vec(r, MO_64, a);
    } else {
        TCGv_i64 c = tcg_const_i64(a);
        tcg_gen_dup_i64_vec(MO_64, r, c);
@ -146,17 +148,22 @@ void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffffffffu) * a);
+    do_dupi_vec(r, MO_REG, dup_const(MO_32, a));
 }
 void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffff) * (a & 0xffff));
+    do_dupi_vec(r, MO_REG, dup_const(MO_16, a));
 }
 void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xff) * (a & 0xff));
+    do_dupi_vec(r, MO_REG, dup_const(MO_8, a));
 }
 void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
 {
    do_dupi_vec(r, MO_REG, dup_const(vece, a));
 }
 void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
@ -167,14 +174,14 @@ void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
    if (TCG_TARGET_REG_BITS == 64) {
        TCGArg ai = tcgv_i64_arg(a);
-        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
+        vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
    } else if (vece == MO_64) {
        TCGArg al = tcgv_i32_arg(TCGV_LOW(a));
        TCGArg ah = tcgv_i32_arg(TCGV_HIGH(a));
        vec_gen_3(INDEX_op_dup2_vec, type, MO_64, ri, al, ah);
    } else {
        TCGArg ai = tcgv_i32_arg(TCGV_LOW(a));
-        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
+        vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
    }
 }
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@ -914,6 +914,7 @@ void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
 void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
 void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@ -228,6 +228,12 @@ DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
 DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
 DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
 DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
 #if TCG_TARGET_MAYBE_vec
 #include "tcg-target.opc.h"
 #endif
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
 #undef IMPL
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@ -1403,10 +1403,10 @@ bool tcg_op_supported(TCGOpcode op)
    case INDEX_op_orc_vec:
        return have_vec && TCG_TARGET_HAS_orc_vec;
-    case NB_OPS:
+    default:
-        break;
+        tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
        return true;
    }
    g_assert_not_reached();
 }
 /* Note: we convert the 64 bit args to 32 bit and do some alignment
@ -3733,3 +3733,10 @@ void tcg_register_jit(void *buf, size_t buf_size)
 {
 }
 #endif /* ELF_HOST_MACHINE */
 #if !TCG_TARGET_MAYBE_vec
 /* Stub compiled only when TCG_TARGET_MAYBE_vec is false.  It provides the
 * symbol but asserts if it is ever called.
 * NOTE(review): presumably unreachable because no vector op is reported
 * as needing expansion in this configuration -- confirm against callers.
 */
 void tcg_expand_vec_op(TCGOpcode o, TCGType t, unsigned e, TCGArg a0, ...)
 {
    g_assert_not_reached();
 }
 #endif
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@ -1207,6 +1207,33 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
 void tcg_register_jit(void *buf, size_t buf_size);
 #if TCG_TARGET_MAYBE_vec
 /* Return zero if the tuple (opc, type, vece) is unsupportable;
   return > 0 if it is directly supportable;
   return < 0 if we must call tcg_expand_vec_op.  */
 int tcg_can_emit_vec_op(TCGOpcode, TCGType, unsigned);
 #else
 /* Fallback when TCG_TARGET_MAYBE_vec is false: every (opc, type, vece)
 * tuple is reported as unsupportable by returning zero.
 */
 static inline int tcg_can_emit_vec_op(TCGOpcode o, TCGType t, unsigned ve)
 {
    return 0;
 }
 #endif
 /* Expand the tuple (opc, type, vece) on the given arguments.  */
 void tcg_expand_vec_op(TCGOpcode, TCGType, unsigned, TCGArg, ...);
 /* Replicate a constant C according to the log2 of the element size.  */
 uint64_t dup_const(unsigned vece, uint64_t c);
 /* Macro wrapper around the dup_const function.  When VECE is a
 * compile-time constant equal to MO_8, MO_16 or MO_32, the replication is
 * done inline: C is truncated to the element width and multiplied by a
 * constant that has a 1 in the low bit of every element of a 64-bit word.
 * Any other VECE, constant or not, calls the out-of-line function.  The
 * inner dup_const is not re-expanded by the preprocessor (a macro name is
 * not expanded inside its own expansion), so it refers to the function.
 */
 #define dup_const(VECE, C)                                         \
    (__builtin_constant_p(VECE)                                    \
     ? (  (VECE) == MO_8  ? 0x0101010101010101ull * (uint8_t)(C)   \
        : (VECE) == MO_16 ? 0x0001000100010001ull * (uint16_t)(C)  \
        : (VECE) == MO_32 ? 0x0000000100000001ull * (uint32_t)(C)  \
        : dup_const(VECE, C))                                      \
     : dup_const(VECE, C))
 /*
 * Memory helpers that will be used by TCG generated code.
 */