tcg: Add generic vector expanders

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
commit db432672dc (parent 474b2e8f0f)
Richard Henderson, 2017-09-15 14:11:45 -07:00
13 changed files with 2024 additions and 18 deletions

Makefile.target
@@ -93,7 +93,7 @@ all: $(PROGS) stap
 # cpu emulator library
 obj-y += exec.o
 obj-y += accel/
-obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o
+obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o tcg/tcg-op-gvec.o
 obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o
 obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
 obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o

accel/tcg/Makefile.objs
@@ -1,6 +1,6 @@
 obj-$(CONFIG_SOFTMMU) += tcg-all.o
 obj-$(CONFIG_SOFTMMU) += cputlb.o
-obj-y += tcg-runtime.o
+obj-y += tcg-runtime.o tcg-runtime-gvec.o
 obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
 obj-y += translator.o

accel/tcg/tcg-runtime-gvec.c (new file)
@@ -0,0 +1,325 @@
/*
 * Generic vectorized operation runtime
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg-gvec-desc.h"


/* Virtually all hosts support 16-byte vectors.  Those that don't can emulate
 * them via GCC's generic vector extension.  This turns out to be simpler and
 * more reliable than getting the compiler to autovectorize.
 *
 * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
 * are multiples of 16.
 *
 * When the compiler does not support all of the operations we require, the
 * loops are written so that we can always fall back on the base types.
 */
#ifdef CONFIG_VECTOR16
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

#define DUP16(X)  { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X)   { X, X, X, X, X, X, X, X }
#define DUP4(X)   { X, X, X, X }
#define DUP2(X)   { X, X }
#else
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

#define DUP16(X)  X
#define DUP8(X)   X
#define DUP4(X)   X
#define DUP2(X)   X
#endif /* CONFIG_VECTOR16 */

static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = simd_maxsz(desc);
    intptr_t i;

    if (unlikely(maxsz > oprsz)) {
        for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = 0;
        }
    }
}

void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = -*(vec8 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec16)) {
        *(vec16 *)(d + i) = -*(vec16 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec32)) {
        *(vec32 *)(d + i) = -*(vec32 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = -*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);

    memcpy(d, a, oprsz);
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        /* A zero constant needs no stores: with oprsz forced to 0,
           clear_high() zeros the entire destination.  */
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    if (c == 0) {
        oprsz = 0;
    } else {
        for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
            *(uint32_t *)(d + i) = c;
        }
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff));
}

void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
{
    HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff));
}

void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = ~*(vec64 *)(a + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}

void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec64)) {
        *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i);
    }
    clear_high(d, oprsz, desc);
}
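The DUP* macros are not used by the helpers in this file, but they let a helper taking a scalar operand splat it once outside the loop under either configuration. A hypothetical sketch (gvec_xori8 is illustrative only, not part of this commit):

/* Hypothetical example: XOR each byte with an immediate.  With
 * CONFIG_VECTOR16, DUP16 is a 16-lane initializer and 'm' is a whole
 * vector of copies of the constant; without it, vec8 is plain uint8_t,
 * DUP16(X) collapses to X, and the loop advances one element at a time. */
void helper_gvec_xori8(void *d, void *a, uint64_t c, uint32_t desc)
{
    intptr_t oprsz = simd_oprsz(desc);
    vec8 m = DUP16((uint8_t)c);
    intptr_t i;

    for (i = 0; i < oprsz; i += sizeof(vec8)) {
        *(vec8 *)(d + i) = *(vec8 *)(a + i) ^ m;
    }
    clear_high(d, oprsz, desc);
}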

accel/tcg/tcg-runtime.h
@@ -134,3 +134,32 @@ GEN_ATOMIC_HELPERS(xor_fetch)
 GEN_ATOMIC_HELPERS(xchg)
 
 #undef GEN_ATOMIC_HELPERS
+
+DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_dup8, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup16, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup32, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup64, TCG_CALL_NO_RWG, void, ptr, i32, i64)
+
+DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
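Roughly speaking (the exact expansion lives in QEMU's helper-head.h machinery and is more involved), each DEF_HELPER_FLAGS_* line above produces a prototype along these lines, which tcg-runtime-gvec.c then defines; TCG_CALL_NO_RWG tells the optimizer the helper neither reads nor writes TCG globals, so cached values need not be spilled around the call.

/* Approximate expansion of the gvec_add8 line above (simplified sketch):
 * ptr maps to void *, i32 to uint32_t, and HELPER(x) to helper_x.  */
void helper_gvec_add8(void *d, void *a, void *b, uint32_t desc);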

configure
@@ -5000,6 +5000,50 @@ if compile_prog "" "" ; then
   atomic64=yes
 fi
 
+########################################
+# See if 16-byte vector operations are supported.
+# Even without a vector unit the compiler may expand these.
+# There is a bug in old GCC for PPC that crashes here.
+# Unfortunately it's the system compiler for Centos 7.
+
+cat > $TMPC << EOF
+typedef unsigned char U1 __attribute__((vector_size(16)));
+typedef unsigned short U2 __attribute__((vector_size(16)));
+typedef unsigned int U4 __attribute__((vector_size(16)));
+typedef unsigned long long U8 __attribute__((vector_size(16)));
+typedef signed char S1 __attribute__((vector_size(16)));
+typedef signed short S2 __attribute__((vector_size(16)));
+typedef signed int S4 __attribute__((vector_size(16)));
+typedef signed long long S8 __attribute__((vector_size(16)));
+static U1 a1, b1;
+static U2 a2, b2;
+static U4 a4, b4;
+static U8 a8, b8;
+static S1 c1;
+static S2 c2;
+static S4 c4;
+static S8 c8;
+static int i;
+int main(void)
+{
+  a1 += b1; a2 += b2; a4 += b4; a8 += b8;
+  a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8;
+  a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8;
+  a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8;
+  a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8;
+  a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8;
+  a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i;
+  a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i;
+  c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i;
+  return 0;
+}
+EOF
+
+vector16=no
+if compile_prog "" "" ; then
+  vector16=yes
+fi
+
 ########################################
 # check if getauxval is available.
@@ -6329,6 +6373,10 @@ if test "$atomic64" = "yes" ; then
   echo "CONFIG_ATOMIC64=y" >> $config_host_mak
 fi
 
+if test "$vector16" = "yes" ; then
+  echo "CONFIG_VECTOR16=y" >> $config_host_mak
+fi
+
 if test "$getauxval" = "yes" ; then
   echo "CONFIG_GETAUXVAL=y" >> $config_host_mak
 fi

tcg/tcg-gvec-desc.h (new file)
@@ -0,0 +1,49 @@
/*
 * Generic vector operation descriptor
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
#define SIMD_OPRSZ_SHIFT   0
#define SIMD_OPRSZ_BITS    5

#define SIMD_MAXSZ_SHIFT   (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
#define SIMD_MAXSZ_BITS    5

#define SIMD_DATA_SHIFT    (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
#define SIMD_DATA_BITS     (32 - SIMD_DATA_SHIFT)

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);

/* Extract the operation size from a descriptor.  */
static inline intptr_t simd_oprsz(uint32_t desc)
{
    return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
}

/* Extract the max vector size from a descriptor.  */
static inline intptr_t simd_maxsz(uint32_t desc)
{
    return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
}

/* Extract the operation-specific data from a descriptor.  */
static inline int32_t simd_data(uint32_t desc)
{
    return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
}
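simd_desc() itself is defined in tcg/tcg-op-gvec.c, whose diff is suppressed below. A sketch of the packing side, written here only to mirror the extractors above: sizes are stored as (bytes / 8) - 1, so five bits reach (31 + 1) * 8 = 256 bytes, the SVE maximum mentioned in the comment.

/* Illustrative sketch, not the actual implementation.  deposit32() is
 * QEMU's bitfield-insert helper from qemu/bitops.h. */
static uint32_t simd_desc_sketch(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz / 8 <= 32);
    assert(maxsz % 8 == 0 && maxsz / 8 <= 32);
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz / 8 - 1);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz / 8 - 1);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
    return desc;
}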

tcg/tcg-op-gvec.c (new file, 1309 lines): diff suppressed because it is too large.

tcg/tcg-op-gvec.h (new file)
@@ -0,0 +1,198 @@
/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * "Generic" vectors.  All operands are given as offsets from ENV,
 * and therefore cannot also be allocated via tcg_global_mem_new_*.
 * OPRSZ is the byte size of the vector upon which the operation is performed.
 * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
 *
 * All sizes must be 8 or any multiple of 16.
 * When OPRSZ is 8, the alignment may be 8; otherwise it must be 16.
 * Operands may completely, but not partially, overlap.
 */

/* Expand a call to a gvec-style helper, with pointers to two vector
   operands, and a descriptor (see tcg-gvec-desc.h).  */
typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn);

/* Similarly, passing an extra pointer (e.g. env or float_status).  */
typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn);

/* Similarly, with three vector operands.  */
typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn);

/* Similarly, with four vector operands.  */
typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                               TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn);

/* Similarly, with five vector operands.  */
typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
                               TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);

typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                                   TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn);

typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
                                   TCGv_ptr, TCGv_ptr, TCGv_i32);
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn);

/* Expand a gvec operation.  Either inline or out-of-line depending on
   the actual vector size and the operations supported by the host.  */
typedef struct {
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_2 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
} GVecGen2;

typedef struct {
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_3 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
    /* Load dest as a 3rd source operand.  */
    bool load_dest;
} GVecGen3;

typedef struct {
    /* Expand inline as a 64-bit or 32-bit integer.
       Only one of these will be non-NULL.  */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
    /* Expand inline with a host vector type.  */
    void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Expand out-of-line helper w/descriptor.  */
    gen_helper_gvec_4 *fno;
    /* The opcode, if any, to which this corresponds.  */
    TCGOpcode opc;
    /* The data argument to the out-of-line helper.  */
    int32_t data;
    /* The vector element size, if applicable.  */
    uint8_t vece;
    /* Prefer i64 to v64.  */
    bool prefer_i64;
} GVecGen4;

void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);

/* Expand a specific vector operation.  */

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz);

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz);

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t s, uint32_t m);
void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
                          uint32_t m, TCGv_i32);
void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
                          uint32_t m, TCGv_i64);

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x);
void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x);
void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, uint32_t m, uint32_t x);
void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x);

/*
 * 64-bit vector operations.  Use these when the register has been allocated
 * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
 * OPRSZ = MAXSZ = 8.
 */

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
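The expected use of these descriptor structs is a per-element-size table in the suppressed tcg/tcg-op-gvec.c. A sketch of how the MO_8 addition entry would plausibly be wired together (the names all come from declarations in this patch, but the table itself is illustrative):

/* Illustrative only; the real tables live in tcg/tcg-op-gvec.c. */
static const GVecGen3 g_add8 = {
    .fni8 = tcg_gen_vec_add8_i64,   /* inline, 8 lanes packed in one i64 */
    .fniv = tcg_gen_add_vec,        /* inline, host vector registers */
    .fno = gen_helper_gvec_add8,    /* out-of-line helper fallback */
    .opc = INDEX_op_add_vec,        /* host opcode that must be supported */
    .vece = MO_8,
};

static void gen_add8_example(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                             uint32_t oprsz, uint32_t maxsz)
{
    /* tcg_gen_gvec_3 picks fniv, fni8, or fno based on host support. */
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g_add8);
}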

tcg/tcg-op-vec.c
@@ -73,7 +73,8 @@ static void vec_gen_op2(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a)
     TCGTemp *at = tcgv_vec_temp(a);
     TCGType type = rt->base_type;
 
-    tcg_debug_assert(at->base_type == type);
+    /* Must have enough inputs for the output.  */
+    tcg_debug_assert(at->base_type >= type);
     vec_gen_2(opc, type, vece, temp_arg(rt), temp_arg(at));
 }
 
@@ -85,8 +86,9 @@ static void vec_gen_op3(TCGOpcode opc, unsigned vece,
     TCGTemp *bt = tcgv_vec_temp(b);
     TCGType type = rt->base_type;
 
-    tcg_debug_assert(at->base_type == type);
-    tcg_debug_assert(bt->base_type == type);
+    /* Must have enough inputs for the output.  */
+    tcg_debug_assert(at->base_type >= type);
+    tcg_debug_assert(bt->base_type >= type);
     vec_gen_3(opc, type, vece, temp_arg(rt), temp_arg(at), temp_arg(bt));
 }
 
@@ -99,7 +101,7 @@ void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
 
 #define MO_REG  (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
 
-static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
+static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
 {
     TCGTemp *rt = tcgv_vec_temp(r);
     vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);
@@ -108,14 +110,14 @@ static void tcg_gen_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
 TCGv_vec tcg_const_zeros_vec(TCGType type)
 {
     TCGv_vec ret = tcg_temp_new_vec(type);
-    tcg_gen_dupi_vec(ret, MO_REG, 0);
+    do_dupi_vec(ret, MO_REG, 0);
     return ret;
 }
 
 TCGv_vec tcg_const_ones_vec(TCGType type)
 {
     TCGv_vec ret = tcg_temp_new_vec(type);
-    tcg_gen_dupi_vec(ret, MO_REG, -1);
+    do_dupi_vec(ret, MO_REG, -1);
     return ret;
 }
 
@@ -134,9 +136,9 @@ TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
 void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 {
     if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) {
-        tcg_gen_dupi_vec(r, MO_32, a);
+        do_dupi_vec(r, MO_32, a);
     } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) {
-        tcg_gen_dupi_vec(r, MO_64, a);
+        do_dupi_vec(r, MO_64, a);
     } else {
         TCGv_i64 c = tcg_const_i64(a);
         tcg_gen_dup_i64_vec(MO_64, r, c);
@@ -146,17 +148,22 @@ void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
 
 void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffffffffu) * a);
+    do_dupi_vec(r, MO_REG, dup_const(MO_32, a));
 }
 
 void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xffff) * (a & 0xffff));
+    do_dupi_vec(r, MO_REG, dup_const(MO_16, a));
 }
 
 void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
 {
-    tcg_gen_dupi_vec(r, MO_REG, ((TCGArg)-1 / 0xff) * (a & 0xff));
+    do_dupi_vec(r, MO_REG, dup_const(MO_8, a));
+}
+
+void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
+{
+    do_dupi_vec(r, MO_REG, dup_const(vece, a));
 }
 
 void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
@@ -167,14 +174,14 @@ void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
     if (TCG_TARGET_REG_BITS == 64) {
         TCGArg ai = tcgv_i64_arg(a);
-        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
+        vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
     } else if (vece == MO_64) {
         TCGArg al = tcgv_i32_arg(TCGV_LOW(a));
         TCGArg ah = tcgv_i32_arg(TCGV_HIGH(a));
         vec_gen_3(INDEX_op_dup2_vec, type, MO_64, ri, al, ah);
     } else {
         TCGArg ai = tcgv_i32_arg(TCGV_LOW(a));
-        vec_gen_2(INDEX_op_dup_vec, type, MO_64, ri, ai);
+        vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
     }
 }
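The replaced open-coded replication and the new dup_const agree on a 64-bit host, since (TCGArg)-1 / 0xffff is exactly 0x0001000100010001. A standalone check (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t a = 0x1234;
    /* Old form: divide all-ones by the element mask, then multiply. */
    uint64_t old_way = (UINT64_MAX / 0xffff) * (a & 0xffff);
    /* New form: what dup_const(MO_16, a) computes. */
    uint64_t new_way = 0x0001000100010001ull * (uint16_t)a;

    assert(old_way == new_way);
    assert(new_way == 0x1234123412341234ull);
    return 0;
}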

tcg/tcg-op.h
@@ -914,6 +914,7 @@ void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
 void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
+void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
 void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
 void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);

tcg/tcg-opc.h
@@ -228,6 +228,12 @@ DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
 DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
 DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
 
+DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
+
+#if TCG_TARGET_MAYBE_vec
+#include "tcg-target.opc.h"
+#endif
+
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
 #undef IMPL

tcg/tcg.c
@@ -1403,10 +1403,10 @@ bool tcg_op_supported(TCGOpcode op)
     case INDEX_op_orc_vec:
         return have_vec && TCG_TARGET_HAS_orc_vec;
 
-    case NB_OPS:
-        break;
+    default:
+        tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
+        return true;
     }
-    g_assert_not_reached();
 }
 
 /* Note: we convert the 64 bit args to 32 bit and do some alignment
@@ -3733,3 +3733,10 @@ void tcg_register_jit(void *buf, size_t buf_size)
 {
 }
 #endif /* ELF_HOST_MACHINE */
+
+#if !TCG_TARGET_MAYBE_vec
+void tcg_expand_vec_op(TCGOpcode o, TCGType t, unsigned e, TCGArg a0, ...)
+{
+    g_assert_not_reached();
+}
+#endif

tcg/tcg.h
@@ -1207,6 +1207,33 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
 
 void tcg_register_jit(void *buf, size_t buf_size);
 
+#if TCG_TARGET_MAYBE_vec
+/* Return zero if the tuple (opc, type, vece) is unsupportable;
+   return > 0 if it is directly supportable;
+   return < 0 if we must call tcg_expand_vec_op.  */
+int tcg_can_emit_vec_op(TCGOpcode, TCGType, unsigned);
+#else
+static inline int tcg_can_emit_vec_op(TCGOpcode o, TCGType t, unsigned ve)
+{
+    return 0;
+}
+#endif
+
+/* Expand the tuple (opc, type, vece) on the given arguments.  */
+void tcg_expand_vec_op(TCGOpcode, TCGType, unsigned, TCGArg, ...);
+
+/* Replicate a constant C according to the log2 of the element size.  */
+uint64_t dup_const(unsigned vece, uint64_t c);
+
+#define dup_const(VECE, C)                                         \
+    (__builtin_constant_p(VECE)                                    \
+     ? (  (VECE) == MO_8  ? 0x0101010101010101ull * (uint8_t)(C)   \
+        : (VECE) == MO_16 ? 0x0001000100010001ull * (uint16_t)(C)  \
+        : (VECE) == MO_32 ? 0x0000000100000001ull * (uint32_t)(C)  \
+        : dup_const(VECE, C))                                      \
+     : dup_const(VECE, C))
+
 /*
  * Memory helpers that will be used by TCG generated code.
  */
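Note that the macro deliberately reuses the function's name: inside its own expansion, dup_const(VECE, C) is not expanded again (the preprocessor never recurses into the macro currently being expanded), so a non-constant VECE falls through to the out-of-line function, which this patch defines in the suppressed tcg/tcg-op-gvec.c. A usage sketch (illustrative only):

/* With a literal element size the macro folds to a compile-time
 * constant; with a runtime size it becomes a function call. */
static uint64_t dup_const_example(unsigned runtime_vece)
{
    uint64_t x = dup_const(MO_16, 0xab);        /* 0x00ab00ab00ab00ab */
    uint64_t y = dup_const(runtime_vece, 0xab); /* calls dup_const()  */
    return x ^ y;
}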