target/i386: add core of new i386 decoder
The new decoder is based on three principles:
- use mostly table-driven decoding, using tables derived as much as possible
from the Intel manual. Centralizing the decoding of the operands makes it
more homogeneous, for example all immediates are signed. All modrm
handling is in one function, and can be shared between SSE and ALU
instructions (including XMM<->GPR instructions). The SSE/AVX decoder
will also not have duplicated code between the 0F, 0F38 and 0F3A tables.
- keep the code as "non-branchy" as possible. Generally, the code for
the new decoder is more verbose, but the control flow is simpler.
Conditionals are not nested and have small bodies. All instruction
groups are resolved even before operands are decoded, and code
generation is separated as much as possible within small functions
that only handle one instruction each.
- keep address generation and (for ALU operands) memory loads and writeback
as much in common code as possible. All ALU operations for example
are implemented as T0=f(T0,T1). For non-ALU instructions,
read-modify-write memory operations are rare, but registers do not
have TCGv equivalents: therefore, the common logic sets up pointer
temporaries with the operands, while load and writeback are handled
by gvec or by helpers.
These principles make future code review and extensibility simpler, at
the cost of having a relatively large amount of code in the form of this
patch. Even EVEX should not be _too_ hard to implement (it's just a crazy
large amount of possibilities).
This patch introduces the main decoder flow, and integrates the old
decoder with the new one. The old decoder takes care of parsing
prefixes and then optionally drops to the new one. The changes to the
old decoder are minimal and allow it to be replaced incrementally with
the new one.
There is a debugging mechanism through a "LIMIT" environment variable.
In user-mode emulation, the variable is the number of instructions
decoded by the new decoder before permanently switching to the old one.
In system emulation, the variable is the highest opcode that is decoded
by the new decoder (this is less friendly, but it's the best that can
be done without requiring deterministic execution).
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-08-23 12:20:55 +03:00
|
|
|
/*
|
|
|
|
* New-style TCG opcode generator for i386 instructions
|
|
|
|
*
|
|
|
|
* Copyright (c) 2022 Red Hat, Inc.
|
|
|
|
*
|
|
|
|
* Author: Paolo Bonzini <pbonzini@redhat.com>
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2022-09-02 17:30:15 +03:00
|
|
|
/* Byte offset, within CPUX86State, of entry `idx` of the xmm_regs[] array. */
#define ZMM_OFFSET(idx) offsetof(CPUX86State, xmm_regs[idx])
|
|
|
|
|
|
|
|
typedef void (*SSEFunc_i_ep)(TCGv_i32 val, TCGv_ptr env, TCGv_ptr reg);
|
|
|
|
typedef void (*SSEFunc_l_ep)(TCGv_i64 val, TCGv_ptr env, TCGv_ptr reg);
|
2022-09-06 00:27:53 +03:00
|
|
|
typedef void (*SSEFunc_0_epp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b);
|
2022-09-02 17:30:15 +03:00
|
|
|
typedef void (*SSEFunc_0_eppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
|
|
|
|
TCGv_ptr reg_c);
|
|
|
|
typedef void (*SSEFunc_0_epppp)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
|
|
|
|
TCGv_ptr reg_c, TCGv_ptr reg_d);
|
|
|
|
typedef void (*SSEFunc_0_eppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
|
|
|
|
TCGv_i32 val);
|
|
|
|
typedef void (*SSEFunc_0_epppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
|
|
|
|
TCGv_ptr reg_c, TCGv_i32 val);
|
|
|
|
typedef void (*SSEFunc_0_ppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_i32 val);
|
|
|
|
typedef void (*SSEFunc_0_pppi)(TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c,
|
|
|
|
TCGv_i32 val);
|
|
|
|
typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b,
|
|
|
|
TCGv val);
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
/*
 * Helper-generator types with longer argument lists.  The trailing letters
 * list the arguments in order: e = TCGv_ptr env, p = TCGv_ptr register,
 * t = TCGv, i = TCGv_i32.
 */
typedef void (*SSEFunc_0_epppti)(TCGv_ptr env,
                                 TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c,
                                 TCGv a0, TCGv_i32 scale);
typedef void (*SSEFunc_0_eppppi)(TCGv_ptr env,
                                 TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c,
                                 TCGv_ptr reg_d,
                                 TCGv_i32 flags);
typedef void (*SSEFunc_0_eppppii)(TCGv_ptr env,
                                  TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c,
                                  TCGv_ptr reg_d,
                                  TCGv_i32 even, TCGv_i32 odd);
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
|
2022-09-02 19:19:06 +03:00
|
|
|
/*
 * Create a 32-bit TCG constant from an unsigned 8-bit value.  Because the
 * parameter is a uint8_t, a wider argument is reduced to its low byte before
 * the constant is built.
 */
static inline TCGv_i32 tcg_constant8u_i32(uint8_t val)
{
    const uint8_t low_byte = val;

    return tcg_constant_i32(low_byte);
}
|
|
|
|
|
2022-09-18 01:43:52 +03:00
|
|
|
/* Emit code raising exception EXCP07_PREX (the "NM" in this helper's name). */
static void gen_NM_exception(DisasContext *s)
{
    gen_exception(s, EXCP07_PREX);
}
|
|
|
|
|
target/i386: add core of new i386 decoder
The new decoder is based on three principles:
- use mostly table-driven decoding, using tables derived as much as possible
from the Intel manual. Centralizing the decoding of the operands makes it
more homogeneous, for example all immediates are signed. All modrm
handling is in one function, and can be shared between SSE and ALU
instructions (including XMM<->GPR instructions). The SSE/AVX decoder
will also not have duplicated code between the 0F, 0F38 and 0F3A tables.
- keep the code as "non-branchy" as possible. Generally, the code for
the new decoder is more verbose, but the control flow is simpler.
Conditionals are not nested and have small bodies. All instruction
groups are resolved even before operands are decoded, and code
generation is separated as much as possible within small functions
that only handle one instruction each.
- keep address generation and (for ALU operands) memory loads and writeback
as much in common code as possible. All ALU operations for example
are implemented as T0=f(T0,T1). For non-ALU instructions,
read-modify-write memory operations are rare, but registers do not
have TCGv equivalents: therefore, the common logic sets up pointer
temporaries with the operands, while load and writeback are handled
by gvec or by helpers.
These principles make future code review and extensibility simpler, at
the cost of having a relatively large amount of code in the form of this
patch. Even EVEX should not be _too_ hard to implement (it's just a crazy
large amount of possibilities).
This patch introduces the main decoder flow, and integrates the old
decoder with the new one. The old decoder takes care of parsing
prefixes and then optionally drops to the new one. The changes to the
old decoder are minimal and allow it to be replaced incrementally with
the new one.
There is a debugging mechanism through a "LIMIT" environment variable.
In user-mode emulation, the variable is the number of instructions
decoded by the new decoder before permanently switching to the old one.
In system emulation, the variable is the highest opcode that is decoded
by the new decoder (this is less friendly, but it's the best that can
be done without requiring deterministic execution).
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-08-23 12:20:55 +03:00
|
|
|
/*
 * Emitter with the common (s, env, decode) signature that unconditionally
 * generates an illegal-opcode exception.  @env and @decode are not used.
 */
static void gen_illegal(DisasContext *s, CPUX86State *env,
                        X86DecodedInsn *decode)
{
    gen_illegal_opcode(s);
}
|
|
|
|
|
2022-09-18 01:43:52 +03:00
|
|
|
/*
 * Generate the effective address described by @mem.
 *
 * The raw address is first produced by gen_lea_modrm_1() (with @is_vsib
 * forwarded unchanged); gen_lea_v_seg() then combines it with the current
 * address size, the operand's default segment and any segment override
 * recorded in @s.
 */
static void gen_load_ea(DisasContext *s, AddressParts *mem, bool is_vsib)
{
    TCGv raw_addr = gen_lea_modrm_1(s, *mem, is_vsib);

    gen_lea_v_seg(s, s->aflag, raw_addr, mem->def_seg, s->override);
}
|
2022-08-23 15:55:56 +03:00
|
|
|
|
|
|
|
/*
 * Return the offset, inside an MMXReg, of element 0 for element size @ot.
 * Only MO_8 .. MO_64 are valid; anything else aborts.
 */
static inline int mmx_offset(MemOp ot)
{
    int ofs;

    switch (ot) {
    case MO_8:
        ofs = offsetof(MMXReg, MMX_B(0));
        break;
    case MO_16:
        ofs = offsetof(MMXReg, MMX_W(0));
        break;
    case MO_32:
        ofs = offsetof(MMXReg, MMX_L(0));
        break;
    case MO_64:
        ofs = offsetof(MMXReg, MMX_Q(0));
        break;
    default:
        g_assert_not_reached();
    }
    return ofs;
}
|
|
|
|
|
|
|
|
/*
 * Return the offset, inside a ZMMReg, of element 0 for element size @ot.
 * MO_8 .. MO_256 are valid; anything else aborts.
 */
static inline int xmm_offset(MemOp ot)
{
    int ofs;

    switch (ot) {
    case MO_8:
        ofs = offsetof(ZMMReg, ZMM_B(0));
        break;
    case MO_16:
        ofs = offsetof(ZMMReg, ZMM_W(0));
        break;
    case MO_32:
        ofs = offsetof(ZMMReg, ZMM_L(0));
        break;
    case MO_64:
        ofs = offsetof(ZMMReg, ZMM_Q(0));
        break;
    case MO_128:
        ofs = offsetof(ZMMReg, ZMM_X(0));
        break;
    case MO_256:
        ofs = offsetof(ZMMReg, ZMM_Y(0));
        break;
    default:
        g_assert_not_reached();
    }
    return ofs;
}
|
|
|
|
|
2022-09-20 12:42:45 +03:00
|
|
|
/*
 * Recover the offset of the start of the MMXReg/ZMMReg that @op refers to.
 *
 * op->offset points at element 0 for op->ot (see compute_mmx_offset() and
 * compute_xmm_offset()), so subtracting that element offset yields the start
 * of the register.  @op must be an MMX or SSE operand.
 */
static int vector_reg_offset(X86DecodedOp *op)
{
    bool is_mmx = op->unit == X86_OP_MMX;

    assert(is_mmx || op->unit == X86_OP_SSE);

    if (is_mmx) {
        return op->offset - mmx_offset(op->ot);
    }
    return op->offset - xmm_offset(op->ot);
}
|
|
|
|
|
|
|
|
/*
 * Return the offset of element @n, of size @ot, within the vector register
 * that @op refers to.
 *
 * The register base comes from vector_reg_offset(), which also asserts that
 * @op is an MMX or SSE operand.  For an MMX operand with MO_64 the register
 * base itself is returned and @n is not consulted.  MO_128 and MO_256 are
 * accepted for SSE operands only.
 */
static int vector_elem_offset(X86DecodedOp *op, MemOp ot, int n)
{
    int reg_base = vector_reg_offset(op);
    bool is_mmx = op->unit == X86_OP_MMX;

    switch (ot) {
    case MO_8:
        return is_mmx ? reg_base + offsetof(MMXReg, MMX_B(n))
                      : reg_base + offsetof(ZMMReg, ZMM_B(n));
    case MO_16:
        return is_mmx ? reg_base + offsetof(MMXReg, MMX_W(n))
                      : reg_base + offsetof(ZMMReg, ZMM_W(n));
    case MO_32:
        return is_mmx ? reg_base + offsetof(MMXReg, MMX_L(n))
                      : reg_base + offsetof(ZMMReg, ZMM_L(n));
    case MO_64:
        if (is_mmx) {
            /* The whole MMX register is the single 64-bit element. */
            return reg_base;
        }
        return reg_base + offsetof(ZMMReg, ZMM_Q(n));
    case MO_128:
        assert(op->unit == X86_OP_SSE);
        return reg_base + offsetof(ZMMReg, ZMM_X(n));
    case MO_256:
        assert(op->unit == X86_OP_SSE);
        return reg_base + offsetof(ZMMReg, ZMM_Y(n));
    default:
        g_assert_not_reached();
    }
}
|
|
|
|
|
2022-08-23 15:55:56 +03:00
|
|
|
/*
 * Fill in op->offset for an MMX operand.
 *
 * A register operand points into fpregs[op->n].mmx; a memory operand points
 * into the mmx_t0 scratch field of CPUX86State.  In both cases the offset of
 * element 0 for op->ot is added.
 */
static void compute_mmx_offset(X86DecodedOp *op)
{
    size_t reg_base = op->has_ea
        ? offsetof(CPUX86State, mmx_t0)
        : offsetof(CPUX86State, fpregs[op->n].mmx);

    op->offset = reg_base + mmx_offset(op->ot);
}
|
|
|
|
|
|
|
|
/*
 * Fill in op->offset for an SSE operand.
 *
 * A register operand points into xmm_regs[op->n] (via ZMM_OFFSET); a memory
 * operand points into the xmm_t0 scratch field of CPUX86State.  In both
 * cases the offset of element 0 for op->ot is added.
 */
static void compute_xmm_offset(X86DecodedOp *op)
{
    size_t reg_base = op->has_ea
        ? offsetof(CPUX86State, xmm_t0)
        : ZMM_OFFSET(op->n);

    op->offset = reg_base + xmm_offset(op->ot);
}
|
|
|
|
|
|
|
|
/*
 * Load a vector memory operand of size @ot from the address in s->A0 and
 * place it in CPUX86State at byte offset @dest_ofs.
 *
 * Sizes up to 32 bits are loaded into @temp and then stored to env; 64-bit
 * and wider operands go through the gen_ld{q,o,y}_env_A0() helpers, and
 * @aligned is only forwarded for the 128- and 256-bit cases.
 */
static void gen_load_sse(DisasContext *s, TCGv temp, MemOp ot, int dest_ofs,
                         bool aligned)
{
    switch (ot) {
    case MO_8:
    case MO_16:
    case MO_32:
        /* Small elements: load through the temporary, then spill to env. */
        gen_op_ld_v(s, ot, temp, s->A0);
        if (ot == MO_8) {
            tcg_gen_st8_tl(temp, tcg_env, dest_ofs);
        } else if (ot == MO_16) {
            tcg_gen_st16_tl(temp, tcg_env, dest_ofs);
        } else {
            tcg_gen_st32_tl(temp, tcg_env, dest_ofs);
        }
        return;
    case MO_64:
        gen_ldq_env_A0(s, dest_ofs);
        return;
    case MO_128:
        gen_ldo_env_A0(s, dest_ofs, aligned);
        return;
    case MO_256:
        gen_ldy_env_A0(s, dest_ofs, aligned);
        return;
    default:
        g_assert_not_reached();
    }
}
|
|
|
|
|
2022-09-18 01:43:52 +03:00
|
|
|
/*
 * Decide whether a memory operand of size @ot must be aligned.
 *
 *  - vex_class 1: aligned when the operand is 128 bits or wider.
 *  - vex_class 2 and 4: never aligned when a VEX prefix is present or the
 *    instruction is marked X86_VEX_SSEUnaligned; otherwise as for class 1.
 *  - any other class: no alignment requirement.
 */
static bool sse_needs_alignment(DisasContext *s, X86DecodedInsn *decode,
                                MemOp ot)
{
    bool class_2_or_4 = decode->e.vex_class == 2 || decode->e.vex_class == 4;

    if (class_2_or_4) {
        /* MOST legacy SSE instructions require aligned memory operands, but not all. */
        if ((s->prefix & PREFIX_VEX) ||
            decode->e.vex_special == X86_VEX_SSEUnaligned) {
            return false;
        }
    } else if (decode->e.vex_class != 1) {
        return false;
    }

    return ot >= MO_128;
}
|
|
|
|
|
2022-08-23 15:55:56 +03:00
|
|
|
/*
 * Load operand @opn of @decode.
 *
 * Segment, control-register, debug-register, integer and immediate operands
 * are loaded into @v.  For MMX/SSE operands, op->offset is computed first;
 * a memory operand is then loaded into env at that offset (using @v as a
 * scratch temporary for small sizes), while a register operand needs no
 * load.  X86_OP_SKIP operands are ignored.
 */
static void gen_load(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
{
    X86DecodedOp *op = &decode->op[opn];

    switch (op->unit) {
    case X86_OP_SKIP:
        return;

    case X86_OP_SEG:
        tcg_gen_ld32u_tl(v, tcg_env,
                         offsetof(CPUX86State,segs[op->n].selector));
        return;

    case X86_OP_CR:
        tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, cr[op->n]));
        return;

    case X86_OP_DR:
        tcg_gen_ld_tl(v, tcg_env, offsetof(CPUX86State, dr[op->n]));
        return;

    case X86_OP_INT:
        if (op->has_ea) {
            gen_op_ld_v(s, op->ot, v, s->A0);
        } else {
            gen_op_mov_v_reg(s, op->ot, v, op->n);
        }
        return;

    case X86_OP_IMM:
        tcg_gen_movi_tl(v, decode->immediate);
        return;

    case X86_OP_MMX:
    case X86_OP_SSE:
        /* The two vector units differ only in how the offset is computed. */
        if (op->unit == X86_OP_MMX) {
            compute_mmx_offset(op);
        } else {
            compute_xmm_offset(op);
        }
        if (op->has_ea) {
            bool aligned = sse_needs_alignment(s, decode, op->ot);
            gen_load_sse(s, v, op->ot, op->offset, aligned);
        }
        return;

    default:
        g_assert_not_reached();
    }
}
|
|
|
|
|
2022-09-20 12:42:45 +03:00
|
|
|
/*
 * Return a TCGv_ptr addressing the vector register of operand @opn.
 *
 * The pointer temporary is created on first use and cached in op->v_ptr,
 * so repeated calls for the same operand return the same temporary.
 */
static TCGv_ptr op_ptr(X86DecodedInsn *decode, int opn)
{
    X86DecodedOp *op = &decode->op[opn];

    if (!op->v_ptr) {
        op->v_ptr = tcg_temp_new_ptr();

        /* The temporary points to the MMXReg or ZMMReg. */
        tcg_gen_addi_ptr(op->v_ptr, tcg_env, vector_reg_offset(op));
    }
    return op->v_ptr;
}
|
|
|
|
|
|
|
|
/*
 * Shorthands for op_ptr() on operands 0, 1 and 2.  They expand to a use of a
 * variable named `decode`, which must be in scope at the point of use.
 */
#define OP_PTR0 op_ptr(decode, 0)
#define OP_PTR1 op_ptr(decode, 1)
#define OP_PTR2 op_ptr(decode, 2)
|
|
|
|
|
2022-08-23 15:55:56 +03:00
|
|
|
/*
 * Write back the result for operand @opn of @decode.
 *
 *  - SKIP and MMX operands: nothing to do here.
 *  - SEG: delegated to gen_movl_seg_T0(); @v is not passed to it.
 *  - INT: @v is stored to memory at s->A0 or moved into register op->n.
 *  - SSE: for a register destination under a VEX prefix with an operand of
 *    at most 128 bits, the 16 bytes at ZMM_X(1) of the register are zeroed.
 *  - CR, DR and anything else: not supported, aborts.
 */
static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn,
                          TCGv v)
{
    X86DecodedOp *op = &decode->op[opn];

    switch (op->unit) {
    case X86_OP_SKIP:
    case X86_OP_MMX:
        return;

    case X86_OP_SEG:
        /* Note that gen_movl_seg_T0 takes care of interrupt shadow and TF. */
        gen_movl_seg_T0(s, op->n);
        return;

    case X86_OP_INT:
        if (op->has_ea) {
            gen_op_st_v(s, op->ot, v, s->A0);
        } else {
            gen_op_mov_reg_v(s, op->ot, op->n, v);
        }
        return;

    case X86_OP_SSE:
        if (!op->has_ea && (s->prefix & PREFIX_VEX) && op->ot <= MO_128) {
            tcg_gen_gvec_dup_imm(MO_64,
                                 offsetof(CPUX86State, xmm_regs[op->n].ZMM_X(1)),
                                 16, 16, 0);
        }
        return;

    case X86_OP_CR:
    case X86_OP_DR:
    default:
        g_assert_not_reached();
    }
}
|
2022-08-24 19:01:41 +03:00
|
|
|
|
2022-09-20 12:42:45 +03:00
|
|
|
/*
 * Return the vector length in bytes for the current instruction:
 * 8 for an X86_SPECIAL_MMX instruction with none of the 66/F3/F2 prefixes,
 * otherwise 32 when VEX.L is set and 16 when it is clear.
 */
static inline int vector_len(DisasContext *s, X86DecodedInsn *decode)
{
    bool has_simd_prefix =
        (s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ)) != 0;

    if (decode->e.special == X86_SPECIAL_MMX && !has_simd_prefix) {
        return 8;
    }
    if (s->vex_l) {
        return 32;
    }
    return 16;
}
|
|
|
|
|
|
|
|
/*
 * Store the vector value found in env at byte offset @src_ofs into
 * operand 0 of @decode.
 *
 * A register destination is written with a gvec move of vector_len() bytes.
 * A memory destination (address in s->A0) supports 64-, 128- and 256-bit
 * operands; the alignment requirement from sse_needs_alignment() is passed
 * on for the two wider sizes.
 */
static void gen_store_sse(DisasContext *s, X86DecodedInsn *decode, int src_ofs)
{
    X86DecodedOp *dst = &decode->op[0];
    MemOp ot = dst->ot;

    if (!dst->has_ea) {
        int len = vector_len(s, decode);

        tcg_gen_gvec_mov(MO_64, dst->offset, src_ofs, len, len);
        return;
    }

    if (ot == MO_64) {
        gen_stq_env_A0(s, src_ofs);
    } else if (ot == MO_128) {
        gen_sto_env_A0(s, src_ofs, sse_needs_alignment(s, decode, ot));
    } else if (ot == MO_256) {
        gen_sty_env_A0(s, src_ofs, sse_needs_alignment(s, decode, ot));
    } else {
        g_assert_not_reached();
    }
}
|
|
|
|
|
2022-09-06 00:27:53 +03:00
|
|
|
/*
 * Two-operand adapter around the three-operand gen_helper_pavgb_mmx():
 * @reg_a is passed as both the destination and the first source, so the
 * function fits the SSEFunc_0_epp signature used by the 3DNow! table.
 */
static void gen_helper_pavgusb(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b)
{
    TCGv_ptr dst_and_src = reg_a;

    gen_helper_pavgb_mmx(env, dst_and_src, dst_and_src, reg_b);
}
|
|
|
|
|
|
|
|
#define FN_3DNOW_MOVE ((SSEFunc_0_epp) (uintptr_t) 1)
|
|
|
|
static const SSEFunc_0_epp fns_3dnow[] = {
|
|
|
|
[0x0c] = gen_helper_pi2fw,
|
|
|
|
[0x0d] = gen_helper_pi2fd,
|
|
|
|
[0x1c] = gen_helper_pf2iw,
|
|
|
|
[0x1d] = gen_helper_pf2id,
|
|
|
|
[0x8a] = gen_helper_pfnacc,
|
|
|
|
[0x8e] = gen_helper_pfpnacc,
|
|
|
|
[0x90] = gen_helper_pfcmpge,
|
|
|
|
[0x94] = gen_helper_pfmin,
|
|
|
|
[0x96] = gen_helper_pfrcp,
|
|
|
|
[0x97] = gen_helper_pfrsqrt,
|
|
|
|
[0x9a] = gen_helper_pfsub,
|
|
|
|
[0x9e] = gen_helper_pfadd,
|
|
|
|
[0xa0] = gen_helper_pfcmpgt,
|
|
|
|
[0xa4] = gen_helper_pfmax,
|
|
|
|
[0xa6] = FN_3DNOW_MOVE, /* PFRCPIT1; no need to actually increase precision */
|
|
|
|
[0xa7] = FN_3DNOW_MOVE, /* PFRSQIT1 */
|
|
|
|
[0xb6] = FN_3DNOW_MOVE, /* PFRCPIT2 */
|
|
|
|
[0xaa] = gen_helper_pfsubr,
|
|
|
|
[0xae] = gen_helper_pfacc,
|
|
|
|
[0xb0] = gen_helper_pfcmpeq,
|
|
|
|
[0xb4] = gen_helper_pfmul,
|
|
|
|
[0xb7] = gen_helper_pmulhrw_mmx,
|
|
|
|
[0xbb] = gen_helper_pswapd,
|
|
|
|
[0xbf] = gen_helper_pavgusb,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Emit code for a 3DNow! instruction.
 *
 * The low byte of decode->immediate selects the operation from fns_3dnow[].
 * Checks are made in this order: unknown opcode -> illegal opcode; HF_TS_MASK
 * set -> NM exception; HF_EM_MASK set -> illegal opcode.  Otherwise MMX mode
 * is entered and either the helper is called on operands 0 and 1, or (for
 * FN_3DNOW_MOVE entries) operand 1 is copied to operand 0 as a 64-bit value.
 */
static void gen_3dnow(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    uint8_t opcode = decode->immediate;
    SSEFunc_0_epp fn = NULL;

    if (opcode < ARRAY_SIZE(fns_3dnow)) {
        fn = fns_3dnow[opcode];
    }

    if (!fn) {
        gen_illegal_opcode(s);
        return;
    }
    if (s->flags & HF_TS_MASK) {
        gen_NM_exception(s);
        return;
    }
    if (s->flags & HF_EM_MASK) {
        gen_illegal_opcode(s);
        return;
    }

    gen_helper_enter_mmx(tcg_env);

    if (fn != FN_3DNOW_MOVE) {
        fn(tcg_env, OP_PTR0, OP_PTR1);
        return;
    }

    /* Sentinel entry: a plain 64-bit move from source to destination. */
    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset);
    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset);
}
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
|
|
|
|
* 00 = v*ps Vps, Hps, Wpd
|
|
|
|
* 66 = v*pd Vpd, Hpd, Wps
|
|
|
|
* f3 = v*ss Vss, Hss, Wps
|
|
|
|
* f2 = v*sd Vsd, Hsd, Wps
|
|
|
|
*/
|
|
|
|
static inline void gen_unary_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
|
|
|
|
SSEFunc_0_epp pd_xmm, SSEFunc_0_epp ps_xmm,
|
|
|
|
SSEFunc_0_epp pd_ymm, SSEFunc_0_epp ps_ymm,
|
|
|
|
SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
|
|
|
|
{
|
|
|
|
if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
|
|
|
|
SSEFunc_0_eppp fn = s->prefix & PREFIX_REPZ ? ss : sd;
|
|
|
|
if (!fn) {
|
|
|
|
gen_illegal_opcode(s);
|
|
|
|
return;
|
|
|
|
}
|
2023-09-14 02:37:36 +03:00
|
|
|
fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
|
2022-09-01 15:27:55 +03:00
|
|
|
} else {
|
|
|
|
SSEFunc_0_epp ps, pd, fn;
|
|
|
|
ps = s->vex_l ? ps_ymm : ps_xmm;
|
|
|
|
pd = s->vex_l ? pd_ymm : pd_xmm;
|
|
|
|
fn = s->prefix & PREFIX_DATA ? pd : ps;
|
|
|
|
if (!fn) {
|
|
|
|
gen_illegal_opcode(s);
|
|
|
|
return;
|
|
|
|
}
|
2023-09-14 02:37:36 +03:00
|
|
|
fn(tcg_env, OP_PTR0, OP_PTR2);
|
2022-09-01 15:27:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#define UNARY_FP_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_unary_fp_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##pd_xmm, \
|
|
|
|
gen_helper_##lname##ps_xmm, \
|
|
|
|
gen_helper_##lname##pd_ymm, \
|
|
|
|
gen_helper_##lname##ps_ymm, \
|
|
|
|
gen_helper_##lname##sd, \
|
|
|
|
gen_helper_##lname##ss); \
|
|
|
|
}
|
|
|
|
UNARY_FP_SSE(VSQRT, sqrt)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 00 = v*ps Vps, Hps, Wpd
|
|
|
|
* 66 = v*pd Vpd, Hpd, Wps
|
|
|
|
* f3 = v*ss Vss, Hss, Wps
|
|
|
|
* f2 = v*sd Vsd, Hsd, Wps
|
|
|
|
*/
|
|
|
|
static inline void gen_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
|
|
|
|
SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
|
|
|
|
SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm,
|
|
|
|
SSEFunc_0_eppp sd, SSEFunc_0_eppp ss)
|
|
|
|
{
|
|
|
|
SSEFunc_0_eppp ps, pd, fn;
|
|
|
|
if ((s->prefix & (PREFIX_REPZ | PREFIX_REPNZ)) != 0) {
|
|
|
|
fn = s->prefix & PREFIX_REPZ ? ss : sd;
|
|
|
|
} else {
|
|
|
|
ps = s->vex_l ? ps_ymm : ps_xmm;
|
|
|
|
pd = s->vex_l ? pd_ymm : pd_xmm;
|
|
|
|
fn = s->prefix & PREFIX_DATA ? pd : ps;
|
|
|
|
}
|
|
|
|
if (fn) {
|
2023-09-14 02:37:36 +03:00
|
|
|
fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
|
2022-09-01 15:27:55 +03:00
|
|
|
} else {
|
|
|
|
gen_illegal_opcode(s);
|
|
|
|
}
|
|
|
|
}
|
2022-09-18 00:22:36 +03:00
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
#define FP_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_fp_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##pd_xmm, \
|
|
|
|
gen_helper_##lname##ps_xmm, \
|
|
|
|
gen_helper_##lname##pd_ymm, \
|
|
|
|
gen_helper_##lname##ps_ymm, \
|
|
|
|
gen_helper_##lname##sd, \
|
|
|
|
gen_helper_##lname##ss); \
|
|
|
|
}
|
|
|
|
FP_SSE(VADD, add)
|
|
|
|
FP_SSE(VMUL, mul)
|
|
|
|
FP_SSE(VSUB, sub)
|
|
|
|
FP_SSE(VMIN, min)
|
|
|
|
FP_SSE(VDIV, div)
|
|
|
|
FP_SSE(VMAX, max)
|
|
|
|
|
2022-10-19 14:22:06 +03:00
|
|
|
/*
 * Define gen_<insn>Px(), the packed form of a fused multiply-add.
 *
 * VEX.W picks the pd helper over the ps one, VEX.L the _ymm helper over the
 * _xmm one.  The helper is called with operand 0 followed by ptr0, ptr1 and
 * ptr2, then two constants: `even`, and `even ^ odd`.
 */
#define FMA_SSE_PACKED(insn, ptr0, ptr1, ptr2, even, odd)                      \
static void gen_##insn##Px(DisasContext *s, CPUX86State *env,                  \
                           X86DecodedInsn *decode)                             \
{                                                                              \
    SSEFunc_0_eppppii helper;                                                  \
                                                                               \
    if (s->vex_l) {                                                            \
        helper = s->vex_w ? gen_helper_fma4pd_ymm : gen_helper_fma4ps_ymm;     \
    } else {                                                                   \
        helper = s->vex_w ? gen_helper_fma4pd_xmm : gen_helper_fma4ps_xmm;     \
    }                                                                          \
                                                                               \
    helper(tcg_env, OP_PTR0, ptr0, ptr1, ptr2,                                 \
           tcg_constant_i32(even),                                             \
           tcg_constant_i32((even) ^ (odd)));                                  \
}
|
|
|
|
|
|
|
|
/*
 * Define both forms of a fused multiply-add: gen_<insn>Px() through
 * FMA_SSE_PACKED with the same flags for `even` and `odd`, and the scalar
 * gen_<insn>Sx(), where VEX.W picks the sd helper over the ss one.
 */
#define FMA_SSE(insn, ptr0, ptr1, ptr2, flags)                                 \
FMA_SSE_PACKED(insn, ptr0, ptr1, ptr2, flags, flags)                           \
static void gen_##insn##Sx(DisasContext *s, CPUX86State *env,                  \
                           X86DecodedInsn *decode)                             \
{                                                                              \
    SSEFunc_0_eppppi helper =                                                  \
        s->vex_w ? gen_helper_fma4sd : gen_helper_fma4ss;                      \
                                                                               \
    helper(tcg_env, OP_PTR0, ptr0, ptr1, ptr2,                                 \
           tcg_constant_i32(flags));                                           \
}
|
|
|
|
|
|
|
|
/*
 * FMA emitters.  The 231/213/132 suffix is reflected in the order in which
 * the three operand pointers are handed to the helper.
 */

/* Multiply-add: no flags. */
FMA_SSE(VFMADD231, OP_PTR1, OP_PTR2, OP_PTR0, 0)
FMA_SSE(VFMADD213, OP_PTR1, OP_PTR0, OP_PTR2, 0)
FMA_SSE(VFMADD132, OP_PTR0, OP_PTR2, OP_PTR1, 0)

/* Negated multiply-add. */
FMA_SSE(VFNMADD231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_product)
FMA_SSE(VFNMADD213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_product)
FMA_SSE(VFNMADD132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_product)

/* Multiply-subtract. */
FMA_SSE(VFMSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c)
FMA_SSE(VFMSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c)
FMA_SSE(VFMSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c)

/* Negated multiply-subtract. */
FMA_SSE(VFNMSUB231, OP_PTR1, OP_PTR2, OP_PTR0,
        float_muladd_negate_c|float_muladd_negate_product)
FMA_SSE(VFNMSUB213, OP_PTR1, OP_PTR0, OP_PTR2,
        float_muladd_negate_c|float_muladd_negate_product)
FMA_SSE(VFNMSUB132, OP_PTR0, OP_PTR2, OP_PTR1,
        float_muladd_negate_c|float_muladd_negate_product)

/* Packed only: different flags for the `even` and `odd` arguments. */
FMA_SSE_PACKED(VFMADDSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c, 0)
FMA_SSE_PACKED(VFMADDSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c, 0)
FMA_SSE_PACKED(VFMADDSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c, 0)

FMA_SSE_PACKED(VFMSUBADD231, OP_PTR1, OP_PTR2, OP_PTR0, 0, float_muladd_negate_c)
FMA_SSE_PACKED(VFMSUBADD213, OP_PTR1, OP_PTR0, OP_PTR2, 0, float_muladd_negate_c)
FMA_SSE_PACKED(VFMSUBADD132, OP_PTR0, OP_PTR2, OP_PTR1, 0, float_muladd_negate_c)
|
|
|
|
|
2022-09-18 00:22:36 +03:00
|
|
|
#define FP_UNPACK_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
/* PS maps to the DQ integer instruction, PD maps to QDQ. */ \
|
|
|
|
gen_fp_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##qdq_xmm, \
|
|
|
|
gen_helper_##lname##dq_xmm, \
|
|
|
|
gen_helper_##lname##qdq_ymm, \
|
|
|
|
gen_helper_##lname##dq_ymm, \
|
|
|
|
NULL, NULL); \
|
|
|
|
}
|
|
|
|
FP_UNPACK_SSE(VUNPCKLPx, punpckl)
|
|
|
|
FP_UNPACK_SSE(VUNPCKHPx, punpckh)
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
|
|
|
|
* 00 = v*ps Vps, Wpd
|
|
|
|
* f3 = v*ss Vss, Wps
|
|
|
|
*/
|
|
|
|
static inline void gen_unary_fp32_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
|
|
|
|
SSEFunc_0_epp ps_xmm,
|
|
|
|
SSEFunc_0_epp ps_ymm,
|
|
|
|
SSEFunc_0_eppp ss)
|
|
|
|
{
|
|
|
|
if ((s->prefix & (PREFIX_DATA | PREFIX_REPNZ)) != 0) {
|
|
|
|
goto illegal_op;
|
|
|
|
} else if (s->prefix & PREFIX_REPZ) {
|
|
|
|
if (!ss) {
|
|
|
|
goto illegal_op;
|
|
|
|
}
|
2023-09-14 02:37:36 +03:00
|
|
|
ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
|
2022-09-01 15:27:55 +03:00
|
|
|
} else {
|
|
|
|
SSEFunc_0_epp fn = s->vex_l ? ps_ymm : ps_xmm;
|
|
|
|
if (!fn) {
|
|
|
|
goto illegal_op;
|
|
|
|
}
|
2023-09-14 02:37:36 +03:00
|
|
|
fn(tcg_env, OP_PTR0, OP_PTR2);
|
2022-09-01 15:27:55 +03:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
|
|
|
|
illegal_op:
|
|
|
|
gen_illegal_opcode(s);
|
|
|
|
}
|
|
|
|
#define UNARY_FP32_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_unary_fp32_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##ps_xmm, \
|
|
|
|
gen_helper_##lname##ps_ymm, \
|
|
|
|
gen_helper_##lname##ss); \
|
|
|
|
}
|
|
|
|
UNARY_FP32_SSE(VRSQRT, rsqrt)
|
|
|
|
UNARY_FP32_SSE(VRCP, rcp)
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
|
|
|
|
* 66 = v*pd Vpd, Hpd, Wpd
|
|
|
|
* f2 = v*ps Vps, Hps, Wps
|
|
|
|
*/
|
|
|
|
static inline void gen_horizontal_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
|
|
|
|
SSEFunc_0_eppp pd_xmm, SSEFunc_0_eppp ps_xmm,
|
|
|
|
SSEFunc_0_eppp pd_ymm, SSEFunc_0_eppp ps_ymm)
|
|
|
|
{
|
|
|
|
SSEFunc_0_eppp ps, pd, fn;
|
|
|
|
ps = s->vex_l ? ps_ymm : ps_xmm;
|
|
|
|
pd = s->vex_l ? pd_ymm : pd_xmm;
|
|
|
|
fn = s->prefix & PREFIX_DATA ? pd : ps;
|
2023-09-14 02:37:36 +03:00
|
|
|
fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
|
2022-09-01 15:27:55 +03:00
|
|
|
}
|
|
|
|
#define HORIZONTAL_FP_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_horizontal_fp_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##pd_xmm, gen_helper_##lname##ps_xmm, \
|
|
|
|
gen_helper_##lname##pd_ymm, gen_helper_##lname##ps_ymm); \
|
|
|
|
}
|
|
|
|
HORIZONTAL_FP_SSE(VHADD, hadd)
|
|
|
|
HORIZONTAL_FP_SSE(VHSUB, hsub)
|
2022-09-01 15:27:55 +03:00
|
|
|
HORIZONTAL_FP_SSE(VADDSUB, addsub)
|
2022-09-01 15:27:55 +03:00
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * Emit a call to a helper that takes four vector operands: operands 0, 1
 * and 2 of the decoded instruction plus the vector register with index
 * @op3.  The ymm helper is used when s->vex_l is set, the xmm helper
 * otherwise.
 */
static inline void gen_ternary_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
                                   int op3, SSEFunc_0_epppp xmm, SSEFunc_0_epppp ymm)
{
    TCGv_ptr src3 = tcg_temp_new_ptr();

    /* The format of the fourth input is Lx */
    tcg_gen_addi_ptr(src3, tcg_env, ZMM_OFFSET(op3));

    if (s->vex_l) {
        ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, src3);
    } else {
        xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, src3);
    }
}
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
#define TERNARY_SSE(uname, uvname, lname) \
|
2022-09-06 11:34:11 +03:00
|
|
|
static void gen_##uvname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_ternary_sse(s, env, decode, (uint8_t)decode->immediate >> 4, \
|
|
|
|
gen_helper_##lname##_xmm, gen_helper_##lname##_ymm); \
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
} \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_ternary_sse(s, env, decode, 0, \
|
|
|
|
gen_helper_##lname##_xmm, gen_helper_##lname##_ymm); \
|
2022-09-06 11:34:11 +03:00
|
|
|
}
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
TERNARY_SSE(BLENDVPS, VBLENDVPS, blendvps)
|
|
|
|
TERNARY_SSE(BLENDVPD, VBLENDVPD, blendvpd)
|
|
|
|
TERNARY_SSE(PBLENDVB, VPBLENDVB, pblendvb)
|
2022-09-06 11:34:11 +03:00
|
|
|
|
|
|
|
/*
 * Emit a call to a helper taking operands 0, 1 and 2 plus the instruction's
 * immediate (converted with tcg_constant8u_i32).  The ymm helper is used
 * when s->vex_l is set, the xmm helper otherwise.
 *
 * Either helper may be NULL when the instruction has no form of that width
 * (for example gen_helper_dppd_ymm is defined as NULL in this file).
 * Selecting a missing helper raises an illegal opcode rather than calling
 * through a NULL pointer.
 */
static inline void gen_binary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
                                      SSEFunc_0_epppi xmm, SSEFunc_0_epppi ymm)
{
    SSEFunc_0_epppi fn = s->vex_l ? ymm : xmm;
    TCGv_i32 imm;

    if (!fn) {
        gen_illegal_opcode(s);
        return;
    }

    imm = tcg_constant8u_i32(decode->immediate);
    fn(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm);
}
|
|
|
|
|
|
|
|
#define BINARY_IMM_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_binary_imm_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##_xmm, \
|
|
|
|
gen_helper_##lname##_ymm); \
|
|
|
|
}
|
|
|
|
|
|
|
|
BINARY_IMM_SSE(VBLENDPD, blendpd)
|
|
|
|
BINARY_IMM_SSE(VBLENDPS, blendps)
|
|
|
|
BINARY_IMM_SSE(VPBLENDW, pblendw)
|
|
|
|
BINARY_IMM_SSE(VDDPS, dpps)
|
|
|
|
#define gen_helper_dppd_ymm NULL
|
|
|
|
BINARY_IMM_SSE(VDDPD, dppd)
|
|
|
|
BINARY_IMM_SSE(VMPSADBW, mpsadbw)
|
|
|
|
BINARY_IMM_SSE(PCLMULQDQ, pclmulqdq)
|
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
|
|
|
|
#define UNARY_INT_GVEC(uname, func, ...) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
int vec_len = vector_len(s, decode); \
|
|
|
|
\
|
|
|
|
func(__VA_ARGS__, decode->op[0].offset, \
|
|
|
|
decode->op[2].offset, vec_len, vec_len); \
|
|
|
|
}
|
|
|
|
UNARY_INT_GVEC(PABSB, tcg_gen_gvec_abs, MO_8)
|
|
|
|
UNARY_INT_GVEC(PABSW, tcg_gen_gvec_abs, MO_16)
|
|
|
|
UNARY_INT_GVEC(PABSD, tcg_gen_gvec_abs, MO_32)
|
|
|
|
UNARY_INT_GVEC(VBROADCASTx128, tcg_gen_gvec_dup_mem, MO_128)
|
|
|
|
UNARY_INT_GVEC(VPBROADCASTB, tcg_gen_gvec_dup_mem, MO_8)
|
|
|
|
UNARY_INT_GVEC(VPBROADCASTW, tcg_gen_gvec_dup_mem, MO_16)
|
|
|
|
UNARY_INT_GVEC(VPBROADCASTD, tcg_gen_gvec_dup_mem, MO_32)
|
|
|
|
UNARY_INT_GVEC(VPBROADCASTQ, tcg_gen_gvec_dup_mem, MO_64)
|
|
|
|
|
|
|
|
|
2022-09-20 12:42:45 +03:00
|
|
|
#define BINARY_INT_GVEC(uname, func, ...) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
int vec_len = vector_len(s, decode); \
|
|
|
|
\
|
|
|
|
func(__VA_ARGS__, \
|
|
|
|
decode->op[0].offset, decode->op[1].offset, \
|
|
|
|
decode->op[2].offset, vec_len, vec_len); \
|
|
|
|
}
|
|
|
|
|
2022-09-05 16:39:36 +03:00
|
|
|
BINARY_INT_GVEC(PADDB, tcg_gen_gvec_add, MO_8)
|
|
|
|
BINARY_INT_GVEC(PADDW, tcg_gen_gvec_add, MO_16)
|
|
|
|
BINARY_INT_GVEC(PADDD, tcg_gen_gvec_add, MO_32)
|
2022-09-01 15:27:55 +03:00
|
|
|
BINARY_INT_GVEC(PADDQ, tcg_gen_gvec_add, MO_64)
|
2022-09-05 16:39:36 +03:00
|
|
|
BINARY_INT_GVEC(PADDSB, tcg_gen_gvec_ssadd, MO_8)
|
|
|
|
BINARY_INT_GVEC(PADDSW, tcg_gen_gvec_ssadd, MO_16)
|
|
|
|
BINARY_INT_GVEC(PADDUSB, tcg_gen_gvec_usadd, MO_8)
|
|
|
|
BINARY_INT_GVEC(PADDUSW, tcg_gen_gvec_usadd, MO_16)
|
|
|
|
BINARY_INT_GVEC(PAND, tcg_gen_gvec_and, MO_64)
|
2022-09-02 19:19:06 +03:00
|
|
|
BINARY_INT_GVEC(PCMPEQB, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_8)
|
|
|
|
BINARY_INT_GVEC(PCMPEQD, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_32)
|
|
|
|
BINARY_INT_GVEC(PCMPEQW, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_16)
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_GVEC(PCMPEQQ, tcg_gen_gvec_cmp, TCG_COND_EQ, MO_64)
|
2022-09-20 12:42:45 +03:00
|
|
|
BINARY_INT_GVEC(PCMPGTB, tcg_gen_gvec_cmp, TCG_COND_GT, MO_8)
|
|
|
|
BINARY_INT_GVEC(PCMPGTW, tcg_gen_gvec_cmp, TCG_COND_GT, MO_16)
|
|
|
|
BINARY_INT_GVEC(PCMPGTD, tcg_gen_gvec_cmp, TCG_COND_GT, MO_32)
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_GVEC(PCMPGTQ, tcg_gen_gvec_cmp, TCG_COND_GT, MO_64)
|
|
|
|
BINARY_INT_GVEC(PMAXSB, tcg_gen_gvec_smax, MO_8)
|
2022-09-05 16:39:36 +03:00
|
|
|
BINARY_INT_GVEC(PMAXSW, tcg_gen_gvec_smax, MO_16)
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_GVEC(PMAXSD, tcg_gen_gvec_smax, MO_32)
|
2022-09-05 16:39:36 +03:00
|
|
|
BINARY_INT_GVEC(PMAXUB, tcg_gen_gvec_umax, MO_8)
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_GVEC(PMAXUW, tcg_gen_gvec_umax, MO_16)
|
|
|
|
BINARY_INT_GVEC(PMAXUD, tcg_gen_gvec_umax, MO_32)
|
|
|
|
BINARY_INT_GVEC(PMINSB, tcg_gen_gvec_smin, MO_8)
|
2022-09-05 16:39:36 +03:00
|
|
|
BINARY_INT_GVEC(PMINSW, tcg_gen_gvec_smin, MO_16)
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_GVEC(PMINSD, tcg_gen_gvec_smin, MO_32)
|
2022-09-05 16:39:36 +03:00
|
|
|
BINARY_INT_GVEC(PMINUB, tcg_gen_gvec_umin, MO_8)
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_GVEC(PMINUW, tcg_gen_gvec_umin, MO_16)
|
|
|
|
BINARY_INT_GVEC(PMINUD, tcg_gen_gvec_umin, MO_32)
|
2022-09-01 15:27:55 +03:00
|
|
|
BINARY_INT_GVEC(PMULLW, tcg_gen_gvec_mul, MO_16)
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_GVEC(PMULLD, tcg_gen_gvec_mul, MO_32)
|
2022-09-05 16:39:36 +03:00
|
|
|
BINARY_INT_GVEC(POR, tcg_gen_gvec_or, MO_64)
|
|
|
|
BINARY_INT_GVEC(PSUBB, tcg_gen_gvec_sub, MO_8)
|
|
|
|
BINARY_INT_GVEC(PSUBW, tcg_gen_gvec_sub, MO_16)
|
|
|
|
BINARY_INT_GVEC(PSUBD, tcg_gen_gvec_sub, MO_32)
|
|
|
|
BINARY_INT_GVEC(PSUBQ, tcg_gen_gvec_sub, MO_64)
|
|
|
|
BINARY_INT_GVEC(PSUBSB, tcg_gen_gvec_sssub, MO_8)
|
|
|
|
BINARY_INT_GVEC(PSUBSW, tcg_gen_gvec_sssub, MO_16)
|
|
|
|
BINARY_INT_GVEC(PSUBUSB, tcg_gen_gvec_ussub, MO_8)
|
|
|
|
BINARY_INT_GVEC(PSUBUSW, tcg_gen_gvec_ussub, MO_16)
|
|
|
|
BINARY_INT_GVEC(PXOR, tcg_gen_gvec_xor, MO_64)
|
2022-09-20 12:42:45 +03:00
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* 00 = p* Pq, Qq (if mmx not NULL; no VEX)
|
|
|
|
* 66 = vp* Vx, Hx, Wx
|
|
|
|
*
|
|
|
|
* These are really the same encoding, because 1) V is the same as P when VEX.V
|
|
|
|
* is not present 2) P and Q are the same as H and W apart from MM/XMM
|
|
|
|
*/
|
|
|
|
static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
|
|
|
|
SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
|
|
|
|
{
|
|
|
|
assert(!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));
|
|
|
|
|
|
|
|
if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) {
|
|
|
|
/* VEX encoding is not applicable to MMX instructions. */
|
|
|
|
gen_illegal_opcode(s);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (!(s->prefix & PREFIX_DATA)) {
|
2023-09-14 02:37:36 +03:00
|
|
|
mmx(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
|
2022-09-20 12:42:45 +03:00
|
|
|
} else if (!s->vex_l) {
|
2023-09-14 02:37:36 +03:00
|
|
|
xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
|
2022-09-20 12:42:45 +03:00
|
|
|
} else {
|
2023-09-14 02:37:36 +03:00
|
|
|
ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
|
2022-09-20 12:42:45 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#define BINARY_INT_MMX(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_binary_int_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##_mmx, \
|
|
|
|
gen_helper_##lname##_xmm, \
|
|
|
|
gen_helper_##lname##_ymm); \
|
|
|
|
}
|
|
|
|
BINARY_INT_MMX(PUNPCKLBW, punpcklbw)
|
|
|
|
BINARY_INT_MMX(PUNPCKLWD, punpcklwd)
|
|
|
|
BINARY_INT_MMX(PUNPCKLDQ, punpckldq)
|
|
|
|
BINARY_INT_MMX(PACKSSWB, packsswb)
|
|
|
|
BINARY_INT_MMX(PACKUSWB, packuswb)
|
|
|
|
BINARY_INT_MMX(PUNPCKHBW, punpckhbw)
|
|
|
|
BINARY_INT_MMX(PUNPCKHWD, punpckhwd)
|
|
|
|
BINARY_INT_MMX(PUNPCKHDQ, punpckhdq)
|
|
|
|
BINARY_INT_MMX(PACKSSDW, packssdw)
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
BINARY_INT_MMX(PAVGB, pavgb)
|
|
|
|
BINARY_INT_MMX(PAVGW, pavgw)
|
|
|
|
BINARY_INT_MMX(PMADDWD, pmaddwd)
|
|
|
|
BINARY_INT_MMX(PMULHUW, pmulhuw)
|
|
|
|
BINARY_INT_MMX(PMULHW, pmulhw)
|
|
|
|
BINARY_INT_MMX(PMULUDQ, pmuludq)
|
|
|
|
BINARY_INT_MMX(PSADBW, psadbw)
|
|
|
|
|
|
|
|
BINARY_INT_MMX(PSLLW_r, psllw)
|
|
|
|
BINARY_INT_MMX(PSLLD_r, pslld)
|
|
|
|
BINARY_INT_MMX(PSLLQ_r, psllq)
|
|
|
|
BINARY_INT_MMX(PSRLW_r, psrlw)
|
|
|
|
BINARY_INT_MMX(PSRLD_r, psrld)
|
|
|
|
BINARY_INT_MMX(PSRLQ_r, psrlq)
|
|
|
|
BINARY_INT_MMX(PSRAW_r, psraw)
|
|
|
|
BINARY_INT_MMX(PSRAD_r, psrad)
|
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_MMX(PHADDW, phaddw)
|
|
|
|
BINARY_INT_MMX(PHADDSW, phaddsw)
|
|
|
|
BINARY_INT_MMX(PHADDD, phaddd)
|
|
|
|
BINARY_INT_MMX(PHSUBW, phsubw)
|
|
|
|
BINARY_INT_MMX(PHSUBSW, phsubsw)
|
|
|
|
BINARY_INT_MMX(PHSUBD, phsubd)
|
|
|
|
BINARY_INT_MMX(PMADDUBSW, pmaddubsw)
|
|
|
|
BINARY_INT_MMX(PSHUFB, pshufb)
|
|
|
|
BINARY_INT_MMX(PSIGNB, psignb)
|
|
|
|
BINARY_INT_MMX(PSIGNW, psignw)
|
|
|
|
BINARY_INT_MMX(PSIGND, psignd)
|
|
|
|
BINARY_INT_MMX(PMULHRSW, pmulhrsw)
|
|
|
|
|
2022-09-20 12:42:45 +03:00
|
|
|
/* Instructions with no MMX equivalent. */
|
|
|
|
#define BINARY_INT_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_binary_int_sse(s, env, decode, \
|
|
|
|
NULL, \
|
|
|
|
gen_helper_##lname##_xmm, \
|
|
|
|
gen_helper_##lname##_ymm); \
|
|
|
|
}
|
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
/* Instructions with no MMX equivalent. */
|
2022-09-20 12:42:45 +03:00
|
|
|
BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
|
|
|
|
BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
BINARY_INT_SSE(VPACKUSDW, packusdw)
|
|
|
|
BINARY_INT_SSE(VPERMILPS, vpermilps)
|
|
|
|
BINARY_INT_SSE(VPERMILPD, vpermilpd)
|
|
|
|
BINARY_INT_SSE(VMASKMOVPS, vpmaskmovd)
|
|
|
|
BINARY_INT_SSE(VMASKMOVPD, vpmaskmovq)
|
|
|
|
|
|
|
|
BINARY_INT_SSE(PMULDQ, pmuldq)
|
|
|
|
|
|
|
|
BINARY_INT_SSE(VAESDEC, aesdec)
|
|
|
|
BINARY_INT_SSE(VAESDECLAST, aesdeclast)
|
|
|
|
BINARY_INT_SSE(VAESENC, aesenc)
|
|
|
|
BINARY_INT_SSE(VAESENCLAST, aesenclast)
|
|
|
|
|
|
|
|
#define UNARY_CMP_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
if (!s->vex_l) { \
|
2023-09-14 02:37:36 +03:00
|
|
|
gen_helper_##lname##_xmm(tcg_env, OP_PTR1, OP_PTR2); \
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
} else { \
|
2023-09-14 02:37:36 +03:00
|
|
|
gen_helper_##lname##_ymm(tcg_env, OP_PTR1, OP_PTR2); \
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
} \
|
|
|
|
set_cc_op(s, CC_OP_EFLAGS); \
|
|
|
|
}
|
|
|
|
UNARY_CMP_SSE(VPTEST, ptest)
|
|
|
|
UNARY_CMP_SSE(VTESTPS, vtestps)
|
|
|
|
UNARY_CMP_SSE(VTESTPD, vtestpd)
|
2022-09-20 12:42:45 +03:00
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
 * Emit a call to a two-pointer SSE/AVX helper on (OP_PTR0, OP_PTR2).
 *
 * @xmm is called when s->vex_l is clear, @ymm when it is set.
 * tcg_env is passed as the helper's first argument.
 */
static inline void gen_unary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
                                     SSEFunc_0_epp xmm, SSEFunc_0_epp ymm)
{
    SSEFunc_0_epp helper = s->vex_l ? ymm : xmm;

    helper(tcg_env, OP_PTR0, OP_PTR2);
}
|
|
|
|
|
|
|
|
#define UNARY_INT_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_unary_int_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##_xmm, \
|
|
|
|
gen_helper_##lname##_ymm); \
|
|
|
|
}
|
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
UNARY_INT_SSE(VPMOVSXBW, pmovsxbw)
|
|
|
|
UNARY_INT_SSE(VPMOVSXBD, pmovsxbd)
|
|
|
|
UNARY_INT_SSE(VPMOVSXBQ, pmovsxbq)
|
|
|
|
UNARY_INT_SSE(VPMOVSXWD, pmovsxwd)
|
|
|
|
UNARY_INT_SSE(VPMOVSXWQ, pmovsxwq)
|
|
|
|
UNARY_INT_SSE(VPMOVSXDQ, pmovsxdq)
|
|
|
|
|
|
|
|
UNARY_INT_SSE(VPMOVZXBW, pmovzxbw)
|
|
|
|
UNARY_INT_SSE(VPMOVZXBD, pmovzxbd)
|
|
|
|
UNARY_INT_SSE(VPMOVZXBQ, pmovzxbq)
|
|
|
|
UNARY_INT_SSE(VPMOVZXWD, pmovzxwd)
|
|
|
|
UNARY_INT_SSE(VPMOVZXWQ, pmovzxwq)
|
|
|
|
UNARY_INT_SSE(VPMOVZXDQ, pmovzxdq)
|
|
|
|
|
2022-09-18 00:22:36 +03:00
|
|
|
UNARY_INT_SSE(VMOVSLDUP, pmovsldup)
|
|
|
|
UNARY_INT_SSE(VMOVSHDUP, pmovshdup)
|
|
|
|
UNARY_INT_SSE(VMOVDDUP, pmovdldup)
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd)
|
|
|
|
UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq)
|
|
|
|
UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq)
|
2022-09-01 15:27:55 +03:00
|
|
|
UNARY_INT_SSE(VCVTDQ2PS, cvtdq2ps)
|
|
|
|
UNARY_INT_SSE(VCVTPS2DQ, cvtps2dq)
|
|
|
|
UNARY_INT_SSE(VCVTTPS2DQ, cvttps2dq)
|
2022-10-19 14:22:06 +03:00
|
|
|
UNARY_INT_SSE(VCVTPH2PS, cvtph2ps)
|
2022-09-01 15:27:55 +03:00
|
|
|
|
|
|
|
|
2022-09-02 19:19:06 +03:00
|
|
|
/*
 * Emit a call to an SSE/AVX helper taking (OP_PTR0, OP_PTR1, imm), where
 * imm is decode->immediate wrapped by tcg_constant8u_i32().
 *
 * @xmm is called when s->vex_l is clear, @ymm when it is set.  Unlike the
 * other dispatchers in this file, these helpers do not receive tcg_env.
 */
static inline void gen_unary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
                                     SSEFunc_0_ppi xmm, SSEFunc_0_ppi ymm)
{
    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);
    SSEFunc_0_ppi helper = s->vex_l ? ymm : xmm;

    helper(OP_PTR0, OP_PTR1, imm8);
}
|
|
|
|
|
|
|
|
#define UNARY_IMM_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_unary_imm_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##_xmm, \
|
|
|
|
gen_helper_##lname##_ymm); \
|
|
|
|
}
|
|
|
|
|
|
|
|
UNARY_IMM_SSE(PSHUFD, pshufd)
|
|
|
|
UNARY_IMM_SSE(PSHUFHW, pshufhw)
|
|
|
|
UNARY_IMM_SSE(PSHUFLW, pshuflw)
|
2022-09-06 11:34:11 +03:00
|
|
|
#define gen_helper_vpermq_xmm NULL
|
|
|
|
UNARY_IMM_SSE(VPERMQ, vpermq)
|
|
|
|
UNARY_IMM_SSE(VPERMILPS_i, vpermilps_imm)
|
|
|
|
UNARY_IMM_SSE(VPERMILPD_i, vpermilpd_imm)
|
|
|
|
|
|
|
|
/*
 * Emit a call to an SSE/AVX helper taking (tcg_env, OP_PTR0, OP_PTR1, imm),
 * where imm is decode->immediate wrapped by tcg_constant8u_i32().
 *
 * @xmm is called when s->vex_l is clear, @ymm when it is set.
 */
static inline void gen_unary_imm_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
                                        SSEFunc_0_eppi xmm, SSEFunc_0_eppi ymm)
{
    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);
    SSEFunc_0_eppi helper = s->vex_l ? ymm : xmm;

    helper(tcg_env, OP_PTR0, OP_PTR1, imm8);
}
|
|
|
|
|
|
|
|
#define UNARY_IMM_FP_SSE(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_unary_imm_fp_sse(s, env, decode, \
|
|
|
|
gen_helper_##lname##_xmm, \
|
|
|
|
gen_helper_##lname##_ymm); \
|
|
|
|
}
|
|
|
|
|
|
|
|
UNARY_IMM_FP_SSE(VROUNDPS, roundps)
|
|
|
|
UNARY_IMM_FP_SSE(VROUNDPD, roundpd)
|
2022-09-02 19:19:06 +03:00
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
/*
 * Emit a call to one of four three-pointer helpers on
 * (tcg_env, OP_PTR0, OP_PTR1, OP_PTR2).
 *
 * s->vex_w selects between the d_* and q_* helpers, and s->vex_l selects
 * between the *_xmm and *_ymm variants of the chosen pair.
 */
static inline void gen_vexw_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
                                SSEFunc_0_eppp d_xmm, SSEFunc_0_eppp q_xmm,
                                SSEFunc_0_eppp d_ymm, SSEFunc_0_eppp q_ymm)
{
    SSEFunc_0_eppp helper;

    if (s->vex_w) {
        helper = s->vex_l ? q_ymm : q_xmm;
    } else {
        helper = s->vex_l ? d_ymm : d_xmm;
    }
    helper(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
|
|
|
/* VEX.W affects whether to operate on 32- or 64-bit elements. */
|
|
|
|
#define VEXW_AVX(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_vexw_avx(s, env, decode, \
|
|
|
|
gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm, \
|
|
|
|
gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm); \
|
|
|
|
}
|
|
|
|
VEXW_AVX(VPSLLV, vpsllv)
|
|
|
|
VEXW_AVX(VPSRLV, vpsrlv)
|
|
|
|
VEXW_AVX(VPSRAV, vpsrav)
|
|
|
|
VEXW_AVX(VPMASKMOV, vpmaskmov)
|
|
|
|
|
|
|
|
/* Same as above, but with extra arguments to the helper. */
|
|
|
|
static inline void gen_vsib_avx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
|
|
|
|
SSEFunc_0_epppti d_xmm, SSEFunc_0_epppti q_xmm,
|
|
|
|
SSEFunc_0_epppti d_ymm, SSEFunc_0_epppti q_ymm)
|
|
|
|
{
|
|
|
|
SSEFunc_0_epppti d = s->vex_l ? d_ymm : d_xmm;
|
|
|
|
SSEFunc_0_epppti q = s->vex_l ? q_ymm : q_xmm;
|
|
|
|
SSEFunc_0_epppti fn = s->vex_w ? q : d;
|
|
|
|
TCGv_i32 scale = tcg_constant_i32(decode->mem.scale);
|
|
|
|
TCGv_ptr index = tcg_temp_new_ptr();
|
|
|
|
|
|
|
|
/* Pass third input as (index, base, scale) */
|
2023-09-14 02:37:36 +03:00
|
|
|
tcg_gen_addi_ptr(index, tcg_env, ZMM_OFFSET(decode->mem.index));
|
|
|
|
fn(tcg_env, OP_PTR0, OP_PTR1, index, s->A0, scale);
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
3) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* There are two output operands, so zero OP1's high 128 bits
|
|
|
|
* in the VEX.128 case.
|
|
|
|
*/
|
|
|
|
if (!s->vex_l) {
|
|
|
|
int ymmh_ofs = vector_elem_offset(&decode->op[1], MO_128, 1);
|
|
|
|
tcg_gen_gvec_dup_imm(MO_64, ymmh_ofs, 16, 16, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#define VSIB_AVX(uname, lname) \
|
|
|
|
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
|
|
|
|
{ \
|
|
|
|
gen_vsib_avx(s, env, decode, \
|
|
|
|
gen_helper_##lname##d_xmm, gen_helper_##lname##q_xmm, \
|
|
|
|
gen_helper_##lname##d_ymm, gen_helper_##lname##q_ymm); \
|
|
|
|
}
|
|
|
|
VSIB_AVX(VPGATHERD, vpgatherd)
|
|
|
|
VSIB_AVX(VPGATHERQ, vpgatherq)
|
|
|
|
|
2022-08-24 19:01:41 +03:00
|
|
|
/*
 * Common code for ADCX and ADOX: T0 = T0 + T1 + carry-in.
 *
 * @ot:    operand size.
 * @cc_op: CC_OP_ADCX or CC_OP_ADOX; selects the carry chain.  The carry-out
 *         is written to cpu_cc_dst for CC_OP_ADCX and to cpu_cc_src2
 *         otherwise.
 */
static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
{
    const int is_adcx = (cc_op == CC_OP_ADCX);
    TCGv cout = is_adcx ? cpu_cc_dst : cpu_cc_src2;
    TCGv cin;

    if (s->cc_op == cc_op || s->cc_op == CC_OP_ADCOX) {
        /* The carry-out of an earlier round is still live; chain from it. */
        cin = cout;
    } else {
        /*
         * No carry-in is available, so take it from EFLAGS.  The flags are
         * recomputed unless s->cc_op is already CC_OP_ADCX or CC_OP_ADOX.
         */
        if (s->cc_op != CC_OP_ADCX && s->cc_op != CC_OP_ADOX) {
            gen_compute_eflags(s);
        }
        cin = s->tmp0;
        tcg_gen_extract_tl(cin, cpu_cc_src, ctz32(is_adcx ? CC_C : CC_O), 1);
    }

    switch (ot) {
#ifdef TARGET_X86_64
    case MO_32:
        /* With a 64-bit TL, do the whole 32-bit add in 64-bit arithmetic. */
        tcg_gen_ext32u_tl(s->T0, s->T0);
        tcg_gen_ext32u_tl(s->T1, s->T1);
        tcg_gen_add_i64(s->T0, s->T0, s->T1);
        tcg_gen_add_i64(s->T0, s->T0, cin);
        tcg_gen_shri_i64(cout, s->T0, 32);
        break;
#endif
    default: {
        TCGv zero = tcg_constant_tl(0);

        /* Two double-word adds: first the carry-in, then the operand. */
        tcg_gen_add2_tl(s->T0, cout, s->T0, zero, cin, zero);
        tcg_gen_add2_tl(s->T0, cout, s->T0, cout, s->T1, zero);
        break;
    }
    }

    /*
     * The new cc_op is set only here, at the very end: the carry-in
     * selection above tests s->cc_op and must see the value it had when
     * this function was entered.
     */
    if (s->cc_op == CC_OP_ADCOX ||
        s->cc_op == (is_adcx ? CC_OP_ADOX : CC_OP_ADCX)) {
        /* Merge with the carry-out from the opposite instruction. */
        set_cc_op(s, CC_OP_ADCOX);
    } else {
        set_cc_op(s, cc_op);
    }
}
|
|
|
|
|
|
|
|
/* ADCX: run the shared gen_ADCOX() code with CC_OP_ADCX at operand 0's size. */
static void gen_ADCX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;

    gen_ADCOX(s, env, ot, CC_OP_ADCX);
}
|
|
|
|
|
|
|
|
/* ADOX: run the shared gen_ADCOX() code with CC_OP_ADOX at operand 0's size. */
static void gen_ADOX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;

    gen_ADCOX(s, env, ot, CC_OP_ADOX);
}
|
|
|
|
|
|
|
|
/*
 * ANDN: T0 = T1 & ~T0, then update the condition codes through
 * gen_op_update1_cc() and select CC_OP_LOGICB + operand-0 size.
 */
static void gen_ANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    tcg_gen_andc_tl(s->T0, s->T1, s->T0);
    gen_op_update1_cc(s);
    set_cc_op(s, CC_OP_LOGICB + decode->op[0].ot);
}
|
|
|
|
|
|
|
|
/*
 * BEXTR: extract a bit field from T0.  The start position is the low byte
 * of T1 and the length is bits 8..15 of T1.  The result is left in T0 and
 * the flags use CC_OP_LOGICB + operand-0 size.
 */
static void gen_BEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;
    TCGv limit = tcg_constant_tl(ot == MO_64 ? 63 : 31);
    TCGv zero = tcg_constant_tl(0);
    TCGv all_ones = tcg_constant_tl(-1);

    /*
     * START goes into A0 and the operand is shifted right by it.  A 32-bit
     * operand on a 64-bit target is zero-extended first.  The movcond then
     * replaces the result with zero whenever START exceeds the limit.
     */
    tcg_gen_ext8u_tl(s->A0, s->T1);
    if (TARGET_LONG_BITS == 64 && ot == MO_32) {
        tcg_gen_ext32u_tl(s->T0, s->T0);
    }
    tcg_gen_shr_tl(s->T0, s->T0, s->A0);
    tcg_gen_movcond_tl(TCG_COND_LEU, s->T0, s->A0, limit, s->T0, zero);

    /*
     * LEN is turned into an inverse mask in T1 (-1 << LEN).  When LEN
     * exceeds the limit the mask becomes zero, so the final andc keeps
     * every bit of the shifted operand.
     */
    tcg_gen_extract_tl(s->A0, s->T1, 8, 8);
    tcg_gen_shl_tl(s->T1, all_ones, s->A0);
    tcg_gen_movcond_tl(TCG_COND_LEU, s->T1, s->A0, limit, s->T1, zero);
    tcg_gen_andc_tl(s->T0, s->T0, s->T1);

    gen_op_update1_cc(s);
    set_cc_op(s, CC_OP_LOGICB + ot);
}
|
|
|
|
|
|
|
|
/*
 * BLSI: T0 = T0 & -T0.  The original operand is saved in cpu_cc_src and
 * the result in cpu_cc_dst; flags use CC_OP_BMILGB + operand-0 size.
 */
static void gen_BLSI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    tcg_gen_mov_tl(cpu_cc_src, s->T0);

    /* T1 is used as scratch for the negated operand. */
    tcg_gen_neg_tl(s->T1, s->T0);
    tcg_gen_and_tl(s->T0, s->T0, s->T1);

    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
    set_cc_op(s, CC_OP_BMILGB + decode->op[0].ot);
}
|
|
|
|
|
|
|
|
/*
 * BLSMSK: T0 = T0 ^ (T0 - 1).  The original operand is saved in cpu_cc_src
 * and the result in cpu_cc_dst; flags use CC_OP_BMILGB + operand-0 size.
 */
static void gen_BLSMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    tcg_gen_mov_tl(cpu_cc_src, s->T0);

    /* T1 is used as scratch for operand - 1. */
    tcg_gen_subi_tl(s->T1, s->T0, 1);
    tcg_gen_xor_tl(s->T0, s->T0, s->T1);

    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
    set_cc_op(s, CC_OP_BMILGB + decode->op[0].ot);
}
|
|
|
|
|
|
|
|
/*
 * BLSR: T0 = T0 & (T0 - 1).  The original operand is saved in cpu_cc_src
 * and the result in cpu_cc_dst; flags use CC_OP_BMILGB + operand-0 size.
 */
static void gen_BLSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    tcg_gen_mov_tl(cpu_cc_src, s->T0);

    /* T1 is used as scratch for operand - 1. */
    tcg_gen_subi_tl(s->T1, s->T0, 1);
    tcg_gen_and_tl(s->T0, s->T0, s->T1);

    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
    set_cc_op(s, CC_OP_BMILGB + decode->op[0].ot);
}
|
|
|
|
|
|
|
|
/*
 * BZHI: clear the bits of T0 from the position given by the low byte of
 * T1 upwards.  When that position exceeds the limit (63 for MO_64,
 * otherwise 31) the mask is zero and T0 is left unchanged.
 */
static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;
    TCGv limit = tcg_constant_tl(ot == MO_64 ? 63 : 31);
    TCGv zero = tcg_constant_tl(0);
    TCGv all_ones = tcg_constant_tl(-1);

    /* Only the low byte of T1 is the bit index. */
    tcg_gen_ext8u_tl(s->T1, s->T1);

    /*
     * BMILG is used so that O comes out cleared; with that cc_op the
     * value stored for C has to be the inverse, hence "index <= limit".
     */
    tcg_gen_setcond_tl(TCG_COND_LEU, cpu_cc_src, s->T1, limit);

    /* A0 = -1 << index, forced to zero when the index is out of range. */
    tcg_gen_shl_tl(s->A0, all_ones, s->T1);
    tcg_gen_movcond_tl(TCG_COND_LEU, s->A0, s->T1, limit, s->A0, zero);
    tcg_gen_andc_tl(s->T0, s->T0, s->A0);

    gen_op_update1_cc(s);
    set_cc_op(s, CC_OP_BMILGB + ot);
}
|
|
|
|
|
|
|
|
/*
 * CRC32: T0 = crc32 helper(low 32 bits of T0, T1, width), where width is
 * 8 << ot for the size of operand 2.
 */
static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 width;

    tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
    width = tcg_constant_i32(8 << decode->op[2].ot);
    gen_helper_crc32(s->T0, s->tmp2_i32, s->T1, width);
}
|
|
|
|
|
2022-09-06 19:44:02 +03:00
|
|
|
/*
 * CVTPI2PS / CVTPI2PD: call gen_helper_enter_mmx() first, then the pd
 * helper when PREFIX_DATA is set or the ps helper otherwise, on
 * (OP_PTR0, OP_PTR2).
 */
static void gen_CVTPI2Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_enter_mmx(tcg_env);
    if (!(s->prefix & PREFIX_DATA)) {
        gen_helper_cvtpi2ps(tcg_env, OP_PTR0, OP_PTR2);
    } else {
        gen_helper_cvtpi2pd(tcg_env, OP_PTR0, OP_PTR2);
    }
}
|
|
|
|
|
|
|
|
/*
 * CVTPS2PI / CVTPD2PI: call gen_helper_enter_mmx() first, then the pd
 * helper when PREFIX_DATA is set or the ps helper otherwise, on
 * (OP_PTR0, OP_PTR2).
 */
static void gen_CVTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_enter_mmx(tcg_env);
    if (!(s->prefix & PREFIX_DATA)) {
        gen_helper_cvtps2pi(tcg_env, OP_PTR0, OP_PTR2);
    } else {
        gen_helper_cvtpd2pi(tcg_env, OP_PTR0, OP_PTR2);
    }
}
|
|
|
|
|
|
|
|
/*
 * CVTTPS2PI / CVTTPD2PI: call gen_helper_enter_mmx() first, then the pd
 * helper when PREFIX_DATA is set or the ps helper otherwise, on
 * (OP_PTR0, OP_PTR2).
 */
static void gen_CVTTPx2PI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_enter_mmx(tcg_env);
    if (!(s->prefix & PREFIX_DATA)) {
        gen_helper_cvttps2pi(tcg_env, OP_PTR0, OP_PTR2);
    } else {
        gen_helper_cvttpd2pi(tcg_env, OP_PTR0, OP_PTR2);
    }
}
|
|
|
|
|
2022-09-02 19:19:06 +03:00
|
|
|
/* EMMS: all of the work is done by the emms helper. */
static void gen_EMMS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_emms(tcg_env);
}
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
 * EXTRQ (immediate form): the length is bits 0..5 of the immediate and the
 * index is bits 8..13; both are passed to the helper along with OP_PTR0.
 */
static void gen_EXTRQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 len = tcg_constant_i32(decode->immediate & 63);
    TCGv_i32 idx = tcg_constant_i32((decode->immediate >> 8) & 63);

    /* Note the helper's argument order: index first, then length. */
    gen_helper_extrq_i(tcg_env, OP_PTR0, idx, len);
}
|
|
|
|
|
|
|
|
/* EXTRQ (register form): call the extrq_r helper on (OP_PTR0, OP_PTR2). */
static void gen_EXTRQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_extrq_r(tcg_env, OP_PTR0, OP_PTR2);
}
|
|
|
|
|
|
|
|
/*
 * INSERTQ (immediate form): the length is bits 0..5 of the immediate and
 * the index is bits 8..13; both are passed to the helper along with
 * OP_PTR0 and OP_PTR1.
 */
static void gen_INSERTQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 len = tcg_constant_i32(decode->immediate & 63);
    TCGv_i32 idx = tcg_constant_i32((decode->immediate >> 8) & 63);

    /* Note the helper's argument order: index first, then length. */
    gen_helper_insertq_i(tcg_env, OP_PTR0, OP_PTR1, idx, len);
}
|
|
|
|
|
|
|
|
/* INSERTQ (register form): call the insertq_r helper on (OP_PTR0, OP_PTR2). */
static void gen_INSERTQ_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_insertq_r(tcg_env, OP_PTR0, OP_PTR2);
}
|
|
|
|
|
2022-09-11 14:22:32 +03:00
|
|
|
/* LDMXCSR: truncate T1 to 32 bits and hand it to the ldmxcsr helper. */
static void gen_LDMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 val32 = s->tmp2_i32;

    tcg_gen_trunc_tl_i32(val32, s->T1);
    gen_helper_ldmxcsr(tcg_env, val32);
}
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
 * MASKMOVQ / MASKMOVDQU: build the address in A0 from EDI (extended to the
 * address size, with the DS segment added by gen_add_A0_ds_seg()), then
 * call the xmm helper when PREFIX_DATA is set or the mmx helper otherwise.
 */
static void gen_MASKMOV(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    tcg_gen_mov_tl(s->A0, cpu_regs[R_EDI]);
    gen_extu(s->aflag, s->A0);
    gen_add_A0_ds_seg(s);

    if (!(s->prefix & PREFIX_DATA)) {
        gen_helper_maskmov_mmx(tcg_env, OP_PTR1, OP_PTR2, s->A0);
    } else {
        gen_helper_maskmov_xmm(tcg_env, OP_PTR1, OP_PTR2, s->A0);
    }
}
|
|
|
|
|
2022-08-24 19:01:41 +03:00
|
|
|
/*
 * MOVBE: big-endian load or store of T0 at address A0, using the size of
 * operand 0.
 */
static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const MemOp memop = decode->op[0].ot | MO_BE;

    /*
     * The M operand type does not load/store by itself, so the memory
     * access is emitted here: a load unless operand 0 is the M operand.
     */
    if (decode->e.op0 != X86_TYPE_M) {
        tcg_gen_qemu_ld_tl(s->T0, s->A0, s->mem_index, memop);
    } else {
        tcg_gen_qemu_st_tl(s->T0, s->A0, s->mem_index, memop);
    }
}
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
 * MOVD/MOVQ from a vector register: load T0 from operand 2's offset in
 * the CPU state.  Only MO_32 and (on TARGET_X86_64) MO_64 are valid sizes.
 */
static void gen_MOVD_from(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[2].ot;

    switch (ot) {
    case MO_32:
#ifdef TARGET_X86_64
        /* 64-bit TL: load 32 bits zero-extended. */
        tcg_gen_ld32u_tl(s->T0, tcg_env, decode->op[2].offset);
        break;
    case MO_64:
#endif
        /*
         * Full target-long load.  Without TARGET_X86_64 the MO_32 case
         * falls into this statement.
         */
        tcg_gen_ld_tl(s->T0, tcg_env, decode->op[2].offset);
        break;
    default:
        /* Same unreachable-path idiom as gen_MOVD_to(). */
        g_assert_not_reached();
    }
}
|
|
|
|
|
2022-09-20 12:42:45 +03:00
|
|
|
/*
 * MOVD/MOVQ to a vector register: zero the whole destination vector
 * (vector_len() bytes at operand 0's offset), then store T1 into element 0.
 * Only MO_32 and (on TARGET_X86_64) MO_64 are valid sizes.
 */
static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[2].ot;
    int len = vector_len(s, decode);
    int elem0_ofs = vector_elem_offset(&decode->op[0], ot, 0);

    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, len, len, 0);

    switch (ot) {
#ifdef TARGET_X86_64
    case MO_64:
        tcg_gen_st_tl(s->T1, tcg_env, elem0_ofs);
        break;
    case MO_32:
        /* 64-bit TL: store only the low 32 bits. */
        tcg_gen_st32_tl(s->T1, tcg_env, elem0_ofs);
        break;
#else
    case MO_32:
        tcg_gen_st_tl(s->T1, tcg_env, elem0_ofs);
        break;
#endif
    default:
        g_assert_not_reached();
    }
}
|
|
|
|
|
|
|
|
/* MOVDQ: hand operand 2's offset to gen_store_sse(). */
static void gen_MOVDQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_store_sse(s, decode, decode->op[2].offset);
}
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
 * MOVMSKPS / MOVMSKPD: PREFIX_DATA selects the pd helpers over the ps
 * ones, and s->vex_l selects ymm over xmm.  The 32-bit helper result is
 * zero-extended into T0.
 */
static void gen_MOVMSK(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    typeof(gen_helper_movmskps_ymm) *helper;

    if (s->prefix & PREFIX_DATA) {
        helper = s->vex_l ? gen_helper_movmskpd_ymm : gen_helper_movmskpd_xmm;
    } else {
        helper = s->vex_l ? gen_helper_movmskps_ymm : gen_helper_movmskps_xmm;
    }
    helper(s->tmp2_i32, tcg_env, OP_PTR2);
    tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
}
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
 * MOVQ: copy the 64-bit value at operand 2's offset either to memory at A0
 * (when operand 0 has an effective address) or into element 0 of the
 * destination vector, whose other bytes are cleared.
 */
static void gen_MOVQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int len = vector_len(s, decode);
    int elem0_ofs = vector_elem_offset(&decode->op[0], MO_64, 0);

    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset);

    if (decode->op[0].has_ea) {
        tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
        return;
    }

    /*
     * tcg_gen_gvec_dup_i64(MO_64, op0.offset, 8, vec_len, s->tmp1_64) looks
     * like it would do the job, but it fails on big-endian platforms.
     * gvec always clears the parts at higher addresses, whereas
     * cross-endian emulation inverts the byte order, so the cleared parts
     * must be at *lower* addresses.  With oprsz == 8 this shows up even
     * for SSE; more generally it rules out using oprsz < maxsz to emulate
     * VEX128.  So: clear the whole vector, then store element 0.
     */
    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, len, len, 0);
    tcg_gen_st_i64(s->tmp1_i64, tcg_env, elem0_ofs);
}
|
|
|
|
|
|
|
|
/*
 * MOVQ variant that first calls gen_helper_enter_mmx(); the rest is
 * delegated to gen_MOVQ().
 */
static void gen_MOVq_dq(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_enter_mmx(tcg_env);
    /*
     * Otherwise the same as any other movq.  This is a plain call rather
     * than "return gen_MOVQ(...)": ISO C does not allow a return statement
     * with an expression in a void function.
     */
    gen_MOVQ(s, env, decode);
}
|
|
|
|
|
2022-08-24 19:01:41 +03:00
|
|
|
/*
 * MULX: unsigned multiply of T0 by T1.  The low part of the result goes
 * directly to the register named by VEX.vvvv (cpu_regs[s->vex_v]); the
 * high part is left in T0 for the MODRM destination.
 */
static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    switch (decode->op[0].ot) {
#ifdef TARGET_X86_64
    case MO_64:
        tcg_gen_mulu2_i64(cpu_regs[s->vex_v], s->T0, s->T0, s->T1);
        break;
#endif
    default:
        /* Every other size: 32x32 multiply on i32 temps, then widen. */
        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
        tcg_gen_trunc_tl_i32(s->tmp3_i32, s->T1);
        tcg_gen_mulu2_i32(s->tmp2_i32, s->tmp3_i32,
                          s->tmp2_i32, s->tmp3_i32);
        tcg_gen_extu_i32_tl(cpu_regs[s->vex_v], s->tmp2_i32);
        tcg_gen_extu_i32_tl(s->T0, s->tmp3_i32);
        break;
    }
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * PALIGNR: call the mmx helper when PREFIX_DATA is absent; otherwise the
 * xmm or ymm helper according to s->vex_l.  The 8-bit immediate is passed
 * as the last argument.
 */
static void gen_PALIGNR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);

    if (s->prefix & PREFIX_DATA) {
        if (s->vex_l) {
            gen_helper_palignr_ymm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm8);
        } else {
            gen_helper_palignr_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm8);
        }
    } else {
        gen_helper_palignr_mmx(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, imm8);
    }
}
|
|
|
|
|
2022-09-05 16:39:36 +03:00
|
|
|
/*
 * PANDN: op0 = op2 & ~op1 over the whole vector, emitted as one gvec andc.
 */
static void gen_PANDN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const int len = vector_len(s, decode);

    /*
     * Careful: the sources are passed in reversed order (operand 2 first,
     * operand 1 second), so operand 1 is the one that gets complemented.
     */
    tcg_gen_gvec_andc(MO_64, decode->op[0].offset,
                      decode->op[2].offset, decode->op[1].offset,
                      len, len);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * PCMPESTRI: call the helper on (OP_PTR1, OP_PTR2, imm8) and switch to
 * CC_OP_EFLAGS.
 */
static void gen_PCMPESTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);

    gen_helper_pcmpestri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm8);
    set_cc_op(s, CC_OP_EFLAGS);
}
|
|
|
|
|
|
|
|
/*
 * PCMPESTRM: call the helper on (OP_PTR1, OP_PTR2, imm8) and switch to
 * CC_OP_EFLAGS.  For a VEX encoding with vex_l clear, the 16 bytes at
 * xmm_regs[0].ZMM_X(1) are zeroed afterwards.
 */
static void gen_PCMPESTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 imm8 = tcg_constant8u_i32(decode->immediate);

    gen_helper_pcmpestrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm8);
    set_cc_op(s, CC_OP_EFLAGS);

    if (!(s->prefix & PREFIX_VEX) || s->vex_l) {
        return;
    }
    tcg_gen_gvec_dup_imm(MO_64,
                         offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
                         16, 16, 0);
}
|
|
|
|
|
|
|
|
/*
 * PCMPISTRI: call the XMM helper with operands 1 and 2 plus the 8-bit
 * immediate, then record that the flags are in CC_OP_EFLAGS form.
 */
static void gen_PCMPISTRI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);

    gen_helper_pcmpistri_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
    set_cc_op(s, CC_OP_EFLAGS);
}
|
|
|
|
|
|
|
|
/*
 * PCMPISTRM: call the XMM helper with operands 1 and 2 plus the 8-bit
 * immediate and switch the flags to CC_OP_EFLAGS.  For the VEX encoding
 * with VEX.L clear, the 16 bytes at xmm_regs[0].ZMM_X(1) are zeroed
 * afterwards.
 */
static void gen_PCMPISTRM(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);

    gen_helper_pcmpistrm_xmm(tcg_env, OP_PTR1, OP_PTR2, imm);
    set_cc_op(s, CC_OP_EFLAGS);
    if ((s->prefix & PREFIX_VEX) && !s->vex_l) {
        tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[0].ZMM_X(1)),
                             16, 16, 0);
    }
}
|
|
|
|
|
2022-08-24 19:01:41 +03:00
|
|
|
/*
 * PDEP: T0 = pdep(T0, T1).  When operand 1 is narrower than 64 bits,
 * T0 is zero-extended from 32 bits before the helper is called.
 */
static void gen_PDEP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[1].ot;

    if (ot < MO_64) {
        tcg_gen_ext32u_tl(s->T0, s->T0);
    }
    gen_helper_pdep(s->T0, s->T0, s->T1);
}
|
|
|
|
|
|
|
|
/*
 * PEXT: T0 = pext(T0, T1).  When operand 1 is narrower than 64 bits,
 * T0 is zero-extended from 32 bits before the helper is called.
 */
static void gen_PEXT(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[1].ot;

    if (ot < MO_64) {
        tcg_gen_ext32u_tl(s->T0, s->T0);
    }
    gen_helper_pext(s->T0, s->T0, s->T1);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * Common code for the PEXTR* family: load one element of size @ot from
 * operand 1 into T0, zero-extended.
 *
 * The element index is the immediate masked by (number of elements - 1),
 * where the number of elements is vec_len >> ot.
 *
 * On TARGET_X86_64, MO_32 uses a 32-bit zero-extending load and MO_64 a
 * full target_long load.  On 32-bit targets the "case MO_64" label is
 * compiled out, so MO_32 uses the target_long load and MO_64 reaches
 * the default case and aborts.
 */
static inline void gen_pextr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot)
{
    int vec_len = vector_len(s, decode);
    int mask = (vec_len >> ot) - 1;
    int val = decode->immediate & mask;

    switch (ot) {
    case MO_8:
        tcg_gen_ld8u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
        break;
    case MO_16:
        tcg_gen_ld16u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
        break;
    case MO_32:
#ifdef TARGET_X86_64
        tcg_gen_ld32u_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
        break;
    case MO_64:
#endif
        tcg_gen_ld_tl(s->T0, tcg_env, vector_elem_offset(&decode->op[1], ot, val));
        break;
    default:
        abort();
    }
}
|
|
|
|
|
|
|
|
/* PEXTRB: extract a byte element via gen_pextr(). */
static void gen_PEXTRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_pextr(s, env, decode, MO_8);
}
|
|
|
|
|
|
|
|
/* PEXTRW: extract a 16-bit element via gen_pextr(). */
static void gen_PEXTRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_pextr(s, env, decode, MO_16);
}
|
|
|
|
|
|
|
|
/*
 * PEXTR with an operand-size dependent element: the element size is
 * taken from the destination operand (op[0].ot).
 */
static void gen_PEXTR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;

    gen_pextr(s, env, decode, ot);
}
|
|
|
|
|
|
|
|
/*
 * Common code for the PINSR* family: store the low @ot-sized part of T1
 * into one element of the destination vector (operand 0).
 *
 * The element index is the immediate masked by (number of elements - 1).
 * If operand 1 lives at a different offset than operand 0, it is first
 * passed to gen_store_sse() (only legal for 16-byte vectors, as
 * asserted) so that the store below only replaces the selected element.
 *
 * On TARGET_X86_64, MO_32 uses a 32-bit store and MO_64 a target_long
 * store.  On 32-bit targets the "case MO_64" label is compiled out, so
 * MO_32 uses the target_long store and MO_64 aborts.
 */
static inline void gen_pinsr(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, MemOp ot)
{
    int vec_len = vector_len(s, decode);
    int mask = (vec_len >> ot) - 1;
    int val = decode->immediate & mask;

    if (decode->op[1].offset != decode->op[0].offset) {
        assert(vec_len == 16);
        gen_store_sse(s, decode, decode->op[1].offset);
    }

    switch (ot) {
    case MO_8:
        tcg_gen_st8_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
        break;
    case MO_16:
        tcg_gen_st16_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
        break;
    case MO_32:
#ifdef TARGET_X86_64
        tcg_gen_st32_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
        break;
    case MO_64:
#endif
        tcg_gen_st_tl(s->T1, tcg_env, vector_elem_offset(&decode->op[0], ot, val));
        break;
    default:
        abort();
    }
}
|
|
|
|
|
|
|
|
/* PINSRB: insert a byte element via gen_pinsr(). */
static void gen_PINSRB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_pinsr(s, env, decode, MO_8);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/* PINSRW: insert a 16-bit element via gen_pinsr(). */
static void gen_PINSRW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_pinsr(s, env, decode, MO_16);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * PINSR with an operand-size dependent element: the element size is
 * taken from the source operand (op[2].ot).
 */
static void gen_PINSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_pinsr(s, env, decode, decode->op[2].ot);
}
|
|
|
|
|
2022-09-07 10:25:06 +03:00
|
|
|
/*
 * 64-bit integer expansion used by PMOVMSKB: gather the top bit of each
 * of the eight bytes of @s into the most significant byte of @d.
 */
static void gen_pmovmskb_i64(TCGv_i64 d, TCGv_i64 s)
{
    TCGv_i64 t = tcg_temp_new_i64();

    /* Keep only the sign bit of every byte. */
    tcg_gen_andi_i64(d, s, 0x8080808080808080ull);

    /*
     * After each shift+or pair:
     *   0:  a.......b.......c.......d.......e.......f.......g.......h.......
     *   7:  ab......bc......cd......de......ef......fg......gh......h.......
     *  14:  abcd....bcde....cdef....defg....efgh....fgh.....gh......h.......
     *  28:  abcdefghbcdefgh.cdefgh..defgh...efgh....fgh.....gh......h.......
     * The result is left in the high bits of the word.
     */
    tcg_gen_shli_i64(t, d, 7);
    tcg_gen_or_i64(d, d, t);
    tcg_gen_shli_i64(t, d, 14);
    tcg_gen_or_i64(d, d, t);
    tcg_gen_shli_i64(t, d, 28);
    tcg_gen_or_i64(d, d, t);
}
|
|
|
|
|
|
|
|
/*
 * Host-vector expansion used by PMOVMSKB.  Performs the same
 * mask / shift-by-7 / shift-by-14 / shift-by-28 sequence as
 * gen_pmovmskb_i64(), on elements of size @vece.
 */
static void gen_pmovmskb_vec(unsigned vece, TCGv_vec d, TCGv_vec s)
{
    TCGv_vec t = tcg_temp_new_vec_matching(d);
    TCGv_vec m = tcg_constant_vec_matching(d, MO_8, 0x80);

    /* See above */
    tcg_gen_and_vec(vece, d, s, m);
    tcg_gen_shli_vec(vece, t, d, 7);
    tcg_gen_or_vec(vece, d, d, t);
    tcg_gen_shli_vec(vece, t, d, 14);
    tcg_gen_or_vec(vece, d, d, t);
    tcg_gen_shli_vec(vece, t, d, 28);
    tcg_gen_or_vec(vece, d, d, t);
}
|
|
|
|
|
|
|
|
/*
 * Target-long flavour of the "host has extract2" capability macro:
 * aliases the i64 variant on TARGET_X86_64 and the i32 variant otherwise.
 */
#ifdef TARGET_X86_64
#define TCG_TARGET_HAS_extract2_tl TCG_TARGET_HAS_extract2_i64
#else
#define TCG_TARGET_HAS_extract2_tl TCG_TARGET_HAS_extract2_i32
#endif
|
|
|
|
|
2022-09-01 15:27:55 +03:00
|
|
|
/*
 * PMOVMSKB: collect the sign bit of every byte of operand 2 into T0.
 *
 * A gvec operation first compresses each 64-bit lane of the source into
 * the top byte of the corresponding lane of the xmm_t0 scratch register
 * (see gen_pmovmskb_i64()).  The top byte of the last lane is loaded
 * into T0, and the loop then walks the remaining lanes downwards,
 * shifting T0 left by 8 and merging in the next mask byte each time.
 */
static void gen_PMOVMSKB(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
    static const GVecGen2 g = {
        .fni8 = gen_pmovmskb_i64,
        .fniv = gen_pmovmskb_vec,
        .opt_opc = vecop_list,
        .vece = MO_64,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64
    };
    MemOp ot = decode->op[2].ot;
    int vec_len = vector_len(s, decode);
    TCGv t = tcg_temp_new();

    tcg_gen_gvec_2(offsetof(CPUX86State, xmm_t0) + xmm_offset(ot), decode->op[2].offset,
                   vec_len, vec_len, &g);
    tcg_gen_ld8u_tl(s->T0, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
    while (vec_len > 8) {
        vec_len -= 8;
        if (TCG_TARGET_HAS_extract2_tl) {
            /*
             * Load the next byte of the result into the high byte of T.
             * TCG does a similar expansion of deposit to shl+extract2; by
             * loading the whole word, the shift left is avoided.
             */
#ifdef TARGET_X86_64
            tcg_gen_ld_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_Q((vec_len - 1) / 8)));
#else
            tcg_gen_ld_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_L((vec_len - 1) / 4)));
#endif

            tcg_gen_extract2_tl(s->T0, t, s->T0, TARGET_LONG_BITS - 8);
        } else {
            /*
             * The _previous_ value is deposited into bits 8 and higher of t.  Because
             * those bits are known to be zero after ld8u, this becomes a shift+or
             * if deposit is not available.
             */
            tcg_gen_ld8u_tl(t, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_B(vec_len - 1)));
            tcg_gen_deposit_tl(s->T0, t, s->T0, 8, TARGET_LONG_BITS - 8);
        }
    }
}
|
|
|
|
|
2022-09-02 19:19:06 +03:00
|
|
|
/*
 * PSHUFW: call the MMX shuffle helper with operands 0 and 1 and the
 * 8-bit immediate.  Unlike most helpers in this file, this one is not
 * passed tcg_env.
 */
static void gen_PSHUFW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);

    gen_helper_pshufw_mmx(OP_PTR0, OP_PTR1, imm);
}
|
|
|
|
|
|
|
|
/*
 * PSRLW with immediate count: logical right shift of 16-bit elements.
 * A count of 16 or more clears the destination instead, since the gvec
 * shift is only emitted for counts below the element width.
 */
static void gen_PSRLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);

    if (decode->immediate >= 16) {
        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
    } else {
        tcg_gen_gvec_shri(MO_16,
                          decode->op[0].offset, decode->op[1].offset,
                          decode->immediate, vec_len, vec_len);
    }
}
|
|
|
|
|
|
|
|
/*
 * PSLLW with immediate count: left shift of 16-bit elements.
 * A count of 16 or more clears the destination instead, since the gvec
 * shift is only emitted for counts below the element width.
 */
static void gen_PSLLW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);

    if (decode->immediate >= 16) {
        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
    } else {
        tcg_gen_gvec_shli(MO_16,
                          decode->op[0].offset, decode->op[1].offset,
                          decode->immediate, vec_len, vec_len);
    }
}
|
|
|
|
|
|
|
|
/*
 * PSRAW with immediate count: arithmetic right shift of 16-bit elements.
 * Counts of 16 or more are clamped to 15; note that the clamp is written
 * back into decode->immediate.
 */
static void gen_PSRAW_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);

    if (decode->immediate >= 16) {
        decode->immediate = 15;
    }
    tcg_gen_gvec_sari(MO_16,
                      decode->op[0].offset, decode->op[1].offset,
                      decode->immediate, vec_len, vec_len);
}
|
|
|
|
|
|
|
|
/*
 * PSRLD with immediate count: logical right shift of 32-bit elements.
 * A count of 32 or more clears the destination instead.
 */
static void gen_PSRLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);

    if (decode->immediate >= 32) {
        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
    } else {
        tcg_gen_gvec_shri(MO_32,
                          decode->op[0].offset, decode->op[1].offset,
                          decode->immediate, vec_len, vec_len);
    }
}
|
|
|
|
|
|
|
|
/*
 * PSLLD with immediate count: left shift of 32-bit elements.
 * A count of 32 or more clears the destination instead.
 */
static void gen_PSLLD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);

    if (decode->immediate >= 32) {
        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
    } else {
        tcg_gen_gvec_shli(MO_32,
                          decode->op[0].offset, decode->op[1].offset,
                          decode->immediate, vec_len, vec_len);
    }
}
|
|
|
|
|
|
|
|
/*
 * PSRAD with immediate count: arithmetic right shift of 32-bit elements.
 * Counts of 32 or more are clamped to 31; note that the clamp is written
 * back into decode->immediate.
 */
static void gen_PSRAD_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);

    if (decode->immediate >= 32) {
        decode->immediate = 31;
    }
    tcg_gen_gvec_sari(MO_32,
                      decode->op[0].offset, decode->op[1].offset,
                      decode->immediate, vec_len, vec_len);
}
|
|
|
|
|
|
|
|
/*
 * PSRLQ with immediate count: logical right shift of 64-bit elements.
 * A count of 64 or more clears the destination instead.
 */
static void gen_PSRLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);

    if (decode->immediate >= 64) {
        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
    } else {
        tcg_gen_gvec_shri(MO_64,
                          decode->op[0].offset, decode->op[1].offset,
                          decode->immediate, vec_len, vec_len);
    }
}
|
|
|
|
|
|
|
|
/*
 * PSLLQ with immediate count: left shift of 64-bit elements.
 * A count of 64 or more clears the destination instead.
 */
static void gen_PSLLQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);

    if (decode->immediate >= 64) {
        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
    } else {
        tcg_gen_gvec_shli(MO_64,
                          decode->op[0].offset, decode->op[1].offset,
                          decode->immediate, vec_len, vec_len);
    }
}
|
|
|
|
|
|
|
|
/*
 * Materialize an 8-bit unsigned immediate as a vector in the xmm_t0
 * scratch register, for helpers that take their count as a vector
 * operand.
 *
 * @vec_len bytes of scratch (at xmm_t0 + xmm_offset(ot)) are zeroed,
 * then @imm is stored as a 32-bit value into xmm_t0.ZMM_L(0).
 *
 * Returns a pointer temporary set to the address of xmm_t0 within the
 * CPU state.
 */
static TCGv_ptr make_imm8u_xmm_vec(uint8_t imm, int vec_len)
{
    MemOp ot = vec_len == 16 ? MO_128 : MO_256;
    TCGv_i32 imm_v = tcg_constant8u_i32(imm);
    TCGv_ptr ptr = tcg_temp_new_ptr();

    tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_t0) + xmm_offset(ot),
                         vec_len, vec_len, 0);

    tcg_gen_addi_ptr(ptr, tcg_env, offsetof(CPUX86State, xmm_t0));
    tcg_gen_st_i32(imm_v, tcg_env, offsetof(CPUX86State, xmm_t0.ZMM_L(0)));
    return ptr;
}
|
|
|
|
|
|
|
|
/*
 * PSRLDQ with immediate count: the count is placed in a scratch vector
 * by make_imm8u_xmm_vec() and passed to the YMM or XMM helper depending
 * on s->vex_l.
 */
static void gen_PSRLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);
    TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);

    if (s->vex_l) {
        gen_helper_psrldq_ymm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
    } else {
        gen_helper_psrldq_xmm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
    }
}
|
|
|
|
|
|
|
|
/*
 * PSLLDQ with immediate count: the count is placed in a scratch vector
 * by make_imm8u_xmm_vec() and passed to the YMM or XMM helper depending
 * on s->vex_l.
 */
static void gen_PSLLDQ_i(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);
    TCGv_ptr imm_vec = make_imm8u_xmm_vec(decode->immediate, vec_len);

    if (s->vex_l) {
        gen_helper_pslldq_ymm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
    } else {
        gen_helper_pslldq_xmm(tcg_env, OP_PTR0, OP_PTR1, imm_vec);
    }
}
|
|
|
|
|
2022-08-24 19:01:41 +03:00
|
|
|
/*
 * RORX: rotate T0 right by the immediate.  For 64-bit operands the
 * count is masked to 6 bits and the rotate is done on the full
 * register; otherwise the value is truncated to 32 bits, rotated by the
 * count masked to 5 bits, and zero-extended back into T0.
 */
static void gen_RORX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;
    int b = decode->immediate;

    if (ot == MO_64) {
        tcg_gen_rotri_tl(s->T0, s->T0, b & 63);
    } else {
        tcg_gen_trunc_tl_i32(s->tmp2_i32, s->T0);
        tcg_gen_rotri_i32(s->tmp2_i32, s->tmp2_i32, b & 31);
        tcg_gen_extu_i32_tl(s->T0, s->tmp2_i32);
    }
}
|
|
|
|
|
|
|
|
/*
 * SARX: T0 = T0 >> T1 (arithmetic).  The count in T1 is masked to 63
 * for 64-bit operands and to 31 otherwise; narrower operands are
 * sign-extended from 32 bits first so the shift brings in the right
 * sign bits.
 */
static void gen_SARX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;
    int mask;

    mask = ot == MO_64 ? 63 : 31;
    tcg_gen_andi_tl(s->T1, s->T1, mask);
    if (ot != MO_64) {
        tcg_gen_ext32s_tl(s->T0, s->T0);
    }
    tcg_gen_sar_tl(s->T0, s->T0, s->T1);
}
|
|
|
|
|
2023-10-10 11:31:17 +03:00
|
|
|
/* SHA1NEXTE: thin wrapper around the helper, operands 0, 1 and 2. */
static void gen_SHA1NEXTE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_sha1nexte(OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
|
|
|
/* SHA1MSG1: thin wrapper around the helper, operands 0, 1 and 2. */
static void gen_SHA1MSG1(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_sha1msg1(OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
|
|
|
/* SHA1MSG2: thin wrapper around the helper, operands 0, 1 and 2. */
static void gen_SHA1MSG2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_sha1msg2(OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
|
|
|
/*
 * SHA1RNDS4: the low two bits of the immediate select one of the four
 * round-function helpers (f0..f3).  Operand 0 is passed both as
 * destination and as first source.  The switch needs no default because
 * "& 3" can only yield 0..3.
 */
static void gen_SHA1RNDS4(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    switch(decode->immediate & 3) {
    case 0:
        gen_helper_sha1rnds4_f0(OP_PTR0, OP_PTR0, OP_PTR1);
        break;
    case 1:
        gen_helper_sha1rnds4_f1(OP_PTR0, OP_PTR0, OP_PTR1);
        break;
    case 2:
        gen_helper_sha1rnds4_f2(OP_PTR0, OP_PTR0, OP_PTR1);
        break;
    case 3:
        gen_helper_sha1rnds4_f3(OP_PTR0, OP_PTR0, OP_PTR1);
        break;
    }
}
|
|
|
|
|
|
|
|
/* SHA256MSG1: thin wrapper around the helper, operands 0, 1 and 2. */
static void gen_SHA256MSG1(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_sha256msg1(OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
|
|
|
/* SHA256MSG2: thin wrapper around the helper, operands 0, 1 and 2. */
static void gen_SHA256MSG2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_sha256msg2(OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
|
|
|
/*
 * SHA256RNDS2: besides the three decoded operands, the helper receives
 * two 32-bit values loaded from the low two dwords (ZMM_L(0) and
 * ZMM_L(1)) of the register at ZMM_OFFSET(0), i.e. the implicit XMM0
 * operand.
 */
static void gen_SHA256RNDS2(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 wk0 = tcg_temp_new_i32();
    TCGv_i32 wk1 = tcg_temp_new_i32();

    tcg_gen_ld_i32(wk0, tcg_env, ZMM_OFFSET(0) + offsetof(ZMMReg, ZMM_L(0)));
    tcg_gen_ld_i32(wk1, tcg_env, ZMM_OFFSET(0) + offsetof(ZMMReg, ZMM_L(1)));

    gen_helper_sha256rnds2(OP_PTR0, OP_PTR1, OP_PTR2, wk0, wk1);
}
|
|
|
|
|
2022-08-24 19:01:41 +03:00
|
|
|
/*
 * SHLX: T0 = T0 << T1.  The count in T1 is masked to 63 for 64-bit
 * operands and to 31 otherwise.  No extension of T0 is done here.
 */
static void gen_SHLX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;
    int mask;

    mask = ot == MO_64 ? 63 : 31;
    tcg_gen_andi_tl(s->T1, s->T1, mask);
    tcg_gen_shl_tl(s->T0, s->T0, s->T1);
}
|
|
|
|
|
|
|
|
/*
 * SHRX: T0 = T0 >> T1 (logical).  The count in T1 is masked to 63 for
 * 64-bit operands and to 31 otherwise; narrower operands are
 * zero-extended from 32 bits first so no stale high bits are shifted in.
 */
static void gen_SHRX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    MemOp ot = decode->op[0].ot;
    int mask;

    mask = ot == MO_64 ? 63 : 31;
    tcg_gen_andi_tl(s->T1, s->T1, mask);
    if (ot != MO_64) {
        tcg_gen_ext32u_tl(s->T0, s->T0);
    }
    tcg_gen_shr_tl(s->T0, s->T0, s->T1);
}
|
2022-09-01 15:27:55 +03:00
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * (V)AESKEYGENASSIST: XMM-only; the assertion documents that this
 * emitter must never be reached with VEX.L set.
 */
static void gen_VAESKEYGEN(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 imm = tcg_constant8u_i32(decode->immediate);

    assert(!s->vex_l);
    gen_helper_aeskeygenassist_xmm(tcg_env, OP_PTR0, OP_PTR1, imm);
}
|
|
|
|
|
2022-09-11 14:22:32 +03:00
|
|
|
/*
 * STMXCSR: call the update_mxcsr helper, then load the 32-bit
 * env->mxcsr field, zero-extended, into T0.
 */
static void gen_STMXCSR(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_update_mxcsr(tcg_env);
    tcg_gen_ld32u_tl(s->T0, tcg_env, offsetof(CPUX86State, mxcsr));
}
|
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
/*
 * (V)AESIMC: XMM-only (asserted).  The helper takes operand 0 as
 * destination and operand 2 as source.
 */
static void gen_VAESIMC(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    assert(!s->vex_l);
    gen_helper_aesimc_xmm(tcg_env, OP_PTR0, OP_PTR2);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
|
|
|
|
* 00 = v*ps Vps, Hps, Wpd
|
|
|
|
* 66 = v*pd Vpd, Hpd, Wps
|
|
|
|
* f3 = v*ss Vss, Hss, Wps
|
|
|
|
* f2 = v*sd Vsd, Hsd, Wps
|
|
|
|
*/
|
|
|
|
#define SSE_CMP(x) { \
|
|
|
|
gen_helper_ ## x ## ps ## _xmm, gen_helper_ ## x ## pd ## _xmm, \
|
|
|
|
gen_helper_ ## x ## ss, gen_helper_ ## x ## sd, \
|
|
|
|
gen_helper_ ## x ## ps ## _ymm, gen_helper_ ## x ## pd ## _ymm}
|
|
|
|
static const SSEFunc_0_eppp gen_helper_cmp_funcs[32][6] = {
|
|
|
|
SSE_CMP(cmpeq),
|
|
|
|
SSE_CMP(cmplt),
|
|
|
|
SSE_CMP(cmple),
|
|
|
|
SSE_CMP(cmpunord),
|
|
|
|
SSE_CMP(cmpneq),
|
|
|
|
SSE_CMP(cmpnlt),
|
|
|
|
SSE_CMP(cmpnle),
|
|
|
|
SSE_CMP(cmpord),
|
|
|
|
|
|
|
|
SSE_CMP(cmpequ),
|
|
|
|
SSE_CMP(cmpnge),
|
|
|
|
SSE_CMP(cmpngt),
|
|
|
|
SSE_CMP(cmpfalse),
|
|
|
|
SSE_CMP(cmpnequ),
|
|
|
|
SSE_CMP(cmpge),
|
|
|
|
SSE_CMP(cmpgt),
|
|
|
|
SSE_CMP(cmptrue),
|
|
|
|
|
|
|
|
SSE_CMP(cmpeqs),
|
|
|
|
SSE_CMP(cmpltq),
|
|
|
|
SSE_CMP(cmpleq),
|
|
|
|
SSE_CMP(cmpunords),
|
|
|
|
SSE_CMP(cmpneqq),
|
|
|
|
SSE_CMP(cmpnltq),
|
|
|
|
SSE_CMP(cmpnleq),
|
|
|
|
SSE_CMP(cmpords),
|
|
|
|
|
|
|
|
SSE_CMP(cmpequs),
|
|
|
|
SSE_CMP(cmpngeq),
|
|
|
|
SSE_CMP(cmpngtq),
|
|
|
|
SSE_CMP(cmpfalses),
|
|
|
|
SSE_CMP(cmpnequs),
|
|
|
|
SSE_CMP(cmpgeq),
|
|
|
|
SSE_CMP(cmpgtq),
|
|
|
|
SSE_CMP(cmptrues),
|
|
|
|
};
|
|
|
|
#undef SSE_CMP
|
|
|
|
|
|
|
|
/*
 * (V)CMPPS/PD/SS/SD: look up the helper in gen_helper_cmp_funcs.
 *
 * The row is the immediate predicate, masked to 5 bits for VEX
 * encodings and to 3 bits otherwise.  The column is 2 for REPZ (ss),
 * 3 for REPNZ (sd), and otherwise 0 (ps) or 1 (pd, PREFIX_DATA) plus 4
 * when VEX.L is set, which selects the YMM variants.
 */
static void gen_VCMP(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int index = decode->immediate & (s->prefix & PREFIX_VEX ? 31 : 7);
    int b =
        s->prefix & PREFIX_REPZ  ? 2 /* ss */ :
        s->prefix & PREFIX_REPNZ ? 3 /* sd */ :
        !!(s->prefix & PREFIX_DATA) /* pd */ + (s->vex_l << 2);

    gen_helper_cmp_funcs[index][b](tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
2022-09-06 19:44:02 +03:00
|
|
|
/*
 * (V)COMISS/COMISD: PREFIX_DATA selects the double-precision helper.
 * The helper is given operands 1 and 2 only; afterwards the flags are
 * marked as being in CC_OP_EFLAGS form.
 */
static void gen_VCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    SSEFunc_0_epp fn;

    fn = s->prefix & PREFIX_DATA ? gen_helper_comisd : gen_helper_comiss;
    fn(tcg_env, OP_PTR1, OP_PTR2);
    set_cc_op(s, CC_OP_EFLAGS);
}
|
|
|
|
|
2023-08-29 19:28:33 +03:00
|
|
|
/*
 * (V)CVTPD2PS: s->vex_l selects the YMM or XMM helper; both take
 * operand 0 as destination and operand 2 as source.
 */
static void gen_VCVTPD2PS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    if (s->vex_l) {
        gen_helper_cvtpd2ps_ymm(tcg_env, OP_PTR0, OP_PTR2);
    } else {
        gen_helper_cvtpd2ps_xmm(tcg_env, OP_PTR0, OP_PTR2);
    }
}
|
|
|
|
|
|
|
|
/*
 * (V)CVTPS2PD: s->vex_l selects the YMM or XMM helper; both take
 * operand 0 as destination and operand 2 as source.
 */
static void gen_VCVTPS2PD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    if (s->vex_l) {
        gen_helper_cvtps2pd_ymm(tcg_env, OP_PTR0, OP_PTR2);
    } else {
        gen_helper_cvtps2pd_xmm(tcg_env, OP_PTR0, OP_PTR2);
    }
}
|
2022-09-02 19:19:06 +03:00
|
|
|
|
2022-10-19 14:22:06 +03:00
|
|
|
/*
 * VCVTPS2PH: run the conversion through gen_unary_imm_fp_sse() with the
 * XMM/YMM helpers, then, if the destination is a memory operand, write
 * the result out with gen_store_sse().
 */
static void gen_VCVTPS2PH(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_unary_imm_fp_sse(s, env, decode,
                      gen_helper_cvtps2ph_xmm,
                      gen_helper_cvtps2ph_ymm);
    /*
     * VCVTPS2PH is the only instruction that performs an operation on a
     * register source and then *stores* into memory.
     */
    if (decode->op[0].has_ea) {
        gen_store_sse(s, decode, decode->op[0].offset);
    }
}
|
|
|
|
|
2023-08-29 19:28:33 +03:00
|
|
|
/* (V)CVTSD2SS: thin wrapper around the helper, operands 0, 1 and 2. */
static void gen_VCVTSD2SS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_cvtsd2ss(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
|
|
|
/* (V)CVTSS2SD: thin wrapper around the helper, operands 0, 1 and 2. */
static void gen_VCVTSS2SD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_helper_cvtss2sd(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
2022-09-06 19:44:02 +03:00
|
|
|
/*
 * (V)CVTSI2SS / (V)CVTSI2SD: convert the integer in T1 to a scalar
 * float and write it into the destination vector.
 *
 * Operand 1 is first copied to the destination in full, before the
 * helper is given the destination to write the converted value into.
 * PREFIX_REPNZ selects the double-precision helpers.  On
 * TARGET_X86_64 a 64-bit source uses the "sq" helpers directly on T1;
 * otherwise T1 is truncated to 32 bits into tmp2_i32.  On 32-bit
 * targets T1 is passed as is.
 */
static void gen_VCVTSI2Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int vec_len = vector_len(s, decode);
    TCGv_i32 in;

    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);

#ifdef TARGET_X86_64
    MemOp ot = decode->op[2].ot;
    if (ot == MO_64) {
        if (s->prefix & PREFIX_REPNZ) {
            gen_helper_cvtsq2sd(tcg_env, OP_PTR0, s->T1);
        } else {
            gen_helper_cvtsq2ss(tcg_env, OP_PTR0, s->T1);
        }
        return;
    }
    in = s->tmp2_i32;
    tcg_gen_trunc_tl_i32(in, s->T1);
#else
    in = s->T1;
#endif

    if (s->prefix & PREFIX_REPNZ) {
        gen_helper_cvtsi2sd(tcg_env, OP_PTR0, in);
    } else {
        gen_helper_cvtsi2ss(tcg_env, OP_PTR0, in);
    }
}
|
|
|
|
|
|
|
|
static inline void gen_VCVTtSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
|
|
|
|
SSEFunc_i_ep ss2si, SSEFunc_l_ep ss2sq,
|
|
|
|
SSEFunc_i_ep sd2si, SSEFunc_l_ep sd2sq)
|
|
|
|
{
|
|
|
|
TCGv_i32 out;
|
|
|
|
|
|
|
|
#ifdef TARGET_X86_64
|
|
|
|
MemOp ot = decode->op[0].ot;
|
|
|
|
if (ot == MO_64) {
|
|
|
|
if (s->prefix & PREFIX_REPNZ) {
|
2023-09-14 02:37:36 +03:00
|
|
|
sd2sq(s->T0, tcg_env, OP_PTR2);
|
2022-09-06 19:44:02 +03:00
|
|
|
} else {
|
2023-09-14 02:37:36 +03:00
|
|
|
ss2sq(s->T0, tcg_env, OP_PTR2);
|
2022-09-06 19:44:02 +03:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
out = s->tmp2_i32;
|
|
|
|
#else
|
|
|
|
out = s->T0;
|
|
|
|
#endif
|
|
|
|
if (s->prefix & PREFIX_REPNZ) {
|
2023-09-14 02:37:36 +03:00
|
|
|
sd2si(out, tcg_env, OP_PTR2);
|
2022-09-06 19:44:02 +03:00
|
|
|
} else {
|
2023-09-14 02:37:36 +03:00
|
|
|
ss2si(out, tcg_env, OP_PTR2);
|
2022-09-06 19:44:02 +03:00
|
|
|
}
|
|
|
|
#ifdef TARGET_X86_64
|
|
|
|
tcg_gen_extu_i32_tl(s->T0, out);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * On 32-bit targets the 64-bit conversion helper names are defined to
 * NULL so that the calls to gen_VCVTtSx2SI() below still compile; that
 * function only invokes them under TARGET_X86_64.
 */
#ifndef TARGET_X86_64
#define gen_helper_cvtss2sq NULL
#define gen_helper_cvtsd2sq NULL
#define gen_helper_cvttss2sq NULL
#define gen_helper_cvttsd2sq NULL
#endif
|
|
|
|
|
|
|
|
/*
 * (V)CVTSS2SI / (V)CVTSD2SI: instantiate the common conversion code
 * with the cvtss2si/cvtss2sq/cvtsd2si/cvtsd2sq helpers.
 */
static void gen_VCVTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_VCVTtSx2SI(s, env, decode,
                   gen_helper_cvtss2si, gen_helper_cvtss2sq,
                   gen_helper_cvtsd2si, gen_helper_cvtsd2sq);
}
|
|
|
|
|
|
|
|
/*
 * (V)CVTTSS2SI / (V)CVTTSD2SI: instantiate the common conversion code
 * with the truncating cvttss2si/cvttss2sq/cvttsd2si/cvttsd2sq helpers.
 */
static void gen_VCVTTSx2SI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_VCVTtSx2SI(s, env, decode,
                   gen_helper_cvttss2si, gen_helper_cvttss2sq,
                   gen_helper_cvttsd2si, gen_helper_cvttsd2sq);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * VEXTRACTF128 / VEXTRACTI128: bit 0 of the immediate selects which
 * 128-bit half of operand 1 is extracted.  The half is stored to memory
 * through A0 when the destination has an effective address, or copied
 * to the destination register otherwise.
 */
static void gen_VEXTRACTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int mask = decode->immediate & 1;
    int src_ofs = vector_elem_offset(&decode->op[1], MO_128, mask);

    if (decode->op[0].has_ea) {
        /* VEX-only instruction, no alignment requirements.  */
        gen_sto_env_A0(s, src_ofs, false);
    } else {
        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, 16, 16);
    }
}
|
|
|
|
|
|
|
|
/* (V)EXTRACTPS: extract a 32-bit element via gen_pextr(). */
static void gen_VEXTRACTPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    gen_pextr(s, env, decode, MO_32);
}
|
|
|
|
|
|
|
|
/*
 * Common tail of (V)INSERTPS.  The caller has already placed the
 * 32-bit source element in s->tmp2_i32.
 *
 * Immediate layout as used here:
 *   bits 5:4  destination dword index (dest_word)
 *   bits 3:0  zero mask, one bit per destination dword
 *
 * new_mask is the zero mask plus the destination dword, i.e. the set of
 * dwords that will be overwritten.  If that is all four, the
 * destination is simply cleared; otherwise operand 1 is first brought
 * into the destination (when it is not already there) via
 * gen_store_sse().  The element is stored only if its slot is not
 * itself zero-masked, and finally the zero-masked dwords are cleared
 * (unnecessary when the whole register was cleared up front).
 */
static void gen_vinsertps(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int val = decode->immediate;
    int dest_word = (val >> 4) & 3;
    int new_mask = (val & 15) | (1 << dest_word);
    int vec_len = 16;

    assert(!s->vex_l);

    if (new_mask == 15) {
        /* All zeroes except possibly for the inserted element */
        tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
    } else if (decode->op[1].offset != decode->op[0].offset) {
        gen_store_sse(s, decode, decode->op[1].offset);
    }

    if (new_mask != (val & 15)) {
        tcg_gen_st_i32(s->tmp2_i32, tcg_env,
                       vector_elem_offset(&decode->op[0], MO_32, dest_word));
    }

    if (new_mask != 15) {
        TCGv_i32 zero = tcg_constant_i32(0); /* float32_zero */
        int i;
        for (i = 0; i < 4; i++) {
            if ((val >> i) & 1) {
                tcg_gen_st_i32(zero, tcg_env,
                               vector_elem_offset(&decode->op[0], MO_32, i));
            }
        }
    }
}
|
|
|
|
|
|
|
|
/*
 * (V)INSERTPS, register source: bits 7:6 of the immediate pick the
 * source dword of operand 2, which is loaded into tmp2_i32 before the
 * common code runs.
 */
static void gen_VINSERTPS_r(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int val = decode->immediate;

    tcg_gen_ld_i32(s->tmp2_i32, tcg_env,
                   vector_elem_offset(&decode->op[2], MO_32, (val >> 6) & 3));
    gen_vinsertps(s, env, decode);
}
|
|
|
|
|
|
|
|
/*
 * (V)INSERTPS, memory source: a little-endian 32-bit value is loaded
 * from the address in A0 into tmp2_i32 before the common code runs.
 */
static void gen_VINSERTPS_m(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
    gen_vinsertps(s, env, decode);
}
|
|
|
|
|
|
|
|
/*
 * VINSERTF128 / VINSERTI128: bit 0 of the immediate selects the 128-bit
 * half of the destination that receives the low half of operand 2; the
 * other half of the destination is copied from the same half of
 * operand 1.
 */
static void gen_VINSERTx128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int mask = decode->immediate & 1;

    tcg_gen_gvec_mov(MO_64,
                     decode->op[0].offset + offsetof(YMMReg, YMM_X(mask)),
                     decode->op[2].offset + offsetof(YMMReg, YMM_X(0)), 16, 16);
    tcg_gen_gvec_mov(MO_64,
                     decode->op[0].offset + offsetof(YMMReg, YMM_X(!mask)),
                     decode->op[1].offset + offsetof(YMMReg, YMM_X(!mask)), 16, 16);
}
|
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
/*
 * Common body of the masked-store emitters.
 *
 * Calls the "xmm" helper when s->vex_l is clear and the "ymm" helper
 * otherwise.  Either helper receives tcg_env, OP_PTR2, OP_PTR1 and the
 * address in s->A0, in that order.
 */
static inline void gen_maskmov(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
                               SSEFunc_0_eppt xmm, SSEFunc_0_eppt ymm)
{
    SSEFunc_0_eppt helper = s->vex_l ? ymm : xmm;

    helper(tcg_env, OP_PTR2, OP_PTR1, s->A0);
}
|
|
|
|
|
|
|
|
/*
 * VMASKMOVPD, store form: masked store using the vpmaskmovq_st helpers
 * (xmm or ymm variant, chosen by gen_maskmov()).
 */
static void gen_VMASKMOVPD_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    SSEFunc_0_eppt xmm_fn = gen_helper_vpmaskmovq_st_xmm;
    SSEFunc_0_eppt ymm_fn = gen_helper_vpmaskmovq_st_ymm;

    gen_maskmov(s, env, decode, xmm_fn, ymm_fn);
}
|
|
|
|
|
|
|
|
/*
 * VMASKMOVPS, store form: masked store using the vpmaskmovd_st helpers
 * (xmm or ymm variant, chosen by gen_maskmov()).
 */
static void gen_VMASKMOVPS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    SSEFunc_0_eppt xmm_fn = gen_helper_vpmaskmovd_st_xmm;
    SSEFunc_0_eppt ymm_fn = gen_helper_vpmaskmovd_st_ymm;

    gen_maskmov(s, env, decode, xmm_fn, ymm_fn);
}
|
|
|
|
|
2022-09-18 00:22:36 +03:00
|
|
|
/*
 * VMOVHPx, load form.
 *
 * The high quadword of the destination (operand 0) is filled through
 * gen_ldq_env_A0(); the low quadword is copied from operand 1 unless both
 * operands are the same register, in which case it is already in place.
 */
static void gen_VMOVHPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t lo = offsetof(XMMReg, XMM_Q(0));
    const size_t hi = offsetof(XMMReg, XMM_Q(1));

    /* NOTE(review): presumably loads 64 bits from guest address A0 - confirm. */
    gen_ldq_env_A0(s, decode->op[0].offset + hi);

    if (decode->op[0].offset == decode->op[1].offset) {
        return;
    }
    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + lo);
    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + lo);
}
|
|
|
|
|
|
|
|
/*
 * VMOVHPx, store form: hands the env offset of the high quadword of
 * operand 2 to gen_stq_env_A0().
 */
static void gen_VMOVHPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t hi = offsetof(XMMReg, XMM_Q(1));

    /* NOTE(review): presumably stores 64 bits to guest address A0 - confirm. */
    gen_stq_env_A0(s, decode->op[2].offset + hi);
}
|
|
|
|
|
|
|
|
/*
 * VMOVHPx, register form.
 *
 * Destination high quadword <- high quadword of operand 2.
 * Destination low quadword  <- low quadword of operand 1.
 * Each copy is skipped when source and destination are the same register.
 */
static void gen_VMOVHPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t lo = offsetof(XMMReg, XMM_Q(0));
    const size_t hi = offsetof(XMMReg, XMM_Q(1));
    bool copy_high = decode->op[0].offset != decode->op[2].offset;
    bool copy_low = decode->op[0].offset != decode->op[1].offset;

    if (copy_high) {
        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + hi);
        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + hi);
    }
    if (copy_low) {
        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + lo);
        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + lo);
    }
}
|
|
|
|
|
|
|
|
/*
 * VMOVHLPS.
 *
 * Destination low quadword  <- high quadword of operand 2 (always emitted).
 * Destination high quadword <- high quadword of operand 1, skipped when
 * operand 1 is the destination register itself.
 */
static void gen_VMOVHLPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t lo = offsetof(XMMReg, XMM_Q(0));
    const size_t hi = offsetof(XMMReg, XMM_Q(1));

    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + hi);
    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + lo);

    if (decode->op[0].offset == decode->op[1].offset) {
        return;
    }
    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + hi);
    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + hi);
}
|
|
|
|
|
|
|
|
/*
 * VMOVLHPS.
 *
 * Destination high quadword <- low quadword of operand 2 (always emitted).
 * Destination low quadword  <- low quadword of operand 1, skipped when
 * operand 1 is the destination register itself.
 */
static void gen_VMOVLHPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    /*
     * Address the low quadword of operand 2 through XMM_Q(0), exactly like
     * the low-quadword accesses below, rather than assuming that element 0
     * lives at offset 0 of the register.
     */
    tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(0)));
    tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
    if (decode->op[0].offset != decode->op[1].offset) {
        tcg_gen_ld_i64(s->tmp1_i64, tcg_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
        tcg_gen_st_i64(s->tmp1_i64, tcg_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
    }
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note that MOVLPx supports 256-bit operation unlike MOVHLPx, MOVLHPx, MOVHPx.
|
|
|
|
* Use a gvec move to move everything above the bottom 64 bits.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * VMOVLPx, register form.
 *
 * The low quadword of operand 2 is read into a temporary *before* the
 * whole-vector copy from operand 1 to the destination, and written to the
 * destination's low quadword afterwards.
 */
static void gen_VMOVLPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t lo = offsetof(XMMReg, XMM_Q(0));
    int len = vector_len(s, decode);
    TCGv_i64 low = s->tmp1_i64;

    tcg_gen_ld_i64(low, tcg_env, decode->op[2].offset + lo);
    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, len, len);
    tcg_gen_st_i64(low, tcg_env, decode->op[0].offset + lo);
}
|
|
|
|
|
|
|
|
/*
 * VMOVLPx, load form.
 *
 * Loads a 64-bit little-endian value from the address in s->A0, copies
 * operand 1 into the destination over the full vector length, then stores
 * the loaded value into the destination's low quadword.
 */
static void gen_VMOVLPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t lo = offsetof(ZMMReg, ZMM_Q(0));
    int len = vector_len(s, decode);
    TCGv_i64 low = s->tmp1_i64;

    tcg_gen_qemu_ld_i64(low, s->A0, s->mem_index, MO_LEUQ);
    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, len, len);
    tcg_gen_st_i64(low, OP_PTR0, lo);
}
|
|
|
|
|
|
|
|
/*
 * VMOVLPx, store form: reads the low quadword of operand 2 and stores it
 * as a 64-bit little-endian value at the address in s->A0.
 */
static void gen_VMOVLPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t lo = offsetof(ZMMReg, ZMM_Q(0));
    TCGv_i64 low = s->tmp1_i64;

    tcg_gen_ld_i64(low, OP_PTR2, lo);
    tcg_gen_qemu_st_i64(low, s->A0, s->mem_index, MO_LEUQ);
}
|
|
|
|
|
|
|
|
/*
 * VMOVSD, load form.
 *
 * Loads a 64-bit little-endian value from the address in s->A0, writes
 * zero to quadword 1 of the destination and the loaded value to
 * quadword 0.
 */
static void gen_VMOVSD_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t q0 = offsetof(ZMMReg, ZMM_Q(0));
    const size_t q1 = offsetof(ZMMReg, ZMM_Q(1));
    TCGv_i64 high_zero = tcg_constant_i64(0);
    TCGv_i64 low = s->tmp1_i64;

    tcg_gen_qemu_ld_i64(low, s->A0, s->mem_index, MO_LEUQ);
    tcg_gen_st_i64(high_zero, OP_PTR0, q1);
    tcg_gen_st_i64(low, OP_PTR0, q0);
}
|
|
|
|
|
|
|
|
/*
 * VMOVSS, register form.
 *
 * The low 32-bit element of operand 2 is read into a temporary *before*
 * the whole-vector copy from operand 1 to the destination, and written to
 * the destination's element 0 afterwards.
 */
static void gen_VMOVSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t l0 = offsetof(ZMMReg, ZMM_L(0));
    int len = vector_len(s, decode);
    TCGv_i32 elem = s->tmp2_i32;

    tcg_gen_ld_i32(elem, OP_PTR2, l0);
    tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, len, len);
    tcg_gen_st_i32(elem, OP_PTR0, l0);
}
|
|
|
|
|
|
|
|
/*
 * VMOVSS, load form.
 *
 * Loads a 32-bit little-endian value from the address in s->A0, clears
 * the destination over the full vector length, then stores the loaded
 * value into the destination's 32-bit element 0.
 */
static void gen_VMOVSS_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t l0 = offsetof(ZMMReg, ZMM_L(0));
    int len = vector_len(s, decode);
    TCGv_i32 elem = s->tmp2_i32;

    tcg_gen_qemu_ld_i32(elem, s->A0, s->mem_index, MO_LEUL);
    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, len, len, 0);
    tcg_gen_st_i32(elem, OP_PTR0, l0);
}
|
|
|
|
|
|
|
|
/*
 * VMOVSS, store form: reads 32-bit element 0 of operand 2 and stores it
 * as a 32-bit little-endian value at the address in s->A0.
 */
static void gen_VMOVSS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    const size_t l0 = offsetof(ZMMReg, ZMM_L(0));
    TCGv_i32 elem = s->tmp2_i32;

    tcg_gen_ld_i32(elem, OP_PTR2, l0);
    tcg_gen_qemu_st_i32(elem, s->A0, s->mem_index, MO_LEUL);
}
|
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
/*
 * VPMASKMOV, store form: dispatches on s->vex_w to the VMASKMOVPD store
 * emitter (vex_w set) or the VMASKMOVPS store emitter (vex_w clear).
 */
static void gen_VPMASKMOV_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    void (*emit)(DisasContext *, CPUX86State *, X86DecodedInsn *);

    emit = s->vex_w ? gen_VMASKMOVPD_st : gen_VMASKMOVPS_st;
    emit(s, env, decode);
}
|
|
|
|
|
|
|
|
/*
 * VPERMD: only a ymm helper is used, so s->vex_l must be set when this
 * emitter is reached (enforced by the assertion).
 */
static void gen_VPERMD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    /* There is no xmm variant to fall back to. */
    assert(s->vex_l);

    gen_helper_vpermd_ymm(OP_PTR0, OP_PTR1, OP_PTR2);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * VPERM2x128: calls the vpermdq ymm helper with the three operand pointers
 * and the immediate passed through tcg_constant8u_i32().  s->vex_l must
 * be set (enforced by the assertion).
 */
static void gen_VPERM2x128(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 control;

    assert(s->vex_l);

    control = tcg_constant8u_i32(decode->immediate);
    gen_helper_vpermdq_ymm(OP_PTR0, OP_PTR1, OP_PTR2, control);
}
|
|
|
|
|
target/i386: reimplement 0x0f 0x38, add AVX
There are several special cases here:
1) extending moves have different widths for the helpers vs. for the
memory loads, and the width for memory loads depends on VEX.L too.
This is represented by X86_SPECIAL_AVXExtMov.
2) some instructions, such as variable-width shifts, select the vector element
size via REX.W.
3) VSIB instructions (VGATHERxPy, VPGATHERxy) are also part of this group,
and they have (among other things) two output operands.
4) the macros for 4-operand blends (which are under 0x0f 0x3a) have to be
extended to support 2-operand blends. The 2-operand variant actually
came a few years earlier, but it is clearer to implement them in the
opposite order.
X86_TYPE_WM, introduced earlier for unaligned loads, is reused for helpers
that accept a Reg* but have a M argument.
These three-byte opcodes also include AVX new instructions, for which
the helpers were originally implemented by Paul Brook <paul@nowt.org>.
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2022-09-14 19:52:44 +03:00
|
|
|
/*
 * VPHMINPOSUW: only an xmm helper is used, so s->vex_l must be clear when
 * this emitter is reached (enforced by the assertion).  The helper gets
 * tcg_env, the destination pointer and the operand 2 pointer.
 */
static void gen_VPHMINPOSUW(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    /* There is no ymm variant to fall back to. */
    assert(!s->vex_l);

    gen_helper_phminposuw_xmm(tcg_env, OP_PTR0, OP_PTR2);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * VROUNDSD: calls the roundsd xmm helper with tcg_env, the three operand
 * pointers and the immediate passed through tcg_constant8u_i32().
 * s->vex_l must be clear (enforced by the assertion).
 */
static void gen_VROUNDSD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 mode;

    assert(!s->vex_l);

    mode = tcg_constant8u_i32(decode->immediate);
    gen_helper_roundsd_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, mode);
}
|
|
|
|
|
|
|
|
/*
 * VROUNDSS: calls the roundss xmm helper with tcg_env, the three operand
 * pointers and the immediate passed through tcg_constant8u_i32().
 * s->vex_l must be clear (enforced by the assertion).
 */
static void gen_VROUNDSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 mode;

    assert(!s->vex_l);

    mode = tcg_constant8u_i32(decode->immediate);
    gen_helper_roundss_xmm(tcg_env, OP_PTR0, OP_PTR1, OP_PTR2, mode);
}
|
|
|
|
|
2022-09-06 11:34:11 +03:00
|
|
|
/*
 * VSHUFPS / VSHUFPD.
 *
 * The PREFIX_DATA bit of s->prefix selects the "pd" helpers, otherwise the
 * "ps" helpers are used; s->vex_l then selects the ymm or xmm variant.
 * The chosen helper receives the three operand pointers and the immediate.
 */
static void gen_VSHUF(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_i32 imm = tcg_constant_i32(decode->immediate);
    SSEFunc_0_pppi helper;

    if (s->prefix & PREFIX_DATA) {
        helper = s->vex_l ? gen_helper_shufpd_ymm : gen_helper_shufpd_xmm;
    } else {
        helper = s->vex_l ? gen_helper_shufps_ymm : gen_helper_shufps_xmm;
    }

    helper(OP_PTR0, OP_PTR1, OP_PTR2, imm);
}
|
|
|
|
|
2022-09-06 19:44:02 +03:00
|
|
|
/*
 * VUCOMISS / VUCOMISD.
 *
 * The PREFIX_DATA bit of s->prefix selects the ucomisd helper, otherwise
 * ucomiss is used.  The helper receives tcg_env and the pointers to
 * operands 1 and 2; afterwards the condition-code state is switched to
 * CC_OP_EFLAGS.
 */
static void gen_VUCOMI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    SSEFunc_0_epp compare;

    if (s->prefix & PREFIX_DATA) {
        compare = gen_helper_ucomisd;
    } else {
        compare = gen_helper_ucomiss;
    }

    compare(tcg_env, OP_PTR1, OP_PTR2);
    set_cc_op(s, CC_OP_EFLAGS);
}
|
|
|
|
|
2022-09-02 19:19:06 +03:00
|
|
|
/*
 * VZEROALL: zero the vector register file with a single memset helper call
 * covering CPU_NB_REGS * sizeof(ZMMReg) bytes starting at env->xmm_regs.
 */
static void gen_VZEROALL(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    TCGv_ptr regs = tcg_temp_new_ptr();

    /* regs = address of xmm_regs inside the CPU state. */
    tcg_gen_addi_ptr(regs, tcg_env, offsetof(CPUX86State, xmm_regs));

    /* The first argument is the helper's (unused here) return slot. */
    gen_helper_memset(regs, regs, tcg_constant_i32(0),
                      tcg_constant_ptr(CPU_NB_REGS * sizeof(ZMMReg)));
}
|
|
|
|
|
|
|
|
/*
 * VZEROUPPER: for each of the CPU_NB_REGS vector registers, clear the
 * 16 bytes of 128-bit lane 1 (ZMM_X(1)), leaving lane 0 untouched.
 */
static void gen_VZEROUPPER(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
    int reg = 0;

    while (reg < CPU_NB_REGS) {
        int lane1 = offsetof(CPUX86State, xmm_regs[reg].ZMM_X(1));

        tcg_gen_gvec_dup_imm(MO_64, lane1, 16, 16, 0);
        reg++;
    }
}
|