target/i386: DPPS rounding fix
The DPPS (Dot Product) instruction is defined to first sum pairs of intermediate results, then sum those values to get the final result, i.e. (A+B)+(C+D). We incrementally sum the results, i.e. ((A+B)+C)+D, which can result in incorrect rounding. For consistency, also change the variable names to the ones used in the Intel SDM and implement DPPD following the manual. Based on a patch by Paul Brook <paul@nowt.org>. Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
This commit is contained in:
parent
75046ad72e
commit
bf30ad8cef
@ -1943,56 +1943,59 @@ SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
|
|||||||
|
|
||||||
void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
|
void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
|
||||||
{
|
{
|
||||||
float32 iresult = float32_zero;
|
float32 prod1, prod2, temp2, temp3, temp4;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
|
||||||
|
* to correctly round the intermediate results
|
||||||
|
*/
|
||||||
if (mask & (1 << 4)) {
|
if (mask & (1 << 4)) {
|
||||||
iresult = float32_add(iresult,
|
prod1 = float32_mul(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
|
||||||
float32_mul(d->ZMM_S(0), s->ZMM_S(0),
|
} else {
|
||||||
&env->sse_status),
|
prod1 = float32_zero;
|
||||||
&env->sse_status);
|
|
||||||
}
|
}
|
||||||
if (mask & (1 << 5)) {
|
if (mask & (1 << 5)) {
|
||||||
iresult = float32_add(iresult,
|
prod2 = float32_mul(d->ZMM_S(1), s->ZMM_S(1), &env->sse_status);
|
||||||
float32_mul(d->ZMM_S(1), s->ZMM_S(1),
|
} else {
|
||||||
&env->sse_status),
|
prod2 = float32_zero;
|
||||||
&env->sse_status);
|
|
||||||
}
|
}
|
||||||
|
temp2 = float32_add(prod1, prod2, &env->sse_status);
|
||||||
if (mask & (1 << 6)) {
|
if (mask & (1 << 6)) {
|
||||||
iresult = float32_add(iresult,
|
prod1 = float32_mul(d->ZMM_S(2), s->ZMM_S(2), &env->sse_status);
|
||||||
float32_mul(d->ZMM_S(2), s->ZMM_S(2),
|
} else {
|
||||||
&env->sse_status),
|
prod1 = float32_zero;
|
||||||
&env->sse_status);
|
|
||||||
}
|
}
|
||||||
if (mask & (1 << 7)) {
|
if (mask & (1 << 7)) {
|
||||||
iresult = float32_add(iresult,
|
prod2 = float32_mul(d->ZMM_S(3), s->ZMM_S(3), &env->sse_status);
|
||||||
float32_mul(d->ZMM_S(3), s->ZMM_S(3),
|
} else {
|
||||||
&env->sse_status),
|
prod2 = float32_zero;
|
||||||
&env->sse_status);
|
|
||||||
}
|
}
|
||||||
d->ZMM_S(0) = (mask & (1 << 0)) ? iresult : float32_zero;
|
temp3 = float32_add(prod1, prod2, &env->sse_status);
|
||||||
d->ZMM_S(1) = (mask & (1 << 1)) ? iresult : float32_zero;
|
temp4 = float32_add(temp2, temp3, &env->sse_status);
|
||||||
d->ZMM_S(2) = (mask & (1 << 2)) ? iresult : float32_zero;
|
|
||||||
d->ZMM_S(3) = (mask & (1 << 3)) ? iresult : float32_zero;
|
d->ZMM_S(0) = (mask & (1 << 0)) ? temp4 : float32_zero;
|
||||||
|
d->ZMM_S(1) = (mask & (1 << 1)) ? temp4 : float32_zero;
|
||||||
|
d->ZMM_S(2) = (mask & (1 << 2)) ? temp4 : float32_zero;
|
||||||
|
d->ZMM_S(3) = (mask & (1 << 3)) ? temp4 : float32_zero;
|
||||||
}
|
}
|
||||||
|
|
||||||
void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
|
void glue(helper_dppd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t mask)
|
||||||
{
|
{
|
||||||
float64 iresult = float64_zero;
|
float64 prod1, prod2, temp2;
|
||||||
|
|
||||||
if (mask & (1 << 4)) {
|
if (mask & (1 << 4)) {
|
||||||
iresult = float64_add(iresult,
|
prod1 = float64_mul(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
|
||||||
float64_mul(d->ZMM_D(0), s->ZMM_D(0),
|
} else {
|
||||||
&env->sse_status),
|
prod1 = float64_zero;
|
||||||
&env->sse_status);
|
|
||||||
}
|
}
|
||||||
if (mask & (1 << 5)) {
|
if (mask & (1 << 5)) {
|
||||||
iresult = float64_add(iresult,
|
prod2 = float64_mul(d->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
|
||||||
float64_mul(d->ZMM_D(1), s->ZMM_D(1),
|
} else {
|
||||||
&env->sse_status),
|
prod2 = float64_zero;
|
||||||
&env->sse_status);
|
|
||||||
}
|
}
|
||||||
d->ZMM_D(0) = (mask & (1 << 0)) ? iresult : float64_zero;
|
temp2 = float64_add(prod1, prod2, &env->sse_status);
|
||||||
d->ZMM_D(1) = (mask & (1 << 1)) ? iresult : float64_zero;
|
d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero;
|
||||||
|
d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero;
|
||||||
}
|
}
|
||||||
|
|
||||||
void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
|
void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
|
||||||
|
Loading…
Reference in New Issue
Block a user