qemu/tests/tcg/hexagon/hvx_misc.c
Taylor Simpson 83853ea0ef Hexagon (target/hexagon) Fix predicated assignment to .tmp and .cur
Here are example instructions with a predicated .tmp/.cur assignment
    if (p1) v12.tmp = vmem(r7 + #0)
    if (p0) v12.cur = vmem(r9 + #0)
The .tmp/.cur indicates that references to v12 in the same packet
take the result of the load.  However, when the predicate is false,
the value at the start of the packet should be used.  After the packet
commits, the .tmp value is dropped, but the .cur value is maintained.

To fix this bug, we preload the original value from the HVX register
into the temporary used for the result.

Test cases added to tests/tcg/hexagon/hvx_misc.c

Acked-by: Richard Henderson <richard.henderson@linaro.org>
Co-authored-by: Matheus Tavares Bernardino <quic_mathbern@quicinc.com>
Signed-off-by: Matheus Tavares Bernardino <quic_mathbern@quicinc.com>
Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Message-Id: <20221108162906.3166-3-tsimpson@quicinc.com>
2022-12-16 10:10:28 -08:00

656 lines
19 KiB
C

/*
* Copyright(c) 2021-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <limits.h>
int err;
static void __check(int line, int i, int j, uint64_t result, uint64_t expect)
{
if (result != expect) {
printf("ERROR at line %d: [%d][%d] 0x%016llx != 0x%016llx\n",
line, i, j, result, expect);
err++;
}
}
#define check(RES, EXP) __check(__LINE__, RES, EXP)
#define MAX_VEC_SIZE_BYTES 128
typedef union {
uint64_t ud[MAX_VEC_SIZE_BYTES / 8];
int64_t d[MAX_VEC_SIZE_BYTES / 8];
uint32_t uw[MAX_VEC_SIZE_BYTES / 4];
int32_t w[MAX_VEC_SIZE_BYTES / 4];
uint16_t uh[MAX_VEC_SIZE_BYTES / 2];
int16_t h[MAX_VEC_SIZE_BYTES / 2];
uint8_t ub[MAX_VEC_SIZE_BYTES / 1];
int8_t b[MAX_VEC_SIZE_BYTES / 1];
} MMVector;
#define BUFSIZE 16
#define OUTSIZE 16
#define MASKMOD 3
MMVector buffer0[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector buffer1[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector mask[BUFSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector output[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
MMVector expect[OUTSIZE] __attribute__((aligned(MAX_VEC_SIZE_BYTES)));
#define CHECK_OUTPUT_FUNC(FIELD, FIELDSZ) \
static void check_output_##FIELD(int line, size_t num_vectors) \
{ \
for (int i = 0; i < num_vectors; i++) { \
for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
__check(line, i, j, output[i].FIELD[j], expect[i].FIELD[j]); \
} \
} \
}
CHECK_OUTPUT_FUNC(d, 8)
CHECK_OUTPUT_FUNC(w, 4)
CHECK_OUTPUT_FUNC(h, 2)
CHECK_OUTPUT_FUNC(b, 1)
static void init_buffers(void)
{
int counter0 = 0;
int counter1 = 17;
for (int i = 0; i < BUFSIZE; i++) {
for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) {
buffer0[i].b[j] = counter0++;
buffer1[i].b[j] = counter1++;
}
for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
mask[i].w[j] = (i + j % MASKMOD == 0) ? 0 : 1;
}
}
}
static void test_load_tmp(void)
{
void *p0 = buffer0;
void *p1 = buffer1;
void *pout = output;
for (int i = 0; i < BUFSIZE; i++) {
/*
* Load into v12 as .tmp, then use it in the next packet
* Should get the new value within the same packet and
* the old value in the next packet
*/
asm("v3 = vmem(%0 + #0)\n\t"
"r1 = #1\n\t"
"v12 = vsplat(r1)\n\t"
"{\n\t"
" v12.tmp = vmem(%1 + #0)\n\t"
" v4.w = vadd(v12.w, v3.w)\n\t"
"}\n\t"
"v4.w = vadd(v4.w, v12.w)\n\t"
"vmem(%2 + #0) = v4\n\t"
: : "r"(p0), "r"(p1), "r"(pout)
: "r1", "v12", "v3", "v4", "v6", "memory");
p0 += sizeof(MMVector);
p1 += sizeof(MMVector);
pout += sizeof(MMVector);
for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
expect[i].w[j] = buffer0[i].w[j] + buffer1[i].w[j] + 1;
}
}
check_output_w(__LINE__, BUFSIZE);
}
static void test_load_cur(void)
{
void *p0 = buffer0;
void *pout = output;
for (int i = 0; i < BUFSIZE; i++) {
asm("{\n\t"
" v2.cur = vmem(%0 + #0)\n\t"
" vmem(%1 + #0) = v2\n\t"
"}\n\t"
: : "r"(p0), "r"(pout) : "v2", "memory");
p0 += sizeof(MMVector);
pout += sizeof(MMVector);
for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
expect[i].uw[j] = buffer0[i].uw[j];
}
}
check_output_w(__LINE__, BUFSIZE);
}
static void test_load_aligned(void)
{
/* Aligned loads ignore the low bits of the address */
void *p0 = buffer0;
void *pout = output;
const size_t offset = 13;
p0 += offset; /* Create an unaligned address */
asm("v2 = vmem(%0 + #0)\n\t"
"vmem(%1 + #0) = v2\n\t"
: : "r"(p0), "r"(pout) : "v2", "memory");
expect[0] = buffer0[0];
check_output_w(__LINE__, 1);
}
static void test_load_unaligned(void)
{
void *p0 = buffer0;
void *pout = output;
const size_t offset = 12;
p0 += offset; /* Create an unaligned address */
asm("v2 = vmemu(%0 + #0)\n\t"
"vmem(%1 + #0) = v2\n\t"
: : "r"(p0), "r"(pout) : "v2", "memory");
memcpy(expect, &buffer0[0].ub[offset], sizeof(MMVector));
check_output_w(__LINE__, 1);
}
static void test_store_aligned(void)
{
/* Aligned stores ignore the low bits of the address */
void *p0 = buffer0;
void *pout = output;
const size_t offset = 13;
pout += offset; /* Create an unaligned address */
asm("v2 = vmem(%0 + #0)\n\t"
"vmem(%1 + #0) = v2\n\t"
: : "r"(p0), "r"(pout) : "v2", "memory");
expect[0] = buffer0[0];
check_output_w(__LINE__, 1);
}
static void test_store_unaligned(void)
{
void *p0 = buffer0;
void *pout = output;
const size_t offset = 12;
pout += offset; /* Create an unaligned address */
asm("v2 = vmem(%0 + #0)\n\t"
"vmemu(%1 + #0) = v2\n\t"
: : "r"(p0), "r"(pout) : "v2", "memory");
memcpy(expect, buffer0, 2 * sizeof(MMVector));
memcpy(&expect[0].ub[offset], buffer0, sizeof(MMVector));
check_output_w(__LINE__, 2);
}
static void test_masked_store(bool invert)
{
void *p0 = buffer0;
void *pmask = mask;
void *pout = output;
memset(expect, 0xff, sizeof(expect));
memset(output, 0xff, sizeof(expect));
for (int i = 0; i < BUFSIZE; i++) {
if (invert) {
asm("r4 = #0\n\t"
"v4 = vsplat(r4)\n\t"
"v5 = vmem(%0 + #0)\n\t"
"q0 = vcmp.eq(v4.w, v5.w)\n\t"
"v5 = vmem(%1)\n\t"
"if (!q0) vmem(%2) = v5\n\t" /* Inverted test */
: : "r"(pmask), "r"(p0), "r"(pout)
: "r4", "v4", "v5", "q0", "memory");
} else {
asm("r4 = #0\n\t"
"v4 = vsplat(r4)\n\t"
"v5 = vmem(%0 + #0)\n\t"
"q0 = vcmp.eq(v4.w, v5.w)\n\t"
"v5 = vmem(%1)\n\t"
"if (q0) vmem(%2) = v5\n\t" /* Non-inverted test */
: : "r"(pmask), "r"(p0), "r"(pout)
: "r4", "v4", "v5", "q0", "memory");
}
p0 += sizeof(MMVector);
pmask += sizeof(MMVector);
pout += sizeof(MMVector);
for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
if (invert) {
if (i + j % MASKMOD != 0) {
expect[i].w[j] = buffer0[i].w[j];
}
} else {
if (i + j % MASKMOD == 0) {
expect[i].w[j] = buffer0[i].w[j];
}
}
}
}
check_output_w(__LINE__, BUFSIZE);
}
static void test_new_value_store(void)
{
void *p0 = buffer0;
void *pout = output;
asm("{\n\t"
" v2 = vmem(%0 + #0)\n\t"
" vmem(%1 + #0) = v2.new\n\t"
"}\n\t"
: : "r"(p0), "r"(pout) : "v2", "memory");
expect[0] = buffer0[0];
check_output_w(__LINE__, 1);
}
static void test_max_temps()
{
void *p0 = buffer0;
void *pout = output;
asm("v0 = vmem(%0 + #0)\n\t"
"v1 = vmem(%0 + #1)\n\t"
"v2 = vmem(%0 + #2)\n\t"
"v3 = vmem(%0 + #3)\n\t"
"v4 = vmem(%0 + #4)\n\t"
"{\n\t"
" v1:0.w = vadd(v3:2.w, v1:0.w)\n\t"
" v2.b = vshuffe(v3.b, v2.b)\n\t"
" v3.w = vadd(v1.w, v4.w)\n\t"
" v4.tmp = vmem(%0 + #5)\n\t"
"}\n\t"
"vmem(%1 + #0) = v0\n\t"
"vmem(%1 + #1) = v1\n\t"
"vmem(%1 + #2) = v2\n\t"
"vmem(%1 + #3) = v3\n\t"
"vmem(%1 + #4) = v4\n\t"
: : "r"(p0), "r"(pout) : "memory");
/* The first two vectors come from the vadd-pair instruction */
for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
expect[0].w[i] = buffer0[0].w[i] + buffer0[2].w[i];
expect[1].w[i] = buffer0[1].w[i] + buffer0[3].w[i];
}
/* The third vector comes from the vshuffe instruction */
for (int i = 0; i < MAX_VEC_SIZE_BYTES / 2; i++) {
expect[2].uh[i] = (buffer0[2].uh[i] & 0xff) |
(buffer0[3].uh[i] & 0xff) << 8;
}
/* The fourth vector comes from the vadd-single instruction */
for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
expect[3].w[i] = buffer0[1].w[i] + buffer0[5].w[i];
}
/*
* The fifth vector comes from the load to v4
* make sure the .tmp is dropped
*/
expect[4] = buffer0[4];
check_output_b(__LINE__, 5);
}
#define VEC_OP1(ASM, EL, IN, OUT) \
asm("v2 = vmem(%0 + #0)\n\t" \
"v2" #EL " = " #ASM "(v2" #EL ")\n\t" \
"vmem(%1 + #0) = v2\n\t" \
: : "r"(IN), "r"(OUT) : "v2", "memory")
#define VEC_OP2(ASM, EL, IN0, IN1, OUT) \
asm("v2 = vmem(%0 + #0)\n\t" \
"v3 = vmem(%1 + #0)\n\t" \
"v2" #EL " = " #ASM "(v2" #EL ", v3" #EL ")\n\t" \
"vmem(%2 + #0) = v2\n\t" \
: : "r"(IN0), "r"(IN1), "r"(OUT) : "v2", "v3", "memory")
#define TEST_VEC_OP1(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
void *pin = buffer0; \
void *pout = output; \
for (int i = 0; i < BUFSIZE; i++) { \
VEC_OP1(ASM, EL, pin, pout); \
pin += sizeof(MMVector); \
pout += sizeof(MMVector); \
} \
for (int i = 0; i < BUFSIZE; i++) { \
for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
expect[i].FIELD[j] = OP buffer0[i].FIELD[j]; \
} \
} \
check_output_##FIELD(__LINE__, BUFSIZE); \
}
#define TEST_VEC_OP2(NAME, ASM, EL, FIELD, FIELDSZ, OP) \
static void test_##NAME(void) \
{ \
void *p0 = buffer0; \
void *p1 = buffer1; \
void *pout = output; \
for (int i = 0; i < BUFSIZE; i++) { \
VEC_OP2(ASM, EL, p0, p1, pout); \
p0 += sizeof(MMVector); \
p1 += sizeof(MMVector); \
pout += sizeof(MMVector); \
} \
for (int i = 0; i < BUFSIZE; i++) { \
for (int j = 0; j < MAX_VEC_SIZE_BYTES / FIELDSZ; j++) { \
expect[i].FIELD[j] = buffer0[i].FIELD[j] OP buffer1[i].FIELD[j]; \
} \
} \
check_output_##FIELD(__LINE__, BUFSIZE); \
}
#define THRESHOLD 31
#define PRED_OP2(ASM, IN0, IN1, OUT, INV) \
asm("r4 = #%3\n\t" \
"v1.b = vsplat(r4)\n\t" \
"v2 = vmem(%0 + #0)\n\t" \
"q0 = vcmp.gt(v2.b, v1.b)\n\t" \
"v3 = vmem(%1 + #0)\n\t" \
"q1 = vcmp.gt(v3.b, v1.b)\n\t" \
"q2 = " #ASM "(q0, " INV "q1)\n\t" \
"r4 = #0xff\n\t" \
"v1.b = vsplat(r4)\n\t" \
"if (q2) vmem(%2 + #0) = v1\n\t" \
: : "r"(IN0), "r"(IN1), "r"(OUT), "i"(THRESHOLD) \
: "r4", "v1", "v2", "v3", "q0", "q1", "q2", "memory")
#define TEST_PRED_OP2(NAME, ASM, OP, INV) \
static void test_##NAME(bool invert) \
{ \
void *p0 = buffer0; \
void *p1 = buffer1; \
void *pout = output; \
memset(output, 0, sizeof(expect)); \
for (int i = 0; i < BUFSIZE; i++) { \
PRED_OP2(ASM, p0, p1, pout, INV); \
p0 += sizeof(MMVector); \
p1 += sizeof(MMVector); \
pout += sizeof(MMVector); \
} \
for (int i = 0; i < BUFSIZE; i++) { \
for (int j = 0; j < MAX_VEC_SIZE_BYTES; j++) { \
bool p0 = (buffer0[i].b[j] > THRESHOLD); \
bool p1 = (buffer1[i].b[j] > THRESHOLD); \
if (invert) { \
expect[i].b[j] = (p0 OP !p1) ? 0xff : 0x00; \
} else { \
expect[i].b[j] = (p0 OP p1) ? 0xff : 0x00; \
} \
} \
} \
check_output_b(__LINE__, BUFSIZE); \
}
TEST_VEC_OP2(vadd_w, vadd, .w, w, 4, +)
TEST_VEC_OP2(vadd_h, vadd, .h, h, 2, +)
TEST_VEC_OP2(vadd_b, vadd, .b, b, 1, +)
TEST_VEC_OP2(vsub_w, vsub, .w, w, 4, -)
TEST_VEC_OP2(vsub_h, vsub, .h, h, 2, -)
TEST_VEC_OP2(vsub_b, vsub, .b, b, 1, -)
TEST_VEC_OP2(vxor, vxor, , d, 8, ^)
TEST_VEC_OP2(vand, vand, , d, 8, &)
TEST_VEC_OP2(vor, vor, , d, 8, |)
TEST_VEC_OP1(vnot, vnot, , d, 8, ~)
TEST_PRED_OP2(pred_or, or, |, "")
TEST_PRED_OP2(pred_or_n, or, |, "!")
TEST_PRED_OP2(pred_and, and, &, "")
TEST_PRED_OP2(pred_and_n, and, &, "!")
TEST_PRED_OP2(pred_xor, xor, ^, "")
static void test_vadduwsat(void)
{
/*
* Test for saturation by adding two numbers that add to more than UINT_MAX
* and make sure the result saturates to UINT_MAX
*/
const uint32_t x = 0xffff0000;
const uint32_t y = 0x000fffff;
memset(expect, 0x12, sizeof(MMVector));
memset(output, 0x34, sizeof(MMVector));
asm volatile ("v10 = vsplat(%0)\n\t"
"v11 = vsplat(%1)\n\t"
"v21.uw = vadd(v11.uw, v10.uw):sat\n\t"
"vmem(%2+#0) = v21\n\t"
: /* no outputs */
: "r"(x), "r"(y), "r"(output)
: "v10", "v11", "v21", "memory");
for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
expect[0].uw[j] = UINT_MAX;
}
check_output_w(__LINE__, 1);
}
static void test_vsubuwsat_dv(void)
{
/*
* Test for saturation by subtracting two numbers where the result is
* negative and make sure the result saturates to zero
*
* vsubuwsat_dv operates on an HVX register pair, so we'll have a
* pair of subtractions
* w - x < 0
* y - z < 0
*/
const uint32_t w = 0x000000b7;
const uint32_t x = 0xffffff4e;
const uint32_t y = 0x31fe88e7;
const uint32_t z = 0x7fffff79;
memset(expect, 0x12, sizeof(MMVector) * 2);
memset(output, 0x34, sizeof(MMVector) * 2);
asm volatile ("v16 = vsplat(%0)\n\t"
"v17 = vsplat(%1)\n\t"
"v26 = vsplat(%2)\n\t"
"v27 = vsplat(%3)\n\t"
"v25:24.uw = vsub(v17:16.uw, v27:26.uw):sat\n\t"
"vmem(%4+#0) = v24\n\t"
"vmem(%4+#1) = v25\n\t"
: /* no outputs */
: "r"(w), "r"(y), "r"(x), "r"(z), "r"(output)
: "v16", "v17", "v24", "v25", "v26", "v27", "memory");
for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
expect[0].uw[j] = 0x00000000;
expect[1].uw[j] = 0x00000000;
}
check_output_w(__LINE__, 2);
}
static void test_vshuff(void)
{
/* Test that vshuff works when the two operands are the same register */
const uint32_t splat = 0x089be55c;
const uint32_t shuff = 0x454fa926;
MMVector v0, v1;
memset(expect, 0x12, sizeof(MMVector));
memset(output, 0x34, sizeof(MMVector));
asm volatile("v25 = vsplat(%0)\n\t"
"vshuff(v25, v25, %1)\n\t"
"vmem(%2 + #0) = v25\n\t"
: /* no outputs */
: "r"(splat), "r"(shuff), "r"(output)
: "v25", "memory");
/*
* The semantics of Hexagon are the operands are pass-by-value, so create
* two copies of the vsplat result.
*/
for (int i = 0; i < MAX_VEC_SIZE_BYTES / 4; i++) {
v0.uw[i] = splat;
v1.uw[i] = splat;
}
/* Do the vshuff operation */
for (int offset = 1; offset < MAX_VEC_SIZE_BYTES; offset <<= 1) {
if (shuff & offset) {
for (int k = 0; k < MAX_VEC_SIZE_BYTES; k++) {
if (!(k & offset)) {
uint8_t tmp = v0.ub[k];
v0.ub[k] = v1.ub[k + offset];
v1.ub[k + offset] = tmp;
}
}
}
}
/* Put the result in the expect buffer for verification */
expect[0] = v1;
check_output_b(__LINE__, 1);
}
static void test_load_tmp_predicated(void)
{
void *p0 = buffer0;
void *p1 = buffer1;
void *pout = output;
bool pred = true;
for (int i = 0; i < BUFSIZE; i++) {
/*
* Load into v12 as .tmp with a predicate
* When the predicate is true, we get the vector from buffer1[i]
* When the predicate is false, we get a vector of all 1's
* Regardless of the predicate, the next packet should have
* a vector of all 1's
*/
asm("v3 = vmem(%0 + #0)\n\t"
"r1 = #1\n\t"
"v12 = vsplat(r1)\n\t"
"p1 = !cmp.eq(%3, #0)\n\t"
"{\n\t"
" if (p1) v12.tmp = vmem(%1 + #0)\n\t"
" v4.w = vadd(v12.w, v3.w)\n\t"
"}\n\t"
"v4.w = vadd(v4.w, v12.w)\n\t"
"vmem(%2 + #0) = v4\n\t"
: : "r"(p0), "r"(p1), "r"(pout), "r"(pred)
: "r1", "p1", "v12", "v3", "v4", "v6", "memory");
p0 += sizeof(MMVector);
p1 += sizeof(MMVector);
pout += sizeof(MMVector);
for (int j = 0; j < MAX_VEC_SIZE_BYTES / 4; j++) {
expect[i].w[j] =
pred ? buffer0[i].w[j] + buffer1[i].w[j] + 1
: buffer0[i].w[j] + 2;
}
pred = !pred;
}
check_output_w(__LINE__, BUFSIZE);
}
static void test_load_cur_predicated(void)
{
bool pred = true;
for (int i = 0; i < BUFSIZE; i++) {
asm volatile("p0 = !cmp.eq(%3, #0)\n\t"
"v3 = vmem(%0+#0)\n\t"
/*
* Preload v4 to make sure that the assignment from the
* packet below is not being ignored when pred is false.
*/
"r0 = #0x01237654\n\t"
"v4 = vsplat(r0)\n\t"
"{\n\t"
" if (p0) v3.cur = vmem(%1+#0)\n\t"
" v4 = v3\n\t"
"}\n\t"
"vmem(%2+#0) = v4\n\t"
:
: "r"(&buffer0[i]), "r"(&buffer1[i]),
"r"(&output[i]), "r"(pred)
: "r0", "p0", "v3", "v4", "memory");
expect[i] = pred ? buffer1[i] : buffer0[i];
pred = !pred;
}
check_output_w(__LINE__, BUFSIZE);
}
int main()
{
init_buffers();
test_load_tmp();
test_load_cur();
test_load_aligned();
test_load_unaligned();
test_store_aligned();
test_store_unaligned();
test_masked_store(false);
test_masked_store(true);
test_new_value_store();
test_max_temps();
test_vadd_w();
test_vadd_h();
test_vadd_b();
test_vsub_w();
test_vsub_h();
test_vsub_b();
test_vxor();
test_vand();
test_vor();
test_vnot();
test_pred_or(false);
test_pred_or_n(true);
test_pred_and(false);
test_pred_and_n(true);
test_pred_xor(false);
test_vadduwsat();
test_vsubuwsat_dv();
test_vshuff();
test_load_tmp_predicated();
test_load_cur_predicated();
puts(err ? "FAIL" : "PASS");
return err ? 1 : 0;
}