c367938540
Replace __builtin_* with inline assembly The __builtin's are subject to change with different compiler releases, so might break Mark arrays as aligned when accessed as HVX vectors Clean up comments Signed-off-by: Taylor Simpson <tsimpson@quicinc.com> Reviewed-by: Anton Johansson <anjo@rev.ng> Message-Id: <20230307025828.1612809-10-tsimpson@quicinc.com>
1041 lines
32 KiB
C
1041 lines
32 KiB
C
/*
|
|
* Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
/*
|
|
* This example tests the HVX scatter/gather instructions
|
|
*
|
|
* See section 5.13 of the V68 HVX Programmer's Reference
|
|
*
|
|
* There are 3 main classes of operations
|
|
* _16 16-bit elements and 16-bit offsets
|
|
* _32 32-bit elements and 32-bit offsets
|
|
* _16_32 16-bit elements and 32-bit offsets
|
|
*
|
|
* There are also masked and accumulate versions
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include <inttypes.h>
|
|
|
|
/* one HVX vector: 128 bytes of data, 128-byte aligned */
typedef long HVX_Vector
    __attribute__((__vector_size__(128), aligned(128)));
/* a pair of HVX vectors: 256 bytes of data, 128-byte aligned */
typedef long HVX_VectorPair
    __attribute__((__vector_size__(256), aligned(128)));
/* declared with the same size and alignment as HVX_Vector */
typedef long HVX_VectorPred
    __attribute__((__vector_size__(128), aligned(128)));
|
|
|
|
/* running count of mismatches reported by check_buffer() */
int err = 0;

/* number of rows (and columns) of the square matrix */
#define MATRIX_SIZE 64

/* number of elements in a scatter buffer: one full matrix */
#define SCATTER_BUFFER_SIZE (MATRIX_SIZE * MATRIX_SIZE)
|
|
|
|
/* fake vtcm - put buffers together and force alignment */
|
|
static struct {
|
|
unsigned short vscatter16[SCATTER_BUFFER_SIZE];
|
|
unsigned short vgather16[MATRIX_SIZE];
|
|
unsigned int vscatter32[SCATTER_BUFFER_SIZE];
|
|
unsigned int vgather32[MATRIX_SIZE];
|
|
unsigned short vscatter16_32[SCATTER_BUFFER_SIZE];
|
|
unsigned short vgather16_32[MATRIX_SIZE];
|
|
} vtcm __attribute__((aligned(0x10000)));
|
|
|
|
/* declare the arrays of reference values */
|
|
unsigned short vscatter16_ref[SCATTER_BUFFER_SIZE];
|
|
unsigned short vgather16_ref[MATRIX_SIZE];
|
|
unsigned int vscatter32_ref[SCATTER_BUFFER_SIZE];
|
|
unsigned int vgather32_ref[MATRIX_SIZE];
|
|
unsigned short vscatter16_32_ref[SCATTER_BUFFER_SIZE];
|
|
unsigned short vgather16_32_ref[MATRIX_SIZE];
|
|
|
|
/* declare the arrays of offsets */
|
|
unsigned short half_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
unsigned int word_offsets[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
|
|
/* declare the arrays of values */
|
|
unsigned short half_values[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
unsigned short half_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
unsigned short half_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
unsigned int word_values[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
unsigned int word_values_acc[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
unsigned int word_values_masked[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
|
|
/* declare the arrays of predicates */
|
|
unsigned short half_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
unsigned int word_predicates[MATRIX_SIZE] __attribute__((aligned(128)));
|
|
|
|
/* make this big enough for all the operations */
|
|
const size_t region_len = sizeof(vtcm);
|
|
|
|
/* optionally add sync instructions */
|
|
#define SYNC_VECTOR 1
|
|
|
|
static void sync_scatter(void *addr)
|
|
{
|
|
#if SYNC_VECTOR
|
|
/*
|
|
* Do the scatter release followed by a dummy load to complete the
|
|
* synchronization. Normally the dummy load would be deferred as
|
|
* long as possible to minimize stalls.
|
|
*/
|
|
asm volatile("vmem(%0 + #0):scatter_release\n" : : "r"(addr));
|
|
/* use volatile to force the load */
|
|
volatile HVX_Vector vDummy = *(HVX_Vector *)addr; vDummy = vDummy;
|
|
#endif
|
|
}
|
|
|
|
/*
 * Synchronize after a gather into the buffer at addr by performing a dummy
 * vector load from it.  Does nothing when SYNC_VECTOR is 0.
 */
static void sync_gather(void *addr)
{
#if SYNC_VECTOR
    /* the volatile destination keeps the load from being optimized away */
    volatile HVX_Vector dummy = *(HVX_Vector *)addr;
    dummy = dummy;
#endif
}
|
|
|
|
/* optionally print the results */
|
|
#define PRINT_DATA 0
|
|
|
|
#define FILL_CHAR '.'
|
|
|
|
/* fill vtcm scratch with ee */
|
|
void prefill_vtcm_scratch(void)
|
|
{
|
|
memset(&vtcm, FILL_CHAR, sizeof(vtcm));
|
|
}
|
|
|
|
/* create byte offsets to be a diagonal of the matrix with 16 bit elements */
|
|
void create_offsets_values_preds_16(void)
|
|
{
|
|
unsigned short half_element = 0;
|
|
unsigned short half_element_masked = 0;
|
|
char letter = 'A';
|
|
char letter_masked = '@';
|
|
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
half_offsets[i] = i * (2 * MATRIX_SIZE + 2);
|
|
|
|
half_element = 0;
|
|
half_element_masked = 0;
|
|
for (int j = 0; j < 2; j++) {
|
|
half_element |= letter << j * 8;
|
|
half_element_masked |= letter_masked << j * 8;
|
|
}
|
|
|
|
half_values[i] = half_element;
|
|
half_values_acc[i] = ((i % 10) << 8) + (i % 10);
|
|
half_values_masked[i] = half_element_masked;
|
|
|
|
letter++;
|
|
/* reset to 'A' */
|
|
if (letter == 'M') {
|
|
letter = 'A';
|
|
}
|
|
|
|
half_predicates[i] = (i % 3 == 0 || i % 5 == 0) ? ~0 : 0;
|
|
}
|
|
}
|
|
|
|
/* create byte offsets to be a diagonal of the matrix with 32 bit elements */
|
|
void create_offsets_values_preds_32(void)
|
|
{
|
|
unsigned int word_element = 0;
|
|
unsigned int word_element_masked = 0;
|
|
char letter = 'A';
|
|
char letter_masked = '&';
|
|
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
word_offsets[i] = i * (4 * MATRIX_SIZE + 4);
|
|
|
|
word_element = 0;
|
|
word_element_masked = 0;
|
|
for (int j = 0; j < 4; j++) {
|
|
word_element |= letter << j * 8;
|
|
word_element_masked |= letter_masked << j * 8;
|
|
}
|
|
|
|
word_values[i] = word_element;
|
|
word_values_acc[i] = ((i % 10) << 8) + (i % 10);
|
|
word_values_masked[i] = word_element_masked;
|
|
|
|
letter++;
|
|
/* reset to 'A' */
|
|
if (letter == 'M') {
|
|
letter = 'A';
|
|
}
|
|
|
|
word_predicates[i] = (i % 4 == 0 || i % 7 == 0) ? ~0 : 0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* create byte offsets to be a diagonal of the matrix with 16 bit elements
|
|
* and 32 bit offsets
|
|
*/
|
|
void create_offsets_values_preds_16_32(void)
|
|
{
|
|
unsigned short half_element = 0;
|
|
unsigned short half_element_masked = 0;
|
|
char letter = 'D';
|
|
char letter_masked = '$';
|
|
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
word_offsets[i] = i * (2 * MATRIX_SIZE + 2);
|
|
|
|
half_element = 0;
|
|
half_element_masked = 0;
|
|
for (int j = 0; j < 2; j++) {
|
|
half_element |= letter << j * 8;
|
|
half_element_masked |= letter_masked << j * 8;
|
|
}
|
|
|
|
half_values[i] = half_element;
|
|
half_values_acc[i] = ((i % 10) << 8) + (i % 10);
|
|
half_values_masked[i] = half_element_masked;
|
|
|
|
letter++;
|
|
/* reset to 'A' */
|
|
if (letter == 'P') {
|
|
letter = 'D';
|
|
}
|
|
|
|
half_predicates[i] = (i % 2 == 0 || i % 13 == 0) ? ~0 : 0;
|
|
}
|
|
}
|
|
|
|
/* scatter the 16 bit elements using HVX */
|
|
void vector_scatter_16(void)
|
|
{
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%3 + #0)\n\t"
|
|
"vscatter(%0, m0, v0.h).h = v1\n\t"
|
|
: : "r"(vtcm.vscatter16), "r"(region_len),
|
|
"r"(half_offsets), "r"(half_values)
|
|
: "m0", "v0", "v1", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter16);
|
|
}
|
|
|
|
/* scatter-accumulate the 16 bit elements using HVX */
|
|
void vector_scatter_16_acc(void)
|
|
{
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%3 + #0)\n\t"
|
|
"vscatter(%0, m0, v0.h).h += v1\n\t"
|
|
: : "r"(vtcm.vscatter16), "r"(region_len),
|
|
"r"(half_offsets), "r"(half_values_acc)
|
|
: "m0", "v0", "v1", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter16);
|
|
}
|
|
|
|
/* masked scatter the 16 bit elements using HVX */
|
|
void vector_scatter_16_masked(void)
|
|
{
|
|
asm ("r1 = #-1\n\t"
|
|
"v0 = vmem(%0 + #0)\n\t"
|
|
"q0 = vand(v0, r1)\n\t"
|
|
"m0 = %2\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"v1 = vmem(%4 + #0)\n\t"
|
|
"if (q0) vscatter(%1, m0, v0.h).h = v1\n\t"
|
|
: : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
|
|
"r"(half_offsets), "r"(half_values_masked)
|
|
: "r1", "q0", "m0", "q0", "v0", "v1", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter16);
|
|
}
|
|
|
|
/* scatter the 32 bit elements using HVX */
|
|
void vector_scatter_32(void)
|
|
{
|
|
HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
|
|
HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
|
|
HVX_Vector *valueslo = (HVX_Vector *)word_values;
|
|
HVX_Vector *valueshi = (HVX_Vector *)&word_values[MATRIX_SIZE / 2];
|
|
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%3 + #0)\n\t"
|
|
"vscatter(%0, m0, v0.w).w = v1\n\t"
|
|
: : "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetslo), "r"(valueslo)
|
|
: "m0", "v0", "v1", "memory");
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%3 + #0)\n\t"
|
|
"vscatter(%0, m0, v0.w).w = v1\n\t"
|
|
: : "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetshi), "r"(valueshi)
|
|
: "m0", "v0", "v1", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter32);
|
|
}
|
|
|
|
/* scatter-accumulate the 32 bit elements using HVX */
|
|
void vector_scatter_32_acc(void)
|
|
{
|
|
HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
|
|
HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
|
|
HVX_Vector *valueslo = (HVX_Vector *)word_values_acc;
|
|
HVX_Vector *valueshi = (HVX_Vector *)&word_values_acc[MATRIX_SIZE / 2];
|
|
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%3 + #0)\n\t"
|
|
"vscatter(%0, m0, v0.w).w += v1\n\t"
|
|
: : "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetslo), "r"(valueslo)
|
|
: "m0", "v0", "v1", "memory");
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%3 + #0)\n\t"
|
|
"vscatter(%0, m0, v0.w).w += v1\n\t"
|
|
: : "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetshi), "r"(valueshi)
|
|
: "m0", "v0", "v1", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter32);
|
|
}
|
|
|
|
/* masked scatter the 32 bit elements using HVX */
|
|
void vector_scatter_32_masked(void)
|
|
{
|
|
HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
|
|
HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
|
|
HVX_Vector *valueslo = (HVX_Vector *)word_values_masked;
|
|
HVX_Vector *valueshi = (HVX_Vector *)&word_values_masked[MATRIX_SIZE / 2];
|
|
HVX_Vector *predslo = (HVX_Vector *)word_predicates;
|
|
HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
|
|
|
|
asm ("r1 = #-1\n\t"
|
|
"v0 = vmem(%0 + #0)\n\t"
|
|
"q0 = vand(v0, r1)\n\t"
|
|
"m0 = %2\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"v1 = vmem(%4 + #0)\n\t"
|
|
"if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
|
|
: : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetslo), "r"(valueslo)
|
|
: "r1", "q0", "m0", "q0", "v0", "v1", "memory");
|
|
asm ("r1 = #-1\n\t"
|
|
"v0 = vmem(%0 + #0)\n\t"
|
|
"q0 = vand(v0, r1)\n\t"
|
|
"m0 = %2\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"v1 = vmem(%4 + #0)\n\t"
|
|
"if (q0) vscatter(%1, m0, v0.w).w = v1\n\t"
|
|
: : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetshi), "r"(valueshi)
|
|
: "r1", "q0", "m0", "q0", "v0", "v1", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter32);
|
|
}
|
|
|
|
/* scatter the 16 bit elements with 32 bit offsets using HVX */
|
|
void vector_scatter_16_32(void)
|
|
{
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%2 + #1)\n\t"
|
|
"v2 = vmem(%3 + #0)\n\t"
|
|
"v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
|
|
"vscatter(%0, m0, v1:0.w).h = v2\n\t"
|
|
: : "r"(vtcm.vscatter16_32), "r"(region_len),
|
|
"r"(word_offsets), "r"(half_values)
|
|
: "m0", "v0", "v1", "v2", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter16_32);
|
|
}
|
|
|
|
/* scatter-accumulate the 16 bit elements with 32 bit offsets using HVX */
|
|
void vector_scatter_16_32_acc(void)
|
|
{
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%2 + #1)\n\t"
|
|
"v2 = vmem(%3 + #0)\n\t" \
|
|
"v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
|
|
"vscatter(%0, m0, v1:0.w).h += v2\n\t"
|
|
: : "r"(vtcm.vscatter16_32), "r"(region_len),
|
|
"r"(word_offsets), "r"(half_values_acc)
|
|
: "m0", "v0", "v1", "v2", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter16_32);
|
|
}
|
|
|
|
/* masked scatter the 16 bit elements with 32 bit offsets using HVX */
|
|
void vector_scatter_16_32_masked(void)
|
|
{
|
|
asm ("r1 = #-1\n\t"
|
|
"v0 = vmem(%0 + #0)\n\t"
|
|
"v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */
|
|
"q0 = vand(v0, r1)\n\t"
|
|
"m0 = %2\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"v1 = vmem(%3 + #1)\n\t"
|
|
"v2 = vmem(%4 + #0)\n\t" \
|
|
"v2.h = vshuff(v2.h)\n\t" /* shuffle the values for the scatter */
|
|
"if (q0) vscatter(%1, m0, v1:0.w).h = v2\n\t"
|
|
: : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
|
|
"r"(word_offsets), "r"(half_values_masked)
|
|
: "r1", "q0", "m0", "v0", "v1", "v2", "memory");
|
|
|
|
sync_scatter(vtcm.vscatter16_32);
|
|
}
|
|
|
|
/* gather the elements from the scatter16 buffer using HVX */
|
|
void vector_gather_16(void)
|
|
{
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"{ vtmp.h = vgather(%0, m0, v0.h).h\n\t"
|
|
" vmem(%3 + #0) = vtmp.new }\n\t"
|
|
: : "r"(vtcm.vscatter16), "r"(region_len),
|
|
"r"(half_offsets), "r"(vtcm.vgather16)
|
|
: "m0", "v0", "memory");
|
|
|
|
sync_gather(vtcm.vgather16);
|
|
}
|
|
|
|
/*
 * Value used to pre-fill the destination of a masked 16-bit gather:
 * the byte '?' replicated in both bytes of a halfword.
 *
 * The arithmetic is done on an unsigned value so that the shift never
 * operates on a promoted, possibly signed, char.
 */
static unsigned short gather_16_masked_init(void)
{
    const unsigned int fill = (unsigned char)'?';

    return (unsigned short)(fill | (fill << 8));
}
|
|
|
|
/* masked gather the elements from the scatter16 buffer using HVX */
|
|
void vector_gather_16_masked(void)
|
|
{
|
|
unsigned short init = gather_16_masked_init();
|
|
|
|
asm ("v0.h = vsplat(%5)\n\t"
|
|
"vmem(%4 + #0) = v0\n\t" /* initialize the write area */
|
|
"r1 = #-1\n\t"
|
|
"v0 = vmem(%0 + #0)\n\t"
|
|
"q0 = vand(v0, r1)\n\t"
|
|
"m0 = %2\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"{ if (q0) vtmp.h = vgather(%1, m0, v0.h).h\n\t"
|
|
" vmem(%4 + #0) = vtmp.new }\n\t"
|
|
: : "r"(half_predicates), "r"(vtcm.vscatter16), "r"(region_len),
|
|
"r"(half_offsets), "r"(vtcm.vgather16), "r"(init)
|
|
: "r1", "q0", "m0", "v0", "memory");
|
|
|
|
sync_gather(vtcm.vgather16);
|
|
}
|
|
|
|
/* gather the elements from the scatter32 buffer using HVX */
|
|
void vector_gather_32(void)
|
|
{
|
|
HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
|
|
HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
|
|
HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
|
|
HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
|
|
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
|
|
" vmem(%3 + #0) = vtmp.new }\n\t"
|
|
: : "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetslo), "r"(vgatherlo)
|
|
: "m0", "v0", "memory");
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"{ vtmp.w = vgather(%0, m0, v0.w).w\n\t"
|
|
" vmem(%3 + #0) = vtmp.new }\n\t"
|
|
: : "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetshi), "r"(vgatherhi)
|
|
: "m0", "v0", "memory");
|
|
|
|
sync_gather(vgatherlo);
|
|
sync_gather(vgatherhi);
|
|
}
|
|
|
|
/*
 * Value used to pre-fill the destination of a masked 32-bit gather:
 * the byte '?' replicated in all four bytes of a word.
 *
 * The arithmetic is done on an unsigned value: shifting a promoted char
 * left by 24 would be undefined for any fill byte of 0x80 or above.
 */
static unsigned int gather_32_masked_init(void)
{
    const unsigned int fill = (unsigned char)'?';

    return fill | (fill << 8) | (fill << 16) | (fill << 24);
}
|
|
|
|
/* masked gather the elements from the scatter32 buffer using HVX */
|
|
void vector_gather_32_masked(void)
|
|
{
|
|
unsigned int init = gather_32_masked_init();
|
|
HVX_Vector *vgatherlo = (HVX_Vector *)vtcm.vgather32;
|
|
HVX_Vector *vgatherhi = (HVX_Vector *)&vtcm.vgather32[MATRIX_SIZE / 2];
|
|
HVX_Vector *offsetslo = (HVX_Vector *)word_offsets;
|
|
HVX_Vector *offsetshi = (HVX_Vector *)&word_offsets[MATRIX_SIZE / 2];
|
|
HVX_Vector *predslo = (HVX_Vector *)word_predicates;
|
|
HVX_Vector *predshi = (HVX_Vector *)&word_predicates[MATRIX_SIZE / 2];
|
|
|
|
asm ("v0.h = vsplat(%5)\n\t"
|
|
"vmem(%4 + #0) = v0\n\t" /* initialize the write area */
|
|
"r1 = #-1\n\t"
|
|
"v0 = vmem(%0 + #0)\n\t"
|
|
"q0 = vand(v0, r1)\n\t"
|
|
"m0 = %2\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
|
|
" vmem(%4 + #0) = vtmp.new }\n\t"
|
|
: : "r"(predslo), "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetslo), "r"(vgatherlo), "r"(init)
|
|
: "r1", "q0", "m0", "v0", "memory");
|
|
asm ("v0.h = vsplat(%5)\n\t"
|
|
"vmem(%4 + #0) = v0\n\t" /* initialize the write area */
|
|
"r1 = #-1\n\t"
|
|
"v0 = vmem(%0 + #0)\n\t"
|
|
"q0 = vand(v0, r1)\n\t"
|
|
"m0 = %2\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"{ if (q0) vtmp.w = vgather(%1, m0, v0.w).w\n\t"
|
|
" vmem(%4 + #0) = vtmp.new }\n\t"
|
|
: : "r"(predshi), "r"(vtcm.vscatter32), "r"(region_len),
|
|
"r"(offsetshi), "r"(vgatherhi), "r"(init)
|
|
: "r1", "q0", "m0", "v0", "memory");
|
|
|
|
sync_gather(vgatherlo);
|
|
sync_gather(vgatherhi);
|
|
}
|
|
|
|
/* gather the elements from the scatter16_32 buffer using HVX */
|
|
void vector_gather_16_32(void)
|
|
{
|
|
asm ("m0 = %1\n\t"
|
|
"v0 = vmem(%2 + #0)\n\t"
|
|
"v1 = vmem(%2 + #1)\n\t"
|
|
"{ vtmp.h = vgather(%0, m0, v1:0.w).h\n\t"
|
|
" vmem(%3 + #0) = vtmp.new }\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */
|
|
"vmem(%3 + #0) = v0\n\t"
|
|
: : "r"(vtcm.vscatter16_32), "r"(region_len),
|
|
"r"(word_offsets), "r"(vtcm.vgather16_32)
|
|
: "m0", "v0", "v1", "memory");
|
|
|
|
sync_gather(vtcm.vgather16_32);
|
|
}
|
|
|
|
/* masked gather the elements from the scatter16_32 buffer using HVX */
|
|
void vector_gather_16_32_masked(void)
|
|
{
|
|
unsigned short init = gather_16_masked_init();
|
|
|
|
asm ("v0.h = vsplat(%5)\n\t"
|
|
"vmem(%4 + #0) = v0\n\t" /* initialize the write area */
|
|
"r1 = #-1\n\t"
|
|
"v0 = vmem(%0 + #0)\n\t"
|
|
"v0.h = vshuff(v0.h)\n\t" /* shuffle the predicates */
|
|
"q0 = vand(v0, r1)\n\t"
|
|
"m0 = %2\n\t"
|
|
"v0 = vmem(%3 + #0)\n\t"
|
|
"v1 = vmem(%3 + #1)\n\t"
|
|
"{ if (q0) vtmp.h = vgather(%1, m0, v1:0.w).h\n\t"
|
|
" vmem(%4 + #0) = vtmp.new }\n\t"
|
|
"v0 = vmem(%4 + #0)\n\t"
|
|
"v0.h = vdeal(v0.h)\n\t" /* deal the elements to get the order back */
|
|
"vmem(%4 + #0) = v0\n\t"
|
|
: : "r"(half_predicates), "r"(vtcm.vscatter16_32), "r"(region_len),
|
|
"r"(word_offsets), "r"(vtcm.vgather16_32), "r"(init)
|
|
: "r1", "q0", "m0", "v0", "v1", "memory");
|
|
|
|
sync_gather(vtcm.vgather16_32);
|
|
}
|
|
|
|
static void check_buffer(const char *name, void *c, void *r, size_t size)
|
|
{
|
|
char *check = (char *)c;
|
|
char *ref = (char *)r;
|
|
for (int i = 0; i < size; i++) {
|
|
if (check[i] != ref[i]) {
|
|
printf("ERROR %s [%d]: 0x%x (%c) != 0x%x (%c)\n", name, i,
|
|
check[i], check[i], ref[i], ref[i]);
|
|
err++;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* These scalar functions are the C equivalents of the vector functions that
|
|
* use HVX
|
|
*/
|
|
|
|
/* scatter the 16 bit elements using C */
|
|
void scalar_scatter_16(unsigned short *vscatter16)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vscatter16[half_offsets[i] / 2] = half_values[i];
|
|
}
|
|
}
|
|
|
|
void check_scatter_16()
|
|
{
|
|
memset(vscatter16_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
scalar_scatter_16(vscatter16_ref);
|
|
check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* scatter the 16 bit elements using C */
|
|
void scalar_scatter_16_acc(unsigned short *vscatter16)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vscatter16[half_offsets[i] / 2] += half_values_acc[i];
|
|
}
|
|
}
|
|
|
|
/* scatter-accumulate the 16 bit elements using C */
|
|
void check_scatter_16_acc()
|
|
{
|
|
memset(vscatter16_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
scalar_scatter_16(vscatter16_ref);
|
|
scalar_scatter_16_acc(vscatter16_ref);
|
|
check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* masked scatter the 16 bit elements using C */
|
|
void scalar_scatter_16_masked(unsigned short *vscatter16)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
if (half_predicates[i]) {
|
|
vscatter16[half_offsets[i] / 2] = half_values_masked[i];
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
void check_scatter_16_masked()
|
|
{
|
|
memset(vscatter16_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
scalar_scatter_16(vscatter16_ref);
|
|
scalar_scatter_16_acc(vscatter16_ref);
|
|
scalar_scatter_16_masked(vscatter16_ref);
|
|
check_buffer(__func__, vtcm.vscatter16, vscatter16_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* scatter the 32 bit elements using C */
|
|
void scalar_scatter_32(unsigned int *vscatter32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vscatter32[word_offsets[i] / 4] = word_values[i];
|
|
}
|
|
}
|
|
|
|
void check_scatter_32()
|
|
{
|
|
memset(vscatter32_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned int));
|
|
scalar_scatter_32(vscatter32_ref);
|
|
check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned int));
|
|
}
|
|
|
|
/* scatter-accumulate the 32 bit elements using C */
|
|
void scalar_scatter_32_acc(unsigned int *vscatter32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vscatter32[word_offsets[i] / 4] += word_values_acc[i];
|
|
}
|
|
}
|
|
|
|
void check_scatter_32_acc()
|
|
{
|
|
memset(vscatter32_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned int));
|
|
scalar_scatter_32(vscatter32_ref);
|
|
scalar_scatter_32_acc(vscatter32_ref);
|
|
check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned int));
|
|
}
|
|
|
|
/* masked scatter the 32 bit elements using C */
|
|
void scalar_scatter_32_masked(unsigned int *vscatter32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
if (word_predicates[i]) {
|
|
vscatter32[word_offsets[i] / 4] = word_values_masked[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
void check_scatter_32_masked()
|
|
{
|
|
memset(vscatter32_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned int));
|
|
scalar_scatter_32(vscatter32_ref);
|
|
scalar_scatter_32_acc(vscatter32_ref);
|
|
scalar_scatter_32_masked(vscatter32_ref);
|
|
check_buffer(__func__, vtcm.vscatter32, vscatter32_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned int));
|
|
}
|
|
|
|
/* scatter the 16 bit elements with 32 bit offsets using C */
|
|
void scalar_scatter_16_32(unsigned short *vscatter16_32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vscatter16_32[word_offsets[i] / 2] = half_values[i];
|
|
}
|
|
}
|
|
|
|
void check_scatter_16_32()
|
|
{
|
|
memset(vscatter16_32_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
scalar_scatter_16_32(vscatter16_32_ref);
|
|
check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* scatter-accumulate the 16 bit elements with 32 bit offsets using C */
|
|
void scalar_scatter_16_32_acc(unsigned short *vscatter16_32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vscatter16_32[word_offsets[i] / 2] += half_values_acc[i];
|
|
}
|
|
}
|
|
|
|
void check_scatter_16_32_acc()
|
|
{
|
|
memset(vscatter16_32_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
scalar_scatter_16_32(vscatter16_32_ref);
|
|
scalar_scatter_16_32_acc(vscatter16_32_ref);
|
|
check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* masked scatter the 16 bit elements with 32 bit offsets using C */
|
|
void scalar_scatter_16_32_masked(unsigned short *vscatter16_32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
if (half_predicates[i]) {
|
|
vscatter16_32[word_offsets[i] / 2] = half_values_masked[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
void check_scatter_16_32_masked()
|
|
{
|
|
memset(vscatter16_32_ref, FILL_CHAR,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
scalar_scatter_16_32(vscatter16_32_ref);
|
|
scalar_scatter_16_32_acc(vscatter16_32_ref);
|
|
scalar_scatter_16_32_masked(vscatter16_32_ref);
|
|
check_buffer(__func__, vtcm.vscatter16_32, vscatter16_32_ref,
|
|
SCATTER_BUFFER_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* gather the elements from the scatter buffer using C */
|
|
void scalar_gather_16(unsigned short *vgather16)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
|
|
}
|
|
}
|
|
|
|
void check_gather_16()
|
|
{
|
|
memset(vgather16_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
|
|
scalar_gather_16(vgather16_ref);
|
|
check_buffer(__func__, vtcm.vgather16, vgather16_ref,
|
|
MATRIX_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* masked gather the elements from the scatter buffer using C */
|
|
void scalar_gather_16_masked(unsigned short *vgather16)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
if (half_predicates[i]) {
|
|
vgather16[i] = vtcm.vscatter16[half_offsets[i] / 2];
|
|
}
|
|
}
|
|
}
|
|
|
|
void check_gather_16_masked()
|
|
{
|
|
memset(vgather16_ref, gather_16_masked_init(),
|
|
MATRIX_SIZE * sizeof(unsigned short));
|
|
scalar_gather_16_masked(vgather16_ref);
|
|
check_buffer(__func__, vtcm.vgather16, vgather16_ref,
|
|
MATRIX_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* gather the elements from the scatter32 buffer using C */
|
|
void scalar_gather_32(unsigned int *vgather32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
|
|
}
|
|
}
|
|
|
|
void check_gather_32(void)
|
|
{
|
|
memset(vgather32_ref, 0, MATRIX_SIZE * sizeof(unsigned int));
|
|
scalar_gather_32(vgather32_ref);
|
|
check_buffer(__func__, vtcm.vgather32, vgather32_ref,
|
|
MATRIX_SIZE * sizeof(unsigned int));
|
|
}
|
|
|
|
/* masked gather the elements from the scatter32 buffer using C */
|
|
void scalar_gather_32_masked(unsigned int *vgather32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
if (word_predicates[i]) {
|
|
vgather32[i] = vtcm.vscatter32[word_offsets[i] / 4];
|
|
}
|
|
}
|
|
}
|
|
|
|
void check_gather_32_masked(void)
|
|
{
|
|
memset(vgather32_ref, gather_32_masked_init(),
|
|
MATRIX_SIZE * sizeof(unsigned int));
|
|
scalar_gather_32_masked(vgather32_ref);
|
|
check_buffer(__func__, vtcm.vgather32,
|
|
vgather32_ref, MATRIX_SIZE * sizeof(unsigned int));
|
|
}
|
|
|
|
/* gather the elements from the scatter16_32 buffer using C */
|
|
void scalar_gather_16_32(unsigned short *vgather16_32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
|
|
}
|
|
}
|
|
|
|
void check_gather_16_32(void)
|
|
{
|
|
memset(vgather16_32_ref, 0, MATRIX_SIZE * sizeof(unsigned short));
|
|
scalar_gather_16_32(vgather16_32_ref);
|
|
check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
|
|
MATRIX_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* masked gather the elements from the scatter16_32 buffer using C */
|
|
void scalar_gather_16_32_masked(unsigned short *vgather16_32)
|
|
{
|
|
for (int i = 0; i < MATRIX_SIZE; ++i) {
|
|
if (half_predicates[i]) {
|
|
vgather16_32[i] = vtcm.vscatter16_32[word_offsets[i] / 2];
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
void check_gather_16_32_masked(void)
|
|
{
|
|
memset(vgather16_32_ref, gather_16_masked_init(),
|
|
MATRIX_SIZE * sizeof(unsigned short));
|
|
scalar_gather_16_32_masked(vgather16_32_ref);
|
|
check_buffer(__func__, vtcm.vgather16_32, vgather16_32_ref,
|
|
MATRIX_SIZE * sizeof(unsigned short));
|
|
}
|
|
|
|
/* print scatter16 buffer */
|
|
void print_scatter16_buffer(void)
|
|
{
|
|
if (PRINT_DATA) {
|
|
printf("\n\nPrinting the 16 bit scatter buffer");
|
|
|
|
for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
|
|
if ((i % MATRIX_SIZE) == 0) {
|
|
printf("\n");
|
|
}
|
|
for (int j = 0; j < 2; j++) {
|
|
printf("%c", (char)((vtcm.vscatter16[i] >> j * 8) & 0xff));
|
|
}
|
|
printf(" ");
|
|
}
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
/* print the gather 16 buffer */
|
|
void print_gather_result_16(void)
|
|
{
|
|
if (PRINT_DATA) {
|
|
printf("\n\nPrinting the 16 bit gather result\n");
|
|
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
for (int j = 0; j < 2; j++) {
|
|
printf("%c", (char)((vtcm.vgather16[i] >> j * 8) & 0xff));
|
|
}
|
|
printf(" ");
|
|
}
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
/* print the scatter32 buffer */
|
|
void print_scatter32_buffer(void)
|
|
{
|
|
if (PRINT_DATA) {
|
|
printf("\n\nPrinting the 32 bit scatter buffer");
|
|
|
|
for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
|
|
if ((i % MATRIX_SIZE) == 0) {
|
|
printf("\n");
|
|
}
|
|
for (int j = 0; j < 4; j++) {
|
|
printf("%c", (char)((vtcm.vscatter32[i] >> j * 8) & 0xff));
|
|
}
|
|
printf(" ");
|
|
}
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
/* print the gather 32 buffer */
|
|
void print_gather_result_32(void)
|
|
{
|
|
if (PRINT_DATA) {
|
|
printf("\n\nPrinting the 32 bit gather result\n");
|
|
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
for (int j = 0; j < 4; j++) {
|
|
printf("%c", (char)((vtcm.vgather32[i] >> j * 8) & 0xff));
|
|
}
|
|
printf(" ");
|
|
}
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
/* print the scatter16_32 buffer */
|
|
void print_scatter16_32_buffer(void)
|
|
{
|
|
if (PRINT_DATA) {
|
|
printf("\n\nPrinting the 16_32 bit scatter buffer");
|
|
|
|
for (int i = 0; i < SCATTER_BUFFER_SIZE; i++) {
|
|
if ((i % MATRIX_SIZE) == 0) {
|
|
printf("\n");
|
|
}
|
|
for (int j = 0; j < 2; j++) {
|
|
printf("%c",
|
|
(unsigned char)((vtcm.vscatter16_32[i] >> j * 8) & 0xff));
|
|
}
|
|
printf(" ");
|
|
}
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
/* print the gather 16_32 buffer */
|
|
void print_gather_result_16_32(void)
|
|
{
|
|
if (PRINT_DATA) {
|
|
printf("\n\nPrinting the 16_32 bit gather result\n");
|
|
|
|
for (int i = 0; i < MATRIX_SIZE; i++) {
|
|
for (int j = 0; j < 2; j++) {
|
|
printf("%c",
|
|
(unsigned char)((vtcm.vgather16_32[i] >> j * 8) & 0xff));
|
|
}
|
|
printf(" ");
|
|
}
|
|
printf("\n");
|
|
}
|
|
}
|
|
|
|
int main()
|
|
{
|
|
prefill_vtcm_scratch();
|
|
|
|
/* 16 bit elements with 16 bit offsets */
|
|
create_offsets_values_preds_16();
|
|
|
|
vector_scatter_16();
|
|
print_scatter16_buffer();
|
|
check_scatter_16();
|
|
|
|
vector_gather_16();
|
|
print_gather_result_16();
|
|
check_gather_16();
|
|
|
|
vector_gather_16_masked();
|
|
print_gather_result_16();
|
|
check_gather_16_masked();
|
|
|
|
vector_scatter_16_acc();
|
|
print_scatter16_buffer();
|
|
check_scatter_16_acc();
|
|
|
|
vector_scatter_16_masked();
|
|
print_scatter16_buffer();
|
|
check_scatter_16_masked();
|
|
|
|
/* 32 bit elements with 32 bit offsets */
|
|
create_offsets_values_preds_32();
|
|
|
|
vector_scatter_32();
|
|
print_scatter32_buffer();
|
|
check_scatter_32();
|
|
|
|
vector_gather_32();
|
|
print_gather_result_32();
|
|
check_gather_32();
|
|
|
|
vector_gather_32_masked();
|
|
print_gather_result_32();
|
|
check_gather_32_masked();
|
|
|
|
vector_scatter_32_acc();
|
|
print_scatter32_buffer();
|
|
check_scatter_32_acc();
|
|
|
|
vector_scatter_32_masked();
|
|
print_scatter32_buffer();
|
|
check_scatter_32_masked();
|
|
|
|
/* 16 bit elements with 32 bit offsets */
|
|
create_offsets_values_preds_16_32();
|
|
|
|
vector_scatter_16_32();
|
|
print_scatter16_32_buffer();
|
|
check_scatter_16_32();
|
|
|
|
vector_gather_16_32();
|
|
print_gather_result_16_32();
|
|
check_gather_16_32();
|
|
|
|
vector_gather_16_32_masked();
|
|
print_gather_result_16_32();
|
|
check_gather_16_32_masked();
|
|
|
|
vector_scatter_16_32_acc();
|
|
print_scatter16_32_buffer();
|
|
check_scatter_16_32_acc();
|
|
|
|
vector_scatter_16_32_masked();
|
|
print_scatter16_32_buffer();
|
|
check_scatter_16_32_masked();
|
|
|
|
puts(err ? "FAIL" : "PASS");
|
|
return err;
|
|
}
|