libweston: Optimise matrix multiplication
The previous version used div() to split the index of the current element into its row and column, but div() is implemented as a libc call, which prevented the compiler from vectorising the loop and made matrix multiplication appear quite high in profiles. With div() removed, we are down from 64 vfmadd132ss instructions, each acting on one float at a time, to just 8 vfmadd132ps instructions when compiled with AVX2 support (or 16 mulps and 16 addps with SSE2 support only), and the function isn’t a hot spot any more.
Signed-off-by: Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
This commit is contained in:
parent
102acac6a9
commit
e13c99690b
|
@ -61,16 +61,16 @@ weston_matrix_multiply(struct weston_matrix *m, const struct weston_matrix *n)
|
|||
{
|
||||
struct weston_matrix tmp;
|
||||
const float *row, *column;
|
||||
div_t d;
|
||||
int i, j;
|
||||
int i, j, k;
|
||||
|
||||
for (i = 0; i < 16; i++) {
|
||||
tmp.d[i] = 0;
|
||||
d = div(i, 4);
|
||||
row = m->d + d.quot * 4;
|
||||
column = n->d + d.rem;
|
||||
for (j = 0; j < 4; j++)
|
||||
tmp.d[i] += row[j] * column[j * 4];
|
||||
for (i = 0; i < 4; i++) {
|
||||
row = m->d + i * 4;
|
||||
for (j = 0; j < 4; j++) {
|
||||
tmp.d[4 * i + j] = 0;
|
||||
column = n->d + j;
|
||||
for (k = 0; k < 4; k++)
|
||||
tmp.d[4 * i + j] += row[k] * column[k * 4];
|
||||
}
|
||||
}
|
||||
tmp.type = m->type | n->type;
|
||||
memcpy(m, &tmp, sizeof tmp);
|
||||
|
|
Loading…
Reference in New Issue