This commit is contained in:
Branimir Karadžić 2018-10-15 17:54:41 -07:00
parent 82815f1c4c
commit 0f191b507e
2 changed files with 228 additions and 0 deletions

View File

@ -862,6 +862,225 @@ namespace bgfx
return PredefinedUniform::Count;
}
// Expands a single Srt (rotation quaternion, translation, scale) into a 4x4 float matrix.
//
// _dst - destination, written through Matrix4::un.val[0..15].
// _src - source Srt.
//
// val[0..2], val[4..6] and val[8..10] receive the quaternion's rotation terms, each group of
// three multiplied by scale[0], scale[1] and scale[2] respectively. val[12..14] receive the
// translation. val[3], val[7] and val[11] are set to 0 and val[15] to 1.
void srtToMatrix4_x1(void* _dst, const void* _src)
{
	const Srt* srt = reinterpret_cast<const Srt*>(_src);
	Matrix4*   mtx = reinterpret_cast<Matrix4*>(_dst);

	// Quaternion components, stored as x, y, z, w.
	const float qx = srt->rotate[0];
	const float qy = srt->rotate[1];
	const float qz = srt->rotate[2];
	const float qw = srt->rotate[3];

	// Doubled pairwise products shared by several matrix elements.
	const float dxx = 2.0f * qx * qx;
	const float dyy = 2.0f * qy * qy;
	const float dzz = 2.0f * qz * qz;

	const float dyx = 2.0f * qy * qx;
	const float dyz = 2.0f * qy * qz;
	const float dxz = 2.0f * qx * qz;

	const float dyw = 2.0f * qy * qw;
	const float dwz = 2.0f * qw * qz;
	const float dwx = 2.0f * qw * qx;

	const float scaleX = srt->scale[0];
	const float scaleY = srt->scale[1];
	const float scaleZ = srt->scale[2];

	// First group of four: scaled by x.
	mtx->un.val[ 0] = (1.0f - dyy - dzz)*scaleX;
	mtx->un.val[ 1] = (dyx + dwz)*scaleX;
	mtx->un.val[ 2] = (dxz - dyw)*scaleX;
	mtx->un.val[ 3] = 0.0f;

	// Second group: scaled by y.
	mtx->un.val[ 4] = (dyx - dwz)*scaleY;
	mtx->un.val[ 5] = (1.0f - dxx - dzz)*scaleY;
	mtx->un.val[ 6] = (dyz + dwx)*scaleY;
	mtx->un.val[ 7] = 0.0f;

	// Third group: scaled by z.
	mtx->un.val[ 8] = (dxz + dyw)*scaleZ;
	mtx->un.val[ 9] = (dyz - dwx)*scaleZ;
	mtx->un.val[10] = (1.0f - dxx - dyy)*scaleZ;
	mtx->un.val[11] = 0.0f;

	// Translation is fetched only after the rotation/scale part has been written.
	const float posX = srt->translate[0];
	const float posY = srt->translate[1];
	const float posZ = srt->translate[2];

	mtx->un.val[12] = posX;
	mtx->un.val[13] = posY;
	mtx->un.val[14] = posZ;
	mtx->un.val[15] = 1.0f;
}
// Transposes a 4x4 block of 32-bit lanes.
//
// _dst       - destination of the four output rows.
// _dstStride - distance in bytes between consecutive output rows.
// _src       - first of the four input rows.
// _srcStride - distance in bytes between consecutive input rows (defaults to one simd128_t,
//              i.e. tightly packed rows).
//
// Naming the inputs abcd / efgh / ijkl / mnop, the outputs are aeim / bfjn / cgko / dhlp.
// All four rows are loaded before the first store.
void transpose(void* _dst, uint32_t _dstStride, const void* _src, uint32_t _srcStride = sizeof(bx::simd128_t) )
{
	using bx::simd128_t;

	const uint8_t* in0 = reinterpret_cast<const uint8_t*>(_src);
	const uint8_t* in1 = in0 + _srcStride;
	const uint8_t* in2 = in1 + _srcStride;
	const uint8_t* in3 = in2 + _srcStride;

	const simd128_t abcd = bx::simd_ld<simd128_t>(in0);
	const simd128_t efgh = bx::simd_ld<simd128_t>(in1);
	const simd128_t ijkl = bx::simd_ld<simd128_t>(in2);
	const simd128_t mnop = bx::simd_ld<simd128_t>(in3);

	// First pass: interleave rows 0/2 and rows 1/3.
	const simd128_t aibj = bx::simd_shuf_xAyB(abcd, ijkl);
	const simd128_t ckdl = bx::simd_shuf_zCwD(abcd, ijkl);
	const simd128_t emfn = bx::simd_shuf_xAyB(efgh, mnop);
	const simd128_t gohp = bx::simd_shuf_zCwD(efgh, mnop);

	// Second pass: interleave the intermediates to form the transposed rows.
	const simd128_t aeim = bx::simd_shuf_xAyB(aibj, emfn);
	const simd128_t bfjn = bx::simd_shuf_zCwD(aibj, emfn);
	const simd128_t cgko = bx::simd_shuf_xAyB(ckdl, gohp);
	const simd128_t dhlp = bx::simd_shuf_zCwD(ckdl, gohp);

	uint8_t* out0 = reinterpret_cast<uint8_t*>(_dst);
	uint8_t* out1 = out0 + _dstStride;
	uint8_t* out2 = out1 + _dstStride;
	uint8_t* out3 = out2 + _dstStride;

	bx::simd_st(out0, aeim);
	bx::simd_st(out1, bfjn);
	bx::simd_st(out2, cgko);
	bx::simd_st(out3, dhlp);
}
// Scalar conversion of four consecutive Srt entries into four consecutive Matrix4 entries.
//
// _dst - start of four Matrix4 slots, spaced sizeof(Matrix4) bytes apart.
// _src - start of four Srt entries, spaced sizeof(Srt) bytes apart.
void srtToMatrix4_x4_Ref(void* _dst, const void* _src)
{
	const uint8_t* in  = reinterpret_cast<const uint8_t*>(_src);
	uint8_t*       out = reinterpret_cast<uint8_t*>(_dst);

	for (uint32_t ii = 0; ii < 4; ++ii)
	{
		srtToMatrix4_x1(out + ii*sizeof(Matrix4), in + ii*sizeof(Srt) );
	}
}
void srtToMatrix4_x4_Simd(void* _dst, const void* _src)
{
using namespace bx;
simd128_t* dst = reinterpret_cast< simd128_t*>(_dst);
const simd128_t* src = reinterpret_cast<const simd128_t*>(_src);
simd128_t rotate[4];
simd128_t translate[4];
simd128_t scale[4];
transpose(rotate, sizeof(simd128_t), src + 0, sizeof(Srt) );
transpose(translate, sizeof(simd128_t), src + 1, sizeof(Srt) );
transpose(scale, sizeof(simd128_t), src + 2, sizeof(Srt) );
const simd128_t rx = simd_ld<simd128_t>(rotate + 0);
const simd128_t ry = simd_ld<simd128_t>(rotate + 1);
const simd128_t rz = simd_ld<simd128_t>(rotate + 2);
const simd128_t rw = simd_ld<simd128_t>(rotate + 3);
const simd128_t tx = simd_ld<simd128_t>(translate + 0);
const simd128_t ty = simd_ld<simd128_t>(translate + 1);
const simd128_t tz = simd_ld<simd128_t>(translate + 2);
const simd128_t sx = simd_ld<simd128_t>(scale + 0);
const simd128_t sy = simd_ld<simd128_t>(scale + 1);
const simd128_t sz = simd_ld<simd128_t>(scale + 2);
const simd128_t zero = simd_splat(0.0f);
const simd128_t one = simd_splat(1.0f);
const simd128_t two = simd_splat(2.0f);
const simd128_t xx = simd_mul(rx, rx);
const simd128_t xx2 = simd_mul(two, xx);
const simd128_t yy = simd_mul(ry, ry);
const simd128_t yy2 = simd_mul(two, yy);
const simd128_t zz = simd_mul(rz, rz);
const simd128_t zz2 = simd_mul(two, zz);
const simd128_t yx = simd_mul(ry, rx);
const simd128_t yx2 = simd_mul(two, yx);
const simd128_t yz = simd_mul(ry, rz);
const simd128_t yz2 = simd_mul(two, yz);
const simd128_t yw = simd_mul(ry, rw);
const simd128_t yw2 = simd_mul(two, yw);
const simd128_t wz = simd_mul(rw, rz);
const simd128_t wz2 = simd_mul(two, wz);
const simd128_t wx = simd_mul(rw, rx);
const simd128_t wx2 = simd_mul(two, wx);
const simd128_t xz = simd_mul(rx, rz);
const simd128_t xz2 = simd_mul(two, xz);
const simd128_t t0x = simd_sub(one, yy2);
const simd128_t r0x = simd_sub(t0x, zz2);
const simd128_t r0y = simd_add(yx2, wz2);
const simd128_t r0z = simd_sub(xz2, yw2);
const simd128_t r1x = simd_sub(yx2, wz2);
const simd128_t omxx2 = simd_sub(one, xx2);
const simd128_t r1y = simd_sub(omxx2, zz2);
const simd128_t r1z = simd_add(yz2, wx2);
const simd128_t r2x = simd_add(xz2, yw2);
const simd128_t r2y = simd_sub(yz2, wx2);
const simd128_t r2z = simd_sub(omxx2, yy2);
simd128_t tmp[4];
tmp[0] = simd_mul(r0x, sx);
tmp[1] = simd_mul(r0y, sx);
tmp[2] = simd_mul(r0z, sx);
tmp[3] = zero;
transpose(dst + 0, sizeof(Matrix4), tmp);
tmp[0] = simd_mul(r1x, sy);
tmp[1] = simd_mul(r1y, sy);
tmp[2] = simd_mul(r1z, sy);
tmp[3] = zero;
transpose(dst + 1, sizeof(Matrix4), tmp);
tmp[0] = simd_mul(r2x, sz);
tmp[1] = simd_mul(r2y, sz);
tmp[2] = simd_mul(r2z, sz);
tmp[3] = zero;
transpose(dst + 2, sizeof(Matrix4), tmp);
tmp[0] = tx;
tmp[1] = ty;
tmp[2] = tz;
tmp[3] = one;
transpose(dst + 3, sizeof(Matrix4), tmp);
}
// Converts _num Srt entries into _num Matrix4 entries.
//
// _dst - destination array of _num Matrix4.
// _src - source array of _num Srt.
// _num - number of entries to convert.
//
// Entries are processed in batches of four, followed by a scalar loop for the remaining
// (_num & 3) entries. The SIMD batch routine both loads from _src and stores to _dst through
// simd_ld/simd_st, so it is only selected when BOTH pointers are 16-byte aligned; otherwise
// the scalar batch routine is used. (Previously only _src was checked, so an aligned source
// with a misaligned destination would still take the SIMD path.)
// NOTE(review): assumes simd_st requires an aligned address on at least some targets - confirm in bx.
void srtToMatrix4(void* _dst, const void* _src, uint32_t _num)
{
	      uint8_t* dst = reinterpret_cast<      uint8_t*>(_dst);
	const uint8_t* src = reinterpret_cast<const uint8_t*>(_src);

	const bool useSimd = bx::isAligned(src, 16)
		&& bx::isAligned(static_cast<const uint8_t*>(dst), 16)
		;

	const uint32_t numBatches = _num / 4;

	if (useSimd)
	{
		for (uint32_t ii = 0; ii < numBatches; ++ii)
		{
			srtToMatrix4_x4_Simd(dst, src);
			src += 4*sizeof(Srt);
			dst += 4*sizeof(Matrix4);
		}
	}
	else
	{
		for (uint32_t ii = 0; ii < numBatches; ++ii)
		{
			srtToMatrix4_x4_Ref(dst, src);
			src += 4*sizeof(Srt);
			dst += 4*sizeof(Matrix4);
		}
	}

	// Leftover entries that do not fill a batch of four.
	for (uint32_t ii = 0, num = _num & 3; ii < num; ++ii)
	{
		srtToMatrix4_x1(dst, src);
		src += sizeof(Srt);
		dst += sizeof(Matrix4);
	}
}
void EncoderImpl::submit(ViewId _id, ProgramHandle _program, OcclusionQueryHandle _occlusionQuery, uint32_t _depth, bool _preserveState)
{
if (BX_ENABLED(BGFX_CONFIG_DEBUG_UNIFORM)

View File

@ -1166,6 +1166,15 @@ namespace bgfx
ViewId m_view;
};
// Scale / rotation / translation record consumed by the srtToMatrix4* functions.
//
// The layout is three groups of four floats (48 bytes): the SIMD conversion path reads the
// record as three consecutive simd128_t values (rotate, translate + pad0, scale + pad1).
// NOTE(review): BX_ALIGN_DECL_16 presumably requests 16-byte alignment - confirm in bx.
BX_ALIGN_DECL_16(struct) Srt
{
// Rotation quaternion, read by the converters as x, y, z, w.
float rotate[4];
// Translation x, y, z; copied into matrix elements 12..14.
float translate[3];
// Padding that rounds translate up to four floats; not read by the scalar converter.
float pad0;
// Per-axis scale x, y, z.
float scale[3];
// Padding that rounds scale up to four floats; not read by the scalar converter.
float pad1;
};
BX_ALIGN_DECL_16(struct) Matrix4
{
union