From 0f191b507e8fb1121402a02b3b09b3ba6219dfb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Branimir=20Karad=C5=BEi=C4=87?=
Date: Mon, 15 Oct 2018 17:54:41 -0700
Subject: [PATCH] SRT: WIP

---
 src/bgfx.cpp | 270 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/bgfx_p.h |  12 +++
 2 files changed, 282 insertions(+)

diff --git a/src/bgfx.cpp b/src/bgfx.cpp
index cf042b782..85b68a18c 100644
--- a/src/bgfx.cpp
+++ b/src/bgfx.cpp
@@ -862,6 +862,276 @@ namespace bgfx
 		return PredefinedUniform::Count;
 	}
 
+	/// Convert a single Srt (rotation quaternion, translation, scale) into a
+	/// Matrix4.
+	///
+	/// @param[out] _dst Destination, written as one Matrix4 (16 floats).
+	/// @param[in] _src Source, read as one Srt.
+	///
+	void srtToMatrix4_x1(void* _dst, const void* _src)
+	{
+		Matrix4* mtx = reinterpret_cast<Matrix4*>(_dst);
+		const Srt* srt = reinterpret_cast<const Srt*>(_src);
+
+		const float rx = srt->rotate[0];
+		const float ry = srt->rotate[1];
+		const float rz = srt->rotate[2];
+		const float rw = srt->rotate[3];
+
+		const float xx2 = 2.0f * rx * rx;
+		const float yy2 = 2.0f * ry * ry;
+		const float zz2 = 2.0f * rz * rz;
+		const float yx2 = 2.0f * ry * rx;
+		const float yz2 = 2.0f * ry * rz;
+		const float yw2 = 2.0f * ry * rw;
+		const float wz2 = 2.0f * rw * rz;
+		const float wx2 = 2.0f * rw * rx;
+		const float xz2 = 2.0f * rx * rz;
+
+		const float sx = srt->scale[0];
+		const float sy = srt->scale[1];
+		const float sz = srt->scale[2];
+
+		// Rotation terms, each group of four multiplied by one scale component.
+		mtx->un.val[ 0] = (1.0f - yy2 - zz2)*sx;
+		mtx->un.val[ 1] = (       yx2 + wz2)*sx;
+		mtx->un.val[ 2] = (       xz2 - yw2)*sx;
+		mtx->un.val[ 3] = 0.0f;
+
+		mtx->un.val[ 4] = (       yx2 - wz2)*sy;
+		mtx->un.val[ 5] = (1.0f - xx2 - zz2)*sy;
+		mtx->un.val[ 6] = (       yz2 + wx2)*sy;
+		mtx->un.val[ 7] = 0.0f;
+
+		mtx->un.val[ 8] = (       xz2 + yw2)*sz;
+		mtx->un.val[ 9] = (       yz2 - wx2)*sz;
+		mtx->un.val[10] = (1.0f - xx2 - yy2)*sz;
+		mtx->un.val[11] = 0.0f;
+
+		// Translation is copied unchanged into elements 12..14.
+		const float tx = srt->translate[0];
+		const float ty = srt->translate[1];
+		const float tz = srt->translate[2];
+
+		mtx->un.val[12] = tx;
+		mtx->un.val[13] = ty;
+		mtx->un.val[14] = tz;
+		mtx->un.val[15] = 1.0f;
+	}
+
+	/// Transpose a 4x4 block of floats held as four SIMD rows.
+	///
+	/// @param[out] _dst First destination row.
+	/// @param[in] _dstStride Byte distance between consecutive destination rows.
+	/// @param[in] _src First source row.
+	/// @param[in] _srcStride Byte distance between consecutive source rows.
+	///
+	void transpose(void* _dst, uint32_t _dstStride, const void* _src, uint32_t _srcStride = sizeof(bx::simd128_t) )
+	{
+		uint8_t* dst = reinterpret_cast<uint8_t*>(_dst);
+		const uint8_t* src = reinterpret_cast<const uint8_t*>(_src);
+
+		using namespace bx;
+
+		const simd128_t r0 = simd_ld(src);
+		src += _srcStride;
+
+		const simd128_t r1 = simd_ld(src);
+		src += _srcStride;
+
+		const simd128_t r2 = simd_ld(src);
+		src += _srcStride;
+
+		const simd128_t r3 = simd_ld(src);
+
+		// Two rounds of interleaving. The letters in the names track source
+		// lanes: r0 = abcd, r1 = efgh, r2 = ijkl, r3 = mnop.
+		const simd128_t aibj = simd_shuf_xAyB(r0,   r2);   // aibj
+		const simd128_t emfn = simd_shuf_xAyB(r1,   r3);   // emfn
+		const simd128_t ckdl = simd_shuf_zCwD(r0,   r2);   // ckdl
+		const simd128_t gohp = simd_shuf_zCwD(r1,   r3);   // gohp
+		const simd128_t aeim = simd_shuf_xAyB(aibj, emfn); // aeim
+		const simd128_t bfjn = simd_shuf_zCwD(aibj, emfn); // bfjn
+		const simd128_t cgko = simd_shuf_xAyB(ckdl, gohp); // cgko
+		const simd128_t dhlp = simd_shuf_zCwD(ckdl, gohp); // dhlp
+
+		simd_st(dst, aeim);
+		dst += _dstStride;
+
+		simd_st(dst, bfjn);
+		dst += _dstStride;
+
+		simd_st(dst, cgko);
+		dst += _dstStride;
+
+		simd_st(dst, dhlp);
+	}
+
+	/// Scalar reference: convert four consecutive Srt into four consecutive
+	/// Matrix4 by calling srtToMatrix4_x1 once per element.
+	///
+	/// @param[out] _dst Destination for 4 Matrix4.
+	/// @param[in] _src Source of 4 Srt.
+	///
+	void srtToMatrix4_x4_Ref(void* _dst, const void* _src)
+	{
+		uint8_t* dst = reinterpret_cast<uint8_t*>(_dst);
+		const uint8_t* src = reinterpret_cast<const uint8_t*>(_src);
+
+		srtToMatrix4_x1(dst + 0*sizeof(Matrix4), src + 0*sizeof(Srt) );
+		srtToMatrix4_x1(dst + 1*sizeof(Matrix4), src + 1*sizeof(Srt) );
+		srtToMatrix4_x1(dst + 2*sizeof(Matrix4), src + 2*sizeof(Srt) );
+		srtToMatrix4_x1(dst + 3*sizeof(Matrix4), src + 3*sizeof(Srt) );
+	}
+
+	/// SIMD variant of srtToMatrix4_x4_Ref: converts four consecutive Srt into
+	/// four consecutive Matrix4, one element per SIMD lane.
+	///
+	/// @param[out] _dst Destination for 4 Matrix4; written with simd_st.
+	/// @param[in] _src Source of 4 Srt; read with simd_ld.
+	///
+	void srtToMatrix4_x4_Simd(void* _dst, const void* _src)
+	{
+		using namespace bx;
+
+		simd128_t* dst = reinterpret_cast<simd128_t*>(_dst);
+		const simd128_t* src = reinterpret_cast<const simd128_t*>(_src);
+
+		simd128_t rotate[4];
+		simd128_t translate[4];
+		simd128_t scale[4];
+
+		// Gather the same member of all four Srt so that, e.g., rotate[0]
+		// holds the four rotate[0] values (one per input element).
+		transpose(rotate,    sizeof(simd128_t), src + 0, sizeof(Srt) );
+		transpose(translate, sizeof(simd128_t), src + 1, sizeof(Srt) );
+		transpose(scale,     sizeof(simd128_t), src + 2, sizeof(Srt) );
+
+		const simd128_t rx = simd_ld(rotate + 0);
+		const simd128_t ry = simd_ld(rotate + 1);
+		const simd128_t rz = simd_ld(rotate + 2);
+		const simd128_t rw = simd_ld(rotate + 3);
+
+		const simd128_t tx = simd_ld(translate + 0);
+		const simd128_t ty = simd_ld(translate + 1);
+		const simd128_t tz = simd_ld(translate + 2);
+
+		const simd128_t sx = simd_ld(scale + 0);
+		const simd128_t sy = simd_ld(scale + 1);
+		const simd128_t sz = simd_ld(scale + 2);
+
+		const simd128_t zero = simd_splat(0.0f);
+		const simd128_t one  = simd_splat(1.0f);
+		const simd128_t two  = simd_splat(2.0f);
+
+		// Same products as the scalar srtToMatrix4_x1, four elements at a time.
+		const simd128_t xx    = simd_mul(rx, rx);
+		const simd128_t xx2   = simd_mul(two, xx);
+		const simd128_t yy    = simd_mul(ry, ry);
+		const simd128_t yy2   = simd_mul(two, yy);
+		const simd128_t zz    = simd_mul(rz, rz);
+		const simd128_t zz2   = simd_mul(two, zz);
+		const simd128_t yx    = simd_mul(ry, rx);
+		const simd128_t yx2   = simd_mul(two, yx);
+		const simd128_t yz    = simd_mul(ry, rz);
+		const simd128_t yz2   = simd_mul(two, yz);
+		const simd128_t yw    = simd_mul(ry, rw);
+		const simd128_t yw2   = simd_mul(two, yw);
+		const simd128_t wz    = simd_mul(rw, rz);
+		const simd128_t wz2   = simd_mul(two, wz);
+		const simd128_t wx    = simd_mul(rw, rx);
+		const simd128_t wx2   = simd_mul(two, wx);
+		const simd128_t xz    = simd_mul(rx, rz);
+		const simd128_t xz2   = simd_mul(two, xz);
+		const simd128_t t0x   = simd_sub(one, yy2);
+		const simd128_t r0x   = simd_sub(t0x, zz2);
+		const simd128_t r0y   = simd_add(yx2, wz2);
+		const simd128_t r0z   = simd_sub(xz2, yw2);
+		const simd128_t r1x   = simd_sub(yx2, wz2);
+		const simd128_t omxx2 = simd_sub(one, xx2);
+		const simd128_t r1y   = simd_sub(omxx2, zz2);
+		const simd128_t r1z   = simd_add(yz2, wx2);
+		const simd128_t r2x   = simd_add(xz2, yw2);
+		const simd128_t r2y   = simd_sub(yz2, wx2);
+		const simd128_t r2z   = simd_sub(omxx2, yy2);
+
+		// Each transpose below scatters one group of four values back out,
+		// writing 16 bytes at offset N*16 inside each of the four Matrix4.
+		simd128_t tmp[4];
+		tmp[0] = simd_mul(r0x, sx);
+		tmp[1] = simd_mul(r0y, sx);
+		tmp[2] = simd_mul(r0z, sx);
+		tmp[3] = zero;
+		transpose(dst + 0, sizeof(Matrix4), tmp);
+
+		tmp[0] = simd_mul(r1x, sy);
+		tmp[1] = simd_mul(r1y, sy);
+		tmp[2] = simd_mul(r1z, sy);
+		tmp[3] = zero;
+		transpose(dst + 1, sizeof(Matrix4), tmp);
+
+		tmp[0] = simd_mul(r2x, sz);
+		tmp[1] = simd_mul(r2y, sz);
+		tmp[2] = simd_mul(r2z, sz);
+		tmp[3] = zero;
+		transpose(dst + 2, sizeof(Matrix4), tmp);
+
+		tmp[0] = tx;
+		tmp[1] = ty;
+		tmp[2] = tz;
+		tmp[3] = one;
+		transpose(dst + 3, sizeof(Matrix4), tmp);
+	}
+
+	/// Convert an array of Srt into an array of Matrix4.
+	///
+	/// Elements are converted four at a time (SIMD when both buffers are
+	/// 16-byte aligned, scalar otherwise); the remaining `_num & 3` elements
+	/// are converted one by one.
+	///
+	/// @param[out] _dst Destination for `_num` Matrix4.
+	/// @param[in] _src Source of `_num` Srt.
+	/// @param[in] _num Number of elements to convert.
+	///
+	void srtToMatrix4(void* _dst, const void* _src, uint32_t _num)
+	{
+		uint8_t* dst = reinterpret_cast<uint8_t*>(_dst);
+		const uint8_t* src = reinterpret_cast<const uint8_t*>(_src);
+
+		// The SIMD path stores to _dst as well as loading from _src, so the
+		// alignment test that guarded _src is applied to both pointers.
+		const bool aligned = true
+			&& bx::isAligned(src, 16)
+			&& bx::isAligned(static_cast<const uint8_t*>(dst), 16)
+			;
+
+		if (!aligned)
+		{
+			for (uint32_t ii = 0, num = _num / 4; ii < num; ++ii)
+			{
+				srtToMatrix4_x4_Ref(dst, src);
+				src += 4*sizeof(Srt);
+				dst += 4*sizeof(Matrix4);
+			}
+		}
+		else
+		{
+			for (uint32_t ii = 0, num = _num / 4; ii < num; ++ii)
+			{
+				srtToMatrix4_x4_Simd(dst, src);
+				src += 4*sizeof(Srt);
+				dst += 4*sizeof(Matrix4);
+			}
+		}
+
+		for (uint32_t ii = 0, num = _num & 3; ii < num; ++ii)
+		{
+			srtToMatrix4_x1(dst, src);
+			src += sizeof(Srt);
+			dst += sizeof(Matrix4);
+		}
+	}
+
 	void EncoderImpl::submit(ViewId _id, ProgramHandle _program, OcclusionQueryHandle _occlusionQuery, uint32_t _depth, bool _preserveState)
 	{
 		if (BX_ENABLED(BGFX_CONFIG_DEBUG_UNIFORM)
diff --git a/src/bgfx_p.h b/src/bgfx_p.h
index e01f7c9b4..91e0b75f6 100644
--- a/src/bgfx_p.h
+++ b/src/bgfx_p.h
@@ -1166,6 +1166,18 @@ namespace bgfx
 		ViewId m_view;
 	};
 
+	/// Scale / rotation / translation transform. The pad members keep each
+	/// of the three parts four floats wide (12 floats in total).
+	///
+	BX_ALIGN_DECL_16(struct) Srt
+	{
+		float rotate[4];
+		float translate[3];
+		float pad0;
+		float scale[3];
+		float pad1;
+	};
+
 	BX_ALIGN_DECL_16(struct) Matrix4
 	{
 		union