Updated meshoptimizer.

commit 9bb2dbdb70 (parent cc51fd8c38)
3rdparty/meshoptimizer/src/clusterizer.cpp (vendored): 6 changes
@@ -464,7 +464,7 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
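A note on the assert change repeated throughout this commit: every function taking vertex_positions reads a float3 (12 bytes) from the start of each vertex, so a stride below 12 was never valid; tightening the lower bound from "stride > 0" to "stride >= 12" lets the assert catch such mistakes. A sketch of a conforming interleaved layout (the Vertex struct, and the std::vector<Vertex> vertices / std::vector<unsigned int> indices used in later sketches, are illustrative, not part of this commit):

	struct Vertex
	{
		float position[3]; // must occupy the first 12 bytes of each vertex
		float normal[3];
		float uv[2];
	};

	// passing &vertices[0].position[0] with stride sizeof(Vertex) == 32 satisfies
	// all three asserts: 12 <= 32 <= 256, and 32 % sizeof(float) == 0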
@@ -687,7 +687,7 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 
 	assert(index_count % 3 == 0);
 	assert(index_count / 3 <= kMeshletMaxTriangles);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	(void)vertex_count;
@@ -839,7 +839,7 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
 	using namespace meshopt;
 
 	assert(triangle_count <= kMeshletMaxTriangles);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	unsigned int indices[kMeshletMaxTriangles * 3];
3rdparty/meshoptimizer/src/indexgenerator.cpp (vendored): 4 changes

@@ -412,7 +412,7 @@ void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsig
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
@@ -483,7 +483,7 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
3rdparty/meshoptimizer/src/meshoptimizer.h (vendored): 24 changes
@@ -37,8 +37,8 @@ extern "C" {
 #endif
 
 /**
- * Vertex attribute stream, similar to glVertexPointer
- * Each element takes size bytes, with stride controlling the spacing between successive elements.
+ * Vertex attribute stream
+ * Each element takes size bytes, beginning at data, with stride controlling the spacing between successive elements (stride >= size).
  */
 struct meshopt_Stream
 {
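The reworded comment pins down the two fields that are easy to confuse: size is the number of bytes one element occupies, stride is the distance between consecutive elements. A minimal sketch over the interleaved Vertex layout from the note above (assuming the data/size/stride field order of the struct):

	meshopt_Stream position_stream = {
		&vertices[0].position[0], // data: where the first element begins
		sizeof(float) * 3,        // size: 12 bytes per element
		sizeof(Vertex),           // stride: spacing between elements; stride >= size
	};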
@@ -115,7 +115,7 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* dest
  * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering.
  *
  * destination must contain enough space for the resulting index buffer (index_count*2 elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -131,7 +131,7 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
  * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details.
  *
  * destination must contain enough space for the resulting index buffer (index_count*4 elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -171,7 +171,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
  */
 MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
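A usage sketch of the sequence this doc comment implies (the in-place index buffer update and the Vertex layout are assumptions carried over from the notes above; 1.05f grants the 5% cache-efficiency slack the comment describes):

	meshopt_optimizeVertexCache(indices.data(), indices.data(), index_count, vertex_count);
	meshopt_optimizeOverdraw(indices.data(), indices.data(), index_count,
		&vertices[0].position[0], vertex_count, sizeof(Vertex), 1.05f);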
@@ -331,7 +331,7 @@ enum
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation
  * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
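A sketch of the calling convention these lines document, sized for the worst case (container scaffolding is illustrative):

	std::vector<unsigned int> lod(index_count); // worst case is index_count, not target_index_count
	float result_error = 0.f;
	size_t lod_size = meshopt_simplify(lod.data(), indices.data(), index_count,
		&vertices[0].position[0], vertex_count, sizeof(Vertex),
		/* target_index_count= */ index_count / 2, /* target_error= */ 0.01f,
		/* options= */ 0, &result_error);
	lod.resize(lod_size);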
@@ -347,7 +347,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
@@ -361,7 +361,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer (target_vertex_count elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
@@ -423,7 +423,7 @@ struct meshopt_OverdrawStatistics
  * Returns overdraw statistics using a software rasterizer
  * Results may not match actual GPU performance
  *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -461,7 +461,7 @@ struct meshopt_Meshlet
  * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
  * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
  * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512)
  * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  */
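Taken together, these size contracts map to a straightforward allocation pattern; a sketch of the intended call sequence (container boilerplate is illustrative):

	size_t max_vertices = 64, max_triangles = 124; // within the 255/512 limits above
	size_t max_meshlets = meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles);

	std::vector<meshopt_Meshlet> meshlets(max_meshlets);
	std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
	std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);

	size_t meshlet_count = meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(),
		meshlet_triangles.data(), indices.data(), index_count, &vertices[0].position[0],
		vertex_count, sizeof(Vertex), max_vertices, max_triangles, /* cone_weight= */ 0.f);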
@@ -503,7 +503,7 @@ struct meshopt_Bounds
  * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
  * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
  *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
  */
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -523,7 +523,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destinati
  * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
3rdparty/meshoptimizer/src/overdrawanalyzer.cpp (vendored): 2 changes

@@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
3rdparty/meshoptimizer/src/overdrawoptimizer.cpp (vendored): 2 changes

@@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
3rdparty/meshoptimizer/src/simplifier.cpp (vendored): 8 changes
@@ -1282,7 +1282,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
 	assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
@@ -1425,7 +1425,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
@@ -1556,7 +1556,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_vertex_count <= vertex_count);
@@ -1668,7 +1668,7 @@ float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count,
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	float extent = rescalePositions(NULL, vertex_positions, vertex_count, vertex_positions_stride);
3rdparty/meshoptimizer/src/spatialorder.cpp (vendored): 4 changes
@@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
@@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	(void)vertex_count;
3rdparty/meshoptimizer/src/vertexcodec.cpp (vendored): 88 changes
@@ -50,6 +50,12 @@
 #define SIMD_TARGET
 #endif
 
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD
 
 #ifdef SIMD_SSE
@@ -472,6 +478,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		typedef int unaligned_int;
 #endif
 
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
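The scalar preamble added above removes a serial dependency: the group's encoded size is computed from the 4 header bytes alone, so the address of the next group is known before the SIMD shuffle pipeline finishes. A standalone sketch of the two tricks with a made-up header value (any 64-bit target):

	#include <cassert>
	#include <cstring>

	int main()
	{
		// Mode 1 header: sixteen 2-bit fields; a field equal to 3 marks one extra byte.
		// data32 &= data32 >> 1 keeps a field's low bit set only when both bits were set.
		unsigned int data32 = 0x000000f3; // fields from LSB: 3, 0, 3, 3, 0, ... -> 3 extra bytes
		data32 &= data32 >> 1;

		// Spread the 16 fields so each surviving low bit lands in its own nibble of data64.
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// Multiplying by 0x1111111111111111 sums all 16 nibbles into the top nibble;
		// the sum fits in 4 bits because a group with 16 escapes would have used mode 3.
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
		assert(datacnt == 3);
	}

Mode 2 (below) plays the same game with 4-bit fields: the two successive self-ANDs leave a nibble's low bit set only when the field was 15.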
@@ -490,11 +508,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
@@ -512,7 +544,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -604,24 +640,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
 
 static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
 
-	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
-	uint8x16_t masked = vandq_u8(mask, byte_mask);
+	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
 
-#ifdef __aarch64__
-	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
-	mask0 = vaddv_u8(vget_low_u8(masked));
-	mask1 = vaddv_u8(vget_high_u8(masked));
-#else
-	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
-	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
-	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
-	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
-
-	mask0 = vget_lane_u8(sum3, 0);
-	mask1 = vget_lane_u8(sum3, 1);
-#endif
+	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
 }
 
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
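This rewrite drops the byte-mask load and the horizontal-add ladder in favor of the multiply-by-magic trick the WASM path already uses (see the last hunk of this file): when every byte of a 64-bit lane is 0xff or 0x00, a single multiply packs the lane into an 8-bit mask. A scalar sketch of why the constant works:

	#include <cassert>
	#include <cstdint>

	int main()
	{
		// Lane with bytes 0, 2 and 7 "true" (0xff), as a SIMD compare would produce.
		uint64_t lane = 0xff00000000ff00ffull;

		// 0xff * magic == 0x0102040810204080: an 0xff in byte i of the lane shifts this
		// pattern left by 8*i, placing exactly bit i into the top byte of the product.
		const uint64_t magic = 0x000103070f1f3f80ull;

		uint8_t mask = uint8_t((lane * magic) >> 56);
		assert(mask == 0x85); // bits 0, 2 and 7 set
	}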
@@ -639,6 +664,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 	case 1:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel2 = vld1_u8(data);
 		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
 		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
@@ -655,11 +692,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel4 = vld1_u8(data);
 		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
 		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
@@ -675,7 +726,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -715,7 +770,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
 	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
 	const uint64_t magic = 0x000103070f1f3f80ull;
 
-	// TODO: This can use v8x16_bitmask in the future
 	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
 	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
 }
3rdparty/meshoptimizer/src/vertexfilter.cpp (vendored): 2 changes
@@ -931,7 +931,7 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 		const float* v = &data[i * stride_float];
 		unsigned int* d = &destination[i * stride_float];
 
-		// use maximum exponent to encode values; this guarantess that mantissa is [-1, 1]
+		// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
 		int exp = -100;
 
 		for (size_t j = 0; j < stride_float; ++j)