Updated meshoptimizer.

commit 9bb2dbdb70 (parent cc51fd8c38)
3rdparty/meshoptimizer/src/clusterizer.cpp (vendored): 6 changes
@@ -464,7 +464,7 @@ size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_ve
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
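A note on the assert change repeated throughout this commit: every function taking vertex_positions reads a float3 (12 bytes) from the start of each vertex, so a stride below 12 was never valid; tightening the lower bound from "stride > 0" to "stride >= 12" lets the assert catch such mistakes. A sketch of a conforming interleaved layout (the Vertex struct, and the std::vector<Vertex> vertices / std::vector<unsigned int> indices used in later sketches, are illustrative, not part of this commit):

	struct Vertex
	{
		float position[3]; // must occupy the first 12 bytes of each vertex
		float normal[3];
		float uv[2];
	};

	// passing &vertices[0].position[0] with stride sizeof(Vertex) == 32 satisfies
	// all three asserts: 12 <= 32 <= 256, and 32 % sizeof(float) == 0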
@@ -687,7 +687,7 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 
 	assert(index_count % 3 == 0);
 	assert(index_count / 3 <= kMeshletMaxTriangles);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	(void)vertex_count;
@@ -839,7 +839,7 @@ meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices
 	using namespace meshopt;
 
 	assert(triangle_count <= kMeshletMaxTriangles);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	unsigned int indices[kMeshletMaxTriangles * 3];
3rdparty/meshoptimizer/src/indexgenerator.cpp (vendored): 4 changes

@@ -412,7 +412,7 @@ void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsig
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
@@ -483,7 +483,7 @@ void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const un
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
3rdparty/meshoptimizer/src/meshoptimizer.h (vendored): 24 changes
@@ -37,8 +37,8 @@ extern "C" {
 #endif
 
 /**
- * Vertex attribute stream, similar to glVertexPointer
- * Each element takes size bytes, with stride controlling the spacing between successive elements.
+ * Vertex attribute stream
+ * Each element takes size bytes, beginning at data, with stride controlling the spacing between successive elements (stride >= size).
  */
 struct meshopt_Stream
 {
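The reworded comment pins down the two fields that are easy to confuse: size is the number of bytes one element occupies, stride is the distance between consecutive elements. A minimal sketch over the interleaved Vertex layout from the note above (assuming the data/size/stride field order of the struct):

	meshopt_Stream position_stream = {
		&vertices[0].position[0], // data: where the first element begins
		sizeof(float) * 3,        // size: 12 bytes per element
		sizeof(Vertex),           // stride: spacing between elements; stride >= size
	};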
@@ -115,7 +115,7 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* dest
  * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering.
  *
  * destination must contain enough space for the resulting index buffer (index_count*2 elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -131,7 +131,7 @@ MESHOPTIMIZER_API void meshopt_generateAdjacencyIndexBuffer(unsigned int* destin
  * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details.
  *
  * destination must contain enough space for the resulting index buffer (index_count*4 elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -171,7 +171,7 @@ MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
  * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
  */
 MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
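A usage sketch of the sequence this doc comment implies (the in-place index buffer update and the Vertex layout are assumptions carried over from the notes above; 1.05f grants the 5% cache-efficiency slack the comment describes):

	meshopt_optimizeVertexCache(indices.data(), indices.data(), index_count, vertex_count);
	meshopt_optimizeOverdraw(indices.data(), indices.data(), index_count,
		&vertices[0].position[0], vertex_count, sizeof(Vertex), 1.05f);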
@@ -331,7 +331,7 @@ enum
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation
  * options must be a bitmask composed of meshopt_SimplifyX options; 0 is a safe default
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
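A sketch of the calling convention these lines document, sized for the worst case (container scaffolding is illustrative):

	std::vector<unsigned int> lod(index_count); // worst case is index_count, not target_index_count
	float result_error = 0.f;
	size_t lod_size = meshopt_simplify(lod.data(), indices.data(), index_count,
		&vertices[0].position[0], vertex_count, sizeof(Vertex),
		/* target_index_count= */ index_count / 2, /* target_error= */ 0.01f,
		/* options= */ 0, &result_error);
	lod.resize(lod_size);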
@@ -347,7 +347,7 @@ MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsig
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer, worst case is index_count elements (*not* target_index_count)!
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * target_error represents the error relative to mesh extents that can be tolerated, e.g. 0.01 = 1% deformation
  * result_error can be NULL; when it's not NULL, it will contain the resulting (relative) error after simplification
  */
@@ -361,7 +361,7 @@ MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destinati
  * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
  *
  * destination must contain enough space for the target index buffer (target_vertex_count elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
@@ -423,7 +423,7 @@ struct meshopt_OverdrawStatistics
  * Returns overdraw statistics using a software rasterizer
  * Results may not match actual GPU performance
  *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -461,7 +461,7 @@ struct meshopt_Meshlet
  * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
  * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
  * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512)
  * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  */
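Taken together, these size contracts map to a straightforward allocation pattern; a sketch of the intended call sequence (container boilerplate is illustrative):

	size_t max_vertices = 64, max_triangles = 124; // within the 255/512 limits above
	size_t max_meshlets = meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles);

	std::vector<meshopt_Meshlet> meshlets(max_meshlets);
	std::vector<unsigned int> meshlet_vertices(max_meshlets * max_vertices);
	std::vector<unsigned char> meshlet_triangles(max_meshlets * max_triangles * 3);

	size_t meshlet_count = meshopt_buildMeshlets(meshlets.data(), meshlet_vertices.data(),
		meshlet_triangles.data(), indices.data(), index_count, &vertices[0].position[0],
		vertex_count, sizeof(Vertex), max_vertices, max_triangles, /* cone_weight= */ 0.f);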
@@ -503,7 +503,7 @@ struct meshopt_Bounds
  * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
  * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
  *
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
  */
 MESHOPTIMIZER_API struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
@@ -523,7 +523,7 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destinati
  * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
  *
  * destination must contain enough space for the resulting index buffer (index_count elements)
- * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex
  */
 MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
3rdparty/meshoptimizer/src/overdrawanalyzer.cpp (vendored): 2 changes

@@ -147,7 +147,7 @@ meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices,
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
3rdparty/meshoptimizer/src/overdrawoptimizer.cpp (vendored): 2 changes

@@ -272,7 +272,7 @@ void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* ind
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
3rdparty/meshoptimizer/src/simplifier.cpp (vendored): 8 changes
@@ -1282,7 +1282,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
 	assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
@@ -1425,7 +1425,7 @@ size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* ind
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
@@ -1556,7 +1556,7 @@ size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_pos
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_vertex_count <= vertex_count);
@@ -1668,7 +1668,7 @@ float meshopt_simplifyScale(const float* vertex_positions, size_t vertex_count,
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	float extent = rescalePositions(NULL, vertex_positions, vertex_count, vertex_positions_stride);
3rdparty/meshoptimizer/src/spatialorder.cpp (vendored): 4 changes
@@ -113,7 +113,7 @@ void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_pos
 {
 	using namespace meshopt;
 
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	meshopt_Allocator allocator;
@@ -144,7 +144,7 @@ void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int*
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
-	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride >= 12 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
 	(void)vertex_count;
3rdparty/meshoptimizer/src/vertexcodec.cpp (vendored): 88 changes
@@ -50,6 +50,12 @@
 #define SIMD_TARGET
 #endif
 
+// When targeting AArch64/x64, optimize for latency to allow decoding of individual 16-byte groups to overlap
+// We don't do this for 32-bit systems because we need 64-bit math for this and this will hurt in-order CPUs
+#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64)
+#define SIMD_LATENCYOPT
+#endif
+
 #endif // !MESHOPTIMIZER_NO_SIMD
 
 #ifdef SIMD_SSE
@@ -472,6 +478,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 		typedef int unaligned_int;
 #endif
 
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
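The scalar preamble added above removes a serial dependency: the group's encoded size is computed from the 4 header bytes alone, so the address of the next group is known before the SIMD shuffle pipeline finishes. A standalone sketch of the two tricks with a made-up header value (any 64-bit target):

	#include <cassert>
	#include <cstring>

	int main()
	{
		// Mode 1 header: sixteen 2-bit fields; a field equal to 3 marks one extra byte.
		// data32 &= data32 >> 1 keeps a field's low bit set only when both bits were set.
		unsigned int data32 = 0x000000f3; // fields from LSB: 3, 0, 3, 3, 0, ... -> 3 extra bytes
		data32 &= data32 >> 1;

		// Spread the 16 fields so each surviving low bit lands in its own nibble of data64.
		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);

		// Multiplying by 0x1111111111111111 sums all 16 nibbles into the top nibble;
		// the sum fits in 4 bits because a group with 16 escapes would have used mode 3.
		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
		assert(datacnt == 3);
	}

Mode 2 (below) plays the same game with 4-bit fields: the two successive self-ANDs leave a nibble's low bit set only when the field was 15.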
@@ -490,11 +508,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
 		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
@@ -512,7 +544,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -604,24 +640,13 @@ static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8
 
 static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull;
 
-	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
-	uint8x16_t masked = vandq_u8(mask, byte_mask);
+	uint64x2_t mask2 = vreinterpretq_u64_u8(mask);
 
-#ifdef __aarch64__
-	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
-	mask0 = vaddv_u8(vget_low_u8(masked));
-	mask1 = vaddv_u8(vget_high_u8(masked));
-#else
-	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
-	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
-	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
-	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
-
-	mask0 = vget_lane_u8(sum3, 0);
-	mask1 = vget_lane_u8(sum3, 1);
-#endif
+	mask0 = uint8_t((vgetq_lane_u64(mask2, 0) * magic) >> 56);
+	mask1 = uint8_t((vgetq_lane_u64(mask2, 1) * magic) >> 56);
 }
 
 static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
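This rewrite drops the byte-mask load and the horizontal-add ladder in favor of the multiply-by-magic trick the WASM path already uses (see the last hunk of this file): when every byte of a 64-bit lane is 0xff or 0x00, a single multiply packs the lane into an 8-bit mask. A scalar sketch of why the constant works:

	#include <cassert>
	#include <cstdint>

	int main()
	{
		// Lane with bytes 0, 2 and 7 "true" (0xff), as a SIMD compare would produce.
		uint64_t lane = 0xff00000000ff00ffull;

		// 0xff * magic == 0x0102040810204080: an 0xff in byte i of the lane shifts this
		// pattern left by 8*i, placing exactly bit i into the top byte of the product.
		const uint64_t magic = 0x000103070f1f3f80ull;

		uint8_t mask = uint8_t((lane * magic) >> 56);
		assert(mask == 0x85); // bits 0, 2 and 7 set
	}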
@@ -639,6 +664,18 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 	case 1:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned int data32;
+		memcpy(&data32, data, 4);
+		data32 &= data32 >> 1;
+
+		// arrange bits such that low bits of nibbles of data64 contain all 2-bit elements of data32
+		unsigned long long data64 = ((unsigned long long)data32 << 30) | (data32 & 0x3fffffff);
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel2 = vld1_u8(data);
 		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
 		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
@@ -655,11 +692,25 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 4 + datacnt;
+#else
 		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 2:
 	{
+#ifdef SIMD_LATENCYOPT
+		unsigned long long data64;
+		memcpy(&data64, data, 8);
+		data64 &= data64 >> 1;
+		data64 &= data64 >> 2;
+
+		// adds all 1-bit nibbles together; the sum fits in 4 bits because datacnt=16 would have used mode 3
+		int datacnt = int(((data64 & 0x1111111111111111ull) * 0x1111111111111111ull) >> 60);
+#endif
+
 		uint8x8_t sel4 = vld1_u8(data);
 		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
 		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
@@ -675,7 +726,11 @@ static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsi
 
 		vst1q_u8(buffer, result);
 
+#ifdef SIMD_LATENCYOPT
+		return data + 8 + datacnt;
+#else
 		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+#endif
 	}
 
 	case 3:
@@ -715,7 +770,6 @@ static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1
 	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
 	const uint64_t magic = 0x000103070f1f3f80ull;
 
-	// TODO: This can use v8x16_bitmask in the future
 	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56);
 	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56);
 }
3rdparty/meshoptimizer/src/vertexfilter.cpp (vendored): 2 changes
@@ -931,7 +931,7 @@ void meshopt_encodeFilterExp(void* destination_, size_t count, size_t stride, in
 		const float* v = &data[i * stride_float];
 		unsigned int* d = &destination[i * stride_float];
 
-		// use maximum exponent to encode values; this guarantess that mantissa is [-1, 1]
+		// use maximum exponent to encode values; this guarantees that mantissa is [-1, 1]
 		int exp = -100;
 
 		for (size_t j = 0; j < stride_float; ++j)