From 6bab752aca28e07373d98cc19120ad7756a48a13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=D0=91=D1=80=D0=B0=D0=BD=D0=B8=D0=BC=D0=B8=D1=80=20=D0=9A?=
 =?UTF-8?q?=D0=B0=D1=80=D0=B0=D1=9F=D0=B8=D1=9B?=
 <branimirkaradzic@gmail.com>
Date: Sat, 24 Jun 2023 09:54:31 -0700
Subject: [PATCH] Updated meshoptimizer.

---
 3rdparty/meshoptimizer/src/indexgenerator.cpp |  39 +-
 3rdparty/meshoptimizer/src/meshoptimizer.h    |  29 ++
 3rdparty/meshoptimizer/src/simplifier.cpp     | 389 +++++++++++++-----
 .../meshoptimizer/src/vcacheoptimizer.cpp     |  21 +-
 4 files changed, 355 insertions(+), 123 deletions(-)
diff --git a/3rdparty/meshoptimizer/src/indexgenerator.cpp b/3rdparty/meshoptimizer/src/indexgenerator.cpp
index cad808a2b..a9d57fb2c 100644
--- a/3rdparty/meshoptimizer/src/indexgenerator.cpp
+++ b/3rdparty/meshoptimizer/src/indexgenerator.cpp
@@ -178,6 +178,22 @@ static void buildPositionRemap(unsigned int* remap, const float* vertex_position
 
 		remap[index] = *entry;
 	}
+
+	allocator.deallocate(vertex_table);
+}
+
+template <size_t BlockSize>
+static void remapVertices(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{
+	size_t block_size = BlockSize == 0 ? vertex_size : BlockSize;
+	assert(block_size == vertex_size);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != ~0u)
+		{
+			assert(remap[i] < vertex_count);
+			memcpy(static_cast<unsigned char*>(destination) + remap[i] * block_size, static_cast<const unsigned char*>(vertices) + i * block_size, block_size);
+		}
 }
 
 } // namespace meshopt
@@ -288,6 +304,8 @@ size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigne
 
 void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
 {
+	using namespace meshopt;
+
 	assert(vertex_size > 0 && vertex_size <= 256);
 
 	meshopt_Allocator allocator;
@@ -300,14 +318,23 @@ void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t v
 		vertices = vertices_copy;
 	}
 
-	for (size_t i = 0; i < vertex_count; ++i)
+	// specialize the loop for common vertex sizes to ensure memcpy is compiled as an inlined intrinsic
+	switch (vertex_size)
 	{
-		if (remap[i] != ~0u)
-		{
-			assert(remap[i] < vertex_count);
+	case 4:
+		return remapVertices<4>(destination, vertices, vertex_count, vertex_size, remap);
 
-			memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
-		}
+	case 8:
+		return remapVertices<8>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 12:
+		return remapVertices<12>(destination, vertices, vertex_count, vertex_size, remap);
+
+	case 16:
+		return remapVertices<16>(destination, vertices, vertex_count, vertex_size, remap);
+
+	default:
+		return remapVertices<0>(destination, vertices, vertex_count, vertex_size, remap);
 	}
 }
 
diff --git a/3rdparty/meshoptimizer/src/meshoptimizer.h b/3rdparty/meshoptimizer/src/meshoptimizer.h
index 26c4e99a5..f07eecb8b 100644
--- a/3rdparty/meshoptimizer/src/meshoptimizer.h
+++ b/3rdparty/meshoptimizer/src/meshoptimizer.h
@@ -347,6 +347,17 @@ enum
  */
 MESHOPTIMIZER_API size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* result_error);
 
+/**
+ * Experimental: Mesh simplifier with attribute metric
+ * The algorithm ehnahces meshopt_simplify by incorporating attribute values into the error metric used to prioritize simplification order; see meshopt_simplify documentation for details.
+ * Note that the number of attributes affects memory requirements and running time; this algorithm requires ~1.5x more memory and time compared to meshopt_simplify when using 4 scalar attributes.
+ *
+ * vertex_attributes should have attribute_count floats for each vertex
+ * attribute_weights should have attribute_count floats in total; the weights determine relative priority of attributes between each other and wrt position. The recommended weight range is [1e-3..1e-1], assuming attribute data is in [0..1] range.
+ * TODO target_error/result_error currently use combined distance+attribute error; this may change in the future
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* result_error);
+
 /**
  * Experimental: Mesh simplifier (sloppy)
  * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
@@ -627,6 +638,8 @@ inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const
 template <typename T>
 inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = 0);
 template <typename T>
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options = 0, float* result_error = 0);
+template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error = 0);
 template <typename T>
 inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
@@ -751,6 +764,13 @@ public:
 		return result;
 	}
 
+	void deallocate(void* ptr)
+	{
+		assert(count > 0 && blocks[count - 1] == ptr);
+		Storage::deallocate(ptr);
+		count--;
+	}
+
 private:
 	void* blocks[24];
 	size_t count;
@@ -968,6 +988,15 @@ inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_co
 	return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error, options, result_error);
 }
 
+template <typename T>
+inline size_t meshopt_simplifyWithAttributes(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* result_error)
+{
+    meshopt_IndexAdapter<T> in(0, indices, index_count);
+    meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+    return meshopt_simplifyWithAttributes(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, vertex_attributes, vertex_attributes_stride, attribute_weights, attribute_count, target_index_count, target_error, options, result_error);
+}
+
 template <typename T>
 inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* result_error)
 {
diff --git a/3rdparty/meshoptimizer/src/simplifier.cpp b/3rdparty/meshoptimizer/src/simplifier.cpp
index 5f0e9bac3..864fc8f78 100644
--- a/3rdparty/meshoptimizer/src/simplifier.cpp
+++ b/3rdparty/meshoptimizer/src/simplifier.cpp
@@ -26,6 +26,7 @@
 // Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000
 // Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
 // Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
+// Hugues Hoppe. New Quadric Metric for Simplifying Meshes with Appearance Attributes. 1999
 namespace meshopt
 {
 
@@ -37,31 +38,31 @@ struct EdgeAdjacency
 		unsigned int prev;
 	};
 
-	unsigned int* counts;
 	unsigned int* offsets;
 	Edge* data;
 };
 
 static void prepareEdgeAdjacency(EdgeAdjacency& adjacency, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
 {
-	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
-	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count + 1);
 	adjacency.data = allocator.allocate<EdgeAdjacency::Edge>(index_count);
 }
 
 static void updateEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* remap)
 {
 	size_t face_count = index_count / 3;
+	unsigned int* offsets = adjacency.offsets + 1;
+	EdgeAdjacency::Edge* data = adjacency.data;
 
 	// fill edge counts
-	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+	memset(offsets, 0, vertex_count * sizeof(unsigned int));
 
 	for (size_t i = 0; i < index_count; ++i)
 	{
 		unsigned int v = remap ? remap[indices[i]] : indices[i];
 		assert(v < vertex_count);
 
-		adjacency.counts[v]++;
+		offsets[v]++;
 	}
 
 	// fill offset table
@@ -69,8 +70,9 @@ static void updateEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* in
 
 	for (size_t i = 0; i < vertex_count; ++i)
 	{
-		adjacency.offsets[i] = offset;
-		offset += adjacency.counts[i];
+		unsigned int count = offsets[i];
+		offsets[i] = offset;
+		offset += count;
 	}
 
 	assert(offset == index_count);
@@ -87,26 +89,22 @@ static void updateEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* in
 			c = remap[c];
 		}
 
-		adjacency.data[adjacency.offsets[a]].next = b;
-		adjacency.data[adjacency.offsets[a]].prev = c;
-		adjacency.offsets[a]++;
+		data[offsets[a]].next = b;
+		data[offsets[a]].prev = c;
+		offsets[a]++;
 
-		adjacency.data[adjacency.offsets[b]].next = c;
-		adjacency.data[adjacency.offsets[b]].prev = a;
-		adjacency.offsets[b]++;
+		data[offsets[b]].next = c;
+		data[offsets[b]].prev = a;
+		offsets[b]++;
 
-		adjacency.data[adjacency.offsets[c]].next = a;
-		adjacency.data[adjacency.offsets[c]].prev = b;
-		adjacency.offsets[c]++;
+		data[offsets[c]].next = a;
+		data[offsets[c]].prev = b;
+		offsets[c]++;
 	}
 
-	// fix offsets that have been disturbed by the previous pass
-	for (size_t i = 0; i < vertex_count; ++i)
-	{
-		assert(adjacency.offsets[i] >= adjacency.counts[i]);
-
-		adjacency.offsets[i] -= adjacency.counts[i];
-	}
+	// finalize offsets
+	adjacency.offsets[0] = 0;
+	assert(adjacency.offsets[vertex_count] == index_count);
 }
 
 struct PositionHasher
@@ -203,6 +201,8 @@ static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const f
 			wedge[i] = wedge[r];
 			wedge[r] = unsigned(i);
 		}
+
+	allocator.deallocate(table);
 }
 
 enum VertexKind
@@ -242,7 +242,7 @@ const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {
 
 static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b)
 {
-	unsigned int count = adjacency.counts[a];
+	unsigned int count = adjacency.offsets[a + 1] - adjacency.offsets[a];
 	const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[a];
 
 	for (size_t i = 0; i < count; ++i)
@@ -267,7 +267,7 @@ static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned
 	{
 		unsigned int vertex = unsigned(i);
 
-		unsigned int count = adjacency.counts[vertex];
+		unsigned int count = adjacency.offsets[vertex + 1] - adjacency.offsets[vertex];
 		const EdgeAdjacency::Edge* edges = adjacency.data + adjacency.offsets[vertex];
 
 		for (size_t j = 0; j < count; ++j)
@@ -426,6 +426,23 @@ static float rescalePositions(Vector3* result, const float* vertex_positions_dat
 	return extent;
 }
 
+static void rescaleAttributes(float* result, const float* vertex_attributes_data, size_t vertex_count, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count)
+{
+	size_t vertex_attributes_stride_float = vertex_attributes_stride / sizeof(float);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		for (size_t k = 0; k < attribute_count; ++k)
+		{
+			float a = vertex_attributes_data[i * vertex_attributes_stride_float + k];
+
+			result[i * attribute_count + k] = a * attribute_weights[k];
+		}
+	}
+}
+
+static const size_t kMaxAttributes = 16;
+
 struct Quadric
 {
 	float a00, a11, a22;
@@ -434,6 +451,11 @@ struct Quadric
 	float w;
 };
 
+struct QuadricGrad
+{
+	float gx, gy, gz, gw;
+};
+
 struct Collapse
 {
 	unsigned int v0;
@@ -476,6 +498,17 @@ static void quadricAdd(Quadric& Q, const Quadric& R)
 	Q.w += R.w;
 }
 
+static void quadricAdd(QuadricGrad* G, const QuadricGrad* R, size_t attribute_count)
+{
+	for (size_t k = 0; k < attribute_count; ++k)
+	{
+		G[k].gx += R[k].gx;
+		G[k].gy += R[k].gy;
+		G[k].gz += R[k].gz;
+		G[k].gw += R[k].gw;
+	}
+}
+
 static float quadricError(const Quadric& Q, const Vector3& v)
 {
 	float rx = Q.b0;
@@ -504,6 +537,45 @@ static float quadricError(const Quadric& Q, const Vector3& v)
 	return fabsf(r) * s;
 }
 
+static float quadricError(const Quadric& Q, const QuadricGrad* G, size_t attribute_count, const Vector3& v, const float* va)
+{
+	float rx = Q.b0;
+	float ry = Q.b1;
+	float rz = Q.b2;
+
+	rx += Q.a10 * v.y;
+	ry += Q.a21 * v.z;
+	rz += Q.a20 * v.x;
+
+	rx *= 2;
+	ry *= 2;
+	rz *= 2;
+
+	rx += Q.a00 * v.x;
+	ry += Q.a11 * v.y;
+	rz += Q.a22 * v.z;
+
+	float r = Q.c;
+	r += rx * v.x;
+	r += ry * v.y;
+	r += rz * v.z;
+
+	// see quadricFromAttributes for general derivation; here we need to add the parts of (eval(pos) - attr)^2 that depend on attr
+	for (size_t k = 0; k < attribute_count; ++k)
+	{
+		float a = va[k];
+		float g = v.x * G[k].gx + v.y * G[k].gy + v.z * G[k].gz + G[k].gw;
+
+		r += a * a * Q.w;
+		r -= 2 * a * g;
+	}
+
+	// TODO: weight normalization is breaking attribute error somehow
+	float s = 1;// Q.w == 0.f ? 0.f : 1.f / Q.w;
+
+	return fabsf(r) * s;
+}
+
 static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
 {
 	float aw = a * w;
@@ -574,6 +646,82 @@ static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3
 	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
 }
 
+static void quadricFromAttributes(Quadric& Q, QuadricGrad* G, const Vector3& p0, const Vector3& p1, const Vector3& p2, const float* va0, const float* va1, const float* va2, size_t attribute_count)
+{
+	// for each attribute we want to encode the following function into the quadric:
+	// (eval(pos) - attr)^2
+	// where eval(pos) interpolates attribute across the triangle like so:
+	// eval(pos) = pos.x * gx + pos.y * gy + pos.z * gz + gw
+	// where gx/gy/gz/gw are gradients
+	Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+	// weight is scaled linearly with edge length
+	Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
+	float area = sqrtf(normal.x * normal.x + normal.y * normal.y + normal.z * normal.z);
+	float w = sqrtf(area); // TODO this needs more experimentation
+
+	// we compute gradients using barycentric coordinates; barycentric coordinates can be computed as follows:
+	// v = (d11 * d20 - d01 * d21) / denom
+	// w = (d00 * d21 - d01 * d20) / denom
+	// u = 1 - v - w
+	// here v0, v1 are triangle edge vectors, v2 is a vector from point to triangle corner, and dij = dot(vi, vj)
+	const Vector3& v0 = p10;
+	const Vector3& v1 = p20;
+	float d00 = v0.x * v0.x + v0.y * v0.y + v0.z * v0.z;
+	float d01 = v0.x * v1.x + v0.y * v1.y + v0.z * v1.z;
+	float d11 = v1.x * v1.x + v1.y * v1.y + v1.z * v1.z;
+	float denom = d00 * d11 - d01 * d01;
+	float denomr = denom == 0 ? 0.f : 1.f / denom;
+
+	// precompute gradient factors
+	// these are derived by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w and factoring out common factors that are shared between attributes
+	float gx1 = (d11 * v0.x - d01 * v1.x) * denomr;
+	float gx2 = (d00 * v1.x - d01 * v0.x) * denomr;
+	float gy1 = (d11 * v0.y - d01 * v1.y) * denomr;
+	float gy2 = (d00 * v1.y - d01 * v0.y) * denomr;
+	float gz1 = (d11 * v0.z - d01 * v1.z) * denomr;
+	float gz2 = (d00 * v1.z - d01 * v0.z) * denomr;
+
+	memset(&Q, 0, sizeof(Quadric));
+
+	Q.w = w;
+
+	for (size_t k = 0; k < attribute_count; ++k)
+	{
+		float a0 = va0[k], a1 = va1[k], a2 = va2[k];
+
+		// compute gradient of eval(pos) for x/y/z/w
+		// the formulas below are obtained by directly computing derivative of eval(pos) = a0 * u + a1 * v + a2 * w
+		float gx = gx1 * (a1 - a0) + gx2 * (a2 - a0);
+		float gy = gy1 * (a1 - a0) + gy2 * (a2 - a0);
+		float gz = gz1 * (a1 - a0) + gz2 * (a2 - a0);
+		float gw = a0 - p0.x * gx - p0.y * gy - p0.z * gz;
+
+		// quadric encodes (eval(pos)-attr)^2; this means that the resulting expansion needs to compute, for example, pos.x * pos.y * K
+		// since quadrics already encode factors for pos.x * pos.y, we can accumulate almost everything in basic quadric fields
+		Q.a00 += w * (gx * gx);
+		Q.a11 += w * (gy * gy);
+		Q.a22 += w * (gz * gz);
+
+		Q.a10 += w * (gy * gx);
+		Q.a20 += w * (gz * gx);
+		Q.a21 += w * (gz * gy);
+
+		Q.b0 += w * (gx * gw);
+		Q.b1 += w * (gy * gw);
+		Q.b2 += w * (gz * gw);
+
+		Q.c += w * (gw * gw);
+
+		// the only remaining sum components are ones that depend on attr; these will be addded during error evaluation, see quadricError
+		G[k].gx = w * gx;
+		G[k].gy = w * gy;
+		G[k].gz = w * gz;
+		G[k].gw = w * gw;
+	}
+}
+
 static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
 {
 	for (size_t i = 0; i < index_count; i += 3)
@@ -639,6 +787,29 @@ static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indic
 	}
 }
 
+static void fillAttributeQuadrics(Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const float* vertex_attributes, size_t attribute_count, const unsigned int* remap)
+{
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int i0 = indices[i + 0];
+		unsigned int i1 = indices[i + 1];
+		unsigned int i2 = indices[i + 2];
+
+		Quadric QA;
+		QuadricGrad G[kMaxAttributes];
+		quadricFromAttributes(QA, G, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], &vertex_attributes[i0 * attribute_count], &vertex_attributes[i1 * attribute_count], &vertex_attributes[i2 * attribute_count], attribute_count);
+
+		// TODO: This blends together attribute weights across attribute discontinuities, which is probably not a great idea
+		quadricAdd(attribute_quadrics[remap[i0]], QA);
+		quadricAdd(attribute_quadrics[remap[i1]], QA);
+		quadricAdd(attribute_quadrics[remap[i2]], QA);
+
+		quadricAdd(&attribute_gradients[remap[i0] * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[remap[i1] * attribute_count], G, attribute_count);
+		quadricAdd(&attribute_gradients[remap[i2] * attribute_count], G, attribute_count);
+	}
+}
+
 // does triangle ABC flip when C is replaced with D?
 static bool hasTriangleFlip(const Vector3& a, const Vector3& b, const Vector3& c, const Vector3& d)
 {
@@ -649,7 +820,7 @@ static bool hasTriangleFlip(const Vector3& a, const Vector3& b, const Vector3& c
 	Vector3 nbc = {eb.y * ec.z - eb.z * ec.y, eb.z * ec.x - eb.x * ec.z, eb.x * ec.y - eb.y * ec.x};
 	Vector3 nbd = {eb.y * ed.z - eb.z * ed.y, eb.z * ed.x - eb.x * ed.z, eb.x * ed.y - eb.y * ed.x};
 
-	return nbc.x * nbd.x + nbc.y * nbd.y + nbc.z * nbd.z < 0;
+	return nbc.x * nbd.x + nbc.y * nbd.y + nbc.z * nbd.z <= 0;
 }
 
 static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vertex_positions, const unsigned int* collapse_remap, unsigned int i0, unsigned int i1)
@@ -661,16 +832,15 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert
 	const Vector3& v1 = vertex_positions[i1];
 
 	const EdgeAdjacency::Edge* edges = &adjacency.data[adjacency.offsets[i0]];
-	size_t count = adjacency.counts[i0];
+	size_t count = adjacency.offsets[i0 + 1] - adjacency.offsets[i0];
 
 	for (size_t i = 0; i < count; ++i)
 	{
 		unsigned int a = collapse_remap[edges[i].next];
 		unsigned int b = collapse_remap[edges[i].prev];
 
-		// skip triangles that get collapsed
-		// note: this is mathematically redundant as if either of these is true, the dot product in hasTriangleFlip should be 0
-		if (a == i1 || b == i1)
+		// skip triangles that will get collapsed by i0->i1 collapse or already got collapsed previously
+		if (a == i1 || b == i1 || a == b)
 			continue;
 
 		// early-out when at least one triangle flips due to a collapse
@@ -681,7 +851,25 @@ static bool hasTriangleFlips(const EdgeAdjacency& adjacency, const Vector3* vert
 	return false;
 }
 
-static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+static size_t boundEdgeCollapses(const EdgeAdjacency& adjacency, size_t vertex_count, size_t index_count, unsigned char* vertex_kind)
+{
+	size_t dual_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned char k = vertex_kind[i];
+		unsigned int e = adjacency.offsets[i + 1] - adjacency.offsets[i];
+
+		dual_count += (k == Kind_Manifold || k == Kind_Seam) ? e : 0;
+	}
+
+	assert(dual_count <= index_count);
+
+	// pad capacity by 3 so that we can check for overflow once per triangle instead of once per edge
+	return (index_count - dual_count / 2) + 3;
+}
+
+static size_t pickEdgeCollapses(Collapse* collapses, size_t collapse_capacity, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
 {
 	size_t collapse_count = 0;
 
@@ -689,6 +877,10 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices
 	{
 		static const int next[3] = {1, 2, 0};
 
+		// this should never happen as boundEdgeCollapses should give an upper bound for the collapse count, but in an unlikely event it does we can just drop extra collapses
+		if (collapse_count + 3 > collapse_capacity)
+			break;
+
 		for (int e = 0; e < 3; ++e)
 		{
 			unsigned int i0 = indices[i + e];
@@ -739,7 +931,7 @@ static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices
 	return collapse_count;
 }
 
-static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const unsigned int* remap)
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const float* vertex_attributes, const Quadric* vertex_quadrics, const Quadric* attribute_quadrics, const QuadricGrad* attribute_gradients, size_t attribute_count, const unsigned int* remap)
 {
 	for (size_t i = 0; i < collapse_count; ++i)
 	{
@@ -753,11 +945,14 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
 		unsigned int j0 = c.bidi ? i1 : i0;
 		unsigned int j1 = c.bidi ? i0 : i1;
 
-		const Quadric& qi = vertex_quadrics[remap[i0]];
-		const Quadric& qj = vertex_quadrics[remap[j0]];
+		float ei = quadricError(vertex_quadrics[remap[i0]], vertex_positions[i1]);
+		float ej = quadricError(vertex_quadrics[remap[j0]], vertex_positions[j1]);
 
-		float ei = quadricError(qi, vertex_positions[i1]);
-		float ej = quadricError(qj, vertex_positions[j1]);
+		if (attribute_count)
+		{
+			ei += quadricError(attribute_quadrics[remap[i0]], &attribute_gradients[remap[i0] * attribute_count], attribute_count, vertex_positions[i1], &vertex_attributes[i1 * attribute_count]);
+			ej += quadricError(attribute_quadrics[remap[j0]], &attribute_gradients[remap[j0] * attribute_count], attribute_count, vertex_positions[j1], &vertex_attributes[j1 * attribute_count]);
+		}
 
 		// pick edge direction with minimal error
 		c.v0 = ei <= ej ? i0 : j0;
@@ -766,61 +961,6 @@ static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const
 	}
 }
 
-#if TRACE > 1
-static void dumpEdgeCollapses(const Collapse* collapses, size_t collapse_count, const unsigned char* vertex_kind)
-{
-	size_t ckinds[Kind_Count][Kind_Count] = {};
-	float cerrors[Kind_Count][Kind_Count] = {};
-
-	for (int k0 = 0; k0 < Kind_Count; ++k0)
-		for (int k1 = 0; k1 < Kind_Count; ++k1)
-			cerrors[k0][k1] = FLT_MAX;
-
-	for (size_t i = 0; i < collapse_count; ++i)
-	{
-		unsigned int i0 = collapses[i].v0;
-		unsigned int i1 = collapses[i].v1;
-
-		unsigned char k0 = vertex_kind[i0];
-		unsigned char k1 = vertex_kind[i1];
-
-		ckinds[k0][k1]++;
-		cerrors[k0][k1] = (collapses[i].error < cerrors[k0][k1]) ? collapses[i].error : cerrors[k0][k1];
-	}
-
-	for (int k0 = 0; k0 < Kind_Count; ++k0)
-		for (int k1 = 0; k1 < Kind_Count; ++k1)
-			if (ckinds[k0][k1])
-				printf("collapses %d -> %d: %d, min error %e\n", k0, k1, int(ckinds[k0][k1]), ckinds[k0][k1] ? sqrtf(cerrors[k0][k1]) : 0.f);
-}
-
-static void dumpLockedCollapses(const unsigned int* indices, size_t index_count, const unsigned char* vertex_kind)
-{
-	size_t locked_collapses[Kind_Count][Kind_Count] = {};
-
-	for (size_t i = 0; i < index_count; i += 3)
-	{
-		static const int next[3] = {1, 2, 0};
-
-		for (int e = 0; e < 3; ++e)
-		{
-			unsigned int i0 = indices[i + e];
-			unsigned int i1 = indices[i + next[e]];
-
-			unsigned char k0 = vertex_kind[i0];
-			unsigned char k1 = vertex_kind[i1];
-
-			locked_collapses[k0][k1] += !kCanCollapse[k0][k1] && !kCanCollapse[k1][k0];
-		}
-	}
-
-	for (int k0 = 0; k0 < Kind_Count; ++k0)
-		for (int k1 = 0; k1 < Kind_Count; ++k1)
-			if (locked_collapses[k0][k1])
-				printf("locked collapses %d -> %d: %d\n", k0, k1, int(locked_collapses[k0][k1]));
-}
-#endif
-
 static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count)
 {
 	const int sort_bits = 11;
@@ -859,7 +999,7 @@ static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapse
 	}
 }
 
-static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, Quadric* attribute_quadrics, QuadricGrad* attribute_gradients, size_t attribute_count, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, const Vector3* vertex_positions, const EdgeAdjacency& adjacency, size_t triangle_collapse_goal, float error_limit, float& result_error)
 {
 	size_t edge_collapses = 0;
 	size_t triangle_collapses = 0;
@@ -922,6 +1062,12 @@ static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char*
 
 		quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
 
+		if (attribute_count)
+		{
+			quadricAdd(attribute_quadrics[r1], attribute_quadrics[r0]);
+			quadricAdd(&attribute_gradients[r1 * attribute_count], &attribute_gradients[r0 * attribute_count], attribute_count);
+		}
+
 		if (vertex_kind[i0] == Kind_Complex)
 		{
 			unsigned int v = i0;
@@ -1277,7 +1423,7 @@ MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoop = 0;
 MESHOPTIMIZER_API unsigned int* meshopt_simplifyDebugLoopBack = 0;
 #endif
 
-size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+size_t meshopt_simplifyEdge(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
 {
 	using namespace meshopt;
 
@@ -1286,6 +1432,9 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	assert(vertex_positions_stride % sizeof(float) == 0);
 	assert(target_index_count <= index_count);
 	assert((options & ~(meshopt_SimplifyLockBorder)) == 0);
+	assert(vertex_attributes_stride >= attribute_count * sizeof(float) && vertex_attributes_stride <= 256);
+	assert(vertex_attributes_stride % sizeof(float) == 0);
+	assert(attribute_count <= kMaxAttributes);
 
 	meshopt_Allocator allocator;
 
@@ -1325,12 +1474,35 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
 	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
 
+	float* vertex_attributes = NULL;
+
+	if (attribute_count)
+	{
+		vertex_attributes = allocator.allocate<float>(vertex_count * attribute_count);
+		rescaleAttributes(vertex_attributes, vertex_attributes_data, vertex_count, vertex_attributes_stride, attribute_weights, attribute_count);
+	}
+
 	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
 	memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
 
+	Quadric* attribute_quadrics = NULL;
+	QuadricGrad* attribute_gradients = NULL;
+
+	if (attribute_count)
+	{
+		attribute_quadrics = allocator.allocate<Quadric>(vertex_count);
+		memset(attribute_quadrics, 0, vertex_count * sizeof(Quadric));
+
+		attribute_gradients = allocator.allocate<QuadricGrad>(vertex_count * attribute_count);
+		memset(attribute_gradients, 0, vertex_count * attribute_count * sizeof(QuadricGrad));
+	}
+
 	fillFaceQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap);
 	fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
 
+	if (attribute_count)
+		fillAttributeQuadrics(attribute_quadrics, attribute_gradients, indices, index_count, vertex_positions, vertex_attributes, attribute_count, remap);
+
 	if (result != indices)
 		memcpy(result, indices, index_count * sizeof(unsigned int));
 
@@ -1338,8 +1510,10 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	size_t pass_count = 0;
 #endif
 
-	Collapse* edge_collapses = allocator.allocate<Collapse>(index_count);
-	unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count);
+	size_t collapse_capacity = boundEdgeCollapses(adjacency, vertex_count, index_count, vertex_kind);
+
+	Collapse* edge_collapses = allocator.allocate<Collapse>(collapse_capacity);
+	unsigned int* collapse_order = allocator.allocate<unsigned int>(collapse_capacity);
 	unsigned int* collapse_remap = allocator.allocate<unsigned int>(vertex_count);
 	unsigned char* collapse_locked = allocator.allocate<unsigned char>(vertex_count);
 
@@ -1354,17 +1528,14 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 		// note: throughout the simplification process adjacency structure reflects welded topology for result-in-progress
 		updateEdgeAdjacency(adjacency, result, result_count, vertex_count, remap);
 
-		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop);
+		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, collapse_capacity, result, result_count, remap, vertex_kind, loop);
+		assert(edge_collapse_count <= collapse_capacity);
 
 		// no edges can be collapsed any more due to topology restrictions
 		if (edge_collapse_count == 0)
 			break;
 
-		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, remap);
-
-#if TRACE > 1
-		dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind);
-#endif
+		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_attributes, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, remap);
 
 		sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
 
@@ -1379,7 +1550,7 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 		printf("pass %d: ", int(pass_count++));
 #endif
 
-		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
+		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, attribute_quadrics, attribute_gradients, attribute_count, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, vertex_positions, adjacency, triangle_collapse_goal, error_limit, result_error);
 
 		// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
 		if (collapses == 0)
@@ -1398,10 +1569,6 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	printf("result: %d triangles, error: %e; total %d passes\n", int(result_count), sqrtf(result_error), int(pass_count));
 #endif
 
-#if TRACE > 1
-	dumpLockedCollapses(result, result_count, vertex_kind);
-#endif
-
 #ifndef NDEBUG
 	if (meshopt_simplifyDebugKind)
 		memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
@@ -1420,6 +1587,16 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices,
 	return result_count;
 }
 
+size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, NULL, 0, NULL, 0, target_index_count, target_error, options, out_result_error);
+}
+
+size_t meshopt_simplifyWithAttributes(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, const float* vertex_attributes_data, size_t vertex_attributes_stride, const float* attribute_weights, size_t attribute_count, size_t target_index_count, float target_error, unsigned int options, float* out_result_error)
+{
+	return meshopt_simplifyEdge(destination, indices, index_count, vertex_positions_data, vertex_count, vertex_positions_stride, vertex_attributes_data, vertex_attributes_stride, attribute_weights, attribute_count, target_index_count, target_error, options, out_result_error);
+}
+
 size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float* out_result_error)
 {
 	using namespace meshopt;
diff --git a/3rdparty/meshoptimizer/src/vcacheoptimizer.cpp b/3rdparty/meshoptimizer/src/vcacheoptimizer.cpp
index ce8fd3a88..d4b08ba34 100644
--- a/3rdparty/meshoptimizer/src/vcacheoptimizer.cpp
+++ b/3rdparty/meshoptimizer/src/vcacheoptimizer.cpp
@@ -221,9 +221,9 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
 	}
 
-	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
+	unsigned int cache_holder[2 * (kCacheSizeMax + 4)];
 	unsigned int* cache = cache_holder;
-	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 4;
 	size_t cache_count = 0;
 
 	unsigned int current_triangle = 0;
@@ -260,10 +260,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = cache[i];
 
-			if (index != a && index != b && index != c)
-			{
-				cache_new[cache_write++] = index;
-			}
+			cache_new[cache_write] = index;
+			cache_write += (index != a && index != b && index != c);
 		}
 
 		unsigned int* cache_temp = cache;
@@ -305,6 +303,10 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 		{
 			unsigned int index = cache[i];
 
+			// no need to update scores if we are never going to use this vertex
+			if (adjacency.counts[index] == 0)
+				continue;
+
 			int cache_position = i >= cache_size ? -1 : int(i);
 
 			// update vertex score
@@ -325,11 +327,8 @@ void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned
 				float tri_score = triangle_scores[tri] + score_diff;
 				assert(tri_score > 0);
 
-				if (best_score < tri_score)
-				{
-					best_triangle = tri;
-					best_score = tri_score;
-				}
+				best_triangle = best_score < tri_score ? tri : best_triangle;
+				best_score = best_score < tri_score ? tri_score : best_score;
 
 				triangle_scores[tri] = tri_score;
 			}