mirror of https://github.com/bkaradzic/bgfx
added meshoptimizer to 3rdparty
This commit is contained in:
parent
4fc111c358
commit
aca1b6c774
|
@ -0,0 +1,12 @@
|
|||
Standard: Cpp03
|
||||
UseTab: ForIndentation
|
||||
TabWidth: 4
|
||||
IndentWidth: 4
|
||||
AccessModifierOffset: -4
|
||||
BreakBeforeBraces: Allman
|
||||
IndentCaseLabels: false
|
||||
ColumnLimit: 0
|
||||
PointerAlignment: Left
|
||||
BreakConstructorInitializersBeforeComma: true
|
||||
NamespaceIndentation: None
|
||||
AlignEscapedNewlines: DontAlign
|
|
@ -0,0 +1,2 @@
|
|||
/build/
|
||||
/data/
|
|
@ -0,0 +1,37 @@
|
|||
language: cpp
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
- os: linux
|
||||
compiler: clang
|
||||
- os: osx
|
||||
compiler: clang
|
||||
- os: windows
|
||||
compiler: cl
|
||||
env:
|
||||
- TARGET="Visual Studio 15 2017"
|
||||
- os: windows
|
||||
compiler: cl
|
||||
env:
|
||||
- TARGET="Visual Studio 15 2017 Win64"
|
||||
|
||||
script:
|
||||
- if [[ "$TRAVIS_COMPILER" == "gcc" ]]; then make config=coverage test; fi
|
||||
- if [[ "$TRAVIS_COMPILER" == "clang" ]]; then make config=sanitize test; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" != "windows" ]]; then make config=debug test; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" != "windows" ]]; then make config=release test; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then cmake -G "$TARGET" -DBUILD_DEMO=ON; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then cmake --build . -- -property:Configuration=Debug -verbosity:minimal; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then ./Debug/demo.exe demo/pirate.obj; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then cmake --build . -- -property:Configuration=Release -verbosity:minimal; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then ./Release/demo.exe demo/pirate.obj; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then make config=iphone; fi
|
||||
|
||||
after_script:
|
||||
- if [[ "$TRAVIS_COMPILER" == "gcc" ]]; then
|
||||
find . -type f -name '*.gcno' -exec gcov -p {} +;
|
||||
sed -i -e "s/#####\(.*\)\(\/\/ unreachable.*\)/ -\1\2/" *.gcov;
|
||||
bash <(curl -s https://codecov.io/bash) -f 'src#*.gcov' -X search;
|
||||
fi
|
|
@ -0,0 +1,40 @@
|
|||
project(meshoptimizer)
|
||||
cmake_minimum_required(VERSION 3.0)
|
||||
|
||||
option(BUILD_DEMO "Build demo" OFF)
|
||||
option(BUILD_TOOLS "Build tools" OFF)
|
||||
|
||||
set(SOURCES
|
||||
src/meshoptimizer.h
|
||||
src/clusterizer.cpp
|
||||
src/indexcodec.cpp
|
||||
src/indexgenerator.cpp
|
||||
src/overdrawanalyzer.cpp
|
||||
src/overdrawoptimizer.cpp
|
||||
src/simplifier.cpp
|
||||
src/stripifier.cpp
|
||||
src/vcacheanalyzer.cpp
|
||||
src/vcacheoptimizer.cpp
|
||||
src/vertexcodec.cpp
|
||||
src/vfetchanalyzer.cpp
|
||||
src/vfetchoptimizer.cpp
|
||||
)
|
||||
|
||||
add_library(meshoptimizer STATIC ${SOURCES})
|
||||
target_include_directories(meshoptimizer INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/src")
|
||||
|
||||
if(MSVC)
|
||||
target_compile_options(meshoptimizer PRIVATE /W4 /WX)
|
||||
else()
|
||||
target_compile_options(meshoptimizer PRIVATE -Wall -Wextra -Werror)
|
||||
endif()
|
||||
|
||||
if(BUILD_DEMO)
|
||||
add_executable(demo demo/main.cpp demo/miniz.cpp demo/tests.cpp tools/objparser.cpp)
|
||||
target_link_libraries(demo meshoptimizer)
|
||||
endif()
|
||||
|
||||
if(BUILD_TOOLS)
|
||||
add_executable(meshencoder tools/meshencoder.cpp tools/objparser.cpp)
|
||||
target_link_libraries(meshencoder meshoptimizer)
|
||||
endif()
|
|
@ -0,0 +1,54 @@
|
|||
Thanks for deciding to contribute to meshoptimizer! These guidelines will try to help make the process painless and efficient.
|
||||
|
||||
## Questions
|
||||
|
||||
If you have a question regarding the library usage, please [open a GitHub issue](https://github.com/zeux/meshoptimizer/issues/new).
|
||||
Some questions just need answers, but it's nice to keep them for future reference in case other people want to know the same thing.
|
||||
Some questions help improve the library interface or documentation by inspiring future changes.
|
||||
|
||||
## Bugs
|
||||
|
||||
If the library doesn't compile on your system, compiles with warnings, doesn't seem to run correctly for your input data or if anything else is amiss, please [open a GitHub issue](https://github.com/zeux/meshoptimizer/issues/new).
|
||||
It helps if you note the version of the library this issue happens in, the version of your compiler for compilation issues, and a reproduction case for runtime bugs.
|
||||
|
||||
Of course, feel free to [create a pull request](https://help.github.com/articles/about-pull-requests/) to fix the bug yourself.
|
||||
|
||||
## Features
|
||||
|
||||
New algorithms and improvements to existing algorithms are always welcome; you can open an issue or make the change yourself and submit a pull request.
|
||||
|
||||
For major features, consider opening an issue describing an improvement you'd like to see or make before opening a pull request.
|
||||
This will give us a chance to discuss the idea before implementing it - some algorithms may not be easy to integrate into existing programs, may not be robust to arbitrary meshes or may be expensive to run or implement/maintain, so a discussion helps make sure these don't block the algorithm development.
|
||||
|
||||
## Code style
|
||||
|
||||
Contributions to this project are expected to follow the existing code style.
|
||||
`.clang-format` file mostly defines syntactic styling rules (you can run `make format` to format the code accordingly).
|
||||
|
||||
As for naming conventions, this library uses `snake_case` for variables, `lowerCamelCase` for functions, `UpperCamelCase` for types, `kCamelCase` for global constants and `SCARY_CASE` for macros. All public functions/types must additionally have an extra `meshopt_` prefix to avoid symbol conflicts.
|
||||
|
||||
## Dependencies
|
||||
|
||||
Please note that this library uses C89 interface for all APIs and a C++98 implementation - C++11 features can not be used.
|
||||
This choice is made to maximize compatibility to make sure that any toolchain, including legacy proprietary gaming console toolchains, can compile this code.
|
||||
|
||||
Additionally, the library code has zero external dependencies, does not depend on STL and does not use RTTI or exceptions.
|
||||
This, again, maximizes compatibility and makes sure the library can be used in environments where STL use is discouraged or prohibited, as well as maximizing runtime performance and minimizing compilation times.
|
||||
|
||||
The demo program uses STL since it serves as an example of usage and as a test harness, not as production-ready code.
|
||||
|
||||
## Testing
|
||||
|
||||
All pull requests will run through a continuous integration pipeline hosted on [Travis CI](https://travis-ci.org/zeux/meshoptimizer) that will run the built-in unit tests and integration tests on Windows, macOS and Linux with gcc, clang and msvc compilers.
|
||||
You can run the tests yourself using `make test` or building the demo program with `cmake -DBUILD_DEMO=ON` and running it.
|
||||
|
||||
Unit tests can be found in `demo/tests.cpp` and functional tests - in `demo/main.cpp`; when making code changes please try to make sure they are covered by an existing test or add a new test accordingly.
|
||||
|
||||
## Documentation
|
||||
|
||||
Documentation for this library resides in the `meshoptimizer.h` header, with examples as part of a usage manual available in `README.md`.
|
||||
Changes to documentation are always welcome and should use issues/pull requests as outlined above; please note that `README.md` only contains documentation for stable algorithms, as experimental algorithms may change the interface without concern for backwards compatibility.
|
||||
|
||||
## Sensitive communication
|
||||
|
||||
If you prefer to not disclose the issues or information relevant to the issue such as reproduction case to the public, you can always contact the author via e-mail (arseny.kapoulkine@gmail.com).
|
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2016-2019 Arseny Kapoulkine
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,97 @@
|
|||
.SUFFIXES:
|
||||
MAKEFLAGS+=-r
|
||||
|
||||
config=debug
|
||||
files=demo/pirate.obj
|
||||
|
||||
BUILD=build/$(config)
|
||||
|
||||
LIBRARY_SOURCES=$(wildcard src/*.cpp)
|
||||
LIBRARY_OBJECTS=$(LIBRARY_SOURCES:%=$(BUILD)/%.o)
|
||||
|
||||
DEMO_SOURCES=$(wildcard demo/*.c demo/*.cpp) tools/objparser.cpp
|
||||
DEMO_OBJECTS=$(DEMO_SOURCES:%=$(BUILD)/%.o)
|
||||
|
||||
ENCODER_SOURCES=tools/meshencoder.cpp tools/objparser.cpp
|
||||
ENCODER_OBJECTS=$(ENCODER_SOURCES:%=$(BUILD)/%.o)
|
||||
|
||||
OBJECTS=$(LIBRARY_OBJECTS) $(DEMO_OBJECTS) $(ENCODER_OBJECTS)
|
||||
|
||||
LIBRARY=$(BUILD)/libmeshoptimizer.a
|
||||
EXECUTABLE=$(BUILD)/meshoptimizer
|
||||
|
||||
CFLAGS=-g -Wall -Wextra -Werror -std=c89
|
||||
CXXFLAGS=-g -Wall -Wextra -Wshadow -Wno-missing-field-initializers -Werror -std=c++98
|
||||
LDFLAGS=
|
||||
|
||||
ifeq ($(config),iphone)
|
||||
IPHONESDK=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk
|
||||
CFLAGS+=-arch armv7 -arch arm64 -isysroot $(IPHONESDK)
|
||||
CXXFLAGS+=-arch armv7 -arch arm64 -isysroot $(IPHONESDK) -stdlib=libc++
|
||||
LDFLAGS+=-arch armv7 -arch arm64 -L $(IPHONESDK)/usr/lib -mios-version-min=7.0
|
||||
endif
|
||||
|
||||
ifeq ($(config),trace)
|
||||
CXXFLAGS+=-DTRACE=2
|
||||
endif
|
||||
|
||||
ifeq ($(config),release)
|
||||
CXXFLAGS+=-O3 -DNDEBUG
|
||||
endif
|
||||
|
||||
ifeq ($(config),coverage)
|
||||
CXXFLAGS+=-coverage
|
||||
LDFLAGS+=-coverage
|
||||
endif
|
||||
|
||||
ifeq ($(config),sanitize)
|
||||
CXXFLAGS+=-fsanitize=address,undefined -fno-sanitize-recover=all
|
||||
LDFLAGS+=-fsanitize=address,undefined
|
||||
endif
|
||||
|
||||
ifeq ($(config),analyze)
|
||||
CXXFLAGS+=--analyze
|
||||
endif
|
||||
|
||||
all: $(EXECUTABLE)
|
||||
|
||||
test: $(EXECUTABLE)
|
||||
$(EXECUTABLE) $(files)
|
||||
|
||||
check: $(EXECUTABLE)
|
||||
$(EXECUTABLE)
|
||||
|
||||
dev: $(EXECUTABLE)
|
||||
$(EXECUTABLE) -d $(files)
|
||||
|
||||
format:
|
||||
clang-format -i $(LIBRARY_SOURCES) $(DEMO_SOURCES)
|
||||
|
||||
meshencoder: $(ENCODER_OBJECTS) $(LIBRARY)
|
||||
$(CXX) $^ $(LDFLAGS) -o $@
|
||||
|
||||
js/decoder.js: src/vertexcodec.cpp src/indexcodec.cpp
|
||||
@mkdir -p build
|
||||
emcc $(filter %.cpp,$^) -O3 -DNDEBUG -s EXPORTED_FUNCTIONS='["_meshopt_decodeVertexBuffer", "_meshopt_decodeIndexBuffer"]' -s ALLOW_MEMORY_GROWTH=1 -s TOTAL_STACK=32768 -s TOTAL_MEMORY=65536 -o build/decoder.wasm
|
||||
sed -i "s#\(var wasm = \)\".*\";#\\1\"$$(cat build/decoder.wasm | base64 -w 0)\";#" $@
|
||||
|
||||
$(EXECUTABLE): $(DEMO_OBJECTS) $(LIBRARY)
|
||||
$(CXX) $^ $(LDFLAGS) -o $@
|
||||
|
||||
$(LIBRARY): $(LIBRARY_OBJECTS)
|
||||
ar rcs $@ $^
|
||||
|
||||
$(BUILD)/%.cpp.o: %.cpp
|
||||
@mkdir -p $(dir $@)
|
||||
$(CXX) $< $(CXXFLAGS) -c -MMD -MP -o $@
|
||||
|
||||
$(BUILD)/%.c.o: %.c
|
||||
@mkdir -p $(dir $@)
|
||||
$(CC) $< $(CFLAGS) -c -MMD -MP -o $@
|
||||
|
||||
-include $(OBJECTS:.o=.d)
|
||||
|
||||
clean:
|
||||
rm -rf $(BUILD)
|
||||
|
||||
.PHONY: all clean format
|
|
@ -0,0 +1,264 @@
|
|||
# meshoptimizer [![Build Status](https://travis-ci.org/zeux/meshoptimizer.svg?branch=master)](https://travis-ci.org/zeux/meshoptimizer) [![codecov.io](https://codecov.io/github/zeux/meshoptimizer/coverage.svg?branch=master)](https://codecov.io/github/zeux/meshoptimizer?branch=master) ![MIT](https://img.shields.io/badge/license-MIT-blue.svg) [![GitHub](https://img.shields.io/badge/repo-github-green.svg)](https://github.com/zeux/meshoptimizer)
|
||||
|
||||
## Purpose
|
||||
|
||||
When a GPU renders triangle meshes, various stages of the GPU pipeline have to process vertex and index data. The efficiency of these stages depends on the data you feed to them; this library provides algorithms to help optimize meshes for these stages, as well as algorithms to reduce the mesh complexity and storage overhead.
|
||||
|
||||
The library provides a C and C++ interface for all algorithms; you can use it from C/C++ or from other languages via FFI (such as P/Invoke). If you want to use this library from Rust, you should use [meshopt crate](https://crates.io/crates/meshopt).
|
||||
|
||||
## Installing
|
||||
|
||||
meshoptimizer is hosted on GitHub; you can download the latest release using git:
|
||||
|
||||
```
|
||||
git clone -b v0.11 https://github.com/zeux/meshoptimizer.git
|
||||
```
|
||||
|
||||
Alternatively you can [download the .zip archive from GitHub](https://github.com/zeux/meshoptimizer/archive/v0.11.zip).
|
||||
|
||||
## Building
|
||||
|
||||
meshoptimizer is distributed as a set of C++ source files. To include it into your project, you can use one of the two options:
|
||||
|
||||
* Use CMake to build the library (either as a standalone project or as part of your project)
|
||||
* Add source files to your project's build system
|
||||
|
||||
The source files are organized in such a way that you don't need to change your build-system settings, and you only need to add the files for the algorithms you use.
|
||||
|
||||
## Pipeline
|
||||
|
||||
When optimizing a mesh, you should typically feed it through a set of optimizations (the order is important!):
|
||||
|
||||
1. Indexing
|
||||
2. Vertex cache optimization
|
||||
3. Overdraw optimization
|
||||
4. Vertex fetch optimization
|
||||
5. Vertex quantization
|
||||
6. (optional) Vertex/index buffer compression
|
||||
|
||||
## Indexing
|
||||
|
||||
Most algorithms in this library assume that a mesh has a vertex buffer and an index buffer. For algorithms to work well and also for GPU to render your mesh efficiently, the vertex buffer has to have no redundant vertices; you can generate an index buffer from an unindexed vertex buffer or reindex an existing (potentially redundant) index buffer as follows:
|
||||
|
||||
First, generate a remap table from your existing vertex (and, optionally, index) data:
|
||||
|
||||
```c++
|
||||
size_t index_count = face_count * 3;
|
||||
std::vector<unsigned int> remap(index_count); // allocate temporary memory for the remap table
|
||||
size_t vertex_count = meshopt_generateVertexRemap(&remap[0], NULL, index_count, &unindexed_vertices[0], index_count, sizeof(Vertex));
|
||||
```
|
||||
|
||||
Note that in this case we only have an unindexed vertex buffer; the remap table is generated based on binary equivalence of the input vertices, so the resulting mesh will render the same way.
|
||||
|
||||
After generating the remap table, you can allocate space for the target vertex buffer (`vertex_count` elements) and index buffer (`index_count` elements) and generate them:
|
||||
|
||||
```c++
|
||||
meshopt_remapIndexBuffer(indices, NULL, index_count, &remap[0]);
|
||||
meshopt_remapVertexBuffer(vertices, &unindexed_vertices[0], index_count, sizeof(Vertex), &remap[0]);
|
||||
```
|
||||
|
||||
You can then further optimize the resulting buffers by calling the other functions on them in-place.
|
||||
|
||||
## Vertex cache optimization
|
||||
|
||||
When the GPU renders the mesh, it has to run the vertex shader for each vertex; usually GPUs have a built-in fixed size cache that stores the transformed vertices (the result of running the vertex shader), and uses this cache to reduce the number of vertex shader invocations. This cache is usually small, 16-32 vertices, and can have different replacement policies; to use this cache efficiently, you have to reorder your triangles to maximize the locality of reused vertex references like so:
|
||||
|
||||
```c++
|
||||
meshopt_optimizeVertexCache(indices, indices, index_count, vertex_count);
|
||||
```
|
||||
|
||||
## Overdraw optimization
|
||||
|
||||
After transforming the vertices, the GPU sends the triangles for rasterization which results in generating pixels that are usually first run through the depth test, and pixels that pass it get the pixel shader executed to generate the final color. As pixel shaders get more expensive, it becomes more and more important to reduce overdraw. While in general improving overdraw requires view-dependent operations, this library provides an algorithm to reorder triangles to minimize the overdraw from all directions, which you should run after vertex cache optimization like this:
|
||||
|
||||
```c++
|
||||
meshopt_optimizeOverdraw(indices, indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex), 1.05f);
|
||||
```
|
||||
|
||||
The overdraw optimizer needs to read vertex positions as a float3 from the vertex; the code snippet above assumes that the vertex stores position as `float x, y, z`.
|
||||
|
||||
When performing the overdraw optimization you have to specify a floating-point threshold parameter. The algorithm tries to maintain a balance between vertex cache efficiency and overdraw; the threshold determines how much the algorithm can compromise the vertex cache hit ratio, with 1.05 meaning that the resulting ratio should be at most 5% worse than before the optimization.
|
||||
|
||||
## Vertex fetch optimization
|
||||
|
||||
After the final triangle order has been established, we still can optimize the vertex buffer for memory efficiency. Before running the vertex shader the GPU has to fetch the vertex attributes from the vertex buffer; the fetch is usually backed by a memory cache, and as such optimizing the data for the locality of memory access is important.
|
||||
|
||||
To optimize the index/vertex buffers for vertex fetch efficiency, call:
|
||||
|
||||
```c++
|
||||
meshopt_optimizeVertexFetch(vertices, indices, index_count, vertices, vertex_count, sizeof(Vertex));
|
||||
```
|
||||
|
||||
This will reorder the vertices in the vertex buffer to try to improve the locality of reference, and rewrite the indices in place to match; if the vertex data is stored using multiple streams, you should use `meshopt_optimizeVertexFetchRemap` instead. This optimization has to be performed on the final index buffer since the optimal vertex order depends on the triangle order.
|
||||
|
||||
Note that the algorithm does not try to model cache replacement precisely and instead just orders vertices in the order of use, which generally produces results that are close to optimal.
|
||||
|
||||
## Vertex quantization
|
||||
|
||||
To optimize memory bandwidth when fetching the vertex data even further, and to reduce the amount of memory required to store the mesh, it is often beneficial to quantize the vertex attributes to smaller types. While this optimization can technically run at any part of the pipeline (and sometimes doing quantization as the first step can improve indexing by merging almost identical vertices), it generally is easier to run this after all other optimizations since some of them require access to float3 positions.
|
||||
|
||||
Quantization is usually domain specific; it's common to quantize normals using 3 8-bit integers but you can use higher-precision quantization (for example using 10 bits per component in a 10_10_10_2 format), or a different encoding to use just 2 components. For positions and texture coordinate data the two most common storage formats are half precision floats, and 16-bit normalized integers that encode the position relative to the AABB of the mesh or the UV bounding rectangle.
|
||||
|
||||
The number of possible combinations here is very large but this library does provide the building blocks, specifically functions to quantize floating point values to normalized integers, as well as half-precision floats. For example, here's how you can quantize a normal:
|
||||
|
||||
```c++
|
||||
unsigned int normal =
|
||||
(meshopt_quantizeUnorm(v.nx, 10) << 20) |
|
||||
(meshopt_quantizeUnorm(v.ny, 10) << 10) |
|
||||
meshopt_quantizeUnorm(v.nz, 10);
|
||||
```
|
||||
|
||||
and here's how you can quantize a position:
|
||||
|
||||
```c++
|
||||
unsigned short px = meshopt_quantizeHalf(v.x);
|
||||
unsigned short py = meshopt_quantizeHalf(v.y);
|
||||
unsigned short pz = meshopt_quantizeHalf(v.z);
|
||||
```
|
||||
|
||||
## Vertex/index buffer compression
|
||||
|
||||
In case storage size or transmission bandwidth is of importance, you might want to additionally compress vertex and index data. While several mesh compression libraries, like Google Draco, are available, they typically are designed to maximize the compression ratio at the cost of disturbing the vertex/index order (which makes the meshes inefficient to render on GPU) or decompression performance. They also frequently don't support custom game-ready quantized vertex formats and thus require to re-quantize the data after loading it, introducing extra quantization errors and making decoding slower.
|
||||
|
||||
Alternatively you can use general purpose compression libraries like zstd or Oodle to compress vertex/index data - however these compressors aren't designed to exploit redundancies in vertex/index data and as such compression rates can be unsatisfactory.
|
||||
|
||||
To that end, this library provides algorithms to "encode" vertex and index data. The result of the encoding is generally significantly smaller than initial data, and remains compressible with general purpose compressors - so you can either store encoded data directly (for modest compression ratios and maximum decoding performance), or further compress it with zstd/Oodle to maximize compression ratio.
|
||||
|
||||
To encode, you need to allocate target buffers (preferably using the worst case bound) and call encoding functions:
|
||||
|
||||
```c++
|
||||
std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertex_count, sizeof(Vertex)));
|
||||
vbuf.resize(meshopt_encodeVertexBuffer(&vbuf[0], vbuf.size(), vertices, vertex_count, sizeof(Vertex)));
|
||||
|
||||
std::vector<unsigned char> ibuf(meshopt_encodeIndexBufferBound(index_count, vertex_count));
|
||||
ibuf.resize(meshopt_encodeIndexBuffer(&ibuf[0], ibuf.size(), indices, index_count));
|
||||
```
|
||||
|
||||
You can then either serialize `vbuf`/`ibuf` as is, or compress them further. To decode the data at runtime, call decoding functions:
|
||||
|
||||
```c++
|
||||
int resvb = meshopt_decodeVertexBuffer(vertices, vertex_count, sizeof(Vertex), &vbuf[0], vbuf.size());
|
||||
int resib = meshopt_decodeIndexBuffer(indices, index_count, &ibuf[0], ibuf.size());
|
||||
assert(resvb == 0 && resib == 0);
|
||||
```
|
||||
|
||||
Note that vertex encoding assumes that vertex buffer was optimized for vertex fetch, and that vertices are quantized; index encoding assumes that the vertex/index buffers were optimized for vertex cache and vertex fetch. Feeding unoptimized data into the encoders will produce poor compression ratios. Both codecs are lossless - the only lossy step is quantization that happens before encoding.
|
||||
|
||||
Decoding functions are heavily optimized and can directly target write-combined memory; you can expect both decoders to run at 1-3 GB/s on modern desktop CPUs. Compression ratios depend on the data; vertex data compression ratio is typically around 2-4x (compared to already quantized data), index data compression ratio is around 5-6x (compared to raw 16-bit index data). General purpose lossless compressors can further improve on these results.
|
||||
|
||||
Due to a very high decoding performance and compatibility with general purpose lossless compressors, the compression is a good fit for the use on the web. To that end, meshoptimizer provides both vertex and index decoders compiled into WebAssembly and wrapped into a module with JavaScript-friendly interface, `js/decoder.js`, that you can use to decode meshes that were encoded offline:
|
||||
|
||||
```js
|
||||
// ready is a Promise that is resolved when (asynchronous) WebAssembly compilation finishes
|
||||
await MeshoptDecoder.ready;
|
||||
|
||||
// decode from *Data (Uint8Array) into *Buffer (Uint8Array)
|
||||
MeshoptDecoder.decodeVertexBuffer(vertexBuffer, vertexCount, vertexSize, vertexData);
|
||||
MeshoptDecoder.decodeIndexBuffer(indexBuffer, indexCount, indexSize, indexData);
|
||||
```
|
||||
|
||||
A THREE.js mesh loader is provided as an example in `tools/OptMeshLoader.js`; it loads meshes encoded using `tools/meshencoder.cpp`. [Usage example](https://zeuxcg.org/meshoptimizer/demo/) is available, with source in `demo/index.html`.
|
||||
|
||||
## Triangle strip conversion
|
||||
|
||||
On most hardware, indexed triangle lists are the most efficient way to drive the GPU. However, in some cases triangle strips might prove beneficial:
|
||||
|
||||
- On some older GPUs, triangle strips may be a bit more efficient to render
|
||||
- On extremely memory constrained systems, index buffers for triangle strips could save a bit of memory
|
||||
|
||||
This library provides an algorithm for converting a vertex cache optimized triangle list to a triangle strip:
|
||||
|
||||
```c++
|
||||
std::vector<unsigned int> strip(meshopt_stripifyBound(index_count));
|
||||
size_t strip_size = meshopt_stripify(&strip[0], indices, index_count, vertex_count);
|
||||
```
|
||||
|
||||
Typically you should expect triangle strips to have ~50-60% of indices compared to triangle lists (~1.5-1.8 indices per triangle) and have ~5% worse ACMR. Note that triangle strips require restart index support for rendering; using degenerate triangles to connect strips is not supported.
|
||||
|
||||
## Deinterleaved geometry
|
||||
|
||||
All of the examples above assume that geometry is represented as a single vertex buffer and a single index buffer. This requires storing all vertex attributes - position, normal, texture coordinate, skinning weights etc. - in a single contiguous struct. However, in some cases using multiple vertex streams may be preferable. In particular, if some passes require only positional data - such as depth pre-pass or shadow map - then it may be beneficial to split it from the rest of the vertex attributes to make sure the bandwidth use during these passes is optimal. On some mobile GPUs a position-only attribute stream also improves efficiency of tiling algorithms.
|
||||
|
||||
Most of the functions in this library either only need the index buffer (such as vertex cache optimization) or only need positional information (such as overdraw optimization). However, several tasks require knowledge about all vertex attributes.
|
||||
|
||||
For indexing, `meshopt_generateVertexRemap` assumes that there's just one vertex stream; when multiple vertex streams are used, it's necessary to use `meshopt_generateVertexRemapMulti` as follows:
|
||||
|
||||
```c++
|
||||
meshopt_Stream streams[] = {
|
||||
{&unindexed_pos[0], sizeof(float) * 3, sizeof(float) * 3},
|
||||
{&unindexed_nrm[0], sizeof(float) * 3, sizeof(float) * 3},
|
||||
{&unindexed_uv[0], sizeof(float) * 2, sizeof(float) * 2},
|
||||
};
|
||||
|
||||
std::vector<unsigned int> remap(index_count);
|
||||
size_t vertex_count = meshopt_generateVertexRemapMulti(&remap[0], NULL, index_count, index_count, streams, sizeof(streams) / sizeof(streams[0]));
|
||||
```
|
||||
|
||||
After this `meshopt_remapVertexBuffer` needs to be called once for each vertex stream to produce the correctly reindexed stream.
|
||||
|
||||
Instead of calling `meshopt_optimizeVertexFetch` for reordering vertices in a single vertex buffer for efficiency, calling `meshopt_optimizeVertexFetchRemap` and then calling `meshopt_remapVertexBuffer` for each stream again is recommended.
|
||||
|
||||
Finally, when compressing vertex data, `meshopt_encodeVertexBuffer` should be used on each vertex stream separately - this allows the encoder to best utilize correlation between attribute values for different vertices.
|
||||
|
||||
## Simplification
|
||||
|
||||
All algorithms presented so far don't affect visual appearance at all, with the exception of quantization that has minimal controlled impact. However, fundamentally the most effective way at reducing the rendering or transmission cost of a mesh is to make the mesh simpler.
|
||||
|
||||
This library provides two simplification algorithms that reduce the number of triangles in the mesh. Given a vertex and an index buffer, they generate a second index buffer that uses existing vertices in the vertex buffer. This index buffer can be used directly for rendering with the original vertex buffer (preferably after vertex cache optimization), or a new compact vertex/index buffer can be generated using `meshopt_optimizeVertexFetch` that uses the optimal number and order of vertices.
|
||||
|
||||
The first simplification algorithm, `meshopt_simplify`, follows the topology of the original mesh in an attempt to preserve attribute seams, borders and overall appearance. For meshes with inconsistent topology or many seams, such as faceted meshes, it can result in simplifier getting "stuck" and not being able to simplify the mesh fully; it's recommended to preprocess the index buffer with `meshopt_generateShadowIndexBuffer` to discard any vertex attributes that aren't critical and can be rebuilt later such as normals.
|
||||
|
||||
```
|
||||
float threshold = 0.2f;
|
||||
size_t target_index_count = size_t(index_count * threshold);
|
||||
float target_error = 1e-2f;
|
||||
|
||||
std::vector<unsigned int> lod(index_count);
|
||||
lod.resize(meshopt_simplify(&lod[0], indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex), target_index_count, target_error));
|
||||
```
|
||||
|
||||
Target error is an approximate measure of the deviation from the original mesh using distance normalized to 0..1 (so 1e-2f means that simplifier will try to maintain the error to be below 1% of the mesh extents). Note that because of topological restrictions and error bounds simplifier isn't guaranteed to reach the target index count and can stop earlier.
|
||||
|
||||
The second simplification algorithm, `meshopt_simplifySloppy`, doesn't follow the topology of the original mesh. This means that it doesn't preserve attribute seams or borders, but it can collapse internal details that are too small to matter better because it can merge mesh features that are topologically disjoint but spatially close.
|
||||
|
||||
```
|
||||
float threshold = 0.2f;
|
||||
size_t target_index_count = size_t(index_count * threshold);
|
||||
|
||||
std::vector<unsigned int> lod(target_index_count);
|
||||
lod.resize(meshopt_simplifySloppy(&lod[0], indices, index_count, &vertices[0].x, vertex_count, sizeof(Vertex), target_index_count));
|
||||
```
|
||||
|
||||
This algorithm is guaranteed to return a result at or below the target index count. It is 5-6x faster than `meshopt_simplify` when simplification ratio is large, and is able to reach ~20M triangles/sec on a desktop CPU (`meshopt_simplify` works at ~3M triangles/sec).
|
||||
|
||||
When a sequence of LOD meshes is generated that all use the original vertex buffer, care must be taken to order vertices optimally to not penalize mobile GPU architectures that are only capable of transforming a sequential vertex buffer range. It's recommended in this case to first optimize each LOD for vertex cache, then assemble all LODs in one large index buffer starting from the coarsest LOD (the one with fewest triangles), and call `meshopt_optimizeVertexFetch` on the final large index buffer. This will make sure that coarser LODs require a smaller vertex range and are efficient wrt vertex fetch and transform.
|
||||
|
||||
## Efficiency analyzers
|
||||
|
||||
While the only way to get precise performance data is to measure performance on the target GPU, it can be valuable to measure the impact of these optimizations in a GPU-independent manner. To this end, the library provides analyzers for all three major optimization routines. For each optimization there is a corresponding analyze function, like `meshopt_analyzeOverdraw`, that returns a struct with statistics.
|
||||
|
||||
`meshopt_analyzeVertexCache` returns vertex cache statistics. The common metric to use is ACMR - average cache miss ratio, which is the ratio of the total number of vertex invocations to the triangle count. The worst-case ACMR is 3 (GPU has to process 3 vertices for each triangle); on regular grids the optimal ACMR approaches 0.5. On real meshes it usually is in [0.5..1.5] range depending on the amount of vertex splits. One other useful metric is ATVR - average transformed vertex ratio - which represents the ratio of vertex shader invocations to the total vertices, and has the best case of 1.0 regardless of mesh topology (each vertex is transformed once).
|
||||
|
||||
`meshopt_analyzeVertexFetch` returns vertex fetch statistics. The main metric it uses is overfetch - the ratio between the number of bytes read from the vertex buffer to the total number of bytes in the vertex buffer. Assuming non-redundant vertex buffers, the best case is 1.0 - each byte is fetched once.
|
||||
|
||||
`meshopt_analyzeOverdraw` returns overdraw statistics. The main metric it uses is overdraw - the ratio between the number of pixel shader invocations to the total number of covered pixels, as measured from several different orthographic cameras. The best case for overdraw is 1.0 - each pixel is shaded once.
|
||||
|
||||
Note that all analyzers use approximate models for the relevant GPU units, so the numbers you will get as the result are only a rough approximation of the actual performance.
|
||||
|
||||
## Memory management
|
||||
|
||||
Many algorithms allocate temporary memory to store intermediate results or accelerate processing. The amount of memory allocated is a function of various input parameters such as vertex count and index count. By default memory is allocated using `operator new` and `operator delete`; if these operators are overloaded by the application, the overloads will be used instead. Alternatively it's possible to specify custom allocation/deallocation functions using `meshopt_setAllocator`, e.g.
|
||||
|
||||
```c++
|
||||
meshopt_setAllocator(malloc, free);
|
||||
```
|
||||
|
||||
> Note that currently the library expects the allocation function to either throw in case of out-of-memory (in which case the exception will propagate to the caller) or abort, so technically the use of `malloc` above isn't safe.
|
||||
|
||||
Vertex and index decoders (`meshopt_decodeVertexBuffer` and `meshopt_decodeIndexBuffer`) do not allocate memory and work completely within the buffer space provided via arguments.
|
||||
|
||||
All functions have bounded stack usage that does not exceed 32 KB for any algorithms.
|
||||
|
||||
## License
|
||||
|
||||
This library is available to anybody free of charge, under the terms of MIT License (see LICENSE.md).
|
|
@ -0,0 +1,10 @@
|
|||
comment: false
|
||||
|
||||
coverage:
|
||||
status:
|
||||
project: off
|
||||
patch: off
|
||||
|
||||
ignore:
|
||||
- demo
|
||||
- tools
|
|
@ -0,0 +1,2 @@
|
|||
/* This file makes sure the library can be used by C89 code */
|
||||
#include "../src/meshoptimizer.h"
|
|
@ -0,0 +1,122 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<title>meshoptimizer - demo</title>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, user-scalable=no, minimum-scale=1.0, maximum-scale=1.0">
|
||||
<style>
|
||||
body {
|
||||
font-family: Monospace;
|
||||
background-color: #000;
|
||||
color: #fff;
|
||||
margin: 0px;
|
||||
overflow: hidden;
|
||||
}
|
||||
#info {
|
||||
color: #fff;
|
||||
position: absolute;
|
||||
top: 10px;
|
||||
width: 100%;
|
||||
text-align: center;
|
||||
z-index: 100;
|
||||
display:block;
|
||||
}
|
||||
#info a, .button { color: #f00; font-weight: bold; text-decoration: underline; cursor: pointer }
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="info">
|
||||
<a href="https://github.com/zeux/meshoptimizer" target="_blank" rel="noopener">meshoptimizer</a>
|
||||
</div>
|
||||
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/99/three.min.js"></script>
|
||||
|
||||
<script src="../js/decoder.js"></script>
|
||||
<script src="../tools/OptMeshLoader.js"></script>
|
||||
|
||||
<script>
|
||||
var container;
|
||||
|
||||
var camera, scene, renderer;
|
||||
|
||||
var windowHalfX = window.innerWidth / 2;
|
||||
var windowHalfY = window.innerHeight / 2;
|
||||
|
||||
var timers = {};
|
||||
|
||||
console.time = function(label) {
|
||||
timers[label] = performance.now();
|
||||
};
|
||||
|
||||
console.timeEnd = function(label) {
|
||||
var time = performance.now() - timers[label];
|
||||
document.getElementById('info').append(label + " took " + time.toFixed(2) + " ms");
|
||||
};
|
||||
|
||||
init();
|
||||
animate();
|
||||
|
||||
function init()
|
||||
{
|
||||
container = document.createElement('div');
|
||||
document.body.appendChild(container);
|
||||
|
||||
camera = new THREE.PerspectiveCamera(45, window.innerWidth / window.innerHeight, 0.01, 100);
|
||||
camera.position.y = 1.0;
|
||||
camera.position.z = 3.0;
|
||||
|
||||
scene = new THREE.Scene();
|
||||
|
||||
var ambientLight = new THREE.AmbientLight(0xcccccc, 0.2);
|
||||
scene.add(ambientLight);
|
||||
|
||||
var pointLight = new THREE.PointLight(0xffffff, 0.8);
|
||||
pointLight.position.set(3, 3, 0);
|
||||
camera.add(pointLight);
|
||||
scene.add(camera);
|
||||
|
||||
var onProgress = function (xhr) {};
|
||||
var onError = function () {};
|
||||
|
||||
new THREE.OptMeshLoader()
|
||||
.setDecoder(MeshoptDecoder)
|
||||
.setMaterials(null) // materials can be fetched using MTLLoader
|
||||
.setPath('./')
|
||||
.load('pirate.optmesh', function (object)
|
||||
{
|
||||
scene.add(object);
|
||||
}, onProgress, onError);
|
||||
|
||||
renderer = new THREE.WebGLRenderer();
|
||||
renderer.setPixelRatio(window.devicePixelRatio);
|
||||
renderer.setSize(window.innerWidth, window.innerHeight);
|
||||
container.appendChild(renderer.domElement);
|
||||
|
||||
window.addEventListener('resize', onWindowResize, false);
|
||||
}
|
||||
|
||||
function onWindowResize()
|
||||
{
|
||||
windowHalfX = window.innerWidth / 2;
|
||||
windowHalfY = window.innerHeight / 2;
|
||||
|
||||
camera.aspect = window.innerWidth / window.innerHeight;
|
||||
camera.updateProjectionMatrix();
|
||||
|
||||
renderer.setSize(window.innerWidth, window.innerHeight);
|
||||
}
|
||||
|
||||
function animate()
|
||||
{
|
||||
requestAnimationFrame(animate);
|
||||
render();
|
||||
}
|
||||
|
||||
function render()
|
||||
{
|
||||
renderer.render(scene, camera);
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,941 @@
|
|||
#include "../src/meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "../tools/objparser.h"
|
||||
#include "miniz.h"
|
||||
|
||||
// This file uses assert() to verify algorithm correctness
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(__linux__)
|
||||
double timestamp()
|
||||
{
|
||||
timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
struct LARGE_INTEGER
|
||||
{
|
||||
__int64 QuadPart;
|
||||
};
|
||||
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceCounter(LARGE_INTEGER* lpPerformanceCount);
|
||||
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceFrequency(LARGE_INTEGER* lpFrequency);
|
||||
|
||||
double timestamp()
|
||||
{
|
||||
LARGE_INTEGER freq, counter;
|
||||
QueryPerformanceFrequency(&freq);
|
||||
QueryPerformanceCounter(&counter);
|
||||
return double(counter.QuadPart) / double(freq.QuadPart);
|
||||
}
|
||||
#else
|
||||
double timestamp()
|
||||
{
|
||||
return double(clock()) / double(CLOCKS_PER_SEC);
|
||||
}
|
||||
#endif
|
||||
|
||||
const size_t kCacheSize = 16;
|
||||
|
||||
struct Vertex
|
||||
{
|
||||
float px, py, pz;
|
||||
float nx, ny, nz;
|
||||
float tx, ty;
|
||||
};
|
||||
|
||||
struct Mesh
|
||||
{
|
||||
std::vector<Vertex> vertices;
|
||||
std::vector<unsigned int> indices;
|
||||
};
|
||||
|
||||
union Triangle {
|
||||
Vertex v[3];
|
||||
char data[sizeof(Vertex) * 3];
|
||||
};
|
||||
|
||||
Mesh parseObj(const char* path, double& reindex)
|
||||
{
|
||||
ObjFile file;
|
||||
|
||||
if (!objParseFile(file, path))
|
||||
{
|
||||
printf("Error loading %s: file not found\n", path);
|
||||
return Mesh();
|
||||
}
|
||||
|
||||
if (!objValidate(file))
|
||||
{
|
||||
printf("Error loading %s: invalid file data\n", path);
|
||||
return Mesh();
|
||||
}
|
||||
|
||||
size_t total_indices = file.f_size / 3;
|
||||
|
||||
std::vector<Vertex> vertices(total_indices);
|
||||
|
||||
for (size_t i = 0; i < total_indices; ++i)
|
||||
{
|
||||
int vi = file.f[i * 3 + 0];
|
||||
int vti = file.f[i * 3 + 1];
|
||||
int vni = file.f[i * 3 + 2];
|
||||
|
||||
Vertex v =
|
||||
{
|
||||
file.v[vi * 3 + 0],
|
||||
file.v[vi * 3 + 1],
|
||||
file.v[vi * 3 + 2],
|
||||
|
||||
vni >= 0 ? file.vn[vni * 3 + 0] : 0,
|
||||
vni >= 0 ? file.vn[vni * 3 + 1] : 0,
|
||||
vni >= 0 ? file.vn[vni * 3 + 2] : 0,
|
||||
|
||||
vti >= 0 ? file.vt[vti * 3 + 0] : 0,
|
||||
vti >= 0 ? file.vt[vti * 3 + 1] : 0,
|
||||
};
|
||||
|
||||
vertices[i] = v;
|
||||
}
|
||||
|
||||
reindex = timestamp();
|
||||
|
||||
Mesh result;
|
||||
|
||||
std::vector<unsigned int> remap(total_indices);
|
||||
|
||||
size_t total_vertices = meshopt_generateVertexRemap(&remap[0], NULL, total_indices, &vertices[0], total_indices, sizeof(Vertex));
|
||||
|
||||
result.indices.resize(total_indices);
|
||||
meshopt_remapIndexBuffer(&result.indices[0], NULL, total_indices, &remap[0]);
|
||||
|
||||
result.vertices.resize(total_vertices);
|
||||
meshopt_remapVertexBuffer(&result.vertices[0], &vertices[0], total_indices, sizeof(Vertex), &remap[0]);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
bool isMeshValid(const Mesh& mesh)
|
||||
{
|
||||
size_t index_count = mesh.indices.size();
|
||||
size_t vertex_count = mesh.vertices.size();
|
||||
|
||||
if (index_count % 3 != 0)
|
||||
return false;
|
||||
|
||||
const unsigned int* indices = &mesh.indices[0];
|
||||
|
||||
for (size_t i = 0; i < index_count; ++i)
|
||||
if (indices[i] >= vertex_count)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool rotateTriangle(Triangle& t)
|
||||
{
|
||||
int c01 = memcmp(&t.v[0], &t.v[1], sizeof(Vertex));
|
||||
int c02 = memcmp(&t.v[0], &t.v[2], sizeof(Vertex));
|
||||
int c12 = memcmp(&t.v[1], &t.v[2], sizeof(Vertex));
|
||||
|
||||
if (c12 < 0 && c01 > 0)
|
||||
{
|
||||
// 1 is minimum, rotate 012 => 120
|
||||
Vertex tv = t.v[0];
|
||||
t.v[0] = t.v[1], t.v[1] = t.v[2], t.v[2] = tv;
|
||||
}
|
||||
else if (c02 > 0 && c12 > 0)
|
||||
{
|
||||
// 2 is minimum, rotate 012 => 201
|
||||
Vertex tv = t.v[2];
|
||||
t.v[2] = t.v[1], t.v[1] = t.v[0], t.v[0] = tv;
|
||||
}
|
||||
|
||||
return c01 != 0 && c02 != 0 && c12 != 0;
|
||||
}
|
||||
|
||||
// Hashes a byte range using the MurmurHash2 mixing step.
// Note: only complete 4-byte words are processed - any trailing 1-3 bytes are
// ignored (callers hash data whose size is a multiple of 4) - and the final
// avalanche step is omitted since this is only used for equality checking.
unsigned int hashRange(const char* key, size_t len)
{
	// MurmurHash2
	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	unsigned int h = 0;

	while (len >= 4)
	{
		// memcpy avoids the unaligned access and strict-aliasing UB that a
		// reinterpret_cast load would have for arbitrary input pointers;
		// compilers lower this to a single load
		unsigned int k;
		memcpy(&k, key, sizeof(k));

		k *= m;
		k ^= k >> r;
		k *= m;

		h *= m;
		h ^= k;

		key += 4;
		len -= 4;
	}

	return h;
}
|
||||
|
||||
unsigned int hashMesh(const Mesh& mesh)
|
||||
{
|
||||
size_t triangle_count = mesh.indices.size() / 3;
|
||||
|
||||
const Vertex* vertices = &mesh.vertices[0];
|
||||
const unsigned int* indices = &mesh.indices[0];
|
||||
|
||||
unsigned int h1 = 0;
|
||||
unsigned int h2 = 0;
|
||||
|
||||
for (size_t i = 0; i < triangle_count; ++i)
|
||||
{
|
||||
Triangle t;
|
||||
t.v[0] = vertices[indices[i * 3 + 0]];
|
||||
t.v[1] = vertices[indices[i * 3 + 1]];
|
||||
t.v[2] = vertices[indices[i * 3 + 2]];
|
||||
|
||||
// skip degenerate triangles since some algorithms don't preserve them
|
||||
if (rotateTriangle(t))
|
||||
{
|
||||
unsigned int hash = hashRange(t.data, sizeof(t.data));
|
||||
|
||||
h1 ^= hash;
|
||||
h2 += hash;
|
||||
}
|
||||
}
|
||||
|
||||
return h1 * 0x5bd1e995 + h2;
|
||||
}
|
||||
|
||||
// No-op "optimization" used as a baseline to report statistics of the
// unmodified mesh.
void optNone(Mesh& mesh)
{
	// intentionally empty; parameter referenced only to silence unused warnings
	static_cast<void>(mesh);
}
|
||||
|
||||
// Randomly shuffles whole triangles (keeping each triangle's winding intact)
// using a deterministic LCG-seeded Fisher-Yates shuffle; serves as a
// worst-case baseline for vertex cache analysis.
void optRandomShuffle(Mesh& mesh)
{
	size_t triangle_count = mesh.indices.size() / 3;

	// guard: the loop below starts at triangle_count - 1, which wraps around
	// (size_t underflow) for an empty mesh; 0 or 1 triangles need no shuffle
	if (triangle_count <= 1)
		return;

	unsigned int* indices = &mesh.indices[0];

	unsigned int rng = 0;

	for (size_t i = triangle_count - 1; i > 0; --i)
	{
		// Fisher-Yates shuffle
		size_t j = rng % (i + 1);

		unsigned int t;
		t = indices[3 * j + 0], indices[3 * j + 0] = indices[3 * i + 0], indices[3 * i + 0] = t;
		t = indices[3 * j + 1], indices[3 * j + 1] = indices[3 * i + 1], indices[3 * i + 1] = t;
		t = indices[3 * j + 2], indices[3 * j + 2] = indices[3 * i + 2], indices[3 * i + 2] = t;

		// LCG RNG, constants from Numerical Recipes
		rng = rng * 1664525 + 1013904223;
	}
}
|
||||
|
||||
void optCache(Mesh& mesh)
|
||||
{
|
||||
meshopt_optimizeVertexCache(&mesh.indices[0], &mesh.indices[0], mesh.indices.size(), mesh.vertices.size());
|
||||
}
|
||||
|
||||
void optCacheFifo(Mesh& mesh)
|
||||
{
|
||||
meshopt_optimizeVertexCacheFifo(&mesh.indices[0], &mesh.indices[0], mesh.indices.size(), mesh.vertices.size(), kCacheSize);
|
||||
}
|
||||
|
||||
void optOverdraw(Mesh& mesh)
|
||||
{
|
||||
// use worst-case ACMR threshold so that overdraw optimizer can sort *all* triangles
|
||||
// warning: this significantly deteriorates the vertex cache efficiency so it is not advised; look at optComplete for the recommended method
|
||||
const float kThreshold = 3.f;
|
||||
meshopt_optimizeOverdraw(&mesh.indices[0], &mesh.indices[0], mesh.indices.size(), &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex), kThreshold);
|
||||
}
|
||||
|
||||
void optFetch(Mesh& mesh)
|
||||
{
|
||||
meshopt_optimizeVertexFetch(&mesh.vertices[0], &mesh.indices[0], mesh.indices.size(), &mesh.vertices[0], mesh.vertices.size(), sizeof(Vertex));
|
||||
}
|
||||
|
||||
void optFetchRemap(Mesh& mesh)
|
||||
{
|
||||
// this produces results equivalent to optFetch, but can be used to remap multiple vertex streams
|
||||
std::vector<unsigned int> remap(mesh.vertices.size());
|
||||
meshopt_optimizeVertexFetchRemap(&remap[0], &mesh.indices[0], mesh.indices.size(), mesh.vertices.size());
|
||||
|
||||
meshopt_remapIndexBuffer(&mesh.indices[0], &mesh.indices[0], mesh.indices.size(), &remap[0]);
|
||||
meshopt_remapVertexBuffer(&mesh.vertices[0], &mesh.vertices[0], mesh.vertices.size(), sizeof(Vertex), &remap[0]);
|
||||
}
|
||||
|
||||
void optComplete(Mesh& mesh)
|
||||
{
|
||||
// vertex cache optimization should go first as it provides starting order for overdraw
|
||||
meshopt_optimizeVertexCache(&mesh.indices[0], &mesh.indices[0], mesh.indices.size(), mesh.vertices.size());
|
||||
|
||||
// reorder indices for overdraw, balancing overdraw and vertex cache efficiency
|
||||
const float kThreshold = 1.01f; // allow up to 1% worse ACMR to get more reordering opportunities for overdraw
|
||||
meshopt_optimizeOverdraw(&mesh.indices[0], &mesh.indices[0], mesh.indices.size(), &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex), kThreshold);
|
||||
|
||||
// vertex fetch optimization should go last as it depends on the final index order
|
||||
meshopt_optimizeVertexFetch(&mesh.vertices[0], &mesh.indices[0], mesh.indices.size(), &mesh.vertices[0], mesh.vertices.size(), sizeof(Vertex));
|
||||
}
|
||||
|
||||
struct PackedVertex
|
||||
{
|
||||
unsigned short px, py, pz;
|
||||
unsigned short pw; // padding to 4b boundary
|
||||
signed char nx, ny, nz, nw;
|
||||
unsigned short tx, ty;
|
||||
};
|
||||
|
||||
void packMesh(std::vector<PackedVertex>& pv, const std::vector<Vertex>& vertices)
|
||||
{
|
||||
for (size_t i = 0; i < vertices.size(); ++i)
|
||||
{
|
||||
const Vertex& vi = vertices[i];
|
||||
PackedVertex& pvi = pv[i];
|
||||
|
||||
pvi.px = meshopt_quantizeHalf(vi.px);
|
||||
pvi.py = meshopt_quantizeHalf(vi.py);
|
||||
pvi.pz = meshopt_quantizeHalf(vi.pz);
|
||||
pvi.pw = 0;
|
||||
|
||||
pvi.nx = char(meshopt_quantizeSnorm(vi.nx, 8));
|
||||
pvi.ny = char(meshopt_quantizeSnorm(vi.ny, 8));
|
||||
pvi.nz = char(meshopt_quantizeSnorm(vi.nz, 8));
|
||||
pvi.nw = 0;
|
||||
|
||||
pvi.tx = meshopt_quantizeHalf(vi.tx);
|
||||
pvi.ty = meshopt_quantizeHalf(vi.ty);
|
||||
}
|
||||
}
|
||||
|
||||
struct PackedVertexOct
|
||||
{
|
||||
unsigned short px, py, pz;
|
||||
signed char nu, nv; // octahedron encoded normal, aliases .pw
|
||||
unsigned short tx, ty;
|
||||
};
|
||||
|
||||
void packMesh(std::vector<PackedVertexOct>& pv, const std::vector<Vertex>& vertices)
|
||||
{
|
||||
for (size_t i = 0; i < vertices.size(); ++i)
|
||||
{
|
||||
const Vertex& vi = vertices[i];
|
||||
PackedVertexOct& pvi = pv[i];
|
||||
|
||||
pvi.px = meshopt_quantizeHalf(vi.px);
|
||||
pvi.py = meshopt_quantizeHalf(vi.py);
|
||||
pvi.pz = meshopt_quantizeHalf(vi.pz);
|
||||
|
||||
float nsum = fabsf(vi.nx) + fabsf(vi.ny) + fabsf(vi.nz);
|
||||
float nx = vi.nx / nsum;
|
||||
float ny = vi.ny / nsum;
|
||||
float nz = vi.nz;
|
||||
|
||||
float nu = nz >= 0 ? nx : (1 - fabsf(ny)) * (nx >= 0 ? 1 : -1);
|
||||
float nv = nz >= 0 ? ny : (1 - fabsf(nx)) * (ny >= 0 ? 1 : -1);
|
||||
|
||||
pvi.nu = char(meshopt_quantizeSnorm(nu, 8));
|
||||
pvi.nv = char(meshopt_quantizeSnorm(nv, 8));
|
||||
|
||||
pvi.tx = meshopt_quantizeHalf(vi.tx);
|
||||
pvi.ty = meshopt_quantizeHalf(vi.ty);
|
||||
}
|
||||
}
|
||||
|
||||
void simplify(const Mesh& mesh, float threshold = 0.2f)
|
||||
{
|
||||
Mesh lod;
|
||||
|
||||
double start = timestamp();
|
||||
|
||||
size_t target_index_count = size_t(mesh.indices.size() * threshold);
|
||||
float target_error = 1e-2f;
|
||||
|
||||
lod.indices.resize(mesh.indices.size()); // note: simplify needs space for index_count elements in the destination array, not target_index_count
|
||||
lod.indices.resize(meshopt_simplify(&lod.indices[0], &mesh.indices[0], mesh.indices.size(), &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex), target_index_count, target_error));
|
||||
|
||||
lod.vertices.resize(lod.indices.size() < mesh.vertices.size() ? lod.indices.size() : mesh.vertices.size()); // note: this is just to reduce the cost of resize()
|
||||
lod.vertices.resize(meshopt_optimizeVertexFetch(&lod.vertices[0], &lod.indices[0], lod.indices.size(), &mesh.vertices[0], mesh.vertices.size(), sizeof(Vertex)));
|
||||
|
||||
double end = timestamp();
|
||||
|
||||
printf("%-9s: %d triangles => %d triangles in %.2f msec\n",
|
||||
"Simplify",
|
||||
int(mesh.indices.size() / 3), int(lod.indices.size() / 3), (end - start) * 1000);
|
||||
}
|
||||
|
||||
void simplifySloppy(const Mesh& mesh, float threshold = 0.2f)
|
||||
{
|
||||
Mesh lod;
|
||||
|
||||
double start = timestamp();
|
||||
|
||||
size_t target_index_count = size_t(mesh.indices.size() * threshold);
|
||||
|
||||
lod.indices.resize(target_index_count); // note: simplifySloppy, unlike simplify, is guaranteed to output results that don't exceed the requested target_index_count
|
||||
lod.indices.resize(meshopt_simplifySloppy(&lod.indices[0], &mesh.indices[0], mesh.indices.size(), &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex), target_index_count));
|
||||
|
||||
lod.vertices.resize(lod.indices.size() < mesh.vertices.size() ? lod.indices.size() : mesh.vertices.size()); // note: this is just to reduce the cost of resize()
|
||||
lod.vertices.resize(meshopt_optimizeVertexFetch(&lod.vertices[0], &lod.indices[0], lod.indices.size(), &mesh.vertices[0], mesh.vertices.size(), sizeof(Vertex)));
|
||||
|
||||
double end = timestamp();
|
||||
|
||||
printf("%-9s: %d triangles => %d triangles in %.2f msec\n",
|
||||
"SimplifyS",
|
||||
int(mesh.indices.size() / 3), int(lod.indices.size() / 3), (end - start) * 1000);
|
||||
}
|
||||
|
||||
void simplifyComplete(const Mesh& mesh)
|
||||
{
|
||||
static const size_t lod_count = 5;
|
||||
|
||||
double start = timestamp();
|
||||
|
||||
// generate 4 LOD levels (1-4), with each subsequent LOD using 70% triangles
|
||||
// note that each LOD uses the same (shared) vertex buffer
|
||||
std::vector<unsigned int> lods[lod_count];
|
||||
|
||||
lods[0] = mesh.indices;
|
||||
|
||||
for (size_t i = 1; i < lod_count; ++i)
|
||||
{
|
||||
std::vector<unsigned int>& lod = lods[i];
|
||||
|
||||
float threshold = powf(0.7f, float(i));
|
||||
size_t target_index_count = size_t(mesh.indices.size() * threshold) / 3 * 3;
|
||||
float target_error = 1e-2f;
|
||||
|
||||
// we can simplify all the way from base level or from the last result
|
||||
// simplifying from the base level sometimes produces better results, but simplifying from last level is faster
|
||||
const std::vector<unsigned int>& source = lods[i - 1];
|
||||
|
||||
if (source.size() < target_index_count)
|
||||
target_index_count = source.size();
|
||||
|
||||
lod.resize(source.size());
|
||||
lod.resize(meshopt_simplify(&lod[0], &source[0], source.size(), &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex), target_index_count, target_error));
|
||||
}
|
||||
|
||||
double middle = timestamp();
|
||||
|
||||
// optimize each individual LOD for vertex cache & overdraw
|
||||
for (size_t i = 0; i < lod_count; ++i)
|
||||
{
|
||||
std::vector<unsigned int>& lod = lods[i];
|
||||
|
||||
meshopt_optimizeVertexCache(&lod[0], &lod[0], lod.size(), mesh.vertices.size());
|
||||
meshopt_optimizeOverdraw(&lod[0], &lod[0], lod.size(), &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex), 1.0f);
|
||||
}
|
||||
|
||||
// concatenate all LODs into one IB
|
||||
// note: the order of concatenation is important - since we optimize the entire IB for vertex fetch,
|
||||
// putting coarse LODs first makes sure that the vertex range referenced by them is as small as possible
|
||||
// some GPUs process the entire range referenced by the index buffer region so doing this optimizes the vertex transform
|
||||
// cost for coarse LODs
|
||||
// this order also produces much better vertex fetch cache coherency for coarse LODs (since they're essentially optimized first)
|
||||
// somewhat surprisingly, the vertex fetch cache coherency for fine LODs doesn't seem to suffer that much.
|
||||
size_t lod_index_offsets[lod_count] = {};
|
||||
size_t lod_index_counts[lod_count] = {};
|
||||
size_t total_index_count = 0;
|
||||
|
||||
for (int i = lod_count - 1; i >= 0; --i)
|
||||
{
|
||||
lod_index_offsets[i] = total_index_count;
|
||||
lod_index_counts[i] = lods[i].size();
|
||||
|
||||
total_index_count += lods[i].size();
|
||||
}
|
||||
|
||||
std::vector<unsigned int> indices(total_index_count);
|
||||
|
||||
for (size_t i = 0; i < lod_count; ++i)
|
||||
{
|
||||
memcpy(&indices[lod_index_offsets[i]], &lods[i][0], lods[i].size() * sizeof(lods[i][0]));
|
||||
}
|
||||
|
||||
std::vector<Vertex> vertices = mesh.vertices;
|
||||
|
||||
// vertex fetch optimization should go last as it depends on the final index order
|
||||
// note that the order of LODs above affects vertex fetch results
|
||||
meshopt_optimizeVertexFetch(&vertices[0], &indices[0], indices.size(), &vertices[0], vertices.size(), sizeof(Vertex));
|
||||
|
||||
double end = timestamp();
|
||||
|
||||
printf("%-9s: %d triangles => %d LOD levels down to %d triangles in %.2f msec, optimized in %.2f msec\n",
|
||||
"SimplifyC",
|
||||
int(lod_index_counts[0]) / 3, int(lod_count), int(lod_index_counts[lod_count - 1]) / 3,
|
||||
(middle - start) * 1000, (end - middle) * 1000);
|
||||
|
||||
// for using LOD data at runtime, in addition to vertices and indices you have to save lod_index_offsets/lod_index_counts.
|
||||
|
||||
{
|
||||
meshopt_VertexCacheStatistics vcs0 = meshopt_analyzeVertexCache(&indices[lod_index_offsets[0]], lod_index_counts[0], vertices.size(), kCacheSize, 0, 0);
|
||||
meshopt_VertexFetchStatistics vfs0 = meshopt_analyzeVertexFetch(&indices[lod_index_offsets[0]], lod_index_counts[0], vertices.size(), sizeof(Vertex));
|
||||
meshopt_VertexCacheStatistics vcsN = meshopt_analyzeVertexCache(&indices[lod_index_offsets[lod_count - 1]], lod_index_counts[lod_count - 1], vertices.size(), kCacheSize, 0, 0);
|
||||
meshopt_VertexFetchStatistics vfsN = meshopt_analyzeVertexFetch(&indices[lod_index_offsets[lod_count - 1]], lod_index_counts[lod_count - 1], vertices.size(), sizeof(Vertex));
|
||||
|
||||
typedef PackedVertexOct PV;
|
||||
|
||||
std::vector<PV> pv(vertices.size());
|
||||
packMesh(pv, vertices);
|
||||
|
||||
std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(PV)));
|
||||
vbuf.resize(meshopt_encodeVertexBuffer(&vbuf[0], vbuf.size(), &pv[0], vertices.size(), sizeof(PV)));
|
||||
|
||||
std::vector<unsigned char> ibuf(meshopt_encodeIndexBufferBound(indices.size(), vertices.size()));
|
||||
ibuf.resize(meshopt_encodeIndexBuffer(&ibuf[0], ibuf.size(), &indices[0], indices.size()));
|
||||
|
||||
printf("%-9s ACMR %f...%f Overfetch %f..%f Codec VB %.1f bits/vertex IB %.1f bits/triangle\n",
|
||||
"",
|
||||
vcs0.acmr, vcsN.acmr, vfs0.overfetch, vfsN.overfetch,
|
||||
double(vbuf.size()) / double(vertices.size()) * 8,
|
||||
double(ibuf.size()) / double(indices.size() / 3) * 8);
|
||||
}
|
||||
}
|
||||
|
||||
void optimize(const Mesh& mesh, const char* name, void (*optf)(Mesh& mesh))
|
||||
{
|
||||
Mesh copy = mesh;
|
||||
|
||||
double start = timestamp();
|
||||
optf(copy);
|
||||
double end = timestamp();
|
||||
|
||||
assert(isMeshValid(copy));
|
||||
assert(hashMesh(mesh) == hashMesh(copy));
|
||||
|
||||
meshopt_VertexCacheStatistics vcs = meshopt_analyzeVertexCache(©.indices[0], copy.indices.size(), copy.vertices.size(), kCacheSize, 0, 0);
|
||||
meshopt_VertexFetchStatistics vfs = meshopt_analyzeVertexFetch(©.indices[0], copy.indices.size(), copy.vertices.size(), sizeof(Vertex));
|
||||
meshopt_OverdrawStatistics os = meshopt_analyzeOverdraw(©.indices[0], copy.indices.size(), ©.vertices[0].px, copy.vertices.size(), sizeof(Vertex));
|
||||
|
||||
meshopt_VertexCacheStatistics vcs_nv = meshopt_analyzeVertexCache(©.indices[0], copy.indices.size(), copy.vertices.size(), 32, 32, 32);
|
||||
meshopt_VertexCacheStatistics vcs_amd = meshopt_analyzeVertexCache(©.indices[0], copy.indices.size(), copy.vertices.size(), 14, 64, 128);
|
||||
meshopt_VertexCacheStatistics vcs_intel = meshopt_analyzeVertexCache(©.indices[0], copy.indices.size(), copy.vertices.size(), 128, 0, 0);
|
||||
|
||||
printf("%-9s: ACMR %f ATVR %f (NV %f AMD %f Intel %f) Overfetch %f Overdraw %f in %.2f msec\n", name, vcs.acmr, vcs.atvr, vcs_nv.atvr, vcs_amd.atvr, vcs_intel.atvr, vfs.overfetch, os.overdraw, (end - start) * 1000);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t compress(const std::vector<T>& data)
|
||||
{
|
||||
std::vector<unsigned char> cbuf(tdefl_compress_bound(data.size() * sizeof(T)));
|
||||
unsigned int flags = tdefl_create_comp_flags_from_zip_params(MZ_DEFAULT_LEVEL, 15, MZ_DEFAULT_STRATEGY);
|
||||
return tdefl_compress_mem_to_mem(&cbuf[0], cbuf.size(), &data[0], data.size() * sizeof(T), flags);
|
||||
}
|
||||
|
||||
void encodeIndex(const Mesh& mesh)
|
||||
{
|
||||
// allocate result outside of the timing loop to exclude memset() from decode timing
|
||||
std::vector<unsigned int> result(mesh.indices.size());
|
||||
|
||||
double start = timestamp();
|
||||
|
||||
std::vector<unsigned char> buffer(meshopt_encodeIndexBufferBound(mesh.indices.size(), mesh.vertices.size()));
|
||||
buffer.resize(meshopt_encodeIndexBuffer(&buffer[0], buffer.size(), &mesh.indices[0], mesh.indices.size()));
|
||||
|
||||
double middle = timestamp();
|
||||
|
||||
int res = meshopt_decodeIndexBuffer(&result[0], mesh.indices.size(), &buffer[0], buffer.size());
|
||||
assert(res == 0);
|
||||
(void)res;
|
||||
|
||||
double end = timestamp();
|
||||
|
||||
size_t csize = compress(buffer);
|
||||
|
||||
for (size_t i = 0; i < mesh.indices.size(); i += 3)
|
||||
{
|
||||
assert(
|
||||
(result[i + 0] == mesh.indices[i + 0] && result[i + 1] == mesh.indices[i + 1] && result[i + 2] == mesh.indices[i + 2]) ||
|
||||
(result[i + 1] == mesh.indices[i + 0] && result[i + 2] == mesh.indices[i + 1] && result[i + 0] == mesh.indices[i + 2]) ||
|
||||
(result[i + 2] == mesh.indices[i + 0] && result[i + 0] == mesh.indices[i + 1] && result[i + 1] == mesh.indices[i + 2]));
|
||||
}
|
||||
|
||||
printf("IdxCodec : %.1f bits/triangle (post-deflate %.1f bits/triangle); encode %.2f msec, decode %.2f msec (%.2f GB/s)\n",
|
||||
double(buffer.size() * 8) / double(mesh.indices.size() / 3),
|
||||
double(csize * 8) / double(mesh.indices.size() / 3),
|
||||
(middle - start) * 1000,
|
||||
(end - middle) * 1000,
|
||||
(double(result.size() * 4) / (1 << 30)) / (end - middle));
|
||||
}
|
||||
|
||||
template <typename PV>
|
||||
void packVertex(const Mesh& mesh, const char* pvn)
|
||||
{
|
||||
std::vector<PV> pv(mesh.vertices.size());
|
||||
packMesh(pv, mesh.vertices);
|
||||
|
||||
size_t csize = compress(pv);
|
||||
|
||||
printf("VtxPack%s : %.1f bits/vertex (post-deflate %.1f bits/vertex)\n", pvn,
|
||||
double(pv.size() * sizeof(PV) * 8) / double(mesh.vertices.size()),
|
||||
double(csize * 8) / double(mesh.vertices.size()));
|
||||
}
|
||||
|
||||
// Round-trips the mesh vertices through the meshopt vertex codec and reports
// compression ratio (raw and post-deflate) plus encode/decode timings.
// pvn is a short suffix used to distinguish packed vertex formats in the output.
template <typename PV>
void encodeVertex(const Mesh& mesh, const char* pvn)
{
	// quantize into the packed format PV first; the codec operates on the packed data
	std::vector<PV> pv(mesh.vertices.size());
	packMesh(pv, mesh.vertices);

	// allocate result outside of the timing loop to exclude memset() from decode timing
	std::vector<PV> result(mesh.vertices.size());

	double start = timestamp();

	// encode into a worst-case sized buffer, then shrink to the actual encoded size
	std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(mesh.vertices.size(), sizeof(PV)));
	vbuf.resize(meshopt_encodeVertexBuffer(&vbuf[0], vbuf.size(), &pv[0], mesh.vertices.size(), sizeof(PV)));

	double middle = timestamp();

	int res = meshopt_decodeVertexBuffer(&result[0], mesh.vertices.size(), sizeof(PV), &vbuf[0], vbuf.size());
	assert(res == 0);
	(void)res;

	double end = timestamp();

	// decoding must reproduce the packed input bit-exactly
	assert(memcmp(&pv[0], &result[0], pv.size() * sizeof(PV)) == 0);

	// deflate the encoded stream to estimate what a general-purpose compressor adds on top
	size_t csize = compress(vbuf);

	printf("VtxCodec%1s: %.1f bits/vertex (post-deflate %.1f bits/vertex); encode %.2f msec, decode %.2f msec (%.2f GB/s)\n", pvn,
	    double(vbuf.size() * 8) / double(mesh.vertices.size()),
	    double(csize * 8) / double(mesh.vertices.size()),
	    (middle - start) * 1000,
	    (end - middle) * 1000,
	    (double(result.size() * sizeof(PV)) / (1 << 30)) / (end - middle));
}
|
||||
|
||||
// Converts the mesh index buffer into a triangle strip, verifies that
// unstripification reproduces the original topology, and reports strip size
// plus vertex cache efficiency of the strip under several cache models.
void stripify(const Mesh& mesh)
{
	// note: input mesh is assumed to be optimized for vertex cache and vertex fetch
	double start = timestamp();
	std::vector<unsigned int> strip(meshopt_stripifyBound(mesh.indices.size()));
	strip.resize(meshopt_stripify(&strip[0], &mesh.indices[0], mesh.indices.size(), mesh.vertices.size()));
	double end = timestamp();

	// round-trip: expand the strip back to a triangle list into a copy of the mesh
	Mesh copy = mesh;
	copy.indices.resize(meshopt_unstripify(&copy.indices[0], &strip[0], strip.size()));
	assert(copy.indices.size() <= meshopt_unstripifyBound(strip.size()));

	// the unstripified mesh must be valid and topologically identical to the input
	assert(isMeshValid(copy));
	assert(hashMesh(mesh) == hashMesh(copy));

	// analyze cache behaviour with the generic model plus NV/AMD/Intel-like configurations
	meshopt_VertexCacheStatistics vcs = meshopt_analyzeVertexCache(&copy.indices[0], mesh.indices.size(), mesh.vertices.size(), kCacheSize, 0, 0);
	meshopt_VertexCacheStatistics vcs_nv = meshopt_analyzeVertexCache(&copy.indices[0], mesh.indices.size(), mesh.vertices.size(), 32, 32, 32);
	meshopt_VertexCacheStatistics vcs_amd = meshopt_analyzeVertexCache(&copy.indices[0], mesh.indices.size(), mesh.vertices.size(), 14, 64, 128);
	meshopt_VertexCacheStatistics vcs_intel = meshopt_analyzeVertexCache(&copy.indices[0], mesh.indices.size(), mesh.vertices.size(), 128, 0, 0);

	printf("Stripify : ACMR %f ATVR %f (NV %f AMD %f Intel %f); %d strip indices (%.1f%%) in %.2f msec\n",
	    vcs.acmr, vcs.atvr, vcs_nv.atvr, vcs_amd.atvr, vcs_intel.atvr,
	    int(strip.size()), double(strip.size()) / double(mesh.indices.size()) * 100,
	    (end - start) * 1000);
}
|
||||
|
||||
// Builds a position-only ("shadow") index buffer that reuses the original
// vertex data, and reports ACMR and unique-vertex-count improvements relative
// to the original index buffer.
void shadow(const Mesh& mesh)
{
	// note: input mesh is assumed to be optimized for vertex cache and vertex fetch

	double start = timestamp();
	// this index buffer can be used for position-only rendering using the same vertex data that the original index buffer uses
	std::vector<unsigned int> shadow_indices(mesh.indices.size());
	meshopt_generateShadowIndexBuffer(&shadow_indices[0], &mesh.indices[0], mesh.indices.size(), &mesh.vertices[0], mesh.vertices.size(), sizeof(float) * 3, sizeof(Vertex));
	double end = timestamp();

	// while you can't optimize the vertex data after shadow IB was constructed, you can and should optimize the shadow IB for vertex cache
	// this is valuable even if the original indices array was optimized for vertex cache!
	meshopt_optimizeVertexCache(&shadow_indices[0], &shadow_indices[0], shadow_indices.size(), mesh.vertices.size());

	// vcs = original IB, vcss = shadow IB; both analyzed with the same cache model
	meshopt_VertexCacheStatistics vcs = meshopt_analyzeVertexCache(&mesh.indices[0], mesh.indices.size(), mesh.vertices.size(), kCacheSize, 0, 0);
	meshopt_VertexCacheStatistics vcss = meshopt_analyzeVertexCache(&shadow_indices[0], shadow_indices.size(), mesh.vertices.size(), kCacheSize, 0, 0);

	// count unique vertices referenced by the shadow index buffer
	std::vector<char> shadow_flags(mesh.vertices.size());
	size_t shadow_vertices = 0;

	for (size_t i = 0; i < shadow_indices.size(); ++i)
	{
		unsigned int index = shadow_indices[i];
		shadow_vertices += 1 - shadow_flags[index];
		shadow_flags[index] = 1;
	}

	printf("ShadowIB : ACMR %f (%.2fx improvement); %d shadow vertices (%.2fx improvement) in %.2f msec\n",
	    vcss.acmr, double(vcs.vertices_transformed) / double(vcss.vertices_transformed),
	    int(shadow_vertices), double(mesh.vertices.size()) / double(shadow_vertices),
	    (end - start) * 1000);
}
|
||||
|
||||
// Splits the mesh into meshlets (clusters bounded by max_vertices/max_triangles),
// reports occupancy statistics, then evaluates cone-based backface culling of the
// meshlets from a fixed camera position — comparing the full float bounds against
// the compact s8-quantized bounds.
void meshlets(const Mesh& mesh)
{
	const size_t max_vertices = 64;
	const size_t max_triangles = 126;

	// note: input mesh is assumed to be optimized for vertex cache and vertex fetch
	double start = timestamp();
	std::vector<meshopt_Meshlet> meshlets(meshopt_buildMeshletsBound(mesh.indices.size(), max_vertices, max_triangles));
	meshlets.resize(meshopt_buildMeshlets(&meshlets[0], &mesh.indices[0], mesh.indices.size(), mesh.vertices.size(), max_vertices, max_triangles));
	double end = timestamp();

	// occupancy statistics: how close do meshlets get to the configured limits?
	double avg_vertices = 0;
	double avg_triangles = 0;
	size_t not_full = 0;

	for (size_t i = 0; i < meshlets.size(); ++i)
	{
		const meshopt_Meshlet& m = meshlets[i];

		avg_vertices += m.vertex_count;
		avg_triangles += m.triangle_count;
		not_full += m.vertex_count < max_vertices;
	}

	avg_vertices /= double(meshlets.size());
	avg_triangles /= double(meshlets.size());

	printf("Meshlets : %d meshlets (avg vertices %.1f, avg triangles %.1f, not full %d) in %.2f msec\n",
	    int(meshlets.size()), avg_vertices, avg_triangles, int(not_full), (end - start) * 1000);

	// fixed viewpoint for the culling experiment below
	float camera[3] = {100, 100, 100};

	size_t rejected = 0;
	size_t rejected_s8 = 0;
	size_t rejected_alt = 0;
	size_t rejected_alt_s8 = 0;
	size_t accepted = 0;
	size_t accepted_s8 = 0;

	double startc = timestamp();
	for (size_t i = 0; i < meshlets.size(); ++i)
	{
		meshopt_Bounds bounds = meshopt_computeMeshletBounds(&meshlets[i], &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex));

		// trivial accept: we can't ever backface cull this meshlet
		accepted += (bounds.cone_cutoff >= 1);
		accepted_s8 += (bounds.cone_cutoff_s8 >= 127);

		// perspective projection: dot(normalize(cone_apex - camera_position), cone_axis) > cone_cutoff
		float mview[3] = {bounds.cone_apex[0] - camera[0], bounds.cone_apex[1] - camera[1], bounds.cone_apex[2] - camera[2]};
		float mviewlength = sqrtf(mview[0] * mview[0] + mview[1] * mview[1] + mview[2] * mview[2]);

		// comparison is done unnormalized: dot(mview, axis) >= cutoff * |mview|
		rejected += mview[0] * bounds.cone_axis[0] + mview[1] * bounds.cone_axis[1] + mview[2] * bounds.cone_axis[2] >= bounds.cone_cutoff * mviewlength;
		// same test with the axis/cutoff dequantized from the s8 representation (/127)
		rejected_s8 += mview[0] * (bounds.cone_axis_s8[0] / 127.f) + mview[1] * (bounds.cone_axis_s8[1] / 127.f) + mview[2] * (bounds.cone_axis_s8[2] / 127.f) >= (bounds.cone_cutoff_s8 / 127.f) * mviewlength;

		// alternative formulation for perspective projection that doesn't use apex (and uses cluster bounding sphere instead):
		// dot(normalize(center - camera_position), cone_axis) > cone_cutoff + radius / length(center - camera_position)
		float cview[3] = {bounds.center[0] - camera[0], bounds.center[1] - camera[1], bounds.center[2] - camera[2]};
		float cviewlength = sqrtf(cview[0] * cview[0] + cview[1] * cview[1] + cview[2] * cview[2]);

		rejected_alt += cview[0] * bounds.cone_axis[0] + cview[1] * bounds.cone_axis[1] + cview[2] * bounds.cone_axis[2] >= bounds.cone_cutoff * cviewlength + bounds.radius;
		rejected_alt_s8 += cview[0] * (bounds.cone_axis_s8[0] / 127.f) + cview[1] * (bounds.cone_axis_s8[1] / 127.f) + cview[2] * (bounds.cone_axis_s8[2] / 127.f) >= (bounds.cone_cutoff_s8 / 127.f) * cviewlength + bounds.radius;
	}
	double endc = timestamp();

	printf("ConeCull : rejected apex %d (%.1f%%) / center %d (%.1f%%), trivially accepted %d (%.1f%%) in %.2f msec\n",
	    int(rejected), double(rejected) / double(meshlets.size()) * 100,
	    int(rejected_alt), double(rejected_alt) / double(meshlets.size()) * 100,
	    int(accepted), double(accepted) / double(meshlets.size()) * 100,
	    (endc - startc) * 1000);
	printf("ConeCull8: rejected apex %d (%.1f%%) / center %d (%.1f%%), trivially accepted %d (%.1f%%) in %.2f msec\n",
	    int(rejected_s8), double(rejected_s8) / double(meshlets.size()) * 100,
	    int(rejected_alt_s8), double(rejected_alt_s8) / double(meshlets.size()) * 100,
	    int(accepted_s8), double(accepted_s8) / double(meshlets.size()) * 100,
	    (endc - startc) * 1000);
}
|
||||
|
||||
// Parses an .obj file at path into mesh. Returns false (after printing a
// diagnostic) when the parsed mesh has no vertices; otherwise prints parse
// and indexing timings and returns true.
bool loadMesh(Mesh& mesh, const char* path)
{
	double load_begin = timestamp();
	double parse_done; // filled in by parseObj: time when raw parsing finished
	mesh = parseObj(path, parse_done);
	double index_done = timestamp();

	if (mesh.vertices.empty())
	{
		printf("Mesh %s is empty, skipping\n", path);
		return false;
	}

	int vertex_count = int(mesh.vertices.size());
	int triangle_count = int(mesh.indices.size() / 3);

	printf("# %s: %d vertices, %d triangles; read in %.2f msec; indexed in %.2f msec\n", path, vertex_count, triangle_count, (parse_done - load_begin) * 1000, (index_done - parse_done) * 1000);
	return true;
}
|
||||
|
||||
// Demonstrates the optimization pipeline on deinterleaved (multi-stream) geometry.
void processDeinterleaved(const char* path)
{
	// Most algorithms in the library work out of the box with deinterleaved geometry, but some require slightly special treatment;
	// this code runs a simplified version of complete opt. pipeline using deinterleaved geo. There's no compression performed but you
	// can trivially run it by quantizing all elements and running meshopt_encodeVertexBuffer once for each vertex stream.
	ObjFile file;
	if (!objParseFile(file, path) || !objValidate(file))
	{
		printf("Error loading %s: file not found or invalid file data\n", path);
		return;
	}

	// file.f stores three ints (v/vt/vn) per face corner, hence /3
	size_t total_indices = file.f_size / 3;

	// flatten into unindexed (per-corner) position/normal/uv streams
	std::vector<float> unindexed_pos(total_indices * 3);
	std::vector<float> unindexed_nrm(total_indices * 3);
	std::vector<float> unindexed_uv(total_indices * 2);

	for (size_t i = 0; i < total_indices; ++i)
	{
		int vi = file.f[i * 3 + 0];
		int vti = file.f[i * 3 + 1];
		int vni = file.f[i * 3 + 2];

		unindexed_pos[i * 3 + 0] = file.v[vi * 3 + 0];
		unindexed_pos[i * 3 + 1] = file.v[vi * 3 + 1];
		unindexed_pos[i * 3 + 2] = file.v[vi * 3 + 2];

		// normal/uv indices may be absent (negative); the streams stay zero-filled then
		if (vni >= 0)
		{
			unindexed_nrm[i * 3 + 0] = file.vn[vni * 3 + 0];
			unindexed_nrm[i * 3 + 1] = file.vn[vni * 3 + 1];
			unindexed_nrm[i * 3 + 2] = file.vn[vni * 3 + 2];
		}

		if (vti >= 0)
		{
			unindexed_uv[i * 2 + 0] = file.vt[vti * 3 + 0];
			unindexed_uv[i * 2 + 1] = file.vt[vti * 3 + 1];
		}
	}

	double start = timestamp();

	meshopt_Stream streams[] = {
	    {&unindexed_pos[0], sizeof(float) * 3, sizeof(float) * 3},
	    {&unindexed_nrm[0], sizeof(float) * 3, sizeof(float) * 3},
	    {&unindexed_uv[0], sizeof(float) * 2, sizeof(float) * 2},
	};

	// build one remap table that treats a vertex as the tuple of all three streams
	std::vector<unsigned int> remap(total_indices);

	size_t total_vertices = meshopt_generateVertexRemapMulti(&remap[0], NULL, total_indices, total_indices, streams, sizeof(streams) / sizeof(streams[0]));

	std::vector<unsigned int> indices(total_indices);
	meshopt_remapIndexBuffer(&indices[0], NULL, total_indices, &remap[0]);

	// apply the shared remap to each stream independently
	std::vector<float> pos(total_vertices * 3);
	meshopt_remapVertexBuffer(&pos[0], &unindexed_pos[0], total_indices, sizeof(float) * 3, &remap[0]);

	std::vector<float> nrm(total_vertices * 3);
	meshopt_remapVertexBuffer(&nrm[0], &unindexed_nrm[0], total_indices, sizeof(float) * 3, &remap[0]);

	std::vector<float> uv(total_vertices * 2);
	meshopt_remapVertexBuffer(&uv[0], &unindexed_uv[0], total_indices, sizeof(float) * 2, &remap[0]);

	double reindex = timestamp();

	meshopt_optimizeVertexCache(&indices[0], &indices[0], total_indices, total_vertices);

	// fetch optimization yields a remap that must be applied to every stream
	meshopt_optimizeVertexFetchRemap(&remap[0], &indices[0], total_indices, total_vertices);
	meshopt_remapVertexBuffer(&pos[0], &pos[0], total_vertices, sizeof(float) * 3, &remap[0]);
	meshopt_remapVertexBuffer(&nrm[0], &nrm[0], total_vertices, sizeof(float) * 3, &remap[0]);
	meshopt_remapVertexBuffer(&uv[0], &uv[0], total_vertices, sizeof(float) * 2, &remap[0]);

	double optimize = timestamp();

	// note: since shadow index buffer is computed based on regular vertex/index buffer, the stream points at the indexed data - not unindexed_pos
	meshopt_Stream shadow_stream = {&pos[0], sizeof(float) * 3, sizeof(float) * 3};

	std::vector<unsigned int> shadow_indices(total_indices);
	meshopt_generateShadowIndexBufferMulti(&shadow_indices[0], &indices[0], total_indices, total_vertices, &shadow_stream, 1);

	meshopt_optimizeVertexCache(&shadow_indices[0], &shadow_indices[0], total_indices, total_vertices);

	double shadow = timestamp();

	printf("Deintrlvd: %d vertices, reindexed in %.2f msec, optimized in %.2f msec, generated & optimized shadow indices in %.2f msec\n",
	    int(total_vertices), (reindex - start) * 1000, (optimize - reindex) * 1000, (shadow - optimize) * 1000);
}
|
||||
|
||||
// Runs the full demo pipeline on one .obj file: every index-optimization
// variant, stripification, meshlets, shadow indices, index/vertex codecs,
// simplification, and the deinterleaved-geometry variant of the pipeline.
void process(const char* path)
{
	Mesh mesh;
	if (!loadMesh(mesh, path))
		return;

	// compare index optimization strategies on the same input mesh
	optimize(mesh, "Original", optNone);
	optimize(mesh, "Random", optRandomShuffle);
	optimize(mesh, "Cache", optCache);
	optimize(mesh, "CacheFifo", optCacheFifo);
	optimize(mesh, "Overdraw", optOverdraw);
	optimize(mesh, "Fetch", optFetch);
	optimize(mesh, "FetchMap", optFetchRemap);
	optimize(mesh, "Complete", optComplete);

	// the algorithms below expect a cache- and fetch-optimized mesh (see their notes)
	Mesh copy = mesh;
	meshopt_optimizeVertexCache(&copy.indices[0], &copy.indices[0], copy.indices.size(), copy.vertices.size());
	meshopt_optimizeVertexFetch(&copy.vertices[0], &copy.indices[0], copy.indices.size(), &copy.vertices[0], copy.vertices.size(), sizeof(Vertex));

	stripify(copy);
	meshlets(copy);
	shadow(copy);

	encodeIndex(copy);
	packVertex<PackedVertex>(copy, "");
	encodeVertex<PackedVertex>(copy, "");
	encodeVertex<PackedVertexOct>(copy, "O");

	// simplification runs on the original (unoptimized) mesh
	simplify(mesh);
	simplifySloppy(mesh);
	simplifyComplete(mesh);

	// NOTE(review): path has already been used by loadMesh above, so this guard
	// looks redundant — kept as-is to preserve behavior; confirm before removing
	if (path)
		processDeinterleaved(path);
}
|
||||
|
||||
// Development entry point (enabled via the -d command line flag): runs only
// the simplifiers, with an aggressive 1% target index count, so that
// simplification behavior can be iterated on quickly.
void processDev(const char* path)
{
	Mesh mesh;
	if (!loadMesh(mesh, path))
		return;

	simplify(mesh, 0.01f);
	simplifySloppy(mesh, 0.01f);
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
void runTests();
|
||||
|
||||
if (argc == 1)
|
||||
{
|
||||
runTests();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (strcmp(argv[1], "-d") == 0)
|
||||
{
|
||||
for (int i = 2; i < argc; ++i)
|
||||
{
|
||||
processDev(argv[i]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 1; i < argc; ++i)
|
||||
{
|
||||
process(argv[i]);
|
||||
}
|
||||
|
||||
runTests();
|
||||
}
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,298 @@
|
|||
/* This is miniz.c with removal of all zlib/zip like functionality - only tdefl/tinfl APIs are left
|
||||
For maximum compatibility unaligned load/store and 64-bit register paths have been removed so this is slower than miniz.c
|
||||
|
||||
miniz.c v1.15 - public domain deflate/inflate, zlib-subset, ZIP reading/writing/appending, PNG writing
|
||||
See "unlicense" statement at the end of this file.
|
||||
Rich Geldreich <richgel99@gmail.com>, last updated Oct. 13, 2013
|
||||
Implements RFC 1950: http://www.ietf.org/rfc/rfc1950.txt and RFC 1951: http://www.ietf.org/rfc/rfc1951.txt
|
||||
*/
|
||||
|
||||
#ifndef MINIZ_HEADER_INCLUDED
|
||||
#define MINIZ_HEADER_INCLUDED
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
// Define MINIZ_NO_MALLOC to disable all calls to malloc, free, and realloc.
|
||||
// Note if MINIZ_NO_MALLOC is defined then the user must always provide custom user alloc/free/realloc
|
||||
// callbacks to the zlib and archive API's, and a few stand-alone helper API's which don't provide custom user
|
||||
// functions (such as tdefl_compress_mem_to_heap() and tinfl_decompress_mem_to_heap()) won't work.
|
||||
//#define MINIZ_NO_MALLOC
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// mz_free() internally uses the MZ_FREE() macro (which by default calls free() unless you've modified the MZ_MALLOC macro) to release a block allocated from the heap.
|
||||
void mz_free(void *p);
|
||||
|
||||
// Compression strategies.
|
||||
enum { MZ_DEFAULT_STRATEGY = 0, MZ_FILTERED = 1, MZ_HUFFMAN_ONLY = 2, MZ_RLE = 3, MZ_FIXED = 4 };
|
||||
|
||||
// Compression levels: 0-9 are the standard zlib-style levels, 10 is best possible compression (not zlib compatible, and may be very slow), MZ_DEFAULT_COMPRESSION=MZ_DEFAULT_LEVEL.
|
||||
enum { MZ_NO_COMPRESSION = 0, MZ_BEST_SPEED = 1, MZ_BEST_COMPRESSION = 9, MZ_UBER_COMPRESSION = 10, MZ_DEFAULT_LEVEL = 6, MZ_DEFAULT_COMPRESSION = -1 };
|
||||
|
||||
// Window bits
|
||||
#define MZ_DEFAULT_WINDOW_BITS 15
|
||||
|
||||
// Method
|
||||
#define MZ_DEFLATED 8
|
||||
|
||||
// ------------------- Types and macros
|
||||
|
||||
typedef unsigned char mz_uint8;
|
||||
typedef signed short mz_int16;
|
||||
typedef unsigned short mz_uint16;
|
||||
typedef unsigned int mz_uint32;
|
||||
typedef unsigned int mz_uint;
|
||||
typedef long long mz_int64;
|
||||
typedef unsigned long long mz_uint64;
|
||||
typedef int mz_bool;
|
||||
|
||||
#define MZ_FALSE (0)
|
||||
#define MZ_TRUE (1)
|
||||
|
||||
// An attempt to work around MSVC's spammy "warning C4127: conditional expression is constant" message.
|
||||
#ifdef _MSC_VER
|
||||
#define MZ_MACRO_END while (0, 0)
|
||||
#else
|
||||
#define MZ_MACRO_END while (0)
|
||||
#endif
|
||||
|
||||
#define MZ_ADLER32_INIT (1)
|
||||
// mz_adler32() returns the initial adler-32 value to use when called with ptr==NULL.
|
||||
mz_uint32 mz_adler32(mz_uint32 adler, const unsigned char *ptr, size_t buf_len);
|
||||
|
||||
// ------------------- Low-level Decompression API Definitions
|
||||
|
||||
// Decompression flags used by tinfl_decompress().
|
||||
// TINFL_FLAG_PARSE_ZLIB_HEADER: If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream.
|
||||
// TINFL_FLAG_HAS_MORE_INPUT: If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input.
|
||||
// TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF: If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB).
|
||||
// TINFL_FLAG_COMPUTE_ADLER32: Force adler-32 checksum computation of the decompressed bytes.
|
||||
enum
|
||||
{
|
||||
TINFL_FLAG_PARSE_ZLIB_HEADER = 1,
|
||||
TINFL_FLAG_HAS_MORE_INPUT = 2,
|
||||
TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF = 4,
|
||||
TINFL_FLAG_COMPUTE_ADLER32 = 8
|
||||
};
|
||||
|
||||
// High level decompression functions:
|
||||
// tinfl_decompress_mem_to_mem() decompresses a block in memory to another block in memory.
|
||||
// Returns TINFL_DECOMPRESS_MEM_TO_MEM_FAILED on failure, or the number of bytes written on success.
|
||||
#define TINFL_DECOMPRESS_MEM_TO_MEM_FAILED ((size_t)(-1))
|
||||
size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags);
|
||||
|
||||
// tinfl_decompress_mem_to_callback() decompresses a block in memory to an internal 32KB buffer, and a user provided callback function will be called to flush the buffer.
|
||||
// Returns 1 on success or 0 on failure.
|
||||
typedef int (*tinfl_put_buf_func_ptr)(const void* pBuf, int len, void *pUser);
|
||||
int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size, tinfl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
|
||||
|
||||
struct tinfl_decompressor_tag; typedef struct tinfl_decompressor_tag tinfl_decompressor;
|
||||
|
||||
// Max size of LZ dictionary.
|
||||
#define TINFL_LZ_DICT_SIZE 32768
|
||||
|
||||
// Return status.
|
||||
typedef enum
{
	TINFL_STATUS_BAD_PARAM = -3,        // invalid arguments passed to the decompressor
	TINFL_STATUS_ADLER32_MISMATCH = -2, // stream decoded, but the adler-32 check failed
	TINFL_STATUS_FAILED = -1,           // decompression failed (corrupt stream)
	TINFL_STATUS_DONE = 0,              // decompression completed
	TINFL_STATUS_NEEDS_MORE_INPUT = 1,  // input exhausted; supply more and call again
	TINFL_STATUS_HAS_MORE_OUTPUT = 2    // output buffer full; drain it and call again
} tinfl_status;
|
||||
|
||||
// Initializes the decompressor to its initial state.
|
||||
#define tinfl_init(r) do { (r)->m_state = 0; } MZ_MACRO_END
|
||||
#define tinfl_get_adler32(r) (r)->m_check_adler32
|
||||
|
||||
// Main low-level decompressor coroutine function. This is the only function actually needed for decompression. All the other functions are just high-level helpers for improved usability.
|
||||
// This is a universal API, i.e. it can be used as a building block to build any desired higher level decompression API. In the limit case, it can be called once per every byte input or output.
|
||||
tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn_buf_next, size_t *pIn_buf_size, mz_uint8 *pOut_buf_start, mz_uint8 *pOut_buf_next, size_t *pOut_buf_size, const mz_uint32 decomp_flags);
|
||||
|
||||
// Internal/private bits follow.
|
||||
enum
|
||||
{
|
||||
TINFL_MAX_HUFF_TABLES = 3, TINFL_MAX_HUFF_SYMBOLS_0 = 288, TINFL_MAX_HUFF_SYMBOLS_1 = 32, TINFL_MAX_HUFF_SYMBOLS_2 = 19,
|
||||
TINFL_FAST_LOOKUP_BITS = 10, TINFL_FAST_LOOKUP_SIZE = 1 << TINFL_FAST_LOOKUP_BITS
|
||||
};
|
||||
|
||||
// Huffman decoding table: per-symbol code sizes plus a fast direct-lookup
// table (indexed by TINFL_FAST_LOOKUP_BITS input bits) with a tree fallback
// for codes longer than the fast-lookup width.
typedef struct
{
	mz_uint8 m_code_size[TINFL_MAX_HUFF_SYMBOLS_0];
	mz_int16 m_look_up[TINFL_FAST_LOOKUP_SIZE], m_tree[TINFL_MAX_HUFF_SYMBOLS_0 * 2];
} tinfl_huff_table;
|
||||
|
||||
typedef mz_uint32 tinfl_bit_buf_t;
|
||||
#define TINFL_BITBUF_SIZE (32)
|
||||
|
||||
// Complete decompressor state; tinfl_decompress() is a coroutine, and m_state
// records where to resume when it is called again with more input/output space.
struct tinfl_decompressor_tag
{
	// resume point, bit-reader state, zlib header bytes, adler-32 accumulators and block parsing counters
	mz_uint32 m_state, m_num_bits, m_zhdr0, m_zhdr1, m_z_adler32, m_final, m_type, m_check_adler32, m_dist, m_counter, m_num_extra, m_table_sizes[TINFL_MAX_HUFF_TABLES];
	tinfl_bit_buf_t m_bit_buf;
	size_t m_dist_from_out_buf_start;
	// the TINFL_MAX_HUFF_TABLES Huffman tables used during decode (see tinfl_huff_table)
	tinfl_huff_table m_tables[TINFL_MAX_HUFF_TABLES];
	mz_uint8 m_raw_header[4], m_len_codes[TINFL_MAX_HUFF_SYMBOLS_0 + TINFL_MAX_HUFF_SYMBOLS_1 + 137];
};
|
||||
|
||||
// ------------------- Low-level Compression API Definitions
|
||||
|
||||
// Set TDEFL_LESS_MEMORY to 1 to use less memory (compression will be slightly slower, and raw/dynamic blocks will be output more frequently).
|
||||
#define TDEFL_LESS_MEMORY 0
|
||||
|
||||
// tdefl_init() compression flags logically OR'd together (low 12 bits contain the max. number of probes per dictionary search):
|
||||
// TDEFL_DEFAULT_MAX_PROBES: The compressor defaults to 128 dictionary probes per dictionary search. 0=Huffman only, 1=Huffman+LZ (fastest/crap compression), 4095=Huffman+LZ (slowest/best compression).
|
||||
enum
|
||||
{
|
||||
TDEFL_HUFFMAN_ONLY = 0, TDEFL_DEFAULT_MAX_PROBES = 128, TDEFL_MAX_PROBES_MASK = 0xFFF
|
||||
};
|
||||
|
||||
// TDEFL_WRITE_ZLIB_HEADER: If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data.
|
||||
// TDEFL_COMPUTE_ADLER32: Always compute the adler-32 of the input data (even when not writing zlib headers).
|
||||
// TDEFL_GREEDY_PARSING_FLAG: Set to use faster greedy parsing, instead of more efficient lazy parsing.
|
||||
// TDEFL_NONDETERMINISTIC_PARSING_FLAG: Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory).
|
||||
// TDEFL_RLE_MATCHES: Only look for RLE matches (matches with a distance of 1)
|
||||
// TDEFL_FILTER_MATCHES: Discards matches <= 5 chars if enabled.
|
||||
// TDEFL_FORCE_ALL_STATIC_BLOCKS: Disable usage of optimized Huffman tables.
|
||||
// TDEFL_FORCE_ALL_RAW_BLOCKS: Only use raw (uncompressed) deflate blocks.
|
||||
// The low 12 bits are reserved to control the max # of hash probes per dictionary lookup (see TDEFL_MAX_PROBES_MASK).
|
||||
enum
|
||||
{
|
||||
TDEFL_WRITE_ZLIB_HEADER = 0x01000,
|
||||
TDEFL_COMPUTE_ADLER32 = 0x02000,
|
||||
TDEFL_GREEDY_PARSING_FLAG = 0x04000,
|
||||
TDEFL_NONDETERMINISTIC_PARSING_FLAG = 0x08000,
|
||||
TDEFL_RLE_MATCHES = 0x10000,
|
||||
TDEFL_FILTER_MATCHES = 0x20000,
|
||||
TDEFL_FORCE_ALL_STATIC_BLOCKS = 0x40000,
|
||||
TDEFL_FORCE_ALL_RAW_BLOCKS = 0x80000
|
||||
};
|
||||
|
||||
// High level compression functions:
|
||||
|
||||
|
||||
// tdefl_compress_bound() returns a (very) conservative upper bound on the amount of data that could be generated by calling tdefl_compress_*().
|
||||
size_t tdefl_compress_bound(size_t source_len);
|
||||
|
||||
// tdefl_compress_mem_to_mem() compresses a block in memory to another block in memory.
|
||||
// Returns 0 on failure.
|
||||
size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags);
|
||||
|
||||
// Output stream interface. The compressor uses this interface to write compressed data. It'll typically be called TDEFL_OUT_BUF_SIZE at a time.
|
||||
typedef mz_bool (*tdefl_put_buf_func_ptr)(const void* pBuf, int len, void *pUser);
|
||||
|
||||
// tdefl_compress_mem_to_output() compresses a block to an output stream. The above helpers use this function internally.
|
||||
mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
|
||||
|
||||
enum { TDEFL_MAX_HUFF_TABLES = 3, TDEFL_MAX_HUFF_SYMBOLS_0 = 288, TDEFL_MAX_HUFF_SYMBOLS_1 = 32, TDEFL_MAX_HUFF_SYMBOLS_2 = 19, TDEFL_LZ_DICT_SIZE = 32768, TDEFL_LZ_DICT_SIZE_MASK = TDEFL_LZ_DICT_SIZE - 1, TDEFL_MIN_MATCH_LEN = 3, TDEFL_MAX_MATCH_LEN = 258 };
|
||||
|
||||
// TDEFL_OUT_BUF_SIZE MUST be large enough to hold a single entire compressed output block (using static/fixed Huffman codes).
|
||||
#if TDEFL_LESS_MEMORY
|
||||
enum { TDEFL_LZ_CODE_BUF_SIZE = 24 * 1024, TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13 ) / 10, TDEFL_MAX_HUFF_SYMBOLS = 288, TDEFL_LZ_HASH_BITS = 12, TDEFL_LEVEL1_HASH_SIZE_MASK = 4095, TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3, TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS };
|
||||
#else
|
||||
enum { TDEFL_LZ_CODE_BUF_SIZE = 64 * 1024, TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13 ) / 10, TDEFL_MAX_HUFF_SYMBOLS = 288, TDEFL_LZ_HASH_BITS = 15, TDEFL_LEVEL1_HASH_SIZE_MASK = 4095, TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3, TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS };
|
||||
#endif
|
||||
|
||||
// The low-level tdefl functions below may be used directly if the above helper functions aren't flexible enough. The low-level functions don't make any heap allocations, unlike the above helper functions.
|
||||
// Result codes returned by the tdefl_* compression functions.
// (Fix: removed the trailing enumerator comma — ill-formed under the C++03
// standard this project targets per .clang-format, and inconsistent with the
// sibling tinfl_status/tdefl_flush enums which have none.)
typedef enum
{
	TDEFL_STATUS_BAD_PARAM = -2,      // invalid arguments passed to the compressor
	TDEFL_STATUS_PUT_BUF_FAILED = -1, // the user-supplied output callback reported failure
	TDEFL_STATUS_OKAY = 0,            // compression in progress; call again
	TDEFL_STATUS_DONE = 1             // compression completed
} tdefl_status;
|
||||
|
||||
// Must map to MZ_NO_FLUSH, MZ_SYNC_FLUSH, etc. enums
|
||||
typedef enum
|
||||
{
|
||||
TDEFL_NO_FLUSH = 0,
|
||||
TDEFL_SYNC_FLUSH = 2,
|
||||
TDEFL_FULL_FLUSH = 3,
|
||||
TDEFL_FINISH = 4
|
||||
} tdefl_flush;
|
||||
|
||||
// tdefl's compression state structure.
|
||||
typedef struct
{
	// output sink: flush callback (may be NULL for mem-to-mem usage) and its user pointer
	tdefl_put_buf_func_ptr m_pPut_buf_func;
	void *m_pPut_buf_user;
	mz_uint m_flags, m_max_probes[2];
	int m_greedy_parsing;
	// rolling adler-32 and LZ dictionary/lookahead window positions
	mz_uint m_adler32, m_lookahead_pos, m_lookahead_size, m_dict_size;
	mz_uint8 *m_pLZ_code_buf, *m_pLZ_flags, *m_pOutput_buf, *m_pOutput_buf_end;
	mz_uint m_num_flags_left, m_total_lz_bytes, m_lz_code_buf_dict_pos, m_bits_in, m_bit_buffer;
	// match state carried across calls, plus output flush bookkeeping
	mz_uint m_saved_match_dist, m_saved_match_len, m_saved_lit, m_output_flush_ofs, m_output_flush_remaining, m_finished, m_block_index, m_wants_to_finish;
	tdefl_status m_prev_return_status;
	// current in/out buffers for the streaming tdefl_compress() API
	const void *m_pIn_buf;
	void *m_pOut_buf;
	size_t *m_pIn_buf_size, *m_pOut_buf_size;
	tdefl_flush m_flush;
	const mz_uint8 *m_pSrc;
	size_t m_src_buf_left, m_out_buf_ofs;
	// large fixed-size work buffers: LZ dictionary, Huffman stats/codes, LZ code
	// buffer, hash chains and the staging output block
	mz_uint8 m_dict[TDEFL_LZ_DICT_SIZE + TDEFL_MAX_MATCH_LEN - 1];
	mz_uint16 m_huff_count[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
	mz_uint16 m_huff_codes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
	mz_uint8 m_huff_code_sizes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
	mz_uint8 m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE];
	mz_uint16 m_next[TDEFL_LZ_DICT_SIZE];
	mz_uint16 m_hash[TDEFL_LZ_HASH_SIZE];
	mz_uint8 m_output_buf[TDEFL_OUT_BUF_SIZE];
} tdefl_compressor;
|
||||
|
||||
// Initializes the compressor.
|
||||
// There is no corresponding deinit() function because the tdefl API's do not dynamically allocate memory.
|
||||
// pBut_buf_func: If NULL, output data will be supplied to the specified callback. In this case, the user should call the tdefl_compress_buffer() API for compression.
|
||||
// If pBut_buf_func is NULL the user should always call the tdefl_compress() API.
|
||||
// flags: See the above enums (TDEFL_HUFFMAN_ONLY, TDEFL_WRITE_ZLIB_HEADER, etc.)
|
||||
tdefl_status tdefl_init(tdefl_compressor *d, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
|
||||
|
||||
// Compresses a block of data, consuming as much of the specified input buffer as possible, and writing as much compressed data to the specified output buffer as possible.
|
||||
tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf, size_t *pIn_buf_size, void *pOut_buf, size_t *pOut_buf_size, tdefl_flush flush);
|
||||
|
||||
// tdefl_compress_buffer() is only usable when the tdefl_init() is called with a non-NULL tdefl_put_buf_func_ptr.
|
||||
// tdefl_compress_buffer() always consumes the entire input buffer.
|
||||
tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf, size_t in_buf_size, tdefl_flush flush);
|
||||
|
||||
tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d);
|
||||
mz_uint32 tdefl_get_adler32(tdefl_compressor *d);
|
||||
|
||||
// Create tdefl_compress() flags given zlib-style compression parameters.
|
||||
// level may range from [0,10] (where 10 is absolute max compression, but may be much slower on some files)
|
||||
// window_bits may be -15 (raw deflate) or 15 (zlib)
|
||||
// strategy may be either MZ_DEFAULT_STRATEGY, MZ_FILTERED, MZ_HUFFMAN_ONLY, MZ_RLE, or MZ_FIXED
|
||||
mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits, int strategy);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // MINIZ_HEADER_INCLUDED
|
||||
|
||||
/*
|
||||
This is free and unencumbered software released into the public domain.
|
||||
|
||||
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||
distribute this software, either in source code form or as a compiled
|
||||
binary, for any purpose, commercial or non-commercial, and by any
|
||||
means.
|
||||
|
||||
In jurisdictions that recognize copyright laws, the author or authors
|
||||
of this software dedicate any and all copyright interest in the
|
||||
software to the public domain. We make this dedication for the benefit
|
||||
of the public at large and to the detriment of our heirs and
|
||||
successors. We intend this dedication to be an overt act of
|
||||
relinquishment in perpetuity of all present and future rights to this
|
||||
software under copyright law.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
For more information, please refer to <http://unlicense.org/>
|
||||
*/
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -0,0 +1,280 @@
|
|||
#include "../src/meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
// This file uses assert() to verify algorithm correctness
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
|
||||
// Quantized vertex layout used by the vertex codec tests below;
// kVertexDataV0 is a frozen encoding of kVertexBuffer in this layout.
struct PV
{
	unsigned short px, py, pz; // quantized position
	unsigned char nu, nv; // octahedron encoded normal, aliases .pw
	unsigned short tx, ty; // quantized texture coordinates
};
|
||||
|
||||
// note: 4 6 5 triangle here is a combo-breaker:
|
||||
// we encode it without rotating, a=next, c=next - this means we do *not* bump next to 6
|
||||
// which means that the next triangle can't be encoded via next sequencing!
|
||||
static const unsigned int kIndexBuffer[] = {0, 1, 2, 2, 1, 3, 4, 6, 5, 7, 8, 9};
|
||||
|
||||
static const unsigned char kIndexDataV0[] = {
|
||||
0xe0, 0xf0, 0x10, 0xfe, 0xff, 0xf0, 0x0c, 0xff, 0x02, 0x02, 0x02, 0x00, 0x76, 0x87, 0x56, 0x67,
|
||||
0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69, 0x00, 0x00, // clang-format :-/
|
||||
};
|
||||
|
||||
static const PV kVertexBuffer[] = {
|
||||
{0, 0, 0, 0, 0, 0, 0},
|
||||
{300, 0, 0, 0, 0, 500, 0},
|
||||
{0, 300, 0, 0, 0, 0, 500},
|
||||
{300, 300, 0, 0, 0, 500, 500},
|
||||
};
|
||||
|
||||
static const unsigned char kVertexDataV0[] = {
|
||||
0xa0, 0x01, 0x3f, 0x00, 0x00, 0x00, 0x58, 0x57, 0x58, 0x01, 0x26, 0x00, 0x00, 0x00, 0x01,
|
||||
0x0c, 0x00, 0x00, 0x00, 0x58, 0x01, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
|
||||
0x3f, 0x00, 0x00, 0x00, 0x17, 0x18, 0x17, 0x01, 0x26, 0x00, 0x00, 0x00, 0x01, 0x0c, 0x00,
|
||||
0x00, 0x00, 0x17, 0x01, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // clang-format :-/
|
||||
};
|
||||
|
||||
static void decodeIndexV0()
|
||||
{
|
||||
const size_t index_count = sizeof(kIndexBuffer) / sizeof(kIndexBuffer[0]);
|
||||
|
||||
std::vector<unsigned char> buffer(kIndexDataV0, kIndexDataV0 + sizeof(kIndexDataV0));
|
||||
|
||||
unsigned int decoded[index_count];
|
||||
assert(meshopt_decodeIndexBuffer(decoded, index_count, &buffer[0], buffer.size()) == 0);
|
||||
assert(memcmp(decoded, kIndexBuffer, sizeof(kIndexBuffer)) == 0);
|
||||
}
|
||||
|
||||
static void decodeIndex16()
|
||||
{
|
||||
const size_t index_count = sizeof(kIndexBuffer) / sizeof(kIndexBuffer[0]);
|
||||
const size_t vertex_count = 10;
|
||||
|
||||
std::vector<unsigned char> buffer(meshopt_encodeIndexBufferBound(index_count, vertex_count));
|
||||
buffer.resize(meshopt_encodeIndexBuffer(&buffer[0], buffer.size(), kIndexBuffer, index_count));
|
||||
|
||||
unsigned short decoded[index_count];
|
||||
assert(meshopt_decodeIndexBuffer(decoded, index_count, &buffer[0], buffer.size()) == 0);
|
||||
|
||||
for (size_t i = 0; i < index_count; ++i)
|
||||
assert(decoded[i] == kIndexBuffer[i]);
|
||||
}
|
||||
|
||||
// Verifies that the index encoder never writes outside the destination buffer:
// every undersized destination must make it fail (return 0), and only the
// exactly-sized destination must succeed.
static void encodeIndexMemorySafe()
{
	const size_t index_count = sizeof(kIndexBuffer) / sizeof(kIndexBuffer[0]);
	const size_t vertex_count = 10;

	std::vector<unsigned char> buffer(meshopt_encodeIndexBufferBound(index_count, vertex_count));
	buffer.resize(meshopt_encodeIndexBuffer(&buffer[0], buffer.size(), kIndexBuffer, index_count));

	// check that encode is memory-safe; note that we reallocate the buffer for each try to make sure ASAN can verify buffer access
	for (size_t i = 0; i <= buffer.size(); ++i)
	{
		std::vector<unsigned char> shortbuffer(i);
		size_t result = meshopt_encodeIndexBuffer(i == 0 ? 0 : &shortbuffer[0], i, kIndexBuffer, index_count);

		if (i == buffer.size())
			assert(result == buffer.size()); // exactly-sized destination: success
		else
			assert(result == 0); // undersized destination: encoder reports failure
	}
}
|
||||
|
||||
// Verifies that the index decoder never reads outside the source buffer:
// every strict prefix of a valid stream must be rejected (negative return),
// and only the full stream must decode successfully.
static void decodeIndexMemorySafe()
{
	const size_t index_count = sizeof(kIndexBuffer) / sizeof(kIndexBuffer[0]);
	const size_t vertex_count = 10;

	std::vector<unsigned char> buffer(meshopt_encodeIndexBufferBound(index_count, vertex_count));
	buffer.resize(meshopt_encodeIndexBuffer(&buffer[0], buffer.size(), kIndexBuffer, index_count));

	// check that decode is memory-safe; note that we reallocate the buffer for each try to make sure ASAN can verify buffer access
	unsigned int decoded[index_count];

	for (size_t i = 0; i <= buffer.size(); ++i)
	{
		std::vector<unsigned char> shortbuffer(buffer.begin(), buffer.begin() + i);
		int result = meshopt_decodeIndexBuffer(decoded, index_count, i == 0 ? 0 : &shortbuffer[0], i);

		if (i == buffer.size())
			assert(result == 0); // full stream decodes cleanly
		else
			assert(result < 0); // truncated stream is reported as an error
	}
}
|
||||
|
||||
static void decodeIndexRejectExtraBytes()
|
||||
{
|
||||
const size_t index_count = sizeof(kIndexBuffer) / sizeof(kIndexBuffer[0]);
|
||||
const size_t vertex_count = 10;
|
||||
|
||||
std::vector<unsigned char> buffer(meshopt_encodeIndexBufferBound(index_count, vertex_count));
|
||||
buffer.resize(meshopt_encodeIndexBuffer(&buffer[0], buffer.size(), kIndexBuffer, index_count));
|
||||
|
||||
// check that decoder doesn't accept extra bytes after a valid stream
|
||||
std::vector<unsigned char> largebuffer(buffer);
|
||||
largebuffer.push_back(0);
|
||||
|
||||
unsigned int decoded[index_count];
|
||||
assert(meshopt_decodeIndexBuffer(decoded, index_count, &largebuffer[0], largebuffer.size()) < 0);
|
||||
}
|
||||
|
||||
static void decodeIndexRejectMalformedHeaders()
|
||||
{
|
||||
const size_t index_count = sizeof(kIndexBuffer) / sizeof(kIndexBuffer[0]);
|
||||
const size_t vertex_count = 10;
|
||||
|
||||
std::vector<unsigned char> buffer(meshopt_encodeIndexBufferBound(index_count, vertex_count));
|
||||
buffer.resize(meshopt_encodeIndexBuffer(&buffer[0], buffer.size(), kIndexBuffer, index_count));
|
||||
|
||||
// check that decoder doesn't accept malformed headers
|
||||
std::vector<unsigned char> brokenbuffer(buffer);
|
||||
brokenbuffer[0] = 0;
|
||||
|
||||
unsigned int decoded[index_count];
|
||||
assert(meshopt_decodeIndexBuffer(decoded, index_count, &brokenbuffer[0], brokenbuffer.size()) < 0);
|
||||
}
|
||||
|
||||
static void decodeVertexV0()
|
||||
{
|
||||
const size_t vertex_count = sizeof(kVertexBuffer) / sizeof(kVertexBuffer[0]);
|
||||
|
||||
std::vector<unsigned char> buffer(kVertexDataV0, kVertexDataV0 + sizeof(kVertexDataV0));
|
||||
|
||||
PV decoded[vertex_count];
|
||||
assert(meshopt_decodeVertexBuffer(decoded, vertex_count, sizeof(PV), &buffer[0], buffer.size()) == 0);
|
||||
assert(memcmp(decoded, kVertexBuffer, sizeof(kVertexBuffer)) == 0);
|
||||
}
|
||||
|
||||
// Verifies that the vertex encoder never writes outside the destination buffer:
// every undersized destination must make it fail (return 0), and only the
// exactly-sized destination must succeed.
static void encodeVertexMemorySafe()
{
	const size_t vertex_count = sizeof(kVertexBuffer) / sizeof(kVertexBuffer[0]);

	std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertex_count, sizeof(PV)));
	buffer.resize(meshopt_encodeVertexBuffer(&buffer[0], buffer.size(), kVertexBuffer, vertex_count, sizeof(PV)));

	// check that encode is memory-safe; note that we reallocate the buffer for each try to make sure ASAN can verify buffer access
	for (size_t i = 0; i <= buffer.size(); ++i)
	{
		std::vector<unsigned char> shortbuffer(i);
		size_t result = meshopt_encodeVertexBuffer(i == 0 ? 0 : &shortbuffer[0], i, kVertexBuffer, vertex_count, sizeof(PV));

		if (i == buffer.size())
			assert(result == buffer.size()); // exactly-sized destination: success
		else
			assert(result == 0); // undersized destination: encoder reports failure
	}
}
|
||||
|
||||
// Verifies that the vertex decoder never reads outside the source buffer:
// every strict prefix of a valid stream must be rejected (negative return),
// and only the full stream must decode successfully.
static void decodeVertexMemorySafe()
{
	const size_t vertex_count = sizeof(kVertexBuffer) / sizeof(kVertexBuffer[0]);

	std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertex_count, sizeof(PV)));
	buffer.resize(meshopt_encodeVertexBuffer(&buffer[0], buffer.size(), kVertexBuffer, vertex_count, sizeof(PV)));

	// check that decode is memory-safe; note that we reallocate the buffer for each try to make sure ASAN can verify buffer access
	PV decoded[vertex_count];

	for (size_t i = 0; i <= buffer.size(); ++i)
	{
		std::vector<unsigned char> shortbuffer(buffer.begin(), buffer.begin() + i);
		int result = meshopt_decodeVertexBuffer(decoded, vertex_count, sizeof(PV), i == 0 ? 0 : &shortbuffer[0], i);
		(void)result; // silence unused warnings when asserts are compiled out elsewhere
		if (i == buffer.size())
			assert(result == 0); // full stream decodes cleanly
		else
			assert(result < 0); // truncated stream is reported as an error
	}
}
|
||||
|
||||
static void decodeVertexRejectExtraBytes()
|
||||
{
|
||||
const size_t vertex_count = sizeof(kVertexBuffer) / sizeof(kVertexBuffer[0]);
|
||||
|
||||
std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertex_count, sizeof(PV)));
|
||||
buffer.resize(meshopt_encodeVertexBuffer(&buffer[0], buffer.size(), kVertexBuffer, vertex_count, sizeof(PV)));
|
||||
|
||||
// check that decoder doesn't accept extra bytes after a valid stream
|
||||
std::vector<unsigned char> largebuffer(buffer);
|
||||
largebuffer.push_back(0);
|
||||
|
||||
PV decoded[vertex_count];
|
||||
assert(meshopt_decodeVertexBuffer(decoded, vertex_count, sizeof(PV), &largebuffer[0], largebuffer.size()) < 0);
|
||||
}
|
||||
|
||||
static void decodeVertexRejectMalformedHeaders()
|
||||
{
|
||||
const size_t vertex_count = sizeof(kVertexBuffer) / sizeof(kVertexBuffer[0]);
|
||||
|
||||
std::vector<unsigned char> buffer(meshopt_encodeVertexBufferBound(vertex_count, sizeof(PV)));
|
||||
buffer.resize(meshopt_encodeVertexBuffer(&buffer[0], buffer.size(), kVertexBuffer, vertex_count, sizeof(PV)));
|
||||
|
||||
// check that decoder doesn't accept malformed headers
|
||||
std::vector<unsigned char> brokenbuffer(buffer);
|
||||
brokenbuffer[0] = 0;
|
||||
|
||||
PV decoded[vertex_count];
|
||||
assert(meshopt_decodeVertexBuffer(decoded, vertex_count, sizeof(PV), &brokenbuffer[0], brokenbuffer.size()) < 0);
|
||||
}
|
||||
|
||||
// Exercises meshopt_computeClusterBounds on degenerate inputs: an empty
// cluster, a topology-degenerate triangle, a position-degenerate triangle,
// and a cluster whose normal cone covers opposing directions.
static void clusterBoundsDegenerate()
{
	const float vbd[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
	const unsigned int ibd[] = {0, 0, 0};
	const unsigned int ib1[] = {0, 1, 2};

	// all of the bounds below are degenerate as they use 0 triangles, one topology-degenerate triangle and one position-degenerate triangle respectively
	meshopt_Bounds bounds0 = meshopt_computeClusterBounds(0, 0, 0, 0, 12);
	meshopt_Bounds boundsd = meshopt_computeClusterBounds(ibd, 3, vbd, 3, 12);
	meshopt_Bounds bounds1 = meshopt_computeClusterBounds(ib1, 3, vbd, 3, 12);

	// degenerate clusters must produce a zero sphere (trivially rejectable)
	assert(bounds0.center[0] == 0 && bounds0.center[1] == 0 && bounds0.center[2] == 0 && bounds0.radius == 0);
	assert(boundsd.center[0] == 0 && boundsd.center[1] == 0 && boundsd.center[2] == 0 && boundsd.radius == 0);
	assert(bounds1.center[0] == 0 && bounds1.center[1] == 0 && bounds1.center[2] == 0 && bounds1.radius == 0);

	const float vb1[] = {1, 0, 0, 0, 1, 0, 0, 0, 1};
	const unsigned int ib2[] = {0, 1, 2, 0, 2, 1};

	// these bounds have a degenerate cone since the cluster has two triangles with opposite normals
	meshopt_Bounds bounds2 = meshopt_computeClusterBounds(ib2, 6, vb1, 3, 12);

	// cone data must be zeroed and the cutoff set to "accept everything"
	assert(bounds2.cone_apex[0] == 0 && bounds2.cone_apex[1] == 0 && bounds2.cone_apex[2] == 0);
	assert(bounds2.cone_axis[0] == 0 && bounds2.cone_axis[1] == 0 && bounds2.cone_axis[2] == 0);
	assert(bounds2.cone_cutoff == 1);
	assert(bounds2.cone_axis_s8[0] == 0 && bounds2.cone_axis_s8[1] == 0 && bounds2.cone_axis_s8[2] == 0);
	assert(bounds2.cone_cutoff_s8 == 127);

	// however, the bounding sphere needs to be intact (here we only check bbox for simplicity)
	assert(bounds2.center[0] - bounds2.radius <= 0 && bounds2.center[0] + bounds2.radius >= 1);
	assert(bounds2.center[1] - bounds2.radius <= 0 && bounds2.center[1] + bounds2.radius >= 1);
	assert(bounds2.center[2] - bounds2.radius <= 0 && bounds2.center[2] + bounds2.radius >= 1);
}
|
||||
|
||||
// Runs the whole correctness suite; individual tests assert() internally
// (NDEBUG is #undef'ed at the top of this file, so the asserts are always live).
void runTests()
{
	// index codec: golden v0 stream, 16-bit output, memory safety, malformed input
	decodeIndexV0();
	decodeIndex16();
	encodeIndexMemorySafe();
	decodeIndexMemorySafe();
	decodeIndexRejectExtraBytes();
	decodeIndexRejectMalformedHeaders();

	// vertex codec: golden v0 stream, memory safety, malformed input
	decodeVertexV0();
	encodeVertexMemorySafe();
	decodeVertexMemorySafe();
	decodeVertexRejectExtraBytes();
	decodeVertexRejectMalformedHeaders();

	// cluster bounds computation on degenerate inputs
	clusterBoundsDegenerate();
}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,7 @@
|
|||
#include "meshoptimizer.h"
|
||||
|
||||
// Installs custom global allocation callbacks used by meshoptimizer functions
// that need temporary storage; the pair must behave like malloc/free.
// NOTE(review): this mutates global state - presumably intended to be called
// once at startup before any other meshopt_* call; confirm with callers.
void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*))
{
	meshopt_Allocator::Storage::allocate = allocate;
	meshopt_Allocator::Storage::deallocate = deallocate;
}
|
|
@ -0,0 +1,351 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
// This work is based on:
|
||||
// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
|
||||
// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
|
||||
// Jack Ritter. An Efficient Bounding Sphere. 1990
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
// Computes an approximate bounding sphere for `count` 3d points using
// Ritter's algorithm (An Efficient Bounding Sphere, 1990).
// result receives {center_x, center_y, center_z, radius}.
static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
{
	assert(count > 0);

	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
	size_t pmin[3] = {0, 0, 0};
	size_t pmax[3] = {0, 0, 0};

	for (size_t i = 0; i < count; ++i)
	{
		const float* p = points[i];

		for (int axis = 0; axis < 3; ++axis)
		{
			if (p[axis] < points[pmin[axis]][axis])
				pmin[axis] = i;
			if (p[axis] > points[pmax[axis]][axis])
				pmax[axis] = i;
		}
	}

	// pick the axis whose extremum pair is farthest apart
	float best_d2 = 0;
	int best_axis = 0;

	for (int axis = 0; axis < 3; ++axis)
	{
		const float* lo = points[pmin[axis]];
		const float* hi = points[pmax[axis]];

		float d2 = (hi[0] - lo[0]) * (hi[0] - lo[0]) + (hi[1] - lo[1]) * (hi[1] - lo[1]) + (hi[2] - lo[2]) * (hi[2] - lo[2]);

		if (d2 > best_d2)
		{
			best_d2 = d2;
			best_axis = axis;
		}
	}

	// initial sphere: centered on the midpoint of the longest segment
	const float* lo = points[pmin[best_axis]];
	const float* hi = points[pmax[best_axis]];

	float center[3] = {(lo[0] + hi[0]) / 2, (lo[1] + hi[1]) / 2, (lo[2] + hi[2]) / 2};
	float radius = sqrtf(best_d2) / 2;

	// grow the sphere until every point is inside
	for (size_t i = 0; i < count; ++i)
	{
		const float* p = points[i];
		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);

		if (d2 > radius * radius)
		{
			float d = sqrtf(d2);
			assert(d > 0);

			// shift the center toward p and enlarge the radius so that both
			// the previous sphere and p are enclosed
			float k = 0.5f + (radius / d) / 2;

			center[0] = center[0] * k + p[0] * (1 - k);
			center[1] = center[1] * k + p[1] * (1 - k);
			center[2] = center[2] * k + p[2] * (1 - k);
			radius = (radius + d) / 2;
		}
	}

	result[0] = center[0];
	result[1] = center[1];
	result[2] = center[2];
	result[3] = radius;
}
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
// Returns an upper bound on the number of meshlets meshopt_buildMeshlets can
// produce for index_count indices given the per-meshlet vertex/triangle limits.
size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
{
	assert(index_count % 3 == 0);
	assert(max_vertices >= 3);
	assert(max_triangles >= 1);

	// meshlet construction is limited by max vertices and max triangles per meshlet
	// the worst case is an unindexed stream, which stresses both limits equally;
	// we assume up to 2 vertex slots per meshlet stay unused - with 3 free slots any triangle fits
	size_t vertex_budget = max_vertices - 2;

	size_t by_vertices = (index_count + vertex_budget - 1) / vertex_budget;
	size_t by_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;

	// whichever limit is hit first dictates the meshlet count
	return (by_vertices > by_triangles) ? by_vertices : by_triangles;
}
|
||||
|
||||
// Greedily splits the index stream into meshlets, each holding at most
// max_vertices unique vertices and max_triangles triangles.
// destination must have room for meshopt_buildMeshletsBound(...) meshlets;
// returns the number of meshlets actually written.
size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
{
	assert(index_count % 3 == 0);
	assert(max_vertices >= 3);
	assert(max_triangles >= 1);

	meshopt_Allocator allocator;

	// current meshlet being filled; flushed to destination when limits are hit
	meshopt_Meshlet meshlet;
	memset(&meshlet, 0, sizeof(meshlet));

	assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0]));
	assert(max_triangles <= sizeof(meshlet.indices) / 3);

	// index of the vertex in the meshlet, 0xff if the vertex isn't used
	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
	memset(used, -1, vertex_count);

	size_t offset = 0; // number of meshlets flushed so far

	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		unsigned char& av = used[a];
		unsigned char& bv = used[b];
		unsigned char& cv = used[c];

		// how many of this triangle's vertices are not yet in the meshlet
		unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);

		// flush the current meshlet if this triangle would overflow either limit
		if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
		{
			destination[offset++] = meshlet;

			// reset the used[] marks for the vertices of the flushed meshlet
			for (size_t j = 0; j < meshlet.vertex_count; ++j)
				used[meshlet.vertices[j]] = 0xff;

			memset(&meshlet, 0, sizeof(meshlet));
		}

		// assign local meshlet slots to any vertices not yet present
		if (av == 0xff)
		{
			av = meshlet.vertex_count;
			meshlet.vertices[meshlet.vertex_count++] = a;
		}

		if (bv == 0xff)
		{
			bv = meshlet.vertex_count;
			meshlet.vertices[meshlet.vertex_count++] = b;
		}

		if (cv == 0xff)
		{
			cv = meshlet.vertex_count;
			meshlet.vertices[meshlet.vertex_count++] = c;
		}

		// store the triangle using local (meshlet-relative) vertex indices
		meshlet.indices[meshlet.triangle_count][0] = av;
		meshlet.indices[meshlet.triangle_count][1] = bv;
		meshlet.indices[meshlet.triangle_count][2] = cv;
		meshlet.triangle_count++;
	}

	// flush the last, partially-filled meshlet
	if (meshlet.triangle_count)
		destination[offset++] = meshlet;

	assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));

	return offset;
}
|
||||
|
||||
// Computes a bounding sphere and a normal cone for a cluster of up to 256
// triangles, for use in cluster culling; degenerate clusters (no valid
// triangles, or a cone wider than ~168 degrees) return zeroed cone data with
// the cutoff set so culling trivially accepts/rejects as documented below.
meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	// normals/corners below are fixed-size arrays of 256 entries
	assert(index_count / 3 <= 256);

	(void)vertex_count;

	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	// compute triangle normals and gather triangle corners
	float normals[256][3];
	float corners[256][3][3];
	unsigned int triangles = 0;

	for (unsigned int i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* p0 = vertex_positions + vertex_stride_float * a;
		const float* p1 = vertex_positions + vertex_stride_float * b;
		const float* p2 = vertex_positions + vertex_stride_float * c;

		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

		// cross(p10, p20); its length is twice the triangle area
		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
		float normalz = p10[0] * p20[1] - p10[1] * p20[0];

		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

		// no need to include degenerate triangles - they will be invisible anyway
		if (area == 0.f)
			continue;

		// record triangle normals & corners for future use; normal and corner 0 define a plane equation
		normals[triangles][0] = normalx / area;
		normals[triangles][1] = normaly / area;
		normals[triangles][2] = normalz / area;
		memcpy(corners[triangles][0], p0, 3 * sizeof(float));
		memcpy(corners[triangles][1], p1, 3 * sizeof(float));
		memcpy(corners[triangles][2], p2, 3 * sizeof(float));
		triangles++;
	}

	meshopt_Bounds bounds = {};

	// degenerate cluster, no valid triangles => trivial reject (cone data is 0)
	if (triangles == 0)
		return bounds;

	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
	float psphere[4] = {};
	computeBoundingSphere(psphere, corners[0], triangles * 3);

	float center[3] = {psphere[0], psphere[1], psphere[2]};

	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
	float nsphere[4] = {};
	computeBoundingSphere(nsphere, normals, triangles);

	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
	float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;

	// normalize the axis (or zero it out if all normals cancelled)
	axis[0] *= invaxislength;
	axis[1] *= invaxislength;
	axis[2] *= invaxislength;

	// compute a tight cone around all normals, mindp = cos(angle/2)
	float mindp = 1.f;

	for (unsigned int i = 0; i < triangles; ++i)
	{
		float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];

		mindp = (dp < mindp) ? dp : mindp;
	}

	// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
	bounds.center[0] = center[0];
	bounds.center[1] = center[1];
	bounds.center[2] = center[2];
	bounds.radius = psphere[3];

	// degenerate cluster, normal cone is larger than a hemisphere => trivial accept
	// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
	// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
	if (mindp <= 0.1f)
	{
		bounds.cone_cutoff = 1;
		bounds.cone_cutoff_s8 = 127;
		return bounds;
	}

	float maxt = 0;

	// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
	for (unsigned int i = 0; i < triangles; ++i)
	{
		// dot(center-t*axis-corner, trinormal) = 0
		// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
		float cx = center[0] - corners[i][0][0];
		float cy = center[1] - corners[i][0][1];
		float cz = center[2] - corners[i][0][2];

		float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
		float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];

		// dn should be larger than mindp cutoff above
		assert(dn > 0.f);
		float t = dc / dn;

		maxt = (t > maxt) ? t : maxt;
	}

	// cone apex should be in the negative half-space of all cluster triangles by construction
	bounds.cone_apex[0] = center[0] - axis[0] * maxt;
	bounds.cone_apex[1] = center[1] - axis[1] * maxt;
	bounds.cone_apex[2] = center[2] - axis[2] * maxt;

	// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
	bounds.cone_axis[0] = axis[0];
	bounds.cone_axis[1] = axis[1];
	bounds.cone_axis[2] = axis[2];

	// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
	// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
	bounds.cone_cutoff = sqrtf(1 - mindp * mindp);

	// quantize axis & cutoff to 8-bit SNORM format
	bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
	bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
	bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));

	// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
	float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
	float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
	float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);

	// note that we need to round this up instead of rounding to nearest, hence +1
	int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);

	bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);

	return bounds;
}
|
||||
|
||||
// Computes bounds for a single meshlet by expanding its local (meshlet-slot)
// triangle list back into mesh-wide vertex indices and delegating to
// meshopt_computeClusterBounds.
meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	// scratch space sized for the maximum number of index slots in a meshlet
	unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])];

	for (unsigned int i = 0; i < meshlet->triangle_count; ++i)
	{
		// meshlet->indices stores meshlet-relative slots; map them back through the vertex table
		unsigned int a = meshlet->vertices[meshlet->indices[i][0]];
		unsigned int b = meshlet->vertices[meshlet->indices[i][1]];
		unsigned int c = meshlet->vertices[meshlet->indices[i][2]];

		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		indices[i * 3 + 0] = a;
		indices[i * 3 + 1] = b;
		indices[i * 3 + 2] = c;
	}

	return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
}
|
|
@ -0,0 +1,582 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifndef TRACE
|
||||
#define TRACE 0
|
||||
#endif
|
||||
|
||||
#if TRACE
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
// This work is based on:
|
||||
// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
|
||||
// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
const unsigned char kIndexHeader = 0xe0;
|
||||
|
||||
typedef unsigned int VertexFifo[16];
|
||||
typedef unsigned int EdgeFifo[16][2];
|
||||
|
||||
static const unsigned int kTriangleIndexOrder[3][3] = {
|
||||
{0, 1, 2},
|
||||
{1, 2, 0},
|
||||
{2, 0, 1},
|
||||
};
|
||||
|
||||
static const unsigned char kCodeAuxEncodingTable[16] = {
|
||||
0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69,
|
||||
0, 0, // last two entries aren't used for encoding
|
||||
};
|
||||
|
||||
// Returns the rotation (0, 1 or 2) that brings the vertex equal to `next`
// into the leading position of triangle (a, b, c); 0 when neither b nor c matches.
static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next)
{
	(void)a;

	if (b == next)
		return 1;
	if (c == next)
		return 2;

	return 0;
}
|
||||
|
||||
// Searches the 16-entry edge FIFO from newest to oldest for any of the three
// edges of triangle (a, b, c); returns (age << 2) | edge_id on a hit, -1 on a miss.
static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
{
	for (int age = 0; age < 16; ++age)
	{
		size_t slot = (offset - 1 - age) & 15;

		unsigned int first = fifo[slot][0];
		unsigned int second = fifo[slot][1];

		if (first == a && second == b)
			return (age << 2) | 0;
		if (first == b && second == c)
			return (age << 2) | 1;
		if (first == c && second == a)
			return (age << 2) | 2;
	}

	return -1;
}
|
||||
|
||||
// Appends edge (a, b) to the 16-entry ring buffer, advancing the write
// cursor modulo 16 (the oldest entry is overwritten).
static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset)
{
	fifo[offset][0] = a;
	fifo[offset][1] = b;
	offset = (offset + 1) & 15;
}
|
||||
|
||||
// Searches the 16-entry vertex FIFO from newest to oldest; returns the age
// (0 = most recently pushed) of v, or -1 when v is not present.
static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset)
{
	for (int age = 0; age < 16; ++age)
	{
		size_t slot = (offset - 1 - age) & 15;

		if (fifo[slot] == v)
			return age;
	}

	return -1;
}
|
||||
|
||||
// Appends v to the vertex ring buffer; when cond == 0 the slot is written but
// the cursor does not advance, so the next push overwrites this entry.
static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1)
{
	fifo[offset] = v;
	offset = (offset + cond) & 15;
}
|
||||
|
||||
// Writes v as a variable-length integer: little-endian 7-bit groups, with the
// high bit of each byte marking "more groups follow". Advances data past the
// bytes written (at most 5 for a 32-bit value).
static void encodeVByte(unsigned char*& data, unsigned int v)
{
	for (;;)
	{
		unsigned char group = (unsigned char)(v & 127);
		v >>= 7;

		// set the continuation bit iff any bits remain
		*data++ = v ? (unsigned char)(group | 128) : group;

		if (v == 0)
			break;
	}
}
|
||||
|
||||
// Reads a variable-length integer written by encodeVByte, advancing data.
// The group loop is bounded at 4 extra bytes so malformed input always terminates.
static unsigned int decodeVByte(const unsigned char*& data)
{
	unsigned char lead = *data++;

	// fast path: values below 128 fit entirely in the lead byte
	if ((lead & 128) == 0)
		return lead;

	// slow path: accumulate up to 4 continuation groups, 7 bits each
	unsigned int result = lead & 127;
	unsigned int shift = 7;

	for (int i = 0; i < 4; ++i)
	{
		unsigned char group = *data++;
		result |= (unsigned int)(group & 127) << shift;
		shift += 7;

		if ((group & 128) == 0)
			break;
	}

	return result;
}
|
||||
|
||||
static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int next, unsigned int last)
|
||||
{
|
||||
(void)next;
|
||||
|
||||
unsigned int d = index - last;
|
||||
unsigned int v = (d << 1) ^ (int(d) >> 31);
|
||||
|
||||
encodeVByte(data, v);
|
||||
}
|
||||
|
||||
static unsigned int decodeIndex(const unsigned char*& data, unsigned int next, unsigned int last)
|
||||
{
|
||||
(void)next;
|
||||
|
||||
unsigned int v = decodeVByte(data);
|
||||
unsigned int d = (v >> 1) ^ -int(v & 1);
|
||||
|
||||
return last + d;
|
||||
}
|
||||
|
||||
// Linear scan over the 16-entry codeaux table; returns the index of v, or -1
// when v is not present.
static int getCodeAuxIndex(unsigned char v, const unsigned char* table)
{
	int i = 0;

	while (i < 16)
	{
		if (table[i] == v)
			return i;

		++i;
	}

	return -1;
}
|
||||
|
||||
// Stores triangle (a, b, c) at element offset `offset` in destination, as
// either 16-bit or 32-bit indices depending on index_size (2 or 4).
static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c)
{
	if (index_size == 2)
	{
		static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a);
		static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
		static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
	}
	else
	// the #ifdef below deliberately turns the else-branch into `else if` when
	// building for Emscripten; do not restructure this conditional
#ifdef __EMSCRIPTEN__
	    if (index_size == 4) // work around Edge (ChakraCore) bug - without this compiler assumes index_size==2
#endif
	{
		static_cast<unsigned int*>(destination)[offset + 0] = a;
		static_cast<unsigned int*>(destination)[offset + 1] = b;
		static_cast<unsigned int*>(destination)[offset + 2] = c;
	}
}
|
||||
|
||||
#if TRACE
|
||||
static size_t sortTop16(unsigned char dest[16], size_t stats[256])
|
||||
{
|
||||
size_t destsize = 0;
|
||||
|
||||
for (size_t i = 0; i < 256; ++i)
|
||||
{
|
||||
size_t j = 0;
|
||||
for (; j < destsize; ++j)
|
||||
{
|
||||
if (stats[i] >= stats[dest[j]])
|
||||
{
|
||||
if (destsize < 16)
|
||||
destsize++;
|
||||
|
||||
memmove(&dest[j + 1], &dest[j], destsize - 1 - j);
|
||||
dest[j] = (unsigned char)i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (j == destsize && destsize < 16)
|
||||
{
|
||||
dest[destsize] = (unsigned char)i;
|
||||
destsize++;
|
||||
}
|
||||
}
|
||||
|
||||
return destsize;
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
// Encodes an index buffer into a compact byte stream; returns the number of
// bytes written, or 0 if buffer_size is too small to hold the encoded data.
// Stream layout: [1-byte header][1 code byte per triangle][variable extra data][16-byte codeaux table].
size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

#if TRACE
	// per-symbol frequency counters for the statistics dump at the end
	size_t codestats[256] = {};
	size_t codeauxstats[256] = {};
#endif

	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
	if (buffer_size < 1 + index_count / 3 + 16)
		return 0;

	buffer[0] = kIndexHeader;

	// fifos start filled with ~0u so fresh indices never match an "empty" entry
	EdgeFifo edgefifo;
	memset(edgefifo, -1, sizeof(edgefifo));

	VertexFifo vertexfifo;
	memset(vertexfifo, -1, sizeof(vertexfifo));

	size_t edgefifooffset = 0;
	size_t vertexfifooffset = 0;

	// next tracks the next never-seen vertex index; last is the base for delta-encoding free indices
	unsigned int next = 0;
	unsigned int last = 0;

	// code bytes go right after the header; variable-length data follows the code section
	unsigned char* code = buffer + 1;
	unsigned char* data = code + index_count / 3;
	unsigned char* data_safe_end = buffer + buffer_size - 16;

	// use static encoding table; it's possible to pack the result and then build an optimal table and repack
	// for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
	const unsigned char* codeaux_table = kCodeAuxEncodingTable;

	for (size_t i = 0; i < index_count; i += 3)
	{
		// make sure we have enough space to write a triangle
		// each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index
		// after this we can be sure we can write without extra bounds checks
		if (data > data_safe_end)
			return 0;

		int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset);

		if (fer >= 0 && (fer >> 2) < 15)
		{
			// one of the triangle edges is in the edge fifo; fer packs (fifo slot << 2) | rotation
			const unsigned int* order = kTriangleIndexOrder[fer & 3];

			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];

			// encode edge index and vertex fifo index, next or free index
			int fe = fer >> 2;
			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);

			// fec: 1..14 = vertex fifo slot, 0 = c equals next, 15 = free index (explicitly encoded)
			int fec = (fc >= 1 && fc < 15) ? fc : (c == next) ? (next++, 0) : 15;

			*code++ = (unsigned char)((fe << 4) | fec);

#if TRACE
			codestats[code[-1]]++;
#endif

			// note that we need to update the last index since free indices are delta-encoded
			if (fec == 15)
				encodeIndex(data, c, next, last), last = c;

			// we only need to push third vertex since first two are likely already in the vertex fifo
			if (fec == 0 || fec == 15)
				pushVertexFifo(vertexfifo, c, vertexfifooffset);

			// we only need to push two new edges to edge fifo since the third one is already there
			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
		}
		else
		{
			// no fifo edge matched; rotate the triangle so that a most likely equals next
			int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next);
			const unsigned int* order = kTriangleIndexOrder[rotation];

			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];

			int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);

			// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
			int fea = (a == next) ? (next++, 0) : 15;
			int feb = (fb >= 0 && fb < 14) ? (fb + 1) : (b == next) ? (next++, 0) : 15;
			int fec = (fc >= 0 && fc < 14) ? (fc + 1) : (c == next) ? (next++, 0) : 15;

			// we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise
			unsigned char codeaux = (unsigned char)((feb << 4) | fec);
			int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);

			// <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
			if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14)
			{
				*code++ = (unsigned char)((15 << 4) | codeauxindex);
			}
			else
			{
				*code++ = (unsigned char)((15 << 4) | 14 | fea);
				*data++ = codeaux;
			}

#if TRACE
			codestats[code[-1]]++;
			codeauxstats[codeaux]++;
#endif

			// note that we need to update the last index since free indices are delta-encoded
			if (fea == 15)
				encodeIndex(data, a, next, last), last = a;

			if (feb == 15)
				encodeIndex(data, b, next, last), last = b;

			if (fec == 15)
				encodeIndex(data, c, next, last), last = c;

			// only push vertices that weren't already in fifo
			if (fea == 0 || fea == 15)
				pushVertexFifo(vertexfifo, a, vertexfifooffset);

			if (feb == 0 || feb == 15)
				pushVertexFifo(vertexfifo, b, vertexfifooffset);

			if (fec == 0 || fec == 15)
				pushVertexFifo(vertexfifo, c, vertexfifooffset);

			// all three edges aren't in the fifo; pushing all of them is important so that we can match them for later triangles
			pushEdgeFifo(edgefifo, b, a, edgefifooffset);
			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
		}
	}

	// make sure we have enough space to write codeaux table
	if (data > data_safe_end)
		return 0;

	// add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding
	// we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data
	// this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input
	for (size_t i = 0; i < 16; ++i)
	{
		// decoder assumes that table entries never refer to separately encoded indices
		assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf);

		*data++ = codeaux_table[i];
	}

	assert(data >= buffer + index_count / 3 + 16);
	assert(data <= buffer + buffer_size);

#if TRACE
	unsigned char codetop[16], codeauxtop[16];
	size_t codetopsize = sortTop16(codetop, codestats);
	size_t codeauxtopsize = sortTop16(codeauxtop, codeauxstats);

	size_t sumcode = 0, sumcodeaux = 0;
	for (size_t i = 0; i < 256; ++i)
		sumcode += codestats[i], sumcodeaux += codeauxstats[i];

	size_t acccode = 0, acccodeaux = 0;

	printf("code\t\t\t\t\tcodeaux\n");

	for (size_t i = 0; i < codetopsize && i < codeauxtopsize; ++i)
	{
		acccode += codestats[codetop[i]];
		acccodeaux += codeauxstats[codeauxtop[i]];

		printf("%2d: %02x = %d (%.1f%% ..%.1f%%)\t\t%2d: %02x = %d (%.1f%% ..%.1f%%)\n",
		    int(i), codetop[i], int(codestats[codetop[i]]), double(codestats[codetop[i]]) / double(sumcode) * 100, double(acccode) / double(sumcode) * 100,
		    int(i), codeauxtop[i], int(codeauxstats[codeauxtop[i]]), double(codeauxstats[codeauxtop[i]]) / double(sumcodeaux) * 100, double(acccodeaux) / double(sumcodeaux) * 100);
	}
#endif

	return data - buffer;
}
|
||||
|
||||
// Computes a conservative upper bound on the encoded size of an index buffer
// with index_count indices referencing up to vertex_count vertices.
size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
{
	assert(index_count % 3 == 0);

	// compute number of bits required for each index
	unsigned int vertex_bits = 1;

	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
		++vertex_bits;

	// worst-case encoding is 2 header bytes + 3 varint-7 encoded index deltas
	unsigned int vertex_groups = (vertex_bits + 1 + 6) / 7;

	size_t triangle_count = index_count / 3;
	size_t worst_case_triangle = 2 + 3 * vertex_groups;

	// 1-byte stream header plus a 16-byte codeaux table at the end
	return 1 + triangle_count * worst_case_triangle + 16;
}
|
||||
|
||||
// Decodes an index stream produced by meshopt_encodeIndexBuffer into destination
// (index_size selects 2- or 4-byte output indices). Returns 0 on success,
// -1 on header mismatch, -2 if the buffer is truncated, -3 if the stream has
// leftover data before the trailing codeaux table.
int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(index_size == 2 || index_size == 4);

	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
	if (buffer_size < 1 + index_count / 3 + 16)
		return -2;

	if (buffer[0] != kIndexHeader)
		return -1;

	// fifo initialization must mirror the encoder exactly
	EdgeFifo edgefifo;
	memset(edgefifo, -1, sizeof(edgefifo));

	VertexFifo vertexfifo;
	memset(vertexfifo, -1, sizeof(vertexfifo));

	size_t edgefifooffset = 0;
	size_t vertexfifooffset = 0;

	// next mirrors the encoder's next-new-vertex counter; last is the delta-decode base
	unsigned int next = 0;
	unsigned int last = 0;

	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
	const unsigned char* code = buffer + 1;
	const unsigned char* data = code + index_count / 3;
	const unsigned char* data_safe_end = buffer + buffer_size - 16;

	const unsigned char* codeaux_table = data_safe_end;

	for (size_t i = 0; i < index_count; i += 3)
	{
		// make sure we have enough data to read for a triangle
		// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
		// after this we can be sure we can read without extra bounds checks
		if (data > data_safe_end)
			return -2;

		unsigned char codetri = *code++;

		if (codetri < 0xf0)
		{
			// edge-fifo path: high nibble is the edge fifo slot, low nibble encodes vertex c
			int fe = codetri >> 4;

			// fifo reads are wrapped around 16 entry buffer
			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];

			int fec = codetri & 15;

			// note: this is the most common path in the entire decoder
			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
			if (fec != 15)
			{
				// fifo reads are wrapped around 16 entry buffer
				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
				unsigned int c = (fec == 0) ? next : cf;

				int fec0 = fec == 0;
				next += fec0;

				// output triangle
				writeTriangle(destination, i, index_size, a, b, c);

				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);

				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
			}
			else
			{
				unsigned int c = 0;

				// note that we need to update the last index since free indices are delta-encoded
				last = c = decodeIndex(data, next, last);

				// output triangle
				writeTriangle(destination, i, index_size, a, b, c);

				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
				pushVertexFifo(vertexfifo, c, vertexfifooffset);

				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
			}
		}
		else
		{
			// fast path: read codeaux from the table
			if (codetri < 0xfe)
			{
				unsigned char codeaux = codeaux_table[codetri & 15];

				// note: table can't contain feb/fec=15
				int feb = codeaux >> 4;
				int fec = codeaux & 15;

				// fifo reads are wrapped around 16 entry buffer
				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
				unsigned int a = next++;

				unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
				unsigned int b = (feb == 0) ? next : bf;

				int feb0 = feb == 0;
				next += feb0;

				unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
				unsigned int c = (fec == 0) ? next : cf;

				int fec0 = fec == 0;
				next += fec0;

				// output triangle
				writeTriangle(destination, i, index_size, a, b, c);

				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
				pushVertexFifo(vertexfifo, a, vertexfifooffset);
				pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);

				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
			}
			else
			{
				// slow path: read a full byte for codeaux instead of using a table lookup
				unsigned char codeaux = *data++;

				// 0xfe encodes fea=0 (a == next), 0xff encodes fea=15 (a is a free index)
				int fea = codetri == 0xfe ? 0 : 15;
				int feb = codeaux >> 4;
				int fec = codeaux & 15;

				// fifo reads are wrapped around 16 entry buffer
				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
				unsigned int a = (fea == 0) ? next++ : 0;
				unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
				unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];

				// note that we need to update the last index since free indices are delta-encoded
				if (fea == 15)
					last = a = decodeIndex(data, next, last);

				if (feb == 15)
					last = b = decodeIndex(data, next, last);

				if (fec == 15)
					last = c = decodeIndex(data, next, last);

				// output triangle
				writeTriangle(destination, i, index_size, a, b, c);

				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
				pushVertexFifo(vertexfifo, a, vertexfifooffset);
				pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
				pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));

				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
			}
		}
	}

	// we should've read all data bytes and stopped at the boundary between data and codeaux table
	if (data != data_safe_end)
		return -3;

	return 0;
}
|
|
@ -0,0 +1,347 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
// Folds `len` bytes of `key` into hash state `h`, 4 bytes at a time, using the
// MurmurHash2 mixing function; trailing bytes (len % 4) are ignored by design.
// Returns the updated hash state so calls can be chained across buffers.
static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
{
	// MurmurHash2
	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	while (len >= 4)
	{
		// memcpy instead of reinterpret_cast: key may not be 4-byte aligned
		// (vertex strides are arbitrary), and the cast would also violate
		// strict aliasing; compilers turn this into a single load anyway
		unsigned int k;
		memcpy(&k, key, sizeof(k));

		k *= m;
		k ^= k >> r;
		k *= m;

		h *= m;
		h ^= k;

		key += 4;
		len -= 4;
	}

	return h;
}
|
||||
|
||||
// Hash adapter over a raw vertex buffer for use with hashLookup: identity of a
// vertex is its first vertex_size bytes out of each vertex_stride-sized element.
struct VertexHasher
{
	const unsigned char* vertices; // base pointer of the vertex data
	size_t vertex_size;            // number of bytes that participate in hashing/equality
	size_t vertex_stride;          // distance in bytes between consecutive vertices

	// hashes the significant bytes of the vertex at the given index
	size_t hash(unsigned int index) const
	{
		return hashUpdate4(0, vertices + index * vertex_stride, vertex_size);
	}

	// true when both vertices are byte-identical over vertex_size bytes
	bool equal(unsigned int lhs, unsigned int rhs) const
	{
		return memcmp(vertices + lhs * vertex_stride, vertices + rhs * vertex_stride, vertex_size) == 0;
	}
};
|
||||
|
||||
// Hash adapter over multiple vertex attribute streams for use with hashLookup:
// a vertex's identity is the concatenation of its bytes across every stream.
struct VertexStreamHasher
{
	const meshopt_Stream* streams; // attribute streams that jointly define a vertex
	size_t stream_count;           // number of entries in streams

	// chains the hash of the vertex's data from each stream into one value
	size_t hash(unsigned int index) const
	{
		unsigned int h = 0;

		for (size_t i = 0; i < stream_count; ++i)
		{
			const meshopt_Stream& s = streams[i];
			const unsigned char* data = static_cast<const unsigned char*>(s.data);

			h = hashUpdate4(h, data + index * s.stride, s.size);
		}

		return h;
	}

	// true only when the two vertices are byte-identical in every stream
	bool equal(unsigned int lhs, unsigned int rhs) const
	{
		for (size_t i = 0; i < stream_count; ++i)
		{
			const meshopt_Stream& s = streams[i];
			const unsigned char* data = static_cast<const unsigned char*>(s.data);

			if (memcmp(data + lhs * s.stride, data + rhs * s.stride, s.size) != 0)
				return false;
		}

		return true;
	}
};
|
||||
|
||||
// Returns the smallest power of two that is >= count (1 when count is 0),
// used to size the open-addressed hash tables below.
static size_t hashBuckets(size_t count)
{
	size_t result = 1;

	for (; result < count; result <<= 1)
		;

	return result;
}
|
||||
|
||||
// Finds the slot for `key` in an open-addressed hash table of power-of-two size:
// returns a pointer to the matching element, or to the empty slot where the key
// should be inserted. The Hash policy supplies hash() and equal().
template <typename T, typename Hash>
static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
{
	assert(buckets > 0);
	assert((buckets & (buckets - 1)) == 0); // power-of-two size lets us mask instead of mod

	size_t mask = buckets - 1;
	size_t slot = hash.hash(key) & mask;

	for (size_t probe = 0; probe <= mask; ++probe)
	{
		T& candidate = table[slot];

		if (candidate == empty || hash.equal(candidate, key))
			return &candidate;

		// collision: advance with an increasing step (quadratic probing)
		slot = (slot + probe + 1) & mask;
	}

	assert(false && "Hash table is full"); // unreachable
	return 0;
}
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
// Builds a remap table mapping each original vertex to a gap-free unique-vertex
// slot, collapsing binary-identical vertices; returns the unique vertex count.
// destination[i] == ~0u marks "not yet assigned" while the table is built.
size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(indices || index_count == vertex_count);
	assert(index_count % 3 == 0);
	assert(vertex_size > 0 && vertex_size <= 256);

	meshopt_Allocator allocator;

	// mark all vertices as unassigned
	memset(destination, -1, vertex_count * sizeof(unsigned int));

	// tightly packed input: stride equals size
	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};

	size_t table_size = hashBuckets(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);
	memset(table, -1, table_size * sizeof(unsigned int));

	unsigned int next_vertex = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		// NULL indices means the input is unindexed: use the identity index buffer
		unsigned int index = indices ? indices[i] : unsigned(i);
		assert(index < vertex_count);

		if (destination[index] == ~0u)
		{
			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);

			if (*entry == ~0u)
			{
				// first occurrence of this vertex data: allocate the next output slot
				*entry = index;

				destination[index] = next_vertex++;
			}
			else
			{
				// duplicate of a previously seen vertex: reuse its slot
				assert(destination[*entry] != ~0u);

				destination[index] = destination[*entry];
			}
		}
	}

	assert(next_vertex <= vertex_count);

	return next_vertex;
}
|
||||
|
||||
// Multi-stream variant of meshopt_generateVertexRemap: two vertices are
// considered equal only when their data matches across all attribute streams;
// returns the number of unique vertices.
size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
{
	using namespace meshopt;

	assert(indices || index_count == vertex_count);
	assert(index_count % 3 == 0);
	assert(stream_count > 0 && stream_count <= 16);

	// validate each stream's element size/stride contract
	for (size_t i = 0; i < stream_count; ++i)
	{
		assert(streams[i].size > 0 && streams[i].size <= 256);
		assert(streams[i].size <= streams[i].stride);
	}

	meshopt_Allocator allocator;

	// mark all vertices as unassigned
	memset(destination, -1, vertex_count * sizeof(unsigned int));

	VertexStreamHasher hasher = {streams, stream_count};

	size_t table_size = hashBuckets(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);
	memset(table, -1, table_size * sizeof(unsigned int));

	unsigned int next_vertex = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		// NULL indices means the input is unindexed: use the identity index buffer
		unsigned int index = indices ? indices[i] : unsigned(i);
		assert(index < vertex_count);

		if (destination[index] == ~0u)
		{
			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);

			if (*entry == ~0u)
			{
				// first occurrence of this vertex data: allocate the next output slot
				*entry = index;

				destination[index] = next_vertex++;
			}
			else
			{
				// duplicate: reuse the slot assigned to the representative vertex
				assert(destination[*entry] != ~0u);

				destination[index] = destination[*entry];
			}
		}
	}

	assert(next_vertex <= vertex_count);

	return next_vertex;
}
|
||||
|
||||
void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
|
||||
{
|
||||
assert(vertex_size > 0 && vertex_size <= 256);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
|
||||
// support in-place remap
|
||||
if (destination == vertices)
|
||||
{
|
||||
unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
|
||||
memcpy(vertices_copy, vertices, vertex_count * vertex_size);
|
||||
vertices = vertices_copy;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < vertex_count; ++i)
|
||||
{
|
||||
if (remap[i] != ~0u)
|
||||
{
|
||||
assert(remap[i] < vertex_count);
|
||||
|
||||
memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Rewrites an index buffer through a remap table (as produced by
// meshopt_generateVertexRemap); NULL indices is treated as the identity buffer.
void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
{
	assert(index_count % 3 == 0);

	if (indices)
	{
		for (size_t i = 0; i < index_count; ++i)
		{
			unsigned int index = indices[i];
			assert(remap[index] != ~0u); // every referenced vertex must have a mapping

			destination[i] = remap[index];
		}
	}
	else
	{
		// unindexed input: remap the identity sequence 0, 1, 2, ...
		for (size_t i = 0; i < index_count; ++i)
		{
			assert(remap[i] != ~0u);

			destination[i] = remap[i];
		}
	}
}
|
||||
|
||||
// Generates an index buffer in which all indices referring to byte-identical
// vertex data (comparing vertex_size bytes out of each vertex_stride-sized
// vertex) are replaced by a single representative index.
void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
{
	using namespace meshopt;

	assert(indices);
	assert(index_count % 3 == 0);
	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size <= vertex_stride);

	meshopt_Allocator allocator;

	// remap[v] caches the representative chosen for vertex v; ~0u = not yet computed
	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
	memset(remap, -1, vertex_count * sizeof(unsigned int));

	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};

	size_t table_size = hashBuckets(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);
	memset(table, -1, table_size * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		if (remap[index] == ~0u)
		{
			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);

			// the first vertex with this data becomes the representative for all duplicates
			if (*entry == ~0u)
				*entry = index;

			remap[index] = *entry;
		}

		destination[i] = remap[index];
	}
}
|
||||
|
||||
// Multi-stream variant of meshopt_generateShadowIndexBuffer: indices referring
// to vertices that are byte-identical across all attribute streams are replaced
// by a single representative index.
void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
{
	using namespace meshopt;

	assert(indices);
	assert(index_count % 3 == 0);
	assert(stream_count > 0 && stream_count <= 16);

	// validate each stream's element size/stride contract
	for (size_t i = 0; i < stream_count; ++i)
	{
		assert(streams[i].size > 0 && streams[i].size <= 256);
		assert(streams[i].size <= streams[i].stride);
	}

	meshopt_Allocator allocator;

	// remap[v] caches the representative chosen for vertex v; ~0u = not yet computed
	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
	memset(remap, -1, vertex_count * sizeof(unsigned int));

	VertexStreamHasher hasher = {streams, stream_count};

	size_t table_size = hashBuckets(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);
	memset(table, -1, table_size * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		if (remap[index] == ~0u)
		{
			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);

			// the first vertex with this data becomes the representative for all duplicates
			if (*entry == ~0u)
				*entry = index;

			remap[index] = *entry;
		}

		destination[i] = remap[index];
	}
}
|
|
@ -0,0 +1,759 @@
|
|||
/**
|
||||
* meshoptimizer - version 0.11
|
||||
*
|
||||
* Copyright (C) 2016-2019, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
|
||||
* Report bugs and download new versions at https://github.com/zeux/meshoptimizer
|
||||
*
|
||||
* This library is distributed under the MIT License. See notice at the end of this file.
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include <assert.h>
|
||||
#include <stddef.h>
|
||||
|
||||
/* Version macro; major * 1000 + minor * 10 + patch */
|
||||
#define MESHOPTIMIZER_VERSION 110
|
||||
|
||||
/* If no API is defined, assume default */
|
||||
#ifndef MESHOPTIMIZER_API
|
||||
#define MESHOPTIMIZER_API
|
||||
#endif
|
||||
|
||||
/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
|
||||
#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
|
||||
|
||||
/* C interface */
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Vertex attribute stream, similar to glVertexPointer
|
||||
* Each element takes size bytes, with stride controlling the spacing between successive elements.
|
||||
*/
|
||||
struct meshopt_Stream
{
	const void* data; // pointer to the first element of the attribute stream
	size_t size;      // size of each element in bytes
	size_t stride;    // distance in bytes between successive elements
};
|
||||
|
||||
/**
|
||||
* Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
|
||||
* As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
|
||||
* Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
|
||||
*
|
||||
* destination must contain enough space for the resulting remap table (vertex_count elements)
|
||||
* indices can be NULL if the input is unindexed
|
||||
*/
|
||||
MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
|
||||
|
||||
/**
|
||||
* Experimental: Generates a vertex remap table from multiple vertex streams and an optional index buffer and returns number of unique vertices
|
||||
* As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
|
||||
* Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
|
||||
* To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream.
|
||||
*
|
||||
* destination must contain enough space for the resulting remap table (vertex_count elements)
|
||||
* indices can be NULL if the input is unindexed
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
|
||||
|
||||
/**
|
||||
* Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
|
||||
*
|
||||
* destination must contain enough space for the resulting vertex buffer (unique_vertex_count elements, returned by meshopt_generateVertexRemap)
|
||||
* vertex_count should be the initial vertex count and not the value returned by meshopt_generateVertexRemap
|
||||
*/
|
||||
MESHOPTIMIZER_API void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap);
|
||||
|
||||
/**
|
||||
* Generate index buffer from the source index buffer and remap table generated by meshopt_generateVertexRemap
|
||||
*
|
||||
* destination must contain enough space for the resulting index buffer (index_count elements)
|
||||
* indices can be NULL if the input is unindexed
|
||||
*/
|
||||
MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap);
|
||||
|
||||
/**
|
||||
* Experimental: Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
|
||||
* All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer.
|
||||
* This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
|
||||
*
|
||||
* destination must contain enough space for the resulting index buffer (index_count elements)
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
|
||||
|
||||
/**
|
||||
* Experimental: Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
|
||||
* All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer.
|
||||
* This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
|
||||
*
|
||||
* destination must contain enough space for the resulting index buffer (index_count elements)
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
|
||||
|
||||
/**
|
||||
* Vertex transform cache optimizer
|
||||
* Reorders indices to reduce the number of GPU vertex shader invocations
|
||||
 * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
|
||||
*
|
||||
* destination must contain enough space for the resulting index buffer (index_count elements)
|
||||
*/
|
||||
MESHOPTIMIZER_API void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
|
||||
|
||||
/**
|
||||
* Vertex transform cache optimizer for FIFO caches
|
||||
* Reorders indices to reduce the number of GPU vertex shader invocations
|
||||
* Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
|
||||
 * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
|
||||
*
|
||||
* destination must contain enough space for the resulting index buffer (index_count elements)
|
||||
* cache_size should be less than the actual GPU cache size to avoid cache thrashing
|
||||
*/
|
||||
MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
|
||||
|
||||
/**
|
||||
* Overdraw optimizer
|
||||
* Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
|
||||
 * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
|
||||
*
|
||||
* destination must contain enough space for the resulting index buffer (index_count elements)
|
||||
* indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
|
||||
* vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
|
||||
* threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
|
||||
*/
|
||||
MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
|
||||
|
||||
/**
|
||||
* Vertex fetch cache optimizer
|
||||
* Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
|
||||
* Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
|
||||
 * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
|
||||
*
|
||||
* destination must contain enough space for the resulting vertex buffer (vertex_count elements)
|
||||
* indices is used both as an input and as an output index buffer
|
||||
*/
|
||||
MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
|
||||
|
||||
/**
|
||||
* Vertex fetch cache optimizer
|
||||
* Generates vertex remap to reduce the amount of GPU memory fetches during vertex processing
|
||||
* Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
|
||||
* The resulting remap table should be used to reorder vertex/index buffers using meshopt_remapVertexBuffer/meshopt_remapIndexBuffer
|
||||
*
|
||||
* destination must contain enough space for the resulting remap table (vertex_count elements)
|
||||
*/
|
||||
MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
|
||||
|
||||
/**
|
||||
* Index buffer encoder
|
||||
* Encodes index data into an array of bytes that is generally much smaller (<1.5 bytes/triangle) and compresses better (<1 bytes/triangle) compared to original.
|
||||
* Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
|
||||
* For maximum efficiency the index buffer being encoded has to be optimized for vertex cache and vertex fetch first.
|
||||
*
|
||||
* buffer must contain enough space for the encoded index buffer (use meshopt_encodeIndexBufferBound to compute worst case size)
|
||||
*/
|
||||
MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
|
||||
MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
|
||||
|
||||
/**
|
||||
* Index buffer decoder
|
||||
* Decodes index data from an array of bytes generated by meshopt_encodeIndexBuffer
|
||||
* Returns 0 if decoding was successful, and an error code otherwise
|
||||
* The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
|
||||
*
|
||||
* destination must contain enough space for the resulting index buffer (index_count elements)
|
||||
*/
|
||||
MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
|
||||
|
||||
/**
|
||||
* Vertex buffer encoder
|
||||
* Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original.
|
||||
* Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
|
||||
* This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
|
||||
*
|
||||
* buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
|
||||
*/
|
||||
MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
|
||||
MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
|
||||
|
||||
/**
|
||||
* Vertex buffer decoder
|
||||
* Decodes vertex data from an array of bytes generated by meshopt_encodeVertexBuffer
|
||||
* Returns 0 if decoding was successful, and an error code otherwise
|
||||
* The decoder is safe to use for untrusted input, but it may produce garbage data.
|
||||
*
|
||||
* destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
|
||||
*/
|
||||
MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
|
||||
|
||||
/**
|
||||
* Experimental: Mesh simplifier
|
||||
* Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
|
||||
* The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
|
||||
* If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
|
||||
* Returns the number of indices after simplification, with destination containing new index data
|
||||
* The resulting index buffer references vertices from the original vertex buffer.
|
||||
* If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
|
||||
*
|
||||
* destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!)
|
||||
* vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
|
||||
|
||||
/**
|
||||
* Experimental: Mesh simplifier (sloppy)
|
||||
 * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
|
||||
* The algorithm doesn't preserve mesh topology but is always able to reach target triangle count.
|
||||
* Returns the number of indices after simplification, with destination containing new index data
|
||||
* The resulting index buffer references vertices from the original vertex buffer.
|
||||
* If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
|
||||
*
|
||||
* destination must contain enough space for the target index buffer
|
||||
* vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
|
||||
|
||||
/**
|
||||
* Mesh stripifier
|
||||
* Converts a previously vertex cache optimized triangle list to triangle strip, stitching strips using restart index
|
||||
* Returns the number of indices in the resulting strip, with destination containing new index data
|
||||
* For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
|
||||
*
|
||||
* destination must contain enough space for the target index buffer, worst case can be computed with meshopt_stripifyBound
|
||||
*/
|
||||
MESHOPTIMIZER_API size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
|
||||
MESHOPTIMIZER_API size_t meshopt_stripifyBound(size_t index_count);
|
||||
|
||||
/**
|
||||
* Mesh unstripifier
|
||||
* Converts a triangle strip to a triangle list
|
||||
* Returns the number of indices in the resulting list, with destination containing new index data
|
||||
*
|
||||
* destination must contain enough space for the target index buffer, worst case can be computed with meshopt_unstripifyBound
|
||||
*/
|
||||
MESHOPTIMIZER_API size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count);
|
||||
MESHOPTIMIZER_API size_t meshopt_unstripifyBound(size_t index_count);
|
||||
|
||||
struct meshopt_VertexCacheStatistics
{
	unsigned int vertices_transformed; /* total number of vertices transformed in the simplified cache model */
	unsigned int warps_executed; /* number of warps executed in the cache model; presumably derived from the warp_size analyzer parameter — confirm in implementation */
	float acmr; /* transformed vertices / triangle count; best case 0.5, worst case 3.0, optimum depends on topology */
	float atvr; /* transformed vertices / vertex count; best case 1.0, worst case 6.0, optimum is 1.0 (each vertex is transformed once) */
};
|
||||
|
||||
/**
|
||||
* Vertex transform cache analyzer
|
||||
* Returns cache hit statistics using a simplified FIFO model
|
||||
* Results may not match actual GPU performance
|
||||
*/
|
||||
MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
|
||||
|
||||
struct meshopt_OverdrawStatistics
{
	unsigned int pixels_covered; /* number of pixels covered, as estimated by the software rasterizer */
	unsigned int pixels_shaded; /* number of pixels shaded, counting repeated shading of the same pixel */
	float overdraw; /* shaded pixels / covered pixels; best case 1.0 */
};
|
||||
|
||||
/**
|
||||
* Overdraw analyzer
|
||||
* Returns overdraw statistics using a software rasterizer
|
||||
* Results may not match actual GPU performance
|
||||
*
|
||||
* vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
|
||||
*/
|
||||
MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
|
||||
|
||||
struct meshopt_VertexFetchStatistics
{
	unsigned int bytes_fetched; /* total number of bytes fetched in the simplified direct mapped cache model */
	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
};
|
||||
|
||||
/**
|
||||
* Vertex fetch cache analyzer
|
||||
* Returns cache hit statistics using a simplified direct mapped model
|
||||
* Results may not match actual GPU performance
|
||||
*/
|
||||
MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
|
||||
|
||||
struct meshopt_Meshlet
{
	/* meshlet vertices, indexing into the original vertex buffer (up to 64 per meshlet) */
	unsigned int vertices[64];
	/* micro index buffer: up to 126 triangles indexing into the vertices array above */
	unsigned char indices[126][3];
	unsigned char triangle_count; /* number of valid triangles in indices */
	unsigned char vertex_count; /* number of valid entries in vertices */
};
|
||||
|
||||
/**
|
||||
* Experimental: Meshlet builder
|
||||
* Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
|
||||
* The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
|
||||
* For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
|
||||
*
|
||||
* destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
|
||||
* max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126)
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
|
||||
MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
|
||||
|
||||
/* Cluster bounding volumes, filled by meshopt_computeClusterBounds/meshopt_computeMeshletBounds; see the cluster bounds generator comment below for culling formulas that consume these fields */
struct meshopt_Bounds
{
	/* bounding sphere, useful for frustum and occlusion culling */
	float center[3];
	float radius;

	/* normal cone, useful for backface culling */
	float cone_apex[3];
	float cone_axis[3];
	float cone_cutoff; /* = cos(angle/2) */

	/* normal cone axis and cutoff, stored in 8-bit SNORM format; decode using x/127.0 */
	signed char cone_axis_s8[3];
	signed char cone_cutoff_s8;
};
|
||||
|
||||
/**
|
||||
* Experimental: Cluster bounds generator
|
||||
* Creates bounding volumes that can be used for frustum, backface and occlusion culling.
|
||||
*
|
||||
* For backface culling with orthographic projection, use the following formula to reject backfacing clusters:
|
||||
* dot(view, cone_axis) >= cone_cutoff
|
||||
*
|
||||
 * For perspective projection, you can use the formula that needs cone apex in addition to axis & cutoff:
|
||||
* dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
|
||||
*
|
||||
* Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead:
|
||||
* dot(normalize(center - camera_position), cone_axis) >= cone_cutoff + radius / length(center - camera_position)
|
||||
* or an equivalent formula that doesn't have a singularity at center = camera_position:
|
||||
* dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius
|
||||
*
|
||||
* The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
|
||||
* to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
|
||||
*
|
||||
* vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
|
||||
* index_count should be less than or equal to 256*3 (the function assumes clusters of limited size)
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
|
||||
MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
|
||||
|
||||
/**
|
||||
* Experimental: Set allocation callbacks
|
||||
* These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
|
||||
* Note that all algorithms only allocate memory for temporary use.
|
||||
* allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
|
||||
*/
|
||||
MESHOPTIMIZER_EXPERIMENTAL void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*));
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
/* Quantization into commonly supported data formats */
|
||||
#ifdef __cplusplus
|
||||
/**
|
||||
* Quantize a float in [0..1] range into an N-bit fixed point unorm value
|
||||
* Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
|
||||
* Maximum reconstruction error: 1/2^(N+1)
|
||||
*/
|
||||
inline int meshopt_quantizeUnorm(float v, int N);
|
||||
|
||||
/**
|
||||
* Quantize a float in [-1..1] range into an N-bit fixed point snorm value
|
||||
* Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
|
||||
* Maximum reconstruction error: 1/2^N
|
||||
*/
|
||||
inline int meshopt_quantizeSnorm(float v, int N);
|
||||
|
||||
/**
|
||||
* Quantize a float into half-precision floating point value
|
||||
* Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
|
||||
* Representable magnitude range: [6e-5; 65504]
|
||||
* Maximum relative reconstruction error: 5e-4
|
||||
*/
|
||||
inline unsigned short meshopt_quantizeHalf(float v);
|
||||
|
||||
/**
|
||||
* Quantize a float into a floating point value with a limited number of significant mantissa bits
|
||||
* Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
|
||||
* Assumes N is in a valid mantissa precision range, which is 1..23
|
||||
*/
|
||||
inline float meshopt_quantizeFloat(float v, int N);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* C++ template interface
|
||||
*
|
||||
* These functions mirror the C interface the library provides, providing template-based overloads so that
|
||||
* the caller can use an arbitrary type for the index data, both for input and output.
|
||||
* When the supplied type is the same size as that of unsigned int, the wrappers are zero-cost; when it's not,
|
||||
* the wrappers end up allocating memory and copying index data to convert from one type to another.
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
/**
 * Index adapter used by the C++ template interface.
 * When sizeof(T) differs from sizeof(unsigned int), a temporary unsigned int buffer is
 * allocated; input indices are widened on construction and results are narrowed back
 * into the caller's buffer on destruction. Otherwise the caller's buffer is aliased.
 */
template <typename T, bool ZeroCopy = sizeof(T) == sizeof(unsigned int)>
struct meshopt_IndexAdapter;

template <typename T>
struct meshopt_IndexAdapter<T, false>
{
	T* result;
	unsigned int* data;
	size_t count;

	meshopt_IndexAdapter(T* output, const T* input, size_t size)
	    : result(output)
	    , data(0)
	    , count(size)
	{
		data = new unsigned int[size];

		// widen the input indices into the temporary buffer, if any were provided
		if (input)
		{
			for (size_t index = 0; index < size; ++index)
				data[index] = input[index];
		}
	}

	~meshopt_IndexAdapter()
	{
		// narrow the (possibly rewritten) indices back into the caller's output buffer
		if (result)
		{
			for (size_t index = 0; index < count; ++index)
				result[index] = T(data[index]);
		}

		delete[] data;
	}
};

/* Zero-copy specialization: T already matches unsigned int in size, so the caller's buffer is reused directly */
template <typename T>
struct meshopt_IndexAdapter<T, true>
{
	unsigned int* data;

	meshopt_IndexAdapter(T* result, const T* input, size_t)
	{
		T* buffer = result ? result : const_cast<T*>(input);
		data = reinterpret_cast<unsigned int*>(buffer);
	}
};
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
|
||||
|
||||
return meshopt_generateVertexRemap(destination, indices ? in.data : 0, index_count, vertices, vertex_count, vertex_size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
|
||||
|
||||
return meshopt_generateVertexRemapMulti(destination, indices ? in.data : 0, index_count, vertex_count, streams, stream_count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, index_count);
|
||||
|
||||
meshopt_remapIndexBuffer(out.data, indices ? in.data : 0, index_count, remap);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, index_count);
|
||||
|
||||
meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, index_count);
|
||||
|
||||
meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, index_count);
|
||||
|
||||
meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, index_count);
|
||||
|
||||
meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, index_count);
|
||||
|
||||
meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
|
||||
return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
|
||||
{
|
||||
meshopt_IndexAdapter<T> inout(indices, indices, index_count);
|
||||
|
||||
return meshopt_optimizeVertexFetch(destination, inout.data, index_count, vertices, vertex_count, vertex_size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
|
||||
return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count);
|
||||
}
|
||||
|
||||
/** Template wrapper for meshopt_decodeIndexBuffer; decodes directly into destination using sizeof(T) as the index size. */
template <typename T>
inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
{
	// C++03-compatible compile-time check: only 2-byte and 4-byte index types are supported
	typedef char meshopt_IndexSizeCheck[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
	(void)sizeof(meshopt_IndexSizeCheck);

	return meshopt_decodeIndexBuffer(destination, index_count, sizeof(T), buffer, buffer_size);
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, index_count);
|
||||
|
||||
return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, target_index_count);
|
||||
|
||||
return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, (index_count / 3) * 4);
|
||||
|
||||
return meshopt_stripify(out.data, in.data, index_count, vertex_count);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count)
|
||||
{
|
||||
meshopt_IndexAdapter<T> in(0, indices, index_count);
|
||||
meshopt_IndexAdapter<T> out(destination, 0, (index_count - 2) * 3);
|
||||
|
||||
return meshopt_unstripify(out.data, in.data, index_count);
|
||||
}
|
||||
|
||||
// Generic overload of meshopt_analyzeVertexCache for T-typed index buffers (read-only analysis).
template <typename T>
inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
{
	meshopt_IndexAdapter<T> in(0, indices, index_count);

	return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
}
|
||||
|
||||
// Generic overload of meshopt_analyzeOverdraw for T-typed index buffers (read-only analysis).
template <typename T>
inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	meshopt_IndexAdapter<T> in(0, indices, index_count);

	return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
|
||||
|
||||
// Generic overload of meshopt_analyzeVertexFetch for T-typed index buffers (read-only analysis).
template <typename T>
inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
{
	meshopt_IndexAdapter<T> in(0, indices, index_count);

	return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
}
|
||||
|
||||
// Generic overload of meshopt_buildMeshlets for T-typed index buffers;
// the meshlet output format itself is unchanged.
template <typename T>
inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
{
	meshopt_IndexAdapter<T> in(0, indices, index_count);

	return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles);
}
|
||||
|
||||
// Generic overload of meshopt_computeClusterBounds for T-typed index buffers (read-only analysis).
template <typename T>
inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	meshopt_IndexAdapter<T> in(0, indices, index_count);

	return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
}
|
||||
#endif
|
||||
|
||||
/* Inline implementation */
|
||||
#ifdef __cplusplus
|
||||
/* Quantizes a float in [0..1] to an N-bit unsigned integer with round-to-nearest.
 * Out-of-range inputs are clamped; the comparison direction maps NaN to 0. */
inline int meshopt_quantizeUnorm(float v, int N)
{
	const float limit = float((1 << N) - 1);

	float x = (v >= 0) ? v : 0;
	x = (x <= 1) ? x : 1;

	return int(x * limit + 0.5f);
}
|
||||
|
||||
/* Quantizes a float in [-1..1] to an N-bit signed integer, rounding away from zero.
 * Out-of-range inputs are clamped; the comparison direction maps NaN to -1 before scaling. */
inline int meshopt_quantizeSnorm(float v, int N)
{
	const float limit = float((1 << (N - 1)) - 1);

	/* round half away from zero: +0.5 for non-negative inputs, -0.5 otherwise */
	const float bias = (v >= 0) ? 0.5f : -0.5f;

	float x = (v >= -1) ? v : -1;
	x = (x <= +1) ? x : +1;

	return int(x * limit + bias);
}
|
||||
|
||||
/* Converts a 32-bit float to the bit pattern of an IEEE-754 binary16 (half) value.
 * Rounds to nearest; values too large become infinity, denormal-range values flush
 * to zero, and every NaN is canonicalized to qNaN (0x7e00). */
inline unsigned short meshopt_quantizeHalf(float v)
{
	union { float f; unsigned int ui; } u = {v};
	unsigned int bits = u.ui;

	int sign = (bits >> 16) & 0x8000;
	int em = bits & 0x7fffffff;

	/* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
	int h = (em - (112 << 23) + (1 << 12)) >> 13;

	/* underflow: flush to zero; 113 encodes exponent -14 */
	if (em < (113 << 23))
		h = 0;

	/* overflow: infinity; 143 encodes exponent 16 */
	if (em >= (143 << 23))
		h = 0x7c00;

	/* NaN; note that we convert all types of NaN to qNaN */
	if (em > (255 << 23))
		h = 0x7e00;

	return (unsigned short)(sign | h);
}
|
||||
|
||||
/* Rounds v so that only N explicit mantissa bits remain set, returning a float.
 * Infinities and NaNs pass through unrounded; denormals flush to (signed) zero. */
inline float meshopt_quantizeFloat(float v, int N)
{
	union { float f; unsigned int ui; } u = {v};
	unsigned int bits = u.ui;

	const int mask = (1 << (23 - N)) - 1;
	const int round = (1 << (23 - N)) >> 1;

	int exponent = bits & 0x7f800000;
	unsigned int rounded = (bits + round) & ~mask;

	/* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
	if (exponent != 0x7f800000)
		bits = rounded;

	/* flush denormals to zero */
	if (exponent == 0)
		bits = 0;

	u.ui = bits;
	return u.f;
}
|
||||
#endif
|
||||
|
||||
/* Internal implementation helpers */
|
||||
#ifdef __cplusplus
|
||||
// Scoped block allocator used internally by meshoptimizer algorithms: it records
// every allocation it hands out and frees them all (in reverse order) when it is
// destroyed, so algorithm code needs no per-buffer cleanup paths.
class meshopt_Allocator
{
public:
	// Allocation callbacks live as static members of a class template so that their
	// definitions (below) can stay in this header without ODR violations.
	template <typename T>
	struct StorageT
	{
		static void* (*allocate)(size_t);
		static void (*deallocate)(void*);
	};

	typedef StorageT<void> Storage;

	meshopt_Allocator()
	    : blocks()
	    , count(0)
	{
	}

	// Frees all recorded blocks in reverse allocation order.
	~meshopt_Allocator()
	{
		for (size_t i = count; i > 0; --i)
			Storage::deallocate(blocks[i - 1]);
	}

	// Allocates 'size' elements of type T and records the block for cleanup.
	// At most 16 live allocations are supported (see 'blocks' below).
	template <typename T> T* allocate(size_t size)
	{
		assert(count < sizeof(blocks) / sizeof(blocks[0]));
		// clamp byte count to size_t(-1) on multiplication overflow so a huge request
		// fails to allocate instead of silently allocating a too-small block
		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
		blocks[count++] = result;
		return result;
	}

private:
	void* blocks[16];
	size_t count;
};
|
||||
|
||||
// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
// (default implementations forward to global operator new/delete).
template <typename T> void* (*meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
template <typename T> void (*meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Copyright (c) 2016-2019 Arseny Kapoulkine
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
|
@ -0,0 +1,230 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <float.h>
|
||||
#include <string.h>
|
||||
|
||||
// This work is based on:
|
||||
// Nicolas Capens. Advanced Rasterization. 2004
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
// Side length, in pixels, of the square software-rasterizer viewport.
const int kViewport = 256;

// Per-pixel depth values and overdraw counters; the last dimension separates the
// two winding directions ([sign] with sign = det > 0, see rasterize below).
struct OverdrawBuffer
{
	float z[kViewport][kViewport][2];
	unsigned int overdraw[kViewport][kViewport][2];
};
|
||||
|
||||
// Local min/max macros, guarded in case a platform header (e.g. windows.h) already
// defines them. NOTE: arguments are evaluated more than once — avoid side effects.
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif

#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
|
||||
|
||||
// Computes the screen-space depth gradients (dzdx, dzdy) of the plane through the
// three vertices, and returns the determinant (twice the signed triangle area).
// A degenerate triangle (det == 0) yields zero gradients.
static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
{
	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
	// z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
	// (x2-x1 y2-y1)(dzdx) = (z2-z1)
	// (x3-x1 y3-y1)(dzdy)   (z3-z1)
	// we'll solve it with Cramer's rule
	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
	float invdet = (det == 0) ? 0 : 1 / det;

	// bugfix: the entire Cramer numerator must be scaled by invdet; previously
	// operator precedence applied invdet only to the last product, yielding wrong
	// gradients whenever det != 1
	dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
	dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;

	return det;
}
|
||||
|
||||
// half-space fixed point triangle rasterizer
// Rasterizes one triangle (viewport-space coordinates, 0..kViewport) into 'buffer',
// updating per-pixel depth and overdraw counts. Backfacing triangles (det > 0 here)
// are flipped and written to the second buffer layer with reversed Z.
static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, float v2x, float v2y, float v2z, float v3x, float v3y, float v3z)
{
	// compute depth gradients
	float DZx, DZy;
	float det = computeDepthGradients(DZx, DZy, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
	int sign = det > 0;

	// flip backfacing triangles to simplify rasterization logic
	if (sign)
	{
		// flipping v2 & v3 preserves depth gradients since they're based on v1
		float t;
		t = v2x, v2x = v3x, v3x = t;
		t = v2y, v2y = v3y, v3y = t;
		t = v2z, v2z = v3z, v3z = t;

		// flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
		v1z = kViewport - v1z;
		DZx = -DZx;
		DZy = -DZy;
	}

	// coordinates, 28.4 fixed point
	int X1 = int(16.0f * v1x + 0.5f);
	int X2 = int(16.0f * v2x + 0.5f);
	int X3 = int(16.0f * v3x + 0.5f);

	int Y1 = int(16.0f * v1y + 0.5f);
	int Y2 = int(16.0f * v2y + 0.5f);
	int Y3 = int(16.0f * v3y + 0.5f);

	// bounding rectangle, clipped against viewport
	// since we rasterize pixels with covered centers, min >0.5 should round up
	// as for max, due to top-left filling convention we will never rasterize right/bottom edges
	// so max >= 0.5 should round down
	int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
	int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
	int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
	int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);

	// deltas, 28.4 fixed point
	int DX12 = X1 - X2;
	int DX23 = X2 - X3;
	int DX31 = X3 - X1;

	int DY12 = Y1 - Y2;
	int DY23 = Y2 - Y3;
	int DY31 = Y3 - Y1;

	// fill convention correction
	int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0);
	int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0);
	int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0);

	// half edge equations, 24.8 fixed point
	// note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
	int FX = (minx << 4) + 8;
	int FY = (miny << 4) + 8;
	int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1;
	int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1;
	int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1;
	// depth at the center of the first pixel; gradients are in pixel units, edge values in 28.4
	float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f);

	for (int y = miny; y < maxy; y++)
	{
		// per-row copies of the edge functions / depth, stepped along x below
		int CX1 = CY1;
		int CX2 = CY2;
		int CX3 = CY3;
		float ZX = ZY;

		for (int x = minx; x < maxx; x++)
		{
			// check if all CXn are non-negative
			if ((CX1 | CX2 | CX3) >= 0)
			{
				if (ZX >= buffer->z[y][x][sign])
				{
					buffer->z[y][x][sign] = ZX;
					buffer->overdraw[y][x][sign]++;
				}
			}

			// signed left shift is UB for negative numbers so use unsigned-signed casts
			CX1 -= int(unsigned(DY12) << 4);
			CX2 -= int(unsigned(DY23) << 4);
			CX3 -= int(unsigned(DY31) << 4);
			ZX += DZx;
		}

		// signed left shift is UB for negative numbers so use unsigned-signed casts
		CY1 += int(unsigned(DX12) << 4);
		CY2 += int(unsigned(DX23) << 4);
		CY3 += int(unsigned(DX31) << 4);
		ZY += DZy;
	}
}
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
// Estimates overdraw for the given mesh by software-rasterizing it orthographically
// along each of the three axes into a kViewport^2 buffer and counting, per pixel,
// how many depth-passing fragments were shaded vs how many pixels were covered.
meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	meshopt_OverdrawStatistics result = {};

	// compute the axis-aligned bounding box of all vertex positions
	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};

	for (size_t i = 0; i < vertex_count; ++i)
	{
		const float* v = vertex_positions + i * vertex_stride_float;

		for (int j = 0; j < 3; ++j)
		{
			minv[j] = min(minv[j], v[j]);
			maxv[j] = max(maxv[j], v[j]);
		}
	}

	// uniform scale that maps the largest bounding box extent to the viewport
	// NOTE(review): extent can be 0 for an empty/degenerate mesh, making scale inf — confirm callers guard this
	float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
	float scale = kViewport / extent;

	// pre-transform all referenced positions into viewport space, deindexed per corner
	float* triangles = allocator.allocate<float>(index_count * 3);

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		const float* v = vertex_positions + index * vertex_stride_float;

		triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
		triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
		triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
	}

	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);

	// rasterize along each axis in turn, permuting coordinates so that the chosen
	// axis becomes depth; overdraw statistics are accumulated over all three views
	for (int axis = 0; axis < 3; ++axis)
	{
		memset(buffer, 0, sizeof(OverdrawBuffer));

		for (size_t i = 0; i < index_count; i += 3)
		{
			const float* vn0 = &triangles[3 * (i + 0)];
			const float* vn1 = &triangles[3 * (i + 1)];
			const float* vn2 = &triangles[3 * (i + 2)];

			switch (axis)
			{
			case 0:
				rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
				break;
			case 1:
				rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
				break;
			case 2:
				rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
				break;
			}
		}

		// accumulate per-pixel counters from both winding layers
		for (int y = 0; y < kViewport; ++y)
			for (int x = 0; x < kViewport; ++x)
				for (int s = 0; s < 2; ++s)
				{
					unsigned int overdraw = buffer->overdraw[y][x][s];

					result.pixels_covered += overdraw > 0;
					result.pixels_shaded += overdraw;
				}
	}

	// overdraw = average number of shaded fragments per covered pixel (1.0 is ideal)
	result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f;

	return result;
}
|
|
@ -0,0 +1,333 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
|
||||
// This work is based on:
|
||||
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
// Computes one sort key per cluster: the dot product between the cluster's average
// normal and the vector from the mesh centroid to the cluster's (area-weighted)
// centroid. Clusters facing away from the mesh center get larger values and are
// drawn first by the caller, which tends to reduce overdraw.
static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	// mesh centroid is averaged over index references (vertices used more often weigh more)
	float mesh_centroid[3] = {};

	for (size_t i = 0; i < index_count; ++i)
	{
		const float* p = vertex_positions + vertex_stride_float * indices[i];

		mesh_centroid[0] += p[0];
		mesh_centroid[1] += p[1];
		mesh_centroid[2] += p[2];
	}

	mesh_centroid[0] /= index_count;
	mesh_centroid[1] /= index_count;
	mesh_centroid[2] /= index_count;

	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
	{
		// clusters[] holds face indices; convert to index-buffer offsets
		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		float cluster_area = 0;
		float cluster_centroid[3] = {};
		float cluster_normal[3] = {};

		for (size_t i = cluster_begin; i < cluster_end; i += 3)
		{
			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];

			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

			// cross product of two triangle edges; its length is twice the triangle area
			float normalx = p10[1] * p20[2] - p10[2] * p20[1];
			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
			float normalz = p10[0] * p20[1] - p10[1] * p20[0];

			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

			// area-weighted accumulation of triangle centroids and (unnormalized) normals
			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
			cluster_normal[0] += normalx;
			cluster_normal[1] += normaly;
			cluster_normal[2] += normalz;
			cluster_area += area;
		}

		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;

		cluster_centroid[0] *= inv_cluster_area;
		cluster_centroid[1] *= inv_cluster_area;
		cluster_centroid[2] *= inv_cluster_area;

		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;

		cluster_normal[0] *= inv_cluster_normal_length;
		cluster_normal[1] *= inv_cluster_normal_length;
		cluster_normal[2] *= inv_cluster_normal_length;

		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};

		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
	}
}
|
||||
|
||||
// Produces sort_order so that clusters are ordered by decreasing sort_data value,
// using an 11-bit quantization of the (normalized) keys followed by a counting sort.
// sort_keys is caller-provided scratch space of cluster_count entries; the sort is
// stable within equal keys.
static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
{
	// compute sort data bounds and renormalize, using fixed point snorm
	// (the 1e-3 floor avoids division by zero when all keys are ~0)
	float sort_data_max = 1e-3f;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		float dpa = fabsf(sort_data[i]);

		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
	}

	const int sort_bits = 11;

	for (size_t i = 0; i < cluster_count; ++i)
	{
		// note that we flip distribution since high dot product should come first
		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);

		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
	}

	// fill histogram for counting sort
	unsigned int histogram[1 << sort_bits];
	memset(histogram, 0, sizeof(histogram));

	for (size_t i = 0; i < cluster_count; ++i)
	{
		histogram[sort_keys[i]]++;
	}

	// compute offsets based on histogram data (exclusive prefix sum)
	size_t histogram_sum = 0;

	for (size_t i = 0; i < 1 << sort_bits; ++i)
	{
		size_t count = histogram[i];
		histogram[i] = unsigned(histogram_sum);
		histogram_sum += count;
	}

	assert(histogram_sum == cluster_count);

	// compute sort order based on offsets
	for (size_t i = 0; i < cluster_count; ++i)
	{
		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
	}
}
|
||||
|
||||
// Simulates an LRU-like vertex cache of 'cache_size' entries for one triangle:
// a vertex counts as resident if it was referenced within the last cache_size
// timestamp increments. Misses update the vertex's timestamp and advance the clock.
// Returns the number of cache misses among a, b, c (0..3).
static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
	unsigned int misses = 0;

	unsigned int corners[3] = {a, b, c};

	// process corners in order so that a duplicate vertex within the triangle
	// hits the cache entry written by an earlier corner
	for (int i = 0; i < 3; ++i)
	{
		unsigned int v = corners[i];

		if (timestamp - cache_timestamps[v] > cache_size)
		{
			cache_timestamps[v] = timestamp++;
			misses++;
		}
	}

	return misses;
}
|
||||
|
||||
static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
|
||||
{
|
||||
memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
|
||||
|
||||
unsigned int timestamp = cache_size + 1;
|
||||
|
||||
size_t face_count = index_count / 3;
|
||||
|
||||
size_t result = 0;
|
||||
|
||||
for (size_t i = 0; i < face_count; ++i)
|
||||
{
|
||||
unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
|
||||
|
||||
// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
|
||||
// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
|
||||
// suggests an inefficiency in the vertex cache optimization algorithm
|
||||
// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
|
||||
if (i == 0 || m == 3)
|
||||
{
|
||||
destination[result++] = unsigned(i);
|
||||
}
|
||||
}
|
||||
|
||||
assert(result <= index_count / 3);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Refines hard cluster boundaries into "soft" ones: each hard cluster's ACMR
// (average cache miss ratio) is measured, then the cluster is re-scanned and split
// whenever the running ACMR drops to threshold * cluster ACMR. Writes face indices
// of cluster starts into 'destination' and returns the resulting cluster count
// (always >= cluster_count, since every hard boundary is preserved).
static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = 0;

	size_t result = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		// hard cluster extents, in faces
		size_t start = clusters[it];
		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
		assert(start < end);

		// reset cache (advancing the clock past cache_size invalidates all entries)
		timestamp += cache_size + 1;

		// measure cluster ACMR
		unsigned int cluster_misses = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			cluster_misses += m;
		}

		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));

		// first cluster always starts from the hard cluster boundary
		destination[result++] = unsigned(start);

		// reset cache
		timestamp += cache_size + 1;

		unsigned int running_misses = 0;
		unsigned int running_faces = 0;

		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);

			running_misses += m;
			running_faces += 1;

			if (float(running_misses) / float(running_faces) <= cluster_threshold)
			{
				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
				// note that this may mean that we add 'end` to destination for the last triangle, which will imply that the last
				// cluster is empty; however, the 'pop_back' after the loop will clean it up
				destination[result++] = unsigned(i + 1);

				// reset cache
				timestamp += cache_size + 1;

				running_misses = 0;
				running_faces = 0;
			}
		}

		// each time we reach the target ACMR we flush the cluster
		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
		if (destination[result - 1] != start)
		{
			result--;
		}
	}

	assert(result >= cluster_count);
	assert(result <= index_count / 3);

	return result;
}
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
// Reorders triangles to reduce overdraw while limiting vertex cache degradation:
// the (cache-optimized) input is split into clusters at hard/soft cache boundaries,
// clusters are sorted back-to-front-ish by calculateSortData's key, and the index
// buffer is rewritten cluster by cluster. 'threshold' controls how much ACMR
// degradation is tolerated when creating soft boundaries. Supports in-place use
// (destination == indices).
void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	unsigned int cache_size = 16;

	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);

	// generate hard boundaries from full-triangle cache misses
	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);

	// generate soft boundaries
	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);

	const unsigned int* clusters = soft_clusters;
	size_t cluster_count = soft_cluster_count;

	// fill sort data
	float* sort_data = allocator.allocate<float>(cluster_count);
	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);

	// sort clusters using sort data
	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);

	// fill output buffer: copy whole clusters in sorted order
	size_t offset = 0;

	for (size_t it = 0; it < cluster_count; ++it)
	{
		unsigned int cluster = sort_order[it];
		assert(cluster < cluster_count);

		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
		offset += cluster_end - cluster_begin;
	}

	assert(offset == index_count);
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,269 @@
|
|||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
|
||||
// This work is based on:
|
||||
// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
// Picks the buffered triangle whose minimum vertex valence is the smallest;
// starting strips at rarely-referenced vertices helps maximize strip length.
// Returns the index of the chosen triangle in the buffer (ties keep the earliest).
static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
{
	unsigned int best_index = 0;
	unsigned int best_valence = ~0u;

	for (size_t i = 0; i < buffer_size; ++i)
	{
		unsigned int va = valence[buffer[i][0]];
		unsigned int vb = valence[buffer[i][1]];
		unsigned int vc = valence[buffer[i][2]];

		// minimum of the three corner valences
		unsigned int lowest = (va < vb && va < vc) ? va : (vb < vc) ? vb : vc;

		if (lowest < best_valence)
		{
			best_index = unsigned(i);
			best_valence = lowest;
		}
	}

	return best_index;
}
|
||||
|
||||
// Searches the buffer for a triangle containing the directed edge (e0, e1).
// Returns (triangle index << 2) | position-of-the-third-vertex, or -1 if no
// buffered triangle shares that edge.
static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1)
{
	for (size_t i = 0; i < buffer_size; ++i)
	{
		const unsigned int* tri = buffer[i];

		// check the three directed edges (a,b), (b,c), (c,a); the low 2 bits of the
		// result encode the corner opposite the matched edge: 2, 0, 1 respectively
		for (int e = 0; e < 3; ++e)
		{
			unsigned int first = tri[e];
			unsigned int second = tri[e == 2 ? 0 : e + 1];

			if (e0 == first && e1 == second)
				return (int(i) << 2) | ((e + 2) % 3);
		}
	}

	return -1;
}
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
|
||||
{
|
||||
assert(destination != indices);
|
||||
assert(index_count % 3 == 0);
|
||||
|
||||
using namespace meshopt;
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
|
||||
const size_t buffer_capacity = 8;
|
||||
|
||||
unsigned int buffer[buffer_capacity][3] = {};
|
||||
unsigned int buffer_size = 0;
|
||||
|
||||
size_t index_offset = 0;
|
||||
|
||||
unsigned int strip[2] = {};
|
||||
unsigned int parity = 0;
|
||||
|
||||
size_t strip_size = 0;
|
||||
|
||||
// compute vertex valence; this is used to prioritize starting triangle for strips
|
||||
unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
|
||||
memset(valence, 0, vertex_count * sizeof(unsigned int));
|
||||
|
||||
for (size_t i = 0; i < index_count; ++i)
|
||||
{
|
||||
unsigned int index = indices[i];
|
||||
assert(index < vertex_count);
|
||||
|
||||
valence[index]++;
|
||||
}
|
||||
|
||||
int next = -1;
|
||||
|
||||
while (buffer_size > 0 || index_offset < index_count)
|
||||
{
|
||||
assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3));
|
||||
|
||||
// fill triangle buffer
|
||||
while (buffer_size < buffer_capacity && index_offset < index_count)
|
||||
{
|
||||
buffer[buffer_size][0] = indices[index_offset + 0];
|
||||
buffer[buffer_size][1] = indices[index_offset + 1];
|
||||
buffer[buffer_size][2] = indices[index_offset + 2];
|
||||
|
||||
buffer_size++;
|
||||
index_offset += 3;
|
||||
}
|
||||
|
||||
assert(buffer_size > 0);
|
||||
|
||||
if (next >= 0)
|
||||
{
|
||||
unsigned int i = next >> 2;
|
||||
unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
|
||||
unsigned int v = buffer[i][next & 3];
|
||||
|
||||
// ordered removal from the buffer
|
||||
memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
|
||||
buffer_size--;
|
||||
|
||||
// update vertex valences for strip start heuristic
|
||||
valence[a]--;
|
||||
valence[b]--;
|
||||
valence[c]--;
|
||||
|
||||
// find next triangle (note that edge order flips on every iteration)
|
||||
// in some cases we need to perform a swap to pick a different outgoing triangle edge
|
||||
// for [a b c], the default strip edge is [b c], but we might want to use [a c]
|
||||
int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]);
|
||||
int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1;
|
||||
|
||||
if (cont < 0 && swap >= 0)
|
||||
{
|
||||
// [a b c] => [a b a c]
|
||||
destination[strip_size++] = strip[0];
|
||||
destination[strip_size++] = v;
|
||||
|
||||
// next strip has same winding
|
||||
// ? a b => b a v
|
||||
strip[1] = v;
|
||||
|
||||
next = swap;
|
||||
}
|
||||
else
|
||||
{
|
||||
// emit the next vertex in the strip
|
||||
destination[strip_size++] = v;
|
||||
|
||||
// next strip has flipped winding
|
||||
strip[0] = strip[1];
|
||||
strip[1] = v;
|
||||
parity ^= 1;
|
||||
|
||||
next = cont;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// if we didn't find anything, we need to find the next new triangle
|
||||
// we use a heuristic to maximize the strip length
|
||||
unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
|
||||
unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
|
||||
|
||||
// ordered removal from the buffer
|
||||
memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
|
||||
buffer_size--;
|
||||
|
||||
// update vertex valences for strip start heuristic
|
||||
valence[a]--;
|
||||
valence[b]--;
|
||||
valence[c]--;
|
||||
|
||||
// we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration
|
||||
int ea = findStripNext(buffer, buffer_size, c, b);
|
||||
int eb = findStripNext(buffer, buffer_size, a, c);
|
||||
int ec = findStripNext(buffer, buffer_size, b, a);
|
||||
|
||||
// in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest
|
||||
// triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear
|
||||
// reasons - slightly improves the stripification efficiency
|
||||
int mine = INT_MAX;
|
||||
mine = (ea >= 0 && mine > ea) ? ea : mine;
|
||||
mine = (eb >= 0 && mine > eb) ? eb : mine;
|
||||
mine = (ec >= 0 && mine > ec) ? ec : mine;
|
||||
|
||||
if (ea == mine)
|
||||
{
|
||||
// keep abc
|
||||
next = ea;
|
||||
}
|
||||
else if (eb == mine)
|
||||
{
|
||||
// abc -> bca
|
||||
unsigned int t = a;
|
||||
a = b, b = c, c = t;
|
||||
|
||||
next = eb;
|
||||
}
|
||||
else if (ec == mine)
|
||||
{
|
||||
// abc -> cab
|
||||
unsigned int t = c;
|
||||
c = b, b = a, a = t;
|
||||
|
||||
next = ec;
|
||||
}
|
||||
|
||||
// emit the new strip; we use restart indices
|
||||
if (strip_size)
|
||||
destination[strip_size++] = ~0u;
|
||||
|
||||
destination[strip_size++] = a;
|
||||
destination[strip_size++] = b;
|
||||
destination[strip_size++] = c;
|
||||
|
||||
// new strip always starts with the same edge winding
|
||||
strip[0] = b;
|
||||
strip[1] = c;
|
||||
parity = 1;
|
||||
}
|
||||
}
|
||||
|
||||
return strip_size;
|
||||
}
|
||||
|
||||
// Returns the worst-case number of indices meshopt_stripify may write for a
// triangle list of index_count indices.
size_t meshopt_stripifyBound(size_t index_count)
{
	assert(index_count % 3 == 0);

	// in the worst case every triangle starts a new strip:
	// one restart index plus three vertex indices per triangle
	size_t triangle_count = index_count / 3;

	return triangle_count * 4;
}
|
||||
|
||||
// Converts a triangle strip (with ~0u used as a restart index) into a plain
// triangle list, dropping degenerate triangles introduced by strip swaps.
// destination must not alias indices; returns the number of indices written.
size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count)
{
	assert(destination != indices);

	size_t result = 0;
	size_t strip_start = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];

		if (index == ~0u)
		{
			// restart marker: the next strip begins right after it
			strip_start = i + 1;
			continue;
		}

		size_t strip_offset = i - strip_start;

		// the first two indices of a strip don't complete a triangle yet
		if (strip_offset < 2)
			continue;

		unsigned int a = indices[i - 2];
		unsigned int b = indices[i - 1];
		unsigned int c = index;

		// every other triangle in a strip has reversed winding; swap to restore it
		if (strip_offset % 2 == 1)
		{
			unsigned int t = a;
			a = b;
			b = t;
		}

		// although we use restart indices, strip swaps still produce degenerate triangles, so skip them
		bool degenerate = (a == b) || (a == c) || (b == c);

		if (!degenerate)
		{
			destination[result + 0] = a;
			destination[result + 1] = b;
			destination[result + 2] = c;
			result += 3;
		}
	}

	return result;
}
|
||||
|
||||
// Returns the worst-case number of indices meshopt_unstripify may write for a
// strip of index_count indices.
size_t meshopt_unstripifyBound(size_t index_count)
{
	assert(index_count == 0 || index_count >= 3);

	if (index_count == 0)
		return 0;

	// a strip of N indices contains at most N-2 triangles
	return (index_count - 2) * 3;
}
|
|
@ -0,0 +1,73 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
// Simulates a FIFO vertex cache (plus optional warp and primitive-group limits)
// over the index stream and reports transform statistics.
// A vertex is considered cached iff timestamp - cache_timestamps[v] <= cache_size;
// cache_timestamps[v] == 0 means "never referenced", and the initial
// timestamp of cache_size + 1 guarantees every vertex starts out of cache.
meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
{
	assert(index_count % 3 == 0);
	assert(cache_size >= 3);
	assert(warp_size == 0 || warp_size >= 3);

	meshopt_Allocator allocator;

	meshopt_VertexCacheStatistics result = {};

	unsigned int warp_offset = 0;      // vertices transformed in the current warp
	unsigned int primgroup_offset = 0; // triangles in the current primitive group

	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	unsigned int timestamp = cache_size + 1;

	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		// would each vertex of this triangle be a cache miss right now?
		bool ac = (timestamp - cache_timestamps[a]) > cache_size;
		bool bc = (timestamp - cache_timestamps[b]) > cache_size;
		bool cc = (timestamp - cache_timestamps[c]) > cache_size;

		// flush cache if triangle doesn't fit into warp or into the primitive buffer
		if ((primgroup_size && primgroup_offset == primgroup_size) || (warp_size && warp_offset + ac + bc + cc > warp_size))
		{
			result.warps_executed += warp_offset > 0;

			warp_offset = 0;
			primgroup_offset = 0;

			// reset cache: advancing the timestamp past cache_size evicts everything
			timestamp += cache_size + 1;
		}

		// update cache and add vertices to warp
		for (int j = 0; j < 3; ++j)
		{
			unsigned int index = indices[i + j];

			if (timestamp - cache_timestamps[index] > cache_size)
			{
				// cache miss: transform the vertex and record its FIFO position
				cache_timestamps[index] = timestamp++;
				result.vertices_transformed++;
				warp_offset++;
			}
		}

		primgroup_offset++;
	}

	// any vertex ever referenced has a non-zero timestamp
	size_t unique_vertex_count = 0;

	for (size_t i = 0; i < vertex_count; ++i)
		unique_vertex_count += cache_timestamps[i] > 0;

	result.warps_executed += warp_offset > 0;

	// acmr: transformed vertices per triangle; atvr: transforms per unique vertex
	result.acmr = index_count == 0 ? 0 : float(result.vertices_transformed) / float(index_count / 3);
	result.atvr = unique_vertex_count == 0 ? 0 : float(result.vertices_transformed) / float(unique_vertex_count);

	return result;
}
|
|
@ -0,0 +1,453 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
// This work is based on:
|
||||
// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
|
||||
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
// largest FIFO cache size the cache score table below covers
const size_t kCacheSizeMax = 16;
// live (remaining) triangle counts are clamped to this before indexing the live score table
const size_t kValenceMax = 8;

// score contribution by cache position; index 0 corresponds to "not in cache"
// (cache_position == -1 in vertexScore)
static const float kVertexScoreTableCache[1 + kCacheSizeMax] = {
	0.f,
	0.792f, 0.767f, 0.764f, 0.956f, 0.827f, 0.751f, 0.820f, 0.864f, 0.738f, 0.788f, 0.642f, 0.646f, 0.165f, 0.654f, 0.545f, 0.284f};

// score contribution by number of triangles still using the vertex (clamped to kValenceMax);
// index 0 corresponds to a vertex with no live triangles
static const float kVertexScoreTableLive[1 + kValenceMax] = {
	0.f,
	0.994f, 0.721f, 0.479f, 0.423f, 0.174f, 0.080f, 0.249f, 0.056f};
|
||||
|
||||
// Vertex-to-triangle adjacency in a counting-sort (CSR-like) layout:
// triangles adjacent to vertex v are data[offsets[v]] .. data[offsets[v] + counts[v] - 1].
struct TriangleAdjacency
{
	unsigned int* counts;  // number of adjacent triangles per vertex
	unsigned int* offsets; // start of each vertex's slice within data
	unsigned int* data;    // triangle indices, grouped by vertex
};
|
||||
|
||||
// Builds vertex->triangle adjacency via a counting sort:
// count references per vertex, prefix-sum into offsets, scatter triangle ids,
// then restore the offsets that the scatter pass advanced.
static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
	size_t face_count = index_count / 3;

	// allocate arrays
	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
	adjacency.data = allocator.allocate<unsigned int>(index_count);

	// fill triangle counts
	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; ++i)
	{
		assert(indices[i] < vertex_count);

		adjacency.counts[indices[i]]++;
	}

	// fill offset table (exclusive prefix sum of counts)
	unsigned int offset = 0;

	for (size_t i = 0; i < vertex_count; ++i)
	{
		adjacency.offsets[i] = offset;
		offset += adjacency.counts[i];
	}

	assert(offset == index_count);

	// fill triangle data; each write advances the vertex's offset cursor
	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];

		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
	}

	// fix offsets that have been disturbed by the previous pass
	// (each cursor advanced exactly counts[i] times, so subtracting restores it)
	for (size_t i = 0; i < vertex_count; ++i)
	{
		assert(adjacency.offsets[i] >= adjacency.counts[i]);

		adjacency.offsets[i] -= adjacency.counts[i];
	}
}
|
||||
|
||||
// Picks the next fanning center after a dead-end: first pop still-live vertices
// off the dead-end stack, then fall back to scanning forward in input order.
// Returns ~0u when no vertex with live triangles remains.
static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
{
	// drain the dead-end stack looking for a vertex with unemitted triangles
	while (dead_end_top > 0)
	{
		unsigned int candidate = dead_end[dead_end_top - 1];
		dead_end_top--;

		if (live_triangles[candidate] != 0)
			return candidate;
	}

	// otherwise take the first live vertex in input order
	for (; input_cursor < vertex_count; ++input_cursor)
	{
		if (live_triangles[input_cursor] != 0)
			return input_cursor;
	}

	return ~0u;
}
|
||||
|
||||
// Chooses the next fanning center among candidate vertices.
// Prefers the vertex that has been in the simulated FIFO cache the longest,
// but only rewards cache position when all of the vertex's remaining triangles
// would still fit in the cache after fanning; returns ~0u if none are live.
static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
{
	unsigned int best_candidate = ~0u;
	int best_priority = -1;

	for (const unsigned int* candidate = next_candidates_begin; candidate != next_candidates_end; ++candidate)
	{
		unsigned int vertex = *candidate;

		// skip vertices with no triangles left to emit
		if (live_triangles[vertex] == 0)
			continue;

		unsigned int cache_age = timestamp - cache_timestamps[vertex];

		// will the vertex still be in cache after fanning? if so, older is better
		int priority = (2 * live_triangles[vertex] + cache_age <= cache_size) ? int(cache_age) : 0;

		if (priority > best_priority)
		{
			best_candidate = vertex;
			best_priority = priority;
		}
	}

	return best_candidate;
}
|
||||
|
||||
static float vertexScore(int cache_position, unsigned int live_triangles)
|
||||
{
|
||||
assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
|
||||
|
||||
unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
|
||||
|
||||
return kVertexScoreTableCache[1 + cache_position] + kVertexScoreTableLive[live_triangles_clamped];
|
||||
}
|
||||
|
||||
// Scans forward from input_cursor for the first triangle that has not been
// emitted yet; returns ~0u once every triangle has been emitted.
static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
{
	// advance past triangles that were already emitted
	while (input_cursor < face_count && emitted_flags[input_cursor])
		++input_cursor;

	return (input_cursor < face_count) ? input_cursor : ~0u;
}
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
// Reorders triangles to improve vertex cache hit rate using a greedy
// score-driven algorithm (per the file header, based on Forsyth's
// linear-speed vertex cache optimisation).
// destination receives index_count indices; destination == indices is supported.
void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	// simulated cache size; must not exceed the score table size
	unsigned int cache_size = 16;
	assert(cache_size <= kCacheSizeMax);

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts (number of not-yet-emitted triangles per vertex)
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	// compute initial vertex scores (no vertex is in cache yet, hence -1)
	float* vertex_scores = allocator.allocate<float>(vertex_count);

	for (size_t i = 0; i < vertex_count; ++i)
		vertex_scores[i] = vertexScore(-1, live_triangles[i]);

	// compute triangle scores as the sum of their vertex scores
	float* triangle_scores = allocator.allocate<float>(face_count);

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0];
		unsigned int b = indices[i * 3 + 1];
		unsigned int c = indices[i * 3 + 2];

		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
	}

	// double-buffered cache model; +3 slots absorb the newly inserted triangle
	// before the cache is truncated back to cache_size
	unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
	unsigned int* cache = cache_holder;
	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
	size_t cache_count = 0;

	unsigned int current_triangle = 0;
	unsigned int input_cursor = 1;

	unsigned int output_triangle = 0;

	while (current_triangle != ~0u)
	{
		assert(output_triangle < face_count);

		unsigned int a = indices[current_triangle * 3 + 0];
		unsigned int b = indices[current_triangle * 3 + 1];
		unsigned int c = indices[current_triangle * 3 + 2];

		// output indices
		destination[output_triangle * 3 + 0] = a;
		destination[output_triangle * 3 + 1] = b;
		destination[output_triangle * 3 + 2] = c;
		output_triangle++;

		// update emitted flags
		emitted_flags[current_triangle] = true;
		triangle_scores[current_triangle] = 0;

		// new triangle's vertices go to the front of the cache
		size_t cache_write = 0;
		cache_new[cache_write++] = a;
		cache_new[cache_write++] = b;
		cache_new[cache_write++] = c;

		// old cache entries follow, skipping duplicates of a/b/c
		for (size_t i = 0; i < cache_count; ++i)
		{
			unsigned int index = cache[i];

			if (index != a && index != b && index != c)
			{
				cache_new[cache_write++] = index;
			}
		}

		// swap cache buffers and truncate to the modeled cache size
		unsigned int* cache_temp = cache;
		cache = cache_new, cache_new = cache_temp;
		cache_count = cache_write > cache_size ? cache_size : cache_write;

		// update live triangle counts
		live_triangles[a]--;
		live_triangles[b]--;
		live_triangles[c]--;

		// remove emitted triangle from adjacency data
		// this makes sure that we spend less time traversing these lists on subsequent iterations
		for (size_t k = 0; k < 3; ++k)
		{
			unsigned int index = indices[current_triangle * 3 + k];

			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
			size_t neighbours_size = adjacency.counts[index];

			for (size_t i = 0; i < neighbours_size; ++i)
			{
				unsigned int tri = neighbours[i];

				if (tri == current_triangle)
				{
					// swap-remove keeps the list compact
					neighbours[i] = neighbours[neighbours_size - 1];
					adjacency.counts[index]--;
					break;
				}
			}
		}

		unsigned int best_triangle = ~0u;
		float best_score = 0;

		// update cache positions, vertex scores and triangle scores, and find next best triangle
		// note: cache_write (not cache_count) iterations, so vertices pushed out of
		// the cache this step also get rescored with cache_position == -1
		for (size_t i = 0; i < cache_write; ++i)
		{
			unsigned int index = cache[i];

			int cache_position = i >= cache_size ? -1 : int(i);

			// update vertex score
			float score = vertexScore(cache_position, live_triangles[index]);
			float score_diff = score - vertex_scores[index];

			vertex_scores[index] = score;

			// propagate the score delta to all remaining triangles of this vertex
			const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
			const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];

			for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
			{
				unsigned int tri = *it;
				assert(!emitted_flags[tri]);

				float tri_score = triangle_scores[tri] + score_diff;
				assert(tri_score > 0);

				if (best_score < tri_score)
				{
					best_triangle = tri;
					best_score = tri_score;
				}

				triangle_scores[tri] = tri_score;
			}
		}

		// step through input triangles in order if we hit a dead-end
		current_triangle = best_triangle;

		if (current_triangle == ~0u)
		{
			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
		}
	}

	assert(input_cursor == face_count);
	assert(output_triangle == face_count);
}
|
||||
|
||||
// Reorders triangles for a FIFO cache of the given size by fanning around a
// current vertex (per the file header, based on Sander/Nehab/Barczak's
// "Tipsify"-style approach). destination == indices is supported.
void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(cache_size >= 3);

	meshopt_Allocator allocator;

	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;

	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}

	size_t face_count = index_count / 3;

	// build adjacency information
	TriangleAdjacency adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live triangle counts
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	// cache time stamps; a vertex is "in cache" iff timestamp - stamp <= cache_size
	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));

	// dead-end stack (vertices of recently emitted triangles)
	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
	unsigned int dead_end_top = 0;

	// emitted flags
	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	unsigned int current_vertex = 0;

	unsigned int timestamp = cache_size + 1;
	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end

	unsigned int output_triangle = 0;

	while (current_vertex != ~0u)
	{
		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;

		// emit all vertex neighbours
		const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
		const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];

		for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
		{
			unsigned int triangle = *it;

			if (!emitted_flags[triangle])
			{
				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];

				// output indices
				destination[output_triangle * 3 + 0] = a;
				destination[output_triangle * 3 + 1] = b;
				destination[output_triangle * 3 + 2] = c;
				output_triangle++;

				// update dead-end stack
				dead_end[dead_end_top + 0] = a;
				dead_end[dead_end_top + 1] = b;
				dead_end[dead_end_top + 2] = c;
				dead_end_top += 3;

				// update live triangle counts
				live_triangles[a]--;
				live_triangles[b]--;
				live_triangles[c]--;

				// update cache info
				// if vertex is not in cache, put it in cache
				if (timestamp - cache_timestamps[a] > cache_size)
					cache_timestamps[a] = timestamp++;

				if (timestamp - cache_timestamps[b] > cache_size)
					cache_timestamps[b] = timestamp++;

				if (timestamp - cache_timestamps[c] > cache_size)
					cache_timestamps[c] = timestamp++;

				// update emitted flags
				emitted_flags[triangle] = true;
			}
		}

		// next candidates are the ones we pushed to dead-end stack just now
		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;

		// get next vertex: prefer a cached neighbour, fall back to the dead-end stack
		current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);

		if (current_vertex == ~0u)
		{
			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
		}
	}

	assert(output_triangle == face_count);
}
|
|
@ -0,0 +1,954 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#define SIMD_NEON
|
||||
#endif
|
||||
|
||||
#if defined(__AVX__) || defined(__SSSE3__)
|
||||
#define SIMD_SSE
|
||||
#endif
|
||||
|
||||
#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
|
||||
#define SIMD_SSE
|
||||
#define SIMD_FALLBACK
|
||||
#include <intrin.h> // __cpuid
|
||||
#endif
|
||||
|
||||
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
|
||||
#define SIMD_NEON
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_SSE
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_NEON
|
||||
#if defined(_MSC_VER) && defined(_M_ARM64)
|
||||
#include <arm64_neon.h>
|
||||
#else
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef TRACE
|
||||
#define TRACE 0
|
||||
#endif
|
||||
|
||||
#if TRACE
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
namespace meshopt
|
||||
{
|
||||
|
||||
// byte written at the start of encoded vertex data — NOTE(review): presumably a
// format/version magic; its use is outside this chunk, confirm against encode/decode entry points
const unsigned char kVertexHeader = 0xa0;

// target size in bytes for one encode/decode scratch block
const size_t kVertexBlockSizeBytes = 8192;
// upper bound on vertices per block regardless of vertex size
const size_t kVertexBlockMaxSize = 256;
// number of bytes encoded together under one 2-bit header entry
const size_t kByteGroupSize = 16;
// minimum space the codec keeps available before writing/reading a group
const size_t kTailMaxSize = 32;
||||
|
||||
static size_t getVertexBlockSize(size_t vertex_size)
|
||||
{
|
||||
// make sure the entire block fits into the scratch buffer
|
||||
size_t result = kVertexBlockSizeBytes / vertex_size;
|
||||
|
||||
// align to byte group size; we encode each byte as a byte group
|
||||
// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
|
||||
result &= ~(kByteGroupSize - 1);
|
||||
|
||||
return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
|
||||
}
|
||||
|
||||
// Maps a signed byte delta to an unsigned code so small magnitudes get small values:
// 0, -1, 1, -2, 2, ... -> 0, 1, 2, 3, 4, ...
inline unsigned char zigzag8(unsigned char v)
{
	// 0x00 for non-negative inputs, 0xff for negative (arithmetic shift of the sign bit)
	unsigned char sign_mask = (unsigned char)((signed char)(v) >> 7);

	return (unsigned char)(sign_mask ^ (v << 1));
}
|
||||
|
||||
// Inverse of zigzag8: 0, 1, 2, 3, 4, ... -> 0, -1, 1, -2, 2, ...
inline unsigned char unzigzag8(unsigned char v)
{
	// 0xff for odd codes (negative deltas), 0x00 for even codes
	unsigned char sign_mask = (unsigned char)(0 - (v & 1));

	return (unsigned char)(sign_mask ^ (v >> 1));
}
|
||||
|
||||
#if TRACE
|
||||
struct Stats
|
||||
{
|
||||
size_t size;
|
||||
size_t header;
|
||||
size_t bitg[4];
|
||||
size_t bitb[4];
|
||||
};
|
||||
|
||||
Stats* bytestats;
|
||||
Stats vertexstats[256];
|
||||
#endif
|
||||
|
||||
static bool encodeBytesGroupZero(const unsigned char* buffer)
|
||||
{
|
||||
for (size_t i = 0; i < kByteGroupSize; ++i)
|
||||
if (buffer[i])
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
|
||||
{
|
||||
assert(bits >= 1 && bits <= 8);
|
||||
|
||||
if (bits == 1)
|
||||
return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
|
||||
|
||||
if (bits == 8)
|
||||
return kByteGroupSize;
|
||||
|
||||
size_t result = kByteGroupSize * bits / 8;
|
||||
|
||||
unsigned char sentinel = (1 << bits) - 1;
|
||||
|
||||
for (size_t i = 0; i < kByteGroupSize; ++i)
|
||||
result += buffer[i] >= sentinel;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Packs one group of kByteGroupSize bytes at the given bit width and returns
// the new write pointer. bits == 1 means "all zeros" and emits nothing;
// bits == 8 is a verbatim copy. The caller has already measured the size
// (encodeBytesGroupMeasure) and checked that the output fits.
static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
{
	assert(bits >= 1 && bits <= 8);

	if (bits == 1)
		return data;

	if (bits == 8)
	{
		memcpy(data, buffer, kByteGroupSize);
		return data + kByteGroupSize;
	}

	// number of values packed into each output byte
	size_t byte_size = 8 / bits;
	assert(kByteGroupSize % byte_size == 0);

	// fixed portion: bits bits for each value
	// variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
	unsigned char sentinel = (1 << bits) - 1;

	for (size_t i = 0; i < kByteGroupSize; i += byte_size)
	{
		unsigned char byte = 0;

		for (size_t k = 0; k < byte_size; ++k)
		{
			// values that don't fit are replaced by the sentinel and escaped below
			unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];

			byte <<= bits;
			byte |= enc;
		}

		*data++ = byte;
	}

	// append the full bytes for every escaped (sentinel) value, in order
	for (size_t i = 0; i < kByteGroupSize; ++i)
	{
		if (buffer[i] >= sentinel)
		{
			*data++ = buffer[i];
		}
	}

	return data;
}
|
||||
|
||||
// Encodes buffer (a multiple of kByteGroupSize bytes) into data as a header of
// 2-bit width selectors followed by the packed groups, trying each candidate
// bit width per group and keeping the smallest. Returns the new write pointer,
// or 0 if the output buffer is too small.
static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	memset(header, 0, header_size);

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		// conservative bound: any group encoding fits within kTailMaxSize bytes
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		// pick the bit width (1, 2, 4 or 8) that yields the smallest encoding
		int best_bits = 8;
		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);

		for (int bits = 1; bits < 8; bits *= 2)
		{
			size_t size = encodeBytesGroupMeasure(buffer + i, bits);

			if (size < best_size)
			{
				best_bits = bits;
				best_size = size;
			}
		}

		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
		assert((1 << bitslog2) == best_bits);

		// store the 2-bit selector for this group in the header
		size_t header_offset = i / kByteGroupSize;

		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);

		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);

		assert(data + best_size == next);
		data = next;

#if TRACE > 1
		bytestats->bitg[bitslog2]++;
		bytestats->bitb[bitslog2] += best_size;
#endif
	}

#if TRACE > 1
	bytestats->header += header_size;
#endif

	return data;
}
|
||||
|
||||
// Encodes one block of vertices: for each byte of the vertex stride, builds a
// stream of zigzag-encoded deltas (relative to the previous vertex, seeded from
// last_vertex) and compresses it with encodeBytes. Updates last_vertex to the
// block's final vertex. Returns the new write pointer, or 0 on overflow.
static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	unsigned char buffer[kVertexBlockMaxSize];
	assert(sizeof(buffer) % kByteGroupSize == 0);

	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
	memset(buffer, 0, sizeof(buffer));

	for (size_t k = 0; k < vertex_size; ++k)
	{
		size_t vertex_offset = k;

		// delta baseline: byte k of the last vertex of the previous block
		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			buffer[i] = zigzag8(vertex_data[vertex_offset] - p);

			p = vertex_data[vertex_offset];

			vertex_offset += vertex_size;
		}

#if TRACE
		const unsigned char* olddata = data;
		bytestats = &vertexstats[k];
#endif

		// round the stream length up to a whole number of byte groups
		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
		if (!data)
			return 0;

#if TRACE
		bytestats = 0;
		vertexstats[k].size += data - olddata;
#endif
	}

	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
|
||||
|
||||
#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON))
|
||||
// Scalar decoder for one byte group: unpacks kByteGroupSize values at the bit
// width selected by bitslog2 (0 -> all zeros, 1 -> 2-bit, 2 -> 4-bit,
// 3 -> verbatim), replacing sentinel values with escape bytes read from the
// variable-length tail. Returns the pointer past the consumed input.
static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
// READ pulls the next packed byte; NEXT extracts one bits-wide value from it,
// substituting the escape byte at data_var when the value equals the sentinel
#define READ() byte = *data++
#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)

	unsigned char byte, enc, encv;
	const unsigned char* data_var;

	switch (bitslog2)
	{
	case 0:
		// all-zero group: nothing was encoded
		memset(buffer, 0, kByteGroupSize);
		return data;
	case 1:
		// escape bytes start after the 4 packed bytes
		data_var = data + 4;

		// 4 groups with 4 2-bit values in each byte
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);

		return data_var;
	case 2:
		// escape bytes start after the 8 packed bytes
		data_var = data + 8;

		// 8 groups with 2 4-bit values in each byte
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);
		READ(), NEXT(4), NEXT(4);

		return data_var;
	case 3:
		// verbatim copy
		memcpy(buffer, data, kByteGroupSize);
		return data + kByteGroupSize;
	default:
		assert(!"Unexpected bit length"); // This can never happen since bitslog2 is a 2-bit value
		return data;
	}

#undef READ
#undef NEXT
}
|
||||
|
||||
// Decodes one byte stream produced by encodeBytes: reads the 2-bit group
// selectors from the header, then decodes each group in turn. Returns the
// pointer past the consumed input, or 0 on a malformed/truncated stream.
static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
{
	assert(buffer_size % kByteGroupSize == 0);

	const unsigned char* header = data;

	// round number of groups to 4 to get number of header bytes
	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;

	if (size_t(data_end - data) < header_size)
		return 0;

	data += header_size;

	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
	{
		// mirror the encoder's conservative per-group bound to avoid overreads
		if (size_t(data_end - data) < kTailMaxSize)
			return 0;

		size_t header_offset = i / kByteGroupSize;

		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;

		data = decodeBytesGroup(data, buffer + i, bitslog2);
	}

	return data;
}
|
||||
|
||||
// Scalar fallback: decodes one block of up to kVertexBlockMaxSize vertices.
// Each of the vertex_size byte positions is stored as a separate zigzag-delta
// stream; streams are decoded, prefix-summed against last_vertex, and
// re-interleaved into vertex_data. last_vertex is updated to the block's final
// vertex so the next block continues the delta chain.
// Returns the new stream position, or 0 on malformed input.
static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	// one decoded byte stream, and the interleaved output staging area
	unsigned char buffer[kVertexBlockMaxSize];
	unsigned char transposed[kVertexBlockSizeBytes];

	// streams are encoded in whole groups, so round the element count up
	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; ++k)
	{
		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
		if (!data)
			return 0;

		size_t vertex_offset = k;

		// running prefix: byte k of the previous vertex
		unsigned char p = last_vertex[k];

		for (size_t i = 0; i < vertex_count; ++i)
		{
			// undo zigzag, then undo the delta against the previous vertex
			unsigned char v = unzigzag8(buffer[i]) + p;

			transposed[vertex_offset] = v;
			p = v;

			vertex_offset += vertex_size;
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	// seed the next block's delta decoding with this block's last vertex
	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
|
||||
#endif
|
||||
|
||||
#if defined(SIMD_SSE) || defined(SIMD_NEON)
|
||||
// For each 8-bit exception mask: the byte shuffle that scatters consecutive
// exception bytes into the flagged lanes, and the number of exception bytes.
static unsigned char kDecodeBytesGroupShuffle[256][8];
static unsigned char kDecodeBytesGroupCount[256];

static bool decodeBytesGroupBuildTables()
{
	// For every possible mask, flagged lanes receive the next exception byte's
	// index in order; unflagged lanes get 0x80, which zeroes the lane in both
	// SSSE3 pshufb and NEON vtbl.
	for (int mask = 0; mask < 256; ++mask)
	{
		unsigned char count = 0;

		for (int bit = 0; bit < 8; ++bit)
		{
			if (mask & (1 << bit))
			{
				kDecodeBytesGroupShuffle[mask][bit] = count;
				count++;
			}
			else
			{
				kDecodeBytesGroupShuffle[mask][bit] = 0x80;
			}
		}

		kDecodeBytesGroupCount[mask] = count;
	}

	return true;
}

// Forces table construction during static initialization; decode entry points
// assert on this flag before using the SIMD paths.
static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_SSE
|
||||
// Builds a 16-lane pshufb control from two 8-bit exception masks: the low half
// scatters exception bytes into lanes 0-7, the high half into lanes 8-15,
// with the high half's source indices offset by the number of exception bytes
// the low half consumed.
static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
{
	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);

	// 0x80 entries stay >= 0x80 after the offset, so they still zero the lane
	__m128i sm1r = _mm_add_epi8(sm1, sm1off);

	return _mm_unpacklo_epi64(sm0, sm1r);
}
|
||||
|
||||
// Transposes the 4x4 matrix of 32-bit lanes spread across four 16-byte
// registers: an 8-bit interleave pass followed by a 16-bit interleave pass.
static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
	// pass 1: interleave bytes of the 0/1 pair and of the 2/3 pair
	__m128i b0 = _mm_unpacklo_epi8(x0, x1);
	__m128i b2 = _mm_unpacklo_epi8(x2, x3);
	__m128i b1 = _mm_unpackhi_epi8(x0, x1);
	__m128i b3 = _mm_unpackhi_epi8(x2, x3);

	// pass 2: interleave 16-bit pairs across the halves
	x0 = _mm_unpacklo_epi16(b0, b2);
	x1 = _mm_unpackhi_epi16(b0, b2);
	x2 = _mm_unpacklo_epi16(b1, b3);
	x3 = _mm_unpackhi_epi16(b1, b3);
}
|
||||
|
||||
// Per-byte zigzag decode: n -> (n >> 1) ^ -(n & 1), mapping the unsigned
// sequence 0,1,2,3,... back to 0,-1,1,-2,...
static __m128i unzigzag8(__m128i v)
{
	__m128i ones = _mm_set1_epi8(1);
	__m128i low7 = _mm_set1_epi8(127);

	// -(v & 1): all-ones for odd bytes, zero for even bytes
	__m128i sign = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, ones));
	// per-byte logical shift right by 1, emulated via a 16-bit shift plus mask
	__m128i half = _mm_and_si128(_mm_srli_epi16(v, 1), low7);

	return _mm_xor_si128(sign, half);
}
|
||||
|
||||
// SSE path: decodes one 16-byte group. Mirrors the scalar decodeBytesGroup
// contract: bitslog2 selects 0/2/4/8-bit fields, saturated fields pull their
// byte from the exception stream that follows the packed selectors.
// Returns the stream position after the group.
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		// all 16 output bytes are zero; no stream bytes consumed
		__m128i result = _mm_setzero_si128();

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data;
	}

	case 1:
	{
// GCC/clang need an explicit unaligned-int type for the 4-byte selector load
#ifdef __GNUC__
		typedef int __attribute__((aligned(1))) unaligned_int;
#else
		typedef int unaligned_int;
#endif

		// 4 bytes of packed 2-bit selectors, then the exception byte stream
		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));

		// unpack each byte's four 2-bit fields into 16 separate byte lanes
		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));

		// lanes holding the saturated value (3) take their byte from the exception stream
		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		// scatter exception bytes into their lanes via the precomputed shuffle tables
		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		// advance past the selectors plus the exception bytes actually used
		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		// 8 bytes of packed 4-bit selectors, then the exception byte stream
		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));

		// unpack each byte's two 4-bit fields into 16 separate byte lanes
		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));

		// lanes holding the saturated value (15) take their byte from the exception stream
		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
		int mask16 = _mm_movemask_epi8(mask);
		unsigned char mask0 = (unsigned char)(mask16 & 255);
		unsigned char mask1 = (unsigned char)(mask16 >> 8);

		__m128i shuf = decodeShuffleMask(mask0, mask1);

		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		// group stored verbatim: copy 16 bytes
		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));

		__m128i result = rest;

		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // This can never happen since bitslog2 is a 2-bit value
		return data;
	}
}
|
||||
#endif
|
||||
|
||||
#ifdef SIMD_NEON
|
||||
// Scatters exception bytes (rest0 for lanes 0-7, rest1 for lanes 8-15) into
// the lanes flagged by each 8-bit mask, using the precomputed vtbl shuffles;
// 0x80 table entries are out of range for vtbl1 and yield zero lanes.
static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
{
	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);

	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
	uint8x8_t r1 = vtbl1_u8(rest1, sm1);

	return vcombine_u8(r0, r1);
}
|
||||
|
||||
// NEON equivalent of SSE movemask: collapses a 16-lane comparison result
// (each lane assumed to be 0x00 or 0xFF) into two 8-bit masks, one bit per
// lane, for the low and high halves.
static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
{
	// per-lane bit weights; ANDed with 0x00/0xFF lanes, summing gives the bitmask
	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};

	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
	uint8x16_t masked = vandq_u8(mask, byte_mask);

#ifdef __aarch64__
	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
	mask0 = vaddv_u8(vget_low_u8(masked));
	mask1 = vaddv_u8(vget_high_u8(masked));
#else
	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
	uint8x8_t sum3 = vpadd_u8(sum2, sum2);

	// after three pairwise folds, lane 0 sums the low half and lane 1 the high half
	mask0 = vget_lane_u8(sum3, 0);
	mask1 = vget_lane_u8(sum3, 1);
#endif
}
|
||||
|
||||
// Transposes the 4x4 matrix of 32-bit lanes spread across four 16-byte
// registers: an 8-bit zip pass followed by a 16-bit zip pass.
static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
{
	// pass 1: interleave bytes of the 0/1 pair and of the 2/3 pair
	uint8x16x2_t t01 = vzipq_u8(x0, x1);
	uint8x16x2_t t23 = vzipq_u8(x2, x3);

	// pass 2: interleave 16-bit pairs across the halves
	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));

	x0 = vreinterpretq_u8_u16(x01.val[0]);
	x1 = vreinterpretq_u8_u16(x01.val[1]);
	x2 = vreinterpretq_u8_u16(x23.val[0]);
	x3 = vreinterpretq_u8_u16(x23.val[1]);
}
|
||||
|
||||
// Per-byte zigzag decode: n -> (n >> 1) ^ -(n & 1), mapping the unsigned
// sequence 0,1,2,3,... back to 0,-1,1,-2,...
static uint8x16_t unzigzag8(uint8x16_t v)
{
	// -(v & 1): all-ones for odd bytes, zero for even bytes
	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
	uint8x16_t xr = vshrq_n_u8(v, 1);

	return veorq_u8(xl, xr);
}
|
||||
|
||||
// NEON path: decodes one 16-byte group. Mirrors the scalar decodeBytesGroup
// contract: bitslog2 selects 0/2/4/8-bit fields, saturated fields pull their
// byte from the exception stream that follows the packed selectors.
// Returns the stream position after the group.
static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
{
	switch (bitslog2)
	{
	case 0:
	{
		// all 16 output bytes are zero; no stream bytes consumed
		uint8x16_t result = vdupq_n_u8(0);

		vst1q_u8(buffer, result);

		return data;
	}

	case 1:
	{
		// 4 bytes of packed 2-bit selectors; unpack each byte's four fields
		// into 16 separate byte lanes
		uint8x8_t sel2 = vld1_u8(data);
		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));

		// lanes holding the saturated value (3) take their byte from the exception stream
		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		// exception bytes follow the selectors; the second half starts after
		// however many bytes the first half consumed
		uint8x8_t rest0 = vld1_u8(data + 4);
		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 2:
	{
		// 8 bytes of packed 4-bit selectors; unpack into 16 byte lanes
		uint8x8_t sel4 = vld1_u8(data);
		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);

		// lanes holding the saturated value (15) take their byte from the exception stream
		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
		unsigned char mask0, mask1;
		neonMoveMask(mask, mask0, mask1);

		uint8x8_t rest0 = vld1_u8(data + 8);
		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);

		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);

		vst1q_u8(buffer, result);

		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
	}

	case 3:
	{
		// group stored verbatim: copy 16 bytes
		uint8x16_t rest = vld1q_u8(data);

		uint8x16_t result = rest;

		vst1q_u8(buffer, result);

		return data + 16;
	}

	default:
		assert(!"Unexpected bit length"); // This can never happen since bitslog2 is a 2-bit value
		return data;
	}
}
|
||||
#endif
|
||||
|
||||
#if defined(SIMD_SSE) || defined(SIMD_NEON)
|
||||
static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
|
||||
{
|
||||
assert(buffer_size % kByteGroupSize == 0);
|
||||
assert(kByteGroupSize == 16);
|
||||
|
||||
const unsigned char* header = data;
|
||||
|
||||
// round number of groups to 4 to get number of header bytes
|
||||
size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
|
||||
|
||||
if (size_t(data_end - data) < header_size)
|
||||
return 0;
|
||||
|
||||
data += header_size;
|
||||
|
||||
size_t i = 0;
|
||||
|
||||
// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=32b
|
||||
for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kTailMaxSize * 4; i += kByteGroupSize * 4)
|
||||
{
|
||||
size_t header_offset = i / kByteGroupSize;
|
||||
unsigned char header_byte = header[header_offset / 4];
|
||||
|
||||
data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
|
||||
data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
|
||||
data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
|
||||
data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
|
||||
}
|
||||
|
||||
// slow-path: process remaining groups
|
||||
for (; i < buffer_size; i += kByteGroupSize)
|
||||
{
|
||||
if (size_t(data_end - data) < kTailMaxSize)
|
||||
return 0;
|
||||
|
||||
size_t header_offset = i / kByteGroupSize;
|
||||
|
||||
int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
|
||||
|
||||
data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
// SIMD variant of decodeVertexBlock: decodes 4 byte streams at a time, then
// uses a register transpose plus in-register prefix sums to undo the
// zigzag/delta encoding and re-interleave the output. last_vertex is updated
// to the block's final vertex. Returns the new stream position, or 0 on
// malformed input. The callers assert vertex_size % 4 == 0, which the k += 4
// loop relies on.
static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
{
	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);

	// room for 4 decoded byte streams, plus the interleaved output staging area
	unsigned char buffer[kVertexBlockMaxSize * 4];
	unsigned char transposed[kVertexBlockSizeBytes];

	// streams are encoded in whole groups, so round the element count up
	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);

	for (size_t k = 0; k < vertex_size; k += 4)
	{
		// decode the 4 zigzag-delta streams for byte positions k..k+3
		for (size_t j = 0; j < 4; ++j)
		{
			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
			if (!data)
				return 0;
		}

// The two intrinsic sets implement the same five operations:
// TEMP    register type holding one 4-byte output column
// PREP()  seed the running prefix pi with bytes k..k+3 of the previous vertex
// LOAD(i) fetch 16 decoded delta bytes of stream i
// GRP4(i) split register i into four 4-byte groups t0..t3
// FIXD(i) prefix-sum group i against pi (delta decode)
// SAVE(i) store 4 output bytes and advance to the next vertex slot
#ifdef SIMD_SSE
#define TEMP __m128i
#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
#endif

#ifdef SIMD_NEON
#define TEMP uint8x8_t
#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
#endif

		PREP();

		unsigned char* savep = transposed + k;

		// process 16 vertices per iteration: transpose 4 streams into per-vertex
		// 4-byte columns, then prefix-sum and scatter them into transposed
		for (size_t j = 0; j < vertex_count_aligned; j += 16)
		{
			LOAD(0);
			LOAD(1);
			LOAD(2);
			LOAD(3);

			r0 = unzigzag8(r0);
			r1 = unzigzag8(r1);
			r2 = unzigzag8(r2);
			r3 = unzigzag8(r3);

			transpose8(r0, r1, r2, r3);

			TEMP t0, t1, t2, t3;

			GRP4(0);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(1);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(2);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

			GRP4(3);
			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
			SAVE(0), SAVE(1), SAVE(2), SAVE(3);

// undefs happen after all textual uses above; their placement inside the loop
// braces is harmless since preprocessing is a single textual pass
#undef TEMP
#undef PREP
#undef LOAD
#undef GRP4
#undef FIXD
#undef SAVE
		}
	}

	memcpy(vertex_data, transposed, vertex_count * vertex_size);

	// seed the next block's delta decoding with this block's last vertex
	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);

	return data;
}
|
||||
#endif
|
||||
|
||||
} // namespace meshopt
|
||||
|
||||
// Encodes vertex_count vertices of vertex_size bytes each into buffer.
// Stream layout: 1-byte format header, a sequence of encoded vertex blocks,
// then the first vertex padded to at least kTailMaxSize bytes as a tail.
// Returns the number of bytes written, or 0 if buffer_size is too small
// (size it with meshopt_encodeVertexBufferBound).
size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

#if TRACE
	memset(vertexstats, 0, sizeof(vertexstats));
#endif

	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);

	unsigned char* data = buffer;
	unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return 0;

	*data++ = kVertexHeader;

	// delta baseline: the first vertex (all zeroes when there are no vertices)
	unsigned char last_vertex[256] = {};
	if (vertex_count > 0)
		memcpy(last_vertex, vertex_data, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	// encode fixed-size blocks; last_vertex carries the delta chain across blocks
	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return 0;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	if (size_t(data_end - data) < tail_size)
		return 0;

	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
	if (vertex_size < kTailMaxSize)
	{
		memset(data, 0, kTailMaxSize - vertex_size);
		data += kTailMaxSize - vertex_size;
	}

	memcpy(data, vertex_data, vertex_size);
	data += vertex_size;

	assert(data >= buffer + tail_size);
	assert(data <= buffer + buffer_size);

#if TRACE
	// per-byte-stream compression statistics, filled in by encodeVertexBlock
	size_t total_size = data - buffer;

	for (size_t k = 0; k < vertex_size; ++k)
	{
		const Stats& vsk = vertexstats[k];

		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);

#if TRACE > 1
		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
		       int(vsk.header),
		       int(vsk.bitg[0]), int(vsk.bitb[0]),
		       int(vsk.bitg[1]), int(vsk.bitb[1]),
		       int(vsk.bitg[2]), int(vsk.bitb[2]),
		       int(vsk.bitg[3]), int(vsk.bitb[3]));
#endif

		printf("\n");
	}
#endif

	return data - buffer;
}
|
||||
|
||||
size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
|
||||
{
|
||||
using namespace meshopt;
|
||||
|
||||
assert(vertex_size > 0 && vertex_size <= 256);
|
||||
assert(vertex_size % 4 == 0);
|
||||
|
||||
size_t vertex_block_size = getVertexBlockSize(vertex_size);
|
||||
size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
|
||||
|
||||
size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
|
||||
size_t vertex_block_data_size = vertex_block_size;
|
||||
|
||||
size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
|
||||
|
||||
return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
|
||||
}
|
||||
|
||||
// Decodes a stream produced by meshopt_encodeVertexBuffer into destination
// (vertex_count * vertex_size bytes). Returns 0 on success or a negative
// error code: -1 unrecognized format header, -2 truncated/invalid stream,
// -3 stream size mismatch.
int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size % 4 == 0);

	// block decoder chosen at runtime; scalar and SIMD paths share this signature
	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;

#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
	// the SSE path needs _mm_shuffle_epi8; ECX bit 9 of CPUID leaf 1 is the SSSE3 feature bit
	int cpuinfo[4] = {};
	__cpuid(cpuinfo, 1);
	decode = (cpuinfo[2] & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
#elif defined(SIMD_SSE) || defined(SIMD_NEON)
	decode = decodeVertexBlockSimd;
#else
	decode = decodeVertexBlock;
#endif

#if defined(SIMD_SSE) || defined(SIMD_NEON)
	// shuffle tables must have been built during static initialization
	assert(gDecodeBytesGroupInitialized);
#endif

	unsigned char* vertex_data = static_cast<unsigned char*>(destination);

	const unsigned char* data = buffer;
	const unsigned char* data_end = buffer + buffer_size;

	if (size_t(data_end - data) < 1 + vertex_size)
		return -2;

	if (*data++ != kVertexHeader)
		return -1;

	// the encoder stores the first vertex at the end of the stream; use it to
	// seed delta decoding of the first block
	unsigned char last_vertex[256];
	memcpy(last_vertex, data_end - vertex_size, vertex_size);

	size_t vertex_block_size = getVertexBlockSize(vertex_size);

	size_t vertex_offset = 0;

	while (vertex_offset < vertex_count)
	{
		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;

		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
		if (!data)
			return -2;

		vertex_offset += block_size;
	}

	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;

	// after all blocks, exactly the padded tail must remain
	if (size_t(data_end - data) != tail_size)
		return -3;

	return 0;
}
|
|
@ -0,0 +1,58 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
// Estimates the memory bandwidth cost of fetching vertex data for an indexed
// mesh by simulating a direct-mapped cache, and reports the overfetch ratio
// relative to reading each referenced vertex exactly once.
meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
{
	assert(index_count % 3 == 0);
	assert(vertex_size > 0 && vertex_size <= 256);

	meshopt_Allocator allocator;

	meshopt_VertexFetchStatistics result = {};

	// track which vertices are referenced at least once
	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
	memset(vertex_visited, 0, vertex_count);

	const size_t kCacheLine = 64;
	const size_t kCacheSize = 128 * 1024;

	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
	size_t cache[kCacheSize / kCacheLine] = {};

	const size_t cache_slots = sizeof(cache) / sizeof(cache[0]);

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		vertex_visited[index] = 1;

		// cache lines covered by this vertex's byte range
		size_t first_line = (index * vertex_size) / kCacheLine;
		size_t last_line = (index * vertex_size + vertex_size + kCacheLine - 1) / kCacheLine;

		assert(first_line < last_line);

		for (size_t tag = first_line; tag < last_line; ++tag)
		{
			size_t slot = tag % cache_slots;

			// tags are stored off by one so an all-zero cache reads as empty
			if (cache[slot] != tag + 1)
			{
				result.bytes_fetched += kCacheLine;
				cache[slot] = tag + 1;
			}
		}
	}

	size_t unique_vertex_count = 0;

	for (size_t i = 0; i < vertex_count; ++i)
		unique_vertex_count += vertex_visited[i];

	// bytes actually fetched vs the ideal of one fetch per referenced vertex
	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);

	return result;
}
|
|
@ -0,0 +1,74 @@
|
|||
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
|
||||
#include "meshoptimizer.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
// Builds a vertex remap (old index -> new index) that orders vertices by the
// position of their first reference in the index stream; destination entries
// for unreferenced vertices stay ~0u. Returns the number of vertices used.
size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	assert(index_count % 3 == 0);

	// ~0u marks vertices that have not been referenced yet
	memset(destination, -1, vertex_count * sizeof(unsigned int));

	unsigned int fresh = 0;

	for (size_t offset = 0; offset < index_count; ++offset)
	{
		unsigned int vertex = indices[offset];
		assert(vertex < vertex_count);

		// assign new indices in order of first use
		if (destination[vertex] == ~0u)
			destination[vertex] = fresh++;
	}

	assert(fresh <= vertex_count);

	return fresh;
}
|
||||
|
||||
size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
|
||||
{
|
||||
assert(index_count % 3 == 0);
|
||||
assert(vertex_size > 0 && vertex_size <= 256);
|
||||
|
||||
meshopt_Allocator allocator;
|
||||
|
||||
// support in-place optimization
|
||||
if (destination == vertices)
|
||||
{
|
||||
unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
|
||||
memcpy(vertices_copy, vertices, vertex_count * vertex_size);
|
||||
vertices = vertices_copy;
|
||||
}
|
||||
|
||||
// build vertex remap table
|
||||
unsigned int* vertex_remap = allocator.allocate<unsigned int>(vertex_count);
|
||||
memset(vertex_remap, -1, vertex_count * sizeof(unsigned int));
|
||||
|
||||
unsigned int next_vertex = 0;
|
||||
|
||||
for (size_t i = 0; i < index_count; ++i)
|
||||
{
|
||||
unsigned int index = indices[i];
|
||||
assert(index < vertex_count);
|
||||
|
||||
unsigned int& remap = vertex_remap[index];
|
||||
|
||||
if (remap == ~0u) // vertex was not added to destination VB
|
||||
{
|
||||
// add vertex
|
||||
memcpy(static_cast<unsigned char*>(destination) + next_vertex * vertex_size, static_cast<const unsigned char*>(vertices) + index * vertex_size, vertex_size);
|
||||
|
||||
remap = next_vertex++;
|
||||
}
|
||||
|
||||
// modify indices in place
|
||||
indices[i] = remap;
|
||||
}
|
||||
|
||||
assert(next_vertex <= vertex_count);
|
||||
|
||||
return next_vertex;
|
||||
}
|
|
@ -0,0 +1,179 @@
|
|||
// Loader for the demo's packed mesh format: a fixed 64-byte header, a
// per-object table with material names, and meshoptimizer-compressed vertex
// and index streams decoded via an externally supplied decoder (setDecoder).
THREE.OptMeshLoader = (function ()
{
	function OptMeshLoader(manager)
	{
		this.manager = (manager !== undefined) ? manager : THREE.DefaultLoadingManager;
		// resolves to a material library, or null when none was configured
		this.materials = Promise.resolve(null);
	}

	OptMeshLoader.prototype =
	{
		constructor: OptMeshLoader,

		// Fetches the file, then parses it once both the decoder and the
		// materials promise have resolved.
		load: function (url, onLoad, onProgress, onError)
		{
			var scope = this;

			var loader = new THREE.FileLoader(scope.manager);
			loader.setResponseType('arraybuffer');
			loader.setPath(this.path);
			loader.load(url, function (data)
			{
				scope.decoder.ready.then(function ()
				{
					scope.materials.then(function (materials)
					{
						onLoad(scope.parse(data, materials));
					});
				});
			}, onProgress, onError);
		},

		// Sets the decoder; it must expose `ready` (a Promise) plus
		// decodeVertexBuffer and decodeIndexBuffer.
		setDecoder: function (value)
		{
			this.decoder = value;
			return this;
		},

		setPath: function (value)
		{
			this.path = value;
			return this;
		},

		// Supplies an already-loaded material library.
		setMaterials: function (materials)
		{
			this.materials = Promise.resolve(materials);
			return this;
		},

		// Loads a material library asynchronously via MTLLoader.
		setMaterialLib: function (lib)
		{
			var scope = this;

			this.materials = new Promise(function (resolve, reject)
			{
				var loader = new THREE.MTLLoader();
				loader.setPath(scope.path);
				loader.load(lib, function (materials) { materials.preload(); resolve(materials); }, null, reject);
			});

			return this;
		},

		// Decodes the binary payload into a THREE.Group that contains a single
		// mesh with one geometry group (and material) per source object.
		parse: function (data, materials)
		{
			console.time('OptMeshLoader');

			var array = new Uint8Array(data);
			var view = new DataView(data);

			// header: counts, stream sizes, and dequantization transforms;
			// all fields are read little-endian
			var endian = true;
			var magic = view.getUint32(0, endian);
			var objectCount = view.getUint32(4, endian);
			var vertexCount = view.getUint32(8, endian);
			var indexCount = view.getUint32(12, endian);
			var vertexDataSize = view.getUint32(16, endian);
			var indexDataSize = view.getUint32(20, endian);
			var posOffsetX = view.getFloat32(24, endian);
			var posOffsetY = view.getFloat32(28, endian);
			var posOffsetZ = view.getFloat32(32, endian);
			var posScale = view.getFloat32(36, endian);
			var uvOffsetX = view.getFloat32(40, endian);
			var uvOffsetY = view.getFloat32(44, endian);
			var uvScaleX = view.getFloat32(48, endian);
			var uvScaleY = view.getFloat32(52, endian);

			// 0x4D54504F reads as "OPTM" when the bytes are taken in file order
			if (magic != 0x4D54504F)
				throw new Error("Malformed mesh file: unrecognized header");

			// object table (16 bytes per object), then variable-length material
			// name data, then the vertex and index streams
			var objectOffset = 64;
			var objectDataOffset = objectOffset + 16 * objectCount;

			var objectDataSize = 0;

			for (var i = 0; i < objectCount; ++i)
				objectDataSize += view.getUint32(objectOffset + 16 * i + 8, endian);

			var vertexDataOffset = objectDataOffset + objectDataSize;
			var indexDataOffset = vertexDataOffset + vertexDataSize;

			var endOffset = indexDataOffset + indexDataSize;

			// the streams must account for the entire file
			if (endOffset != data.byteLength)
				throw new Error("Malformed mesh file: unexpected input size");

			// decoded layout: 16 bytes per vertex, 4 bytes per index
			var vertexSize = 16;
			var indexSize = 4;

			var vertexBuffer = new ArrayBuffer(vertexCount * vertexSize);
			var vertexBufferU8 = new Uint8Array(vertexBuffer);
			this.decoder.decodeVertexBuffer(vertexBufferU8, vertexCount, vertexSize, array.subarray(vertexDataOffset, vertexDataOffset + vertexDataSize));

			var indexBuffer = new ArrayBuffer(indexCount * indexSize);
			var indexBufferU8 = new Uint8Array(indexBuffer);
			this.decoder.decodeIndexBuffer(indexBufferU8, indexCount, indexSize, array.subarray(indexDataOffset, indexDataOffset + indexDataSize));

			var geometry = new THREE.BufferGeometry();

			// interleaved attributes over the 16-byte vertex: quantized uint16
			// position at offset 0, normalized int8 normal at byte 8, uint16 uv
			// at element 6 of the uint16 view
			// NOTE(review): exact component packing is implied by these
			// strides/offsets - confirm against the exporter that wrote the file
			geometry.addAttribute('position', new THREE.InterleavedBufferAttribute(new THREE.InterleavedBuffer(new Uint16Array(vertexBuffer), 8), 3, 0, false));
			geometry.addAttribute('normal', new THREE.InterleavedBufferAttribute(new THREE.InterleavedBuffer(new Int8Array(vertexBuffer), 16), 3, 8, true));
			geometry.addAttribute('uv', new THREE.InterleavedBufferAttribute(new THREE.InterleavedBuffer(new Uint16Array(vertexBuffer), 8), 2, 6, false));
			geometry.setIndex(new THREE.BufferAttribute(new Uint32Array(indexBuffer), 1, false));

			var objectDataOffsetAcc = objectDataOffset;

			// materials are deduplicated by name; geometry groups reference
			// them by their position in objectMaterials
			var objectMaterials = [];
			var objectMaterialsLookup = {};

			for (var i = 0; i < objectCount; i++)
			{
				var objectIndexOffset = view.getUint32(objectOffset + 16 * i + 0, endian);
				var objectIndexCount = view.getUint32(objectOffset + 16 * i + 4, endian);
				var objectMaterialLength = view.getUint32(objectOffset + 16 * i + 8, endian);

				var objectMaterialName = String.fromCharCode.apply(null, array.subarray(objectDataOffsetAcc, objectDataOffsetAcc + objectMaterialLength));
				var objectMaterialIndex = objectMaterialsLookup[objectMaterialName];

				if (objectMaterialIndex == undefined)
				{
					var objectMaterial = null;

					if (materials !== null)
						objectMaterial = materials.create(objectMaterialName);

					// fall back to a default material when the library has none
					if (!objectMaterial)
						objectMaterial = new THREE.MeshPhongMaterial();

					// apply uv dequantization through the texture transform
					if (objectMaterial.map)
					{
						objectMaterial.map.offset.set(uvOffsetX, uvOffsetY);
						objectMaterial.map.repeat.set(uvScaleX, uvScaleY);
					}

					objectMaterialIndex = objectMaterials.length;
					objectMaterialsLookup[objectMaterialName] = objectMaterialIndex;
					objectMaterials.push(objectMaterial);
				}

				geometry.addGroup(objectIndexOffset, objectIndexCount, objectMaterialIndex);

				objectDataOffsetAcc += objectMaterialLength;
			}

			var mesh = new THREE.Mesh(geometry, objectMaterials);
			// position dequantization: scale/offset decoded positions via the node transform
			mesh.position.set(posOffsetX, posOffsetY, posOffsetZ);
			mesh.scale.set(posScale, posScale, posScale);

			var container = new THREE.Group();
			container.add(mesh);

			console.timeEnd('OptMeshLoader');

			return container;
		}
	};

	return OptMeshLoader;
})();
|
|
@ -0,0 +1,630 @@
|
|||
#define _CRT_SECURE_NO_WARNINGS
|
||||
|
||||
#include "../src/meshoptimizer.h"
|
||||
#include "objparser.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <ctime>
|
||||
#include <vector>
|
||||
|
||||
#include <GLFW/glfw3.h>
|
||||
|
||||
#ifdef GLTF
|
||||
#define CGLTF_IMPLEMENTATION
|
||||
#include "cgltf.h"
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
#pragma comment(lib, "opengl32.lib")
|
||||
#endif
|
||||
|
||||
extern unsigned char* meshopt_simplifyDebugKind;
|
||||
extern unsigned int* meshopt_simplifyDebugLoop;
|
||||
|
||||
#ifndef TRACE
|
||||
unsigned char* meshopt_simplifyDebugKind;
|
||||
unsigned int* meshopt_simplifyDebugLoop;
|
||||
#endif
|
||||
|
||||
// Run-time viewer settings, toggled from keyhandler().
struct Options
{
	bool wireframe; // draw polygons as outlines (GL_LINE) instead of filled (GL_FILL)

	// Active visualization mode; see display() for how each mode colors vertices.
	enum
	{
		Mode_Default, // depth-based grayscale shading
		Mode_Texture, // fractional texture coordinates mapped to red/green
		Mode_Normals, // normals remapped from [-1,1] to [0,1] as RGB
		Mode_UV,      // mesh drawn flat in UV space
		Mode_Kind,    // overlay of simplifier debug kind/loop data
	} mode;
};
|
||||
|
||||
// Interleaved vertex layout used throughout the viewer.
struct Vertex
{
	float px, py, pz; // position
	float nx, ny, nz; // normal; zeroed when the source mesh has none (see parseObj)
	float tx, ty;     // texture coordinates; zeroed when absent
};
|
||||
|
||||
// Indexed triangle mesh plus optional simplifier debug channels.
struct Mesh
{
	std::vector<Vertex> vertices;
	std::vector<unsigned int> indices; // triangle list; 3 entries per triangle

	// TODO: this is debug only visualization and will go away at some point
	std::vector<unsigned char> kinds; // per-vertex kind, filled via meshopt_simplifyDebugKind in optimize()
	std::vector<unsigned int> loop;   // per-vertex loop neighbor, filled via meshopt_simplifyDebugLoop in optimize()
};
|
||||
|
||||
// Loads a Wavefront .obj file into an indexed Mesh.
// Faces are first flattened into one Vertex per corner, then deduplicated with
// meshopt_generateVertexRemap. Returns an empty Mesh on load/validation failure.
Mesh parseObj(const char* path)
{
	ObjFile file;

	if (!objParseFile(file, path) || !objValidate(file))
	{
		printf("Error loading %s\n", path);
		return Mesh();
	}

	// f stores 3 ints (v/vt/vn) per face corner, so the corner count is f_size / 3.
	size_t total_indices = file.f_size / 3;

	std::vector<Vertex> vertices(total_indices);

	for (size_t i = 0; i < total_indices; ++i)
	{
		int vi = file.f[i * 3 + 0];
		int vti = file.f[i * 3 + 1];
		int vni = file.f[i * 3 + 2];

		// Missing attributes (vti/vni < 0) are replaced with zeros.
		Vertex v =
		{
			file.v[vi * 3 + 0],
			file.v[vi * 3 + 1],
			file.v[vi * 3 + 2],

			vni >= 0 ? file.vn[vni * 3 + 0] : 0,
			vni >= 0 ? file.vn[vni * 3 + 1] : 0,
			vni >= 0 ? file.vn[vni * 3 + 2] : 0,

			vti >= 0 ? file.vt[vti * 3 + 0] : 0,
			vti >= 0 ? file.vt[vti * 3 + 1] : 0,
		};

		vertices[i] = v;
	}

	Mesh result;

	// Build a remap table that collapses identical vertices; NULL index buffer
	// means the input is treated as an unindexed triangle list.
	std::vector<unsigned int> remap(total_indices);
	size_t total_vertices = meshopt_generateVertexRemap(&remap[0], NULL, total_indices, &vertices[0], total_indices, sizeof(Vertex));

	result.indices.resize(total_indices);
	meshopt_remapIndexBuffer(&result.indices[0], NULL, total_indices, &remap[0]);

	result.vertices.resize(total_vertices);
	meshopt_remapVertexBuffer(&result.vertices[0], &vertices[0], total_indices, sizeof(Vertex), &remap[0]);

	return result;
}
|
||||
|
||||
#ifdef GLTF
|
||||
// Finds the accessor for the first attribute matching both the semantic type
// and the attribute set index; returns 0 when no such attribute exists.
cgltf_accessor* getAccessor(const cgltf_attribute* attributes, size_t attribute_count, cgltf_attribute_type type, int index = 0)
{
	for (size_t ai = 0; ai < attribute_count; ++ai)
	{
		const cgltf_attribute& attr = attributes[ai];

		if (attr.type == type && attr.index == index)
			return attr.data;
	}

	return 0;
}
|
||||
|
||||
template <typename T>
|
||||
const T* getComponentPtr(const cgltf_accessor* a)
|
||||
{
|
||||
const char* buffer = (char*)a->buffer_view->buffer->data;
|
||||
size_t offset = a->offset + a->buffer_view->offset;
|
||||
|
||||
return reinterpret_cast<const T*>(&buffer[offset]);
|
||||
}
|
||||
|
||||
// Loads a glTF/glb file into a single merged Mesh: all primitives of all mesh
// nodes are concatenated, with positions/normals transformed into world space.
// Returns an empty Mesh on any parse/load/validation failure.
Mesh parseGltf(const char* path)
{
	cgltf_options options = {};
	cgltf_data* data = 0;
	cgltf_result res = cgltf_parse_file(&options, path, &data);

	if (res != cgltf_result_success)
	{
		return Mesh();
	}

	res = cgltf_load_buffers(&options, data, path);
	if (res != cgltf_result_success)
	{
		cgltf_free(data);
		return Mesh();
	}

	res = cgltf_validate(data);
	if (res != cgltf_result_success)
	{
		cgltf_free(data);
		return Mesh();
	}

	// First pass: size the output by summing counts over all indexed primitives.
	size_t total_vertices = 0;
	size_t total_indices = 0;

	for (size_t ni = 0; ni < data->nodes_count; ++ni)
	{
		if (!data->nodes[ni].mesh)
			continue;

		const cgltf_mesh& mesh = *data->nodes[ni].mesh;

		for (size_t pi = 0; pi < mesh.primitives_count; ++pi)
		{
			const cgltf_primitive& primitive = mesh.primitives[pi];

			cgltf_accessor* ai = primitive.indices;
			cgltf_accessor* ap = getAccessor(primitive.attributes, primitive.attributes_count, cgltf_attribute_type_position);

			// Primitives without an index buffer or positions are skipped (here and below).
			if (!ai || !ap)
				continue;

			total_vertices += ap->count;
			total_indices += ai->count;
		}
	}

	Mesh result;
	result.vertices.resize(total_vertices);
	result.indices.resize(total_indices);

	// Second pass: append each primitive's data, rebasing indices by vertex_offset.
	size_t vertex_offset = 0;
	size_t index_offset = 0;

	for (size_t ni = 0; ni < data->nodes_count; ++ni)
	{
		if (!data->nodes[ni].mesh)
			continue;

		const cgltf_mesh& mesh = *data->nodes[ni].mesh;

		// Node-to-world matrix, column-major (see usage below).
		float transform[16];
		cgltf_node_transform_world(&data->nodes[ni], transform);

		for (size_t pi = 0; pi < mesh.primitives_count; ++pi)
		{
			const cgltf_primitive& primitive = mesh.primitives[pi];

			cgltf_accessor* ai = primitive.indices;
			cgltf_accessor* ap = getAccessor(primitive.attributes, primitive.attributes_count, cgltf_attribute_type_position);

			if (!ai || !ap)
				continue;

			// Indices: 32-bit read directly; everything else is read as 16-bit.
			// NOTE(review): 8-bit index accessors would be misread here — confirm inputs.
			if (ai->component_type == cgltf_component_type_r_32u)
			{
				const unsigned int* ptr = getComponentPtr<unsigned int>(ai);

				for (size_t i = 0; i < ai->count; ++i)
					result.indices[index_offset + i] = unsigned(vertex_offset + ptr[i]);
			}
			else
			{
				const unsigned short* ptr = getComponentPtr<unsigned short>(ai);

				for (size_t i = 0; i < ai->count; ++i)
					result.indices[index_offset + i] = unsigned(vertex_offset + ptr[i]);
			}

			// Positions: full affine transform into world space.
			{
				const float* ptr = getComponentPtr<float>(ap);

				for (size_t i = 0; i < ap->count; ++i)
				{
					result.vertices[vertex_offset + i].px = ptr[0] * transform[0] + ptr[1] * transform[4] + ptr[2] * transform[8] + transform[12];
					result.vertices[vertex_offset + i].py = ptr[0] * transform[1] + ptr[1] * transform[5] + ptr[2] * transform[9] + transform[13];
					result.vertices[vertex_offset + i].pz = ptr[0] * transform[2] + ptr[1] * transform[6] + ptr[2] * transform[10] + transform[14];
					ptr += ap->stride / 4;
				}
			}

			// Normals: rotated/scaled by the upper 3x3 without renormalization.
			// NOTE(review): loop bound reuses ap->count (position count), assuming the
			// normal accessor has the same count — TODO confirm against spec/inputs.
			if (cgltf_accessor* an = getAccessor(primitive.attributes, primitive.attributes_count, cgltf_attribute_type_normal))
			{
				const float* ptr = getComponentPtr<float>(an);

				for (size_t i = 0; i < ap->count; ++i)
				{
					result.vertices[vertex_offset + i].nx = ptr[0] * transform[0] + ptr[1] * transform[4] + ptr[2] * transform[8];
					result.vertices[vertex_offset + i].ny = ptr[0] * transform[1] + ptr[1] * transform[5] + ptr[2] * transform[9];
					result.vertices[vertex_offset + i].nz = ptr[0] * transform[2] + ptr[1] * transform[6] + ptr[2] * transform[10];
					ptr += an->stride / 4;
				}
			}

			// Texcoords: copied untransformed; same ap->count assumption as above.
			if (cgltf_accessor* at = getAccessor(primitive.attributes, primitive.attributes_count, cgltf_attribute_type_texcoord))
			{
				const float* ptr = getComponentPtr<float>(at);

				for (size_t i = 0; i < ap->count; ++i)
				{
					result.vertices[vertex_offset + i].tx = ptr[0];
					result.vertices[vertex_offset + i].ty = ptr[1];
					ptr += at->stride / 4;
				}
			}

			vertex_offset += ap->count;
			index_offset += ai->count;
		}
	}

	// Deduplicate vertices shared across primitives and shrink the vertex buffer.
	std::vector<unsigned int> remap(total_indices);
	size_t unique_vertices = meshopt_generateVertexRemap(&remap[0], &result.indices[0], total_indices, &result.vertices[0], total_vertices, sizeof(Vertex));

	meshopt_remapIndexBuffer(&result.indices[0], &result.indices[0], total_indices, &remap[0]);
	meshopt_remapVertexBuffer(&result.vertices[0], &result.vertices[0], total_vertices, sizeof(Vertex), &remap[0]);

	result.vertices.resize(unique_vertices);

	cgltf_free(data);

	return result;
}
|
||||
#endif
|
||||
|
||||
// Dispatches to the loader matching the extension found in the path;
// returns an empty Mesh for unrecognized formats.
Mesh loadMesh(const char* path)
{
	if (strstr(path, ".obj") != 0)
		return parseObj(path);

#ifdef GLTF
	if (strstr(path, ".gltf") != 0 || strstr(path, ".glb") != 0)
		return parseGltf(path);
#endif

	return Mesh();
}
|
||||
|
||||
bool saveObj(const Mesh& mesh, const char* path)
|
||||
{
|
||||
std::vector<Vertex> verts = mesh.vertices;
|
||||
std::vector<unsigned int> tris = mesh.indices;
|
||||
size_t vertcount = meshopt_optimizeVertexFetch(verts.data(), tris.data(), tris.size(), verts.data(), verts.size(), sizeof(Vertex));
|
||||
|
||||
FILE* obj = fopen(path, "w");
|
||||
if (!obj)
|
||||
return false;
|
||||
|
||||
for (size_t i = 0; i < vertcount; ++i)
|
||||
{
|
||||
fprintf(obj, "v %f %f %f\n", verts[i].px, verts[i].py, verts[i].pz);
|
||||
fprintf(obj, "vn %f %f %f\n", verts[i].nx, verts[i].ny, verts[i].nz);
|
||||
fprintf(obj, "vt %f %f %f\n", verts[i].tx, verts[i].ty, 0.f);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < tris.size(); i += 3)
|
||||
{
|
||||
unsigned int i0 = tris[i + 0] + 1;
|
||||
unsigned int i1 = tris[i + 1] + 1;
|
||||
unsigned int i2 = tris[i + 2] + 1;
|
||||
|
||||
fprintf(obj, "f %d/%d/%d %d/%d/%d %d/%d/%d\n", i0, i0, i0, i1, i1, i1, i2, i2, i2);
|
||||
}
|
||||
|
||||
fclose(obj);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds LOD level `lod` of `mesh`: each level targets half the triangle
// count of the previous one (level 0 keeps everything).
Mesh optimize(const Mesh& mesh, int lod)
{
	float threshold = powf(0.5f, float(lod));
	size_t target_index_count = size_t(mesh.indices.size() * threshold);
	float target_error = 1e-2f; // simplification error budget passed to meshopt_simplify

	Mesh result = mesh;
	// Route simplifier debug output into the result's kind/loop channels.
	// NOTE(review): meshopt_simplifyDebugKind/Loop are globals pointing into
	// `result`'s storage during the call — not safe for concurrent use.
	result.kinds.resize(result.vertices.size());
	result.loop.resize(result.vertices.size());
	meshopt_simplifyDebugKind = &result.kinds[0];
	meshopt_simplifyDebugLoop = &result.loop[0];
	// meshopt_simplify works in place on the copied index buffer and returns
	// the new (smaller) index count.
	result.indices.resize(meshopt_simplify(&result.indices[0], &result.indices[0], mesh.indices.size(), &mesh.vertices[0].px, mesh.vertices.size(), sizeof(Vertex), target_index_count, target_error));

	return result;
}
|
||||
|
||||
// Renders one mesh into the viewport tile (x, y, width, height) using
// fixed-function OpenGL. The mesh is auto-centered and auto-scaled from its
// bounding extents; coloring depends on options.mode. In Mode_Kind, simplifier
// debug loops (lines) and non-manifold vertices (points) are overlaid.
void display(int x, int y, int width, int height, const Mesh& mesh, const Options& options)
{
	glViewport(x, y, width, height);
	glEnable(GL_DEPTH_TEST);
	glDepthFunc(GL_LESS);
	glDepthMask(GL_TRUE);

	glMatrixMode(GL_MODELVIEW);
	glLoadIdentity();
	glRotatef(0.f, 0.f, 1.f, 0.f);

	glPolygonMode(GL_FRONT_AND_BACK, options.wireframe ? GL_LINE : GL_FILL);

	// Compute the vertex centroid in both position and UV space.
	float centerx = 0;
	float centery = 0;
	float centerz = 0;
	float centeru = 0;
	float centerv = 0;

	for (size_t i = 0; i < mesh.vertices.size(); ++i)
	{
		const Vertex& v = mesh.vertices[i];

		centerx += v.px;
		centery += v.py;
		centerz += v.pz;
		centeru += v.tx;
		centerv += v.ty;
	}

	centerx /= float(mesh.vertices.size());
	centery /= float(mesh.vertices.size());
	centerz /= float(mesh.vertices.size());
	centeru /= float(mesh.vertices.size());
	centerv /= float(mesh.vertices.size());

	// Largest axis distance from the centroid, used as a uniform scale.
	float extent = 0;
	float extentuv = 0;

	for (size_t i = 0; i < mesh.vertices.size(); ++i)
	{
		const Vertex& v = mesh.vertices[i];

		extent = std::max(extent, fabsf(v.px - centerx));
		extent = std::max(extent, fabsf(v.py - centery));
		extent = std::max(extent, fabsf(v.pz - centerz));
		extentuv = std::max(extentuv, fabsf(v.tx - centeru));
		extentuv = std::max(extentuv, fabsf(v.ty - centerv));
	}

	// Small margin so the mesh doesn't touch the tile edges.
	extent *= 1.1f;
	extentuv *= 1.1f;

	// Compensate for the tile's aspect ratio.
	float scalex = width > height ? float(height) / float(width) : 1;
	float scaley = height > width ? float(width) / float(height) : 1;

	glBegin(GL_TRIANGLES);

	for (size_t i = 0; i < mesh.indices.size(); ++i)
	{
		const Vertex& v = mesh.vertices[mesh.indices[i]];

		// Depth-based grayscale: nearer (larger -z offset) is brighter.
		float intensity = -(v.pz - centerz) / extent * 0.5f + 0.5f;

		switch (options.mode)
		{
		case Options::Mode_UV:
			glColor3f(intensity, intensity, intensity);
			glVertex3f((v.tx - centeru) / extentuv * scalex, (v.ty - centerv) / extentuv * scaley, 0);
			break;

		case Options::Mode_Texture:
			glColor3f(v.tx - floorf(v.tx), v.ty - floorf(v.ty), 0.5f);
			glVertex3f((v.px - centerx) / extent * scalex, (v.py - centery) / extent * scaley, (v.pz - centerz) / extent);
			break;

		case Options::Mode_Normals:
			glColor3f(v.nx * 0.5f + 0.5f, v.ny * 0.5f + 0.5f, v.nz * 0.5f + 0.5f);
			glVertex3f((v.px - centerx) / extent * scalex, (v.py - centery) / extent * scaley, (v.pz - centerz) / extent);
			break;

		default:
			glColor3f(intensity, intensity, intensity);
			glVertex3f((v.px - centerx) / extent * scalex, (v.py - centery) / extent * scaley, (v.pz - centerz) / extent);
		}
	}

	glEnd();

	// Depth bias so debug lines/points draw on top of the triangles.
	float zbias = 1e-3f;

	if (options.mode == Options::Mode_Kind && !mesh.kinds.empty() && !mesh.loop.empty())
	{
		glLineWidth(1);

		glBegin(GL_LINES);

		// Draw a line from each referenced vertex to its loop neighbor (if any).
		for (size_t i = 0; i < mesh.indices.size(); ++i)
		{
			unsigned int a = mesh.indices[i];
			unsigned int b = mesh.loop[a];

			if (b != ~0u) // ~0u marks "no loop neighbor"
			{
				const Vertex& v0 = mesh.vertices[a];
				const Vertex& v1 = mesh.vertices[b];

				unsigned char kind = mesh.kinds[a];

				// Kind is encoded as an RGB combination of boolean channels.
				glColor3f(kind == 0 || kind == 4, kind == 0 || kind == 2 || kind == 3, kind == 0 || kind == 1 || kind == 3);
				glVertex3f((v0.px - centerx) / extent * scalex, (v0.py - centery) / extent * scaley, (v0.pz - centerz) / extent - zbias);
				glVertex3f((v1.px - centerx) / extent * scalex, (v1.py - centery) / extent * scaley, (v1.pz - centerz) / extent - zbias);
			}
		}

		glEnd();

		glPointSize(3);

		glBegin(GL_POINTS);

		// Highlight all vertices with a non-default kind.
		for (size_t i = 0; i < mesh.indices.size(); ++i)
		{
			const Vertex& v = mesh.vertices[mesh.indices[i]];
			unsigned char kind = mesh.kinds[mesh.indices[i]];

			if (kind != 0)
			{
				glColor3f(kind == 0 || kind == 4, kind == 0 || kind == 2 || kind == 3, kind == 0 || kind == 1 || kind == 3);
				glVertex3f((v.px - centerx) / extent * scalex, (v.py - centery) / extent * scaley, (v.pz - centerz) / extent - zbias * 2);
			}
		}

		glEnd();
	}
}
|
||||
|
||||
// Shows the current LOD statistics in the window title bar;
// `time` is in seconds and displayed as milliseconds.
void stats(GLFWwindow* window, const char* path, unsigned int triangles, int lod, double time)
{
	char buf[256];
	snprintf(buf, sizeof(buf), "%s: LOD %d - %d triangles (%.1f msec)", path, lod, triangles, time * 1000);

	glfwSetWindowTitle(window, buf);
}
|
||||
|
||||
// One input file: the mesh as loaded plus the LOD currently on screen.
struct File
{
	Mesh basemesh;    // original mesh as loaded from disk
	Mesh lodmesh;     // simplified mesh currently being displayed
	const char* path; // source path; points into argv (see main)
};

std::vector<File> files; // all meshes passed on the command line
Options options;         // current visualization settings
bool redraw;             // set by the input/resize handlers when the scene must be re-rendered
|
||||
|
||||
// GLFW key callback.
// W toggles wireframe; T/N/U/K toggle visualization modes; 0-9 rebuild all
// LODs at that level and update the title stats; S saves each LOD to
// resultN.obj in the working directory.
void keyhandler(GLFWwindow* window, int key, int scancode, int action, int mods)
{
	if (action == GLFW_PRESS)
	{
		if (key == GLFW_KEY_W)
		{
			options.wireframe = !options.wireframe;
			redraw = true;
		}
		else if (key == GLFW_KEY_T)
		{
			options.mode = options.mode == Options::Mode_Texture ? Options::Mode_Default : Options::Mode_Texture;
			redraw = true;
		}
		else if (key == GLFW_KEY_N)
		{
			options.mode = options.mode == Options::Mode_Normals ? Options::Mode_Default : Options::Mode_Normals;
			redraw = true;
		}
		else if (key == GLFW_KEY_U)
		{
			options.mode = options.mode == Options::Mode_UV ? Options::Mode_Default : Options::Mode_UV;
			redraw = true;
		}
		else if (key == GLFW_KEY_K)
		{
			options.mode = options.mode == Options::Mode_Kind ? Options::Mode_Default : Options::Mode_Kind;
			redraw = true;
		}
		else if (key >= GLFW_KEY_0 && key <= GLFW_KEY_9)
		{
			int lod = int(key - GLFW_KEY_0);

			unsigned int triangles = 0;

			clock_t start = clock();
			for (auto& f : files)
			{
				f.lodmesh = optimize(f.basemesh, lod);
				triangles += unsigned(f.lodmesh.indices.size() / 3);
			}
			clock_t end = clock();

			stats(window, files[0].path, triangles, lod, double(end - start) / CLOCKS_PER_SEC);
			redraw = true;
		}
		else if (key == GLFW_KEY_S)
		{
			int i = 0;

			for (auto& f : files)
			{
				char path[32];
				// Fix: increment the counter per file so each mesh gets a unique
				// output name (previously every mesh overwrote result0.obj);
				// snprintf for bounds safety, consistent with stats().
				snprintf(path, sizeof(path), "result%d.obj", i++);

				saveObj(f.lodmesh, path);

				printf("Saved LOD of %s to %s\n", f.path, path);
			}
		}
	}
}
|
||||
|
||||
// GLFW window size callback: any resize invalidates the current frame.
void sizehandler(GLFWwindow* window, int width, int height)
{
	redraw = true;
}
|
||||
|
||||
// Loads every mesh given on the command line and runs an interactive viewer
// showing them in a grid; see keyhandler for the keyboard controls.
int main(int argc, char** argv)
{
	if (argc <= 1)
	{
		printf("Usage: %s [.obj files]\n", argv[0]);
		return 0;
	}

	unsigned int basetriangles = 0;

	// Load each input and build the initial (LOD 0) mesh for display.
	for (int i = 1; i < argc; ++i)
	{
		files.emplace_back();
		File& f = files.back();

		f.path = argv[i];
		f.basemesh = loadMesh(f.path);
		f.lodmesh = optimize(f.basemesh, 0);

		basetriangles += unsigned(f.basemesh.indices.size() / 3);
	}

	glfwInit();

	// NOTE(review): glfwCreateWindow can return NULL on failure; not checked here.
	GLFWwindow* window = glfwCreateWindow(640, 480, "Simple example", NULL, NULL);
	glfwMakeContextCurrent(window);

	stats(window, files[0].path, basetriangles, 0, 0);

	glfwSetKeyCallback(window, keyhandler);
	glfwSetWindowSizeCallback(window, sizehandler);

	redraw = true;

	// Event loop: render only when a handler requested it, then block on events.
	while (!glfwWindowShouldClose(window))
	{
		if (redraw)
		{
			redraw = false;

			int width, height;
			glfwGetFramebufferSize(window, &width, &height);

			glViewport(0, 0, width, height);
			glClearDepth(1.f);
			glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

			// Lay the meshes out in a near-square grid of tiles.
			int cols = int(ceil(sqrt(double(files.size()))));
			int rows = int(ceil(double(files.size()) / cols));

			int tilew = width / cols;
			int tileh = height / rows;

			for (size_t i = 0; i < files.size(); ++i)
			{
				File& f = files[i];
				int x = int(i) % cols;
				int y = int(i) / cols;

				display(x * tilew, y * tileh, tilew, tileh, f.lodmesh, options);
			}

			glfwSwapBuffers(window);
		}

		glfwWaitEvents();
	}
}
|
|
@ -0,0 +1,255 @@
|
|||
// Converts .obj files to .optmesh files
|
||||
// Usage: meshencoder [.obj] [.optmesh]
|
||||
|
||||
// Data layout:
|
||||
// Header: 64b
|
||||
// Object table: 16b * object_count
|
||||
// Object data
|
||||
// Vertex data
|
||||
// Index data
|
||||
|
||||
#include "../src/meshoptimizer.h"
|
||||
#include "objparser.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
// .optmesh file header (64 bytes; see the data layout comment at the top of
// this file). Written verbatim with fwrite, so the on-disk layout matches the
// in-memory struct layout.
struct Header
{
	char magic[4]; // OPTM

	unsigned int group_count;      // number of Object entries following the header
	unsigned int vertex_count;     // decoded vertex count
	unsigned int index_count;      // decoded index count
	unsigned int vertex_data_size; // encoded vertex data size in bytes
	unsigned int index_data_size;  // encoded index data size in bytes

	// Dequantization constants: value = quantized * scale + offset.
	float pos_offset[3];
	float pos_scale;
	float uv_offset[2];
	float uv_scale[2];

	unsigned int reserved[2]; // padding to 64 bytes; written as zero
};
|
||||
|
||||
// One object-table entry (16 bytes per object). The material name itself is
// stored separately in the object data section, as material_length bytes.
struct Object
{
	unsigned int index_offset;    // first index of this object within the index buffer
	unsigned int index_count;     // number of indices belonging to this object
	unsigned int material_length; // length of the material name in the object data section
	unsigned int reserved;        // padding; written as zero
};
|
||||
|
||||
// Quantized interleaved vertex (16 bytes) as stored in the .optmesh file.
struct Vertex
{
	unsigned short px, py, pz, pw; // unsigned 16-bit value, use pos_offset/pos_scale to unpack
	char nx, ny, nz, nw;           // normalized signed 8-bit value
	unsigned short tx, ty;         // unsigned 16-bit value, use uv_offset/uv_scale to unpack
};
|
||||
|
||||
// Reciprocal that maps 0 to 0 instead of producing infinity.
float rcpSafe(float v)
{
	if (v == 0.f)
		return 0.f;

	return 1.f / v;
}
|
||||
|
||||
// Converts an .obj file (argv[1]) into an .optmesh file (argv[2]):
// quantizes positions/normals/uvs, deduplicates and reorders vertices, then
// writes compressed vertex/index streams plus a per-group object table.
// Returns a distinct non-zero code for each failure mode.
int main(int argc, char** argv)
{
	if (argc <= 2)
	{
		printf("Usage: %s [.obj] [.optmesh]\n", argv[0]);
		return 1;
	}

	const char* input = argv[1];
	const char* output = argv[2];

	ObjFile file;

	if (!objParseFile(file, input))
	{
		printf("Error loading %s: file not found\n", input);
		return 2;
	}

	if (!objValidate(file))
	{
		printf("Error loading %s: invalid file data\n", input);
		return 3;
	}

	// Position bounds: offset = per-axis minimum, scale = largest extent
	// over all axes (uniform, see the comment in the quantization loop).
	float pos_offset[3] = { FLT_MAX, FLT_MAX, FLT_MAX };
	float pos_scale = 0.f;

	for (size_t i = 0; i < file.v_size; i += 3)
	{
		pos_offset[0] = std::min(pos_offset[0], file.v[i + 0]);
		pos_offset[1] = std::min(pos_offset[1], file.v[i + 1]);
		pos_offset[2] = std::min(pos_offset[2], file.v[i + 2]);
	}

	for (size_t i = 0; i < file.v_size; i += 3)
	{
		pos_scale = std::max(pos_scale, file.v[i + 0] - pos_offset[0]);
		pos_scale = std::max(pos_scale, file.v[i + 1] - pos_offset[1]);
		pos_scale = std::max(pos_scale, file.v[i + 2] - pos_offset[2]);
	}

	// UV bounds, per-component (vt entries are stored as 3 floats, hence step 3).
	float uv_offset[2] = { FLT_MAX, FLT_MAX };
	float uv_scale[2] = { 0, 0 };

	for (size_t i = 0; i < file.vt_size; i += 3)
	{
		uv_offset[0] = std::min(uv_offset[0], file.vt[i + 0]);
		uv_offset[1] = std::min(uv_offset[1], file.vt[i + 1]);
	}

	for (size_t i = 0; i < file.vt_size; i += 3)
	{
		uv_scale[0] = std::max(uv_scale[0], file.vt[i + 0] - uv_offset[0]);
		uv_scale[1] = std::max(uv_scale[1], file.vt[i + 1] - uv_offset[1]);
	}

	float pos_scale_inverse = rcpSafe(pos_scale);
	float uv_scale_inverse[2] = { rcpSafe(uv_scale[0]), rcpSafe(uv_scale[1]) };

	size_t total_indices = file.f_size / 3;

	// One quantized vertex per face corner, before deduplication.
	std::vector<Vertex> triangles(total_indices);

	// Quantization precision: positions to 14 bits, uvs to 12 bits.
	int pos_bits = 14;
	int uv_bits = 12;

	for (size_t i = 0; i < total_indices; ++i)
	{
		int vi = file.f[i * 3 + 0];
		int vti = file.f[i * 3 + 1];
		int vni = file.f[i * 3 + 2];

		// note: we scale the vertices uniformly; this is not the best option wrt compression quality
		// however, it means we can scale the mesh uniformly without distorting the normals
		// this is helpful for backends like ThreeJS that apply mesh scaling to normals
		float px = (file.v[vi * 3 + 0] - pos_offset[0]) * pos_scale_inverse;
		float py = (file.v[vi * 3 + 1] - pos_offset[1]) * pos_scale_inverse;
		float pz = (file.v[vi * 3 + 2] - pos_offset[2]) * pos_scale_inverse;

		// normal is 0 if absent from the mesh
		float nx = vni >= 0 ? file.vn[vni * 3 + 0] : 0;
		float ny = vni >= 0 ? file.vn[vni * 3 + 1] : 0;
		float nz = vni >= 0 ? file.vn[vni * 3 + 2] : 0;

		// scale the normal to make sure the largest component is +-1.0
		// this reduces the entropy of the normal by ~1.5 bits without losing precision
		// it's better to use octahedral encoding but that requires special shader support
		float nm = std::max(fabsf(nx), std::max(fabsf(ny), fabsf(nz)));
		float ns = nm == 0.f ? 0.f : 1 / nm;

		nx *= ns;
		ny *= ns;
		nz *= ns;

		// texture coordinates are 0 if absent, and require a texture matrix to decode
		float tx = vti >= 0 ? (file.vt[vti * 3 + 0] - uv_offset[0]) * uv_scale_inverse[0] : 0;
		float ty = vti >= 0 ? (file.vt[vti * 3 + 1] - uv_offset[1]) * uv_scale_inverse[1] : 0;

		Vertex v =
		{
			(unsigned short)(meshopt_quantizeUnorm(px, pos_bits)),
			(unsigned short)(meshopt_quantizeUnorm(py, pos_bits)),
			(unsigned short)(meshopt_quantizeUnorm(pz, pos_bits)),
			0,

			char(meshopt_quantizeSnorm(nx, 8)),
			char(meshopt_quantizeSnorm(ny, 8)),
			char(meshopt_quantizeSnorm(nz, 8)),
			0,

			(unsigned short)(meshopt_quantizeUnorm(tx, uv_bits)),
			(unsigned short)(meshopt_quantizeUnorm(ty, uv_bits)),
		};

		triangles[i] = v;
	}

	// Deduplicate identical quantized vertices and build the index buffer.
	std::vector<unsigned int> remap(total_indices);

	size_t total_vertices = meshopt_generateVertexRemap(&remap[0], NULL, total_indices, &triangles[0], total_indices, sizeof(Vertex));

	std::vector<unsigned int> indices(total_indices);
	meshopt_remapIndexBuffer(&indices[0], NULL, total_indices, &remap[0]);

	std::vector<Vertex> vertices(total_vertices);
	meshopt_remapVertexBuffer(&vertices[0], &triangles[0], total_indices, sizeof(Vertex), &remap[0]);

	// Optimize each group's index range independently (groups are drawn separately).
	for (size_t i = 0; i < file.g_size; ++i)
	{
		ObjGroup& g = file.g[i];

		meshopt_optimizeVertexCache(&indices[g.index_offset], &indices[g.index_offset], g.index_count, vertices.size());
	}

	meshopt_optimizeVertexFetch(&vertices[0], &indices[0], indices.size(), &vertices[0], vertices.size(), sizeof(Vertex));

	// Compress both streams; resize to the actual encoded sizes.
	std::vector<unsigned char> vbuf(meshopt_encodeVertexBufferBound(vertices.size(), sizeof(Vertex)));
	vbuf.resize(meshopt_encodeVertexBuffer(&vbuf[0], vbuf.size(), &vertices[0], vertices.size(), sizeof(Vertex)));

	std::vector<unsigned char> ibuf(meshopt_encodeIndexBufferBound(indices.size(), vertices.size()));
	ibuf.resize(meshopt_encodeIndexBuffer(&ibuf[0], ibuf.size(), &indices[0], indices.size()));

	FILE* result = fopen(output, "wb");
	if (!result)
	{
		printf("Error saving %s: can't open file for writing\n", output);
		return 4;
	}

	Header header = {};
	memcpy(header.magic, "OPTM", 4);

	header.group_count = unsigned(file.g_size);
	header.vertex_count = unsigned(vertices.size());
	header.index_count = unsigned(indices.size());
	header.vertex_data_size = unsigned(vbuf.size());
	header.index_data_size = unsigned(ibuf.size());

	// Scale converts the quantized integer range back to the original units.
	header.pos_offset[0] = pos_offset[0];
	header.pos_offset[1] = pos_offset[1];
	header.pos_offset[2] = pos_offset[2];
	header.pos_scale = pos_scale / float((1 << pos_bits) - 1);

	header.uv_offset[0] = uv_offset[0];
	header.uv_offset[1] = uv_offset[1];
	header.uv_scale[0] = uv_scale[0] / float((1 << uv_bits) - 1);
	header.uv_scale[1] = uv_scale[1] / float((1 << uv_bits) - 1);

	fwrite(&header, 1, sizeof(header), result);

	// Object table, then material names, then the encoded streams
	// (matches the layout comment at the top of this file).
	for (size_t i = 0; i < file.g_size; ++i)
	{
		ObjGroup& g = file.g[i];

		Object object = {};
		object.index_offset = unsigned(g.index_offset);
		object.index_count = unsigned(g.index_count);
		object.material_length = unsigned(strlen(g.material));

		fwrite(&object, 1, sizeof(object), result);
	}

	for (size_t i = 0; i < file.g_size; ++i)
	{
		ObjGroup& g = file.g[i];

		fwrite(g.material, 1, strlen(g.material), result);
	}

	fwrite(&vbuf[0], 1, vbuf.size(), result);
	fwrite(&ibuf[0], 1, ibuf.size(), result);
	fclose(result);

	return 0;
}
|
|
@ -0,0 +1,383 @@
|
|||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#endif
|
||||
|
||||
#include "objparser.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
// Grows a new[]-allocated array by 1.5x (starting at 32 elements), copying the
// old contents over and releasing the old storage. `data` may be null when
// `capacity` is 0. Updates both out-parameters on return.
template <typename T>
static void growArray(T*& data, size_t& capacity)
{
	size_t newcapacity = capacity == 0 ? 32 : capacity + capacity / 2;
	T* newdata = new T[newcapacity];

	if (data)
	{
		// Fix: element-wise copy instead of memcpy — memcpy is undefined
		// behavior for non-trivially-copyable element types; identical result
		// for the POD types this parser uses.
		for (size_t i = 0; i < capacity; ++i)
			newdata[i] = data[i];

		delete[] data;
	}

	data = newdata;
	capacity = newcapacity;
}
|
||||
|
||||
// Converts a raw .obj index into a 0-based array index: positive .obj indices
// are 1-based, negative indices count back from the end of the array.
static int fixupIndex(int index, size_t size)
{
	if (index >= 0)
		return index - 1;

	return int(size) + index;
}
|
||||
|
||||
// Parses a decimal integer (optional +/- sign, leading spaces/tabs allowed)
// and stores the position just past the last consumed character in *end.
// Returns 0 when no digits are present.
static int parseInt(const char* s, const char** end)
{
	// leading blanks
	for (; *s == ' ' || *s == '\t'; ++s)
		;

	// optional sign
	bool negative = (*s == '-');

	if (*s == '-' || *s == '+')
		++s;

	// accumulate digits; unsigned arithmetic avoids signed overflow UB
	unsigned int value = 0;

	while (unsigned(*s - '0') < 10)
	{
		value = value * 10 + unsigned(*s - '0');
		++s;
	}

	*end = s;

	return negative ? -int(value) : int(value);
}
|
||||
|
||||
// Parses a decimal floating-point number (optional sign, fraction, and
// exponent; leading spaces/tabs allowed) and stores the position just past the
// last consumed character in *end. Locale-independent; faster than strtod for
// the simple numbers found in .obj files.
static float parseFloat(const char* s, const char** end)
{
	// digits[] forces a table-based int->double conversion; powers[] covers
	// every exponent exactly representable as a double (10^0..10^22).
	static const double digits[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
	static const double powers[] = {1e0, 1e+1, 1e+2, 1e+3, 1e+4, 1e+5, 1e+6, 1e+7, 1e+8, 1e+9, 1e+10, 1e+11, 1e+12, 1e+13, 1e+14, 1e+15, 1e+16, 1e+17, 1e+18, 1e+19, 1e+20, 1e+21, 1e+22};

	// skip whitespace
	while (*s == ' ' || *s == '\t')
		s++;

	// read sign
	double sign = (*s == '-') ? -1 : 1;
	s += (*s == '-' || *s == '+');

	// read integer part
	double result = 0;
	int power = 0; // decimal exponent accumulated from the fraction and 'e' suffix

	while (unsigned(*s - '0') < 10)
	{
		result = result * 10 + digits[*s - '0'];
		s++;
	}

	// read fractional part
	if (*s == '.')
	{
		s++;

		// each fractional digit shifts the final decimal point left by one
		while (unsigned(*s - '0') < 10)
		{
			result = result * 10 + digits[*s - '0'];
			s++;
			power--;
		}
	}

	// read exponent part ('e' or 'E' via the |' ' lowercase trick)
	if ((*s | ' ') == 'e')
	{
		s++;

		// read exponent sign
		int expsign = (*s == '-') ? -1 : 1;
		s += (*s == '-' || *s == '+');

		// read exponent
		int exppower = 0;

		while (unsigned(*s - '0') < 10)
		{
			exppower = exppower * 10 + (*s - '0');
			s++;
		}

		// done!
		power += expsign * exppower;
	}

	// return end-of-string
	*end = s;

	// note: this is precise if result < 9e15
	// for longer inputs we lose a bit of precision here
	// divide for negative exponents, multiply for positive; fall back to pow()
	// for exponents outside the exact powers[] table
	if (unsigned(-power) < sizeof(powers) / sizeof(powers[0]))
		return float(sign * result / powers[-power]);
	else if (unsigned(power) < sizeof(powers) / sizeof(powers[0]))
		return float(sign * result * powers[power]);
	else
		return float(sign * result * pow(10.0, power));
}
|
||||
|
||||
// Parses one OBJ face corner of the form "v", "v/vt", "v/vt/vn" or "v//vn".
// Outputs are written only for the components actually present (callers
// initialize vi/vti/vni to 0, which OBJ treats as "absent" since indices
// are 1-based). Returns the position after the last consumed character.
static const char* parseFace(const char* s, int& vi, int& vti, int& vni)
{
    // skip whitespace before the index triplet
    while (*s == ' ' || *s == '\t')
        s++;

    // position index is always present
    vi = parseInt(s, &s);

    // "v" form: no separator, texcoord/normal stay untouched
    if (*s != '/')
        return s;
    s++;

    // handle vi//vni indices: the texcoord slot is empty, so only parse it
    // when the next character is not another '/'
    if (*s != '/')
        vti = parseInt(s, &s);

    // "v/vt" form ends here
    if (*s != '/')
        return s;
    s++;

    vni = parseInt(s, &s);

    return s;
}
|
||||
|
||||
// Initializes all array pointers to null and all size/capacity counters to
// zero; the arrays grow on demand while parsing (via growArray) and are
// released in the destructor.
ObjFile::ObjFile()
    : v(0)
    , v_size(0)
    , v_cap(0)
    , vt(0)
    , vt_size(0)
    , vt_cap(0)
    , vn(0)
    , vn_size(0)
    , vn_cap(0)
    , f(0)
    , f_size(0)
    , f_cap(0)
    , g(0)
    , g_size(0)
    , g_cap(0)
{
}
|
||||
|
||||
// Releases all parser-owned arrays; delete[] on a null pointer is a no-op,
// so a default-constructed (empty) ObjFile destructs safely.
ObjFile::~ObjFile()
{
    delete[] v;
    delete[] vt;
    delete[] vn;
    delete[] f;
    delete[] g;
}
|
||||
|
||||
// Parses a single zero-terminated OBJ line and appends its data to result.
// Recognizes "v " (position), "vt " (texcoord), "vn " (normal), "f " (face)
// and "usemtl" records; all other lines are silently ignored.
// Faces with more than 3 corners are triangulated as a fan around the first
// corner. Indices are converted from OBJ's 1-based (or negative-relative)
// convention by fixupIndex (defined earlier in this file).
void objParseLine(ObjFile& result, const char* line)
{
    if (line[0] == 'v' && line[1] == ' ')
    {
        // vertex position: x y z
        const char* s = line + 2;

        float x = parseFloat(s, &s);
        float y = parseFloat(s, &s);
        float z = parseFloat(s, &s);

        if (result.v_size + 3 > result.v_cap)
            growArray(result.v, result.v_cap);

        result.v[result.v_size++] = x;
        result.v[result.v_size++] = y;
        result.v[result.v_size++] = z;
    }
    else if (line[0] == 'v' && line[1] == 't' && line[2] == ' ')
    {
        // texture coordinate: u v [w] (missing components parse as 0)
        const char* s = line + 3;

        float u = parseFloat(s, &s);
        float v = parseFloat(s, &s);
        float w = parseFloat(s, &s);

        if (result.vt_size + 3 > result.vt_cap)
            growArray(result.vt, result.vt_cap);

        result.vt[result.vt_size++] = u;
        result.vt[result.vt_size++] = v;
        result.vt[result.vt_size++] = w;
    }
    else if (line[0] == 'v' && line[1] == 'n' && line[2] == ' ')
    {
        // vertex normal: x y z
        const char* s = line + 3;

        float x = parseFloat(s, &s);
        float y = parseFloat(s, &s);
        float z = parseFloat(s, &s);

        if (result.vn_size + 3 > result.vn_cap)
            growArray(result.vn, result.vn_cap);

        result.vn[result.vn_size++] = x;
        result.vn[result.vn_size++] = y;
        result.vn[result.vn_size++] = z;
    }
    else if (line[0] == 'f' && line[1] == ' ')
    {
        const char* s = line + 2;

        // make sure there is a default group to attribute faces to when the
        // file has faces before any usemtl record
        if (!result.g)
        {
            growArray(result.g, result.g_cap);

            ObjGroup g = {};
            result.g[result.g_size++] = g;
        }

        // current counts, used by fixupIndex to resolve negative (relative) indices
        size_t v = result.v_size / 3;
        size_t vt = result.vt_size / 3;
        size_t vn = result.vn_size / 3;

        // f[] holds a sliding window of 3 corners for fan triangulation;
        // fv is how many corners of the current triangle are filled
        int fv = 0;
        int f[3][3] = {};

        while (*s)
        {
            int vi = 0, vti = 0, vni = 0;
            s = parseFace(s, vi, vti, vni);

            // vi == 0 means no index was parsed (OBJ indices are 1-based)
            if (vi == 0)
                break;

            f[fv][0] = fixupIndex(vi, v);
            f[fv][1] = fixupIndex(vti, vt);
            f[fv][2] = fixupIndex(vni, vn);

            if (fv == 2)
            {
                // third (or later) corner complete: emit triangle and slide
                // the window so the next corner forms a fan with corner 0
                if (result.f_size + 9 > result.f_cap)
                    growArray(result.f, result.f_cap);

                memcpy(&result.f[result.f_size], f, 9 * sizeof(int));
                result.f_size += 9;

                result.g[result.g_size - 1].index_count += 3;

                f[1][0] = f[2][0];
                f[1][1] = f[2][1];
                f[1][2] = f[2][2];
            }
            else
            {
                fv++;
            }
        }
    }
    else if (strncmp(line, "usemtl", 6) == 0)
    {
        const char* s = line + 6;

        // skip whitespace
        while (*s == ' ' || *s == '\t')
            s++;

        // start a new group whose faces begin at the current face count
        if (result.g_size + 1 > result.g_cap)
            growArray(result.g, result.g_cap);

        ObjGroup g = {};
        g.index_offset = result.f_size / 3;

        // NOTE(review): the material name is taken verbatim to end-of-line;
        // on CRLF files this likely includes a trailing '\r' - verify against callers
        strncpy(g.material, s, sizeof(g.material));
        g.material[sizeof(g.material) - 1] = 0;

        result.g[result.g_size++] = g;
    }
}
|
||||
|
||||
// Reads an OBJ file from path, feeding it line-by-line to objParseLine.
// Uses a fixed 64KB buffer: each fread appends to the unconsumed tail, whole
// lines are zero-terminated in place and parsed, and the partial last line is
// shifted to the buffer start for the next iteration.
// Returns false only when the file cannot be opened; parse errors are not
// detected here (use objValidate afterwards).
// NOTE(review): a single line longer than the buffer would leave size ==
// sizeof(buffer) with no newline found, making fread read 0 bytes forever -
// presumably real OBJ lines never get that long, but worth confirming.
bool objParseFile(ObjFile& result, const char* path)
{
    FILE* file = fopen(path, "rb");
    if (!file)
        return false;

    char buffer[65536];
    size_t size = 0; // number of valid bytes currently in buffer

    while (!feof(file))
    {
        // top up the buffer after the unconsumed tail
        size += fread(buffer + size, 1, sizeof(buffer) - size, file);

        size_t line = 0; // start offset of the current line

        while (line < size)
        {
            // find the end of current line
            void* eol = memchr(buffer + line, '\n', size - line);
            if (!eol)
                break;

            // zero-terminate for objParseLine
            size_t next = static_cast<char*>(eol) - buffer;

            buffer[next] = 0;

            // process next line
            objParseLine(result, buffer + line);

            line = next + 1;
        }

        // move prefix of the last line in the buffer to the beginning of the buffer for next iteration
        assert(line <= size);

        memmove(buffer, buffer + line, size - line);
        size -= line;
    }

    if (size)
    {
        // process last line (file did not end with a newline)
        assert(size < sizeof(buffer));
        buffer[size] = 0;

        objParseLine(result, buffer);
    }

    fclose(file);
    return true;
}
|
||||
|
||||
bool objValidate(const ObjFile& result)
|
||||
{
|
||||
size_t v = result.v_size / 3;
|
||||
size_t vt = result.vt_size / 3;
|
||||
size_t vn = result.vn_size / 3;
|
||||
|
||||
for (size_t i = 0; i < result.f_size; i += 3)
|
||||
{
|
||||
int vi = result.f[i + 0];
|
||||
int vti = result.f[i + 1];
|
||||
int vni = result.f[i + 2];
|
||||
|
||||
if (vi < 0)
|
||||
return false;
|
||||
|
||||
if (vi >= 0 && size_t(vi) >= v)
|
||||
return false;
|
||||
|
||||
if (vti >= 0 && size_t(vti) >= vt)
|
||||
return false;
|
||||
|
||||
if (vni >= 0 && size_t(vni) >= vn)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
#pragma once
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
// One material group inside an OBJ file: the faces between consecutive
// usemtl records.
struct ObjGroup
{
    // material name from the usemtl line (truncated to fit, always 0-terminated)
    char material[256];

    // triangle range owned by this group, in face-corner triples within ObjFile::f
    size_t index_offset;
    size_t index_count;
};
|
||||
|
||||
// Raw parse result of an OBJ file. All arrays are owned by this object,
// grown on demand during parsing, and freed in the destructor; copying is
// disabled because the class manages raw allocations.
class ObjFile
{
public:
    float* v; // positions; stride 3 (xyz)
    size_t v_size, v_cap;

    float* vt; // texture coordinates; stride 3 (uvw)
    size_t vt_size, vt_cap;

    float* vn; // vertex normals; stride 3 (xyz)
    size_t vn_size, vn_cap;

    int* f; // face elements; stride 9 (3 groups of indices into v/vt/vn)
    size_t f_size, f_cap;

    ObjGroup* g; // material groups; g_size entries
    size_t g_size, g_cap;

    ObjFile();
    ~ObjFile();

private:
    // non-copyable: raw owning pointers above
    ObjFile(const ObjFile&);
    ObjFile& operator=(const ObjFile&);
};
|
||||
|
||||
void objParseLine(ObjFile& result, const char* line);
|
||||
bool objParseFile(ObjFile& result, const char* path);
|
||||
|
||||
bool objValidate(const ObjFile& result);
|
|
@ -0,0 +1,495 @@
|
|||
#ifdef _WIN32
|
||||
#include <assert.h>
|
||||
#include <d3d11.h>
|
||||
#include <d3dcompiler.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
#include "../src/meshoptimizer.h"
|
||||
#include "objparser.h"
|
||||
|
||||
#pragma comment(lib, "d3d11.lib")
|
||||
#pragma comment(lib, "d3dcompiler.lib")
|
||||
#pragma comment(lib, "dxgi.lib")
|
||||
|
||||
// Appends a (x0..x1) x (y0..y1) patch of a (width+1)-vertex-wide grid as a
// triangle list, two triangles per cell. When prefetch is set, a run of
// degenerate triangles over the top row is emitted first to pre-warm the
// post-transform vertex cache.
void stripGen(std::vector<unsigned int>& indices, int x0, int x1, int y0, int y1, int width, bool prefetch)
{
    if (prefetch)
        for (int x = x0; x < x1; x++)
        {
            // degenerate triangle (x, x, x+1): transforms vertices, draws nothing
            indices.push_back(x);
            indices.push_back(x);
            indices.push_back(x + 1);
        }

    // vertex (x, y) lives at y * pitch + x
    int pitch = width + 1;

    for (int y = y0; y < y1; y++)
    {
        int row0 = pitch * y;
        int row1 = pitch * (y + 1);

        for (int x = x0; x < x1; x++)
        {
            indices.push_back(row0 + x);
            indices.push_back(row1 + x);
            indices.push_back(row0 + x + 1);

            indices.push_back(row0 + x + 1);
            indices.push_back(row1 + x);
            indices.push_back(row1 + x + 1);
        }
    }
}
|
||||
|
||||
void gridGen(std::vector<unsigned int>& indices, int x0, int x1, int y0, int y1, int width, int cacheSize, bool prefetch)
|
||||
{
|
||||
if (x1 - x0 + 1 < cacheSize)
|
||||
{
|
||||
bool prefetchStrip = 2 * (x1 - x0) + 1 > cacheSize && prefetch;
|
||||
|
||||
stripGen(indices, x0, x1, y0, y1, width, prefetchStrip);
|
||||
}
|
||||
else
|
||||
{
|
||||
int xm = x0 + cacheSize - 2;
|
||||
gridGen(indices, x0, xm, y0, y1, width, cacheSize, prefetch);
|
||||
gridGen(indices, xm, x1, y0, y1, width, cacheSize, prefetch);
|
||||
}
|
||||
}
|
||||
|
||||
// Draws the given 32-bit index buffer once and returns the number of vertex
// shader invocations the GPU reports via a pipeline-statistics query - i.e.
// how many vertices missed the post-transform cache.
unsigned int queryVSInvocations(ID3D11Device* device, ID3D11DeviceContext* context, const unsigned int* indices, size_t index_count)
{
    if (index_count == 0)
        return 0;

    ID3D11Buffer* ib = 0;

    {
        // upload the indices into a fresh dynamic index buffer
        D3D11_BUFFER_DESC bd = {};

        bd.Usage = D3D11_USAGE_DYNAMIC;
        bd.ByteWidth = index_count * 4;
        bd.BindFlags = D3D11_BIND_INDEX_BUFFER;
        bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;

        device->CreateBuffer(&bd, 0, &ib);

        D3D11_MAPPED_SUBRESOURCE ms;
        context->Map(ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &ms);
        memcpy(ms.pData, indices, index_count * 4);
        context->Unmap(ib, 0);
    }

    context->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
    context->IASetIndexBuffer(ib, DXGI_FORMAT_R32_UINT, 0);

    // bracket a single draw with a pipeline statistics query
    D3D11_QUERY_DESC qdesc = {D3D11_QUERY_PIPELINE_STATISTICS};
    ID3D11Query* query = 0;
    device->CreateQuery(&qdesc, &query);

    context->Begin(query);
    context->DrawIndexed(index_count, 0, 0);
    context->End(query);

    // busy-wait until the GPU has finished and the query result is available
    D3D11_QUERY_DATA_PIPELINE_STATISTICS stats = {};
    while (S_FALSE == context->GetData(query, &stats, sizeof(stats), 0))
        ;

    query->Release();
    ib->Release();

    // sanity: the input assembler must have consumed exactly our indices
    assert(stats.IAVertices == index_count);

    return stats.VSInvocations;
}
|
||||
|
||||
// Compiles and binds a minimal VS/PS pair used for all cache measurements.
// The VS synthesizes positions from SV_VertexId and fills 5 dummy attribute
// registers so the driver cannot trivially collapse the shader.
// NOTE: blobs and shader objects are deliberately never Released - this is a
// run-once test harness and they must stay bound for the process lifetime.
void setupShaders(ID3D11Device* device, ID3D11DeviceContext* context)
{
    // load and compile the two shaders
    const char* shaders =
        "#define ATTRIBUTES 5\n"
        "struct Foo { float4 v[ATTRIBUTES]; };"
        "float4 VS(uint index: SV_VertexId, out Foo foo: FOO): SV_Position { uint i = index % 3; [unroll] for (int j = 0; j < ATTRIBUTES; j++) foo.v[j] = j; return float4(i != 0, i != 2, 0, 1); }"
        "float4 PS(Foo foo: FOO): SV_Target { float4 result = 0; [unroll] for (int j = 0; j < ATTRIBUTES; j++) result += foo.v[j]; return result; }";

    ID3DBlob* vsblob = 0;
    ID3DBlob* psblob = 0;
    D3DCompile(shaders, strlen(shaders), 0, 0, 0, "VS", "vs_5_0", 0, 0, &vsblob, 0);
    D3DCompile(shaders, strlen(shaders), 0, 0, 0, "PS", "ps_5_0", 0, 0, &psblob, 0);

    ID3D11VertexShader* vs = 0;
    ID3D11PixelShader* ps = 0;
    device->CreateVertexShader(vsblob->GetBufferPointer(), vsblob->GetBufferSize(), 0, &vs);
    device->CreatePixelShader(psblob->GetBufferPointer(), psblob->GetBufferSize(), 0, &ps);

    context->VSSetShader(vs, 0, 0);
    context->PSSetShader(ps, 0, 0);
}
|
||||
|
||||
// Sweeps assumed cache sizes 3..200 on a 100x100 grid mesh and prints, for
// each size, the ratio of actual VS invocations to the ideal (one per unique
// vertex) for four index orderings: cache-sized striped grid with prefetch,
// the same without prefetch, FIFO-optimized, and LRU/score-optimized.
// `cache` is a callable (indices, count) -> invocation count.
template <typename Cache>
void inspectCache(Cache cache)
{
    unsigned int max_cache_size = 200;
    unsigned int grid_size = 100;

    for (unsigned int cache_size = 3; cache_size <= max_cache_size; cache_size += 1)
    {
        // striped grid tuned to this cache size, with degenerate-prefetch rows
        std::vector<unsigned int> grid1;
        gridGen(grid1, 0, grid_size, 0, grid_size, grid_size, cache_size, true);

        // same striping, no prefetch
        std::vector<unsigned int> grid2;
        gridGen(grid2, 0, grid_size, 0, grid_size, grid_size, cache_size, false);

        std::vector<unsigned int> grid3;
        gridGen(grid3, 0, grid_size, 0, grid_size, grid_size, grid_size * 4, false); // this generates a simple indexed grid without striping/degenerate triangles
        meshopt_optimizeVertexCacheFifo(&grid3[0], &grid3[0], grid3.size(), (grid_size + 1) * (grid_size + 1), cache_size);

        std::vector<unsigned int> grid4;
        gridGen(grid4, 0, grid_size, 0, grid_size, grid_size, grid_size * 4, false); // this generates a simple indexed grid without striping/degenerate triangles
        meshopt_optimizeVertexCache(&grid4[0], &grid4[0], grid4.size(), (grid_size + 1) * (grid_size + 1));

        unsigned int invocations1 = cache(&grid1[0], grid1.size());
        unsigned int invocations2 = cache(&grid2[0], grid2.size());
        unsigned int invocations3 = cache(&grid3[0], grid3.size());
        unsigned int invocations4 = cache(&grid4[0], grid4.size());

        // ideal: every vertex transformed exactly once
        unsigned int ideal_invocations = (grid_size + 1) * (grid_size + 1);

        printf("%d, %f, %f, %f, %f\n", cache_size,
            double(invocations1) / double(ideal_invocations),
            double(invocations2) / double(ideal_invocations),
            double(invocations3) / double(ideal_invocations),
            double(invocations4) / double(ideal_invocations));
    }
}
|
||||
|
||||
// Creates a device on the given adapter and runs the cache-size sweep using
// real GPU pipeline-statistics measurements as the cache model.
void testCache(IDXGIAdapter* adapter)
{
    ID3D11Device* device = 0;
    ID3D11DeviceContext* context = 0;
    D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context);

    setupShaders(device, context);

    inspectCache([&](const unsigned int* indices, size_t index_count) { return queryVSInvocations(device, context, indices, index_count); });
}
|
||||
|
||||
// Probes the GPU's post-transform cache behavior for a hand-written index
// sequence given on the command line (after "--"). Supported argv tokens:
// "a" single index, "a-b" inclusive range (either direction), "a*n" repeat a
// n times, "axb" an a-by-b prefetch-striped grid.
// For every triangle it determines which of its three vertices actually ran
// the vertex shader (by drawing growing prefixes and diffing invocation
// counts), prints an annotated dump, then probes which indices are still
// resident in the cache after the full draw.
void testCacheSequence(IDXGIAdapter* adapter, int argc, char** argv)
{
    ID3D11Device* device = 0;
    ID3D11DeviceContext* context = 0;
    D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context);

    setupShaders(device, context);

    std::vector<unsigned int> ib;

    // parse index sequence tokens (argv[0] = exe, argv[1] = "--")
    for (int i = 2; i < argc; ++i)
    {
        char* end;
        int i0 = strtol(argv[i], &end, 10);

        if (end[0] == '-')
        {
            // "a-b": inclusive run, ascending or descending
            int i1 = strtol(end + 1, &end, 10);

            if (end[0] != 0)
            {
                printf("Unrecognized index range: %s\n", argv[i]);
                return;
            }

            if (i0 < i1)
            {
                for (int ii = i0; ii <= i1; ++ii)
                    ib.push_back(ii);
            }
            else
            {
                for (int ii = i0; ii >= i1; --ii)
                    ib.push_back(ii);
            }
        }
        else if (end[0] == '*')
        {
            // "a*n": index a repeated n times
            int i1 = strtol(end + 1, &end, 10);

            if (end[0] != 0 || i1 == 0)
            {
                printf("Unrecognized index range: %s\n", argv[i]);
                return;
            }

            for (int ii = 0; ii < i1; ++ii)
                ib.push_back(i0);
        }
        else if (end[0] == 'x')
        {
            // "axb": a-by-b striped grid with prefetch
            int i1 = strtol(end + 1, &end, 10);

            if (end[0] != 0)
            {
                printf("Unrecognized index range: %s\n", argv[i]);
                return;
            }

            stripGen(ib, 0, i0, 0, i1, i0, true);
        }
        else if (end[0] == 0)
        {
            ib.push_back(i0);
        }
        else
        {
            printf("Unrecognized index range: %s\n", argv[i]);
            return;
        }
    }

    // draw calls need whole triangles; drop any trailing partial one
    if (ib.size() % 3)
        ib.resize(ib.size() - ib.size() % 3);

    // xformed[i] = did index i cause a VS invocation (cache miss)?
    std::vector<bool> xformed(ib.size());

    for (size_t i = 0; i < ib.size(); i += 3)
    {
        // invocation counts before and after this triangle
        unsigned int inv0 = i == 0 ? 0 : queryVSInvocations(device, context, ib.data(), i);
        unsigned int inv1 = queryVSInvocations(device, context, ib.data(), i + 3);

        assert(inv0 <= inv1);
        assert(inv0 + 3 >= inv1);

        switch (inv1 - inv0)
        {
        case 0:
            xformed[i + 0] = xformed[i + 1] = xformed[i + 2] = false;
            break;

        case 3:
            xformed[i + 0] = xformed[i + 1] = xformed[i + 2] = true;
            break;

        case 1:
        case 2:
        {
            // ambiguous: temporarily rewrite the triangle corner by corner
            // and re-measure to attribute each miss to a specific corner
            unsigned int a = ib[i + 0];
            unsigned int b = ib[i + 1];
            unsigned int c = ib[i + 2];

            ib[i + 0] = ib[i + 1] = ib[i + 2] = a;
            unsigned int inva = queryVSInvocations(device, context, ib.data(), i + 3);

            ib[i + 1] = ib[i + 2] = b;
            unsigned int invb = queryVSInvocations(device, context, ib.data(), i + 3);

            ib[i + 2] = c;
            unsigned int invc = queryVSInvocations(device, context, ib.data(), i + 3);

            assert(inv0 <= inva && inva <= inv1);
            assert(inv0 <= invb && invb <= inv1);
            assert(inv0 <= invc && invc <= inv1);

            // special case: a == c means corner replacement can't separate
            // them; attribute the single miss to the last corner
            if (inv1 - inv0 == 1 && a == c && inva == inv1 && invb == inv0 && invc == inv1)
            {
                xformed[i + 0] = false;
                xformed[i + 1] = false;
                xformed[i + 2] = true;
            }
            else
            {
                assert(inva <= invb);
                assert(invb <= invc);

                xformed[i + 0] = inva == inv0 + 1;
                xformed[i + 1] = invb == inva + 1;
                xformed[i + 2] = invc == invb + 1;
            }
            break;
        }
        }
    }

    unsigned int xformed_total = 0;

    for (size_t i = 0; i < ib.size(); ++i)
        xformed_total += xformed[i];

    // dump the sequence, marking transformed (cache-missing) indices with '*'
    printf("// Sequence: %d indices", int(ib.size()));

    for (size_t i = 0; i < ib.size(); ++i)
    {
        if (i % 12 == 0)
        {
            printf("\n// %3d*3:", int(i / 3));
        }

        if (xformed[i])
            printf(" %3d*", ib[i]);
        else
            printf(" %3d ", ib[i]);
    }

    printf("\n");

    // probe final cache contents: appending a degenerate triangle on index x
    // is free iff x is still resident
    std::vector<unsigned int> cached;

    for (size_t i = 0; i < ib.size(); ++i)
    {
        unsigned int index = ib[i];
        unsigned int inv0 = queryVSInvocations(device, context, ib.data(), ib.size());

        ib.push_back(index);
        ib.push_back(index);
        ib.push_back(index);

        unsigned int inv1 = queryVSInvocations(device, context, ib.data(), ib.size());

        ib.resize(ib.size() - 3);

        if (inv1 == inv0)
            cached.push_back(index);
    }

    std::sort(cached.begin(), cached.end());
    cached.erase(std::unique(cached.begin(), cached.end()), cached.end());

    printf("// Cached :");

    for (size_t i = 0; i < cached.size(); ++i)
        printf(" %d", cached[i]);

    printf(" (%d)\n", int(cached.size()));

    unsigned int invocations = queryVSInvocations(device, context, ib.data(), ib.size());

    printf("// Invocations: %d\n", invocations);

    // cross-check: per-corner attribution must sum to the total measurement
    assert(xformed_total == invocations);
}
|
||||
|
||||
// Measures real-GPU ATVR (average transformed vertices ratio: VS invocations
// per unique vertex) for each OBJ file on the command line, comparing the
// file's original index order against meshopt's LRU and FIFO optimizers.
// With "--stat" it only accumulates and prints aggregate ATVR numbers.
void testCacheMeshes(IDXGIAdapter* adapter, int argc, char** argv)
{
    ID3D11Device* device = 0;
    ID3D11DeviceContext* context = 0;
    D3D11CreateDevice(adapter, D3D_DRIVER_TYPE_UNKNOWN, 0, 0, 0, 0, D3D11_SDK_VERSION, &device, 0, &context);

    setupShaders(device, context);

    bool stat = false;

    double atvr_sum = 0;
    double atvr_count = 0;

    unsigned int total_invocations = 0;
    unsigned int total_vertices = 0;

    for (int i = 1; i < argc; ++i)
    {
        const char* path = argv[i];

        if (strcmp(path, "--stat") == 0)
        {
            stat = true;
            continue;
        }

        ObjFile file;

        if (!objParseFile(file, path))
        {
            printf("Error loading %s: file not found\n", path);
            continue;
        }

        if (!objValidate(file))
        {
            printf("Error loading %s: invalid file data\n", path);
            continue;
        }

        // extract position indices only (f stride is 3 ints per corner);
        // note: this inner `i` shadows the argv loop variable above
        std::vector<unsigned int> ib1;

        for (size_t i = 0; i < file.f_size; i += 3)
            ib1.push_back(file.f[i]);

        unsigned int vertex_count = file.v_size / 3;
        unsigned int index_count = ib1.size();

        // baseline: index order exactly as authored in the file
        unsigned int invocations1 = queryVSInvocations(device, context, ib1.data(), index_count);

        if (stat)
        {
            std::vector<unsigned int> ib2(ib1.size());
            meshopt_optimizeVertexCache(&ib2[0], &ib1[0], ib1.size(), vertex_count);

            unsigned int invocations = queryVSInvocations(device, context, ib2.data(), index_count);

            atvr_sum += double(invocations) / double(vertex_count);
            atvr_count += 1;

            total_invocations += invocations;
            total_vertices += vertex_count;
        }
        else
        {
            printf("%s: baseline %f\n", path, double(invocations1) / double(vertex_count));

            // LRU/score optimizer ("forsyth")
            std::vector<unsigned int> ib3(ib1.size());
            meshopt_optimizeVertexCache(&ib3[0], &ib1[0], ib1.size(), vertex_count);

            unsigned int invocations3 = queryVSInvocations(device, context, ib3.data(), index_count);

            printf("%s: forsyth %f\n", path, double(invocations3) / double(vertex_count));

            // FIFO optimizer ("tipsify") across a range of assumed cache sizes
            for (unsigned int cache_size = 12; cache_size <= 24; ++cache_size)
            {
                std::vector<unsigned int> ib2(ib1.size());
                meshopt_optimizeVertexCacheFifo(&ib2[0], &ib1[0], ib1.size(), vertex_count, cache_size);

                unsigned int invocations2 = queryVSInvocations(device, context, ib2.data(), index_count);

                printf("%s: tipsify(%d) %f\n", path, cache_size, double(invocations2) / double(vertex_count));
            }
        }
    }

    if (stat)
    {
        printf("ATVR: average %f cumulative %f; %d vertices\n", atvr_sum / atvr_count, double(total_invocations) / double(total_vertices), total_vertices);
    }
}
|
||||
|
||||
// Entry point: enumerates all hardware DXGI adapters and runs, per adapter,
// one of three modes: no args -> cache-size sweep; "--" -> explicit index
// sequence probe; otherwise -> OBJ mesh ATVR measurement.
// COM objects are not Released - acceptable for a run-once measurement tool.
int main(int argc, char** argv)
{
    IDXGIFactory1* factory = 0;
    CreateDXGIFactory1(__uuidof(IDXGIFactory1), (void**)&factory);

    IDXGIAdapter* adapter = NULL;
    for (unsigned int index = 0; SUCCEEDED(factory->EnumAdapters(index, &adapter)); ++index)
    {
        DXGI_ADAPTER_DESC ad = {};
        adapter->GetDesc(&ad);

        // vendor 0x1414 / device 0x8c is the software WARP device
        if (ad.VendorId == 0x1414 && ad.DeviceId == 0x8c)
            continue; // Skip Microsoft Basic Render Driver

        printf("// GPU %d: %S (Vendor %04x Device %04x)\n", index, ad.Description, ad.VendorId, ad.DeviceId);

        if (argc == 1)
        {
            testCache(adapter);
        }
        else if (argc > 1 && strcmp(argv[1], "--") == 0)
        {
            testCacheSequence(adapter, argc, argv);
        }
        else
        {
            testCacheMeshes(adapter, argc, argv);
        }
    }
}
|
||||
#endif
|
|
@ -0,0 +1,587 @@
|
|||
#include "../src/meshoptimizer.h"
|
||||
#include "objparser.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
const int kCacheSizeMax = 16;
|
||||
const int kValenceMax = 8;
|
||||
|
||||
namespace meshopt
|
||||
{
|
||||
extern thread_local float kVertexScoreTableCache[1 + kCacheSizeMax];
|
||||
extern thread_local float kVertexScoreTableLive[1 + kValenceMax];
|
||||
} // namespace meshopt
|
||||
|
||||
struct { int cache, warp, triangle; } profiles[] =
|
||||
{
|
||||
{14, 64, 128}, // AMD GCN
|
||||
{32, 32, 32}, // NVidia Pascal
|
||||
// { 16, 32, 32 }, // NVidia Kepler, Maxwell
|
||||
// { 128, 0, 0 }, // Intel
|
||||
};
|
||||
|
||||
const int Profile_Count = sizeof(profiles) / sizeof(profiles[0]);
|
||||
|
||||
// PCG32 pseudo-random generator state: 64-bit LCG state plus a per-stream
// increment (forced odd at use sites).
struct pcg32_random_t
{
    uint64_t state;
    uint64_t inc;
};

#define PCG32_INITIALIZER { 0x853c49e6748fea9bULL, 0xda3e39cb94b95bdbULL }

// Advances the generator one step and returns 32 uniform random bits
// (PCG XSH RR variant, see pcg-random.org reference implementation).
uint32_t pcg32_random_r(pcg32_random_t* rng)
{
    uint64_t prev = rng->state;

    // LCG step; the (forced-odd) increment selects the stream
    rng->state = prev * 6364136223846793005ULL + (rng->inc | 1);

    // XSH RR output permutation applied to the pre-step state
    uint32_t shifted = uint32_t(((prev >> 18u) ^ prev) >> 27u);
    uint32_t rotate = uint32_t(prev >> 59u);

    return (shifted >> rotate) | (shifted << ((32 - rotate) & 31));
}
|
||||
|
||||
pcg32_random_t rngstate = PCG32_INITIALIZER;
|
||||
|
||||
float rand01()
|
||||
{
|
||||
return pcg32_random_r(&rngstate) / float(1ull << 32);
|
||||
}
|
||||
|
||||
// Returns 32 uniform random bits from the global generator.
uint32_t rand32()
{
    return pcg32_random_r(&rngstate);
}
|
||||
|
||||
// One candidate solution for the tuner: the vertex-score tables that are
// spliced into meshopt's cache optimizer (cache-position scores and
// live-triangle-count scores).
struct State
{
    float cache[kCacheSizeMax];
    float live[kValenceMax];
};
|
||||
|
||||
// A test mesh: unique vertex count, its index buffer, and the baseline ATVR
// per GPU profile that candidate score tables are compared against.
struct Mesh
{
    size_t vertex_count;
    std::vector<unsigned int> indices;

    float atvr_base[Profile_Count];
};
|
||||
|
||||
// Builds an N x N quad grid as a triangle-list Mesh with (N+1)^2 vertices;
// vertex (x, y) has index y * (N + 1) + x.
Mesh gridmesh(unsigned int N)
{
    Mesh result;

    unsigned int pitch = N + 1;

    result.vertex_count = pitch * pitch;
    result.indices.reserve(N * N * 6);

    // two triangles per cell, emitted row by row
    for (unsigned int y = 0; y < N; ++y)
        for (unsigned int x = 0; x < N; ++x)
        {
            unsigned int row0 = (y + 0) * pitch;
            unsigned int row1 = (y + 1) * pitch;

            result.indices.push_back(row0 + x);
            result.indices.push_back(row0 + x + 1);
            result.indices.push_back(row1 + x);

            result.indices.push_back(row1 + x);
            result.indices.push_back(row0 + x + 1);
            result.indices.push_back(row1 + x + 1);
        }

    return result;
}
|
||||
|
||||
// Loads an OBJ file and converts it into an indexed Mesh: each face corner
// becomes a full position/normal/texcoord vertex, then meshopt's remapper
// collapses bitwise-identical vertices and builds the index buffer.
// Returns a default-constructed (empty) Mesh on load/validation failure.
Mesh objmesh(const char* path)
{
    ObjFile file;

    if (!objParseFile(file, path))
    {
        printf("Error loading %s: file not found\n", path);
        return Mesh();
    }

    if (!objValidate(file))
    {
        printf("Error loading %s: invalid file data\n", path);
        return Mesh();
    }

    // f stores 3 ints (vi/vti/vni) per corner
    size_t total_indices = file.f_size / 3;

    // interleaved vertex layout used only for deduplication
    struct Vertex
    {
        float px, py, pz;
        float nx, ny, nz;
        float tx, ty;
    };

    // one (possibly duplicated) vertex per face corner
    std::vector<Vertex> vertices(total_indices);

    for (size_t i = 0; i < total_indices; ++i)
    {
        int vi = file.f[i * 3 + 0];
        int vti = file.f[i * 3 + 1];
        int vni = file.f[i * 3 + 2];

        // texcoord/normal indices may be absent (negative); substitute zeros
        Vertex v =
        {
            file.v[vi * 3 + 0],
            file.v[vi * 3 + 1],
            file.v[vi * 3 + 2],

            vni >= 0 ? file.vn[vni * 3 + 0] : 0,
            vni >= 0 ? file.vn[vni * 3 + 1] : 0,
            vni >= 0 ? file.vn[vni * 3 + 2] : 0,

            vti >= 0 ? file.vt[vti * 3 + 0] : 0,
            vti >= 0 ? file.vt[vti * 3 + 1] : 0,
        };

        vertices[i] = v;
    }

    Mesh result;

    // deduplicate identical vertices and build the remapped index buffer
    std::vector<unsigned int> remap(total_indices);

    size_t total_vertices = meshopt_generateVertexRemap(&remap[0], NULL, total_indices, &vertices[0], total_indices, sizeof(Vertex));

    result.indices.resize(total_indices);
    meshopt_remapIndexBuffer(&result.indices[0], NULL, total_indices, &remap[0]);

    result.vertex_count = total_vertices;

    return result;
}
|
||||
|
||||
// Evaluates a candidate score table: patches meshopt's thread_local score
// tables (slot 0 is left untouched - it presumably encodes "not in
// cache"/"no live triangles"), re-optimizes the mesh, and records the
// resulting ATVR for each simulated GPU profile.
void compute_atvr(const State& state, const Mesh& mesh, float result[Profile_Count])
{
    // thread_local tables make the OpenMP-parallel fitness evaluation safe
    memcpy(meshopt::kVertexScoreTableCache + 1, state.cache, kCacheSizeMax * sizeof(float));
    memcpy(meshopt::kVertexScoreTableLive + 1, state.live, kValenceMax * sizeof(float));

    std::vector<unsigned int> indices(mesh.indices.size());

    meshopt_optimizeVertexCache(&indices[0], &mesh.indices[0], mesh.indices.size(), mesh.vertex_count);

    for (int profile = 0; profile < Profile_Count; ++profile)
        result[profile] = meshopt_analyzeVertexCache(&indices[0], indices.size(), mesh.vertex_count, profiles[profile].cache, profiles[profile].warp, profiles[profile].triangle).atvr;
}
|
||||
|
||||
float fitness_score(const State& state, const std::vector<Mesh>& meshes)
|
||||
{
|
||||
float result = 0;
|
||||
float count = 0;
|
||||
|
||||
for (auto& mesh : meshes)
|
||||
{
|
||||
float atvr[Profile_Count];
|
||||
compute_atvr(state, mesh, atvr);
|
||||
|
||||
for (int profile = 0; profile < Profile_Count; ++profile)
|
||||
{
|
||||
result += mesh.atvr_base[profile] / atvr[profile];
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
return result / count;
|
||||
}
|
||||
|
||||
// Random initial value for one cache-position score slot.
float rndcache()
{
    return rand01();
}
|
||||
|
||||
// Random initial value for one live-triangle-count score slot.
float rndlive()
{
    return rand01();
}
|
||||
|
||||
std::vector<State> gen0(size_t count)
|
||||
{
|
||||
std::vector<State> result;
|
||||
|
||||
for (size_t i = 0; i < count; ++i)
|
||||
{
|
||||
State state = {};
|
||||
|
||||
for (int j = 0; j < kCacheSizeMax; ++j)
|
||||
state.cache[j] = rndcache();
|
||||
|
||||
for (int j = 0; j < kValenceMax; ++j)
|
||||
state.live[j] = rndlive();
|
||||
|
||||
result.push_back(state);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
size_t rndindex(const std::vector<float>& prob)
|
||||
{
|
||||
float r = rand01();
|
||||
|
||||
for (size_t i = 0; i < prob.size(); ++i)
|
||||
{
|
||||
r -= prob[i];
|
||||
|
||||
if (r <= 0)
|
||||
return i;
|
||||
}
|
||||
|
||||
return prob.size() - 1;
|
||||
}
|
||||
|
||||
// Produces a mutated copy of a candidate state. Each of six independent
// mutation kinds fires with its own probability: replace one random cache
// slot (70%), replace one random live slot (70%), jitter a random bitmask of
// cache slots by +/-10% (20%), same for live slots (20%), and fully
// re-randomize a bitmask of cache/live slots (5% each).
// Note: the exact sequence of rand01()/rand32() calls defines the result for
// a given RNG state; do not reorder.
State mutate(const State& state)
{
    State result = state;

    if (rand01() < 0.7f)
    {
        // the +0.5f/min clamp biases slightly but keeps the index in range
        size_t idxcache = std::min(int(rand01() * kCacheSizeMax + 0.5f), int(kCacheSizeMax - 1));

        result.cache[idxcache] = rndcache();
    }

    if (rand01() < 0.7f)
    {
        size_t idxlive = std::min(int(rand01() * kValenceMax + 0.5f), int(kValenceMax - 1));

        result.live[idxlive] = rndlive();
    }

    if (rand01() < 0.2f)
    {
        // jitter a random subset of cache slots by a factor in [0.9, 1.1]
        uint32_t mask = rand32();

        for (size_t i = 0; i < kCacheSizeMax; ++i)
            if (mask & (1 << i))
                result.cache[i] *= 0.9f + 0.2f * rand01();
    }

    if (rand01() < 0.2f)
    {
        uint32_t mask = rand32();

        for (size_t i = 0; i < kValenceMax; ++i)
            if (mask & (1 << i))
                result.live[i] *= 0.9f + 0.2f * rand01();
    }

    if (rand01() < 0.05f)
    {
        // rare large jump: fully re-randomize a subset of cache slots
        uint32_t mask = rand32();

        for (size_t i = 0; i < kCacheSizeMax; ++i)
            if (mask & (1 << i))
                result.cache[i] = rndcache();
    }

    if (rand01() < 0.05f)
    {
        uint32_t mask = rand32();

        for (size_t i = 0; i < kValenceMax; ++i)
            if (mask & (1 << i))
                result.live[i] = rndlive();
    }

    return result;
}
|
||||
|
||||
bool accept(float fitnew, float fitold, float temp)
|
||||
{
|
||||
if (fitnew >= fitold)
|
||||
return true;
|
||||
|
||||
if (temp == 0)
|
||||
return false;
|
||||
|
||||
float prob = exp2((fitnew - fitold) / temp);
|
||||
|
||||
return rand01() < prob;
|
||||
}
|
||||
|
||||
// One generation of parallel-tempering simulated annealing. Each seed[i]
// runs at its own temperature (index 0 coldest); `steps` mutations per seed
// are generated up front, their fitness is evaluated in parallel, then each
// chain accepts/rejects its mutations in order and better states are
// promoted from hotter chains toward colder ones.
// Returns the coldest chain's state and fitness; seed is updated in place.
std::pair<State, float> genN_SA(std::vector<State>& seed, const std::vector<Mesh>& meshes, size_t steps)
{
    // result holds, per seed, the original followed by its `steps` mutations
    std::vector<State> result;
    result.reserve(seed.size() * (1 + steps));

    // perform several parallel steps of mutation for each temperature
    for (size_t i = 0; i < seed.size(); ++i)
    {
        result.push_back(seed[i]);

        for (size_t s = 0; s < steps; ++s)
            result.push_back(mutate(seed[i]));
    }

    // compute fitness for all temperatures & mutations in parallel
    // NOTE(review): OpenMP 2.0 (MSVC) requires a signed loop index; this
    // size_t loop presumably relies on a newer OpenMP or the pragma being ignored
    std::vector<float> resultfit(result.size());

#pragma omp parallel for
    for (size_t i = 0; i < result.size(); ++i)
    {
        resultfit[i] = fitness_score(result[i], meshes);
    }

    // perform annealing for each temperature
    std::vector<float> seedfit(seed.size());

    for (size_t i = 0; i < seed.size(); ++i)
    {
        size_t offset = i * (1 + steps);

        seedfit[i] = resultfit[offset];

        // temperature grows linearly with chain index; chain 0 is greedy
        float temp = (float(i) / float(seed.size() - 1)) / 0.1f;

        for (size_t s = 0; s < steps; ++s)
        {
            if (accept(resultfit[offset + s + 1], seedfit[i], temp))
            {
                seedfit[i] = resultfit[offset + s + 1];
                seed[i] = result[offset + s + 1];
            }
        }
    }

    // perform promotion from each temperature to the next one
    for (size_t i = seed.size() - 1; i > 0; --i)
    {
        if (seedfit[i] > seedfit[i - 1])
        {
            seedfit[i - 1] = seedfit[i];
            seed[i - 1] = seed[i];
        }
    }

    return std::make_pair(seed[0], seedfit[0]);
}
|
||||
|
||||
std::pair<State, float> genN_GA(std::vector<State>& seed, const std::vector<Mesh>& meshes, float crossover, float mutate)
|
||||
{
|
||||
std::vector<State> result;
|
||||
result.reserve(seed.size());
|
||||
|
||||
std::vector<float> seedprob(seed.size());
|
||||
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < seed.size(); ++i)
|
||||
{
|
||||
seedprob[i] = fitness_score(seed[i], meshes);
|
||||
}
|
||||
|
||||
State best = {};
|
||||
float bestfit = 0;
|
||||
float probsum = 0;
|
||||
|
||||
for (size_t i = 0; i < seed.size(); ++i)
|
||||
{
|
||||
float score = seedprob[i];
|
||||
probsum += score;
|
||||
|
||||
if (score > bestfit)
|
||||
{
|
||||
best = seed[i];
|
||||
bestfit = score;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& prob : seedprob)
|
||||
{
|
||||
prob /= probsum;
|
||||
}
|
||||
|
||||
std::vector<unsigned int> seedidx;
|
||||
seedidx.reserve(seed.size());
|
||||
for (size_t i = 0; i < seed.size(); ++i)
|
||||
seedidx.push_back(i);
|
||||
|
||||
std::sort(seedidx.begin(), seedidx.end(), [&](size_t l, size_t r) { return seedprob[l] < seedprob[r]; });
|
||||
|
||||
while (result.size() < seed.size() / 4)
|
||||
{
|
||||
size_t idx = seedidx.back();
|
||||
seedidx.pop_back();
|
||||
|
||||
result.push_back(seed[idx]);
|
||||
}
|
||||
|
||||
while (result.size() < seed.size())
|
||||
{
|
||||
State s0 = seed[rndindex(seedprob)];
|
||||
State s1 = seed[rndindex(seedprob)];
|
||||
|
||||
State state = s0;
|
||||
|
||||
// crossover
|
||||
if (rand01() < crossover)
|
||||
{
|
||||
size_t idxcache = std::min(int(rand01() * kCacheSizeMax + 0.5f), 15);
|
||||
|
||||
memcpy(state.cache + idxcache, s1.cache + idxcache, (kCacheSizeMax - idxcache) * sizeof(float));
|
||||
}
|
||||
|
||||
if (rand01() < crossover)
|
||||
{
|
||||
size_t idxlive = std::min(int(rand01() * kValenceMax + 0.5f), 7);
|
||||
|
||||
memcpy(state.live + idxlive, s1.live + idxlive, (kValenceMax - idxlive) * sizeof(float));
|
||||
}
|
||||
|
||||
// mutate
|
||||
if (rand01() < mutate)
|
||||
{
|
||||
size_t idxcache = std::min(int(rand01() * kCacheSizeMax + 0.5f), 15);
|
||||
|
||||
state.cache[idxcache] = rndcache();
|
||||
}
|
||||
|
||||
if (rand01() < mutate)
|
||||
{
|
||||
size_t idxlive = std::min(int(rand01() * kValenceMax + 0.5f), 7);
|
||||
|
||||
state.live[idxlive] = rndlive();
|
||||
}
|
||||
|
||||
result.push_back(state);
|
||||
}
|
||||
|
||||
seed.swap(result);
|
||||
|
||||
return std::make_pair(best, bestfit);
|
||||
}
|
||||
|
||||
bool load_state(const char* path, std::vector<State>& result)
|
||||
{
|
||||
FILE* file = fopen(path, "rb");
|
||||
if (!file)
|
||||
return false;
|
||||
|
||||
State state;
|
||||
|
||||
result.clear();
|
||||
|
||||
while (fread(&state, sizeof(State), 1, file) == 1)
|
||||
result.push_back(state);
|
||||
|
||||
fclose(file);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool save_state(const char* path, const std::vector<State>& result)
|
||||
{
|
||||
FILE* file = fopen(path, "wb");
|
||||
if (!file)
|
||||
return false;
|
||||
|
||||
for (auto& state : result)
|
||||
{
|
||||
if (fwrite(&state, sizeof(State), 1, file) != 1)
|
||||
{
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return fclose(file) == 0;
|
||||
}
|
||||
|
||||
void dump_state(const State& state)
|
||||
{
|
||||
printf("cache:");
|
||||
for (int i = 0; i < kCacheSizeMax; ++i)
|
||||
{
|
||||
printf(" %.3f", state.cache[i]);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
printf("live:");
|
||||
for (int i = 0; i < kValenceMax; ++i)
|
||||
{
|
||||
printf(" %.3f", state.live[i]);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
bool annealing = false;
|
||||
|
||||
State baseline;
|
||||
memcpy(baseline.cache, meshopt::kVertexScoreTableCache + 1, kCacheSizeMax * sizeof(float));
|
||||
memcpy(baseline.live, meshopt::kVertexScoreTableLive + 1, kValenceMax * sizeof(float));
|
||||
|
||||
std::vector<Mesh> meshes;
|
||||
|
||||
meshes.push_back(gridmesh(50));
|
||||
|
||||
for (int i = 1; i < argc; ++i)
|
||||
meshes.push_back(objmesh(argv[i]));
|
||||
|
||||
size_t total_triangles = 0;
|
||||
|
||||
for (auto& mesh : meshes)
|
||||
{
|
||||
compute_atvr(baseline, mesh, mesh.atvr_base);
|
||||
|
||||
total_triangles += mesh.indices.size() / 3;
|
||||
}
|
||||
|
||||
std::vector<State> pop;
|
||||
size_t gen = 0;
|
||||
|
||||
if (load_state("mutator.state", pop))
|
||||
{
|
||||
printf("Loaded %d state vectors\n", int(pop.size()));
|
||||
}
|
||||
else
|
||||
{
|
||||
pop = gen0(annealing ? 32 : 1000);
|
||||
}
|
||||
|
||||
printf("%d meshes, %.1fM triangles\n", int(meshes.size()), double(total_triangles) / 1e6);
|
||||
|
||||
float atvr_0[Profile_Count];
|
||||
float atvr_N[Profile_Count];
|
||||
compute_atvr(baseline, meshes[0], atvr_0);
|
||||
compute_atvr(baseline, meshes.back(), atvr_N);
|
||||
|
||||
printf("baseline: grid %f %f %s %f %f\n", atvr_0[0], atvr_0[1], argv[argc - 1], atvr_N[0], atvr_N[1]);
|
||||
|
||||
for (;;)
|
||||
{
|
||||
auto best = annealing ? genN_SA(pop, meshes, 31) : genN_GA(pop, meshes, 0.7f, 0.3f);
|
||||
gen++;
|
||||
|
||||
compute_atvr(best.first, meshes[0], atvr_0);
|
||||
compute_atvr(best.first, meshes.back(), atvr_N);
|
||||
|
||||
printf("%d: fitness %f; grid %f %f %s %f %f\n", int(gen), best.second, atvr_0[0], atvr_0[1], argv[argc - 1], atvr_N[0], atvr_N[1]);
|
||||
|
||||
if (gen % 100 == 0)
|
||||
{
|
||||
char buf[128];
|
||||
sprintf(buf, "gcloud logging write vcache-log \"fitness %f; grid %f %f %s %f %f\"", best.second, atvr_0[0], atvr_0[1], argv[argc - 1], atvr_N[0], atvr_N[1]);
|
||||
system(buf);
|
||||
}
|
||||
|
||||
dump_state(best.first);
|
||||
|
||||
if (save_state("mutator.state-temp", pop) && rename("mutator.state-temp", "mutator.state") == 0)
|
||||
{
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("ERROR: Can't save state\n");
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue