Updated external libraries

2021-10-06 21:13:17 +02:00 · 2021-10-06 21:13:17 +02:00 · 700d448d75
commit 700d448d75
parent 8722ff7043
14 changed files with 3415 additions and 1124 deletions
--- a/src/external/cgltf.h
+++ b/src/external/cgltf.h
@ -1,7 +1,7 @@
 /**
 * cgltf - a single-file glTF 2.0 parser written in C99.
 *
- * Version: 1.10
+ * Version: 1.11
 *
 * Website: https://github.com/jkuhlmann/cgltf
 *
@ -234,6 +234,12 @@ typedef enum cgltf_light_type {
 	cgltf_light_type_spot,
 } cgltf_light_type;

+typedef enum cgltf_data_free_method { /* selects how cgltf_free disposes of a cgltf_buffer's data pointer */
+	cgltf_data_free_method_none, /* cgltf_free leaves the data alone; set by cgltf_load_buffers when the buffer points at data->bin */
+	cgltf_data_free_method_file_release, /* cgltf_free passes the data to file_release; set after cgltf_load_buffer_file */
+	cgltf_data_free_method_memory_free, /* cgltf_free passes the data to data->memory.free; set after cgltf_load_buffer_base64 */
+} cgltf_data_free_method;
+
 typedef struct cgltf_extras {
 	cgltf_size start_offset;
 	cgltf_size end_offset;
@ -250,6 +256,7 @@ typedef struct cgltf_buffer
 	cgltf_size size;
 	char* uri;
 	void* data; /* loaded by cgltf_load_buffers */
+	cgltf_data_free_method data_free_method;
 	cgltf_extras extras;
 	cgltf_size extensions_count;
 	cgltf_extension* extensions;
@ -372,6 +379,8 @@ typedef struct cgltf_texture
 	char* name;
 	cgltf_image* image;
 	cgltf_sampler* sampler;
+	cgltf_bool has_basisu;
+	cgltf_image* basisu_image;
 	cgltf_extras extras;
 	cgltf_size extensions_count;
 	cgltf_extension* extensions;
@ -382,6 +391,7 @@ typedef struct cgltf_texture_transform
 	cgltf_float offset[2];
 	cgltf_float rotation;
 	cgltf_float scale[2];
+	cgltf_bool has_texcoord;
 	cgltf_int texcoord;
 } cgltf_texture_transform;

@ -595,6 +605,7 @@ typedef struct cgltf_light {
 	cgltf_float range;
 	cgltf_float spot_inner_cone_angle;
 	cgltf_float spot_outer_cone_angle;
+	cgltf_extras extras;
 } cgltf_light;

 struct cgltf_node {
@ -768,6 +779,7 @@ cgltf_result cgltf_load_buffers(

 cgltf_result cgltf_load_buffer_base64(const cgltf_options* options, cgltf_size size, const char* base64, void** out_data);

+void cgltf_decode_string(char* string);
 void cgltf_decode_uri(char* uri);

 cgltf_result cgltf_validate(cgltf_data* data);
@ -813,7 +825,7 @@ cgltf_result cgltf_copy_extras_json(const cgltf_data* data, const cgltf_extras*
 #include <limits.h> /* For UINT_MAX etc */
 #include <float.h>  /* For FLT_MAX */

-#if !defined(CGLTF_MALLOC) || !defined(CGLTF_FREE) || !defined(CGLTF_ATOI) || !defined(CGLTF_ATOF)
+#if !defined(CGLTF_MALLOC) || !defined(CGLTF_FREE) || !defined(CGLTF_ATOI) || !defined(CGLTF_ATOF) || !defined(CGLTF_ATOLL)
 #include <stdlib.h> /* For malloc, free, atoi, atof */
 #endif

@ -883,6 +895,9 @@ static const uint32_t GlbMagicBinChunk = 0x004E4942;
 #ifndef CGLTF_ATOF
 #define CGLTF_ATOF(str) atof(str)
 #endif
+#ifndef CGLTF_ATOLL
+#define CGLTF_ATOLL(str) atoll(str)
+#endif
 #ifndef CGLTF_VALIDATE_ENABLE_ASSERTS
 #define CGLTF_VALIDATE_ENABLE_ASSERTS 0
 #endif
@ -932,7 +947,12 @@ static cgltf_result cgltf_default_file_read(const struct cgltf_memory_options* m
 	{
 		fseek(file, 0, SEEK_END);

+#ifdef _WIN32
+		__int64 length = _ftelli64(file);
+#else
 		long length = ftell(file);
+#endif
+
 		if (length < 0)
 		{
 			fclose(file);
@ -1120,8 +1140,8 @@ cgltf_result cgltf_parse_file(const cgltf_options* options, const char* path, cg
 		return cgltf_result_invalid_options;
 	}

-	void (*memory_free)(void*, void*) = options->memory.free ? options->memory.free : &cgltf_default_free;
 	cgltf_result (*file_read)(const struct cgltf_memory_options*, const struct cgltf_file_options*, const char*, cgltf_size*, void**) = options->file.read ? options->file.read : &cgltf_default_file_read;
+	void (*file_release)(const struct cgltf_memory_options*, const struct cgltf_file_options*, void* data) = options->file.release ? options->file.release : cgltf_default_file_release;

 	void* file_data = NULL;
 	cgltf_size file_size = 0;
@ -1135,7 +1155,7 @@ cgltf_result cgltf_parse_file(const cgltf_options* options, const char* path, cg

 	if (result != cgltf_result_success)
 	{
-		memory_free(options->memory.user_data, file_data);
+		file_release(&options->memory, &options->file, file_data);
 		return result;
 	}

@ -1246,6 +1266,72 @@ static int cgltf_unhex(char ch)
 		-1;
 }

+/* Decodes JSON escapes (\" \\ \/ \b \f \n \r \t \uXXXX) of the NUL-terminated `string` in place; the output never outgrows the input. */
+void cgltf_decode_string(char* string)
+{
+	char* read = strchr(string, '\\');
+	if (read == NULL)
+	{
+		return; /* no escape sequences: the string is left untouched */
+	}
+	char* write = string;
+	char* last = string;
+
+	while (read)
+	{
+		// Copy characters since last escaped sequence
+		cgltf_size written = read - last;
+		memmove(write, last, written); /* regions can overlap within the same buffer, which strncpy does not permit */
+		write += written;
+
+		// jsmn already checked that all escape sequences are valid
+		++read;
+		switch (*read++)
+		{
+		case '\"': *write++ = '\"'; break;
+		case '/':  *write++ = '/';  break;
+		case '\\': *write++ = '\\'; break;
+		case 'b':  *write++ = '\b'; break;
+		case 'f':  *write++ = '\f'; break;
+		case 'r':  *write++ = '\r'; break;
+		case 'n':  *write++ = '\n'; break;
+		case 't':  *write++ = '\t'; break;
+		case 'u':
+		{
+			// UCS-2 codepoint \uXXXX to UTF-8
+			int character = 0;
+			for (cgltf_size i = 0; i < 4; ++i)
+			{
+				character = (character << 4) + cgltf_unhex(*read++);
+			}
+
+			if (character <= 0x7F)
+			{
+				*write++ = character & 0xFF; /* 1-byte form */
+			}
+			else if (character <= 0x7FF)
+			{
+				*write++ = 0xC0 | ((character >> 6) & 0xFF); /* 2-byte form: lead byte */
+				*write++ = 0x80 | (character & 0x3F);
+			}
+			else
+			{
+				*write++ = 0xE0 | ((character >> 12) & 0xFF); /* 3-byte form: lead byte */
+				*write++ = 0x80 | ((character >> 6) & 0x3F);
+				*write++ = 0x80 | (character & 0x3F);
+			}
+			break;
+		}
+		default:   break; /* any other escaped character is dropped */
+		}
+
+		last = read;
+		read = strchr(read, '\\');
+	}
+
+	memmove(write, last, strlen(last) + 1); /* move the tail and its terminator; regions overlap, which strcpy does not permit */
+}
+
 void cgltf_decode_uri(char* uri)
 {
 	char* write = uri;
@ -1291,6 +1377,7 @@ cgltf_result cgltf_load_buffers(const cgltf_options* options, cgltf_data* data,
 		}

 		data->buffers[0].data = (void*)data->bin;
+		data->buffers[0].data_free_method = cgltf_data_free_method_none;
 	}

 	for (cgltf_size i = 0; i < data->buffers_count; ++i)
@ -1314,6 +1401,7 @@ cgltf_result cgltf_load_buffers(const cgltf_options* options, cgltf_data* data,
 			if (comma && comma - uri >= 7 && strncmp(comma - 7, ";base64", 7) == 0)
 			{
 				cgltf_result res = cgltf_load_buffer_base64(options, data->buffers[i].size, comma + 1, &data->buffers[i].data);
+				data->buffers[i].data_free_method = cgltf_data_free_method_memory_free;

 				if (res != cgltf_result_success)
 				{
@ -1328,6 +1416,7 @@ cgltf_result cgltf_load_buffers(const cgltf_options* options, cgltf_data* data,
 		else if (strstr(uri, "://") == NULL && gltf_path)
 		{
 			cgltf_result res = cgltf_load_buffer_file(options, data->buffers[i].size, uri, gltf_path, &data->buffers[i].data);
+			data->buffers[i].data_free_method = cgltf_data_free_method_file_release;

 			if (res != cgltf_result_success)
 			{
@ -1655,10 +1744,15 @@ void cgltf_free(cgltf_data* data)
 	{
 		data->memory.free(data->memory.user_data, data->buffers[i].name);

-		if (data->buffers[i].data != data->bin)
+		if (data->buffers[i].data_free_method == cgltf_data_free_method_file_release)
 		{
 			file_release(&data->memory, &data->file, data->buffers[i].data);
 		}
+		else if (data->buffers[i].data_free_method == cgltf_data_free_method_memory_free)
+		{
+			data->memory.free(data->memory.user_data, data->buffers[i].data);
+		}
+
 		data->memory.free(data->memory.user_data, data->buffers[i].uri);

 		cgltf_free_extensions(data, data->buffers[i].extensions, data->buffers[i].extensions_count);
@ -2259,6 +2353,7 @@ cgltf_size cgltf_accessor_read_index(const cgltf_accessor* accessor, cgltf_size
 #define CGLTF_ERROR_LEGACY -3

 #define CGLTF_CHECK_TOKTYPE(tok_, type_) if ((tok_).type != (type_)) { return CGLTF_ERROR_JSON; }
+#define CGLTF_CHECK_TOKTYPE_RETTYPE(tok_, type_, ret_) if ((tok_).type != (type_)) { return (ret_)CGLTF_ERROR_JSON; }
 #define CGLTF_CHECK_KEY(tok_) if ((tok_).type != JSMN_STRING || (tok_).size == 0) { return CGLTF_ERROR_JSON; } /* checking size for 0 verifies that a value follows the key */

 #define CGLTF_PTRINDEX(type, idx) (type*)((cgltf_size)idx + 1)
@ -2283,6 +2378,16 @@ static int cgltf_json_to_int(jsmntok_t const* tok, const uint8_t* json_chunk)
 	return CGLTF_ATOI(tmp);
 }

+static cgltf_size cgltf_json_to_size(jsmntok_t const* tok, const uint8_t* json_chunk)
+{ /* Parses the text of primitive token `tok` inside `json_chunk` with CGLTF_ATOLL and returns it as a cgltf_size. */
+	CGLTF_CHECK_TOKTYPE_RETTYPE(*tok, JSMN_PRIMITIVE, cgltf_size); /* any other token type returns CGLTF_ERROR_JSON cast to cgltf_size */
+	char tmp[128];
+	int size = (cgltf_size)(tok->end - tok->start) < sizeof(tmp) ? tok->end - tok->start : (int)(sizeof(tmp) - 1); /* clamp so the text plus terminator fits in tmp */
+	strncpy(tmp, (const char*)json_chunk + tok->start, size);
+	tmp[size] = 0; /* terminate explicitly: the token text in json_chunk is not NUL-terminated at tok->end */
+	return (cgltf_size)CGLTF_ATOLL(tmp); /* NOTE(review): a negative number presumably wraps to a huge value if cgltf_size is unsigned -- confirm and consider clamping */
+}
+
 static cgltf_float cgltf_json_to_float(jsmntok_t const* tok, const uint8_t* json_chunk)
 {
 	CGLTF_CHECK_TOKTYPE(*tok, JSMN_PRIMITIVE);
@ -3024,7 +3129,7 @@ static int cgltf_parse_json_accessor_sparse(cgltf_options* options, jsmntok_t co
 				else if (cgltf_json_strcmp(tokens+i, json_chunk, "byteOffset") == 0)
 				{
 					++i;
-					out_sparse->indices_byte_offset = cgltf_json_to_int(tokens + i, json_chunk);
+					out_sparse->indices_byte_offset = cgltf_json_to_size(tokens + i, json_chunk);
 					++i;
 				}
 				else if (cgltf_json_strcmp(tokens+i, json_chunk, "componentType") == 0)
@ -3073,7 +3178,7 @@ static int cgltf_parse_json_accessor_sparse(cgltf_options* options, jsmntok_t co
 				else if (cgltf_json_strcmp(tokens+i, json_chunk, "byteOffset") == 0)
 				{
 					++i;
-					out_sparse->values_byte_offset = cgltf_json_to_int(tokens + i, json_chunk);
+					out_sparse->values_byte_offset = cgltf_json_to_size(tokens + i, json_chunk);
 					++i;
 				}
 				else if (cgltf_json_strcmp(tokens + i, json_chunk, "extras") == 0)
@ -3142,7 +3247,7 @@ static int cgltf_parse_json_accessor(cgltf_options* options, jsmntok_t const* to
 		{
 			++i;
 			out_accessor->offset =
-					cgltf_json_to_int(tokens+i, json_chunk);
+					cgltf_json_to_size(tokens+i, json_chunk);
 			++i;
 		}
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "componentType") == 0)
@ -3268,6 +3373,7 @@ static int cgltf_parse_json_texture_transform(jsmntok_t const* tokens, int i, co
 		else if (cgltf_json_strcmp(tokens + i, json_chunk, "texCoord") == 0)
 		{
 			++i;
+			out_texture_transform->has_texcoord = 1;
 			out_texture_transform->texcoord = cgltf_json_to_int(tokens + i, json_chunk);
 			++i;
 		}
@ -3885,7 +3991,62 @@ static int cgltf_parse_json_texture(cgltf_options* options, jsmntok_t const* tok
 		}
 		else if (cgltf_json_strcmp(tokens + i, json_chunk, "extensions") == 0)
 		{
-			i = cgltf_parse_json_unprocessed_extensions(options, tokens, i, json_chunk, &out_texture->extensions_count, &out_texture->extensions);
+			++i;
+
+			CGLTF_CHECK_TOKTYPE(tokens[i], JSMN_OBJECT);
+			if (out_texture->extensions)
+			{
+				return CGLTF_ERROR_JSON;
+			}
+
+			int extensions_size = tokens[i].size;
+			++i;
+			out_texture->extensions = (cgltf_extension*)cgltf_calloc(options, sizeof(cgltf_extension), extensions_size);
+			out_texture->extensions_count = 0;
+
+			if (!out_texture->extensions)
+			{
+				return CGLTF_ERROR_NOMEM;
+			}
+
+			for (int k = 0; k < extensions_size; ++k)
+			{
+				CGLTF_CHECK_KEY(tokens[i]);
+
+				if (cgltf_json_strcmp(tokens + i, json_chunk, "KHR_texture_basisu") == 0)
+				{
+					out_texture->has_basisu = 1;
+					++i;
+					CGLTF_CHECK_TOKTYPE(tokens[i], JSMN_OBJECT);
+					int num_properties = tokens[i].size;
+					++i;
+
+					for (int t = 0; t < num_properties; ++t)
+					{
+						CGLTF_CHECK_KEY(tokens[i]);
+
+						if (cgltf_json_strcmp(tokens + i, json_chunk, "source") == 0)
+						{
+							++i;
+							out_texture->basisu_image = CGLTF_PTRINDEX(cgltf_image, cgltf_json_to_int(tokens + i, json_chunk));
+							++i;
+						}
+						else
+						{
+							i = cgltf_skip_json(tokens, i + 1);
+						}
+					}
+				}
+				else
+				{
+					i = cgltf_parse_json_unprocessed_extension(options, tokens, i, json_chunk, &(out_texture->extensions[out_texture->extensions_count++]));
+				}
+
+				if (i < 0)
+				{
+					return i;
+				}
+			}
 		}
 		else
 		{
@ -4192,19 +4353,19 @@ static int cgltf_parse_json_meshopt_compression(cgltf_options* options, jsmntok_
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "byteOffset") == 0)
 		{
 			++i;
-			out_meshopt_compression->offset = cgltf_json_to_int(tokens+i, json_chunk);
+			out_meshopt_compression->offset = cgltf_json_to_size(tokens+i, json_chunk);
 			++i;
 		}
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "byteLength") == 0)
 		{
 			++i;
-			out_meshopt_compression->size = cgltf_json_to_int(tokens+i, json_chunk);
+			out_meshopt_compression->size = cgltf_json_to_size(tokens+i, json_chunk);
 			++i;
 		}
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "byteStride") == 0)
 		{
 			++i;
-			out_meshopt_compression->stride = cgltf_json_to_int(tokens+i, json_chunk);
+			out_meshopt_compression->stride = cgltf_json_to_size(tokens+i, json_chunk);
 			++i;
 		}
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "count") == 0)
@ -4290,21 +4451,21 @@ static int cgltf_parse_json_buffer_view(cgltf_options* options, jsmntok_t const*
 		{
 			++i;
 			out_buffer_view->offset =
-					cgltf_json_to_int(tokens+i, json_chunk);
+					cgltf_json_to_size(tokens+i, json_chunk);
 			++i;
 		}
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "byteLength") == 0)
 		{
 			++i;
 			out_buffer_view->size =
-					cgltf_json_to_int(tokens+i, json_chunk);
+					cgltf_json_to_size(tokens+i, json_chunk);
 			++i;
 		}
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "byteStride") == 0)
 		{
 			++i;
 			out_buffer_view->stride =
-					cgltf_json_to_int(tokens+i, json_chunk);
+					cgltf_json_to_size(tokens+i, json_chunk);
 			++i;
 		}
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "target") == 0)
@ -4422,7 +4583,7 @@ static int cgltf_parse_json_buffer(cgltf_options* options, jsmntok_t const* toke
 		{
 			++i;
 			out_buffer->size =
-					cgltf_json_to_int(tokens+i, json_chunk);
+					cgltf_json_to_size(tokens+i, json_chunk);
 			++i;
 		}
 		else if (cgltf_json_strcmp(tokens+i, json_chunk, "uri") == 0)
@ -4737,6 +4898,14 @@ static int cgltf_parse_json_light(cgltf_options* options, jsmntok_t const* token
 {
 	CGLTF_CHECK_TOKTYPE(tokens[i], JSMN_OBJECT);

+	out_light->color[0] = 1.f;
+	out_light->color[1] = 1.f;
+	out_light->color[2] = 1.f;
+	out_light->intensity = 1.f;
+
+	out_light->spot_inner_cone_angle = 0.f;
+	out_light->spot_outer_cone_angle = 3.1415926535f / 4.0f;
+
 	int size = tokens[i].size;
 	++i;

@ -4817,6 +4986,10 @@ static int cgltf_parse_json_light(cgltf_options* options, jsmntok_t const* token
 				}
 			}
 		}
+		else if (cgltf_json_strcmp(tokens + i, json_chunk, "extras") == 0)
+		{
+			i = cgltf_parse_json_extras(tokens, i + 1, json_chunk, &out_light->extras);
+		}
 		else
 		{
 			i = cgltf_skip_json(tokens, i+1);
@ -5851,6 +6024,7 @@ static int cgltf_fixup_pointers(cgltf_data* data)
 	for (cgltf_size i = 0; i < data->textures_count; ++i)
 	{
 		CGLTF_PTRFIXUP(data->textures[i].image, data->images, data->images_count);
+		CGLTF_PTRFIXUP(data->textures[i].basisu_image, data->images, data->images_count);
 		CGLTF_PTRFIXUP(data->textures[i].sampler, data->samplers, data->samplers_count);
 	}

@ -6305,7 +6479,7 @@ static void jsmn_init(jsmn_parser *parser) {

 /* cgltf is distributed under MIT license:
 *
- * Copyright (c) 2018 Johannes Kuhlmann
+ * Copyright (c) 2018-2021 Johannes Kuhlmann

 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
--- a/src/external/dr_flac.h
+++ b/src/external/dr_flac.h
@ -1,6 +1,6 @@
 /*
 FLAC audio decoder. Choice of public domain or MIT-0. See license statements at the end of this file.
-dr_flac - v0.12.29 - 2021-04-02
+dr_flac - v0.12.31 - 2021-08-16

 David Reid - mackron@gmail.com

@ -232,7 +232,7 @@ extern "C" {

 #define DRFLAC_VERSION_MAJOR     0
 #define DRFLAC_VERSION_MINOR     12
-#define DRFLAC_VERSION_REVISION  29
+#define DRFLAC_VERSION_REVISION  31
 #define DRFLAC_VERSION_STRING    DRFLAC_XSTRINGIFY(DRFLAC_VERSION_MAJOR) "." DRFLAC_XSTRINGIFY(DRFLAC_VERSION_MINOR) "." DRFLAC_XSTRINGIFY(DRFLAC_VERSION_REVISION)

 #include <stddef.h> /* For size_t. */
@ -261,7 +261,7 @@ typedef unsigned int            drflac_uint32;
        #pragma GCC diagnostic pop
    #endif
 #endif
-#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__)
+#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
    typedef drflac_uint64       drflac_uintptr;
 #else
    typedef drflac_uint32       drflac_uintptr;
@ -11516,7 +11516,7 @@ static type* drflac__full_read_and_close_ ## extension (drflac* pFlac, unsigned
        DRFLAC_ZERO_MEMORY(pSampleData + (totalPCMFrameCount*pFlac->channels), (size_t)(sampleDataBufferSize - totalPCMFrameCount*pFlac->channels*sizeof(type)));   \
    } else {                                                                                                                                                        \
        drflac_uint64 dataSize = totalPCMFrameCount*pFlac->channels*sizeof(type);                                                                                   \
-        if (dataSize > DRFLAC_SIZE_MAX) {                                                                                                                           \
+        if (dataSize > (drflac_uint64)DRFLAC_SIZE_MAX) {                                                                                                            \
            goto on_error;  /* The decoded data is too big. */                                                                                                      \
        }                                                                                                                                                           \
                                                                                                                                                                    \
@ -11851,6 +11851,12 @@ DRFLAC_API drflac_bool32 drflac_next_cuesheet_track(drflac_cuesheet_track_iterat
 /*
 REVISION HISTORY
 ================
+v0.12.31 - 2021-08-16
+  - Silence some warnings.
+
+v0.12.30 - 2021-07-31
+  - Fix platform detection for ARM64.
+
 v0.12.29 - 2021-04-02
  - Fix a bug where the running PCM frame index is set to an invalid value when over-seeking.
  - Fix a decoding error due to an incorrect validation check.
--- a/src/external/dr_mp3.h
+++ b/src/external/dr_mp3.h
@ -1,6 +1,6 @@
 /*
 MP3 audio decoder. Choice of public domain or MIT-0. See license statements at the end of this file.
-dr_mp3 - v0.6.27 - 2021-02-21
+dr_mp3 - v0.6.31 - 2021-08-22

 David Reid - mackron@gmail.com

@ -95,7 +95,7 @@ extern "C" {

 #define DRMP3_VERSION_MAJOR     0
 #define DRMP3_VERSION_MINOR     6
-#define DRMP3_VERSION_REVISION  27
+#define DRMP3_VERSION_REVISION  31
 #define DRMP3_VERSION_STRING    DRMP3_XSTRINGIFY(DRMP3_VERSION_MAJOR) "." DRMP3_XSTRINGIFY(DRMP3_VERSION_MINOR) "." DRMP3_XSTRINGIFY(DRMP3_VERSION_REVISION)

 #include <stddef.h> /* For size_t. */
@ -124,7 +124,7 @@ typedef unsigned int            drmp3_uint32;
        #pragma GCC diagnostic pop
    #endif
 #endif
-#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__)
+#if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__)) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
    typedef drmp3_uint64        drmp3_uintptr;
 #else
    typedef drmp3_uint32        drmp3_uintptr;
@ -701,7 +701,7 @@ static int drmp3_have_simd(void)

 #if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64)
 #define DRMP3_HAVE_ARMV6 1
-static __inline__ __attribute__((always_inline)) drmp3_int32 drmp3_clip_int16_arm(int32_t a)
+static __inline__ __attribute__((always_inline)) drmp3_int32 drmp3_clip_int16_arm(drmp3_int32 a)
 {
    drmp3_int32 x = 0;
    __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a));
@ -712,6 +712,31 @@ static __inline__ __attribute__((always_inline)) drmp3_int32 drmp3_clip_int16_ar
 #endif


+/* Standard library stuff. */
+#ifndef DRMP3_ASSERT
+#include <assert.h>
+#define DRMP3_ASSERT(expression) assert(expression)
+#endif
+#ifndef DRMP3_COPY_MEMORY
+#define DRMP3_COPY_MEMORY(dst, src, sz) memcpy((dst), (src), (sz))
+#endif
+#ifndef DRMP3_MOVE_MEMORY
+#define DRMP3_MOVE_MEMORY(dst, src, sz) memmove((dst), (src), (sz))
+#endif
+#ifndef DRMP3_ZERO_MEMORY
+#define DRMP3_ZERO_MEMORY(p, sz) memset((p), 0, (sz))
+#endif
+#define DRMP3_ZERO_OBJECT(p) DRMP3_ZERO_MEMORY((p), sizeof(*(p)))
+#ifndef DRMP3_MALLOC
+#define DRMP3_MALLOC(sz) malloc((sz))
+#endif
+#ifndef DRMP3_REALLOC
+#define DRMP3_REALLOC(p, sz) realloc((p), (sz))
+#endif
+#ifndef DRMP3_FREE
+#define DRMP3_FREE(p) free((p))
+#endif
+
 typedef struct
 {
    const drmp3_uint8 *buf;
@ -978,7 +1003,7 @@ static int drmp3_L12_dequantize_granule(float *grbuf, drmp3_bs *bs, drmp3_L12_sc
 static void drmp3_L12_apply_scf_384(drmp3_L12_scale_info *sci, const float *scf, float *dst)
 {
    int i, k;
-    memcpy(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float));
+    DRMP3_COPY_MEMORY(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float));
    for (i = 0; i < sci->total_bands; i++, dst += 18, scf += 6)
    {
        for (k = 0; k < 12; k++)
@ -1123,14 +1148,14 @@ static void drmp3_L3_read_scalefactors(drmp3_uint8 *scf, drmp3_uint8 *ist_pos, c
        int cnt = scf_count[i];
        if (scfsi & 8)
        {
-            memcpy(scf, ist_pos, cnt);
+            DRMP3_COPY_MEMORY(scf, ist_pos, cnt);
        } else
        {
            int bits = scf_size[i];
            if (!bits)
            {
-                memset(scf, 0, cnt);
-                memset(ist_pos, 0, cnt);
+                DRMP3_ZERO_MEMORY(scf, cnt);
+                DRMP3_ZERO_MEMORY(ist_pos, cnt);
            } else
            {
                int max_scf = (scfsi < 0) ? (1 << bits) - 1 : -1;
@ -1390,12 +1415,22 @@ static void drmp3_L3_midside_stereo(float *left, int n)
    int i = 0;
    float *right = left + 576;
 #if DRMP3_HAVE_SIMD
-    if (drmp3_have_simd()) for (; i < n - 3; i += 4)
+    if (drmp3_have_simd())
    {
-        drmp3_f4 vl = DRMP3_VLD(left + i);
-        drmp3_f4 vr = DRMP3_VLD(right + i);
-        DRMP3_VSTORE(left + i, DRMP3_VADD(vl, vr));
-        DRMP3_VSTORE(right + i, DRMP3_VSUB(vl, vr));
+        for (; i < n - 3; i += 4)
+        {
+            drmp3_f4 vl = DRMP3_VLD(left + i);
+            drmp3_f4 vr = DRMP3_VLD(right + i);
+            DRMP3_VSTORE(left + i, DRMP3_VADD(vl, vr));
+            DRMP3_VSTORE(right + i, DRMP3_VSUB(vl, vr));
+        }
+#ifdef __GNUC__
+        /* Workaround for spurious -Waggressive-loop-optimizations warning from gcc.
+         * For more info see: https://github.com/lieff/minimp3/issues/88
+         */
+        if (__builtin_constant_p(n % 4 == 0) && n % 4 == 0)
+            return;
+#endif
    }
 #endif
    for (; i < n; i++)
@ -1505,7 +1540,7 @@ static void drmp3_L3_reorder(float *grbuf, float *scratch, const drmp3_uint8 *sf
            *dst++ = src[2*len];
        }
    }
-    memcpy(grbuf, scratch, (dst - scratch)*sizeof(float));
+    DRMP3_COPY_MEMORY(grbuf, scratch, (dst - scratch)*sizeof(float));
 }

 static void drmp3_L3_antialias(float *grbuf, int nbands)
@ -1674,8 +1709,8 @@ static void drmp3_L3_imdct_short(float *grbuf, float *overlap, int nbands)
    for (;nbands > 0; nbands--, overlap += 9, grbuf += 18)
    {
        float tmp[18];
-        memcpy(tmp, grbuf, sizeof(tmp));
-        memcpy(grbuf, overlap, 6*sizeof(float));
+        DRMP3_COPY_MEMORY(tmp, grbuf, sizeof(tmp));
+        DRMP3_COPY_MEMORY(grbuf, overlap, 6*sizeof(float));
        drmp3_L3_imdct12(tmp, grbuf + 6, overlap + 6);
        drmp3_L3_imdct12(tmp + 1, grbuf + 12, overlap + 6);
        drmp3_L3_imdct12(tmp + 2, overlap, overlap + 6);
@ -1719,7 +1754,7 @@ static void drmp3_L3_save_reservoir(drmp3dec *h, drmp3dec_scratch *s)
    }
    if (remains > 0)
    {
-        memmove(h->reserv_buf, s->maindata + pos, remains);
+        DRMP3_MOVE_MEMORY(h->reserv_buf, s->maindata + pos, remains);
    }
    h->reserv = remains;
 }
@ -1728,8 +1763,8 @@ static int drmp3_L3_restore_reservoir(drmp3dec *h, drmp3_bs *bs, drmp3dec_scratc
 {
    int frame_bytes = (bs->limit - bs->pos)/8;
    int bytes_have = DRMP3_MIN(h->reserv, main_data_begin);
-    memcpy(s->maindata, h->reserv_buf + DRMP3_MAX(0, h->reserv - main_data_begin), DRMP3_MIN(h->reserv, main_data_begin));
-    memcpy(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes);
+    DRMP3_COPY_MEMORY(s->maindata, h->reserv_buf + DRMP3_MAX(0, h->reserv - main_data_begin), DRMP3_MIN(h->reserv, main_data_begin));
+    DRMP3_COPY_MEMORY(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes);
    drmp3_bs_init(&s->bs, s->maindata, bytes_have + frame_bytes);
    return h->reserv >= main_data_begin;
 }
@ -2136,7 +2171,7 @@ static void drmp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int
        drmp3d_DCT_II(grbuf + 576*i, nbands);
    }

-    memcpy(lins, qmf_state, sizeof(float)*15*64);
+    DRMP3_COPY_MEMORY(lins, qmf_state, sizeof(float)*15*64);

    for (i = 0; i < nbands; i += 2)
    {
@ -2152,7 +2187,7 @@ static void drmp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int
    } else
 #endif
    {
-        memcpy(qmf_state, lins + nbands*64, sizeof(float)*15*64);
+        DRMP3_COPY_MEMORY(qmf_state, lins + nbands*64, sizeof(float)*15*64);
    }
 }

@ -2230,7 +2265,7 @@ DRMP3_API int drmp3dec_decode_frame(drmp3dec *dec, const drmp3_uint8 *mp3, int m
    }
    if (!frame_size)
    {
-        memset(dec, 0, sizeof(drmp3dec));
+        DRMP3_ZERO_MEMORY(dec, sizeof(drmp3dec));
        i = drmp3d_find_frame(mp3, mp3_bytes, &dec->free_format_bytes, &frame_size);
        if (!frame_size || i + frame_size > mp3_bytes)
        {
@ -2240,7 +2275,7 @@ DRMP3_API int drmp3dec_decode_frame(drmp3dec *dec, const drmp3_uint8 *mp3, int m
    }

    hdr = mp3 + i;
-    memcpy(dec->header, hdr, DRMP3_HDR_SIZE);
+    DRMP3_COPY_MEMORY(dec->header, hdr, DRMP3_HDR_SIZE);
    info->frame_bytes = i + frame_size;
    info->channels = DRMP3_HDR_IS_MONO(hdr) ? 1 : 2;
    info->hz = drmp3_hdr_sample_rate_hz(hdr);
@ -2266,7 +2301,7 @@ DRMP3_API int drmp3dec_decode_frame(drmp3dec *dec, const drmp3_uint8 *mp3, int m
        {
            for (igr = 0; igr < (DRMP3_HDR_TEST_MPEG1(hdr) ? 2 : 1); igr++, pcm = DRMP3_OFFSET_PTR(pcm, sizeof(drmp3d_sample_t)*576*info->channels))
            {
-                memset(scratch.grbuf[0], 0, 576*2*sizeof(float));
+                DRMP3_ZERO_MEMORY(scratch.grbuf[0], 576*2*sizeof(float));
                drmp3_L3_decode(dec, &scratch, scratch.gr_info + igr*info->channels, info->channels);
                drmp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 18, info->channels, (drmp3d_sample_t*)pcm, scratch.syn[0]);
            }
@ -2285,7 +2320,7 @@ DRMP3_API int drmp3dec_decode_frame(drmp3dec *dec, const drmp3_uint8 *mp3, int m

        drmp3_L12_read_scale_info(hdr, bs_frame, sci);

-        memset(scratch.grbuf[0], 0, 576*2*sizeof(float));
+        DRMP3_ZERO_MEMORY(scratch.grbuf[0], 576*2*sizeof(float));
        for (i = 0, igr = 0; igr < 3; igr++)
        {
            if (12 == (i += drmp3_L12_dequantize_granule(scratch.grbuf[0] + i, bs_frame, sci, info->layer | 1)))
@ -2293,7 +2328,7 @@ DRMP3_API int drmp3dec_decode_frame(drmp3dec *dec, const drmp3_uint8 *mp3, int m
                i = 0;
                drmp3_L12_apply_scf_384(sci, sci->scf + igr, scratch.grbuf[0]);
                drmp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 12, info->channels, (drmp3d_sample_t*)pcm, scratch.syn[0]);
-                memset(scratch.grbuf[0], 0, 576*2*sizeof(float));
+                DRMP3_ZERO_MEMORY(scratch.grbuf[0], 576*2*sizeof(float));
                pcm = DRMP3_OFFSET_PTR(pcm, sizeof(drmp3d_sample_t)*384*info->channels);
            }
            if (bs_frame->pos > bs_frame->limit)
@ -2396,28 +2431,6 @@ DRMP3_API void drmp3dec_f32_to_s16(const float *in, drmp3_int16 *out, size_t num
 #endif


-/* Standard library stuff. */
-#ifndef DRMP3_ASSERT
-#include <assert.h>
-#define DRMP3_ASSERT(expression) assert(expression)
-#endif
-#ifndef DRMP3_COPY_MEMORY
-#define DRMP3_COPY_MEMORY(dst, src, sz) memcpy((dst), (src), (sz))
-#endif
-#ifndef DRMP3_ZERO_MEMORY
-#define DRMP3_ZERO_MEMORY(p, sz) memset((p), 0, (sz))
-#endif
-#define DRMP3_ZERO_OBJECT(p) DRMP3_ZERO_MEMORY((p), sizeof(*(p)))
-#ifndef DRMP3_MALLOC
-#define DRMP3_MALLOC(sz) malloc((sz))
-#endif
-#ifndef DRMP3_REALLOC
-#define DRMP3_REALLOC(p, sz) realloc((p), (sz))
-#endif
-#ifndef DRMP3_FREE
-#define DRMP3_FREE(p) free((p))
-#endif
-
 #define DRMP3_COUNTOF(x)        (sizeof(x) / sizeof(x[0]))
 #define DRMP3_CLAMP(x, lo, hi)  (DRMP3_MAX(lo, DRMP3_MIN(x, hi)))

@ -2649,7 +2662,7 @@ static drmp3_uint32 drmp3_decode_next_frame_ex__callbacks(drmp3* pMP3, drmp3d_sa

            /* First we need to move the data down. */
            if (pMP3->pData != NULL) {
-                memmove(pMP3->pData, pMP3->pData + pMP3->dataConsumed, pMP3->dataSize);
+                DRMP3_MOVE_MEMORY(pMP3->pData, pMP3->pData + pMP3->dataConsumed, pMP3->dataSize);
            }

            pMP3->dataConsumed = 0;
@ -2709,7 +2722,7 @@ static drmp3_uint32 drmp3_decode_next_frame_ex__callbacks(drmp3* pMP3, drmp3d_sa
            size_t bytesRead;

            /* First we need to move the data down. */
-            memmove(pMP3->pData, pMP3->pData + pMP3->dataConsumed, pMP3->dataSize);
+            DRMP3_MOVE_MEMORY(pMP3->pData, pMP3->pData + pMP3->dataConsumed, pMP3->dataSize);
            pMP3->dataConsumed = 0;

            if (pMP3->dataCapacity == pMP3->dataSize) {
@ -2754,12 +2767,22 @@ static drmp3_uint32 drmp3_decode_next_frame_ex__memory(drmp3* pMP3, drmp3d_sampl
        return 0;
    }

-    pcmFramesRead = drmp3dec_decode_frame(&pMP3->decoder, pMP3->memory.pData + pMP3->memory.currentReadPos, (int)(pMP3->memory.dataSize - pMP3->memory.currentReadPos), pPCMFrames, &info);
-    if (pcmFramesRead > 0) {
-        pMP3->pcmFramesConsumedInMP3Frame  = 0;
-        pMP3->pcmFramesRemainingInMP3Frame = pcmFramesRead;
-        pMP3->mp3FrameChannels             = info.channels;
-        pMP3->mp3FrameSampleRate           = info.hz;
+    for (;;) {
+        pcmFramesRead = drmp3dec_decode_frame(&pMP3->decoder, pMP3->memory.pData + pMP3->memory.currentReadPos, (int)(pMP3->memory.dataSize - pMP3->memory.currentReadPos), pPCMFrames, &info);
+        if (pcmFramesRead > 0) {
+            pcmFramesRead = drmp3_hdr_frame_samples(pMP3->decoder.header);
+            pMP3->pcmFramesConsumedInMP3Frame  = 0;
+            pMP3->pcmFramesRemainingInMP3Frame = pcmFramesRead;
+            pMP3->mp3FrameChannels             = info.channels;
+            pMP3->mp3FrameSampleRate           = info.hz;
+            break;
+        } else if (info.frame_bytes > 0) {
+            /* No frames were read, but it looks like we skipped past one. Read the next MP3 frame. */
+            pMP3->memory.currentReadPos += (size_t)info.frame_bytes;
+        } else {
+            /* Nothing at all was read. Abort. */
+            break;
+        }
    }

    /* Consume the data. */
@ -2822,7 +2845,7 @@ static drmp3_bool32 drmp3_init_internal(drmp3* pMP3, drmp3_read_proc onRead, drm
    }

    /* Decode the first frame to confirm that it is indeed a valid MP3 stream. */
-    if (!drmp3_decode_next_frame(pMP3)) {
+    if (drmp3_decode_next_frame(pMP3) == 0) {
        drmp3__free_from_callbacks(pMP3->pData, &pMP3->allocationCallbacks);    /* The call above may have allocated memory. Need to make sure it's freed before aborting. */
        return DRMP3_FALSE; /* Not a valid MP3 stream. */
    }
@ -4177,7 +4200,7 @@ static float* drmp3__full_read_and_close_f32(drmp3* pMP3, drmp3_config* pConfig,

            oldFramesBufferSize = framesCapacity * pMP3->channels * sizeof(float);
            newFramesBufferSize = newFramesCap   * pMP3->channels * sizeof(float);
-            if (newFramesBufferSize > DRMP3_SIZE_MAX) {
+            if (newFramesBufferSize > (drmp3_uint64)DRMP3_SIZE_MAX) {
                break;
            }

@ -4244,7 +4267,7 @@ static drmp3_int16* drmp3__full_read_and_close_s16(drmp3* pMP3, drmp3_config* pC

            oldFramesBufferSize = framesCapacity * pMP3->channels * sizeof(drmp3_int16);
            newFramesBufferSize = newFramesCap   * pMP3->channels * sizeof(drmp3_int16);
-            if (newFramesBufferSize > DRMP3_SIZE_MAX) {
+            if (newFramesBufferSize > (drmp3_uint64)DRMP3_SIZE_MAX) {
                break;
            }

@ -4450,6 +4473,20 @@ counts rather than sample counts.
 /*
 REVISION HISTORY
 ================
+v0.6.31 - 2021-08-22
+  - Fix a bug when loading from memory.
+
+v0.6.30 - 2021-08-16
+  - Silence some warnings.
+  - Replace memory operations with DRMP3_* macros.
+
+v0.6.29 - 2021-08-08
+  - Bring up to date with minimp3.
+
+v0.6.28 - 2021-07-31
+  - Fix platform detection for ARM64.
+  - Fix a compilation error with C89.
+
 v0.6.27 - 2021-02-21
  - Fix a warning due to referencing _MSC_VER when it is undefined.

--- a/src/external/dr_wav.h
+++ b/src/external/dr_wav.h
--- a/src/external/msf_gif.h
+++ b/src/external/msf_gif.h
@ -13,19 +13,31 @@ USAGE EXAMPLE:

    int width = 480, height = 320, centisecondsPerFrame = 5, bitDepth = 16;
    MsfGifState gifState = {};
+    // msf_gif_bgra_flag = true; //optionally, set this flag if your pixels are in BGRA format instead of RGBA
+    // msf_gif_alpha_threshold = 128; //optionally, enable transparency (see function documentation below for details)
    msf_gif_begin(&gifState, width, height);
    msf_gif_frame(&gifState, ..., centisecondsPerFrame, bitDepth, width * 4); //frame 1
    msf_gif_frame(&gifState, ..., centisecondsPerFrame, bitDepth, width * 4); //frame 2
    msf_gif_frame(&gifState, ..., centisecondsPerFrame, bitDepth, width * 4); //frame 3, etc...
    MsfGifResult result = msf_gif_end(&gifState);
-    FILE * fp = fopen("MyGif.gif", "wb");
-    fwrite(result.data, result.dataSize, 1, fp);
-    fclose(fp);
+    if (result.data) {
+        FILE * fp = fopen("MyGif.gif", "wb");
+        fwrite(result.data, result.dataSize, 1, fp);
+        fclose(fp);
+    }
    msf_gif_free(result);

 Detailed function documentation can be found in the header section below.


+ERROR HANDLING:
+
+    If memory allocation fails, the functions will signal the error via their return values.
+    If one function call fails, the library will free all of its allocations,
+    and all subsequent calls will safely no-op and return 0 until the next call to `msf_gif_begin()`.
+    Therefore, it's safe to check only the return value of `msf_gif_end()`.
+
+
 REPLACING MALLOC:

    This library uses malloc+realloc+free internally for memory allocation.
@ -39,10 +51,20 @@ REPLACING MALLOC:
    If your allocator needs a context pointer, you can set the `customAllocatorContext` field of the MsfGifState struct
    before calling msf_gif_begin(), and it will be passed to all subsequent allocator macro calls.

+    The maximum number of bytes the library will allocate to encode a single gif is bounded by the following formula:
+    `(2 * 1024 * 1024) + (width * height * 8) + ((1024 + width * height * 1.5) * 3 * frameCount)`
+    The peak heap memory usage in bytes, if using a general-purpose heap allocator, is bounded by the following formula:
+    `(2 * 1024 * 1024) + (width * height * 9.5) + 1024 + (16 * frameCount) + (2 * sizeOfResultingGif)`
+
+
 See end of file for license information.
 */

-//version 2.1
+//version 2.2
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// HEADER                                                                                                           ///
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

 #ifndef MSF_GIF_H
 #define MSF_GIF_H
@ -63,12 +85,24 @@ typedef struct { //internal use
    int depth, count, rbits, gbits, bbits;
 } MsfCookedFrame;

+typedef struct MsfGifBuffer {
+    struct MsfGifBuffer * next;
+    size_t size;
+    uint8_t data[1];
+} MsfGifBuffer;
+
+typedef size_t (* MsfGifFileWriteFunc) (const void * buffer, size_t size, size_t count, void * stream);
 typedef struct {
+    MsfGifFileWriteFunc fileWriteFunc;
+    void * fileWriteData;
    MsfCookedFrame previousFrame;
-    uint8_t * listHead;
-    uint8_t * listTail;
+    MsfCookedFrame currentFrame;
+    int16_t * lzwMem;
+    MsfGifBuffer * listHead;
+    MsfGifBuffer * listTail;
    int width, height;
    void * customAllocatorContext;
+    int framesSubmitted; //needed for transparency to work correctly (because we reach into the previous frame)
 } MsfGifState;

 #ifdef __cplusplus
@ -83,7 +117,8 @@ extern "C" {
 int msf_gif_begin(MsfGifState * handle, int width, int height);

 /**
- * @param pixelData            Pointer to raw framebuffer data. Rows must be contiguous in memory, in RGBA8 format.
+ * @param pixelData            Pointer to raw framebuffer data. Rows must be contiguous in memory, in RGBA8 format
+ *                             (or BGRA8 if you have set `msf_gif_bgra_flag = true`).
 *                             Note: This function does NOT free `pixelData`. You must free it yourself afterwards.
 * @param centiSecondsPerFrame How many hundredths of a second this frame should be displayed for.
 *                             Note: This being specified in centiseconds is a limitation of the GIF format.
@ -111,6 +146,35 @@ MsfGifResult msf_gif_end(MsfGifState * handle);
 */
 void msf_gif_free(MsfGifResult result);

+//The gif format only supports 1-bit transparency, meaning a pixel will either be fully transparent or fully opaque.
+//Pixels with an alpha value less than the alpha threshold will be treated as transparent.
+//To enable exporting transparent gifs, set it to a value between 1 and 255 (inclusive) before calling msf_gif_frame().
+//Setting it to 0 causes the alpha channel to be ignored. Its initial value is 0.
+extern int msf_gif_alpha_threshold;
+
+//Set `msf_gif_bgra_flag = true` before calling `msf_gif_frame()` if your pixels are in BGRA byte order instead of RGBA.
+extern int msf_gif_bgra_flag;
+
+
+
+//TO-FILE FUNCTIONS
+//These functions are equivalent to the ones above, but they write results to a file incrementally,
+//instead of building a buffer in memory. This can result in lower memory usage when saving large gifs,
+//because memory usage is bounded by only the size of a single frame, and is not dependent on the number of frames.
+//There is currently no reason to use these unless you are on a memory-constrained platform.
+//If in doubt about which API to use, for now you should use the normal (non-file) functions above.
+//The signature of MsfGifFileWriteFunc matches fwrite for convenience, so that you can use the C file API like so:
+//  FILE * fp = fopen("MyGif.gif", "wb");
+//  msf_gif_begin_to_file(&handle, width, height, (MsfGifFileWriteFunc) fwrite, (void *) fp);
+//  msf_gif_frame_to_file(...)
+//  msf_gif_end_to_file(&handle);
+//  fclose(fp);
+//If you use a custom file write function, you must take care to return the same values that fwrite() would return.
+//Note that all three functions will potentially write to the file.
+int msf_gif_begin_to_file(MsfGifState * handle, int width, int height, MsfGifFileWriteFunc func, void * filePointer);
+int msf_gif_frame_to_file(MsfGifState * handle, uint8_t * pixelData, int centiSecondsPerFame, int maxBitDepth, int pitchInBytes);
+int msf_gif_end_to_file(MsfGifState * handle); //returns 0 on error and non-zero on success
+
 #ifdef __cplusplus
 }
 #endif //__cplusplus
@ -125,10 +189,6 @@ void msf_gif_free(MsfGifResult result);
 #ifndef MSF_GIF_ALREADY_IMPLEMENTED_IN_THIS_TRANSLATION_UNIT
 #define MSF_GIF_ALREADY_IMPLEMENTED_IN_THIS_TRANSLATION_UNIT

-#ifndef MSF_GIF_BUFFER_INIT_SIZE
-#define MSF_GIF_BUFFER_INIT_SIZE 1024 * 1024 * 4 //4MB by default, you can increase this if you want to realloc less
-#endif
-
 //ensure the library user has either defined all of malloc/realloc/free, or none
 #if defined(MSF_GIF_MALLOC) && defined(MSF_GIF_REALLOC) && defined(MSF_GIF_FREE) //ok
 #elif !defined(MSF_GIF_MALLOC) && !defined(MSF_GIF_REALLOC) && !defined(MSF_GIF_FREE) //ok
@ -189,13 +249,21 @@ static inline int msf_imax(int a, int b) { return b < a? a : b; }
 #include <emmintrin.h>
 #endif

-static MsfCookedFrame msf_cook_frame(void * allocContext, uint8_t * raw, uint8_t * used,
-                                     int width, int height, int pitch, int depth)
+int msf_gif_alpha_threshold = 0;
+int msf_gif_bgra_flag = 0;
+
+static void msf_cook_frame(MsfCookedFrame * frame, uint8_t * raw, uint8_t * used,
+                           int width, int height, int pitch, int depth)
 { MsfTimeFunc
    //bit depth for each channel
-    const static int rdepths[17] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5 };
-    const static int gdepths[17] = { 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6 };
-    const static int bdepths[17] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5 };
+    const static int rdepthsArray[17] = { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5 };
+    const static int gdepthsArray[17] = { 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6 };
+    const static int bdepthsArray[17] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5 };
+    //this extra level of indirection looks unnecessary but we need to explicitly decay the arrays to pointers
+    //in order to be able to swap them because of C's annoying not-quite-pointers, not-quite-value-types stack arrays.
+    const int * rdepths = msf_gif_bgra_flag? bdepthsArray : rdepthsArray;
+    const int * gdepths =                                   gdepthsArray;
+    const int * bdepths = msf_gif_bgra_flag? rdepthsArray : bdepthsArray;

    const static int ditherKernel[16] = {
         0 << 12,  8 << 12,  2 << 12, 10 << 12,
@ -204,13 +272,11 @@ static MsfCookedFrame msf_cook_frame(void * allocContext, uint8_t * raw, uint8_t
        15 << 12,  7 << 12, 13 << 12,  5 << 12,
    };

-    uint32_t * cooked = (uint32_t *) MSF_GIF_MALLOC(allocContext, width * height * sizeof(uint32_t));
-    if (!cooked) { MsfCookedFrame blank = {0}; return blank; }
-
+    uint32_t * cooked = frame->pixels;
    int count = 0;
    MsfTimeLoop("do") do {
        int rbits = rdepths[depth], gbits = gdepths[depth], bbits = bdepths[depth];
-        int paletteSize = 1 << (rbits + gbits + bbits);
+        int paletteSize = (1 << (rbits + gbits + bbits)) + 1;
        memset(used, 0, paletteSize * sizeof(uint8_t));

        //TODO: document what this math does and why it's correct
@ -230,7 +296,6 @@ static MsfCookedFrame msf_cook_frame(void * allocContext, uint8_t * raw, uint8_t
            #if (defined (__SSE2__) || defined (_M_X64) || _M_IX86_FP == 2) && !defined(MSF_GIF_NO_SSE2)
                __m128i k = _mm_loadu_si128((__m128i *) &ditherKernel[(y & 3) * 4]);
                __m128i k2 = _mm_or_si128(_mm_srli_epi32(k, rbits), _mm_slli_epi32(_mm_srli_epi32(k, bbits), 16));
-                // MsfTimeLoop("SIMD")
                for (; x < width - 3; x += 4) {
                    uint8_t * pixels = &raw[y * pitch + x * 4];
                    __m128i p = _mm_loadu_si128((__m128i *) pixels);
@ -246,17 +311,30 @@ static MsfCookedFrame msf_cook_frame(void * allocContext, uint8_t * raw, uint8_t
                    __m128i g2 = _mm_adds_epu16(g1, _mm_srli_epi32(k, gbits));
                    __m128i g3 = _mm_and_si128(_mm_srli_epi32(g2, 16 - rbits - gbits), _mm_set1_epi32(gmask));

+                    __m128i out = _mm_or_si128(_mm_or_si128(r3, g3), b3);
+
+                    //mask in transparency based on threshold
+                    //NOTE: we can theoretically do a sub instead of srli by doing an unsigned compare via bias
+                    //      to maybe save a TINY amount of throughput? but lol who cares maybe I'll do it later -m
+                    __m128i invAlphaMask = _mm_cmplt_epi32(_mm_srli_epi32(p, 24), _mm_set1_epi32(msf_gif_alpha_threshold));
+                    out = _mm_or_si128(_mm_and_si128(invAlphaMask, _mm_set1_epi32(paletteSize - 1)), _mm_andnot_si128(invAlphaMask, out));
+
                    //TODO: does storing this as a __m128i then reading it back as a uint32_t violate strict aliasing?
                    uint32_t * c = &cooked[y * width + x];
-                    __m128i out = _mm_or_si128(_mm_or_si128(r3, g3), b3);
                    _mm_storeu_si128((__m128i *) c, out);
                }
            #endif

            //scalar cleanup loop
-            // MsfTimeLoop("scalar")
            for (; x < width; ++x) {
                uint8_t * p = &raw[y * pitch + x * 4];
+
+                //transparent pixel if alpha is low
+                if (p[3] < msf_gif_alpha_threshold) {
+                    cooked[y * width + x] = paletteSize - 1;
+                    continue;
+                }
+
                int dx = x & 3, dy = y & 3;
                int k = ditherKernel[dy * 4 + dx];
                cooked[y * width + x] =
@ -267,30 +345,25 @@ static MsfCookedFrame msf_cook_frame(void * allocContext, uint8_t * raw, uint8_t
        }

        count = 0;
-        MsfTimeLoop("mark and count") for (int i = 0; i < width * height; ++i) {
+        MsfTimeLoop("mark") for (int i = 0; i < width * height; ++i) {
            used[cooked[i]] = 1;
        }

-        //count used colors
-        MsfTimeLoop("count") for (int j = 0; j < paletteSize; ++j) {
+        //count used colors, transparent is ignored
+        MsfTimeLoop("count") for (int j = 0; j < paletteSize - 1; ++j) {
            count += used[j];
        }
    } while (count >= 256 && --depth);

    MsfCookedFrame ret = { cooked, depth, count, rdepths[depth], gdepths[depth], bdepths[depth] };
-    return ret;
+    *frame = ret;
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /// Frame Compression                                                                                                ///
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

-typedef struct {
-    uint8_t * next;
-    size_t size;
-} MsfBufferHeader;
-
-static inline int msf_put_code(uint8_t * * writeHead, uint32_t * blockBits, int len, uint32_t code) {
+static inline void msf_put_code(uint8_t * * writeHead, uint32_t * blockBits, int len, uint32_t code) {
    //insert new code into block buffer
    int idx = *blockBits / 8;
    int bit = *blockBits % 8;
@ -308,8 +381,6 @@ static inline int msf_put_code(uint8_t * * writeHead, uint32_t * blockBits, int
        (*writeHead)[0] = 255;
        memset((*writeHead) + 4, 0, 256);
    }
-
-    return 1;
 }

 typedef struct {
@ -324,30 +395,29 @@ static inline void msf_lzw_reset(MsfStridedList * lzw, int tableSize, int stride
    lzw->stride = stride;
 }

-static uint8_t * msf_compress_frame(void * allocContext, int width, int height, int centiSeconds,
-                                    MsfCookedFrame frame, MsfCookedFrame previous, uint8_t * used)
+static MsfGifBuffer * msf_compress_frame(void * allocContext, int width, int height, int centiSeconds,
+                                         MsfCookedFrame frame, MsfGifState * handle, uint8_t * used, int16_t * lzwMem)
 { MsfTimeFunc
    //NOTE: we reserve enough memory for theoretical the worst case upfront because it's a reasonable amount,
    //      and prevents us from ever having to check size or realloc during compression
-    int maxBufSize = sizeof(MsfBufferHeader) + 32 + 256 * 3 + width * height * 3 / 2; //headers + color table + data
-    uint8_t * allocation = (uint8_t *) MSF_GIF_MALLOC(allocContext, maxBufSize);
-    if (!allocation) { return NULL; }
-    uint8_t * writeBase = allocation + sizeof(MsfBufferHeader);
-    uint8_t * writeHead = writeBase;
-    int lzwAllocSize = 4096 * (frame.count + 1) * sizeof(int16_t);
-    MsfStridedList lzw = { (int16_t *) MSF_GIF_MALLOC(allocContext, lzwAllocSize) };
-    if (!lzw.data) { MSF_GIF_FREE(allocContext, allocation, maxBufSize); return NULL; }
+    int maxBufSize = offsetof(MsfGifBuffer, data) + 32 + 256 * 3 + width * height * 3 / 2; //headers + color table + data
+    MsfGifBuffer * buffer = (MsfGifBuffer *) MSF_GIF_MALLOC(allocContext, maxBufSize);
+    if (!buffer) { return NULL; }
+    uint8_t * writeHead = buffer->data;
+    MsfStridedList lzw = { lzwMem };

    //allocate tlb
    int totalBits = frame.rbits + frame.gbits + frame.bbits;
-    int tlbSize = 1 << totalBits;
-    uint8_t tlb[1 << 16]; //only 64k, so stack allocating is fine
+    int tlbSize = (1 << totalBits) + 1;
+    uint8_t tlb[(1 << 16) + 1]; //only 64k, so stack allocating is fine

    //generate palette
    typedef struct { uint8_t r, g, b; } Color3;
    Color3 table[256] = { {0} };
    int tableIdx = 1; //we start counting at 1 because 0 is the transparent color
-    MsfTimeLoop("table") for (int i = 0; i < tlbSize; ++i) {
+    //transparent is always last in the table
+    tlb[tlbSize-1] = 0;
+    MsfTimeLoop("table") for (int i = 0; i < tlbSize-1; ++i) {
        if (used[i]) {
            tlb[i] = tableIdx;
            int rmask = (1 << frame.rbits) - 1;
@ -363,19 +433,32 @@ static uint8_t * msf_compress_frame(void * allocContext, int width, int height,
            table[tableIdx].r = r | r >> frame.rbits | r >> (frame.rbits * 2) | r >> (frame.rbits * 3);
            table[tableIdx].g = g | g >> frame.gbits | g >> (frame.gbits * 2) | g >> (frame.gbits * 3);
            table[tableIdx].b = b | b >> frame.bbits | b >> (frame.bbits * 2) | b >> (frame.bbits * 3);
+            if (msf_gif_bgra_flag) {
+                uint8_t temp = table[tableIdx].r;
+                table[tableIdx].r = table[tableIdx].b;
+                table[tableIdx].b = temp;
+            }
            ++tableIdx;
        }
    }
+    int hasTransparentPixels = used[tlbSize-1];

    //SPEC: "Because of some algorithmic constraints however, black & white images which have one color bit
    //       must be indicated as having a code size of 2."
    int tableBits = msf_imax(2, msf_bit_log(tableIdx - 1));
    int tableSize = 1 << tableBits;
    //NOTE: we don't just compare `depth` field here because it will be wrong for the first frame and we will segfault
+    MsfCookedFrame previous = handle->previousFrame;
    int hasSamePal = frame.rbits == previous.rbits && frame.gbits == previous.gbits && frame.bbits == previous.bbits;
+    int framesCompatible = hasSamePal && !hasTransparentPixels;

    //NOTE: because __attribute__((__packed__)) is annoyingly compiler-specific, we do this unreadable weirdness
    char headerBytes[19] = "\x21\xF9\x04\x05\0\0\0\0" "\x2C\0\0\0\0\0\0\0\0\x80";
+    //NOTE: we need to check the frame number because if we reach into the buffer prior to the first frame,
+    //      we'll just clobber the file header instead, which is a bug
+    if (hasTransparentPixels && handle->framesSubmitted > 0) {
+        handle->listTail->data[3] = 0x09; //set the previous frame's disposal to background, so transparency is possible
+    }
    memcpy(&headerBytes[4], &centiSeconds, 2);
    memcpy(&headerBytes[13], &width, 2);
    memcpy(&headerBytes[15], &height, 2);
@ -397,12 +480,10 @@ static uint8_t * msf_compress_frame(void * allocContext, int width, int height,
    msf_lzw_reset(&lzw, tableSize, tableIdx);
    msf_put_code(&writeHead, &blockBits, msf_bit_log(lzw.len - 1), tableSize);

-    int lastCode = hasSamePal && frame.pixels[0] == previous.pixels[0]? 0 : tlb[frame.pixels[0]];
+    int lastCode = framesCompatible && frame.pixels[0] == previous.pixels[0]? 0 : tlb[frame.pixels[0]];
    MsfTimeLoop("compress") for (int i = 1; i < width * height; ++i) {
        //PERF: branching vs. branchless version of this line is observed to have no discernable impact on speed
-        int color = hasSamePal && frame.pixels[i] == previous.pixels[i]? 0 : tlb[frame.pixels[i]];
-        //PERF: branchless version must use && otherwise it will segfault on frame 1, but it's well-predicted so OK
-        // int color = (!(hasSamePal && frame.pixels[i] == previous.pixels[i])) * tlb[frame.pixels[i]];
+        int color = framesCompatible && frame.pixels[i] == previous.pixels[i]? 0 : tlb[frame.pixels[i]];
        int code = (&lzw.data[lastCode * lzw.stride])[color];
        if (code < 0) {
            //write to code stream
@ -424,9 +505,6 @@ static uint8_t * msf_compress_frame(void * allocContext, int width, int height,
        }
    }

-    MSF_GIF_FREE(allocContext, lzw.data, lzwAllocSize);
-    MSF_GIF_FREE(allocContext, previous.pixels, width * height * sizeof(uint32_t));
-
    //write code for leftover index buffer contents, then the end code
    msf_put_code(&writeHead, &blockBits, msf_imin(12, msf_bit_log(lzw.len - 1)), lastCode);
    msf_put_code(&writeHead, &blockBits, msf_imin(12, msf_bit_log(lzw.len)), tableSize + 1);
@ -439,38 +517,72 @@ static uint8_t * msf_compress_frame(void * allocContext, int width, int height,
    }
    *writeHead++ = 0; //terminating block

-    //filling in buffer header and shrink buffer to fit data
-    MsfBufferHeader * header = (MsfBufferHeader *) allocation;
-    header->next = NULL;
-    header->size = writeHead - writeBase;
-    uint8_t * moved = (uint8_t *) MSF_GIF_REALLOC(allocContext, allocation, maxBufSize, writeHead - allocation);
-    if (!moved) { MSF_GIF_FREE(allocContext, allocation, maxBufSize); return NULL; }
+    //fill in buffer header and shrink buffer to fit data
+    buffer->next = NULL;
+    buffer->size = writeHead - buffer->data;
+    MsfGifBuffer * moved =
+        (MsfGifBuffer *) MSF_GIF_REALLOC(allocContext, buffer, maxBufSize, offsetof(MsfGifBuffer, data) + buffer->size);
+    if (!moved) { MSF_GIF_FREE(allocContext, buffer, maxBufSize); return NULL; }
    return moved;
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/// Incremental API                                                                                                  ///
+/// To-memory API                                                                                                    ///
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

+static const int lzwAllocSize = 4096 * 256 * sizeof(int16_t);
+
+//NOTE: by C standard library conventions, freeing NULL should be a no-op,
+//      but just in case the user's custom free doesn't follow that rule, we do null checks on our end as well.
+static void msf_free_gif_state(MsfGifState * handle) {
+    if (handle->previousFrame.pixels) MSF_GIF_FREE(handle->customAllocatorContext, handle->previousFrame.pixels,
+                                                   handle->width * handle->height * sizeof(uint32_t));
+    if (handle->currentFrame.pixels)  MSF_GIF_FREE(handle->customAllocatorContext, handle->currentFrame.pixels,
+                                                   handle->width * handle->height * sizeof(uint32_t));
+    if (handle->lzwMem) MSF_GIF_FREE(handle->customAllocatorContext, handle->lzwMem, lzwAllocSize);
+    for (MsfGifBuffer * node = handle->listHead; node;) {
+        MsfGifBuffer * next = node->next; //NOTE: we have to copy the `next` pointer BEFORE freeing the node holding it
+        MSF_GIF_FREE(handle->customAllocatorContext, node, offsetof(MsfGifBuffer, data) + node->size);
+        node = next;
+    }
+    handle->listHead = NULL; //this implicitly marks the handle as invalid until the next msf_gif_begin() call
+}
+
 int msf_gif_begin(MsfGifState * handle, int width, int height) { MsfTimeFunc
+    //NOTE: we cannot stomp the entire struct to zero because we must preserve `customAllocatorContext`.
    MsfCookedFrame empty = {0}; //god I hate MSVC...
    handle->previousFrame = empty;
+    handle->currentFrame = empty;
    handle->width = width;
    handle->height = height;
+    handle->framesSubmitted = 0;
+
+    //allocate memory for LZW buffer
+    //NOTE: Unfortunately we can't just use stack memory for the LZW table because it's 2MB,
+    //      which is more stack space than most operating systems give by default,
+    //      and we can't realistically expect users to be willing to override that just to use our library,
+    //      so we have to allocate this on the heap.
+    handle->lzwMem = (int16_t *) MSF_GIF_MALLOC(handle->customAllocatorContext, lzwAllocSize);
+    handle->previousFrame.pixels =
+        (uint32_t *) MSF_GIF_MALLOC(handle->customAllocatorContext, handle->width * handle->height * sizeof(uint32_t));
+    handle->currentFrame.pixels =
+        (uint32_t *) MSF_GIF_MALLOC(handle->customAllocatorContext, handle->width * handle->height * sizeof(uint32_t));

    //setup header buffer header (lol)
-    handle->listHead = (uint8_t *) MSF_GIF_MALLOC(handle->customAllocatorContext, sizeof(MsfBufferHeader) + 32);
-    if (!handle->listHead) { return 0; }
+    handle->listHead = (MsfGifBuffer *) MSF_GIF_MALLOC(handle->customAllocatorContext, offsetof(MsfGifBuffer, data) + 32);
+    if (!handle->listHead || !handle->lzwMem || !handle->previousFrame.pixels || !handle->currentFrame.pixels) {
+        msf_free_gif_state(handle);
+        return 0;
+    }
    handle->listTail = handle->listHead;
-    MsfBufferHeader * header = (MsfBufferHeader *) handle->listHead;
-    header->next = NULL;
-    header->size = 32;
+    handle->listHead->next = NULL;
+    handle->listHead->size = 32;

    //NOTE: because __attribute__((__packed__)) is annoyingly compiler-specific, we do this unreadable weirdness
-    char headerBytes[33] = "GIF89a\0\0\0\0\x10\0\0" "\x21\xFF\x0BNETSCAPE2.0\x03\x01\0\0\0";
+    char headerBytes[33] = "GIF89a\0\0\0\0\x70\0\0" "\x21\xFF\x0BNETSCAPE2.0\x03\x01\0\0\0";
    memcpy(&headerBytes[6], &width, 2);
    memcpy(&headerBytes[8], &height, 2);
-    memcpy(handle->listHead + sizeof(MsfBufferHeader), headerBytes, 32);
+    memcpy(handle->listHead->data, headerBytes, 32);
    return 1;
 }

@ -482,86 +594,83 @@ int msf_gif_frame(MsfGifState * handle, uint8_t * pixelData, int centiSecondsPer
    if (pitchInBytes == 0) pitchInBytes = handle->width * 4;
    if (pitchInBytes < 0) pixelData -= pitchInBytes * (handle->height - 1);

-    uint8_t used[1 << 16]; //only 64k, so stack allocating is fine
-    MsfCookedFrame frame =
-        msf_cook_frame(handle->customAllocatorContext, pixelData, used, handle->width, handle->height, pitchInBytes,
-            msf_imin(maxBitDepth, handle->previousFrame.depth + 160 / msf_imax(1, handle->previousFrame.count)));
-    //TODO: de-duplicate cleanup code
-    if (!frame.pixels) {
-        MSF_GIF_FREE(handle->customAllocatorContext,
-                     handle->previousFrame.pixels, handle->width * handle->height * sizeof(uint32_t));
-        for (uint8_t * node = handle->listHead; node;) {
-            MsfBufferHeader * header = (MsfBufferHeader *) node;
-            node = header->next;
-            MSF_GIF_FREE(handle->customAllocatorContext, header, sizeof(MsfBufferHeader) + header->size);
-        }
-        handle->listHead = handle->listTail = NULL;
-        return 0;
-    }
+    uint8_t used[(1 << 16) + 1]; //only 64k, so stack allocating is fine
+    msf_cook_frame(&handle->currentFrame, pixelData, used, handle->width, handle->height, pitchInBytes,
+        msf_imin(maxBitDepth, handle->previousFrame.depth + 160 / msf_imax(1, handle->previousFrame.count)));

-    uint8_t * buffer = msf_compress_frame(handle->customAllocatorContext,
-        handle->width, handle->height, centiSecondsPerFame, frame, handle->previousFrame, used);
-    ((MsfBufferHeader *) handle->listTail)->next = buffer;
+    MsfGifBuffer * buffer = msf_compress_frame(handle->customAllocatorContext, handle->width, handle->height,
+        centiSecondsPerFame, handle->currentFrame, handle, used, handle->lzwMem);
+    if (!buffer) { msf_free_gif_state(handle); return 0; }
+    handle->listTail->next = buffer;
    handle->listTail = buffer;
-    if (!buffer) {
-        MSF_GIF_FREE(handle->customAllocatorContext, frame.pixels, handle->width * handle->height * sizeof(uint32_t));
-        MSF_GIF_FREE(handle->customAllocatorContext,
-                     handle->previousFrame.pixels, handle->width * handle->height * sizeof(uint32_t));
-        for (uint8_t * node = handle->listHead; node;) {
-            MsfBufferHeader * header = (MsfBufferHeader *) node;
-            node = header->next;
-            MSF_GIF_FREE(handle->customAllocatorContext, header, sizeof(MsfBufferHeader) + header->size);
-        }
-        handle->listHead = handle->listTail = NULL;
-        return 0;
-    }

-    handle->previousFrame = frame;
+    //swap current and previous frames
+    MsfCookedFrame tmp = handle->previousFrame;
+    handle->previousFrame = handle->currentFrame;
+    handle->currentFrame = tmp;
+
+    handle->framesSubmitted += 1;
    return 1;
 }

 MsfGifResult msf_gif_end(MsfGifState * handle) { MsfTimeFunc
    if (!handle->listHead) { MsfGifResult empty = {0}; return empty; }

-    MSF_GIF_FREE(handle->customAllocatorContext,
-                 handle->previousFrame.pixels, handle->width * handle->height * sizeof(uint32_t));
-
    //first pass: determine total size
    size_t total = 1; //1 byte for trailing marker
-    for (uint8_t * node = handle->listHead; node;) {
-        MsfBufferHeader * header = (MsfBufferHeader *) node;
-        node = header->next;
-        total += header->size;
-    }
+    for (MsfGifBuffer * node = handle->listHead; node; node = node->next) { total += node->size; }

    //second pass: write data
    uint8_t * buffer = (uint8_t *) MSF_GIF_MALLOC(handle->customAllocatorContext, total);
    if (buffer) {
        uint8_t * writeHead = buffer;
-        for (uint8_t * node = handle->listHead; node;) {
-            MsfBufferHeader * header = (MsfBufferHeader *) node;
-            memcpy(writeHead, node + sizeof(MsfBufferHeader), header->size);
-            writeHead += header->size;
-            node = header->next;
+        for (MsfGifBuffer * node = handle->listHead; node; node = node->next) {
+            memcpy(writeHead, node->data, node->size);
+            writeHead += node->size;
        }
        *writeHead++ = 0x3B;
    }

    //third pass: free buffers
-    for (uint8_t * node = handle->listHead; node;) {
-        MsfBufferHeader * header = (MsfBufferHeader *) node;
-        node = header->next;
-        MSF_GIF_FREE(handle->customAllocatorContext, header, sizeof(MsfBufferHeader) + header->size);
-    }
+    msf_free_gif_state(handle);

    MsfGifResult ret = { buffer, total, total, handle->customAllocatorContext };
    return ret;
 }

-void msf_gif_free(MsfGifResult result) {
+void msf_gif_free(MsfGifResult result) { MsfTimeFunc
    if (result.data) { MSF_GIF_FREE(result.contextPointer, result.data, result.allocSize); }
 }

+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/// To-file API                                                                                                      ///
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+int msf_gif_begin_to_file(MsfGifState * handle, int width, int height, MsfGifFileWriteFunc func, void * filePointer) {
+    handle->fileWriteFunc = func;
+    handle->fileWriteData = filePointer;
+    return msf_gif_begin(handle, width, height);
+}
+
+int msf_gif_frame_to_file(MsfGifState * handle, uint8_t * pixelData, int centiSecondsPerFame, int maxBitDepth, int pitchInBytes) {
+    if (!msf_gif_frame(handle, pixelData, centiSecondsPerFame, maxBitDepth, pitchInBytes)) { return 0; }
+
+    //NOTE: this is a somewhat hacky implementation which is not perfectly efficient, but it's good enough for now
+    MsfGifBuffer * head = handle->listHead;
+    if (!handle->fileWriteFunc(head->data, head->size, 1, handle->fileWriteData)) { msf_free_gif_state(handle); return 0; }
+    handle->listHead = head->next;
+    MSF_GIF_FREE(handle->customAllocatorContext, head, offsetof(MsfGifBuffer, data) + head->size);
+    return 1;
+}
+
+int msf_gif_end_to_file(MsfGifState * handle) {
+    //NOTE: this is a somewhat hacky implementation which is not perfectly efficient, but it's good enough for now
+    MsfGifResult result = msf_gif_end(handle);
+    int ret = (int) handle->fileWriteFunc(result.data, result.dataSize, 1, handle->fileWriteData);
+    msf_gif_free(result);
+    return ret;
+}
+
 #endif //MSF_GIF_ALREADY_IMPLEMENTED_IN_THIS_TRANSLATION_UNIT
 #endif //MSF_GIF_IMPL

@ -570,7 +679,7 @@ void msf_gif_free(MsfGifResult result) {
 This software is available under 2 licenses -- choose whichever you prefer.
 ------------------------------------------------------------------------------
 ALTERNATIVE A - MIT License
-Copyright (c) 2020 Miles Fogle
+Copyright (c) 2021 Miles Fogle
 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
--- a/src/external/sdefl.h
+++ b/src/external/sdefl.h
@ -1,5 +1,4 @@
-/*
-# Small Deflate
+/*# Small Deflate
 `sdefl` is a small bare bone lossless compression library in ANSI C (ISO C90)
 which implements the Deflate (RFC 1951) compressed data format specification standard.
 It is mainly tuned to get as much speed and compression ratio from as little code
@ -33,16 +32,16 @@ this file implementation in *one* C or C++ file to prevent collisions.

 | Compressor name         | Compression| Decompress.| Compr. size | Ratio |
 | ------------------------| -----------| -----------| ----------- | ----- |
-| sdefl 1.0 -0            |   127 MB/s |   233 MB/s |    40004116 | 39.88 |
-| sdefl 1.0 -1            |   111 MB/s |   259 MB/s |    38940674 | 38.82 |
-| sdefl 1.0 -5            |    45 MB/s |   275 MB/s |    36577183 | 36.46 |
-| sdefl 1.0 -7            |    38 MB/s |   276 MB/s |    36523781 | 36.41 |
-| zlib 1.2.11 -1          |    72 MB/s |   307 MB/s |    42298774 | 42.30 |
-| zlib 1.2.11 -6          |    24 MB/s |   313 MB/s |    36548921 | 36.55 |
-| zlib 1.2.11 -9          |    20 MB/s |   314 MB/s |    36475792 | 36.48 |
 | miniz 1.0 -1            |   122 MB/s |   208 MB/s |    48510028 | 48.51 |
 | miniz 1.0 -6            |    27 MB/s |   260 MB/s |    36513697 | 36.51 |
 | miniz 1.0 -9            |    23 MB/s |   261 MB/s |    36460101 | 36.46 |
+| zlib 1.2.11 -1          |    72 MB/s |   307 MB/s |    42298774 | 42.30 |
+| zlib 1.2.11 -6          |    24 MB/s |   313 MB/s |    36548921 | 36.55 |
+| zlib 1.2.11 -9          |    20 MB/s |   314 MB/s |    36475792 | 36.48 |
+| sdefl 1.0 -0            |   127 MB/s |   371 MB/s |    40004116 | 39.88 |
+| sdefl 1.0 -1            |   111 MB/s |   398 MB/s |    38940674 | 38.82 |
+| sdefl 1.0 -5            |    45 MB/s |   420 MB/s |    36577183 | 36.46 |
+| sdefl 1.0 -7            |    38 MB/s |   423 MB/s |    36523781 | 36.41 |
 | libdeflate 1.3 -1       |   147 MB/s |   667 MB/s |    39597378 | 39.60 |
 | libdeflate 1.3 -6       |    69 MB/s |   689 MB/s |    36648318 | 36.65 |
 | libdeflate 1.3 -9       |    13 MB/s |   672 MB/s |    35197141 | 35.20 |
@ -398,8 +397,8 @@ sdefl_precode(struct sdefl_symcnt *cnt, unsigned *freqs, unsigned *items,
    if (offlen[cnt->off - 1]) break;

  total = (unsigned)(cnt->lit + cnt->off);
-  memcpy(lens, litlen, sizeof(unsigned char) * cnt->lit);
-  memcpy(lens + cnt->lit, offlen, sizeof(unsigned char) * cnt->off);
+  memcpy(lens, litlen, sizeof(unsigned char) * (size_t)cnt->lit);
+  memcpy(lens + cnt->lit, offlen, sizeof(unsigned char) * (size_t)cnt->off);
  do {
    unsigned len = lens[run_start];
    unsigned run_end = run_start;
--- a/src/external/sinfl.h
+++ b/src/external/sinfl.h
@ -33,16 +33,16 @@ this file implementation in *one* C or C++ file to prevent collisions.

 | Compressor name         | Compression| Decompress.| Compr. size | Ratio |
 | ------------------------| -----------| -----------| ----------- | ----- |
-| sdefl 1.0 -0            |   127 MB/s |   233 MB/s |    40004116 | 39.88 |
-| sdefl 1.0 -1            |   111 MB/s |   259 MB/s |    38940674 | 38.82 |
-| sdefl 1.0 -5            |    45 MB/s |   275 MB/s |    36577183 | 36.46 |
-| sdefl 1.0 -7            |    38 MB/s |   276 MB/s |    36523781 | 36.41 |
-| zlib 1.2.11 -1          |    72 MB/s |   307 MB/s |    42298774 | 42.30 |
-| zlib 1.2.11 -6          |    24 MB/s |   313 MB/s |    36548921 | 36.55 |
-| zlib 1.2.11 -9          |    20 MB/s |   314 MB/s |    36475792 | 36.48 |
 | miniz 1.0 -1            |   122 MB/s |   208 MB/s |    48510028 | 48.51 |
 | miniz 1.0 -6            |    27 MB/s |   260 MB/s |    36513697 | 36.51 |
 | miniz 1.0 -9            |    23 MB/s |   261 MB/s |    36460101 | 36.46 |
+| zlib 1.2.11 -1          |    72 MB/s |   307 MB/s |    42298774 | 42.30 |
+| zlib 1.2.11 -6          |    24 MB/s |   313 MB/s |    36548921 | 36.55 |
+| zlib 1.2.11 -9          |    20 MB/s |   314 MB/s |    36475792 | 36.48 |
+| sdefl 1.0 -0            |   127 MB/s |   371 MB/s |    40004116 | 39.88 |
+| sdefl 1.0 -1            |   111 MB/s |   398 MB/s |    38940674 | 38.82 |
+| sdefl 1.0 -5            |    45 MB/s |   420 MB/s |    36577183 | 36.46 |
+| sdefl 1.0 -7            |    38 MB/s |   423 MB/s |    36523781 | 36.41 |
 | libdeflate 1.3 -1       |   147 MB/s |   667 MB/s |    39597378 | 39.60 |
 | libdeflate 1.3 -6       |    69 MB/s |   689 MB/s |    36648318 | 36.65 |
 | libdeflate 1.3 -9       |    13 MB/s |   672 MB/s |    35197141 | 35.20 |
@ -51,7 +51,7 @@ this file implementation in *one* C or C++ file to prevent collisions.
 ### Compression
 Results on the [Silesia compression corpus](http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia):

-| File    |   Original | `sdefl 0`  	| `sdefl 5` 	| `sdefl 7` |
+| File    |   Original | `sdefl 0`    | `sdefl 5`   | `sdefl 7` |
 | :------ | ---------: | -----------------: | ---------: | ----------: |
 | dickens | 10.192.446 |  4,260,187|  3,845,261|   3,833,657 |
 | mozilla | 51.220.480 | 20,774,706 | 19,607,009 |  19,565,867 |
@ -121,12 +121,15 @@ extern "C" {
 #define SINFL_OFF_TBL_SIZE 402

 struct sinfl {
-  int bits, bitcnt;
+  const unsigned char *bitptr;
+  unsigned long long bitbuf;
+  int bitcnt;
+
  unsigned lits[SINFL_LIT_TBL_SIZE];
  unsigned dsts[SINFL_OFF_TBL_SIZE];
 };
-extern int sinflate(void *out, const void *in, int size);
-extern int zsinflate(void *out, const void *in, int size);
+extern int sinflate(void *out, int cap, const void *in, int size);
+extern int zsinflate(void *out, int cap, const void *in, int size);

 #ifdef __cplusplus
 }
@ -137,6 +140,33 @@ extern int zsinflate(void *out, const void *in, int size);
 #ifdef SINFL_IMPLEMENTATION

 #include <string.h> /* memcpy, memset */
+#include <assert.h> /* assert */
+
+#if defined(__GNUC__) || defined(__clang__)
+#define sinfl_likely(x)       __builtin_expect((x),1)
+#define sinfl_unlikely(x)     __builtin_expect((x),0)
+#else
+#define sinfl_likely(x)       (x)
+#define sinfl_unlikely(x)     (x)
+#endif
+
+#ifndef SINFL_NO_SIMD
+#if __x86_64__ || defined(_WIN32) || defined(_WIN64)
+  #include <emmintrin.h>
+  #define sinfl_char16 __m128i
+  #define sinfl_char16_ld(p) _mm_loadu_si128((const __m128i *)(void*)(p))
+  #define sinfl_char16_str(d,v)  _mm_storeu_si128((__m128i*)(void*)(d), v)
+  #define sinfl_char16_char(c) _mm_set1_epi8(c)
+#elif defined(__arm__) || defined(__aarch64__)
+  #include <arm_neon.h>
+  #define sinfl_char16 uint8x16_t
+  #define sinfl_char16_ld(p) vld1q_u8((const unsigned char*)(p))
+  #define sinfl_char16_str(d,v) vst1q_u8((unsigned char*)(d), v)
+  #define sinfl_char16_char(c) vdupq_n_u8(c)
+#else
+  #define SINFL_NO_SIMD
+#endif
+#endif

 static int
 sinfl_bsr(unsigned n) {
@ -147,20 +177,66 @@ sinfl_bsr(unsigned n) {
  return 31 - __builtin_clz(n);
 #endif
 }
+/* Load eight bytes starting at p into an integer. memcpy is used so the
+   address does not have to be aligned; the bytes keep host order. */
+static unsigned long long
+sinfl_read64(const void *p) {
+  unsigned long long word;
+  memcpy(&word, p, 8);
+  return word;
+}
+#ifndef SINFL_NO_SIMD
+/* Store the 16-byte vector w at dst (unaligned store, see sinfl_char16_str)
+   and return the position just past the bytes written. */
+static unsigned char*
+sinfl_write128(unsigned char *dst, sinfl_char16 w) {
+  sinfl_char16_str(dst, w);
+  /* advance by the full 16 bytes stored; stepping by 8 made every store
+     overlap half of the previous one and doubled the number of stores */
+  return dst + 16;
+}
+/* Copy one 16-byte chunk from *src to *dst using unaligned vector
+   load/store, then move both cursors forward by 16. */
+static void
+sinfl_copy128(unsigned char **dst, unsigned char **src) {
+  sinfl_char16 chunk = sinfl_char16_ld(*src);
+  sinfl_char16_str(*dst, chunk);
+  *dst += 16;
+  *src += 16;
+}
+#else
+/* Write the eight bytes of w to dst (any alignment) and return the
+   position just past them. */
+static unsigned char*
+sinfl_write64(unsigned char *dst, unsigned long long w) {
+  unsigned char *next = dst + 8;
+  memcpy(dst, &w, 8);
+  return next;
+}
+/* Copy one 8-byte chunk from *src to *dst through a temporary, then move
+   both cursors forward by 8. The source chunk is read completely before
+   anything is written. */
+static void
+sinfl_copy64(unsigned char **dst, unsigned char **src) {
+  unsigned char chunk[8];
+  memcpy(chunk, *src, 8);
+  memcpy(*dst, chunk, 8);
+  *dst += 8;
+  *src += 8;
+}
+#endif
+/* Top up the bit buffer from the input cursor.
+   Eight bytes are read at s->bitptr and OR-ed in above the bits already
+   held; the cursor then advances by the number of whole bytes that fit
+   into the free space, and the bit count is raised accordingly.
+   NOTE(review): the 8-byte read is unconditional - nothing here checks how
+   much input remains behind bitptr; confirm callers provide readable slack. */
+static void
+sinfl_refill(struct sinfl *s) {
+  const int have = s->bitcnt;
+  s->bitbuf |= sinfl_read64(s->bitptr) << have;
+  s->bitptr += (63 - have) >> 3;
+  s->bitcnt = have | 56; /* bitcount is in range [56,63] */
+}
 static int
-sinfl_get(const unsigned char **src, const unsigned char *end, struct sinfl *s,
-          int n) {
-  const unsigned char *in = *src;
-  int v = s->bits & ((1 << n)-1);
-  s->bits >>= n;
-  s->bitcnt = s->bitcnt - n;
-  s->bitcnt = s->bitcnt < 0 ? 0 : s->bitcnt;
-  while (s->bitcnt < 16 && in < end) {
-    s->bits |= (*in++) << s->bitcnt;
-    s->bitcnt += 8;
-  }
-  *src = in;
-  return v;
+sinfl_peek(struct sinfl *s, int cnt) {
+  assert(cnt >= 0 && cnt <= 56);
+  assert(cnt <= s->bitcnt);
+  return s->bitbuf & ((1ull << cnt) - 1);
+}
+static void
+sinfl_consume(struct sinfl *s, int cnt) {
+  assert(cnt <= s->bitcnt);
+  s->bitbuf >>= cnt;
+  s->bitcnt -= cnt;
+}
+static int
+sinfl__get(struct sinfl *s, int cnt) {
+  int res = sinfl_peek(s, cnt);
+  sinfl_consume(s, cnt);
+  return res;
+}
+static int
+sinfl_get(struct sinfl *s, int cnt) {
+  sinfl_refill(s);
+  return sinfl__get(s, cnt);
 }
 struct sinfl_gen {
  int len;
@ -276,22 +352,22 @@ sinfl_build(unsigned *tbl, unsigned char *lens, int tbl_bits, int maxlen,
  }
 }
 static int
-sinfl_decode(const unsigned char **in, const unsigned char *end,
-             struct sinfl *s, const unsigned *tbl, int bit_len) {
-  int idx = s->bits & ((1 << bit_len) - 1);
+sinfl_decode(struct sinfl *s, const unsigned *tbl, int bit_len) {
+  sinfl_refill(s);
+  {int idx = sinfl_peek(s, bit_len);
  unsigned key = tbl[idx];
  if (key & 0x10) {
    /* sub-table lookup */
    int len = key & 0x0f;
-    sinfl_get(in, end, s, bit_len);
-    idx = s->bits & ((1 << len)-1);
+    sinfl_consume(s, bit_len);
+    idx = sinfl_peek(s, len);
    key = tbl[((key >> 16) & 0xffff) + (unsigned)idx];
  }
-  sinfl_get(in, end, s, key & 0x0f);
-  return (key >> 16) & 0x0fff;
+  sinfl_consume(s, key & 0x0f);
+  return (key >> 16) & 0x0fff;}
 }
 static int
-sinfl_decompress(unsigned char *out, const unsigned char *in, int size) {
+sinfl_decompress(unsigned char *out, int cap, const unsigned char *in, int size) {
  static const unsigned char order[] = {16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15};
  static const short dbase[30+2] = {1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
      257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577};
@ -302,19 +378,22 @@ sinfl_decompress(unsigned char *out, const unsigned char *in, int size) {
  static const unsigned char lbits[29+2] = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,
      4,4,4,5,5,5,5,0,0,0};

+  const unsigned char *oe = out + cap;
  const unsigned char *e = in + size, *o = out;
  enum sinfl_states {hdr,stored,fixed,dyn,blk};
  enum sinfl_states state = hdr;
  struct sinfl s = {0};
  int last = 0;

-  sinfl_get(&in,e,&s,0); /* buffer input */
-  while (in < e || s.bitcnt) {
+  s.bitptr = in;
+  while (1) {
    switch (state) {
    case hdr: {
-      int type = 0; /* block header */
-      last = sinfl_get(&in,e,&s,1);
-      type = sinfl_get(&in,e,&s,2);
+      /* block header */
+      int type = 0;
+      sinfl_refill(&s);
+      last = sinfl__get(&s,1);
+      type = sinfl__get(&s,2);

      switch (type) {default: return (int)(out-o);
      case 0x00: state = stored; break;
@ -322,10 +401,12 @@ sinfl_decompress(unsigned char *out, const unsigned char *in, int size) {
      case 0x02: state = dyn; break;}
    } break;
    case stored: {
-      int len; /* uncompressed block */
-      sinfl_get(&in,e,&s,s.bitcnt & 7);
-      len = sinfl_get(&in,e,&s,16);
-      //int nlen = sinfl_get(&in,e,&s,16);
+      /* uncompressed block */
+      int len;
+      sinfl_refill(&s);
+      sinfl__get(&s,s.bitcnt & 7);
+      len = sinfl__get(&s,16);
+      //int nlen = sinfl__get(&s,16);   // @raysan5: Unused variable?
      in -= 2; s.bitcnt = 0;

      if (len > (e-in) || !len)
@ -353,72 +434,111 @@ sinfl_decompress(unsigned char *out, const unsigned char *in, int size) {
        int n, i;
        unsigned hlens[SINFL_PRE_TBL_SIZE];
        unsigned char nlens[19] = {0}, lens[288+32];
-        int nlit = 257 + sinfl_get(&in,e,&s,5);
-        int ndist = 1 + sinfl_get(&in,e,&s,5);
-        int nlen = 4 + sinfl_get(&in,e,&s,4);
+
+        sinfl_refill(&s);
+        {int nlit = 257 + sinfl__get(&s,5);
+        int ndist = 1 + sinfl__get(&s,5);
+        int nlen = 4 + sinfl__get(&s,4);
        for (n = 0; n < nlen; n++)
-          nlens[order[n]] = (unsigned char)sinfl_get(&in,e,&s,3);
+          nlens[order[n]] = (unsigned char)sinfl_get(&s,3);
        sinfl_build(hlens, nlens, 7, 7, 19);

        /* decode code lengths */
        for (n = 0; n < nlit + ndist;) {
-          int sym = sinfl_decode(&in, e, &s, hlens, 7);
+          int sym = sinfl_decode(&s, hlens, 7);
          switch (sym) {default: lens[n++] = (unsigned char)sym; break;
-          case 16: for (i=3+sinfl_get(&in,e,&s,2);i;i--,n++) lens[n]=lens[n-1]; break;
-          case 17: for (i=3+sinfl_get(&in,e,&s,3);i;i--,n++) lens[n]=0; break;
-          case 18: for (i=11+sinfl_get(&in,e,&s,7);i;i--,n++) lens[n]=0; break;}
+          case 16: for (i=3+sinfl_get(&s,2);i;i--,n++) lens[n]=lens[n-1]; break;
+          case 17: for (i=3+sinfl_get(&s,3);i;i--,n++) lens[n]=0; break;
+          case 18: for (i=11+sinfl_get(&s,7);i;i--,n++) lens[n]=0; break;}
        }
        /* build lit/dist tables */
        sinfl_build(s.lits, lens, 10, 15, nlit);
        sinfl_build(s.dsts, lens + nlit, 8, 15, ndist);
-        state = blk;
+        state = blk;}
    } break;
    case blk: {
      /* decompress block */
-      int i, sym = sinfl_decode(&in, e, &s, s.lits, 10);
-      if (sym > 256) {sym -= 257; /* match symbol */
-        {int len = sinfl_get(&in, e, &s, lbits[sym]) + lbase[sym];
-        int dsym = sinfl_decode(&in, e, &s, s.dsts, 8);
-        int offs = sinfl_get(&in, e, &s, dbits[dsym]) + dbase[dsym];
-        if (offs > (int)(out-o)) {
+      int sym = sinfl_decode(&s, s.lits, 10);
+      if (sym < 256) {
+        /* literal */
+        *out++ = (unsigned char)sym;
+      } else if (sym > 256) {sym -= 257; /* match symbol */
+        sinfl_refill(&s);
+        {int len = sinfl__get(&s, lbits[sym]) + lbase[sym];
+        int dsym = sinfl_decode(&s, s.dsts, 8);
+        int offs = sinfl__get(&s, dbits[dsym]) + dbase[dsym];
+        unsigned char *dst = out, *src = out - offs;
+        if (sinfl_unlikely(offs > (int)(out-o))) {
          return (int)(out-o);
-        } else if (offs == 1) {
-          /* rle match copying */
-          unsigned char c = *(out - offs);
-          unsigned long w = (c << 24) | (c << 16) | (c << 8) | c;
-          for (i = 0; i < len >> 2; ++i) {
-            memcpy(out, &w, 4);
-            out += 4;
-          }
-          len = len & 3;
-        } else if (offs >= 4) {
-          /* copy match */
-          int wcnt = len >> 2;
-          for (i = 0; i < wcnt; ++i) {
-            unsigned long w = 0;
-            memcpy(&w, out - offs, 4);
-            memcpy(out, &w, 4);
-            out += 4;
-          }
-          len = len & 3;
        }
-        for (i = 0; i < len; ++i)
-          {*out = *(out-offs), out++;}
+        out = out + len;
+
+#ifndef SINFL_NO_SIMD
+        if (sinfl_likely(oe - out >= 16 * 3)) {
+          if (offs >= 16) {
+            /* copy match */
+            sinfl_copy128(&dst, &src);
+            sinfl_copy128(&dst, &src);
+            do sinfl_copy128(&dst, &src);
+            while (dst < out);
+          } else if (offs == 1) {
+            /* rle match copying */
+            sinfl_char16 w = sinfl_char16_char(src[0]);
+            dst = sinfl_write128(dst, w);
+            dst = sinfl_write128(dst, w);
+            do dst = sinfl_write128(dst, w);
+            while (dst < out);
+          } else {
+            *dst++ = *src++;
+            *dst++ = *src++;
+            do *dst++ = *src++;
+            while (dst < out);
+          }
        }
-      } else if (sym == 256) {
+#else
+        if (sinfl_likely(oe - out >= 3 * 8 - 3)) {
+          if (offs >= 8) {
+            /* copy match */
+            sinfl_copy64(&dst, &src);
+            sinfl_copy64(&dst, &src);
+            do sinfl_copy64(&dst, &src);
+            while (dst < out);
+          } else if (offs == 1) {
+            /* rle match copying */
+            unsigned int c = src[0];
+            unsigned int hw = (c << 24u) | (c << 16u) | (c << 8u) | (unsigned)c;
+            unsigned long long w = (unsigned long long)hw << 32llu | hw;
+            dst = sinfl_write64(dst, w);
+            dst = sinfl_write64(dst, w);
+            do dst = sinfl_write64(dst, w);
+            while (dst < out);
+          } else {
+            *dst++ = *src++;
+            *dst++ = *src++;
+            do *dst++ = *src++;
+            while (dst < out);
+          }
+        }
+#endif
+        else {
+          *dst++ = *src++;
+          *dst++ = *src++;
+          do *dst++ = *src++;
+          while (dst < out);}
+        }
+      } else {
        /* end of block */
        if (last) return (int)(out-o);
        state = hdr;
        break;
-        /* literal */
-      } else *out++ = (unsigned char)sym;
+      }
    } break;}
  }
  return (int)(out-o);
 }
 extern int
-sinflate(void *out, const void *in, int size) {
-  return sinfl_decompress((unsigned char*)out, (const unsigned char*)in, size);
+sinflate(void *out, int cap, const void *in, int size) {
+  return sinfl_decompress((unsigned char*)out, cap, (const unsigned char*)in, size);
 }
 static unsigned
 sinfl_adler32(unsigned adler32, const unsigned char *in, int in_len) {
@ -448,11 +568,11 @@ sinfl_adler32(unsigned adler32, const unsigned char *in, int in_len) {
  } return (unsigned)(s2 << 16) + (unsigned)s1;
 }
 extern int
-zsinflate(void *out, const void *mem, int size) {
+zsinflate(void *out, int cap, const void *mem, int size) {
  const unsigned char *in = (const unsigned char*)mem;
  if (size >= 6) {
    const unsigned char *eob = in + size - 4;
-    int n = sinfl_decompress((unsigned char*)out, in + 2u, size);
+    int n = sinfl_decompress((unsigned char*)out, cap, in + 2u, size);
    unsigned a = sinfl_adler32(1u, (unsigned char*)out, n);
    unsigned h = eob[0] << 24 | eob[1] << 16 | eob[2] << 8 | eob[3] << 0;
    return a == h ? n : -1;
--- a/src/external/stb_image.h
+++ b/src/external/stb_image.h
@ -1,4 +1,4 @@
-/* stb_image - v2.26 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb
                                  no warranty implied; use at your own risk

   Do this:
@ -48,6 +48,7 @@ LICENSE

 RECENT REVISION HISTORY:

+      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
      2.26  (2020-07-13) many minor fixes
      2.25  (2020-02-02) fix warnings
      2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
@ -89,7 +90,7 @@ RECENT REVISION HISTORY:
                                           Jeremy Sawicki (handle all ImageNet JPGs)
 Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
-    Arseny Kapoulkine
+    Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
    John-Mark Allen
    Carmelo J Fdez-Aguera

@ -102,7 +103,7 @@ RECENT REVISION HISTORY:
    Thomas Ruf              Ronny Chevalier                         github:rlyeh
    Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
    Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
-                            Laurent Gomila     Cort Stratton        github:snagar
+    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
    Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
    Cass Everitt            Ryamond Barbiero                        github:grim210
    Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
@ -110,11 +111,13 @@ RECENT REVISION HISTORY:
    Josh Tobin                                 Matthew Gregan       github:poppolopoppo
    Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
    Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
-                            Brad Weinberger    Matvey Cherevko      [reserved]
+                            Brad Weinberger    Matvey Cherevko      github:mosra
    Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
    Ryan C. Gordon          [reserved]                              [reserved]
                     DO NOT ADD YOUR NAME HERE

+                     Jacko Dirks
+
  To add your name to the credits, pick a random blank space in the middle and fill it.
  80% of merge conflicts on stb PRs are due to people adding their name at the end
  of the credits.
@ -176,6 +179,32 @@ RECENT REVISION HISTORY:
 //
 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
 //
+// To query the width, height and component count of an image without having to
+// decode the full file, you can use the stbi_info family of functions:
+//
+//   int x,y,n,ok;
+//   ok = stbi_info(filename, &x, &y, &n);
+//   // returns ok=1 and sets x, y, n if image is a supported format,
+//   // 0 otherwise.
+//
+// Note that stb_image pervasively uses ints in its public API for sizes,
+// including sizes of memory buffers. This is now part of the API and thus
+// hard to change without causing breakage. As a result, the various image
+// loaders all have certain limits on image size; these differ somewhat
+// by format but generally boil down to either just under 2GB or just under
+// 1GB. When the decoded image would be larger than this, stb_image decoding
+// will fail.
+//
+// Additionally, stb_image will reject image files that have any of their
+// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
+// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
+// the only way to have an image with such dimensions load correctly
+// is for it to have a rather extreme aspect ratio. Either way, the
+// assumption here is that such larger images are likely to be malformed
+// or malicious. If you do need to load an image with individual dimensions
+// larger than that, and it still fits in the overall size limit, you can
+// #define STBI_MAX_DIMENSIONS on your own to be something larger.
+//
 // ===========================================================================
 //
 // UNICODE:
@ -281,11 +310,10 @@ RECENT REVISION HISTORY:
 //
 // iPhone PNG support:
 //
-// By default we convert iphone-formatted PNGs back to RGB, even though
-// they are internally encoded differently. You can disable this conversion
-// by calling stbi_convert_iphone_png_to_rgb(0), in which case
-// you will always just get the native iphone "format" through (which
-// is BGR stored in RGB).
+// We optionally support converting iPhone-formatted PNGs (which store
+// premultiplied BGRA) back to RGB, even though they're internally encoded
+// differently. To enable this conversion, call
+// stbi_convert_iphone_png_to_rgb(1).
 //
 // Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
 // pixel to remove any premultiplied alpha *only* if the image file explicitly
@ -489,6 +517,8 @@ STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
 // as above, but only applies to images loaded on the thread that calls the function
 // this function is only available if your compiler supports thread-local variables;
 // calling it will fail to link if your compiler doesn't
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
 STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);

 // ZLIB client - used by PNG, available for other purposes
@ -634,7 +664,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #ifdef STBI_HAS_LROTL
   #define stbi_lrot(x,y)  _lrotl(x,y)
 #else
-   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
 #endif

 #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
@ -748,9 +778,12 @@ static int stbi__sse2_available(void)

 #ifdef STBI_NEON
 #include <arm_neon.h>
-// assume GCC or Clang on ARM targets
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 #endif
+#endif

 #ifndef STBI_SIMD_ALIGN
 #define STBI_SIMD_ALIGN(type, name) type name
@ -924,6 +957,7 @@ static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 static int      stbi__pnm_test(stbi__context *s);
 static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__pnm_is16(stbi__context *s);
 #endif

 static
@ -998,7 +1032,7 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add)
 }

 // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 {
   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
@ -1021,7 +1055,7 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add)
   return stbi__malloc(a*b*c + add);
 }

-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
 {
   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
@ -1087,9 +1121,8 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re
   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
   ri->num_channels = 0;

-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
-   #endif
+   // test the formats with a very explicit header first (at least a FOURCC
+   // or distinctive magic number)
   #ifndef STBI_NO_PNG
   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
   #endif
@ -1107,6 +1140,13 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re
   #ifndef STBI_NO_PIC
   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
   #endif
+
+   // then the formats that can end up attempting to load with just 1 or 2
+   // bytes matching expectations; these are prone to false positives, so
+   // try them later
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
+   #endif
   #ifndef STBI_NO_PNM
   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
   #endif
@ -1262,12 +1302,12 @@ static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, in

 #ifndef STBI_NO_STDIO

-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
 STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
 #endif

-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
 {
 	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
@ -1277,16 +1317,16 @@ STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wch
 static FILE *stbi__fopen(char const *filename, char const *mode)
 {
   FILE *f;
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
   wchar_t wMode[64];
   wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)))
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
      return 0;

-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
      return 0;

-#if _MSC_VER >= 1400
+#if defined(_MSC_VER) && _MSC_VER >= 1400
 	if (0 != _wfopen_s(&f, wFilename, wMode))
 		f = 0;
 #else
@ -1662,7 +1702,8 @@ static int stbi__get16le(stbi__context *s)
 static stbi__uint32 stbi__get32le(stbi__context *s)
 {
   stbi__uint32 z = stbi__get16le(s);
-   return z + (stbi__get16le(s) << 16);
+   z += (stbi__uint32)stbi__get16le(s) << 16;
+   return z;
 }
 #endif

@ -2090,13 +2131,12 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
   int sgn;
   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);

-   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
   k = stbi_lrot(j->code_buffer, n);
-   if (n < 0 || n >= (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask))) return 0;
   j->code_buffer = k & ~stbi__bmask[n];
   k &= stbi__bmask[n];
   j->code_bits -= n;
-   return k + (stbi__jbias[n] & ~sgn);
+   return k + (stbi__jbias[n] & (sgn - 1));
 }

 // get some unsigned bits
@ -2146,7 +2186,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman

   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
   t = stbi__jpeg_huff_decode(j, hdc);
-   if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");

   // 0 all the ac values now so we can do it 32-bits at a time
   memset(data,0,64*sizeof(data[0]));
@ -2203,12 +2243,12 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
      // first scan for DC coefficient, must be first
      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
      t = stbi__jpeg_huff_decode(j, hdc);
-      if (t == -1) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
      diff = t ? stbi__extend_receive(j, t) : 0;

      dc = j->img_comp[b].dc_pred + diff;
      j->img_comp[b].dc_pred = dc;
-      data[0] = (short) (dc << j->succ_low);
+      data[0] = (short) (dc * (1 << j->succ_low));
   } else {
      // refinement scan for DC coefficient
      if (stbi__jpeg_get_bit(j))
@ -2245,7 +2285,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
            j->code_buffer <<= s;
            j->code_bits -= s;
            zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) ((r >> 8) << shift);
+            data[zig] = (short) ((r >> 8) * (1 << shift));
         } else {
            int rs = stbi__jpeg_huff_decode(j, hac);
            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
@ -2263,7 +2303,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
            } else {
               k += r;
               zig = stbi__jpeg_dezigzag[k++];
-               data[zig] = (short) (stbi__extend_receive(j,s) << shift);
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
            }
         }
      } while (k <= j->spec_end);
@ -3227,6 +3267,13 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
   }

+   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
+   // and I've never seen a non-corrupted JPEG file actually use them
+   for (i=0; i < s->img_n; ++i) {
+      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
+      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
+   }
+
   // compute interleaved mcu info
   z->img_h_max = h_max;
   z->img_v_max = v_max;
@ -3782,6 +3829,10 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
   else
      decode_n = z->s->img_n;

+   // nothing to do if no components requested; check this now to avoid
+   // accessing uninitialized coutput[0] later
+   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
+
   // resample and color-convert
   {
      int k;
@ -3924,6 +3975,7 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re
 {
   unsigned char* result;
   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__errpuc("outofmem", "Out of memory");
   STBI_NOTUSED(ri);
   j->s = s;
   stbi__setup_jpeg(j);
@ -3936,6 +3988,7 @@ static int stbi__jpeg_test(stbi__context *s)
 {
   int r;
   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__err("outofmem", "Out of memory");
   j->s = s;
   stbi__setup_jpeg(j);
   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@ -3960,6 +4013,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 {
   int result;
   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (!j) return stbi__err("outofmem", "Out of memory");
   j->s = s;
   result = stbi__jpeg_info_raw(j, x, y, comp);
   STBI_FREE(j);
@ -3979,6 +4033,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 // fast-way is faster to check than jpeg huffman, but slow way is slower
 #define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
 #define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
+#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet

 // zlib-style huffman encoding
 // (jpegs packs from left, zlib from right, so can't share code)
@ -3988,8 +4043,8 @@ typedef struct
   stbi__uint16 firstcode[16];
   int maxcode[17];
   stbi__uint16 firstsymbol[16];
-   stbi_uc  size[288];
-   stbi__uint16 value[288];
+   stbi_uc  size[STBI__ZNSYMS];
+   stbi__uint16 value[STBI__ZNSYMS];
 } stbi__zhuffman;

 stbi_inline static int stbi__bitreverse16(int n)
@ -4120,7 +4175,7 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
   if (s >= 16) return -1; // invalid code!
   // code size is s, so:
   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
-   if (b >= sizeof (z->size)) return -1; // some data was corrupt somewhere!
+   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
   if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
   a->code_buffer >>= s;
   a->num_bits -= s;
@ -4317,7 +4372,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
   return 1;
 }

-static const stbi_uc stbi__zdefault_length[288] =
+static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
 {
   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
@ -4363,7 +4418,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
      } else {
         if (type == 1) {
            // use fixed code lengths
-            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
         } else {
            if (!stbi__compute_huffman_codes(a)) return 0;
@ -4759,6 +4814,7 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3

   // de-interlacing
   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+   if (!final) return stbi__err("outofmem", "Out of memory");
   for (p=0; p < 7; ++p) {
      int xorig[] = { 0,4,0,2,0,1,0 };
      int yorig[] = { 0,0,4,0,2,0,1 };
@ -4879,19 +4935,46 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int
   return 1;
 }

-static int stbi__unpremultiply_on_load = 0;
-static int stbi__de_iphone_flag = 0;
+static int stbi__unpremultiply_on_load_global = 0;
+static int stbi__de_iphone_flag_global = 0;

 STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
 {
-   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; // global default; the stbi__unpremultiply_on_load macro below prefers a per-thread value when STBI_THREAD_LOCAL is defined and one was set
 }

 STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
 {
-   stbi__de_iphone_flag = flag_true_if_should_convert;
+   stbi__de_iphone_flag_global = flag_true_if_should_convert; // global default; the stbi__de_iphone_flag macro below prefers a per-thread value when STBI_THREAD_LOCAL is defined and one was set
 }

+#ifndef STBI_THREAD_LOCAL
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global   // no STBI_THREAD_LOCAL: both flags are read straight from the globals
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
+#else
+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;   // per-thread value, plus a marker for "this thread set it" (static storage, so both start at 0)
+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
+// Per-thread counterpart of stbi_set_unpremultiply_on_load(): stores the flag in this thread's STBI_THREAD_LOCAL copy.
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;   // from now on the macro below picks the _local value over the global
+}
+// Per-thread counterpart of stbi_convert_iphone_png_to_rgb(): stores the flag in this thread's STBI_THREAD_LOCAL copy.
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
+}
+// All readers of the two flags go through these macros: the _local value if its _set marker is nonzero, otherwise the global.
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
+                                       ? stbi__unpremultiply_on_load_local      \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
+                                ? stbi__de_iphone_flag_local                    \
+                                : stbi__de_iphone_flag_global)
+#endif // STBI_THREAD_LOCAL
+
 static void stbi__de_iphone(stbi__png *z)
 {
   stbi__context *s = z->s;
@ -5272,6 +5355,32 @@ typedef struct
   int extra_read;
 } stbi__bmp_data;

+static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) // fills info's channel masks from info->bpp; returns 1 if 'compress' was handled, 0 otherwise
+{
+   // BI_BITFIELDS (compress == 3) specifies masks explicitly, so info is left untouched
+   if (compress == 3)
+      return 1;
+   // compress == 0: no masks come from the file, so they are implied by the bit depth
+   if (compress == 0) {
+      if (info->bpp == 16) {
+         info->mr = 31u << 10; // 5-bit red in bits 10..14
+         info->mg = 31u <<  5; // 5-bit green in bits 5..9
+         info->mb = 31u <<  0; // 5-bit blue in bits 0..4 (ma is not written in this branch)
+      } else if (info->bpp == 32) {
+         info->mr = 0xffu << 16; // 8 bits per channel: red in bits 16..23
+         info->mg = 0xffu <<  8;
+         info->mb = 0xffu <<  0;
+         info->ma = 0xffu << 24; // alpha in the top byte
+         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+      } else {
+         // any other bit depth: clear all four masks (the all-0 default)
+         info->mr = info->mg = info->mb = info->ma = 0;
+      }
+      return 1;
+   }
+   return 0; // error: any other compress value; info is left unmodified
+}
+
 static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
 {
   int hsz;
@ -5299,6 +5408,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
   if (hsz != 12) {
      int compress = stbi__get32le(s);
      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
+      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
      stbi__get32le(s); // discard sizeof
      stbi__get32le(s); // discard hres
      stbi__get32le(s); // discard vres
@ -5313,17 +5424,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
         }
         if (info->bpp == 16 || info->bpp == 32) {
            if (compress == 0) {
-               if (info->bpp == 32) {
-                  info->mr = 0xffu << 16;
-                  info->mg = 0xffu <<  8;
-                  info->mb = 0xffu <<  0;
-                  info->ma = 0xffu << 24;
-                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
-               } else {
-                  info->mr = 31u << 10;
-                  info->mg = 31u <<  5;
-                  info->mb = 31u <<  0;
-               }
+               stbi__bmp_set_mask_defaults(info, compress);
            } else if (compress == 3) {
               info->mr = stbi__get32le(s);
               info->mg = stbi__get32le(s);
@ -5338,6 +5439,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
               return stbi__errpuc("bad BMP", "bad BMP");
         }
      } else {
+         // V4/V5 header
         int i;
         if (hsz != 108 && hsz != 124)
            return stbi__errpuc("bad BMP", "bad BMP");
@ -5345,6 +5447,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
         info->mg = stbi__get32le(s);
         info->mb = stbi__get32le(s);
         info->ma = stbi__get32le(s);
+         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
+            stbi__bmp_set_mask_defaults(info, compress);
         stbi__get32le(s); // discard color space
         for (i=0; i < 12; ++i)
            stbi__get32le(s); // discard color space parameters
@ -5394,8 +5498,7 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
         psize = (info.offset - info.extra_read - info.hsz) >> 2;
   }
   if (psize == 0) {
-      STBI_ASSERT(info.offset == s->callback_already_read + (int) (s->img_buffer - s->img_buffer_original));
-      if (info.offset != s->callback_already_read + (s->img_buffer - s->buffer_start)) {
+      if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) {
        return stbi__errpuc("bad offset", "Corrupt BMP");
      }
   }
@ -6342,6 +6445,7 @@ static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_c

   // intermediate buffer is RGBA
   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
   memset(result, 0xff, x*y*4);

   if (!stbi__pic_load_core(s,x,y,comp, result)) {
@ -6457,6 +6561,7 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 {
   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   if (!g) return stbi__err("outofmem", "Out of memory");
   if (!stbi__gif_header(s, g, comp, 1)) {
      STBI_FREE(g);
      stbi__rewind( s );
@ -6766,6 +6871,17 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
   }
 }

+static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) // out-of-memory bailout for stbi__load_gif_main: frees everything, reports the error, returns its result
+{
+   STBI_FREE(g->out);
+   STBI_FREE(g->history);
+   STBI_FREE(g->background);
+   // 'out' and '*delays' may not have been allocated yet when this is called, hence the guards
+   if (out) STBI_FREE(out);
+   if (delays && *delays) { STBI_FREE(*delays); *delays = 0; } // reset the caller's out-pointer so it does not dangle after the failure
+   return stbi__errpuc("outofmem", "Out of memory");
+}
+
 static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
 {
   if (stbi__gif_test(s)) {
@ -6777,6 +6893,10 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
      int stride;
      int out_size = 0;
      int delays_size = 0;
+
+      STBI_NOTUSED(out_size);
+      STBI_NOTUSED(delays_size);
+
      memset(&g, 0, sizeof(g));
      if (delays) {
         *delays = 0;
@ -6794,26 +6914,29 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,

            if (out) {
               void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
-               if (NULL == tmp) {
-                  STBI_FREE(g.out);
-                  STBI_FREE(g.history);
-                  STBI_FREE(g.background);
-                  return stbi__errpuc("outofmem", "Out of memory");
-               }
+               if (!tmp)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
               else {
                   out = (stbi_uc*) tmp;
                   out_size = layers * stride;
               }

               if (delays) {
-                  *delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!new_delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
+                  *delays = new_delays;
                  delays_size = layers * sizeof(int);
               }
            } else {
               out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (!out)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
               out_size = layers * stride;
               if (delays) {
                  *delays = (int*) stbi__malloc( layers * sizeof(int) );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
                  delays_size = layers * sizeof(int);
               }
            }
@ -7138,9 +7261,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)

   info.all_a = 255;
   p = stbi__bmp_parse_header(s, &info);
-   stbi__rewind( s );
-   if (p == NULL)
+   if (p == NULL) {
+      stbi__rewind( s );
      return 0;
+   }
   if (x) *x = s->img_x;
   if (y) *y = s->img_y;
   if (comp) {
@ -7206,8 +7330,8 @@ static int stbi__psd_is16(stbi__context *s)
       stbi__rewind( s );
       return 0;
   }
-   (void) stbi__get32be(s);
-   (void) stbi__get32be(s);
+   STBI_NOTUSED(stbi__get32be(s));
+   STBI_NOTUSED(stbi__get32be(s));
   depth = stbi__get16be(s);
   if (depth != 16) {
       stbi__rewind( s );
@ -7286,7 +7410,6 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 // Known limitations:
 //    Does not support comments in the header section
 //    Does not support ASCII image data (formats P2 and P3)
-//    Does not support 16-bit-per-channel

 #ifndef STBI_NO_PNM

@ -7307,7 +7430,8 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req
   stbi_uc *out;
   STBI_NOTUSED(ri);

-   if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
+   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+   if (ri->bits_per_channel == 0)
      return 0;

   if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
@ -7317,12 +7441,12 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req
   *y = s->img_y;
   if (comp) *comp = s->img_n;

-   if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
      return stbi__errpuc("too large", "PNM too large");

-   out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
+   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
   if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
+   stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8));

   if (req_comp && req_comp != s->img_n) {
      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
@ -7398,11 +7522,19 @@ static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
   stbi__pnm_skip_whitespace(s, &c);

   maxv = stbi__pnm_getinteger(s, &c);  // read max value
-
-   if (maxv > 255)
-      return stbi__err("max value > 255", "PPM image not 8-bit");
+   if (maxv > 65535)
+      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
+   else if (maxv > 255)
+      return 16;
   else
-      return 1;
+      return 8;
+}
+
+static int stbi__pnm_is16(stbi__context *s) // returns 1 if the PNM header in 's' reports 16 bits per channel, else 0
+{
+   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) // stbi__pnm_info returns 16 when the header's max value is above 255; NOTE(review): relies on it accepting NULL out-pointers - confirm
+	   return 1;
+   return 0; // 8-bit result, or stbi__pnm_info returned some other value
+}
 #endif

@ -7458,6 +7590,9 @@ static int stbi__is_16_main(stbi__context *s)
   if (stbi__psd_is16(s))  return 1;
   #endif

+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_is16(s))  return 1;
+   #endif
   return 0;
 }

--- a/src/external/stb_image_resize.h
+++ b/src/external/stb_image_resize.h
@ -1,4 +1,4 @@
-/* stb_image_resize - v0.96 - public domain image resizing
+/* stb_image_resize - v0.97 - public domain image resizing
   by Jorge L Rodriguez (@VinoBS) - 2014
   http://github.com/nothings/stb

@ -1064,7 +1064,11 @@ static void stbir__calculate_coefficients_upsample(stbir_filter filter, float sc
        total_filter += coefficient_group[i];
    }

-    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(in_last_pixel + 1) + 0.5f - in_center_of_out, 1/scale) == 0);
+    // NOTE(fg): Not actually true in general, nor is there any reason to expect it should be.
+    // It would be true in exact math but is at best approximately true in floating-point math,
+    // and it would not make sense to try and put actual bounds on this here because it depends
+    // on the image aspect ratio which can get pretty extreme.
+    //STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(in_last_pixel + 1) + 0.5f - in_center_of_out, 1/scale) == 0);

    STBIR_ASSERT(total_filter > 0.9);
    STBIR_ASSERT(total_filter < 1.1f); // Make sure it's not way off.
@ -1089,7 +1093,7 @@ static void stbir__calculate_coefficients_downsample(stbir_filter filter, float
 {
    int i;

-     STBIR_ASSERT(out_last_pixel - out_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(scale_ratio) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
+    STBIR_ASSERT(out_last_pixel - out_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(scale_ratio) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.

    contributor->n0 = out_first_pixel;
    contributor->n1 = out_last_pixel;
@ -1103,7 +1107,11 @@ static void stbir__calculate_coefficients_downsample(stbir_filter filter, float
        coefficient_group[i] = stbir__filter_info_table[filter].kernel(x, scale_ratio) * scale_ratio;
    }

-    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(out_last_pixel + 1) + 0.5f - out_center_of_in, scale_ratio) == 0);
+    // NOTE(fg): Not actually true in general, nor is there any reason to expect it should be.
+    // It would be true in exact math but is at best approximately true in floating-point math,
+    // and it would not make sense to try and put actual bounds on this here because it depends
+    // on the image aspect ratio which can get pretty extreme.
+    //STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(out_last_pixel + 1) + 0.5f - out_center_of_in, scale_ratio) == 0);

    for (i = out_last_pixel - out_first_pixel; i >= 0; i--)
    {
@ -1552,7 +1560,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                {
                    int out_pixel_index = k * 1;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                }
            }
@ -1573,7 +1580,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                {
                    int out_pixel_index = k * 2;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                }
@ -1595,7 +1601,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                {
                    int out_pixel_index = k * 3;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
@ -1618,7 +1623,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                {
                    int out_pixel_index = k * 4;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
@ -1643,7 +1647,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                    int c;
                    int out_pixel_index = k * channels;
                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                    for (c = 0; c < channels; c++)
                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
                }
--- a/src/external/stb_image_write.h
+++ b/src/external/stb_image_write.h
@ -1,4 +1,4 @@
-/* stb_image_write - v1.15 - public domain - http://nothings.org/stb
+/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
                                     no warranty implied; use at your own risk

@ -140,6 +140,7 @@ CREDITS:
      Ivan Tikhonov
      github:ignotion
      Adam Schackart
+      Andrew Kensler

 LICENSE

@ -166,9 +167,9 @@ LICENSE
 #endif

 #ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
-extern int stbi_write_tga_with_rle;
-extern int stbi_write_png_compression_level;
-extern int stbi_write_force_png_filter;
+STBIWDEF int stbi_write_tga_with_rle;
+STBIWDEF int stbi_write_png_compression_level;
+STBIWDEF int stbi_write_force_png_filter;
 #endif

 #ifndef STBI_WRITE_NO_STDIO
@ -178,7 +179,7 @@ STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const
 STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
 STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);

-#ifdef STBI_WINDOWS_UTF8
+#ifdef STBIW_WINDOWS_UTF8
 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
 #endif
 #endif
@ -285,7 +286,7 @@ static void stbi__stdio_write(void *context, void *data, int size)
   fwrite(data,1,size,(FILE*) context);
 }

-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
 #ifdef __cplusplus
 #define STBIW_EXTERN extern "C"
 #else
@ -296,25 +297,25 @@ STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned in

 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
 {
-	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); // -1 length: input is taken as NUL-terminated (per the Win32 docs); the API's return value is passed through unchanged
 }
 #endif

 static FILE *stbiw__fopen(char const *filename, char const *mode)
 {
   FILE *f;
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
   wchar_t wMode[64];
   wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)))
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
      return 0;

-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
      return 0;

-#if _MSC_VER >= 1400
-	if (0 != _wfopen_s(&f, wFilename, wMode))
-		f = 0;
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != _wfopen_s(&f, wFilename, wMode))
+      f = 0;
 #else
   f = _wfopen(wFilename, wMode);
 #endif
@ -397,7 +398,7 @@ static void stbiw__putc(stbi__write_context *s, unsigned char c)

 static void stbiw__write1(stbi__write_context *s, unsigned char a)
 {
-   if (s->buf_used + 1 > sizeof(s->buffer))
+   if ((size_t)s->buf_used + 1 > sizeof(s->buffer)) // flush first when one more byte would not fit; the cast does the comparison in size_t, the type of sizeof
       stbiw__write_flush(s);
    s->buffer[s->buf_used++] = a;
 }
@ -405,7 +406,7 @@ static void stbiw__write1(stbi__write_context *s, unsigned char a)
 static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
 {
   int n;
-   if (s->buf_used + 3 > sizeof(s->buffer))
+   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
      stbiw__write_flush(s);
   n = s->buf_used;
   s->buf_used = n+3;
@ -490,11 +491,22 @@ static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x,

 static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
 {
-   int pad = (-x*3) & 3;
-   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
-           "11 4 22 4" "4 44 22 444444",
-           'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
-            40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   if (comp != 4) {
+      // write 24-bit RGB bitmap (every comp value other than 4 takes this path)
+      int pad = (-x*3) & 3; // padding bytes that round each 3*x-byte row up to a multiple of 4
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
+              "11 4 22 4" "4 44 22 444444",
+              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header: total size, then pixel-data offset (14-byte file header + 40-byte info header)
+               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header: size 40, 1 plane, 24 bpp, remaining fields 0
+   } else {
+      // RGBA bitmaps need a v4 header
+      // use BI_BITFIELDS mode with 32bpp and alpha mask
+      // (straight BI_RGB with alpha mask doesn't work in most readers)
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
+         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
+         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header: 4 bytes per pixel, so no row padding is passed; data starts after the 108-byte header
+         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header: size 108, 32 bpp, compression 3 (BI_BITFIELDS), then R,G,B,A masks; trailing fields all 0
+   }
 }

 STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
@ -622,6 +634,8 @@ STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const

 #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))

+#ifndef STBI_WRITE_NO_STDIO
+
 static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
 {
   int exponent;
@ -756,7 +770,7 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f
      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
      s->func(s->context, header, sizeof(header)-1);

-#ifdef __STDC_WANT_SECURE_LIB__
+#ifdef __STDC_LIB_EXT1__
      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
 #else
      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
@ -777,7 +791,6 @@ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x,
   return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
 }

-#ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
 {
   stbi__write_context s = { 0 };
@ -968,6 +981,23 @@ STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, i
      (void) stbiw__sbfree(hash_table[i]);
   STBIW_FREE(hash_table);

+   // store uncompressed instead if compression was worse
+   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
+      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
+      for (j = 0; j < data_len;) {
+         int blocklen = data_len - j;
+         if (blocklen > 32767) blocklen = 32767;
+         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
+         memcpy(out+stbiw__sbn(out), data+j, blocklen);
+         stbiw__sbn(out) += blocklen;
+         j += blocklen;
+      }
+   }
+
   {
      // compute adler32 on input
      unsigned int s1=1, s2=0;
@ -1598,6 +1628,10 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
 #endif // STB_IMAGE_WRITE_IMPLEMENTATION

 /* Revision history
+      1.16  (2021-07-11)
+             make Deflate code emit uncompressed blocks when it would otherwise expand
+             support writing BMPs with alpha channel
+      1.15  (2020-07-13) unknown
      1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
      1.13
      1.12
@ -1635,7 +1669,7 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
             add HDR output
             fix monochrome BMP
      0.95 (2014-08-17)
-		       add monochrome TGA output
+             add monochrome TGA output
      0.94 (2014-05-31)
             rename private functions to avoid conflicts with stb_image.h
      0.93 (2014-05-27)
--- a/src/external/stb_rect_pack.h
+++ b/src/external/stb_rect_pack.h
@ -1,9 +1,15 @@
-// stb_rect_pack.h - v1.00 - public domain - rectangle packing
+// stb_rect_pack.h - v1.01 - public domain - rectangle packing
 // Sean Barrett 2014
 //
 // Useful for e.g. packing rectangular textures into an atlas.
 // Does not do rotation.
 //
+// Before #including,
+//
+//    #define STB_RECT_PACK_IMPLEMENTATION
+//
+// in the file that you want to have the implementation.
+//
 // Not necessarily the awesomest packing method, but better than
 // the totally naive one in stb_truetype (which is primarily what
 // this is meant to replace).
@ -35,6 +41,7 @@
 //
 // Version history:
 //
+//     1.01  (2021-07-11)  always use large rect mode, expose STBRP__MAXVAL in public section
 //     1.00  (2019-02-25)  avoid small space waste; gracefully fail too-wide rectangles
 //     0.99  (2019-02-07)  warning fixes
 //     0.11  (2017-03-03)  return packing success/fail result
@ -75,11 +82,10 @@ typedef struct stbrp_context stbrp_context;
 typedef struct stbrp_node    stbrp_node;
 typedef struct stbrp_rect    stbrp_rect;

-#ifdef STBRP_LARGE_RECTS
 typedef int            stbrp_coord;
-#else
-typedef unsigned short stbrp_coord;
-#endif
+
+#define STBRP__MAXVAL  0x7fffffff
+// Mostly for internal use, but this is the maximum supported coordinate value.

 STBRP_DEF int stbrp_pack_rects (stbrp_context *context, stbrp_rect *rects, int num_rects);
 // Assign packed locations to rectangles. The rectangles are of type
@ -209,8 +215,10 @@ struct stbrp_context

 #ifdef _MSC_VER
 #define STBRP__NOTUSED(v)  (void)(v)
+#define STBRP__CDECL       __cdecl
 #else
 #define STBRP__NOTUSED(v)  (void)sizeof(v)
+#define STBRP__CDECL
 #endif

 enum
@ -253,9 +261,6 @@ STBRP_DEF void stbrp_setup_allow_out_of_mem(stbrp_context *context, int allow_ou
 STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes)
 {
   int i;
-#ifndef STBRP_LARGE_RECTS
-   STBRP_ASSERT(width <= 0xffff && height <= 0xffff);
-#endif

   for (i=0; i < num_nodes-1; ++i)
      nodes[i].next = &nodes[i+1];
@ -274,11 +279,7 @@ STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height,
   context->extra[0].y = 0;
   context->extra[0].next = &context->extra[1];
   context->extra[1].x = (stbrp_coord) width;
-#ifdef STBRP_LARGE_RECTS
   context->extra[1].y = (1<<30);
-#else
-   context->extra[1].y = 65535;
-#endif
   context->extra[1].next = NULL;
 }

@ -520,7 +521,7 @@ static stbrp__findresult stbrp__skyline_pack_rectangle(stbrp_context *context, i
   return res;
 }

-static int rect_height_compare(const void *a, const void *b)
+static int STBRP__CDECL rect_height_compare(const void *a, const void *b)
 {
   const stbrp_rect *p = (const stbrp_rect *) a;
   const stbrp_rect *q = (const stbrp_rect *) b;
@ -531,19 +532,13 @@ static int rect_height_compare(const void *a, const void *b)
   return (p->w > q->w) ? -1 : (p->w < q->w);
 }

-static int rect_original_order(const void *a, const void *b)
+static int STBRP__CDECL rect_original_order(const void *a, const void *b)
 {
   const stbrp_rect *p = (const stbrp_rect *) a;
   const stbrp_rect *q = (const stbrp_rect *) b;
   return (p->was_packed < q->was_packed) ? -1 : (p->was_packed > q->was_packed);
 }

-#ifdef STBRP_LARGE_RECTS
-#define STBRP__MAXVAL  0xffffffff
-#else
-#define STBRP__MAXVAL  0xffff
-#endif
-
 STBRP_DEF int stbrp_pack_rects(stbrp_context *context, stbrp_rect *rects, int num_rects)
 {
   int i, all_rects_packed = 1;
--- a/src/external/stb_truetype.h
+++ b/src/external/stb_truetype.h
@ -1,5 +1,5 @@
-// stb_truetype.h - v1.24 - public domain
-// authored from 2009-2020 by Sean Barrett / RAD Game Tools
+// stb_truetype.h - v1.26 - public domain
+// authored from 2009-2021 by Sean Barrett / RAD Game Tools
 //
 // =======================================================================
 //
@ -53,11 +53,13 @@
 //       Johan Duparc               Thomas Fields
 //       Hou Qiming                 Derek Vinyard
 //       Rob Loach                  Cort Stratton
-//       Kenney Phillis Jr.         Brian Costabile            
-//       Ken Voskuil (kaesve)       
+//       Kenney Phillis Jr.         Brian Costabile
+//       Ken Voskuil (kaesve)
 //
 // VERSION HISTORY
 //
+//   1.26 (2021-08-28) fix broken rasterizer
+//   1.25 (2021-07-11) many fixes
 //   1.24 (2020-02-05) fix warning
 //   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
 //   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
@ -270,8 +272,8 @@
 ////  SAMPLE PROGRAMS
 ////
 //
-//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless
-//
+//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless.
+//  See "tests/truetype_demo_win32.c" for a complete version.
 #if 0
 #define STB_TRUETYPE_IMPLEMENTATION  // force following include to generate implementation
 #include "stb_truetype.h"
@ -297,6 +299,8 @@ void my_stbtt_initfont(void)
 void my_stbtt_print(float x, float y, char *text)
 {
   // assume orthographic projection with units = screen pixels, origin at top left
+   glEnable(GL_BLEND);
+   glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
   glEnable(GL_TEXTURE_2D);
   glBindTexture(GL_TEXTURE_2D, ftex);
   glBegin(GL_QUADS);
@ -304,10 +308,10 @@ void my_stbtt_print(float x, float y, char *text)
      if (*text >= 32 && *text < 128) {
         stbtt_aligned_quad q;
         stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl & d3d10+,0=d3d9
-         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y0);
-         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y0);
-         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y1);
-         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y1);
+         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y0);
+         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y0);
+         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y1);
+         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y1);
      }
      ++text;
   }
@ -853,6 +857,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
 STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices);
 // frees the data allocated above

+STBTT_DEF unsigned char *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl);
 STBTT_DEF int stbtt_GetCodepointSVG(const stbtt_fontinfo *info, int unicode_codepoint, const char **svg);
 STBTT_DEF int stbtt_GetGlyphSVG(const stbtt_fontinfo *info, int gl, const char **svg);
 // fills svg with the character's SVG data.
@ -1539,12 +1544,12 @@ STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codep
      search += 2;

      {
-         stbtt_uint16 offset, start;
+         stbtt_uint16 offset, start, last;
         stbtt_uint16 item = (stbtt_uint16) ((search - endCount) >> 1);

-         STBTT_assert(unicode_codepoint <= ttUSHORT(data + endCount + 2*item));
         start = ttUSHORT(data + index_map + 14 + segcount*2 + 2 + 2*item);
-         if (unicode_codepoint < start)
+         last = ttUSHORT(data + endCount + 2*item);
+         if (unicode_codepoint < start || unicode_codepoint > last)
            return 0;

         offset = ttUSHORT(data + index_map + 14 + segcount*6 + 2 + 2*item);
@ -1871,7 +1876,7 @@ static int stbtt__GetGlyphShapeTT(const stbtt_fontinfo *info, int glyph_index, s
               if (comp_verts) STBTT_free(comp_verts, info->userdata);
               return 0;
            }
-            if (num_vertices > 0) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
+            if (num_vertices > 0 && vertices) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
            STBTT_memcpy(tmp+num_vertices, comp_verts, comp_num_verts*sizeof(stbtt_vertex));
            if (vertices) STBTT_free(vertices, info->userdata);
            vertices = tmp;
@ -2134,7 +2139,7 @@ static int stbtt__run_charstring(const stbtt_fontinfo *info, int glyph_index, st
               subrs = stbtt__cid_get_glyph_subrs(info, glyph_index);
            has_subrs = 1;
         }
-         // fallthrough
+         // FALLTHROUGH
      case 0x1D: // callgsubr
         if (sp < 1) return STBTT__CSERR("call(g|)subr stack");
         v = (int) s[--sp];
@ -2239,7 +2244,7 @@ static int stbtt__run_charstring(const stbtt_fontinfo *info, int glyph_index, st
      } break;

      default:
-         if (b0 != 255 && b0 != 28 && (b0 < 32 || b0 > 254))
+         if (b0 != 255 && b0 != 28 && b0 < 32)
            return STBTT__CSERR("reserved operator");

         // push immediate
@ -2351,7 +2356,7 @@ STBTT_DEF int stbtt_GetKerningTable(const stbtt_fontinfo *info, stbtt_kerningent
   return length;
 }

-static int  stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+static int stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
 {
   stbtt_uint8 *data = info->data + info->kern;
   stbtt_uint32 needle, straw;
@ -2381,243 +2386,225 @@ static int  stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph
   return 0;
 }

-static stbtt_int32  stbtt__GetCoverageIndex(stbtt_uint8 *coverageTable, int glyph)
+static stbtt_int32 stbtt__GetCoverageIndex(stbtt_uint8 *coverageTable, int glyph)
 {
-    stbtt_uint16 coverageFormat = ttUSHORT(coverageTable);
-    switch(coverageFormat) {
-        case 1: {
-            stbtt_uint16 glyphCount = ttUSHORT(coverageTable + 2);
+   stbtt_uint16 coverageFormat = ttUSHORT(coverageTable);
+   switch (coverageFormat) {
+      case 1: {
+         stbtt_uint16 glyphCount = ttUSHORT(coverageTable + 2);

-            // Binary search.
-            stbtt_int32 l=0, r=glyphCount-1, m;
-            int straw, needle=glyph;
-            while (l <= r) {
-                stbtt_uint8 *glyphArray = coverageTable + 4;
-                stbtt_uint16 glyphID;
-                m = (l + r) >> 1;
-                glyphID = ttUSHORT(glyphArray + 2 * m);
-                straw = glyphID;
-                if (needle < straw)
-                    r = m - 1;
-                else if (needle > straw)
-                    l = m + 1;
-                else {
-                     return m;
-                }
+         // Binary search.
+         stbtt_int32 l=0, r=glyphCount-1, m;
+         int straw, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *glyphArray = coverageTable + 4;
+            stbtt_uint16 glyphID;
+            m = (l + r) >> 1;
+            glyphID = ttUSHORT(glyphArray + 2 * m);
+            straw = glyphID;
+            if (needle < straw)
+               r = m - 1;
+            else if (needle > straw)
+               l = m + 1;
+            else {
+               return m;
            }
-        } break;
+         }
+         break;
+      }

-        case 2: {
-            stbtt_uint16 rangeCount = ttUSHORT(coverageTable + 2);
-            stbtt_uint8 *rangeArray = coverageTable + 4;
+      case 2: {
+         stbtt_uint16 rangeCount = ttUSHORT(coverageTable + 2);
+         stbtt_uint8 *rangeArray = coverageTable + 4;

-            // Binary search.
-            stbtt_int32 l=0, r=rangeCount-1, m;
-            int strawStart, strawEnd, needle=glyph;
-            while (l <= r) {
-                stbtt_uint8 *rangeRecord;
-                m = (l + r) >> 1;
-                rangeRecord = rangeArray + 6 * m;
-                strawStart = ttUSHORT(rangeRecord);
-                strawEnd = ttUSHORT(rangeRecord + 2);
-                if (needle < strawStart)
-                    r = m - 1;
-                else if (needle > strawEnd)
-                    l = m + 1;
-                else {
-                    stbtt_uint16 startCoverageIndex = ttUSHORT(rangeRecord + 4);
-                    return startCoverageIndex + glyph - strawStart;
-                }
+         // Binary search.
+         stbtt_int32 l=0, r=rangeCount-1, m;
+         int strawStart, strawEnd, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *rangeRecord;
+            m = (l + r) >> 1;
+            rangeRecord = rangeArray + 6 * m;
+            strawStart = ttUSHORT(rangeRecord);
+            strawEnd = ttUSHORT(rangeRecord + 2);
+            if (needle < strawStart)
+               r = m - 1;
+            else if (needle > strawEnd)
+               l = m + 1;
+            else {
+               stbtt_uint16 startCoverageIndex = ttUSHORT(rangeRecord + 4);
+               return startCoverageIndex + glyph - strawStart;
            }
-        } break;
+         }
+         break;
+      }

-        default: {
-            // There are no other cases.
-            STBTT_assert(0);
-        } break;
-    }
+      default: return -1; // unsupported
+   }

-    return -1;
+   return -1;
 }

 static stbtt_int32  stbtt__GetGlyphClass(stbtt_uint8 *classDefTable, int glyph)
 {
-    stbtt_uint16 classDefFormat = ttUSHORT(classDefTable);
-    switch(classDefFormat)
-    {
-        case 1: {
-            stbtt_uint16 startGlyphID = ttUSHORT(classDefTable + 2);
-            stbtt_uint16 glyphCount = ttUSHORT(classDefTable + 4);
-            stbtt_uint8 *classDef1ValueArray = classDefTable + 6;
+   stbtt_uint16 classDefFormat = ttUSHORT(classDefTable);
+   switch (classDefFormat)
+   {
+      case 1: {
+         stbtt_uint16 startGlyphID = ttUSHORT(classDefTable + 2);
+         stbtt_uint16 glyphCount = ttUSHORT(classDefTable + 4);
+         stbtt_uint8 *classDef1ValueArray = classDefTable + 6;

-            if (glyph >= startGlyphID && glyph < startGlyphID + glyphCount)
-                return (stbtt_int32)ttUSHORT(classDef1ValueArray + 2 * (glyph - startGlyphID));
+         if (glyph >= startGlyphID && glyph < startGlyphID + glyphCount)
+            return (stbtt_int32)ttUSHORT(classDef1ValueArray + 2 * (glyph - startGlyphID));
+         break;
+      }

-            classDefTable = classDef1ValueArray + 2 * glyphCount;
-        } break;
+      case 2: {
+         stbtt_uint16 classRangeCount = ttUSHORT(classDefTable + 2);
+         stbtt_uint8 *classRangeRecords = classDefTable + 4;

-        case 2: {
-            stbtt_uint16 classRangeCount = ttUSHORT(classDefTable + 2);
-            stbtt_uint8 *classRangeRecords = classDefTable + 4;
+         // Binary search.
+         stbtt_int32 l=0, r=classRangeCount-1, m;
+         int strawStart, strawEnd, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *classRangeRecord;
+            m = (l + r) >> 1;
+            classRangeRecord = classRangeRecords + 6 * m;
+            strawStart = ttUSHORT(classRangeRecord);
+            strawEnd = ttUSHORT(classRangeRecord + 2);
+            if (needle < strawStart)
+               r = m - 1;
+            else if (needle > strawEnd)
+               l = m + 1;
+            else
+               return (stbtt_int32)ttUSHORT(classRangeRecord + 4);
+         }
+         break;
+      }

-            // Binary search.
-            stbtt_int32 l=0, r=classRangeCount-1, m;
-            int strawStart, strawEnd, needle=glyph;
-            while (l <= r) {
-                stbtt_uint8 *classRangeRecord;
-                m = (l + r) >> 1;
-                classRangeRecord = classRangeRecords + 6 * m;
-                strawStart = ttUSHORT(classRangeRecord);
-                strawEnd = ttUSHORT(classRangeRecord + 2);
-                if (needle < strawStart)
-                    r = m - 1;
-                else if (needle > strawEnd)
-                    l = m + 1;
-                else
-                    return (stbtt_int32)ttUSHORT(classRangeRecord + 4);
-            }
+      default:
+         return -1; // Unsupported definition type, return an error.
+   }

-            classDefTable = classRangeRecords + 6 * classRangeCount;
-        } break;
-
-        default: {
-            // There are no other cases.
-            STBTT_assert(0);
-        } break;
-    }
-
-    return -1;
+   // "All glyphs not assigned to a class fall into class 0". (OpenType spec)
+   return 0;
 }

 // Define to STBTT_assert(x) if you want to break on unimplemented formats.
 #define STBTT_GPOS_TODO_assert(x)

-static stbtt_int32  stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+static stbtt_int32 stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
 {
-    stbtt_uint16 lookupListOffset;
-    stbtt_uint8 *lookupList;
-    stbtt_uint16 lookupCount;
-    stbtt_uint8 *data;
-    stbtt_int32 i;
+   stbtt_uint16 lookupListOffset;
+   stbtt_uint8 *lookupList;
+   stbtt_uint16 lookupCount;
+   stbtt_uint8 *data;
+   stbtt_int32 i, sti;

-    if (!info->gpos) return 0;
+   if (!info->gpos) return 0;

-    data = info->data + info->gpos;
+   data = info->data + info->gpos;

-    if (ttUSHORT(data+0) != 1) return 0; // Major version 1
-    if (ttUSHORT(data+2) != 0) return 0; // Minor version 0
+   if (ttUSHORT(data+0) != 1) return 0; // Major version 1
+   if (ttUSHORT(data+2) != 0) return 0; // Minor version 0

-    lookupListOffset = ttUSHORT(data+8);
-    lookupList = data + lookupListOffset;
-    lookupCount = ttUSHORT(lookupList);
+   lookupListOffset = ttUSHORT(data+8);
+   lookupList = data + lookupListOffset;
+   lookupCount = ttUSHORT(lookupList);

-    for (i=0; i<lookupCount; ++i) {
-        stbtt_uint16 lookupOffset = ttUSHORT(lookupList + 2 + 2 * i);
-        stbtt_uint8 *lookupTable = lookupList + lookupOffset;
+   for (i=0; i<lookupCount; ++i) {
+      stbtt_uint16 lookupOffset = ttUSHORT(lookupList + 2 + 2 * i);
+      stbtt_uint8 *lookupTable = lookupList + lookupOffset;

-        stbtt_uint16 lookupType = ttUSHORT(lookupTable);
-        stbtt_uint16 subTableCount = ttUSHORT(lookupTable + 4);
-        stbtt_uint8 *subTableOffsets = lookupTable + 6;
-        switch(lookupType) {
-            case 2: { // Pair Adjustment Positioning Subtable
-                stbtt_int32 sti;
-                for (sti=0; sti<subTableCount; sti++) {
-                    stbtt_uint16 subtableOffset = ttUSHORT(subTableOffsets + 2 * sti);
-                    stbtt_uint8 *table = lookupTable + subtableOffset;
-                    stbtt_uint16 posFormat = ttUSHORT(table);
-                    stbtt_uint16 coverageOffset = ttUSHORT(table + 2);
-                    stbtt_int32 coverageIndex = stbtt__GetCoverageIndex(table + coverageOffset, glyph1);
-                    if (coverageIndex == -1) continue;
+      stbtt_uint16 lookupType = ttUSHORT(lookupTable);
+      stbtt_uint16 subTableCount = ttUSHORT(lookupTable + 4);
+      stbtt_uint8 *subTableOffsets = lookupTable + 6;
+      if (lookupType != 2) // Pair Adjustment Positioning Subtable
+         continue;

-                    switch (posFormat) {
-                        case 1: {
-                            stbtt_int32 l, r, m;
-                            int straw, needle;
-                            stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
-                            stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
-                            stbtt_int32 valueRecordPairSizeInBytes = 2;
-                            stbtt_uint16 pairSetCount = ttUSHORT(table + 8);
-                            stbtt_uint16 pairPosOffset = ttUSHORT(table + 10 + 2 * coverageIndex);
-                            stbtt_uint8 *pairValueTable = table + pairPosOffset;
-                            stbtt_uint16 pairValueCount = ttUSHORT(pairValueTable);
-                            stbtt_uint8 *pairValueArray = pairValueTable + 2;
-                            // TODO: Support more formats.
-                            STBTT_GPOS_TODO_assert(valueFormat1 == 4);
-                            if (valueFormat1 != 4) return 0;
-                            STBTT_GPOS_TODO_assert(valueFormat2 == 0);
-                            if (valueFormat2 != 0) return 0;
+      for (sti=0; sti<subTableCount; sti++) {
+         stbtt_uint16 subtableOffset = ttUSHORT(subTableOffsets + 2 * sti);
+         stbtt_uint8 *table = lookupTable + subtableOffset;
+         stbtt_uint16 posFormat = ttUSHORT(table);
+         stbtt_uint16 coverageOffset = ttUSHORT(table + 2);
+         stbtt_int32 coverageIndex = stbtt__GetCoverageIndex(table + coverageOffset, glyph1);
+         if (coverageIndex == -1) continue;

-                            STBTT_assert(coverageIndex < pairSetCount);
-                            STBTT__NOTUSED(pairSetCount);
+         switch (posFormat) {
+            case 1: {
+               stbtt_int32 l, r, m;
+               int straw, needle;
+               stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
+               stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+               if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                  stbtt_int32 valueRecordPairSizeInBytes = 2;
+                  stbtt_uint16 pairSetCount = ttUSHORT(table + 8);
+                  stbtt_uint16 pairPosOffset = ttUSHORT(table + 10 + 2 * coverageIndex);
+                  stbtt_uint8 *pairValueTable = table + pairPosOffset;
+                  stbtt_uint16 pairValueCount = ttUSHORT(pairValueTable);
+                  stbtt_uint8 *pairValueArray = pairValueTable + 2;

-                            needle=glyph2;
-                            r=pairValueCount-1;
-                            l=0;
+                  if (coverageIndex >= pairSetCount) return 0;

-                            // Binary search.
-                            while (l <= r) {
-                                stbtt_uint16 secondGlyph;
-                                stbtt_uint8 *pairValue;
-                                m = (l + r) >> 1;
-                                pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m;
-                                secondGlyph = ttUSHORT(pairValue);
-                                straw = secondGlyph;
-                                if (needle < straw)
-                                    r = m - 1;
-                                else if (needle > straw)
-                                    l = m + 1;
-                                else {
-                                    stbtt_int16 xAdvance = ttSHORT(pairValue + 2);
-                                    return xAdvance;
-                                }
-                            }
-                        } break;
+                  needle=glyph2;
+                  r=pairValueCount-1;
+                  l=0;

-                        case 2: {
-                            stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
-                            stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+                  // Binary search.
+                  while (l <= r) {
+                     stbtt_uint16 secondGlyph;
+                     stbtt_uint8 *pairValue;
+                     m = (l + r) >> 1;
+                     pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m;
+                     secondGlyph = ttUSHORT(pairValue);
+                     straw = secondGlyph;
+                     if (needle < straw)
+                        r = m - 1;
+                     else if (needle > straw)
+                        l = m + 1;
+                     else {
+                        stbtt_int16 xAdvance = ttSHORT(pairValue + 2);
+                        return xAdvance;
+                     }
+                  }
+               } else
+                  return 0;
+               break;
+            }

-                            stbtt_uint16 classDef1Offset = ttUSHORT(table + 8);
-                            stbtt_uint16 classDef2Offset = ttUSHORT(table + 10);
-                            int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1);
-                            int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2);
+            case 2: {
+               stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
+               stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+               if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                  stbtt_uint16 classDef1Offset = ttUSHORT(table + 8);
+                  stbtt_uint16 classDef2Offset = ttUSHORT(table + 10);
+                  int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1);
+                  int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2);

-                            stbtt_uint16 class1Count = ttUSHORT(table + 12);
-                            stbtt_uint16 class2Count = ttUSHORT(table + 14);
-                            STBTT_assert(glyph1class < class1Count);
-                            STBTT_assert(glyph2class < class2Count);
+                  stbtt_uint16 class1Count = ttUSHORT(table + 12);
+                  stbtt_uint16 class2Count = ttUSHORT(table + 14);
+                  stbtt_uint8 *class1Records, *class2Records;
+                  stbtt_int16 xAdvance;

-                            // TODO: Support more formats.
-                            STBTT_GPOS_TODO_assert(valueFormat1 == 4);
-                            if (valueFormat1 != 4) return 0;
-                            STBTT_GPOS_TODO_assert(valueFormat2 == 0);
-                            if (valueFormat2 != 0) return 0;
+                  if (glyph1class < 0 || glyph1class >= class1Count) return 0; // malformed
+                  if (glyph2class < 0 || glyph2class >= class2Count) return 0; // malformed

-                            if (glyph1class >= 0 && glyph1class < class1Count && glyph2class >= 0 && glyph2class < class2Count) {
-                                stbtt_uint8 *class1Records = table + 16;
-                                stbtt_uint8 *class2Records = class1Records + 2 * (glyph1class * class2Count);
-                                stbtt_int16 xAdvance = ttSHORT(class2Records + 2 * glyph2class);
-                                return xAdvance;
-                            }
-                        } break;
-
-                        default: {
-                            // There are no other cases.
-                            STBTT_assert(0);
-                            break;
-                        };
-                    }
-                }
-                break;
-            };
+                  class1Records = table + 16;
+                  class2Records = class1Records + 2 * (glyph1class * class2Count);
+                  xAdvance = ttSHORT(class2Records + 2 * glyph2class);
+                  return xAdvance;
+               } else
+                  return 0;
+               break;
+            }

            default:
-                // TODO: Implement other stuff.
-                break;
-        }
-    }
+               return 0; // Unsupported position format
+         }
+      }
+   }

-    return 0;
+   return 0;
 }

 STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int g1, int g2)
@ -3075,6 +3062,23 @@ static void stbtt__handle_clipped_edge(float *scanline, int x, stbtt__active_edg
   }
 }

+static float stbtt__sized_trapezoid_area(float height, float top_width, float bottom_width)
+{
+   STBTT_assert(top_width >= 0);
+   STBTT_assert(bottom_width >= 0);
+   return (top_width + bottom_width) / 2.0f * height;
+}
+
+static float stbtt__position_trapezoid_area(float height, float tx0, float tx1, float bx0, float bx1)
+{
+   return stbtt__sized_trapezoid_area(height, tx1 - tx0, bx1 - bx0);
+}
+
+static float stbtt__sized_triangle_area(float height, float width)
+{
+   return height * width / 2;
+}
+
 static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, int len, stbtt__active_edge *e, float y_top)
 {
   float y_bottom = y_top+1;
@ -3129,13 +3133,13 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
               float height;
               // simple case, only spans one pixel
               int x = (int) x_top;
-               height = sy1 - sy0;
+               height = (sy1 - sy0) * e->direction;
               STBTT_assert(x >= 0 && x < len);
-               scanline[x] += e->direction * (1-((x_top - x) + (x_bottom-x))/2)  * height;
-               scanline_fill[x] += e->direction * height; // everything right of this pixel is filled
+               scanline[x]      += stbtt__position_trapezoid_area(height, x_top, x+1.0f, x_bottom, x+1.0f);
+               scanline_fill[x] += height; // everything right of this pixel is filled
            } else {
               int x,x1,x2;
-               float y_crossing, step, sign, area;
+               float y_crossing, y_final, step, sign, area;
               // covers 2+ pixels
               if (x_top > x_bottom) {
                  // flip scanline vertically; signed area is the same
@ -3148,29 +3152,79 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
                  dy = -dy;
                  t = x0, x0 = xb, xb = t;
               }
+               STBTT_assert(dy >= 0);
+               STBTT_assert(dx >= 0);

               x1 = (int) x_top;
               x2 = (int) x_bottom;
               // compute intersection with y axis at x1+1
-               y_crossing = (x1+1 - x0) * dy + y_top;
+               y_crossing = y_top + dy * (x1+1 - x0);
+
+               // compute the y at which the edge crosses the vertical line x = x2
+               y_final = y_top + dy * (x2 - x0);
+
+               //           x1    x_top                            x2    x_bottom
+               //     y_top  +------|-----+------------+------------+--------|---+------------+
+               //            |            |            |            |            |            |
+               //            |            |            |            |            |            |
+               //       sy0  |      Txxxxx|............|............|............|............|
+               // y_crossing |            *xxxxx.......|............|............|............|
+               //            |            |     xxxxx..|............|............|............|
+               //            |            |     /-   xx*xxxx........|............|............|
+               //            |            | dy <       |    xxxxxx..|............|............|
+               //   y_final  |            |     \-     |          xx*xxx.........|............|
+               //       sy1  |            |            |            |   xxxxxB...|............|
+               //            |            |            |            |            |            |
+               //            |            |            |            |            |            |
+               //  y_bottom  +------------+------------+------------+------------+------------+
+               //
+               // goal is to measure the area covered by '.' in each pixel
+
+               // if x2 is right at the right edge of x1, y_crossing can blow up, github #1057
+               // @TODO: maybe test against sy1 rather than y_bottom?
+               if (y_crossing > y_bottom)
+                  y_crossing = y_bottom;

               sign = e->direction;
-               // area of the rectangle covered from y0..y_crossing
-               area = sign * (y_crossing-sy0);
-               // area of the triangle (x_top,y0), (x+1,y0), (x+1,y_crossing)
-               scanline[x1] += area * (1-((x_top - x1)+(x1+1-x1))/2);

-               step = sign * dy;
+               // area of the rectangle covered from sy0..y_crossing
+               area = sign * (y_crossing-sy0);
+
+               // area of the triangle (x_top,sy0), (x1+1,sy0), (x1+1,y_crossing)
+               scanline[x1] += stbtt__sized_triangle_area(area, x1+1 - x_top);
+
+               // check if final y_crossing is blown up; no test case for this
+               if (y_final > y_bottom) {
+                  y_final = y_bottom;
+                  dy = (y_final - y_crossing ) / (x2 - (x1+1)); // if denom=0, y_final = y_crossing, so y_final <= y_bottom
+               }
+
+               // in second pixel, area covered by line segment found in first pixel
+               // is always a rectangle 1 wide * the height of that line segment; this
+               // is exactly what the variable 'area' stores. it also gets a contribution
+               // from the line segment within it. the THIRD pixel will get the first
+               // pixel's rectangle contribution, the second pixel's rectangle contribution,
+               // and its own contribution. the 'own contribution' is the same in every pixel except
+               // the leftmost and rightmost, a trapezoid that slides down in each pixel.
+               // the second pixel's contribution to the third pixel will be the
+               // rectangle 1 wide times the height change in the second pixel, which is dy.
+
+               step = sign * dy * 1; // dy is dy/dx, change in y for every 1 change in x,
+               // which multiplied by 1-pixel-width is how much pixel area changes for each step in x
+               // so the area advances by 'step' every time
+
               for (x = x1+1; x < x2; ++x) {
-                  scanline[x] += area + step/2;
+                  scanline[x] += area + step/2; // area of trapezoid is 1*step/2
                  area += step;
               }
-               y_crossing += dy * (x2 - (x1+1));
+               STBTT_assert(STBTT_fabs(area) <= 1.01f); // accumulated error from area += step unless we round step down
+               STBTT_assert(sy1 > y_final-0.01f);

-               STBTT_assert(STBTT_fabs(area) <= 1.01f);
-
-               scanline[x2] += area + sign * (1-((x2-x2)+(x_bottom-x2))/2) * (sy1-y_crossing);
+               // area covered in the last pixel is the rectangle from all the pixels to the left,
+               // plus the trapezoid filled by the line segment in this pixel all the way to the right edge
+               scanline[x2] += area + sign * stbtt__position_trapezoid_area(sy1-y_final, (float) x2, x2+1.0f, x_bottom, x2+1.0f);

+               // the rest of the line is filled based on the total height of the line segment in this pixel
               scanline_fill[x2] += sign * (sy1-sy0);
            }
         } else {
@ -3178,6 +3232,9 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
            // clipping logic. since this does not match the intended use
            // of this library, we use a different, very slow brute
            // force implementation
+            // note though that this does happen some of the time because
+            // x_top and x_bottom can be extrapolated at the top & bottom of
+            // the shape and actually lie outside the bounding box
            int x;
            for (x=0; x < len; ++x) {
               // cases:
@ -4414,15 +4471,14 @@ static int stbtt__compute_crossings_x(float x, float y, int nverts, stbtt_vertex
   float y_frac;
   int winding = 0;

-   orig[0] = x;
-   orig[1] = y;
-
   // make sure y never passes through a vertex of the shape
   y_frac = (float) STBTT_fmod(y, 1.0f);
   if (y_frac < 0.01f)
      y += 0.01f;
   else if (y_frac > 0.99f)
      y -= 0.01f;
+
+   orig[0] = x;
   orig[1] = y;

   // test a ray from (-infinity,y) to (x,y)
@ -4484,35 +4540,35 @@ static float stbtt__cuberoot( float x )
      return  (float) STBTT_pow( x,1.0f/3.0f);
 }

-// x^3 + c*x^2 + b*x + a = 0
+// x^3 + a*x^2 + b*x + c = 0
 static int stbtt__solve_cubic(float a, float b, float c, float* r)
 {
-	float s = -a / 3;
-	float p = b - a*a / 3;
-	float q = a * (2*a*a - 9*b) / 27 + c;
+   float s = -a / 3;
+   float p = b - a*a / 3;
+   float q = a * (2*a*a - 9*b) / 27 + c;
   float p3 = p*p*p;
-	float d = q*q + 4*p3 / 27;
-	if (d >= 0) {
-		float z = (float) STBTT_sqrt(d);
-		float u = (-q + z) / 2;
-		float v = (-q - z) / 2;
-		u = stbtt__cuberoot(u);
-		v = stbtt__cuberoot(v);
-		r[0] = s + u + v;
-		return 1;
-	} else {
-	   float u = (float) STBTT_sqrt(-p/3);
-	   float v = (float) STBTT_acos(-STBTT_sqrt(-27/p3) * q / 2) / 3; // p3 must be negative, since d is negative
-	   float m = (float) STBTT_cos(v);
+   float d = q*q + 4*p3 / 27;
+   if (d >= 0) {
+      float z = (float) STBTT_sqrt(d);
+      float u = (-q + z) / 2;
+      float v = (-q - z) / 2;
+      u = stbtt__cuberoot(u);
+      v = stbtt__cuberoot(v);
+      r[0] = s + u + v;
+      return 1;
+   } else {
+      float u = (float) STBTT_sqrt(-p/3);
+      float v = (float) STBTT_acos(-STBTT_sqrt(-27/p3) * q / 2) / 3; // p3 must be negative, since d is negative
+      float m = (float) STBTT_cos(v);
      float n = (float) STBTT_cos(v-3.141592/2)*1.732050808f;
-	   r[0] = s + u * 2 * m;
-	   r[1] = s - u * (m + n);
-	   r[2] = s - u * (m - n);
+      r[0] = s + u * 2 * m;
+      r[1] = s - u * (m + n);
+      r[2] = s - u * (m - n);

      //STBTT_assert( STBTT_fabs(((r[0]+a)*r[0]+b)*r[0]+c) < 0.05f);  // these asserts may not be safe at all scales, though they're in bezier t parameter units so maybe?
      //STBTT_assert( STBTT_fabs(((r[1]+a)*r[1]+b)*r[1]+c) < 0.05f);
      //STBTT_assert( STBTT_fabs(((r[2]+a)*r[2]+b)*r[2]+c) < 0.05f);
-   	return 3;
+      return 3;
   }
 }

@ -4589,18 +4645,17 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
            for (i=0; i < num_verts; ++i) {
               float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y;

-               // check against every point here rather than inside line/curve primitives -- @TODO: wrong if multiple 'moves' in a row produce a garbage point, and given culling, probably more efficient to do within line/curve
-               float dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
-               if (dist2 < min_dist*min_dist)
-                  min_dist = (float) STBTT_sqrt(dist2);
-
-               if (verts[i].type == STBTT_vline) {
+               if (verts[i].type == STBTT_vline && precompute[i] != 0.0f) {
                  float x1 = verts[i-1].x*scale_x, y1 = verts[i-1].y*scale_y;

+                  float dist,dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
+                  if (dist2 < min_dist*min_dist)
+                     min_dist = (float) STBTT_sqrt(dist2);
+
                  // coarse culling against bbox
                  //if (sx > STBTT_min(x0,x1)-min_dist && sx < STBTT_max(x0,x1)+min_dist &&
                  //    sy > STBTT_min(y0,y1)-min_dist && sy < STBTT_max(y0,y1)+min_dist)
-                  float dist = (float) STBTT_fabs((x1-x0)*(y0-sy) - (y1-y0)*(x0-sx)) * precompute[i];
+                  dist = (float) STBTT_fabs((x1-x0)*(y0-sy) - (y1-y0)*(x0-sx)) * precompute[i];
                  STBTT_assert(i != 0);
                  if (dist < min_dist) {
                     // check position along line
@ -4627,7 +4682,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
                     float ax = x1-x0, ay = y1-y0;
                     float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
                     float mx = x0 - sx, my = y0 - sy;
-                     float res[3],px,py,t,it;
+                     float res[3] = {0.f,0.f,0.f};
+                     float px,py,t,it,dist2;
                     float a_inv = precompute[i];
                     if (a_inv == 0.0) { // if a_inv is 0, it's 2nd degree so use quadratic formula
                        float a = 3*(ax*bx + ay*by);
@ -4654,6 +4710,10 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
                        float d = (mx*ax+my*ay) * a_inv;
                        num = stbtt__solve_cubic(b, c, d, res);
                     }
+                     dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
+                     if (dist2 < min_dist*min_dist)
+                        min_dist = (float) STBTT_sqrt(dist2);
+
                     if (num >= 1 && res[0] >= 0.0f && res[0] <= 1.0f) {
                        t = res[0], it = 1.0f - t;
                        px = it*it*x0 + 2*t*it*x1 + t*t*x2;
@ -4913,6 +4973,12 @@ STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const

 // FULL VERSION HISTORY
 //
+//   1.25 (2021-07-11) many fixes
+//   1.24 (2020-02-05) fix warning
+//   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
+//   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
+//   1.21 (2019-02-25) fix warning
+//   1.20 (2019-02-07) PackFontRange skips missing codepoints; GetScaleFontVMetrics()
 //   1.19 (2018-02-11) OpenType GPOS kerning (horizontal only), STBTT_fmod
 //   1.18 (2018-01-29) add missing function
 //   1.17 (2017-07-23) make more arguments const; doc fix
--- a/src/rcore.c
+++ b/src/rcore.c
@ -148,6 +148,7 @@

 #if defined(SUPPORT_COMPRESSION_API)
    #define SINFL_IMPLEMENTATION
+    #define SINFL_NO_SIMD
    #include "external/sinfl.h"     // Deflate (RFC 1951) decompressor

    #define SDEFL_IMPLEMENTATION
@ -3004,7 +3005,7 @@ unsigned char *DecompressData(unsigned char *compData, int compDataLength, int *
 #if defined(SUPPORT_COMPRESSION_API)
    // Decompress data from a valid DEFLATE stream
    data = RL_CALLOC(MAX_DECOMPRESSION_SIZE*1024*1024, 1);
-    int length = sinflate(data, compData, compDataLength);
+    int length = sinflate(data, MAX_DECOMPRESSION_SIZE*1024*1024, compData, compDataLength);   // Output capacity in bytes, must match the allocation above (MAX_DECOMPRESSION_SIZE is in MB)
    unsigned char *temp = RL_REALLOC(data, length);

    if (temp != NULL) data = temp;
--- a/src/rmodels.c
+++ b/src/rmodels.c
@ -1830,7 +1830,7 @@ void UpdateModelAnimation(Model model, ModelAnimation anim, int frame)
                        continue;
                    }
                    boneId = mesh.boneIds[boneCounter];
-                    int boneIdParent = model.bones[boneId].parent;
+                    //int boneIdParent = model.bones[boneId].parent;
                    inTranslation = model.bindPose[boneId].translation;
                    inRotation = model.bindPose[boneId].rotation;
                    // inScale = model.bindPose[boneId].scale;