From 2d82cd1a2c0b8c8db1eac55d84f736a3aed9bb0e Mon Sep 17 00:00:00 2001
From: t-mw <t-mw@users.noreply.github.com>
Date: Tue, 2 Mar 2021 13:53:31 +0100
Subject: [PATCH 001/170] Rename arraddnoff -> arraddnindex

Fixes #1011.
As intended according to https://github.com/nothings/stb/pull/932#issuecomment-657524197.
---
 stb_ds.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/stb_ds.h b/stb_ds.h
index d1e0c75..e5baad7 100644
--- a/stb_ds.h
+++ b/stb_ds.h
@@ -531,24 +531,24 @@ extern void * stbds_shmode_func(size_t elemsize, int mode);
 #define stbds_temp(t)    stbds_header(t)->temp
 #define stbds_temp_key(t) (*(char **) stbds_header(t)->hash_table)
 
-#define stbds_arrsetcap(a,n)  (stbds_arrgrow(a,0,n))
-#define stbds_arrsetlen(a,n)  ((stbds_arrcap(a) < (size_t) (n) ? stbds_arrsetcap((a),(size_t)(n)),0 : 0), (a) ? stbds_header(a)->length = (size_t) (n) : 0)
-#define stbds_arrcap(a)       ((a) ? stbds_header(a)->capacity : 0)
-#define stbds_arrlen(a)       ((a) ? (ptrdiff_t) stbds_header(a)->length : 0)
-#define stbds_arrlenu(a)      ((a) ?             stbds_header(a)->length : 0)
-#define stbds_arrput(a,v)     (stbds_arrmaybegrow(a,1), (a)[stbds_header(a)->length++] = (v))
-#define stbds_arrpush         stbds_arrput  // synonym
-#define stbds_arrpop(a)       (stbds_header(a)->length--, (a)[stbds_header(a)->length])
-#define stbds_arraddn(a,n)    ((void)(stbds_arraddnoff(a, n)))    // deprecated, use one of the following instead:
-#define stbds_arraddnptr(a,n) (stbds_arrmaybegrow(a,n), stbds_header(a)->length += (n), &(a)[stbds_header(a)->length-(n)])
-#define stbds_arraddnoff(a,n) (stbds_arrmaybegrow(a,n), stbds_header(a)->length += (n), stbds_header(a)->length-(n))
-#define stbds_arrlast(a)      ((a)[stbds_header(a)->length-1])
-#define stbds_arrfree(a)      ((void) ((a) ? STBDS_FREE(NULL,stbds_header(a)) : (void)0), (a)=NULL)
-#define stbds_arrdel(a,i)     stbds_arrdeln(a,i,1)
-#define stbds_arrdeln(a,i,n)  (memmove(&(a)[i], &(a)[(i)+(n)], sizeof *(a) * (stbds_header(a)->length-(n)-(i))), stbds_header(a)->length -= (n))
-#define stbds_arrdelswap(a,i) ((a)[i] = stbds_arrlast(a), stbds_header(a)->length -= 1)
-#define stbds_arrinsn(a,i,n)  (stbds_arraddn((a),(n)), memmove(&(a)[(i)+(n)], &(a)[i], sizeof *(a) * (stbds_header(a)->length-(n)-(i))))
-#define stbds_arrins(a,i,v)   (stbds_arrinsn((a),(i),1), (a)[i]=(v))
+#define stbds_arrsetcap(a,n)   (stbds_arrgrow(a,0,n))
+#define stbds_arrsetlen(a,n)   ((stbds_arrcap(a) < (size_t) (n) ? stbds_arrsetcap((a),(size_t)(n)),0 : 0), (a) ? stbds_header(a)->length = (size_t) (n) : 0)
+#define stbds_arrcap(a)        ((a) ? stbds_header(a)->capacity : 0)
+#define stbds_arrlen(a)        ((a) ? (ptrdiff_t) stbds_header(a)->length : 0)
+#define stbds_arrlenu(a)       ((a) ?             stbds_header(a)->length : 0)
+#define stbds_arrput(a,v)      (stbds_arrmaybegrow(a,1), (a)[stbds_header(a)->length++] = (v))
+#define stbds_arrpush          stbds_arrput  // synonym
+#define stbds_arrpop(a)        (stbds_header(a)->length--, (a)[stbds_header(a)->length])
+#define stbds_arraddn(a,n)     ((void)(stbds_arraddnindex(a, n)))    // deprecated, use one of the following instead:
+#define stbds_arraddnptr(a,n)  (stbds_arrmaybegrow(a,n), stbds_header(a)->length += (n), &(a)[stbds_header(a)->length-(n)])
+#define stbds_arraddnindex(a,n)(stbds_arrmaybegrow(a,n), stbds_header(a)->length += (n), stbds_header(a)->length-(n))
+#define stbds_arrlast(a)       ((a)[stbds_header(a)->length-1])
+#define stbds_arrfree(a)       ((void) ((a) ? STBDS_FREE(NULL,stbds_header(a)) : (void)0), (a)=NULL)
+#define stbds_arrdel(a,i)      stbds_arrdeln(a,i,1)
+#define stbds_arrdeln(a,i,n)   (memmove(&(a)[i], &(a)[(i)+(n)], sizeof *(a) * (stbds_header(a)->length-(n)-(i))), stbds_header(a)->length -= (n))
+#define stbds_arrdelswap(a,i)  ((a)[i] = stbds_arrlast(a), stbds_header(a)->length -= 1)
+#define stbds_arrinsn(a,i,n)   (stbds_arraddn((a),(n)), memmove(&(a)[(i)+(n)], &(a)[i], sizeof *(a) * (stbds_header(a)->length-(n)-(i))))
+#define stbds_arrins(a,i,v)    (stbds_arrinsn((a),(i),1), (a)[i]=(v))
 
 #define stbds_arrmaybegrow(a,n)  ((!(a) || stbds_header(a)->length + (n) > stbds_header(a)->capacity) \
                                   ? (stbds_arrgrow(a,n,0),0) : 0)

From 6294c6cfa2144d52e78b1f545070bbb5a40cf504 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 15:29:17 -0700
Subject: [PATCH 002/170] Move stb.h to deprecated.

It was never designed to be used by anyone but Sean and has
numerous problems; new code should definitely not be using
this.
---
 stb.h => deprecated/stb.h | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename stb.h => deprecated/stb.h (100%)

diff --git a/stb.h b/deprecated/stb.h
similarity index 100%
rename from stb.h
rename to deprecated/stb.h

From 696cb038a35f0196a6f28d7cdd06749a9536d38b Mon Sep 17 00:00:00 2001
From: Michael Aganier <michaelaganier@gmail.com>
Date: Mon, 14 Jun 2021 14:38:26 -0400
Subject: [PATCH 003/170] stb_sprintf: add attribute format to variadic
 functions

This allows for compiler verification of the format string
just like printf.
---
 stb_sprintf.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index 0635360..a009c5d 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -7,6 +7,7 @@
 //
 // Contributors:
 //    Fabian "ryg" Giesen (reformatting)
+//    github:aganm (attribute format)
 //
 // Contributors (bugfixes):
 //    github:d26435
@@ -176,6 +177,16 @@ PERFORMANCE vs MSVC 2008 32-/64-bit (GCC is even slower than MSVC):
 #endif
 #endif
 
+#if defined(__has_attribute)
+ #if __has_attribute(format)
+   #define STBSP__ATTRIBUTE_FORMAT(fmt,va) __attribute__((format(printf,fmt,va)))
+ #endif
+#endif
+
+#ifndef STBSP__ATTRIBUTE_FORMAT
+#define STBSP__ATTRIBUTE_FORMAT(fmt,va)
+#endif
+
 #include <stdarg.h> // for va_list()
 #include <stddef.h> // size_t, ptrdiff_t
 
@@ -190,8 +201,8 @@ typedef char *STBSP_SPRINTFCB(const char *buf, void *user, int len);
 
 STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va);
 STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsnprintf)(char *buf, int count, char const *fmt, va_list va);
-STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...);
-STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...);
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(2,3);
+STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(3,4);
 
 STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va);
 STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char comma, char period);

From 98623b957763afb62fbe7b533762de7b3b23bbb2 Mon Sep 17 00:00:00 2001
From: Alan Hickman <f.alan.hickman@gmail.com>
Date: Sat, 24 Apr 2021 13:08:03 -0700
Subject: [PATCH 004/170] stb_dxt: Initialize tables at compile time

Also fix a "potentially uninitialized variable" warning.

This is a modified version of Alan's original PR that keeps the
table generator in the file (in case there's interest) and also
replaces the expand[] tables with math, since it's trivial.

Fixes issue #1117.
---
 stb_dxt.h | 267 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 191 insertions(+), 76 deletions(-)

diff --git a/stb_dxt.h b/stb_dxt.h
index 04666de..b37e543 100644
--- a/stb_dxt.h
+++ b/stb_dxt.h
@@ -29,6 +29,7 @@
 //   Kevin Schmidt (#defines for "freestanding" compilation)
 //   github:ppiastucki (BC4 support)
 //   Ignacio Castano - improve DXT endpoint quantization
+//   Alan Hickman - static table initialization
 //
 // LICENSE
 //
@@ -81,14 +82,10 @@ STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *sr
 
 #include <stdlib.h>
 
-#if !defined(STBD_ABS) || !defined(STBI_FABS)
+#if !defined(STBI_FABS)
 #include <math.h>
 #endif
 
-#ifndef STBD_ABS
-#define STBD_ABS(i)           abs(i)
-#endif
-
 #ifndef STBD_FABS
 #define STBD_FABS(x)          fabs(x)
 #endif
@@ -98,12 +95,112 @@ STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *sr
 #define STBD_MEMSET           memset
 #endif
 
-static unsigned char stb__Expand5[32];
-static unsigned char stb__Expand6[64];
-static unsigned char stb__OMatch5[256][2];
-static unsigned char stb__OMatch6[256][2];
-static unsigned char stb__QuantRBTab[256+16];
-static unsigned char stb__QuantGTab[256+16];
+static const unsigned char stb__QuantRBTab[256 + 16] = {
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,   8,   8,
+     8,   8,   8,   8,   8,  16,  16,  16,  16,  16,  16,  16,  16,  24,  24,  24,
+    24,  24,  24,  24,  24,  33,  33,  33,  33,  33,  33,  33,  33,  33,  41,  41,
+    41,  41,  41,  41,  41,  41,  49,  49,  49,  49,  49,  49,  49,  49,  57,  57,
+    57,  57,  57,  57,  57,  57,  66,  66,  66,  66,  66,  66,  66,  66,  74,  74,
+    74,  74,  74,  74,  74,  74,  74,  82,  82,  82,  82,  82,  82,  82,  82,  90,
+    90,  90,  90,  90,  90,  90,  90,  99,  99,  99,  99,  99,  99,  99,  99, 107,
+   107, 107, 107, 107, 107, 107, 107, 107, 115, 115, 115, 115, 115, 115, 115, 115,
+   123, 123, 123, 123, 123, 123, 123, 123, 132, 132, 132, 132, 132, 132, 132, 132,
+   140, 140, 140, 140, 140, 140, 140, 140, 148, 148, 148, 148, 148, 148, 148, 148,
+   148, 156, 156, 156, 156, 156, 156, 156, 156, 165, 165, 165, 165, 165, 165, 165,
+   165, 173, 173, 173, 173, 173, 173, 173, 173, 181, 181, 181, 181, 181, 181, 181,
+   181, 181, 189, 189, 189, 189, 189, 189, 189, 189, 198, 198, 198, 198, 198, 198,
+   198, 198, 206, 206, 206, 206, 206, 206, 206, 206, 214, 214, 214, 214, 214, 214,
+   214, 214, 222, 222, 222, 222, 222, 222, 222, 222, 222, 231, 231, 231, 231, 231,
+   231, 231, 231, 239, 239, 239, 239, 239, 239, 239, 239, 247, 247, 247, 247, 247,
+   247, 247, 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+};
+static const unsigned char stb__QuantGTab[256 + 16] = {
+     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   4,   4,   4,   4,   8,
+     8,   8,   8,  12,  12,  12,  12,  16,  16,  16,  16,  20,  20,  20,  20,  24,
+    24,  24,  24,  28,  28,  28,  28,  32,  32,  32,  32,  36,  36,  36,  36,  40,
+    40,  40,  40,  44,  44,  44,  44,  48,  48,  48,  48,  52,  52,  52,  52,  56,
+    56,  56,  56,  60,  60,  60,  60,  65,  65,  65,  65,  69,  69,  69,  69,  73,
+    73,  73,  73,  77,  77,  77,  77,  81,  81,  81,  81,  85,  85,  85,  85,  85,
+    89,  89,  89,  89,  93,  93,  93,  93,  97,  97,  97,  97, 101, 101, 101, 101,
+   105, 105, 105, 105, 109, 109, 109, 109, 113, 113, 113, 113, 117, 117, 117, 117,
+   121, 121, 121, 121, 125, 125, 125, 125, 130, 130, 130, 130, 134, 134, 134, 134,
+   138, 138, 138, 138, 142, 142, 142, 142, 146, 146, 146, 146, 150, 150, 150, 150,
+   154, 154, 154, 154, 158, 158, 158, 158, 162, 162, 162, 162, 166, 166, 166, 166,
+   170, 170, 170, 170, 170, 174, 174, 174, 174, 178, 178, 178, 178, 182, 182, 182,
+   182, 186, 186, 186, 186, 190, 190, 190, 190, 195, 195, 195, 195, 199, 199, 199,
+   199, 203, 203, 203, 203, 207, 207, 207, 207, 211, 211, 211, 211, 215, 215, 215,
+   215, 219, 219, 219, 219, 223, 223, 223, 223, 227, 227, 227, 227, 231, 231, 231,
+   231, 235, 235, 235, 235, 239, 239, 239, 239, 243, 243, 243, 243, 247, 247, 247,
+   247, 251, 251, 251, 251, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+};
+static const unsigned char stb__OMatch5[256][2] = {
+   {  0,  0 }, {  0,  0 }, {  0,  1 }, {  0,  1 }, {  1,  0 }, {  1,  0 }, {  1,  0 }, {  1,  1 },
+   {  1,  1 }, {  2,  0 }, {  2,  0 }, {  0,  4 }, {  2,  1 }, {  2,  1 }, {  2,  1 }, {  3,  0 },
+   {  3,  0 }, {  3,  0 }, {  3,  1 }, {  1,  5 }, {  3,  2 }, {  3,  2 }, {  4,  0 }, {  4,  0 },
+   {  4,  1 }, {  4,  1 }, {  4,  2 }, {  4,  2 }, {  4,  2 }, {  3,  5 }, {  5,  1 }, {  5,  1 },
+   {  5,  2 }, {  4,  4 }, {  5,  3 }, {  5,  3 }, {  5,  3 }, {  6,  2 }, {  6,  2 }, {  6,  2 },
+   {  6,  3 }, {  5,  5 }, {  6,  4 }, {  6,  4 }, {  4,  8 }, {  7,  3 }, {  7,  3 }, {  7,  3 },
+   {  7,  4 }, {  7,  4 }, {  7,  4 }, {  7,  5 }, {  5,  9 }, {  7,  6 }, {  7,  6 }, {  8,  4 },
+   {  8,  4 }, {  8,  5 }, {  8,  5 }, {  8,  6 }, {  8,  6 }, {  8,  6 }, {  7,  9 }, {  9,  5 },
+   {  9,  5 }, {  9,  6 }, {  8,  8 }, {  9,  7 }, {  9,  7 }, {  9,  7 }, { 10,  6 }, { 10,  6 },
+   { 10,  6 }, { 10,  7 }, {  9,  9 }, { 10,  8 }, { 10,  8 }, {  8, 12 }, { 11,  7 }, { 11,  7 },
+   { 11,  7 }, { 11,  8 }, { 11,  8 }, { 11,  8 }, { 11,  9 }, {  9, 13 }, { 11, 10 }, { 11, 10 },
+   { 12,  8 }, { 12,  8 }, { 12,  9 }, { 12,  9 }, { 12, 10 }, { 12, 10 }, { 12, 10 }, { 11, 13 },
+   { 13,  9 }, { 13,  9 }, { 13, 10 }, { 12, 12 }, { 13, 11 }, { 13, 11 }, { 13, 11 }, { 14, 10 },
+   { 14, 10 }, { 14, 10 }, { 14, 11 }, { 13, 13 }, { 14, 12 }, { 14, 12 }, { 12, 16 }, { 15, 11 },
+   { 15, 11 }, { 15, 11 }, { 15, 12 }, { 15, 12 }, { 15, 12 }, { 15, 13 }, { 13, 17 }, { 15, 14 },
+   { 15, 14 }, { 16, 12 }, { 16, 12 }, { 16, 13 }, { 16, 13 }, { 16, 14 }, { 16, 14 }, { 16, 14 },
+   { 15, 17 }, { 17, 13 }, { 17, 13 }, { 17, 14 }, { 16, 16 }, { 17, 15 }, { 17, 15 }, { 17, 15 },
+   { 18, 14 }, { 18, 14 }, { 18, 14 }, { 18, 15 }, { 17, 17 }, { 18, 16 }, { 18, 16 }, { 16, 20 },
+   { 19, 15 }, { 19, 15 }, { 19, 15 }, { 19, 16 }, { 19, 16 }, { 19, 16 }, { 19, 17 }, { 17, 21 },
+   { 19, 18 }, { 19, 18 }, { 20, 16 }, { 20, 16 }, { 20, 17 }, { 20, 17 }, { 20, 18 }, { 20, 18 },
+   { 20, 18 }, { 19, 21 }, { 21, 17 }, { 21, 17 }, { 21, 18 }, { 20, 20 }, { 21, 19 }, { 21, 19 },
+   { 21, 19 }, { 22, 18 }, { 22, 18 }, { 22, 18 }, { 22, 19 }, { 21, 21 }, { 22, 20 }, { 22, 20 },
+   { 20, 24 }, { 23, 19 }, { 23, 19 }, { 23, 19 }, { 23, 20 }, { 23, 20 }, { 23, 20 }, { 23, 21 },
+   { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 24, 20 }, { 24, 21 }, { 24, 21 }, { 24, 22 },
+   { 24, 22 }, { 24, 22 }, { 23, 25 }, { 25, 21 }, { 25, 21 }, { 25, 22 }, { 24, 24 }, { 25, 23 },
+   { 25, 23 }, { 25, 23 }, { 26, 22 }, { 26, 22 }, { 26, 22 }, { 26, 23 }, { 25, 25 }, { 26, 24 },
+   { 26, 24 }, { 24, 28 }, { 27, 23 }, { 27, 23 }, { 27, 23 }, { 27, 24 }, { 27, 24 }, { 27, 24 },
+   { 27, 25 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 28, 24 }, { 28, 25 }, { 28, 25 },
+   { 28, 26 }, { 28, 26 }, { 28, 26 }, { 27, 29 }, { 29, 25 }, { 29, 25 }, { 29, 26 }, { 28, 28 },
+   { 29, 27 }, { 29, 27 }, { 29, 27 }, { 30, 26 }, { 30, 26 }, { 30, 26 }, { 30, 27 }, { 29, 29 },
+   { 30, 28 }, { 30, 28 }, { 30, 28 }, { 31, 27 }, { 31, 27 }, { 31, 27 }, { 31, 28 }, { 31, 28 },
+   { 31, 28 }, { 31, 29 }, { 31, 29 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 },
+};
+static const unsigned char stb__OMatch6[256][2] = {
+   {  0,  0 }, {  0,  1 }, {  1,  0 }, {  1,  0 }, {  1,  1 }, {  2,  0 }, {  2,  1 }, {  3,  0 },
+   {  3,  0 }, {  3,  1 }, {  4,  0 }, {  4,  0 }, {  4,  1 }, {  5,  0 }, {  5,  1 }, {  6,  0 },
+   {  6,  0 }, {  6,  1 }, {  7,  0 }, {  7,  0 }, {  7,  1 }, {  8,  0 }, {  8,  1 }, {  8,  1 },
+   {  8,  2 }, {  9,  1 }, {  9,  2 }, {  9,  2 }, {  9,  3 }, { 10,  2 }, { 10,  3 }, { 10,  3 },
+   { 10,  4 }, { 11,  3 }, { 11,  4 }, { 11,  4 }, { 11,  5 }, { 12,  4 }, { 12,  5 }, { 12,  5 },
+   { 12,  6 }, { 13,  5 }, { 13,  6 }, {  8, 16 }, { 13,  7 }, { 14,  6 }, { 14,  7 }, {  9, 17 },
+   { 14,  8 }, { 15,  7 }, { 15,  8 }, { 11, 16 }, { 15,  9 }, { 15, 10 }, { 16,  8 }, { 16,  9 },
+   { 16, 10 }, { 15, 13 }, { 17,  9 }, { 17, 10 }, { 17, 11 }, { 15, 16 }, { 18, 10 }, { 18, 11 },
+   { 18, 12 }, { 16, 16 }, { 19, 11 }, { 19, 12 }, { 19, 13 }, { 17, 17 }, { 20, 12 }, { 20, 13 },
+   { 20, 14 }, { 19, 16 }, { 21, 13 }, { 21, 14 }, { 21, 15 }, { 20, 17 }, { 22, 14 }, { 22, 15 },
+   { 25, 10 }, { 22, 16 }, { 23, 15 }, { 23, 16 }, { 26, 11 }, { 23, 17 }, { 24, 16 }, { 24, 17 },
+   { 27, 12 }, { 24, 18 }, { 25, 17 }, { 25, 18 }, { 28, 13 }, { 25, 19 }, { 26, 18 }, { 26, 19 },
+   { 29, 14 }, { 26, 20 }, { 27, 19 }, { 27, 20 }, { 30, 15 }, { 27, 21 }, { 28, 20 }, { 28, 21 },
+   { 28, 21 }, { 28, 22 }, { 29, 21 }, { 29, 22 }, { 24, 32 }, { 29, 23 }, { 30, 22 }, { 30, 23 },
+   { 25, 33 }, { 30, 24 }, { 31, 23 }, { 31, 24 }, { 27, 32 }, { 31, 25 }, { 31, 26 }, { 32, 24 },
+   { 32, 25 }, { 32, 26 }, { 31, 29 }, { 33, 25 }, { 33, 26 }, { 33, 27 }, { 31, 32 }, { 34, 26 },
+   { 34, 27 }, { 34, 28 }, { 32, 32 }, { 35, 27 }, { 35, 28 }, { 35, 29 }, { 33, 33 }, { 36, 28 },
+   { 36, 29 }, { 36, 30 }, { 35, 32 }, { 37, 29 }, { 37, 30 }, { 37, 31 }, { 36, 33 }, { 38, 30 },
+   { 38, 31 }, { 41, 26 }, { 38, 32 }, { 39, 31 }, { 39, 32 }, { 42, 27 }, { 39, 33 }, { 40, 32 },
+   { 40, 33 }, { 43, 28 }, { 40, 34 }, { 41, 33 }, { 41, 34 }, { 44, 29 }, { 41, 35 }, { 42, 34 },
+   { 42, 35 }, { 45, 30 }, { 42, 36 }, { 43, 35 }, { 43, 36 }, { 46, 31 }, { 43, 37 }, { 44, 36 },
+   { 44, 37 }, { 44, 37 }, { 44, 38 }, { 45, 37 }, { 45, 38 }, { 40, 48 }, { 45, 39 }, { 46, 38 },
+   { 46, 39 }, { 41, 49 }, { 46, 40 }, { 47, 39 }, { 47, 40 }, { 43, 48 }, { 47, 41 }, { 47, 42 },
+   { 48, 40 }, { 48, 41 }, { 48, 42 }, { 47, 45 }, { 49, 41 }, { 49, 42 }, { 49, 43 }, { 47, 48 },
+   { 50, 42 }, { 50, 43 }, { 50, 44 }, { 48, 48 }, { 51, 43 }, { 51, 44 }, { 51, 45 }, { 49, 49 },
+   { 52, 44 }, { 52, 45 }, { 52, 46 }, { 51, 48 }, { 53, 45 }, { 53, 46 }, { 53, 47 }, { 52, 49 },
+   { 54, 46 }, { 54, 47 }, { 57, 42 }, { 54, 48 }, { 55, 47 }, { 55, 48 }, { 58, 43 }, { 55, 49 },
+   { 56, 48 }, { 56, 49 }, { 59, 44 }, { 56, 50 }, { 57, 49 }, { 57, 50 }, { 60, 45 }, { 57, 51 },
+   { 58, 50 }, { 58, 51 }, { 61, 46 }, { 58, 52 }, { 59, 51 }, { 59, 52 }, { 62, 47 }, { 59, 53 },
+   { 60, 52 }, { 60, 53 }, { 60, 53 }, { 60, 54 }, { 61, 53 }, { 61, 54 }, { 61, 54 }, { 61, 55 },
+   { 62, 54 }, { 62, 55 }, { 62, 55 }, { 62, 56 }, { 63, 55 }, { 63, 56 }, { 63, 56 }, { 63, 57 },
+   { 63, 58 }, { 63, 59 }, { 63, 59 }, { 63, 60 }, { 63, 61 }, { 63, 62 }, { 63, 62 }, { 63, 63 },
+};
 
 static int stb__Mul8Bit(int a, int b)
 {
@@ -117,9 +214,10 @@ static void stb__From16Bit(unsigned char *out, unsigned short v)
    int gv = (v & 0x07e0) >>  5;
    int bv = (v & 0x001f) >>  0;
 
-   out[0] = stb__Expand5[rv];
-   out[1] = stb__Expand6[gv];
-   out[2] = stb__Expand5[bv];
+   // expand to 8 bits via bit replication
+   out[0] = (rv * 33) >> 2;
+   out[1] = (gv * 65) >> 4;
+   out[2] = (bv * 33) >> 2;
    out[3] = 0;
 }
 
@@ -151,35 +249,6 @@ static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char
 
 /****************************************************************************/
 
-// compute table to reproduce constant colors as accurately as possible
-static void stb__PrepareOptTable(unsigned char *Table,const unsigned char *expand,int size)
-{
-   int i,mn,mx;
-   for (i=0;i<256;i++) {
-      int bestErr = 256;
-      for (mn=0;mn<size;mn++) {
-         for (mx=0;mx<size;mx++) {
-            int mine = expand[mn];
-            int maxe = expand[mx];
-            int err = STBD_ABS(stb__Lerp13(maxe, mine) - i);
-
-            // DX10 spec says that interpolation must be within 3% of "correct" result,
-            // add this as error term. (normally we'd expect a random distribution of
-            // +-1.5% error, but nowhere in the spec does it say that the error has to be
-            // unbiased - better safe than sorry).
-            err += STBD_ABS(maxe - mine) * 3 / 100;
-
-            if(err < bestErr)
-            {
-               Table[i*2+0] = (unsigned char)mx;
-               Table[i*2+1] = (unsigned char)mn;
-               bestErr = err;
-            }
-         }
-      }
-   }
-}
-
 static void stb__EvalColors(unsigned char *color,unsigned short c0,unsigned short c1)
 {
    stb__From16Bit(color+ 0, c0);
@@ -198,7 +267,7 @@ static void stb__DitherBlock(unsigned char *dest, unsigned char *block)
   // process channels separately
   for (ch=0; ch<3; ++ch) {
       unsigned char *bp = block+ch, *dp = dest+ch;
-      unsigned char *quant = (ch == 1) ? stb__QuantGTab+8 : stb__QuantRBTab+8;
+      const unsigned char *quant = (ch == 1) ? stb__QuantGTab+8 : stb__QuantRBTab+8;
       STBD_MEMSET(err, 0, sizeof(err));
       for(y=0; y<4; ++y) {
          dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
@@ -316,7 +385,7 @@ static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *c
 // The color optimization function. (Clever code, part 1)
 static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16)
 {
-  int mind = 0x7fffffff,maxd = -0x7fffffff;
+  int mind,maxd;
   unsigned char *minp, *maxp;
   double magn;
   int v_r,v_g,v_b;
@@ -398,8 +467,10 @@ static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax1
       v_b = (int) (vfb * magn);
    }
 
+   minp = maxp = block;
+   mind = maxd = block[0]*v_r + block[1]*v_g + block[2]*v_b;
    // Pick colors at extreme points
-   for(i=0;i<16;i++)
+   for(i=1;i<16;i++)
    {
       int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b;
 
@@ -418,12 +489,12 @@ static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax1
    *pmin16 = stb__As16Bit(minp[0],minp[1],minp[2]);
 }
 
-static const float midpoints5[32] = {
+static const float stb__midpoints5[32] = {
    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
 };
 
-static const float midpoints6[64] = {
+static const float stb__midpoints6[64] = {
    0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f,
    0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f,
    0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f,
@@ -435,7 +506,7 @@ static unsigned short stb__Quantize5(float x)
    unsigned short q;
    x = x < 0 ? 0 : x > 1 ? 1 : x;  // saturate
    q = (unsigned short)(x * 31);
-   q += (x > midpoints5[q]);
+   q += (x > stb__midpoints5[q]);
    return q;
 }
 
@@ -444,7 +515,7 @@ static unsigned short stb__Quantize6(float x)
    unsigned short q;
    x = x < 0 ? 0 : x > 1 ? 1 : x;  // saturate
    q = (unsigned short)(x * 63);
-   q += (x > midpoints6[q]);
+   q += (x > stb__midpoints6[q]);
    return q;
 }
 
@@ -654,35 +725,9 @@ static void stb__CompressAlphaBlock(unsigned char *dest,unsigned char *src, int
    }
 }
 
-static void stb__InitDXT()
-{
-   int i;
-   for(i=0;i<32;i++)
-      stb__Expand5[i] = (unsigned char)((i<<3)|(i>>2));
-
-   for(i=0;i<64;i++)
-      stb__Expand6[i] = (unsigned char)((i<<2)|(i>>4));
-
-   for(i=0;i<256+16;i++)
-   {
-      int v = i-8 < 0 ? 0 : i-8 > 255 ? 255 : i-8;
-      stb__QuantRBTab[i] = stb__Expand5[stb__Mul8Bit(v,31)];
-      stb__QuantGTab[i] = stb__Expand6[stb__Mul8Bit(v,63)];
-   }
-
-   stb__PrepareOptTable(&stb__OMatch5[0][0],stb__Expand5,32);
-   stb__PrepareOptTable(&stb__OMatch6[0][0],stb__Expand6,64);
-}
-
 void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode)
 {
    unsigned char data[16][4];
-   static int init=1;
-   if (init) {
-      stb__InitDXT();
-      init=0;
-   }
-
    if (alpha) {
       int i;
       stb__CompressAlphaBlock(dest,(unsigned char*) src+3, 4);
@@ -710,6 +755,76 @@ void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src)
 }
 #endif // STB_DXT_IMPLEMENTATION
 
+// Compile with STB_DXT_IMPLEMENTATION and STB_DXT_GENERATE_TABLES
+// defined to generate the tables above.
+#ifdef STB_DXT_GENERATE_TABLES
+#include <stdio.h>
+
+int main()
+{
+   int i, j;
+   const char *quant_names[] = { "stb__QuantRBTab", "stb__QuantGTab" };
+   const char *omatch_names[] = { "stb__OMatch5", "stb__OMatch6" };
+   int dequant_mults[2] = { 33*4, 65 }; // .4 fixed-point dequant multipliers
+
+   // quant tables (for dither)
+   for (i=0; i < 2; ++i) {
+      int quant_mult = i ? 63 : 31;
+      printf("static const unsigned char %s[256 + 16] = {\n", quant_names[i]);
+      for (int j = 0; j < 256 + 16; ++j) {
+         int v = j - 8;
+         int q, dq;
+
+         v = (v < 0) ? 0 : (v > 255) ? 255 : v; // clamp
+         q = stb__Mul8Bit(v, quant_mult); // quantize
+         dq = (q * dequant_mults[i]) >> 4; // dequantize
+
+         if ((j % 16) == 0) printf("  "); // 2 spaces, third is done below
+         printf(" %3d,", dq);
+         if ((j % 16) == 15) printf("\n");
+      }
+      printf("};\n");
+   }
+
+   // optimal endpoint tables
+   for (i = 0; i < 2; ++i) {
+      int dequant = dequant_mults[i];
+      int size = i ? 64 : 32;
+      printf("static const unsigned char %s[256][2] = {\n", omatch_names[i]);
+      for (int j = 0; j < 256; ++j) {
+         int mn, mx;
+         int best_mn = 0, best_mx = 0;
+         int best_err = 256;
+         for (mn=0;mn<size;mn++) {
+            for (mx=0;mx<size;mx++) {
+               int mine = (mn * dequant) >> 4;
+               int maxe = (mx * dequant) >> 4;
+               int err = abs(stb__Lerp13(maxe, mine) - j);
+
+               // DX10 spec says that interpolation must be within 3% of "correct" result,
+               // add this as error term. Normally we'd expect a random distribution of
+               // +-1.5% error, but nowhere in the spec does it say that the error has to be
+               // unbiased - better safe than sorry.
+               err += abs(maxe - mine) * 3 / 100;
+
+               if(err < best_err) {
+                  best_mn = mn;
+                  best_mx = mx;
+                  best_err = err;
+               }
+            }
+         }
+         if ((j % 8) == 0) printf("  "); // 2 spaces, third is done below
+         printf(" { %2d, %2d },", best_mx, best_mn);
+         if ((j % 8) == 7) printf("\n");
+      }
+      printf("};\n");
+   }
+
+   return 0;
+}
+#endif
+
 /*
 ------------------------------------------------------------------------------
 This software is available under 2 licenses -- choose whichever you prefer.

From ae3ed90b0c9548a69e2da4e3fa0f5e77fd00c4d0 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 16:24:09 -0700
Subject: [PATCH 005/170] stb_dxt: Better error calc for single-color table

Don't truncate error as aggressively; easily done, but wanted
to keep it separate from the previous change.
---
 stb_dxt.h | 132 +++++++++++++++++++++++++++---------------------------
 1 file changed, 66 insertions(+), 66 deletions(-)

diff --git a/stb_dxt.h b/stb_dxt.h
index b37e543..25d3fba 100644
--- a/stb_dxt.h
+++ b/stb_dxt.h
@@ -134,72 +134,72 @@ static const unsigned char stb__QuantGTab[256 + 16] = {
    247, 251, 251, 251, 251, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
 };
 static const unsigned char stb__OMatch5[256][2] = {
-   {  0,  0 }, {  0,  0 }, {  0,  1 }, {  0,  1 }, {  1,  0 }, {  1,  0 }, {  1,  0 }, {  1,  1 },
-   {  1,  1 }, {  2,  0 }, {  2,  0 }, {  0,  4 }, {  2,  1 }, {  2,  1 }, {  2,  1 }, {  3,  0 },
-   {  3,  0 }, {  3,  0 }, {  3,  1 }, {  1,  5 }, {  3,  2 }, {  3,  2 }, {  4,  0 }, {  4,  0 },
-   {  4,  1 }, {  4,  1 }, {  4,  2 }, {  4,  2 }, {  4,  2 }, {  3,  5 }, {  5,  1 }, {  5,  1 },
-   {  5,  2 }, {  4,  4 }, {  5,  3 }, {  5,  3 }, {  5,  3 }, {  6,  2 }, {  6,  2 }, {  6,  2 },
-   {  6,  3 }, {  5,  5 }, {  6,  4 }, {  6,  4 }, {  4,  8 }, {  7,  3 }, {  7,  3 }, {  7,  3 },
-   {  7,  4 }, {  7,  4 }, {  7,  4 }, {  7,  5 }, {  5,  9 }, {  7,  6 }, {  7,  6 }, {  8,  4 },
-   {  8,  4 }, {  8,  5 }, {  8,  5 }, {  8,  6 }, {  8,  6 }, {  8,  6 }, {  7,  9 }, {  9,  5 },
-   {  9,  5 }, {  9,  6 }, {  8,  8 }, {  9,  7 }, {  9,  7 }, {  9,  7 }, { 10,  6 }, { 10,  6 },
-   { 10,  6 }, { 10,  7 }, {  9,  9 }, { 10,  8 }, { 10,  8 }, {  8, 12 }, { 11,  7 }, { 11,  7 },
-   { 11,  7 }, { 11,  8 }, { 11,  8 }, { 11,  8 }, { 11,  9 }, {  9, 13 }, { 11, 10 }, { 11, 10 },
-   { 12,  8 }, { 12,  8 }, { 12,  9 }, { 12,  9 }, { 12, 10 }, { 12, 10 }, { 12, 10 }, { 11, 13 },
-   { 13,  9 }, { 13,  9 }, { 13, 10 }, { 12, 12 }, { 13, 11 }, { 13, 11 }, { 13, 11 }, { 14, 10 },
-   { 14, 10 }, { 14, 10 }, { 14, 11 }, { 13, 13 }, { 14, 12 }, { 14, 12 }, { 12, 16 }, { 15, 11 },
-   { 15, 11 }, { 15, 11 }, { 15, 12 }, { 15, 12 }, { 15, 12 }, { 15, 13 }, { 13, 17 }, { 15, 14 },
-   { 15, 14 }, { 16, 12 }, { 16, 12 }, { 16, 13 }, { 16, 13 }, { 16, 14 }, { 16, 14 }, { 16, 14 },
-   { 15, 17 }, { 17, 13 }, { 17, 13 }, { 17, 14 }, { 16, 16 }, { 17, 15 }, { 17, 15 }, { 17, 15 },
-   { 18, 14 }, { 18, 14 }, { 18, 14 }, { 18, 15 }, { 17, 17 }, { 18, 16 }, { 18, 16 }, { 16, 20 },
-   { 19, 15 }, { 19, 15 }, { 19, 15 }, { 19, 16 }, { 19, 16 }, { 19, 16 }, { 19, 17 }, { 17, 21 },
-   { 19, 18 }, { 19, 18 }, { 20, 16 }, { 20, 16 }, { 20, 17 }, { 20, 17 }, { 20, 18 }, { 20, 18 },
-   { 20, 18 }, { 19, 21 }, { 21, 17 }, { 21, 17 }, { 21, 18 }, { 20, 20 }, { 21, 19 }, { 21, 19 },
-   { 21, 19 }, { 22, 18 }, { 22, 18 }, { 22, 18 }, { 22, 19 }, { 21, 21 }, { 22, 20 }, { 22, 20 },
-   { 20, 24 }, { 23, 19 }, { 23, 19 }, { 23, 19 }, { 23, 20 }, { 23, 20 }, { 23, 20 }, { 23, 21 },
-   { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 24, 20 }, { 24, 21 }, { 24, 21 }, { 24, 22 },
-   { 24, 22 }, { 24, 22 }, { 23, 25 }, { 25, 21 }, { 25, 21 }, { 25, 22 }, { 24, 24 }, { 25, 23 },
-   { 25, 23 }, { 25, 23 }, { 26, 22 }, { 26, 22 }, { 26, 22 }, { 26, 23 }, { 25, 25 }, { 26, 24 },
-   { 26, 24 }, { 24, 28 }, { 27, 23 }, { 27, 23 }, { 27, 23 }, { 27, 24 }, { 27, 24 }, { 27, 24 },
-   { 27, 25 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 28, 24 }, { 28, 25 }, { 28, 25 },
-   { 28, 26 }, { 28, 26 }, { 28, 26 }, { 27, 29 }, { 29, 25 }, { 29, 25 }, { 29, 26 }, { 28, 28 },
-   { 29, 27 }, { 29, 27 }, { 29, 27 }, { 30, 26 }, { 30, 26 }, { 30, 26 }, { 30, 27 }, { 29, 29 },
-   { 30, 28 }, { 30, 28 }, { 30, 28 }, { 31, 27 }, { 31, 27 }, { 31, 27 }, { 31, 28 }, { 31, 28 },
-   { 31, 28 }, { 31, 29 }, { 31, 29 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 },
+   {  0,  0 }, {  0,  0 }, {  0,  0 }, {  0,  0 }, {  0,  0 }, {  1,  1 }, {  1,  1 }, {  1,  1 },
+   {  1,  1 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  2,  2 }, {  2,  2 }, {  2,  2 },
+   {  2,  2 }, {  2,  2 }, {  2,  2 }, {  2,  2 }, {  2,  2 }, {  3,  3 }, {  3,  3 }, {  3,  3 },
+   {  3,  3 }, {  3,  3 }, {  3,  3 }, {  3,  3 }, {  3,  3 }, {  4,  4 }, {  4,  4 }, {  4,  4 },
+   {  4,  4 }, {  4,  4 }, {  4,  4 }, {  4,  4 }, {  4,  4 }, {  4,  4 }, {  5,  5 }, {  5,  5 },
+   {  5,  5 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  6,  6 }, {  6,  6 },
+   {  6,  6 }, {  6,  6 }, {  6,  6 }, {  6,  6 }, {  6,  6 }, {  6,  6 }, {  7,  7 }, {  7,  7 },
+   {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  8,  8 }, {  8,  8 },
+   {  8,  8 }, {  8,  8 }, {  8,  8 }, {  8,  8 }, {  8,  8 }, {  8,  8 }, {  8,  8 }, {  9,  9 },
+   {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, { 10, 10 },
+   { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 11, 11 },
+   { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 12, 12 },
+   { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 },
+   { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 },
+   { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 },
+   { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 },
+   { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 },
+   { 16, 16 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 },
+   { 17, 17 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 },
+   { 18, 18 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 },
+   { 19, 19 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 },
+   { 20, 20 }, { 20, 20 }, { 21, 21 }, { 21, 21 }, { 21, 21 }, { 21, 21 }, { 21, 21 }, { 21, 21 },
+   { 21, 21 }, { 21, 21 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 22 },
+   { 22, 22 }, { 22, 22 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 23 },
+   { 23, 23 }, { 23, 23 }, { 24, 24 }, { 24, 24 }, { 24, 24 }, { 24, 24 }, { 24, 24 }, { 24, 24 },
+   { 24, 24 }, { 24, 24 }, { 24, 24 }, { 25, 25 }, { 25, 25 }, { 25, 25 }, { 25, 25 }, { 25, 25 },
+   { 25, 25 }, { 25, 25 }, { 25, 25 }, { 26, 26 }, { 26, 26 }, { 26, 26 }, { 26, 26 }, { 26, 26 },
+   { 26, 26 }, { 26, 26 }, { 26, 26 }, { 27, 27 }, { 27, 27 }, { 27, 27 }, { 27, 27 }, { 27, 27 },
+   { 27, 27 }, { 27, 27 }, { 27, 27 }, { 28, 28 }, { 28, 28 }, { 28, 28 }, { 28, 28 }, { 28, 28 },
+   { 28, 28 }, { 28, 28 }, { 28, 28 }, { 28, 28 }, { 29, 29 }, { 29, 29 }, { 29, 29 }, { 29, 29 },
+   { 29, 29 }, { 29, 29 }, { 29, 29 }, { 29, 29 }, { 30, 30 }, { 30, 30 }, { 30, 30 }, { 30, 30 },
+   { 30, 30 }, { 30, 30 }, { 30, 30 }, { 30, 30 }, { 31, 31 }, { 31, 31 }, { 31, 31 }, { 31, 31 },
 };
 static const unsigned char stb__OMatch6[256][2] = {
-   {  0,  0 }, {  0,  1 }, {  1,  0 }, {  1,  0 }, {  1,  1 }, {  2,  0 }, {  2,  1 }, {  3,  0 },
-   {  3,  0 }, {  3,  1 }, {  4,  0 }, {  4,  0 }, {  4,  1 }, {  5,  0 }, {  5,  1 }, {  6,  0 },
-   {  6,  0 }, {  6,  1 }, {  7,  0 }, {  7,  0 }, {  7,  1 }, {  8,  0 }, {  8,  1 }, {  8,  1 },
-   {  8,  2 }, {  9,  1 }, {  9,  2 }, {  9,  2 }, {  9,  3 }, { 10,  2 }, { 10,  3 }, { 10,  3 },
-   { 10,  4 }, { 11,  3 }, { 11,  4 }, { 11,  4 }, { 11,  5 }, { 12,  4 }, { 12,  5 }, { 12,  5 },
-   { 12,  6 }, { 13,  5 }, { 13,  6 }, {  8, 16 }, { 13,  7 }, { 14,  6 }, { 14,  7 }, {  9, 17 },
-   { 14,  8 }, { 15,  7 }, { 15,  8 }, { 11, 16 }, { 15,  9 }, { 15, 10 }, { 16,  8 }, { 16,  9 },
-   { 16, 10 }, { 15, 13 }, { 17,  9 }, { 17, 10 }, { 17, 11 }, { 15, 16 }, { 18, 10 }, { 18, 11 },
-   { 18, 12 }, { 16, 16 }, { 19, 11 }, { 19, 12 }, { 19, 13 }, { 17, 17 }, { 20, 12 }, { 20, 13 },
-   { 20, 14 }, { 19, 16 }, { 21, 13 }, { 21, 14 }, { 21, 15 }, { 20, 17 }, { 22, 14 }, { 22, 15 },
-   { 25, 10 }, { 22, 16 }, { 23, 15 }, { 23, 16 }, { 26, 11 }, { 23, 17 }, { 24, 16 }, { 24, 17 },
-   { 27, 12 }, { 24, 18 }, { 25, 17 }, { 25, 18 }, { 28, 13 }, { 25, 19 }, { 26, 18 }, { 26, 19 },
-   { 29, 14 }, { 26, 20 }, { 27, 19 }, { 27, 20 }, { 30, 15 }, { 27, 21 }, { 28, 20 }, { 28, 21 },
-   { 28, 21 }, { 28, 22 }, { 29, 21 }, { 29, 22 }, { 24, 32 }, { 29, 23 }, { 30, 22 }, { 30, 23 },
-   { 25, 33 }, { 30, 24 }, { 31, 23 }, { 31, 24 }, { 27, 32 }, { 31, 25 }, { 31, 26 }, { 32, 24 },
-   { 32, 25 }, { 32, 26 }, { 31, 29 }, { 33, 25 }, { 33, 26 }, { 33, 27 }, { 31, 32 }, { 34, 26 },
-   { 34, 27 }, { 34, 28 }, { 32, 32 }, { 35, 27 }, { 35, 28 }, { 35, 29 }, { 33, 33 }, { 36, 28 },
-   { 36, 29 }, { 36, 30 }, { 35, 32 }, { 37, 29 }, { 37, 30 }, { 37, 31 }, { 36, 33 }, { 38, 30 },
-   { 38, 31 }, { 41, 26 }, { 38, 32 }, { 39, 31 }, { 39, 32 }, { 42, 27 }, { 39, 33 }, { 40, 32 },
-   { 40, 33 }, { 43, 28 }, { 40, 34 }, { 41, 33 }, { 41, 34 }, { 44, 29 }, { 41, 35 }, { 42, 34 },
-   { 42, 35 }, { 45, 30 }, { 42, 36 }, { 43, 35 }, { 43, 36 }, { 46, 31 }, { 43, 37 }, { 44, 36 },
-   { 44, 37 }, { 44, 37 }, { 44, 38 }, { 45, 37 }, { 45, 38 }, { 40, 48 }, { 45, 39 }, { 46, 38 },
-   { 46, 39 }, { 41, 49 }, { 46, 40 }, { 47, 39 }, { 47, 40 }, { 43, 48 }, { 47, 41 }, { 47, 42 },
-   { 48, 40 }, { 48, 41 }, { 48, 42 }, { 47, 45 }, { 49, 41 }, { 49, 42 }, { 49, 43 }, { 47, 48 },
-   { 50, 42 }, { 50, 43 }, { 50, 44 }, { 48, 48 }, { 51, 43 }, { 51, 44 }, { 51, 45 }, { 49, 49 },
-   { 52, 44 }, { 52, 45 }, { 52, 46 }, { 51, 48 }, { 53, 45 }, { 53, 46 }, { 53, 47 }, { 52, 49 },
-   { 54, 46 }, { 54, 47 }, { 57, 42 }, { 54, 48 }, { 55, 47 }, { 55, 48 }, { 58, 43 }, { 55, 49 },
-   { 56, 48 }, { 56, 49 }, { 59, 44 }, { 56, 50 }, { 57, 49 }, { 57, 50 }, { 60, 45 }, { 57, 51 },
-   { 58, 50 }, { 58, 51 }, { 61, 46 }, { 58, 52 }, { 59, 51 }, { 59, 52 }, { 62, 47 }, { 59, 53 },
-   { 60, 52 }, { 60, 53 }, { 60, 53 }, { 60, 54 }, { 61, 53 }, { 61, 54 }, { 61, 54 }, { 61, 55 },
-   { 62, 54 }, { 62, 55 }, { 62, 55 }, { 62, 56 }, { 63, 55 }, { 63, 56 }, { 63, 56 }, { 63, 57 },
-   { 63, 58 }, { 63, 59 }, { 63, 59 }, { 63, 60 }, { 63, 61 }, { 63, 62 }, { 63, 62 }, { 63, 63 },
+   {  0,  0 }, {  0,  0 }, {  0,  0 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  2,  2 },
+   {  2,  2 }, {  2,  2 }, {  2,  2 }, {  3,  3 }, {  3,  3 }, {  3,  3 }, {  3,  3 }, {  4,  4 },
+   {  4,  4 }, {  4,  4 }, {  4,  4 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  6,  6 },
+   {  6,  6 }, {  6,  6 }, {  6,  6 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  8,  8 },
+   {  8,  8 }, {  8,  8 }, {  8,  8 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, { 10, 10 },
+   { 10, 10 }, { 10, 10 }, { 10, 10 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 12, 12 },
+   { 12, 12 }, { 12, 12 }, { 12, 12 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 14, 14 },
+   { 14, 14 }, { 14, 14 }, { 14, 14 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 16, 16 },
+   { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 },
+   { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 },
+   { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 21, 21 }, { 21, 21 }, { 21, 21 }, { 21, 21 },
+   { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 23 },
+   { 24, 24 }, { 24, 24 }, { 24, 24 }, { 24, 24 }, { 25, 25 }, { 25, 25 }, { 25, 25 }, { 25, 25 },
+   { 26, 26 }, { 26, 26 }, { 26, 26 }, { 26, 26 }, { 27, 27 }, { 27, 27 }, { 27, 27 }, { 27, 27 },
+   { 28, 28 }, { 28, 28 }, { 28, 28 }, { 28, 28 }, { 29, 29 }, { 29, 29 }, { 29, 29 }, { 29, 29 },
+   { 30, 30 }, { 30, 30 }, { 30, 30 }, { 30, 30 }, { 31, 31 }, { 31, 31 }, { 31, 31 }, { 31, 31 },
+   { 32, 32 }, { 32, 32 }, { 32, 32 }, { 32, 32 }, { 32, 32 }, { 33, 33 }, { 33, 33 }, { 33, 33 },
+   { 33, 33 }, { 34, 34 }, { 34, 34 }, { 34, 34 }, { 34, 34 }, { 35, 35 }, { 35, 35 }, { 35, 35 },
+   { 35, 35 }, { 36, 36 }, { 36, 36 }, { 36, 36 }, { 36, 36 }, { 37, 37 }, { 37, 37 }, { 37, 37 },
+   { 37, 37 }, { 38, 38 }, { 38, 38 }, { 38, 38 }, { 38, 38 }, { 39, 39 }, { 39, 39 }, { 39, 39 },
+   { 39, 39 }, { 40, 40 }, { 40, 40 }, { 40, 40 }, { 40, 40 }, { 41, 41 }, { 41, 41 }, { 41, 41 },
+   { 41, 41 }, { 42, 42 }, { 42, 42 }, { 42, 42 }, { 42, 42 }, { 43, 43 }, { 43, 43 }, { 43, 43 },
+   { 43, 43 }, { 44, 44 }, { 44, 44 }, { 44, 44 }, { 44, 44 }, { 45, 45 }, { 45, 45 }, { 45, 45 },
+   { 45, 45 }, { 46, 46 }, { 46, 46 }, { 46, 46 }, { 46, 46 }, { 47, 47 }, { 47, 47 }, { 47, 47 },
+   { 47, 47 }, { 48, 48 }, { 48, 48 }, { 48, 48 }, { 48, 48 }, { 48, 48 }, { 49, 49 }, { 49, 49 },
+   { 49, 49 }, { 49, 49 }, { 50, 50 }, { 50, 50 }, { 50, 50 }, { 50, 50 }, { 51, 51 }, { 51, 51 },
+   { 51, 51 }, { 51, 51 }, { 52, 52 }, { 52, 52 }, { 52, 52 }, { 52, 52 }, { 53, 53 }, { 53, 53 },
+   { 53, 53 }, { 53, 53 }, { 54, 54 }, { 54, 54 }, { 54, 54 }, { 54, 54 }, { 55, 55 }, { 55, 55 },
+   { 55, 55 }, { 55, 55 }, { 56, 56 }, { 56, 56 }, { 56, 56 }, { 56, 56 }, { 57, 57 }, { 57, 57 },
+   { 57, 57 }, { 57, 57 }, { 58, 58 }, { 58, 58 }, { 58, 58 }, { 58, 58 }, { 59, 59 }, { 59, 59 },
+   { 59, 59 }, { 59, 59 }, { 60, 60 }, { 60, 60 }, { 60, 60 }, { 60, 60 }, { 61, 61 }, { 61, 61 },
+   { 61, 61 }, { 61, 61 }, { 62, 62 }, { 62, 62 }, { 62, 62 }, { 62, 62 }, { 63, 63 }, { 63, 63 },
 };
 
 static int stb__Mul8Bit(int a, int b)
@@ -794,7 +794,7 @@ int main()
       for (int j = 0; j < 256; ++j) {
          int mn, mx;
          int best_mn = 0, best_mx = 0;
-         int best_err = 256;
+         int best_err = 256 * 100;
          for (mn=0;mn<size;mn++) {
             for (mx=0;mx<size;mx++) {
                int mine = (mn * dequant) >> 4;
@@ -805,7 +805,7 @@ int main()
                // add this as error term. Normally we'd expect a random distribution of
                // +-1.5% error, but nowhere in the spec does it say that the error has to be
                // unbiased - better safe than sorry.
-               err += abs(maxe - mine) * 3 / 100;
+               err += abs(maxe - mine) * 3;
 
                if(err < best_err) {
                   best_mn = mn;

From dc2aa370c07219cd7f29d901d1168f2ff448c789 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 16:37:35 -0700
Subject: [PATCH 006/170] stb_dxt: Remove dithering support.

Keep STB_DXT_DITHER so as not to break existing code that tries
to enable it, but just leave it permanently off. I originally
introduced it somewhat superstitiously because of the RGB565
endpoint resolution but it never improved either perceptual quality
or objective quality metrics, and the code is appreciably simpler
without it.
---
 stb_dxt.h | 186 ++++++------------------------------------------------
 1 file changed, 18 insertions(+), 168 deletions(-)

diff --git a/stb_dxt.h b/stb_dxt.h
index 25d3fba..02a06ae 100644
--- a/stb_dxt.h
+++ b/stb_dxt.h
@@ -1,4 +1,4 @@
-// stb_dxt.h - v1.10 - DXT1/DXT5 compressor - public domain
+// stb_dxt.h - v1.11 - DXT1/DXT5 compressor - public domain
 // original by fabian "ryg" giesen - ported to C by stb
 // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
 //
@@ -10,6 +10,7 @@
 //     You can turn on dithering and "high quality" using mode.
 //
 // version history:
+//   v1.11  - (ryg) avoid racy global init, better single-color tables, remove dither
 //   v1.10  - (i.c) various small quality improvements
 //   v1.09  - (stb) update documentation re: surprising alpha channel requirement
 //   v1.08  - (stb) fix bug in dxt-with-alpha block
@@ -50,7 +51,7 @@ extern "C" {
 
 // compression mode (bitflags)
 #define STB_DXT_NORMAL    0
-#define STB_DXT_DITHER    1   // use dithering. dubious win. never use for normal maps and the like!
+#define STB_DXT_DITHER    1   // use dithering. was always dubious, now deprecated. does nothing!
 #define STB_DXT_HIGHQUAL  2   // high quality mode, does two refinement steps instead of 1. ~30-40% slower.
 
 STBDDEF void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src_rgba_four_bytes_per_pixel, int alpha, int mode);
@@ -82,7 +83,7 @@ STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *sr
 
 #include <stdlib.h>
 
-#if !defined(STBI_FABS)
+#if !defined(STBD_FABS)
 #include <math.h>
 #endif
 
@@ -90,49 +91,6 @@ STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *sr
 #define STBD_FABS(x)          fabs(x)
 #endif
 
-#ifndef STBD_MEMSET
-#include <string.h>
-#define STBD_MEMSET           memset
-#endif
-
-static const unsigned char stb__QuantRBTab[256 + 16] = {
-     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   8,   8,   8,
-     8,   8,   8,   8,   8,  16,  16,  16,  16,  16,  16,  16,  16,  24,  24,  24,
-    24,  24,  24,  24,  24,  33,  33,  33,  33,  33,  33,  33,  33,  33,  41,  41,
-    41,  41,  41,  41,  41,  41,  49,  49,  49,  49,  49,  49,  49,  49,  57,  57,
-    57,  57,  57,  57,  57,  57,  66,  66,  66,  66,  66,  66,  66,  66,  74,  74,
-    74,  74,  74,  74,  74,  74,  74,  82,  82,  82,  82,  82,  82,  82,  82,  90,
-    90,  90,  90,  90,  90,  90,  90,  99,  99,  99,  99,  99,  99,  99,  99, 107,
-   107, 107, 107, 107, 107, 107, 107, 107, 115, 115, 115, 115, 115, 115, 115, 115,
-   123, 123, 123, 123, 123, 123, 123, 123, 132, 132, 132, 132, 132, 132, 132, 132,
-   140, 140, 140, 140, 140, 140, 140, 140, 148, 148, 148, 148, 148, 148, 148, 148,
-   148, 156, 156, 156, 156, 156, 156, 156, 156, 165, 165, 165, 165, 165, 165, 165,
-   165, 173, 173, 173, 173, 173, 173, 173, 173, 181, 181, 181, 181, 181, 181, 181,
-   181, 181, 189, 189, 189, 189, 189, 189, 189, 189, 198, 198, 198, 198, 198, 198,
-   198, 198, 206, 206, 206, 206, 206, 206, 206, 206, 214, 214, 214, 214, 214, 214,
-   214, 214, 222, 222, 222, 222, 222, 222, 222, 222, 222, 231, 231, 231, 231, 231,
-   231, 231, 231, 239, 239, 239, 239, 239, 239, 239, 239, 247, 247, 247, 247, 247,
-   247, 247, 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-};
-static const unsigned char stb__QuantGTab[256 + 16] = {
-     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   4,   4,   4,   4,   8,
-     8,   8,   8,  12,  12,  12,  12,  16,  16,  16,  16,  20,  20,  20,  20,  24,
-    24,  24,  24,  28,  28,  28,  28,  32,  32,  32,  32,  36,  36,  36,  36,  40,
-    40,  40,  40,  44,  44,  44,  44,  48,  48,  48,  48,  52,  52,  52,  52,  56,
-    56,  56,  56,  60,  60,  60,  60,  65,  65,  65,  65,  69,  69,  69,  69,  73,
-    73,  73,  73,  77,  77,  77,  77,  81,  81,  81,  81,  85,  85,  85,  85,  85,
-    89,  89,  89,  89,  93,  93,  93,  93,  97,  97,  97,  97, 101, 101, 101, 101,
-   105, 105, 105, 105, 109, 109, 109, 109, 113, 113, 113, 113, 117, 117, 117, 117,
-   121, 121, 121, 121, 125, 125, 125, 125, 130, 130, 130, 130, 134, 134, 134, 134,
-   138, 138, 138, 138, 142, 142, 142, 142, 146, 146, 146, 146, 150, 150, 150, 150,
-   154, 154, 154, 154, 158, 158, 158, 158, 162, 162, 162, 162, 166, 166, 166, 166,
-   170, 170, 170, 170, 170, 174, 174, 174, 174, 178, 178, 178, 178, 182, 182, 182,
-   182, 186, 186, 186, 186, 190, 190, 190, 190, 195, 195, 195, 195, 199, 199, 199,
-   199, 203, 203, 203, 203, 207, 207, 207, 207, 211, 211, 211, 211, 215, 215, 215,
-   215, 219, 219, 219, 219, 223, 223, 223, 223, 227, 227, 227, 227, 231, 231, 231,
-   231, 235, 235, 235, 235, 239, 239, 239, 239, 243, 243, 243, 243, 247, 247, 247,
-   247, 251, 251, 251, 251, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-};
 static const unsigned char stb__OMatch5[256][2] = {
    {  0,  0 }, {  0,  0 }, {  0,  0 }, {  0,  0 }, {  0,  0 }, {  1,  1 }, {  1,  1 }, {  1,  1 },
    {  1,  1 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  2,  2 }, {  2,  2 }, {  2,  2 },
@@ -257,36 +215,8 @@ static void stb__EvalColors(unsigned char *color,unsigned short c0,unsigned shor
    stb__Lerp13RGB(color+12, color+4, color+0);
 }
 
-// Block dithering function. Simply dithers a block to 565 RGB.
-// (Floyd-Steinberg)
-static void stb__DitherBlock(unsigned char *dest, unsigned char *block)
-{
-  int err[8],*ep1 = err,*ep2 = err+4, *et;
-  int ch,y;
-
-  // process channels separately
-  for (ch=0; ch<3; ++ch) {
-      unsigned char *bp = block+ch, *dp = dest+ch;
-      const unsigned char *quant = (ch == 1) ? stb__QuantGTab+8 : stb__QuantRBTab+8;
-      STBD_MEMSET(err, 0, sizeof(err));
-      for(y=0; y<4; ++y) {
-         dp[ 0] = quant[bp[ 0] + ((3*ep2[1] + 5*ep2[0]) >> 4)];
-         ep1[0] = bp[ 0] - dp[ 0];
-         dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)];
-         ep1[1] = bp[ 4] - dp[ 4];
-         dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)];
-         ep1[2] = bp[ 8] - dp[ 8];
-         dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)];
-         ep1[3] = bp[12] - dp[12];
-         bp += 16;
-         dp += 16;
-         et = ep1, ep1 = ep2, ep2 = et; // swap
-      }
-   }
-}
-
 // The color matching function
-static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color,int dither)
+static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color)
 {
    unsigned int mask = 0;
    int dirr = color[0*4+0] - color[1*4+0];
@@ -315,68 +245,14 @@ static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *c
    halfPoint = (stops[3] + stops[2]);
    c3Point   = (stops[2] + stops[0]);
 
-   if(!dither) {
-      // the version without dithering is straightforward
-      for (i=15;i>=0;i--) {
-         int dot = dots[i]*2;
-         mask <<= 2;
+   for (i=15;i>=0;i--) {
+      int dot = dots[i]*2;
+      mask <<= 2;
 
-         if(dot < halfPoint)
-           mask |= (dot < c0Point) ? 1 : 3;
-         else
-           mask |= (dot < c3Point) ? 2 : 0;
-      }
-  } else {
-      // with floyd-steinberg dithering
-      int err[8],*ep1 = err,*ep2 = err+4;
-      int *dp = dots, y;
-
-      c0Point   <<= 3;
-      halfPoint <<= 3;
-      c3Point   <<= 3;
-      for(i=0;i<8;i++)
-         err[i] = 0;
-
-      for(y=0;y<4;y++)
-      {
-         int dot,lmask,step;
-
-         dot = (dp[0] << 4) + (3*ep2[1] + 5*ep2[0]);
-         if(dot < halfPoint)
-           step = (dot < c0Point) ? 1 : 3;
-         else
-           step = (dot < c3Point) ? 2 : 0;
-         ep1[0] = dp[0] - stops[step];
-         lmask = step;
-
-         dot = (dp[1] << 4) + (7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]);
-         if(dot < halfPoint)
-           step = (dot < c0Point) ? 1 : 3;
-         else
-           step = (dot < c3Point) ? 2 : 0;
-         ep1[1] = dp[1] - stops[step];
-         lmask |= step<<2;
-
-         dot = (dp[2] << 4) + (7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]);
-         if(dot < halfPoint)
-           step = (dot < c0Point) ? 1 : 3;
-         else
-           step = (dot < c3Point) ? 2 : 0;
-         ep1[2] = dp[2] - stops[step];
-         lmask |= step<<4;
-
-         dot = (dp[3] << 4) + (7*ep1[2] + 5*ep2[3] + ep2[2]);
-         if(dot < halfPoint)
-           step = (dot < c0Point) ? 1 : 3;
-         else
-           step = (dot < c3Point) ? 2 : 0;
-         ep1[3] = dp[3] - stops[step];
-         lmask |= step<<6;
-
-         dp += 4;
-         mask |= lmask << (y*8);
-         { int *et = ep1; ep1 = ep2; ep2 = et; } // swap
-      }
+      if(dot < halfPoint)
+         mask |= (dot < c0Point) ? 1 : 3;
+      else
+         mask |= (dot < c3Point) ? 2 : 0;
    }
 
    return mask;
@@ -603,12 +479,10 @@ static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, i
 {
    unsigned int mask;
    int i;
-   int dither;
    int refinecount;
    unsigned short max16, min16;
-   unsigned char dblock[16*4],color[4*4];
+   unsigned char color[4*4];
 
-   dither = mode & STB_DXT_DITHER;
    refinecount = (mode & STB_DXT_HIGHQUAL) ? 2 : 1;
 
    // check if block is constant
@@ -622,15 +496,11 @@ static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, i
       max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0];
       min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1];
    } else {
-      // first step: compute dithered version for PCA if desired
-      if(dither)
-         stb__DitherBlock(dblock,block);
-
-      // second step: pca+map along principal axis
-      stb__OptimizeColorsBlock(dither ? dblock : block,&max16,&min16);
+      // first step: PCA+map along principal axis
+      stb__OptimizeColorsBlock(block,&max16,&min16);
       if (max16 != min16) {
          stb__EvalColors(color,max16,min16);
-         mask = stb__MatchColorsBlock(block,color,dither);
+         mask = stb__MatchColorsBlock(block,color);
       } else
          mask = 0;
 
@@ -638,10 +508,10 @@ static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, i
       for (i=0;i<refinecount;i++) {
          unsigned int lastmask = mask;
 
-         if (stb__RefineBlock(dither ? dblock : block,&max16,&min16,mask)) {
+         if (stb__RefineBlock(block,&max16,&min16,mask)) {
             if (max16 != min16) {
                stb__EvalColors(color,max16,min16);
-               mask = stb__MatchColorsBlock(block,color,dither);
+               mask = stb__MatchColorsBlock(block,color);
             } else {
                mask = 0;
                break;
@@ -763,29 +633,9 @@ void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src)
 int main()
 {
    int i, j;
-   const char *quant_names[] = { "stb__QuantRBTab", "stb__QuantGTab" };
    const char *omatch_names[] = { "stb__OMatch5", "stb__OMatch6" };
    int dequant_mults[2] = { 33*4, 65 }; // .4 fixed-point dequant multipliers
 
-   // quant tables (for dither)
-   for (i=0; i < 2; ++i) {
-      int quant_mult = i ? 63 : 31;
-      printf("static const unsigned char %s[256 + 16] = {\n", quant_names[i]);
-      for (int j = 0; j < 256 + 16; ++j) {
-         int v = j - 8;
-         int q, dq;
-
-         v = (v < 0) ? 0 : (v > 255) ? 255 : v; // clamp
-         q = stb__Mul8Bit(v, quant_mult); // quantize
-         dq = (q * dequant_mults[i]) >> 4; // dequantize
-
-         if ((j % 16) == 0) printf("  "); // 2 spaces, third is done below
-         printf(" %3d,", dq);
-         if ((j % 16) == 15) printf("\n");
-      }
-      printf("};\n");
-   }
-
    // optimal endpoint tables
    for (i = 0; i < 2; ++i) {
       int dequant = dequant_mults[i];

From 2de22bde0ae1de14ad2d7f098a4643e340b96dd8 Mon Sep 17 00:00:00 2001
From: Valentin Lenhart <vlaube-de@users.noreply.github.com>
Date: Wed, 28 Oct 2020 14:13:17 +0100
Subject: [PATCH 007/170] stb_sprintf.h: stdlib.h is not needed

va_arg() is in stdarg.h, which is already being included
---
 stb_sprintf.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index a009c5d..cbe7868 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -187,7 +187,7 @@ PERFORMANCE vs MSVC 2008 32-/64-bit (GCC is even slower than MSVC):
 #define STBSP__ATTRIBUTE_FORMAT(fmt,va)
 #endif
 
-#include <stdarg.h> // for va_list()
+#include <stdarg.h> // for va_arg(), va_list()
 #include <stddef.h> // size_t, ptrdiff_t
 
 #ifndef STB_SPRINTF_MIN
@@ -211,8 +211,6 @@ STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char comma, char peri
 
 #ifdef STB_SPRINTF_IMPLEMENTATION
 
-#include <stdlib.h> // for va_arg()
-
 #define stbsp__uint32 unsigned int
 #define stbsp__int32 signed int
 

From 02578923d324babc429ad4115e2a22d997f84d74 Mon Sep 17 00:00:00 2001
From: Rafael Sachetto <rsachetto@gmail.com>
Date: Wed, 27 Jan 2021 15:05:48 -0300
Subject: [PATCH 008/170] Fix compilation warnings in the s390x architeture.
 Fixes #1082.

---
 stb_sprintf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index cbe7868..7396760 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -224,7 +224,7 @@ STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char comma, char peri
 #define stbsp__uint16 unsigned short
 
 #ifndef stbsp__uintptr
-#if defined(__ppc64__) || defined(__powerpc64__) || defined(__aarch64__) || defined(_M_X64) || defined(__x86_64__) || defined(__x86_64)
+#if defined(__ppc64__) || defined(__powerpc64__) || defined(__aarch64__) || defined(_M_X64) || defined(__x86_64__) || defined(__x86_64) || defined(__s390x__)
 #define stbsp__uintptr stbsp__uint64
 #else
 #define stbsp__uintptr stbsp__uint32

From 04007a071cf0ecdeab853186be98617920f324ad Mon Sep 17 00:00:00 2001
From: Jan CW Kroeze <jcwkroeze@pm.me>
Date: Thu, 11 Mar 2021 21:13:25 +0100
Subject: [PATCH 009/170] Note GL blend state for stb_truetype

---
 stb_truetype.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/stb_truetype.h b/stb_truetype.h
index 62595a1..542a890 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -297,6 +297,8 @@ void my_stbtt_initfont(void)
 void my_stbtt_print(float x, float y, char *text)
 {
    // assume orthographic projection with units = screen pixels, origin at top left
+   glEnable(GL_BLEND);
+   glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glEnable(GL_TEXTURE_2D);
    glBindTexture(GL_TEXTURE_2D, ftex);
    glBegin(GL_QUADS);

From 067655993a3d0a631e03cd7692607308bd9e1865 Mon Sep 17 00:00:00 2001
From: Doj <vitek03@gmail.com>
Date: Wed, 27 Jan 2021 20:39:02 +0300
Subject: [PATCH 010/170] stb_sprintf: fix stbsp_ddtoS64 macro

Should use xh argument not ph (which is the name of the
variable that it actually gets instantiated with the
one time it is used).
---
 stb_sprintf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index 7396760..3152ff3 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -1597,7 +1597,7 @@ static stbsp__uint64 const stbsp__powten[20] = {
 #define stbsp__ddtoS64(ob, xh, xl)          \
    {                                        \
       double ahi = 0, alo, vh, t;           \
-      ob = (stbsp__int64)ph;                \
+      ob = (stbsp__int64)xh;                \
       vh = (double)ob;                      \
       ahi = (xh - vh);                      \
       t = (ahi - xh);                       \

From 0afc39f59a96113af1f911f67bdb057785965875 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 17:03:51 -0700
Subject: [PATCH 011/170] stb_truetype: Turn codepoint assert into error check

Fixes the bug covered by PR #1066, but with a slightly different
fix that's hopefully a bit clearer.
---
 stb_truetype.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 542a890..e06b867 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -1541,12 +1541,12 @@ STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codep
       search += 2;
 
       {
-         stbtt_uint16 offset, start;
+         stbtt_uint16 offset, start, last;
          stbtt_uint16 item = (stbtt_uint16) ((search - endCount) >> 1);
 
-         STBTT_assert(unicode_codepoint <= ttUSHORT(data + endCount + 2*item));
          start = ttUSHORT(data + index_map + 14 + segcount*2 + 2 + 2*item);
-         if (unicode_codepoint < start)
+         last = ttUSHORT(data + endCount + 2*item);
+         if (unicode_codepoint < start || unicode_codepoint > last)
             return 0;
 
          offset = ttUSHORT(data + index_map + 14 + segcount*6 + 2 + 2*item);

From a49749a57d993e8d3bcc933f83266d7461435cfa Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 17:08:20 -0700
Subject: [PATCH 012/170] stb_textedit: Docs fix.

Fixes issue #1041.
---
 stb_textedit.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_textedit.h b/stb_textedit.h
index cd38a25..41afb9e 100644
--- a/stb_textedit.h
+++ b/stb_textedit.h
@@ -87,8 +87,8 @@
 //   moderate sizes. The undo system does no memory allocations, so
 //   it grows STB_TexteditState by the worst-case storage which is (in bytes):
 //
-//        [4 + 3 * sizeof(STB_TEXTEDIT_POSITIONTYPE)] * STB_TEXTEDIT_UNDOSTATE_COUNT
-//      +          sizeof(STB_TEXTEDIT_CHARTYPE)      * STB_TEXTEDIT_UNDOCHAR_COUNT
+//        [4 + 3 * sizeof(STB_TEXTEDIT_POSITIONTYPE)] * STB_TEXTEDIT_UNDOSTATECOUNT
+//      +          sizeof(STB_TEXTEDIT_CHARTYPE)      * STB_TEXTEDIT_UNDOCHARCOUNT
 //
 //
 // Implementation mode:

From 1203eb554b7229cdcdc9fde3826f95a675a0287f Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 17:11:25 -0700
Subject: [PATCH 013/170] stb_image: Fix issue #994.

Accidentally introduced during a merge.
---
 stb_image.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index accef48..0a8622b 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -5395,7 +5395,7 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
    }
    if (psize == 0) {
       STBI_ASSERT(info.offset == s->callback_already_read + (int) (s->img_buffer - s->img_buffer_original));
-      if (info.offset != s->callback_already_read + (s->img_buffer - s->buffer_start)) {
+      if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) {
         return stbi__errpuc("bad offset", "Corrupt BMP");
       }
    }

From 448bb137e3717bf015b410ca25ed83cc48353050 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 17:18:51 -0700
Subject: [PATCH 014/170] stb_image: Better docs for stbi_info.

Fixes #1026.
---
 stb_image.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 0a8622b..f59d773 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.26 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb
                                   no warranty implied; use at your own risk
 
    Do this:
@@ -48,6 +48,7 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
+      2.27  (          ) document stbi_info better
       2.26  (2020-07-13) many minor fixes
       2.25  (2020-02-02) fix warnings
       2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
@@ -176,6 +177,14 @@ RECENT REVISION HISTORY:
 //
 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
 //
+// To query the width, height and component count of an image without having to
+// decode the full file, you can use the stbi_info family of functions:
+//
+//   int x,y,n,ok;
+//   ok = stbi_info(filename, &x, &y, &n);
+//   // returns ok=1 and sets x, y, n if image is a supported format,
+//   // 0 otherwise.
+//
 // ===========================================================================
 //
 // UNICODE:

From c62af8565795222df95f548f31027ec0afb59bb1 Mon Sep 17 00:00:00 2001
From: Jacko Dirks <jacko.dirks@gmail.com>
Date: Sat, 15 Aug 2020 17:19:02 +0200
Subject: [PATCH 015/170] stb_image.h: Suppress warnings about out_size,
 delay_size

These two variables are unused on some targets, resulting in
warnings. Add STBI_NOTUSED around them to suppress those
warnings.
---
 stb_image.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index f59d773..a5ed6aa 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -116,6 +116,8 @@ RECENT REVISION HISTORY:
     Ryan C. Gordon          [reserved]                              [reserved]
                      DO NOT ADD YOUR NAME HERE
 
+                     Jacko Dirks
+
   To add your name to the credits, pick a random blank space in the middle and fill it.
   80% of merge conflicts on stb PRs are due to people adding their name at the end
   of the credits.
@@ -6786,6 +6788,10 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
       int stride;
       int out_size = 0;
       int delays_size = 0;
+
+      STBI_NOTUSED(out_size);
+      STBI_NOTUSED(delays_size);
+
       memset(&g, 0, sizeof(g));
       if (delays) {
          *delays = 0;
@@ -7215,8 +7221,8 @@ static int stbi__psd_is16(stbi__context *s)
        stbi__rewind( s );
        return 0;
    }
-   (void) stbi__get32be(s);
-   (void) stbi__get32be(s);
+   STBI_NOTUSED(stbi__get32be(s));
+   STBI_NOTUSED(stbi__get32be(s));
    depth = stbi__get16be(s);
    if (depth != 16) {
        stbi__rewind( s );

From 8c15cc9c79bf6f180d74808657046caf2ec0b445 Mon Sep 17 00:00:00 2001
From: Simon Breuss <simon.breuss@gmail.com>
Date: Tue, 18 May 2021 23:51:25 +0200
Subject: [PATCH 016/170] Adds 16-bit support for pnm files.

---
 stb_image.h | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index a5ed6aa..3c943a3 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -108,7 +108,7 @@ RECENT REVISION HISTORY:
     Cass Everitt            Ryamond Barbiero                        github:grim210
     Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
     Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
-    Josh Tobin                                 Matthew Gregan       github:poppolopoppo
+    Josh Tobin              Simon Breuss       Matthew Gregan       github:poppolopoppo
     Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
     Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
                             Brad Weinberger    Matvey Cherevko      [reserved]
@@ -935,6 +935,7 @@ static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 static int      stbi__pnm_test(stbi__context *s);
 static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__pnm_is16(stbi__context *s);
 #endif
 
 static
@@ -7322,7 +7323,8 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req
    stbi_uc *out;
    STBI_NOTUSED(ri);
 
-   if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
+   ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+   if (ri->bits_per_channel == 0)
       return 0;
 
    if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
@@ -7332,12 +7334,12 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req
    *y = s->img_y;
    if (comp) *comp = s->img_n;
 
-   if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+   if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0))
       return stbi__errpuc("too large", "PNM too large");
 
-   out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
+   out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
+   stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8));
 
    if (req_comp && req_comp != s->img_n) {
       out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
@@ -7413,12 +7415,21 @@ static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
    stbi__pnm_skip_whitespace(s, &c);
 
    maxv = stbi__pnm_getinteger(s, &c);  // read max value
-
-   if (maxv > 255)
-      return stbi__err("max value > 255", "PPM image not 8-bit");
+   if (maxv > 65535)
+      return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images");
+   else if (maxv > 255)
+      return 16;
    else
-      return 1;
+      return 8;
 }
+
+static int stbi__pnm_is16(stbi__context *s)
+{
+   if (stbi__pnm_info(s, NULL, NULL, NULL) == 16)
+	   return 1;
+   return 0;
+}
+
 #endif
 
 static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
@@ -7473,6 +7484,9 @@ static int stbi__is_16_main(stbi__context *s)
    if (stbi__psd_is16(s))  return 1;
    #endif
 
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_is16(s))  return 1;
+   #endif
    return 0;
 }
 

From 8e8f7d9b690bad933333cc57d05871ab5ca35917 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 17:35:40 -0700
Subject: [PATCH 017/170] stb_image: Update credits, change log

---
 stb_image.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 3c943a3..20050ec 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -48,7 +48,7 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
-      2.27  (          ) document stbi_info better
+      2.27  (          ) document stbi_info better, 16-bit PNM support
       2.26  (2020-07-13) many minor fixes
       2.25  (2020-02-02) fix warnings
       2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
@@ -90,7 +90,7 @@ RECENT REVISION HISTORY:
                                            Jeremy Sawicki (handle all ImageNet JPGs)
  Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
     Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
-    Arseny Kapoulkine
+    Arseny Kapoulkine                      Simon Breuss (16-bit PNM)
     John-Mark Allen
     Carmelo J Fdez-Aguera
 
@@ -108,7 +108,7 @@ RECENT REVISION HISTORY:
     Cass Everitt            Ryamond Barbiero                        github:grim210
     Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
     Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
-    Josh Tobin              Simon Breuss       Matthew Gregan       github:poppolopoppo
+    Josh Tobin                                 Matthew Gregan       github:poppolopoppo
     Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
     Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
                             Brad Weinberger    Matvey Cherevko      [reserved]

From 6199b717bcae770e9bae75696cef6a55f1e22ef1 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 1 Jul 2021 17:49:11 -0700
Subject: [PATCH 018/170] README: updated supported stb_image and
 stb_image_write formats

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 961556e..118cc9c 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,9 @@ by Jorge L. "VinoBS" Rodriguez, and stb_sprintf by Jeff Roberts.
 library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.21 | audio | 5569 | decode ogg vorbis files from file/memory to float/16-bit signed output
-**[stb_image.h](stb_image.h)** | 2.26 | graphics | 7762 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_image.h](stb_image.h)** | 2.26 | graphics | 7762 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC, PNM
 **[stb_truetype.h](stb_truetype.h)** | 1.24 | graphics | 5011 | parse, decode, and rasterize characters from truetype fonts
-**[stb_image_write.h](stb_image_write.h)** | 1.15 | graphics | 1690 | image writing to disk: PNG, TGA, BMP
+**[stb_image_write.h](stb_image_write.h)** | 1.15 | graphics | 1690 | image writing to disk: PNG, TGA, BMP, JPG, HDR
 **[stb_image_resize.h](stb_image_resize.h)** | 0.96 | graphics | 2631 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.00 | graphics | 628 | simple 2D rectangle packer with decent quality
 **[stb_ds.h](stb_ds.h)** | 0.66 | utility | 1893 | typesafe dynamic array and hash tables for C, will compile in C++

From 6ab6303f98f1ee33abe93b730dbbb5b875abbddd Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 17:50:34 -0700
Subject: [PATCH 019/170] stb_image: Check results of all mallocs.

A few were missing. Make sure to always report ouf-of-memory
errors.

Fixes issue #1056.
---
 stb_image.h | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 20050ec..71bd506 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -48,7 +48,7 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
-      2.27  (          ) document stbi_info better, 16-bit PNM support
+      2.27  (          ) document stbi_info better, 16-bit PNM support, bug fixes
       2.26  (2020-07-13) many minor fixes
       2.25  (2020-02-02) fix warnings
       2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
@@ -3936,6 +3936,7 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re
 {
    unsigned char* result;
    stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__errpuc("outofmem", "Out of memory");
    STBI_NOTUSED(ri);
    j->s = s;
    stbi__setup_jpeg(j);
@@ -3948,6 +3949,7 @@ static int stbi__jpeg_test(stbi__context *s)
 {
    int r;
    stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__err("outofmem", "Out of memory");
    j->s = s;
    stbi__setup_jpeg(j);
    r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@@ -3972,6 +3974,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 {
    int result;
    stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (!j) return stbi__err("outofmem", "Out of memory");
    j->s = s;
    result = stbi__jpeg_info_raw(j, x, y, comp);
    STBI_FREE(j);
@@ -4771,6 +4774,7 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3
 
    // de-interlacing
    final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+   if (!final) return stbi__err("outofmem", "Out of memory");
    for (p=0; p < 7; ++p) {
       int xorig[] = { 0,4,0,2,0,1,0 };
       int yorig[] = { 0,0,4,0,2,0,1 };
@@ -6354,6 +6358,7 @@ static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_c
 
    // intermediate buffer is RGBA
    result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
    memset(result, 0xff, x*y*4);
 
    if (!stbi__pic_load_core(s,x,y,comp, result)) {
@@ -6469,6 +6474,7 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 {
    stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   if (!g) return stbi__err("outofmem", "Out of memory");
    if (!stbi__gif_header(s, g, comp, 1)) {
       STBI_FREE(g);
       stbi__rewind( s );
@@ -6778,6 +6784,17 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
    }
 }
 
+static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays)
+{
+   STBI_FREE(g->out);
+   STBI_FREE(g->history);
+   STBI_FREE(g->background);
+
+   if (out) STBI_FREE(out);
+   if (delays && *delays) STBI_FREE(*delays);
+   return stbi__errpuc("outofmem", "Out of memory");
+}
+
 static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
 {
    if (stbi__gif_test(s)) {
@@ -6810,12 +6827,8 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
 
             if (out) {
                void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride );
-               if (NULL == tmp) {
-                  STBI_FREE(g.out);
-                  STBI_FREE(g.history);
-                  STBI_FREE(g.background);
-                  return stbi__errpuc("outofmem", "Out of memory");
-               }
+               if (!tmp)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
                else {
                    out = (stbi_uc*) tmp;
                    out_size = layers * stride;
@@ -6823,13 +6836,19 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
 
                if (delays) {
                   *delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
                   delays_size = layers * sizeof(int);
                }
             } else {
                out = (stbi_uc*)stbi__malloc( layers * stride );
+               if (!out)
+                  return stbi__load_gif_main_outofmem(&g, out, delays);
                out_size = layers * stride;
                if (delays) {
                   *delays = (int*) stbi__malloc( layers * sizeof(int) );
+                  if (!*delays)
+                     return stbi__load_gif_main_outofmem(&g, out, delays);
                   delays_size = layers * sizeof(int);
                }
             }

From 86b7570cfba845e8209c6aec2d15e487bb1d8bb4 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 18:10:49 -0700
Subject: [PATCH 020/170] stb_image: Fix bug on JPEGs with malformed DC deltas

extend_receive implicitly requires n <= 15 (code length);
the maximum that actually makes sense for 8-bit baseline JPEG is
11, but 15 is the natural limit for us because the AC coding path
stores the number of magnitude bits in a nibble.

Check that DC delta bits are in range before attempting to call
extend_receive.

Fixes issue #1108.
---
 stb_image.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 71bd506..2fced29 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -2158,7 +2158,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
 
    if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
    t = stbi__jpeg_huff_decode(j, hdc);
-   if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
 
    // 0 all the ac values now so we can do it 32-bits at a time
    memset(data,0,64*sizeof(data[0]));
@@ -2215,7 +2215,7 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
       // first scan for DC coefficient, must be first
       memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
       t = stbi__jpeg_huff_decode(j, hdc);
-      if (t == -1) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+      if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
       diff = t ? stbi__extend_receive(j, t) : 0;
 
       dc = j->img_comp[b].dc_pred + diff;

From 265b73bb0baf5aa24f12d733c652086063d54a5d Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 18:28:41 -0700
Subject: [PATCH 021/170] stb_image: Fix lrot definition, small extend_receive
 tweak

Define lrot in a way that doesn't involve UB when n==0.
Also, the previous patch ensures that n <= 15 for all callers
of stbi__extend_receive, so can remove the (less restrictive)
bounds check for 0 <= n < 17 (the bounds of stbi__bmask)
entirely.

Fixes issue #1065.
---
 stb_image.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 2fced29..13f9f12 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -645,7 +645,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #ifdef STBI_HAS_LROTL
    #define stbi_lrot(x,y)  _lrotl(x,y)
 #else
-   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (-(y) & 31)))
 #endif
 
 #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
@@ -2104,7 +2104,6 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
 
    sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
    k = stbi_lrot(j->code_buffer, n);
-   if (n < 0 || n >= (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask))) return 0;
    j->code_buffer = k & ~stbi__bmask[n];
    k &= stbi__bmask[n];
    j->code_bits -= n;

From 6d857933d5a88633bc170fc982d1c0ebaed9faf9 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 18:42:01 -0700
Subject: [PATCH 022/170] stb_image, stb_image_write: Fix compare sign warnings

For the stb_image fix, also replace the magic 288 with a more
descriptive name while I'm at it.

Fixes #1100
---
 stb_image.h       | 11 ++++++-----
 stb_image_write.h |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 13f9f12..8ac86b6 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -3993,6 +3993,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 // fast-way is faster to check than jpeg huffman, but slow way is slower
 #define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
 #define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
+#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
 
 // zlib-style huffman encoding
 // (jpegs packs from left, zlib from right, so can't share code)
@@ -4002,8 +4003,8 @@ typedef struct
    stbi__uint16 firstcode[16];
    int maxcode[17];
    stbi__uint16 firstsymbol[16];
-   stbi_uc  size[288];
-   stbi__uint16 value[288];
+   stbi_uc  size[STBI__ZNSYMS];
+   stbi__uint16 value[STBI__ZNSYMS];
 } stbi__zhuffman;
 
 stbi_inline static int stbi__bitreverse16(int n)
@@ -4134,7 +4135,7 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
    if (s >= 16) return -1; // invalid code!
    // code size is s, so:
    b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
-   if (b >= sizeof (z->size)) return -1; // some data was corrupt somewhere!
+   if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere!
    if (z->size[b] != s) return -1;  // was originally an assert, but report failure instead.
    a->code_buffer >>= s;
    a->num_bits -= s;
@@ -4331,7 +4332,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
    return 1;
 }
 
-static const stbi_uc stbi__zdefault_length[288] =
+static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] =
 {
    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
@@ -4377,7 +4378,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
       } else {
          if (type == 1) {
             // use fixed code lengths
-            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , STBI__ZNSYMS)) return 0;
             if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
          } else {
             if (!stbi__compute_huffman_codes(a)) return 0;
diff --git a/stb_image_write.h b/stb_image_write.h
index 95943eb..7b5d91f 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -397,7 +397,7 @@ static void stbiw__putc(stbi__write_context *s, unsigned char c)
 
 static void stbiw__write1(stbi__write_context *s, unsigned char a)
 {
-   if (s->buf_used + 1 > sizeof(s->buffer))
+   if ((size_t)s->buf_used + 1 > sizeof(s->buffer))
       stbiw__write_flush(s);
    s->buffer[s->buf_used++] = a;
 }
@@ -405,7 +405,7 @@ static void stbiw__write1(stbi__write_context *s, unsigned char a)
 static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
 {
    int n;
-   if (s->buf_used + 3 > sizeof(s->buffer))
+   if ((size_t)s->buf_used + 3 > sizeof(s->buffer))
       stbiw__write_flush(s);
    n = s->buf_used;
    s->buf_used = n+3;

From 43b32c7babcec25bf7c544220e55bc03b4fd37dc Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 20:59:22 -0700
Subject: [PATCH 023/170] stb_image: Avoid shift of signed values in
 extend_receive

Use an equivalent formulation that has sgn=0 or 1, not 0 or -1.
This avoids right-shifting signed values, at least in this place.

Fixes issue #1061.
---
 stb_image.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 8ac86b6..5dd9f32 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -2102,12 +2102,12 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
    int sgn;
    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
 
-   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
    k = stbi_lrot(j->code_buffer, n);
    j->code_buffer = k & ~stbi__bmask[n];
    k &= stbi__bmask[n];
    j->code_bits -= n;
-   return k + (stbi__jbias[n] & ~sgn);
+   return k + (stbi__jbias[n] & (sgn - 1));
 }
 
 // get some unsigned bits

From bb0931744529c218bfa50e9b367c811a31251b75 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 21:09:11 -0700
Subject: [PATCH 024/170] stb_image: Avoid left-shifts of signed values

It's implementation-specified behavior. Writing this code and then
relying on compiler strength reduction to turn it back into shifts
feels extremely silly but it is what it is.

Fixes issue #1097.
---
 stb_image.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 5dd9f32..a50e7bc 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -2219,7 +2219,7 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
 
       dc = j->img_comp[b].dc_pred + diff;
       j->img_comp[b].dc_pred = dc;
-      data[0] = (short) (dc << j->succ_low);
+      data[0] = (short) (dc * (1 << j->succ_low));
    } else {
       // refinement scan for DC coefficient
       if (stbi__jpeg_get_bit(j))
@@ -2256,7 +2256,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
             j->code_buffer <<= s;
             j->code_bits -= s;
             zig = stbi__jpeg_dezigzag[k++];
-            data[zig] = (short) ((r >> 8) << shift);
+            data[zig] = (short) ((r >> 8) * (1 << shift));
          } else {
             int rs = stbi__jpeg_huff_decode(j, hac);
             if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
@@ -2274,7 +2274,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
             } else {
                k += r;
                zig = stbi__jpeg_dezigzag[k++];
-               data[zig] = (short) (stbi__extend_receive(j,s) << shift);
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift));
             }
          }
       } while (k <= j->spec_end);

From d6ab7faec08d6964856472f4ada6adbc0e33ae92 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 21:38:44 -0700
Subject: [PATCH 025/170] stb_image: Update comment

As per recent patches, we do support 16-bit PNMs.
---
 stb_image.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index a50e7bc..82b4951 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -7321,7 +7321,6 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 // Known limitations:
 //    Does not support comments in the header section
 //    Does not support ASCII image data (formats P2 and P3)
-//    Does not support 16-bit-per-channel
 
 #ifndef STBI_NO_PNM
 

From 31ba943e3fdff7aff00b55629f747121f802e8c1 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 21:57:25 -0700
Subject: [PATCH 026/170] stb_image: UB fix in stbi__get32le

Need to do the second-part shift on uint32 not int.
---
 stb_image.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 82b4951..01807cc 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1674,7 +1674,8 @@ static int stbi__get16le(stbi__context *s)
 static stbi__uint32 stbi__get32le(stbi__context *s)
 {
    stbi__uint32 z = stbi__get16le(s);
-   return z + (stbi__get16le(s) << 16);
+   z += (stbi__uint32)stbi__get16le(s) << 16;
+   return z;
 }
 #endif
 

From 4d3b93f5891794a6986c9808fbaaba7ee5a27a10 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 22:01:23 -0700
Subject: [PATCH 027/170] stb_image: Erorr in BMP should error, not assert.

There was both the assert and the error check; should just be
the error check.

Fixes issue #881 (or rather, part of it).
---
 stb_image.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 01807cc..87ae770 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -5411,7 +5411,6 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
          psize = (info.offset - info.extra_read - info.hsz) >> 2;
    }
    if (psize == 0) {
-      STBI_ASSERT(info.offset == s->callback_already_read + (int) (s->img_buffer - s->img_buffer_original));
       if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) {
         return stbi__errpuc("bad offset", "Corrupt BMP");
       }

From 55a180671c8be196dabde96baaa7219fc20ecd1f Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 22:31:41 -0700
Subject: [PATCH 028/170] readme: Add "how to use these libs" section

Try to be a bit more explicit still.

Fixes issue #903, or so I hope.
---
 README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/README.md b/README.md
index 118cc9c..fd3f669 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,24 @@ Total lines of C code: 56530
 FAQ
 ---
 
+#### How do I use these libraries?
+
+The idea behind single-header file libraries is that they're easy to distribute and deploy
+because all the code is contained in a single file. By default, the .h files in here act as
+their own header files, i.e. they declare the functions contained in the file but don't
+actually result in any code getting compiled.
+
+So in addition, you should select _exactly one_ C/C++ source file that actually instantiates
+the code, preferably a file you're not editing frequently. This file should define a
+specific macro (this is documented per-library) to actually enable the function definitions.
+For example, to use stb_image, you should have exactly one C/C++ file that doesn't
+include stb_image.h regularly, but instead does
+
+    #define STB_IMAGE_IMPLEMENTATION
+    #include "stb_image.h"
+
+The right macro to define is pointed out right at the top of each of these libraries.
+
 #### What's the license?
 
 These libraries are in the public domain. You can do anything you

From 618dbd01c81bad6aeb8eb3f37fc3281548678d3c Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 22:52:15 -0700
Subject: [PATCH 029/170] stb_image: Document image size limits

Both the buffer size limits and the image dimension limits.

Fixes issue #672.
---
 stb_image.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/stb_image.h b/stb_image.h
index 87ae770..3057481 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -187,6 +187,24 @@ RECENT REVISION HISTORY:
 //   // returns ok=1 and sets x, y, n if image is a supported format,
 //   // 0 otherwise.
 //
+// Note that stb_image pervasively uses ints in its public API for sizes,
+// including sizes of memory buffers. This is now part of the API and thus
+// hard to change without causing breakage. As a result, the various image
+// loaders all have certain limits on image size; these differ somewhat
+// by format but generally boil down to either just under 2GB or just under
+// 1GB. When the decoded image would be larger than this, stb_image decoding
+// will fail.
+//
+// Additionally, stb_image will reject image files that have any of their
+// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
+// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
+// the only way to have an image with such dimensions load correctly
+// is for it to have a rather extreme aspect ratio. Either way, the
+// assumption here is that such larger images are likely to be malformed
+// or malicious. If you do need to load an image with individual dimensions
+// larger than that, and it still fits in the overall size limit, you can
+// #define STBI_MAX_DIMENSIONS on your own to be something larger.
+//
 // ===========================================================================
 //
 // UNICODE:

From 56a7113cd0d0cb3c5be401436de10d7d5e96df6e Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 2 Jul 2021 23:06:44 -0700
Subject: [PATCH 030/170] stb_image: Reorder format test sequence

Put the formats that start with a clear magic number first,
the dodgy ones that don't have much of a distinctive header
should be tested for later after we've ruled out the clearer
ones.

Fixes issue #787, hopefully. (Never got a clean repro.)
---
 stb_image.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 3057481..6bbcea4 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1117,9 +1117,8 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re
    ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
    ri->num_channels = 0;
 
-   #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
-   #endif
+   // test the formats with a very explicit header first (at least a FOURCC
+   // or distinctive magic number first)
    #ifndef STBI_NO_PNG
    if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
    #endif
@@ -1137,6 +1136,13 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re
    #ifndef STBI_NO_PIC
    if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
    #endif
+
+   // then the formats that can end up attempting to load with just 1 or 2
+   // bytes matching expectations; these are prone to false positives, so
+   // try them later
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
+   #endif
    #ifndef STBI_NO_PNM
    if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
    #endif

From 991f1f6419aeba2b69c26aa26befe3ae93259a15 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sat, 3 Jul 2021 00:38:30 -0700
Subject: [PATCH 031/170] stb_image: Fix wrong buffer sizes passed to
 MultiByteToWideChar

Fixes issue #772.
---
 stb_image.h       | 4 ++--
 stb_image_write.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 6bbcea4..54ab97d 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1316,10 +1316,10 @@ static FILE *stbi__fopen(char const *filename, char const *mode)
 #if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
    wchar_t wMode[64];
    wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)))
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
       return 0;
 
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
       return 0;
 
 #if _MSC_VER >= 1400
diff --git a/stb_image_write.h b/stb_image_write.h
index 7b5d91f..be9e72d 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -306,10 +306,10 @@ static FILE *stbiw__fopen(char const *filename, char const *mode)
 #if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
    wchar_t wMode[64];
    wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)))
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
       return 0;
 
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)))
+	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
       return 0;
 
 #if _MSC_VER >= 1400

From 2506215e8ab4095d791e95658b26dd708bb13857 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sat, 3 Jul 2021 00:46:04 -0700
Subject: [PATCH 032/170] stb_image: Key Win32 UTF-8 support off _WIN32 not
 _MSC_VER

So that it also works on MinGW.

Fixes issue #729.
---
 stb_image.h       | 8 ++++----
 stb_image_write.h | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 54ab97d..59ee9fb 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1298,12 +1298,12 @@ static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, in
 
 #ifndef STBI_NO_STDIO
 
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
 STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default);
 #endif
 
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
 {
 	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
@@ -1313,7 +1313,7 @@ STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wch
 static FILE *stbi__fopen(char const *filename, char const *mode)
 {
    FILE *f;
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
    wchar_t wMode[64];
    wchar_t wFilename[1024];
 	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
@@ -1322,7 +1322,7 @@ static FILE *stbi__fopen(char const *filename, char const *mode)
 	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
       return 0;
 
-#if _MSC_VER >= 1400
+#if defined(_MSC_VER) && _MSC_VER >= 1400
 	if (0 != _wfopen_s(&f, wFilename, wMode))
 		f = 0;
 #else
diff --git a/stb_image_write.h b/stb_image_write.h
index be9e72d..6889b8d 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -285,7 +285,7 @@ static void stbi__stdio_write(void *context, void *data, int size)
    fwrite(data,1,size,(FILE*) context);
 }
 
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
 #ifdef __cplusplus
 #define STBIW_EXTERN extern "C"
 #else
@@ -303,7 +303,7 @@ STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const w
 static FILE *stbiw__fopen(char const *filename, char const *mode)
 {
    FILE *f;
-#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
    wchar_t wMode[64];
    wchar_t wFilename[1024];
 	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
@@ -312,7 +312,7 @@ static FILE *stbiw__fopen(char const *filename, char const *mode)
 	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
       return 0;
 
-#if _MSC_VER >= 1400
+#if defined(_MSC_VER) && _MSC_VER >= 1400
 	if (0 != _wfopen_s(&f, wFilename, wMode))
 		f = 0;
 #else

From db864a1e309e3e785d1b58879cec83c66444fb2b Mon Sep 17 00:00:00 2001
From: Eugene Golushkov <e.golushkov@nospam-gmail.com>
Date: Thu, 1 Oct 2020 03:20:30 +0300
Subject: [PATCH 033/170] stb_image: fix building by MSVC for Windows 10 on ARM

---
 stb_image.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 59ee9fb..5b2c456 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -777,9 +777,12 @@ static int stbi__sse2_available(void)
 
 #ifdef STBI_NEON
 #include <arm_neon.h>
-// assume GCC or Clang on ARM targets
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 #endif
+#endif
 
 #ifndef STBI_SIMD_ALIGN
 #define STBI_SIMD_ALIGN(type, name) type name

From 82f9950cea9194f731c42ebbc0678c19c85972af Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sat, 3 Jul 2021 01:00:46 -0700
Subject: [PATCH 034/170] stb_image: Update credits

---
 stb_image.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 5b2c456..f2e3430 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -103,7 +103,7 @@ RECENT REVISION HISTORY:
     Thomas Ruf              Ronny Chevalier                         github:rlyeh
     Janez Zemva             John Bartholomew   Michal Cichon        github:romigrou
     Jonathan Blow           Ken Hamada         Tero Hanninen        github:svdijk
-                            Laurent Gomila     Cort Stratton        github:snagar
+    Eugene Golushkov        Laurent Gomila     Cort Stratton        github:snagar
     Aruelien Pocheville     Sergio Gonzalez    Thibault Reuille     github:Zelex
     Cass Everitt            Ryamond Barbiero                        github:grim210
     Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw

From 81bcfa9043546dcb0ff5de710002470ac6bb342b Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sat, 3 Jul 2021 23:55:14 -0700
Subject: [PATCH 035/170] stb_image_write: Remove tabs

Whitespace-only change.
---
 stb_image_write.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/stb_image_write.h b/stb_image_write.h
index 6889b8d..3ba0db1 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -296,7 +296,7 @@ STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned in
 
 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input)
 {
-	return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
+   return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL);
 }
 #endif
 
@@ -306,15 +306,15 @@ static FILE *stbiw__fopen(char const *filename, char const *mode)
 #if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
    wchar_t wMode[64];
    wchar_t wFilename[1024];
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))
       return 0;
 
-	if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
+   if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode)))
       return 0;
 
 #if defined(_MSC_VER) && _MSC_VER >= 1400
-	if (0 != _wfopen_s(&f, wFilename, wMode))
-		f = 0;
+   if (0 != _wfopen_s(&f, wFilename, wMode))
+      f = 0;
 #else
    f = _wfopen(wFilename, wMode);
 #endif
@@ -1635,7 +1635,7 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
              add HDR output
              fix monochrome BMP
       0.95 (2014-08-17)
-		       add monochrome TGA output
+             add monochrome TGA output
       0.94 (2014-05-31)
              rename private functions to avoid conflicts with stb_image.h
       0.93 (2014-05-27)

From 76f55ac210f76592aa73b226c63dff022bd9060b Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 00:16:14 -0700
Subject: [PATCH 036/170] stb_image_write: STBI_WINDOWS_UTF8 ->
 STBIW_WINDOWS_UTF8

Fixes issue #925.
---
 stb_image_write.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stb_image_write.h b/stb_image_write.h
index 3ba0db1..a1634e5 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -178,7 +178,7 @@ STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const
 STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
 STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
 
-#ifdef STBI_WINDOWS_UTF8
+#ifdef STBIW_WINDOWS_UTF8
 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input);
 #endif
 #endif
@@ -285,7 +285,7 @@ static void stbi__stdio_write(void *context, void *data, int size)
    fwrite(data,1,size,(FILE*) context);
 }
 
-#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
 #ifdef __cplusplus
 #define STBIW_EXTERN extern "C"
 #else
@@ -303,7 +303,7 @@ STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const w
 static FILE *stbiw__fopen(char const *filename, char const *mode)
 {
    FILE *f;
-#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8)
+#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8)
    wchar_t wMode[64];
    wchar_t wFilename[1024];
    if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename)))

From ba3ba9b78c85abd4ed8ff5d432196f84160bd2b4 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 00:23:38 -0700
Subject: [PATCH 037/170] stb_image_write: Disable HDR writer completely in
 NO_STDIO build

Fixes issue #793, hopefully.
---
 stb_image_write.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stb_image_write.h b/stb_image_write.h
index a1634e5..0dba14c 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -622,6 +622,8 @@ STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const
 
 #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
 
+#ifndef STBI_WRITE_NO_STDIO
+
 static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
 {
    int exponent;
@@ -777,7 +779,6 @@ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x,
    return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
 }
 
-#ifndef STBI_WRITE_NO_STDIO
 STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
 {
    stbi__write_context s = { 0 };

From a5e40739ac096711e6640babdf3038c8203f9978 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 01:33:53 -0700
Subject: [PATCH 038/170] stb_image_write: Fix define tested for sprintf_s

Fixes issue 744.
---
 stb_image_write.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image_write.h b/stb_image_write.h
index 0dba14c..271d4ac 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -758,7 +758,7 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f
       char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
       s->func(s->context, header, sizeof(header)-1);
 
-#ifdef __STDC_WANT_SECURE_LIB__
+#ifdef __STDC_LIB_EXT1__
       len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
 #else
       len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);

From 3f671b870ff2f25b439ba36d8490c201c8b2c2bd Mon Sep 17 00:00:00 2001
From: Johannes Schultz <git42@sagagames.de>
Date: Sat, 16 Jan 2021 15:04:26 +0100
Subject: [PATCH 039/170] stb_vorbis: rename BUFFER_SIZE macro to
 STB_BUFFER_SIZE

Fixes issue #1076.
---
 stb_vorbis.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 6b9be1b..e3ce1e2 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -29,7 +29,7 @@
 //    Bernhard Wodo      Evan Balster        github:alxprd
 //    Tom Beaumont       Ingo Leitgeb        Nicolas Guillemot
 //    Phillip Bennefall  Rohit               Thiago Goulart
-//    github:manxorist   saga musix          github:infatum
+//    github:manxorist   Saga Musix          github:infatum
 //    Timur Gagiev       Maxwell Koo         Peter Waller
 //    github:audinowho   Dougall Johnson     David Reid
 //    github:Clownacy    Pedro J. Estebanez  Remi Verschelde
@@ -5160,11 +5160,11 @@ static void copy_samples(short *dest, float *src, int len)
 
 static void compute_samples(int mask, short *output, int num_c, float **data, int d_offset, int len)
 {
-   #define BUFFER_SIZE  32
-   float buffer[BUFFER_SIZE];
-   int i,j,o,n = BUFFER_SIZE;
+   #define STB_BUFFER_SIZE  32
+   float buffer[STB_BUFFER_SIZE];
+   int i,j,o,n = STB_BUFFER_SIZE;
    check_endianness();
-   for (o = 0; o < len; o += BUFFER_SIZE) {
+   for (o = 0; o < len; o += STB_BUFFER_SIZE) {
       memset(buffer, 0, sizeof(buffer));
       if (o + n > len) n = len - o;
       for (j=0; j < num_c; ++j) {
@@ -5181,16 +5181,17 @@ static void compute_samples(int mask, short *output, int num_c, float **data, in
          output[o+i] = v;
       }
    }
+   #undef STB_BUFFER_SIZE
 }
 
 static void compute_stereo_samples(short *output, int num_c, float **data, int d_offset, int len)
 {
-   #define BUFFER_SIZE  32
-   float buffer[BUFFER_SIZE];
-   int i,j,o,n = BUFFER_SIZE >> 1;
+   #define STB_BUFFER_SIZE  32
+   float buffer[STB_BUFFER_SIZE];
+   int i,j,o,n = STB_BUFFER_SIZE >> 1;
    // o is the offset in the source data
    check_endianness();
-   for (o = 0; o < len; o += BUFFER_SIZE >> 1) {
+   for (o = 0; o < len; o += STB_BUFFER_SIZE >> 1) {
       // o2 is the offset in the output data
       int o2 = o << 1;
       memset(buffer, 0, sizeof(buffer));
@@ -5220,6 +5221,7 @@ static void compute_stereo_samples(short *output, int num_c, float **data, int d
          output[o2+i] = v;
       }
    }
+   #undef STB_BUFFER_SIZE
 }
 
 static void convert_samples_short(int buf_c, short **buffer, int b_offset, int data_c, float **data, int d_offset, int samples)

From 2e3a6e82228164171b1a4d5b703c40c8d2d42b69 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 02:43:00 -0700
Subject: [PATCH 040/170] stb_vorbis: Move asserts around a bit

Not an actual bug, it just looked wonky, but this code runs
with code lengths that are verified to be in range (<32) by
the length-reading code. Anyway.

Fixes issue #901.
---
 stb_vorbis.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index e3ce1e2..8bf886b 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -1079,6 +1079,7 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values)
    // find the first entry
    for (k=0; k < n; ++k) if (len[k] < NO_CODE) break;
    if (k == n) { assert(c->sorted_entries == 0); return TRUE; }
+   assert(len[k] < 32); // no error return required, code reading lens checks this
    // add to the list
    add_entry(c, 0, k, m++, len[k], values);
    // add all available leaves
@@ -1092,6 +1093,7 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values)
       uint32 res;
       int z = len[i], y;
       if (z == NO_CODE) continue;
+      assert(z < 32); // no error return required, code reading lens checks this
       // find lowest available leaf (should always be earliest,
       // which is what the specification calls for)
       // note that this property, and the fact we can never have
@@ -1101,12 +1103,10 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values)
       while (z > 0 && !available[z]) --z;
       if (z == 0) { return FALSE; }
       res = available[z];
-      assert(z >= 0 && z < 32);
       available[z] = 0;
       add_entry(c, bit_reverse(res), i, m++, len[i], values);
       // propagate availability up the tree
       if (z != len[i]) {
-         assert(len[i] >= 0 && len[i] < 32);
          for (y=len[i]; y > z; --y) {
             assert(available[y] == 0);
             available[y] = res + (1 << (32-y));

From 6fe053614c6973ae7edf92ef5f9c5bd901adfeeb Mon Sep 17 00:00:00 2001
From: "morlad (iLeitgeb)" <morlad@morlad.at>
Date: Fri, 12 Mar 2021 14:30:06 +0000
Subject: [PATCH 041/170] stb_vorbis: Fix memory leak in stb_vorbis

When start_decoder() fails it may already have allocated memory
for .vendor and/or .comment_list. Call vorbis_deinit() to free
any allocated memory.

Fixes issue #1051.
---
 stb_vorbis.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 8bf886b..0c7c18f 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -33,7 +33,7 @@
 //    Timur Gagiev       Maxwell Koo         Peter Waller
 //    github:audinowho   Dougall Johnson     David Reid
 //    github:Clownacy    Pedro J. Estebanez  Remi Verschelde
-//    AnthoFoxo
+//    AnthoFoxo          github:morlat
 //
 // Partial history:
 //    1.21    - 2021-07-02 - fix bug for files with no comments
@@ -4513,6 +4513,7 @@ stb_vorbis *stb_vorbis_open_pushdata(
          *error = VORBIS_need_more_data;
       else
          *error = p.error;
+      vorbis_deinit(&p);
       return NULL;
    }
    f = vorbis_alloc(&p);

From 5300d55277cbe1304f7c0d2eb180700e531e698a Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 03:00:17 -0700
Subject: [PATCH 042/170] stb_vorbis: Include alloca.h on Sun targets

See PR #1006.
---
 stb_vorbis.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 0c7c18f..8288d09 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -1,4 +1,4 @@
-// Ogg Vorbis audio decoder - v1.21 - public domain
+// Ogg Vorbis audio decoder - v1.22 - public domain
 // http://nothings.org/stb_vorbis/
 //
 // Original version written by Sean Barrett in 2007.
@@ -33,9 +33,10 @@
 //    Timur Gagiev       Maxwell Koo         Peter Waller
 //    github:audinowho   Dougall Johnson     David Reid
 //    github:Clownacy    Pedro J. Estebanez  Remi Verschelde
-//    AnthoFoxo          github:morlat
+//    AnthoFoxo          github:morlat       Gabriel Ravier
 //
 // Partial history:
+//    1.22    -            - various small fixes
 //    1.21    - 2021-07-02 - fix bug for files with no comments
 //    1.20    - 2020-07-11 - several small fixes
 //    1.19    - 2020-02-05 - warnings
@@ -581,7 +582,7 @@ enum STBVorbisError
    #if defined(_MSC_VER) || defined(__MINGW32__)
       #include <malloc.h>
    #endif
-   #if defined(__linux__) || defined(__linux) || defined(__EMSCRIPTEN__) || defined(__NEWLIB__)
+   #if defined(__linux__) || defined(__linux) || defined(__sun__) || defined(__EMSCRIPTEN__) || defined(__NEWLIB__)
       #include <alloca.h>
    #endif
 #else // STB_VORBIS_NO_CRT

From 904ecbfc37cfdd0aa86c4cf280ac3f05f52465e3 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 03:15:43 -0700
Subject: [PATCH 043/170] stb_vorbis: Remove spurious assignment to val

This is definitely unnecessary, or at least I can't find anything
in the Vorbis spec that would indicate anything special happening
here.

Fixes issue #816.
---
 stb_vorbis.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 8288d09..8bda695 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -3872,8 +3872,7 @@ static int start_decoder(vorb *f)
                unsigned int div=1;
                for (k=0; k < c->dimensions; ++k) {
                   int off = (z / div) % c->lookup_values;
-                  float val = mults[off];
-                  val = mults[off]*c->delta_value + c->minimum_value + last;
+                  float val = mults[off]*c->delta_value + c->minimum_value + last;
                   c->multiplicands[j*c->dimensions + k] = val;
                   if (c->sequence_p)
                      last = val;

From e31da438e84db1d9285ba4933d43db1ec73a9a1b Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 16:08:30 -0700
Subject: [PATCH 044/170] stb_vorbis: Fix unused parameter warnings.

Some parameters do not get used, or only when certain config
defines are set. Explicitly mark them as unused to make compilers
happy.

Fixes issue #396.
---
 stb_vorbis.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 8bda695..38de648 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -649,6 +649,12 @@ typedef   signed int    int32;
 
 typedef float codetype;
 
+#ifdef _MSC_VER
+#define STBV_NOTUSED(v)  (void)(v)
+#else
+#define STBV_NOTUSED(v)  (void)sizeof(v)
+#endif
+
 // @NOTE
 //
 // Some arrays below are tagged "//varies", which means it's actually
@@ -3072,6 +3078,7 @@ static int do_floor(vorb *f, Mapping *map, int i, int n, float *target, YTYPE *f
       for (q=1; q < g->values; ++q) {
          j = g->sorted_order[q];
          #ifndef STB_VORBIS_NO_DEFER_FLOOR
+         STBV_NOTUSED(step2_flag);
          if (finalY[j] >= 0)
          #else
          if (step2_flag[j])
@@ -3174,6 +3181,7 @@ static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start,
 
 // WINDOWING
 
+   STBV_NOTUSED(left_end);
    n = f->blocksize[m->blockflag];
    map = &f->mapping[m->mapping];
 

From c817c9621ee306b0360cda5191e1054e1cb70daa Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 16:12:32 -0700
Subject: [PATCH 045/170] stb_vorbis: Add missing cast to uint to avoid UB

Fixes issue #574.
---
 stb_vorbis.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 38de648..a998923 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -4579,7 +4579,7 @@ static uint32 vorbis_find_page(stb_vorbis *f, uint32 *end, uint32 *last)
                header[i] = get8(f);
             if (f->eof) return 0;
             if (header[4] != 0) goto invalid;
-            goal = header[22] + (header[23] << 8) + (header[24]<<16) + (header[25]<<24);
+            goal = header[22] + (header[23] << 8) + (header[24]<<16) + ((uint32)header[25]<<24);
             for (i=22; i < 26; ++i)
                header[i] = 0;
             crc = 0;

From d5613c9511d23b7f0573cfe53b70fa9984d2dd29 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 16:19:46 -0700
Subject: [PATCH 046/170] stb_vorbis: A few UB fixes.

Fixes issue #1018.
---
 stb_vorbis.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index a998923..da1767d 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -1055,7 +1055,7 @@ static float float32_unpack(uint32 x)
    uint32 sign = x & 0x80000000;
    uint32 exp = (x & 0x7fe00000) >> 21;
    double res = sign ? -(double)mantissa : (double)mantissa;
-   return (float) ldexp((float)res, exp-788);
+   return (float) ldexp((float)res, (int)exp-788);
 }
 
 
@@ -3379,7 +3379,7 @@ static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start,
       // this isn't to spec, but spec would require us to read ahead
       // and decode the size of all current frames--could be done,
       // but presumably it's not a commonly used feature
-      f->current_loc = -n2; // start of first frame is positioned for discard
+      f->current_loc = 0u - n2; // start of first frame is positioned for discard (NB this is an intentional unsigned overflow/wrap-around)
       // we might have to discard samples "from" the next frame too,
       // if we're lapping a large block then a small at the start?
       f->discard_samples_deferred = n - right_end;
@@ -3963,7 +3963,7 @@ static int start_decoder(vorb *f)
                if (g->class_masterbooks[j] >= f->codebook_count) return error(f, VORBIS_invalid_setup);
             }
             for (k=0; k < 1 << g->class_subclasses[j]; ++k) {
-               g->subclass_books[j][k] = get_bits(f,8)-1;
+               g->subclass_books[j][k] = (int16)get_bits(f,8)-1;
                if (g->subclass_books[j][k] >= f->codebook_count) return error(f, VORBIS_invalid_setup);
             }
          }

From 39a06413859a8a3f9613145e69ae9494e34b6ff4 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 16:25:33 -0700
Subject: [PATCH 047/170] stb_vorbis: Clarify lifetime of pushdata *output
 buffers

Fixes issue #929.
---
 stb_vorbis.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index da1767d..4cd1699 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -223,6 +223,12 @@ extern int stb_vorbis_decode_frame_pushdata(
 // channel. In other words, (*output)[0][0] contains the first sample from
 // the first channel, and (*output)[1][0] contains the first sample from
 // the second channel.
+//
+// *output points into stb_vorbis's internal output buffer storage; these
+// buffers are owned by stb_vorbis and application code should not free
+// them or modify their contents. They are transient and will be overwritten
+// once you ask for more data to get decoded, so be sure to grab any data
+// you need before then.
 
 extern void stb_vorbis_flush_pushdata(stb_vorbis *f);
 // inform stb_vorbis that your next datablock will not be contiguous with

From 0def11ae179ea2ef7216ac1c4e0d42ee9e019892 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 16:35:45 -0700
Subject: [PATCH 048/170] stb_vorbis: Fix some unused variables.

Fixes issue #817.
---
 stb_vorbis.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 4cd1699..7c0e9cd 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -4989,7 +4989,7 @@ unsigned int stb_vorbis_stream_length_in_samples(stb_vorbis *f)
             // set. whoops!
             break;
          }
-         previous_safe = last_page_loc+1;
+         //previous_safe = last_page_loc+1; // NOTE: not used after this point, but note for debugging
          last_page_loc = stb_vorbis_get_file_offset(f);
       }
 
@@ -5309,8 +5309,6 @@ int stb_vorbis_get_samples_short_interleaved(stb_vorbis *f, int channels, short
    float **outputs;
    int len = num_shorts / channels;
    int n=0;
-   int z = f->channels;
-   if (z > channels) z = channels;
    while (n < len) {
       int k = f->channel_buffer_end - f->channel_buffer_start;
       if (n+k >= len) k = len - n;
@@ -5329,8 +5327,6 @@ int stb_vorbis_get_samples_short(stb_vorbis *f, int channels, short **buffer, in
 {
    float **outputs;
    int n=0;
-   int z = f->channels;
-   if (z > channels) z = channels;
    while (n < len) {
       int k = f->channel_buffer_end - f->channel_buffer_start;
       if (n+k >= len) k = len - n;

From 0d47d17002a532fa6dc118b4c95c95e557ac7ef3 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 19:26:31 -0700
Subject: [PATCH 049/170] stb_vorbis: Set error on open_memory with NULL data

"Unexpected EOF" seems like the closest match from the error
codes we have.

Fixes issue #1123.
---
 stb_vorbis.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 7c0e9cd..51e8eec 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -5100,7 +5100,10 @@ stb_vorbis * stb_vorbis_open_filename(const char *filename, int *error, const st
 stb_vorbis * stb_vorbis_open_memory(const unsigned char *data, int len, int *error, const stb_vorbis_alloc *alloc)
 {
    stb_vorbis *f, p;
-   if (data == NULL) return NULL;
+   if (!data) {
+      if (error) *error = VORBIS_unexpected_eof;
+      return NULL;
+   }
    vorbis_init(&p, alloc);
    p.stream = (uint8 *) data;
    p.stream_end = (uint8 *) data + len;

From 70136cd5f1f0f40fc5df529689797f67c3a79c07 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 20:54:18 -0700
Subject: [PATCH 050/170] stb_vorbis: Change imdct_step3_inner_s_loop_ld654

Released Clang 12 generates bad code for the original loop in here.
While this is a compiler bug plain and simple, we still have to deal
with it.

This is related to the SLP vectorizer, and in particular the two
reverse subtracts in the butterflies for the second half to avoid
unary negates.

Use the more regular dataflow that has the unary negates in it
(we can at least fold one of them into a constant, namely for A2)
and introduce a few temporaries that also make alias analysis (and
possible block-level vectorization) a whole let easier while I'm at
it.

This fixes the codegen issues on Clang 12, which now produces a
working decoder, and I expect the single unary negate that we
actually gain per iteration of this loop is not a significant
perf concern. (There are bigger fish to fry here regardless.)

Fixes issue #1152.
---
 stb_vorbis.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/stb_vorbis.c b/stb_vorbis.c
index 51e8eec..534e689 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -2592,34 +2592,33 @@ static void imdct_step3_inner_s_loop_ld654(int n, float *e, int i_off, float *A,
 
    while (z > base) {
       float k00,k11;
+      float l00,l11;
 
-      k00   = z[-0] - z[-8];
-      k11   = z[-1] - z[-9];
-      z[-0] = z[-0] + z[-8];
-      z[-1] = z[-1] + z[-9];
-      z[-8] =  k00;
-      z[-9] =  k11 ;
+      k00    = z[-0] - z[ -8];
+      k11    = z[-1] - z[ -9];
+      l00    = z[-2] - z[-10];
+      l11    = z[-3] - z[-11];
+      z[ -0] = z[-0] + z[ -8];
+      z[ -1] = z[-1] + z[ -9];
+      z[ -2] = z[-2] + z[-10];
+      z[ -3] = z[-3] + z[-11];
+      z[ -8] = k00;
+      z[ -9] = k11;
+      z[-10] = (l00+l11) * A2;
+      z[-11] = (l11-l00) * A2;
 
-      k00    = z[ -2] - z[-10];
-      k11    = z[ -3] - z[-11];
-      z[ -2] = z[ -2] + z[-10];
-      z[ -3] = z[ -3] + z[-11];
-      z[-10] = (k00+k11) * A2;
-      z[-11] = (k11-k00) * A2;
-
-      k00    = z[-12] - z[ -4];  // reverse to avoid a unary negation
+      k00    = z[ -4] - z[-12];
       k11    = z[ -5] - z[-13];
+      l00    = z[ -6] - z[-14];
+      l11    = z[ -7] - z[-15];
       z[ -4] = z[ -4] + z[-12];
       z[ -5] = z[ -5] + z[-13];
-      z[-12] = k11;
-      z[-13] = k00;
-
-      k00    = z[-14] - z[ -6];  // reverse to avoid a unary negation
-      k11    = z[ -7] - z[-15];
       z[ -6] = z[ -6] + z[-14];
       z[ -7] = z[ -7] + z[-15];
-      z[-14] = (k00+k11) * A2;
-      z[-15] = (k00-k11) * A2;
+      z[-12] = k11;
+      z[-13] = -k00;
+      z[-14] = (l11-l00) * A2;
+      z[-15] = (l00+l11) * -A2;
 
       iter_54(z);
       iter_54(z-8);

From e5fd7f6ce0a187bfdded1d6c1123152b7995dd39 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 21:09:20 -0700
Subject: [PATCH 051/170] stb: Remove stua entirely

As per issue 634, Stua "will be deleted from stb.h" soonish. That
was 3 years ago, which should be plenty of warning, and the
language has been de-facto orphaned and undocumented for a
good while longer than that.

Fixes issue #634.
---
 deprecated/stb.h | 1350 ----------------------------------------------
 1 file changed, 1350 deletions(-)

diff --git a/deprecated/stb.h b/deprecated/stb.h
index 03aec96..cd715e7 100644
--- a/deprecated/stb.h
+++ b/deprecated/stb.h
@@ -13058,1356 +13058,6 @@ char * stb__string_constant(char *file, int line, char *x)
 #endif // STB_DEFINE
 #endif // !STB_DEBUG && !STB_ALWAYS_H
 
-
-#ifdef STB_STUA
-#error "STUA is no longer supported"
-//////////////////////////////////////////////////////////////////////////
-//
-//  stua: little scripting language
-//
-//     define STB_STUA to compile it
-//
-//     see http://nothings.org/stb/stb_stua.html for documentation
-//
-//  basic parsing model:
-//
-//   lexical analysis
-//      use stb_lex() to parse tokens; keywords get their own tokens
-//
-//   parsing:
-//      recursive descent parser. too much of a hassle to make an unambiguous
-//      LR(1) grammar, and one-pass generation is clumsier (recursive descent
-//      makes it easier to e.g. compile nested functions). on the other hand,
-//      dictionary syntax required hackery to get extra lookahead.
-//
-//   codegen:
-//      output into an evaluation tree, using array indices as 'pointers'
-//
-//   run:
-//      traverse the tree; support for 'break/continue/return' is tricky
-//
-//   garbage collection:
-//      stu__mark and sweep; explicit stack with non-stu__compile_global_scope roots
-
-typedef stb_int32 stua_obj;
-
-typedef stb_idict stua_dict;
-
-STB_EXTERN void stua_run_script(char *s);
-STB_EXTERN void stua_uninit(void);
-
-extern stua_obj stua_globals;
-
-STB_EXTERN double   stua_number(stua_obj z);
-
-STB_EXTERN stua_obj stua_getnil(void);
-STB_EXTERN stua_obj stua_getfalse(void);
-STB_EXTERN stua_obj stua_gettrue(void);
-STB_EXTERN stua_obj stua_string(char *z);
-STB_EXTERN stua_obj stua_make_number(double d);
-STB_EXTERN stua_obj stua_box(int type, void *data, int size);
-
-enum
-{
-   STUA_op_negate=129,
-   STUA_op_shl,   STUA_op_ge,
-   STUA_op_shr,   STUA_op_le,
-   STUA_op_shru,
-   STUA_op_last
-};
-
-#define STUA_NO_VALUE   2     // equivalent to a tagged NULL
-STB_EXTERN stua_obj (*stua_overload)(int op, stua_obj a, stua_obj b, stua_obj c);
-
-STB_EXTERN stua_obj stua_error(char *err, ...);
-
-STB_EXTERN stua_obj stua_pushroot(stua_obj o);
-STB_EXTERN void     stua_poproot (   void   );
-
-
-#ifdef STB_DEFINE
-// INTERPRETER
-
-// 31-bit floating point implementation
-//   force the (1 << 30) bit (2nd highest bit) to be zero by re-biasing the exponent;
-//   then shift and set the bottom bit
-
-static stua_obj stu__floatp(float *f)
-{
-   unsigned int n = *(unsigned int *) f;
-   unsigned int e = n & (0xff << 23);
-
-   assert(sizeof(int) == 4 && sizeof(float) == 4);
-
-   if (!e)                    // zero?
-      n = n;                  //   no change
-   else if (e < (64 << 23))   // underflow of the packed encoding?
-      n = (n & 0x80000000);   //   signed 0
-   else if (e > (190 << 23))  // overflow of the encoding? (or INF or NAN)
-      n = (n & 0x80000000) + (127 << 23); // new INF encoding
-   else
-      n -= 0x20000000;
-
-   // now we need to shuffle the bits so that the spare bit is at the bottom
-   assert((n & 0x40000000) == 0);
-   return (n & 0x80000000) + (n << 1) + 1;
-}
-
-static unsigned char stu__getfloat_addend[256];
-static float stu__getfloat(stua_obj v)
-{
-   unsigned int n;
-   unsigned int e = ((unsigned int) v) >> 24;
-
-   n = (int) v >> 1;  // preserve high bit
-   n += stu__getfloat_addend[e] << 24;
-   return *(float *) &n;
-}
-
-stua_obj stua_float(float f)
-{
-   return stu__floatp(&f);
-}
-
-static void stu__float_init(void)
-{
-   int i;
-   stu__getfloat_addend[0]    = 0;   // do nothing to biased exponent of 0
-   for (i=1; i < 127; ++i)
-      stu__getfloat_addend[i] = 32;  // undo the -0x20000000
-   stu__getfloat_addend[127]  = 64;  // convert packed INF to INF (0x3f -> 0x7f)
-
-   for (i=0; i < 128; ++i) // for signed floats, remove the bit we just shifted down
-      stu__getfloat_addend[128+i] = stu__getfloat_addend[i] - 64;
-}
-
-// Tagged data type implementation
-
-                                                 // TAGS:
-#define stu__int_tag          0  // of 2 bits    //   00   int
-#define stu__float_tag        1  // of 1 bit     //   01   float
-#define stu__ptr_tag          2  // of 2 bits    //   10   boxed
-                                                 //   11   float
-
-#define stu__tag(x)           ((x) & 3)
-#define stu__number(x)        (stu__tag(x) != stu__ptr_tag)
-#define stu__isint(x)         (stu__tag(x) == stu__int_tag)
-
-#define stu__int(x)           ((x) >> 2)
-#define stu__float(x)         (stu__getfloat(x))
-
-#define stu__makeint(v)       ((v)*4+stu__int_tag)
-
-// boxed data, and tag support for boxed data
-
-enum
-{
-   STU___float    = 1,   STU___int      = 2,
-   STU___number   = 3,   STU___string   = 4,
-   STU___function = 5,   STU___dict     = 6,
-   STU___boolean  = 7,   STU___error    = 8,
-};
-
-// boxed data
-#define STU__BOX  short type, stua_gc
-typedef struct stu__box { STU__BOX; } stu__box;
-
-stu__box stu__nil   = { 0, 1 };
-stu__box stu__true  = { STU___boolean, 1, };
-stu__box stu__false = { STU___boolean, 1, };
-
-#define stu__makeptr(v)  ((stua_obj)     (v) + stu__ptr_tag)
-
-#define stua_nil    stu__makeptr(&stu__nil)
-#define stua_true   stu__makeptr(&stu__true)
-#define stua_false  stu__makeptr(&stu__false)
-
-stua_obj stua_getnil(void)   { return stua_nil; }
-stua_obj stua_getfalse(void) { return stua_false; }
-stua_obj stua_gettrue(void)  { return stua_true; }
-
-#define stu__ptr(x)      ((stu__box *) ((x) - stu__ptr_tag))
-
-#define stu__checkt(t,x) ((t) == STU___float  ? ((x) & 1) == stu__float_tag : \
-                          (t) == STU___int    ? stu__isint(x)               : \
-                          (t) == STU___number ? stu__number(x)              : \
-                          stu__tag(x) == stu__ptr_tag && stu__ptr(x)->type == (t))
-
-typedef struct
-{
-   STU__BOX;
-   void *ptr;
-} stu__wrapper;
-
-// implementation of a 'function' or function + closure
-
-typedef struct stu__func
-{
-   STU__BOX;
-   stua_obj closure_source;  // 0 - regular function; 4 - C function
-                             // if closure, pointer to source function
-   union {
-      stua_obj closure_data; // partial-application data
-      void *store;           // pointer to free that holds 'code'
-      stua_obj (*func)(stua_dict *context);
-   } f;
-   // closure ends here
-   short *code;
-   int num_param;
-   stua_obj *param;  // list of parameter strings
-} stu__func;
-
-// apply this to 'short *code' to get at data
-#define stu__const(f)  ((stua_obj *) (f))
-
-static void stu__free_func(stu__func *f)
-{
-   if (f->closure_source == 0)          free(f->f.store);
-   if ((stb_uint) f->closure_source <= 4)   free(f->param);
-   free(f);
-}
-
-#define stu__pd(x)       ((stua_dict *)    stu__ptr(x))
-#define stu__pw(x)       ((stu__wrapper *) stu__ptr(x))
-#define stu__pf(x)       ((stu__func *)    stu__ptr(x))
-
-
-// garbage-collection
-
-
-static stu__box ** stu__gc_ptrlist;
-static stua_obj * stu__gc_root_stack;
-
-stua_obj stua_pushroot(stua_obj o) { stb_arr_push(stu__gc_root_stack, o); return o; }
-void     stua_poproot (   void   ) { stb_arr_pop(stu__gc_root_stack); }
-
-static stb_sdict *stu__strings;
-static void stu__mark(stua_obj z)
-{
-   int i;
-   stu__box *p = stu__ptr(z);
-   if (p->stua_gc == 1) return; // already marked
-   assert(p->stua_gc == 0);
-   p->stua_gc = 1;
-   switch(p->type) {
-      case STU___function: {
-         stu__func *f = (stu__func *) p;
-         if ((stb_uint) f->closure_source <= 4) {
-            if (f->closure_source == 0) {
-               for (i=1; i <= f->code[0]; ++i)
-                  if (!stu__number(((stua_obj *) f->code)[-i]))
-                     stu__mark(((stua_obj *) f->code)[-i]);
-            }
-            for (i=0; i < f->num_param; ++i)
-               stu__mark(f->param[i]);
-         } else {
-            stu__mark(f->closure_source);
-            stu__mark(f->f.closure_data);
-         }
-         break;
-      }
-      case STU___dict: {
-         stua_dict *e = (stua_dict *) p;
-         for (i=0; i < e->limit; ++i)
-            if (e->table[i].k != STB_IEMPTY && e->table[i].k != STB_IDEL) {
-               if (!stu__number(e->table[i].k)) stu__mark((int) e->table[i].k);
-               if (!stu__number(e->table[i].v)) stu__mark((int) e->table[i].v);
-            }
-         break;
-      }
-   }
-}
-
-static int stu__num_allocs, stu__size_allocs;
-static stua_obj stu__flow_val = stua_nil; // used for break & return
-
-static void stua_gc(int force)
-{
-   int i;
-   if (!force && stu__num_allocs == 0 && stu__size_allocs == 0) return;
-   stu__num_allocs = stu__size_allocs = 0;
-   //printf("[gc]\n");
-
-   // clear marks
-   for (i=0; i < stb_arr_len(stu__gc_ptrlist); ++i)
-       stu__gc_ptrlist[i]->stua_gc = 0;
-
-   // stu__mark everything reachable
-   stu__nil.stua_gc = stu__true.stua_gc = stu__false.stua_gc = 1;
-   stu__mark(stua_globals);
-   if (!stu__number(stu__flow_val))
-      stu__mark(stu__flow_val);
-   for (i=0; i < stb_arr_len(stu__gc_root_stack); ++i)
-      if (!stu__number(stu__gc_root_stack[i]))
-         stu__mark(stu__gc_root_stack[i]);
-
-   // sweep unreachables
-   for (i=0; i < stb_arr_len(stu__gc_ptrlist);) {
-      stu__box *z = stu__gc_ptrlist[i];
-      if (!z->stua_gc) {
-         switch (z->type) {
-            case STU___dict:        stb_idict_destroy((stua_dict *) z); break;
-            case STU___error:       free(((stu__wrapper *) z)->ptr); break;
-            case STU___string:      stb_sdict_remove(stu__strings, (char*) ((stu__wrapper *) z)->ptr, NULL); free(z); break;
-            case STU___function:    stu__free_func((stu__func *) z); break;
-         }
-         // swap in the last item over this, and repeat
-         z = stb_arr_pop(stu__gc_ptrlist);
-         stu__gc_ptrlist[i] = z;
-      } else
-         ++i;
-   }
-}
-
-static void stu__consider_gc(stua_obj x)
-{
-   if (stu__size_allocs < 100000) return;
-   if (stu__num_allocs < 10 && stu__size_allocs < 1000000) return;
-   stb_arr_push(stu__gc_root_stack, x);
-   stua_gc(0);
-   stb_arr_pop(stu__gc_root_stack);
-}
-
-static stua_obj stu__makeobj(int type, void *data, int size, int safe_to_gc)
-{
-   stua_obj x = stu__makeptr(data);
-   ((stu__box *) data)->type = type;
-   stb_arr_push(stu__gc_ptrlist, (stu__box *) data);
-   stu__num_allocs  += 1;
-   stu__size_allocs += size;
-   if (safe_to_gc) stu__consider_gc(x);
-   return x;
-}
-
-stua_obj stua_box(int type, void *data, int size)
-{
-   stu__wrapper *p = (stu__wrapper *) malloc(sizeof(*p));
-   p->ptr = data;
-   return stu__makeobj(type, p, size, 0);
-}
-
-// a stu string can be directly compared for equality, because
-// they go into a hash table
-stua_obj stua_string(char *z)
-{
-   stu__wrapper *b = (stu__wrapper *) stb_sdict_get(stu__strings, z);
-   if (b == NULL) {
-      int o = stua_box(STU___string, NULL, strlen(z) + sizeof(*b));
-      b = stu__pw(o);
-      stb_sdict_add(stu__strings, z, b);
-      stb_sdict_getkey(stu__strings, z, (char **) &b->ptr);
-   }
-   return stu__makeptr(b);
-}
-
-// stb_obj dictionary is just an stb_idict
-static void     stu__set(stua_dict *d, stua_obj k, stua_obj v)
-{ if (stb_idict_set(d, k, v)) stu__size_allocs += 8; }
-
-static stua_obj stu__get(stua_dict *d, stua_obj k, stua_obj res)
-{
-   stb_idict_get_flag(d, k, &res);
-   return res;
-}
-
-static stua_obj make_string(char *z, int len)
-{
-   stua_obj s;
-   char temp[256], *q = (char *) stb_temp(temp, len+1), *p = q;
-   while (len > 0) {
-      if (*z == '\\') {
-              if (z[1] == 'n') *p = '\n';
-         else if (z[1] == 'r') *p = '\r';
-         else if (z[1] == 't') *p = '\t';
-         else                  *p = z[1];
-         p += 1; z += 2; len -= 2;
-      } else {
-         *p++ = *z++; len -= 1;
-      }
-   }
-   *p = 0;
-   s = stua_string(q);
-   stb_tempfree(temp, q);
-   return s;
-}
-
-enum token_names
-{
-   T__none=128,
-   ST_shl = STUA_op_shl,    ST_ge  = STUA_op_ge,
-   ST_shr = STUA_op_shr,    ST_le = STUA_op_le,
-   ST_shru = STUA_op_shru,  STU__negate = STUA_op_negate,
-   ST__reset_numbering = STUA_op_last,
-   ST_white,
-   ST_id, ST_float, ST_decimal, ST_hex, ST_char,ST_string, ST_number,
-   // make sure the keywords come _AFTER_ ST_id, so stb_lex prefer them
-   ST_if,      ST_while,    ST_for,     ST_eq,  ST_nil,
-   ST_then,    ST_do,       ST_in,      ST_ne,  ST_true,
-   ST_else,    ST_break,    ST_let,     ST_and, ST_false,
-   ST_elseif,  ST_continue, ST_into,    ST_or,  ST_repeat,
-   ST_end,     ST_as,       ST_return,  ST_var, ST_func,
-   ST_catch,   ST__frame,
-   ST__max_terminals,
-
-   STU__defaultparm, STU__seq,
-};
-
-static stua_dict  * stu__globaldict;
-       stua_obj     stua_globals;
-
-static enum
-{
-   FLOW_normal,  FLOW_continue,   FLOW_break,  FLOW_return,  FLOW_error,
-} stu__flow;
-
-stua_obj stua_error(char *z, ...)
-{
-   stua_obj a;
-   char temp[4096], *x;
-   va_list v; va_start(v,z); vsprintf(temp, z, v); va_end(v);
-   x = stb_p_strdup(temp);
-   a = stua_box(STU___error, x, strlen(x));
-   stu__flow = FLOW_error;
-   stu__flow_val = a;
-   return stua_nil;
-}
-
-double stua_number(stua_obj z)
-{
-   return stu__tag(z) == stu__int_tag ? stu__int(z) : stu__float(z);
-}
-
-stua_obj stua_make_number(double d)
-{
-   double e = floor(d);
-   if (e == d && e < (1 << 29) && e >= -(1 << 29))
-      return stu__makeint((int) e);
-   else
-      return stua_float((float) d);
-}
-
-stua_obj (*stua_overload)(int op, stua_obj a, stua_obj b, stua_obj c) = NULL;
-
-static stua_obj stu__op(int op, stua_obj a, stua_obj b, stua_obj c)
-{
-   stua_obj r = STUA_NO_VALUE;
-   if (op == '+') {
-      if (stu__checkt(STU___string, a) && stu__checkt(STU___string, b)) {
-         ;// @TODO: string concatenation
-      } else if (stu__checkt(STU___function, a) && stu__checkt(STU___dict, b)) {
-         stu__func *f = (stu__func *) malloc(12);
-         assert(offsetof(stu__func, code)==12);
-         f->closure_source = a;
-         f->f.closure_data = b;
-         return stu__makeobj(STU___function, f, 16, 1);
-      }
-   }
-   if (stua_overload) r = stua_overload(op,a,b,c);
-   if (stu__flow != FLOW_error && r == STUA_NO_VALUE)
-      stua_error("Typecheck for operator %d", op), r=stua_nil;
-   return r;
-}
-
-#define STU__EVAL2(a,b)             \
-          a = stu__eval(stu__f[n+1]);  if (stu__flow) break; stua_pushroot(a); \
-          b = stu__eval(stu__f[n+2]);  stua_poproot(); if (stu__flow) break;
-
-#define STU__FB(op)              \
-          STU__EVAL2(a,b)           \
-          if (stu__tag(a) == stu__int_tag && stu__tag(b) == stu__int_tag) \
-             return ((a) op (b));                 \
-          if (stu__number(a) && stu__number(b)) \
-             return stua_make_number(stua_number(a) op stua_number(b)); \
-          return stu__op(stu__f[n], a,b, stua_nil)
-
-#define STU__F(op)              \
-          STU__EVAL2(a,b)           \
-          if (stu__number(a) && stu__number(b)) \
-             return stua_make_number(stua_number(a) op stua_number(b)); \
-          return stu__op(stu__f[n], a,b, stua_nil)
-
-#define STU__I(op)               \
-          STU__EVAL2(a,b)           \
-          if (stu__tag(a) == stu__int_tag && stu__tag(b) == stu__int_tag) \
-             return stu__makeint(stu__int(a) op stu__int(b));                 \
-          return stu__op(stu__f[n], a,b, stua_nil)
-
-#define STU__C(op)               \
-          STU__EVAL2(a,b)           \
-          if (stu__number(a) && stu__number(b)) \
-             return (stua_number(a) op stua_number(b)) ? stua_true : stua_false; \
-          return stu__op(stu__f[n], a,b, stua_nil)
-
-#define STU__CE(op)              \
-          STU__EVAL2(a,b)           \
-          return (a op b) ? stua_true : stua_false
-
-static short *stu__f;
-static stua_obj  stu__f_obj;
-static stua_dict       *stu__c;
-static stua_obj stu__funceval(stua_obj fo, stua_obj co);
-
-static int stu__cond(stua_obj x)
-{
-   if (stu__flow) return 0;
-   if (!stu__checkt(STU___boolean, x))
-      x = stu__op('!', x, stua_nil, stua_nil);
-   if (x == stua_true ) return 1;
-   if (x == stua_false) return 0;
-   stu__flow = FLOW_error;
-   return 0;
-}
-
-// had to manually eliminate tailcall recursion for debugging complex stuff
-#define TAILCALL(x)   n = (x); goto top;
-static stua_obj stu__eval(int n)
-{
-top:
-   if (stu__flow >= FLOW_return) return stua_nil; // is this needed?
-   if (n < 0) return stu__const(stu__f)[n];
-   assert(n != 0 && n != 1);
-   switch (stu__f[n]) {
-      stua_obj a,b,c;
-      case ST_catch:   a = stu__eval(stu__f[n+1]);
-                       if (stu__flow == FLOW_error) { a=stu__flow_val; stu__flow = FLOW_normal; }
-                       return a;
-      case ST_var:     b = stu__eval(stu__f[n+2]); if (stu__flow) break;
-                       stu__set(stu__c, stu__const(stu__f)[stu__f[n+1]], b);
-                       return b;
-      case STU__seq:   stu__eval(stu__f[n+1]); if (stu__flow) break;
-                       TAILCALL(stu__f[n+2]);
-      case ST_if:      if (!stu__cond(stu__eval(stu__f[n+1]))) return stua_nil;
-                       TAILCALL(stu__f[n+2]);
-      case ST_else:    a = stu__cond(stu__eval(stu__f[n+1]));
-                       TAILCALL(stu__f[n + 2 + !a]);
-                       #define STU__HANDLE_BREAK            \
-                          if (stu__flow >= FLOW_break) {    \
-                             if (stu__flow == FLOW_break) { \
-                                a = stu__flow_val;          \
-                                stu__flow = FLOW_normal;    \
-                                stu__flow_val = stua_nil;   \
-                                return a;                   \
-                             }                              \
-                             return stua_nil;               \
-                          }
-      case ST_as:      stu__eval(stu__f[n+3]);
-                       STU__HANDLE_BREAK
-                       // fallthrough!
-      case ST_while:   a = stua_nil; stua_pushroot(a);
-                       while (stu__cond(stu__eval(stu__f[n+1]))) {
-                          stua_poproot();
-                          a = stu__eval(stu__f[n+2]);
-                          STU__HANDLE_BREAK
-                          stu__flow = FLOW_normal;  // clear 'continue' flag
-                          stua_pushroot(a);
-                          if (stu__f[n+3]) stu__eval(stu__f[n+3]);
-                          STU__HANDLE_BREAK
-                          stu__flow = FLOW_normal;  // clear 'continue' flag
-                       }
-                       stua_poproot();
-                       return a;
-      case ST_break:   stu__flow = FLOW_break;  stu__flow_val = stu__eval(stu__f[n+1]); break;
-      case ST_continue:stu__flow = FLOW_continue; break;
-      case ST_return:  stu__flow = FLOW_return; stu__flow_val = stu__eval(stu__f[n+1]); break;
-      case ST__frame:  return stu__f_obj;
-      case '[':        STU__EVAL2(a,b);
-                       if (stu__checkt(STU___dict, a))
-                          return stu__get(stu__pd(a), b, stua_nil);
-                       return stu__op(stu__f[n], a, b, stua_nil);
-      case '=':        a = stu__eval(stu__f[n+2]); if (stu__flow) break;
-                       n = stu__f[n+1];
-                       if (stu__f[n] == ST_id) {
-                          if (!stb_idict_update(stu__c, stu__const(stu__f)[stu__f[n+1]], a))
-                             if (!stb_idict_update(stu__globaldict, stu__const(stu__f)[stu__f[n+1]], a))
-                                return stua_error("Assignment to undefined variable");
-                       } else if (stu__f[n] == '[') {
-                          stua_pushroot(a);
-                          b = stu__eval(stu__f[n+1]); if (stu__flow) { stua_poproot(); break; }
-                          stua_pushroot(b);
-                          c = stu__eval(stu__f[n+2]); stua_poproot(); stua_poproot();
-                          if (stu__flow) break;
-                          if (!stu__checkt(STU___dict, b)) return stua_nil;
-                          stu__set(stu__pd(b), c, a);
-                       } else {
-                          return stu__op(stu__f[n], stu__eval(n), a, stua_nil);
-                       }
-                       return a;
-      case STU__defaultparm:
-                       a = stu__eval(stu__f[n+2]);
-                       stu__flow = FLOW_normal;
-                       if (stb_idict_add(stu__c, stu__const(stu__f)[stu__f[n+1]], a))
-                          stu__size_allocs += 8;
-                       return stua_nil;
-      case ST_id:      a = stu__get(stu__c, stu__const(stu__f)[stu__f[n+1]], STUA_NO_VALUE); // try local variable
-                       return a != STUA_NO_VALUE       // else try stu__compile_global_scope variable
-                            ? a : stu__get(stu__globaldict, stu__const(stu__f)[stu__f[n+1]], stua_nil);
-      case STU__negate:a = stu__eval(stu__f[n+1]); if (stu__flow) break;
-                       return stu__isint(a) ? -a : stu__op(stu__f[n], a, stua_nil, stua_nil);
-      case '~':        a = stu__eval(stu__f[n+1]); if (stu__flow) break;
-                       return stu__isint(a) ? (~a)&~3 : stu__op(stu__f[n], a, stua_nil, stua_nil);
-      case '!':        a = stu__eval(stu__f[n+1]); if (stu__flow) break;
-                       a = stu__cond(a); if (stu__flow) break;
-                       return a ? stua_true : stua_false;
-      case ST_eq: STU__CE(==); case ST_le: STU__C(<=); case '<': STU__C(<);
-      case ST_ne: STU__CE(!=); case ST_ge: STU__C(>=); case '>': STU__C(>);
-      case '+' : STU__FB(+);  case '*': STU__F(*);  case '&': STU__I(&); case ST_shl: STU__I(<<);
-      case '-' : STU__FB(-);  case '/': STU__F(/);  case '|': STU__I(|); case ST_shr: STU__I(>>);
-                             case '%': STU__I(%);  case '^': STU__I(^);
-      case ST_shru:    STU__EVAL2(a,b);
-                       if (stu__tag(a) == stu__int_tag && stu__tag(b) == stu__int_tag)
-                          return stu__makeint((unsigned) stu__int(a) >> stu__int(b));
-                       return stu__op(stu__f[n], a,b, stua_nil);
-      case ST_and:      a = stu__eval(stu__f[n+1]); b = stu__cond(a); if (stu__flow) break;
-                       return a ? stu__eval(stu__f[n+2]) : a;
-      case ST_or :      a = stu__eval(stu__f[n+1]); b = stu__cond(a); if (stu__flow) break;
-                       return a ? b : stu__eval(stu__f[n+2]);
-      case'(':case':': STU__EVAL2(a,b);
-                       if (!stu__checkt(STU___function, a))
-                           return stu__op(stu__f[n], a,b, stua_nil);
-                       if (!stu__checkt(STU___dict, b))
-                           return stua_nil;
-                       if (stu__f[n] == ':')
-                          b = stu__makeobj(STU___dict, stb_idict_copy(stu__pd(b)), stb_idict_memory_usage(stu__pd(b)), 0);
-                       a = stu__funceval(a,b);
-                       return a;
-      case '{' :    {
-                       stua_dict *d;
-                       d = stb_idict_new_size(stu__f[n+1] > 40 ? 64 : 16);
-                       if (d == NULL)
-                          return stua_nil; // breakpoint fodder
-                       c = stu__makeobj(STU___dict, d, 32, 1);
-                       stua_pushroot(c);
-                       a = stu__f[n+1];
-                       for (b=0; b < a; ++b) {
-                          stua_obj x = stua_pushroot(stu__eval(stu__f[n+2 + b*2 + 0]));
-                          stua_obj y = stu__eval(stu__f[n+2 + b*2 + 1]);
-                          stua_poproot();
-                          if (stu__flow) { stua_poproot(); return stua_nil; }
-                          stu__set(d, x, y);
-                       }
-                       stua_poproot();
-                       return c;
-                    }
-      default:         if (stu__f[n] < 0) return stu__const(stu__f)[stu__f[n]];
-                       assert(0); /* NOTREACHED */ // internal error!
-   }
-   return stua_nil;
-}
-
-int stb__stua_nesting;
-static stua_obj stu__funceval(stua_obj fo, stua_obj co)
-{
-   stu__func *f = stu__pf(fo);
-   stua_dict *context = stu__pd(co);
-   int i,j;
-   stua_obj p;
-   short *tf = stu__f;     // save previous function
-   stua_dict *tc = stu__c;
-
-   if (stu__flow == FLOW_error) return stua_nil;
-   assert(stu__flow == FLOW_normal);
-
-   stua_pushroot(fo);
-   stua_pushroot(co);
-   stu__consider_gc(stua_nil);
-
-   while ((stb_uint) f->closure_source > 4) {
-      // add data from closure to context
-      stua_dict *e = (stua_dict *) stu__pd(f->f.closure_data);
-      for (i=0; i < e->limit; ++i)
-         if (e->table[i].k != STB_IEMPTY && e->table[i].k != STB_IDEL)
-            if (stb_idict_add(context, e->table[i].k, e->table[i].v))
-               stu__size_allocs += 8;
-            // use add so if it's already defined, we don't override it; that way
-            // explicit parameters win over applied ones, and most recent applications
-            // win over previous ones
-      f = stu__pf(f->closure_source);
-   }
-
-   for (j=0, i=0; i < f->num_param; ++i)
-      // if it doesn't already exist, add it from the numbered parameters
-      if (stb_idict_add(context, f->param[i], stu__get(context, stu__int(j), stua_nil)))
-         ++j;
-
-   // @TODO: if (stu__get(context, stu__int(f->num_param+1)) != STUA_NO_VALUE) // error: too many parameters
-   // @TODO: ditto too few parameters
-
-   if (f->closure_source == 4)
-      p = f->f.func(context);
-   else {
-      stu__f = f->code, stu__c = context;
-      stu__f_obj = co;
-      ++stb__stua_nesting;
-      if (stu__f[1])
-         p = stu__eval(stu__f[1]);
-      else
-         p = stua_nil;
-      --stb__stua_nesting;
-      stu__f = tf, stu__c = tc;  // restore previous function
-      if (stu__flow == FLOW_return) {
-         stu__flow = FLOW_normal;
-         p = stu__flow_val;
-         stu__flow_val = stua_nil;
-      }
-   }
-
-   stua_poproot();
-   stua_poproot();
-
-   return p;
-}
-
-// Parser
-
-static int stu__tok;
-static stua_obj stu__tokval;
-
-static char *stu__curbuf, *stu__bufstart;
-
-static stb_matcher *stu__lex_matcher;
-
-static unsigned char stu__prec[ST__max_terminals], stu__end[ST__max_terminals];
-
-static void stu__nexttoken(void)
-{
-   int len;
-
-retry:
-   stu__tok = stb_lex(stu__lex_matcher, stu__curbuf, &len);
-   if (stu__tok == 0)
-      return;
-   switch(stu__tok) {
-      case ST_white  : stu__curbuf += len; goto retry;
-      case T__none  : stu__tok = *stu__curbuf; break;
-      case ST_string:  stu__tokval = make_string(stu__curbuf+1, len-2); break;
-      case ST_id    :  stu__tokval = make_string(stu__curbuf, len); break;
-      case ST_hex    : stu__tokval = stu__makeint(strtol(stu__curbuf+2,NULL,16)); stu__tok = ST_number; break;
-      case ST_decimal: stu__tokval = stu__makeint(strtol(stu__curbuf  ,NULL,10)); stu__tok = ST_number; break;
-      case ST_float  : stu__tokval = stua_float((float) atof(stu__curbuf))       ; stu__tok = ST_number; break;
-      case ST_char   : stu__tokval = stu__curbuf[2] == '\\' ? stu__curbuf[3] : stu__curbuf[2];
-                      if (stu__curbuf[3] == 't') stu__tokval = '\t';
-                      if (stu__curbuf[3] == 'n') stu__tokval = '\n';
-                      if (stu__curbuf[3] == 'r') stu__tokval = '\r';
-                      stu__tokval = stu__makeint(stu__tokval);
-                      stu__tok  = ST_number;
-                      break;
-   }
-   stu__curbuf += len;
-}
-
-static struct { int stu__tok; char *regex; } stu__lexemes[] =
-{
-   ST_white  , "([ \t\n\r]|/\\*(.|\n)*\\*/|//[^\r\n]*([\r\n]|$))+",
-   ST_id     , "[_a-zA-Z][_a-zA-Z0-9]*",
-   ST_hex    , "0x[0-9a-fA-F]+",
-   ST_decimal, "[0-9]+[0-9]*",
-   ST_float  , "[0-9]+\\.?[0-9]*([eE][-+]?[0-9]+)?",
-   ST_float  , "\\.[0-9]+([eE][-+]?[0-9]+)?",
-   ST_char   , "c'(\\\\.|[^\\'])'",
-   ST_string , "\"(\\\\.|[^\\\"\n\r])*\"",
-   ST_string , "\'(\\\\.|[^\\\'\n\r])*\'",
-
-   #define stua_key4(a,b,c,d)  ST_##a, #a, ST_##b, #b, ST_##c, #c, ST_##d, #d,
-   stua_key4(if,then,else,elseif)    stua_key4(while,do,for,in)
-   stua_key4(func,var,let,break)     stua_key4(nil,true,false,end)
-   stua_key4(return,continue,as,repeat) stua_key4(_frame,catch,catch,catch)
-
-   ST_shl, "<<",   ST_and, "&&",  ST_eq,  "==",  ST_ge, ">=",
-   ST_shr, ">>",   ST_or , "||",  ST_ne,  "!=",  ST_le, "<=",
-   ST_shru,">>>",  ST_into, "=>",
-   T__none, ".",
-};
-
-typedef struct
-{
-   stua_obj  *data;    // constants being compiled
-   short     *code;    // code being compiled
-   stua_dict *locals;
-   short     *non_local_refs;
-} stu__comp_func;
-
-static stu__comp_func stu__pfunc;
-static stu__comp_func *func_stack = NULL;
-static void stu__push_func_comp(void)
-{
-   stb_arr_push(func_stack, stu__pfunc);
-   stu__pfunc.data = NULL;
-   stu__pfunc.code = NULL;
-   stu__pfunc.locals = stb_idict_new_size(16);
-   stu__pfunc.non_local_refs = NULL;
-   stb_arr_push(stu__pfunc.code, 0); // number of data items
-   stb_arr_push(stu__pfunc.code, 1); // starting execution address
-}
-
-static void stu__pop_func_comp(void)
-{
-   stb_arr_free(stu__pfunc.code);
-   stb_arr_free(stu__pfunc.data);
-   stb_idict_destroy(stu__pfunc.locals);
-   stb_arr_free(stu__pfunc.non_local_refs);
-   stu__pfunc = stb_arr_pop(func_stack);
-}
-
-// if an id is a reference to an outer lexical scope, this
-// function returns the "name" of it, and updates the stack
-// structures to make sure the names are propagated in.
-static int stu__nonlocal_id(stua_obj var_obj)
-{
-   stua_obj dummy, var = var_obj;
-   int i, n = stb_arr_len(func_stack), j,k;
-   if (stb_idict_get_flag(stu__pfunc.locals, var, &dummy)) return 0;
-   for (i=n-1; i > 1; --i) {
-      if (stb_idict_get_flag(func_stack[i].locals, var, &dummy))
-         break;
-   }
-   if (i <= 1) return 0; // stu__compile_global_scope
-   j = i; // need to access variable from j'th frame
-   for (i=0; i < stb_arr_len(stu__pfunc.non_local_refs); ++i)
-      if (stu__pfunc.non_local_refs[i] == j) return j-n;
-   stb_arr_push(stu__pfunc.non_local_refs, j-n);
-   // now make sure all the parents propagate it down
-   for (k=n-1; k > 1; --k) {
-      if (j-k >= 0) return j-n; // comes direct from this parent
-      for(i=0; i < stb_arr_len(func_stack[k].non_local_refs); ++i)
-         if (func_stack[k].non_local_refs[i] == j-k)
-            return j-n;
-      stb_arr_push(func_stack[k].non_local_refs, j-k);
-   }
-   assert (k != 1);
-
-   return j-n;
-}
-
-static int stu__off(void)                { return stb_arr_len(stu__pfunc.code); }
-static void stu__cc(int a)
-{
-   assert(a >= -2000 && a < 5000);
-   stb_arr_push(stu__pfunc.code, a);
-}
-static int stu__cc1(int a)                      { stu__cc(a); return stu__off()-1; }
-static int stu__cc2(int a, int b)               { stu__cc(a); stu__cc(b); return stu__off()-2; }
-static int stu__cc3(int a, int b, int c)        {
- if (a == '=') assert(c != 0);
- stu__cc(a); stu__cc(b); stu__cc(c); return stu__off()-3; }
-static int stu__cc4(int a, int b, int c, int d) { stu__cc(a); stu__cc(b); stu__cc(c); stu__cc(d); return stu__off()-4; }
-
-static int stu__cdv(stua_obj p)
-{
-   int i;
-   assert(p != STUA_NO_VALUE);
-   for (i=0; i < stb_arr_len(stu__pfunc.data); ++i)
-      if (stu__pfunc.data[i] == p)
-         break;
-   if (i == stb_arr_len(stu__pfunc.data))
-      stb_arr_push(stu__pfunc.data, p);
-   return ~i;
-}
-
-static int stu__cdt(void)
-{
-   int z = stu__cdv(stu__tokval);
-   stu__nexttoken();
-   return z;
-}
-
-static int stu__seq(int a, int b)
-{
-   return !a ? b : !b ? a : stu__cc3(STU__seq, a,b);
-}
-
-static char stu__comp_err_str[1024];
-static int stu__comp_err_line;
-static int stu__err(char *str, ...)
-{
-   va_list v;
-   char *s = stu__bufstart;
-   stu__comp_err_line = 1;
-   while (s < stu__curbuf) {
-      if (s[0] == '\n' || s[0] == '\r') {
-         if (s[0]+s[1] == '\n' + '\r') ++s;
-         ++stu__comp_err_line;
-      }
-      ++s;
-   }
-   va_start(v, str);
-   vsprintf(stu__comp_err_str, str, v);
-   va_end(v);
-   return 0;
-}
-
-static int stu__accept(int p)
-{
-   if (stu__tok != p) return 0;
-   stu__nexttoken();
-   return 1;
-}
-
-static int stu__demand(int p)
-{
-   if (stu__accept(p)) return 1;
-   return stu__err("Didn't find expected stu__tok");
-}
-
-static int stu__demandv(int p, stua_obj *val)
-{
-   if (stu__tok == p || p==0) {
-      *val = stu__tokval;
-      stu__nexttoken();
-      return 1;
-   } else
-      return 0;
-}
-
-static int stu__expr(int p);
-int stu__nexpr(int p) { stu__nexttoken(); return stu__expr(p); }
-static int stu__statements(int once, int as);
-
-static int stu__parse_if(void)      // parse both ST_if and ST_elseif
-{
-   int b,c,a;
-   a = stu__nexpr(1);               if (!a) return 0;
-   if (!stu__demand(ST_then))       return stu__err("expecting THEN");
-   b = stu__statements(0,0);        if (!b) return 0;
-   if (b == 1) b = -1;
-
-   if (stu__tok == ST_elseif) {
-      return stu__parse_if();
-   } else if (stu__accept(ST_else)) {
-      c = stu__statements(0,0); if (!c) return 0;
-      if (!stu__demand(ST_end)) return stu__err("expecting END after else clause");
-      return stu__cc4(ST_else, a, b, c);
-   } else {
-      if (!stu__demand(ST_end)) return stu__err("expecting END in if statement");
-      return stu__cc3(ST_if, a, b);
-   }
-}
-
-int stu__varinit(int z, int in_globals)
-{
-   int a,b;
-   stu__nexttoken();
-   while (stu__demandv(ST_id, &b)) {
-      if (!stb_idict_add(stu__pfunc.locals, b, 1))
-         if (!in_globals) return stu__err("Redefined variable %s.", stu__pw(b)->ptr);
-      if (stu__accept('=')) {
-         a = stu__expr(1);       if (!a) return 0;
-      } else
-         a = stu__cdv(stua_nil);
-      z = stu__seq(z, stu__cc3(ST_var, stu__cdv(b), a));
-      if (!stu__accept(',')) break;
-   }
-   return z;
-}
-
-static int stu__compile_unary(int z, int outparm, int require_inparm)
-{
-   int op = stu__tok, a, b;
-   stu__nexttoken();
-   if (outparm) {
-      if (require_inparm || (stu__tok && stu__tok != ST_end && stu__tok != ST_else && stu__tok != ST_elseif && stu__tok !=';')) {
-         a = stu__expr(1); if (!a) return 0;
-      } else
-         a = stu__cdv(stua_nil);
-      b = stu__cc2(op, a);
-   } else
-      b = stu__cc1(op);
-   return stu__seq(z,b);
-}
-
-static int stu__assign(void)
-{
-   int z;
-   stu__accept(ST_let);
-   z = stu__expr(1); if (!z) return 0;
-   if (stu__accept('=')) {
-      int y,p = (z >= 0 ? stu__pfunc.code[z] : 0);
-      if (z < 0 || (p != ST_id && p != '[')) return stu__err("Invalid lvalue in assignment");
-      y = stu__assign();         if (!y) return 0;
-      z = stu__cc3('=', z, y);
-   }
-   return z;
-}
-
-static int stu__statements(int once, int stop_while)
-{
-   int a,b, c, z=0;
-   for(;;) {
-      switch (stu__tok) {
-         case ST_if     : a = stu__parse_if(); if (!a) return 0;
-                          z = stu__seq(z, a);
-                          break;
-         case ST_while  : if (stop_while) return (z ? z:1);
-                          a = stu__nexpr(1); if (!a) return 0;
-                          if (stu__accept(ST_as)) c = stu__statements(0,0); else c = 0;
-                          if (!stu__demand(ST_do)) return stu__err("expecting DO");
-                          b = stu__statements(0,0); if (!b) return 0;
-                          if (!stu__demand(ST_end)) return stu__err("expecting END");
-                          if (b == 1) b = -1;
-                          z = stu__seq(z, stu__cc4(ST_while, a, b, c));
-                          break;
-         case ST_repeat : stu__nexttoken();
-                          c = stu__statements(0,1); if (!c) return 0;
-                          if (!stu__demand(ST_while)) return stu__err("expecting WHILE");
-                          a = stu__expr(1); if (!a) return 0;
-                          if (!stu__demand(ST_do)) return stu__err("expecting DO");
-                          b = stu__statements(0,0); if (!b) return 0;
-                          if (!stu__demand(ST_end)) return stu__err("expecting END");
-                          if (b == 1) b = -1;
-                          z = stu__seq(z, stu__cc4(ST_as, a, b, c));
-                          break;
-         case ST_catch  : a = stu__nexpr(1); if (!a) return 0;
-                          z = stu__seq(z, stu__cc2(ST_catch, a));
-                          break;
-         case ST_var    : z = stu__varinit(z,0); break;
-         case ST_return : z = stu__compile_unary(z,1,1); break;
-         case ST_continue:z = stu__compile_unary(z,0,0); break;
-         case ST_break  : z = stu__compile_unary(z,1,0); break;
-         case ST_into   : if (z == 0 && !once) return stu__err("=> cannot be first statement in block");
-                          a = stu__nexpr(99);
-                          b = (a >= 0? stu__pfunc.code[a] : 0);
-                          if (a < 0 || (b != ST_id && b != '[')) return stu__err("Invalid lvalue on right side of =>");
-                          z = stu__cc3('=', a, z);
-                          break;
-         default        : if (stu__end[stu__tok]) return once ? 0 : (z ? z:1);
-                          a = stu__assign(); if (!a) return 0;
-                          stu__accept(';');
-                          if (stu__tok && !stu__end[stu__tok]) {
-                             if (a < 0)
-                                return stu__err("Constant has no effect");
-                             if (stu__pfunc.code[a] != '(' && stu__pfunc.code[a] != '=')
-                                return stu__err("Expression has no effect");
-                          }
-                          z = stu__seq(z, a);
-                          break;
-      }
-      if (!z) return 0;
-      stu__accept(';');
-      if (once && stu__tok != ST_into) return z;
-   }
-}
-
-static int stu__postexpr(int z, int p);
-static int stu__dictdef(int end, int *count)
-{
-   int z,n=0,i,flags=0;
-   short *dict=NULL;
-   stu__nexttoken();
-   while (stu__tok != end) {
-      if (stu__tok == ST_id) {
-         stua_obj id = stu__tokval;
-         stu__nexttoken();
-         if (stu__tok == '=') {
-            flags |= 1;
-            stb_arr_push(dict, stu__cdv(id));
-            z = stu__nexpr(1); if (!z) return 0;
-         } else {
-            z = stu__cc2(ST_id, stu__cdv(id));
-            z = stu__postexpr(z,1); if (!z) return 0;
-            flags |= 2;
-            stb_arr_push(dict, stu__cdv(stu__makeint(n++)));
-         }
-      } else {
-         z = stu__expr(1); if (!z) return 0;
-         flags |= 2;
-         stb_arr_push(dict, stu__cdv(stu__makeint(n++)));
-      }
-      if (end != ')' && flags == 3) { z=stu__err("can't mix initialized and uninitialized defs"); goto done;}
-      stb_arr_push(dict, z);
-      if (!stu__accept(',')) break;
-   }
-   if (!stu__demand(end))
-      return stu__err(end == ')' ? "Expecting ) at end of function call"
-                                 : "Expecting } at end of dictionary definition");
-   z = stu__cc2('{', stb_arr_len(dict)/2);
-   for (i=0; i < stb_arr_len(dict); ++i)
-      stu__cc(dict[i]);
-   if (count) *count = n;
-done:
-   stb_arr_free(dict);
-   return z;
-}
-
-static int stu__comp_id(void)
-{
-   int z,d;
-   d = stu__nonlocal_id(stu__tokval);
-   if (d == 0)
-      return z = stu__cc2(ST_id, stu__cdt());
-   // access a non-local frame by naming it with the appropriate int
-   assert(d < 0);
-   z = stu__cdv(d);            // relative frame # is the 'variable' in our local frame
-   z = stu__cc2(ST_id, z);     // now access that dictionary
-   return stu__cc3('[', z, stu__cdt()); // now access the variable from that dir
-}
-
-static stua_obj stu__funcdef(stua_obj *id, stua_obj *func);
-static int stu__expr(int p)
-{
-   int z;
-   // unary
-   switch (stu__tok) {
-      case ST_number: z = stu__cdt(); break;
-      case ST_string: z = stu__cdt(); break;  // @TODO - string concatenation like C
-      case ST_id    : z = stu__comp_id(); break;
-      case ST__frame: z = stu__cc1(ST__frame); stu__nexttoken(); break;
-      case ST_func  : z = stu__funcdef(NULL,NULL); break;
-      case ST_if    : z = stu__parse_if(); break;
-      case ST_nil   : z = stu__cdv(stua_nil); stu__nexttoken(); break;
-      case ST_true  : z = stu__cdv(stua_true); stu__nexttoken(); break;
-      case ST_false : z = stu__cdv(stua_false); stu__nexttoken(); break;
-      case '-'      : z = stu__nexpr(99); if (z) z=stu__cc2(STU__negate,z); else return z; break;
-      case '!'      : z = stu__nexpr(99); if (z) z=stu__cc2('!',z); else return z; break;
-      case '~'      : z = stu__nexpr(99); if (z) z=stu__cc2('~',z); else return z; break;
-      case '{'      : z = stu__dictdef('}', NULL); break;
-      default       : return stu__err("Unexpected token");
-      case '('      : stu__nexttoken(); z = stu__statements(0,0); if (!stu__demand(')')) return stu__err("Expecting )");
-   }
-   return stu__postexpr(z,p);
-}
-
-static int stu__postexpr(int z, int p)
-{
-   int q;
-   // postfix
-   while (stu__tok == '(' || stu__tok == '[' || stu__tok == '.') {
-      if (stu__accept('.')) {
-         // MUST be followed by a plain identifier! use [] for other stuff
-         if (stu__tok != ST_id) return stu__err("Must follow . with plain name; try [] instead");
-         z = stu__cc3('[', z, stu__cdv(stu__tokval));
-         stu__nexttoken();
-      } else if (stu__accept('[')) {
-         while (stu__tok != ']') {
-            int r = stu__expr(1); if (!r) return 0;
-            z = stu__cc3('[', z, r);
-            if (!stu__accept(',')) break;
-         }
-         if (!stu__demand(']')) return stu__err("Expecting ]");
-      } else {
-         int n, p = stu__dictdef(')', &n); if (!p) return 0;
-         #if 0 // this is incorrect!
-         if (z > 0 && stu__pfunc.code[z] == ST_id) {
-            stua_obj q = stu__get(stu__globaldict, stu__pfunc.data[-stu__pfunc.code[z+1]-1], stua_nil);
-            if (stu__checkt(STU___function, q))
-               if ((stu__pf(q))->num_param != n)
-                  return stu__err("Incorrect number of parameters");
-         }
-         #endif
-         z = stu__cc3('(', z, p);
-      }
-   }
-   // binop - this implementation taken from lcc
-   for (q=stu__prec[stu__tok]; q >= p; --q) {
-      while (stu__prec[stu__tok] == q) {
-         int o = stu__tok, y = stu__nexpr(p+1); if (!y) return 0;
-         z = stu__cc3(o,z,y);
-      }
-   }
-   return z;
-}
-
-static stua_obj stu__finish_func(stua_obj *param, int start)
-{
-   int n, size;
-   stu__func *f = (stu__func *) malloc(sizeof(*f));
-   f->closure_source = 0;
-   f->num_param = stb_arr_len(param);
-   f->param = (int *) stb_copy(param, f->num_param * sizeof(*f->param));
-   size = stb_arr_storage(stu__pfunc.code) + stb_arr_storage(stu__pfunc.data) + sizeof(*f) + 8;
-   f->f.store = malloc(stb_arr_storage(stu__pfunc.code) + stb_arr_storage(stu__pfunc.data));
-   f->code = (short *) ((char *) f->f.store + stb_arr_storage(stu__pfunc.data));
-   memcpy(f->code, stu__pfunc.code, stb_arr_storage(stu__pfunc.code));
-   f->code[1] = start;
-   f->code[0] = stb_arr_len(stu__pfunc.data);
-   for (n=0; n < f->code[0]; ++n)
-      ((stua_obj *) f->code)[-1-n] = stu__pfunc.data[n];
-   return stu__makeobj(STU___function, f, size, 0);
-}
-
-static int stu__funcdef(stua_obj *id, stua_obj *result)
-{
-   int n,z=0,i,q;
-   stua_obj *param = NULL;
-   short *nonlocal;
-   stua_obj v,f=stua_nil;
-   assert(stu__tok == ST_func);
-   stu__nexttoken();
-   if (id) {
-      if (!stu__demandv(ST_id, id)) return stu__err("Expecting function name");
-   } else
-      stu__accept(ST_id);
-   if (!stu__demand('(')) return stu__err("Expecting ( for function parameter");
-   stu__push_func_comp();
-   while (stu__tok != ')') {
-      if (!stu__demandv(ST_id, &v)) { z=stu__err("Expecting parameter name"); goto done; }
-      stb_idict_add(stu__pfunc.locals, v, 1);
-      if (stu__tok == '=') {
-         n = stu__nexpr(1); if (!n) { z=0; goto done; }
-         z = stu__seq(z, stu__cc3(STU__defaultparm, stu__cdv(v), n));
-      } else
-         stb_arr_push(param, v);
-      if (!stu__accept(',')) break;
-   }
-   if (!stu__demand(')'))   { z=stu__err("Expecting ) at end of parameter list"); goto done; }
-   n = stu__statements(0,0);   if (!n) { z=0; goto done; }
-   if (!stu__demand(ST_end)) { z=stu__err("Expecting END at end of function"); goto done; }
-   if (n == 1) n = 0;
-   n = stu__seq(z,n);
-   f = stu__finish_func(param, n);
-   if (result) { *result = f; z=1; stu__pop_func_comp(); }
-   else {
-      nonlocal = stu__pfunc.non_local_refs;
-      stu__pfunc.non_local_refs = NULL;
-      stu__pop_func_comp();
-      z = stu__cdv(f);
-      if (nonlocal) {  // build a closure with references to the needed frames
-         short *initcode = NULL;
-         for (i=0; i < stb_arr_len(nonlocal); ++i) {
-            int k = nonlocal[i], p;
-            stb_arr_push(initcode, stu__cdv(k));
-            if (k == -1) p = stu__cc1(ST__frame);
-            else { p = stu__cdv(stu__makeint(k+1)); p = stu__cc2(ST_id, p); }
-            stb_arr_push(initcode, p);
-         }
-         q = stu__cc2('{', stb_arr_len(nonlocal));
-         for (i=0; i < stb_arr_len(initcode); ++i)
-            stu__cc(initcode[i]);
-         z = stu__cc3('+', z, q);
-         stb_arr_free(initcode);
-      }
-      stb_arr_free(nonlocal);
-   }
-done:
-   stb_arr_free(param);
-   if (!z) stu__pop_func_comp();
-   return z;
-}
-
-static int stu__compile_global_scope(void)
-{
-   stua_obj o;
-   int z=0;
-
-   stu__push_func_comp();
-   while (stu__tok != 0) {
-      if (stu__tok == ST_func) {
-         stua_obj id, f;
-         if (!stu__funcdef(&id,&f))
-            goto error;
-         stu__set(stu__globaldict, id, f);
-      } else if (stu__tok == ST_var) {
-         z = stu__varinit(z,1); if (!z) goto error;
-      } else {
-         int y = stu__statements(1,0); if (!y) goto error;
-         z = stu__seq(z,y);
-      }
-      stu__accept(';');
-   }
-   o = stu__finish_func(NULL, z);
-   stu__pop_func_comp();
-
-   o = stu__funceval(o, stua_globals); // initialize stu__globaldict
-   if (stu__flow == FLOW_error)
-      printf("Error: %s\n", ((stu__wrapper *) stu__ptr(stu__flow_val))->ptr);
-   return 1;
-error:
-   stu__pop_func_comp();
-   return 0;
-}
-
-stua_obj stu__myprint(stua_dict *context)
-{
-   stua_obj x = stu__get(context, stua_string("x"), stua_nil);
-   if ((x & 1) == stu__float_tag) printf("%f", stu__getfloat(x));
-   else if (stu__tag(x) == stu__int_tag) printf("%d", stu__int(x));
-   else {
-       stu__wrapper *s = stu__pw(x);
-       if (s->type == STU___string || s->type == STU___error)
-          printf("%s", s->ptr);
-       else if (s->type == STU___dict) printf("{{dictionary}}");
-       else if (s->type == STU___function) printf("[[function]]");
-       else
-          printf("[[ERROR:%s]]", s->ptr);
-   }
-   return x;
-}
-
-void stua_init(void)
-{
-   if (!stu__globaldict) {
-      int i;
-      stua_obj s;
-      stu__func *f;
-
-      stu__prec[ST_and] = stu__prec[ST_or] =                     1;
-      stu__prec[ST_eq ] = stu__prec[ST_ne] = stu__prec[ST_le] =
-       stu__prec[ST_ge] = stu__prec['>' ]  = stu__prec['<'] =    2;
-      stu__prec[':']    =                                        3;
-      stu__prec['&']    = stu__prec['|']   = stu__prec['^'] =    4;
-      stu__prec['+']    = stu__prec['-']   =                     5;
-      stu__prec['*']    = stu__prec['/']   = stu__prec['%'] =
-       stu__prec[ST_shl]= stu__prec[ST_shr]= stu__prec[ST_shru]= 6;
-
-      stu__end[')']   = stu__end[ST_end] = stu__end[ST_else] = 1;
-      stu__end[ST_do] = stu__end[ST_elseif] = 1;
-
-      stu__float_init();
-      stu__lex_matcher = stb_lex_matcher();
-      for (i=0; i < sizeof(stu__lexemes)/sizeof(stu__lexemes[0]); ++i)
-         stb_lex_item(stu__lex_matcher, stu__lexemes[i].regex, stu__lexemes[i].stu__tok);
-
-      stu__globaldict = stb_idict_new_size(64);
-      stua_globals    = stu__makeobj(STU___dict, stu__globaldict, 0,0);
-      stu__strings    = stb_sdict_new(0);
-
-      stu__curbuf = stu__bufstart = "func _print(x) end\n"
-      "func print()\n  var x=0 while _frame[x] != nil as x=x+1 do _print(_frame[x]) end end\n";
-      stu__nexttoken();
-      if (!stu__compile_global_scope())
-         printf("Compile error in line %d: %s\n", stu__comp_err_line, stu__comp_err_str);
-
-      s = stu__get(stu__globaldict, stua_string("_print"), stua_nil);
-      if (stu__tag(s) == stu__ptr_tag && stu__ptr(s)->type == STU___function) {
-         f = stu__pf(s);
-         free(f->f.store);
-         f->closure_source = 4;
-         f->f.func = stu__myprint;
-         f->code = NULL;
-      }
-   }
-}
-
-void stua_uninit(void)
-{
-   if (stu__globaldict) {
-      stb_idict_remove_all(stu__globaldict);
-      stb_arr_setlen(stu__gc_root_stack, 0);
-      stua_gc(1);
-      stb_idict_destroy(stu__globaldict);
-      stb_sdict_delete(stu__strings);
-      stb_matcher_free(stu__lex_matcher);
-      stb_arr_free(stu__gc_ptrlist);
-      stb_arr_free(func_stack);
-      stb_arr_free(stu__gc_root_stack);
-      stu__globaldict = NULL;
-   }
-}
-
-void stua_run_script(char *s)
-{
-   stua_init();
-
-   stu__curbuf = stu__bufstart = s;
-   stu__nexttoken();
-
-   stu__flow = FLOW_normal;
-
-   if (!stu__compile_global_scope())
-      printf("Compile error in line %d: %s\n", stu__comp_err_line, stu__comp_err_str);
-   stua_gc(1);
-}
-#endif // STB_DEFINE
-#endif // STB_STUA
-
 #undef STB_EXTERN
 #endif // STB_INCLUDE_STB_H
 

From ab18d9b250494249a433068cc3583129d809b322 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 21:42:44 -0700
Subject: [PATCH 052/170] stb_image: Fix two bugs found via VC++ /analyze

Also fixes issue #366.
---
 stb_image.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index f2e3430..e68580f 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -3821,6 +3821,10 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
    else
       decode_n = z->s->img_n;
 
+   // nothing to do if no components requested; check this now to avoid
+   // accessing uninitialized coutput[0] later
+   if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
+
    // resample and color-convert
    {
       int k;
@@ -6862,9 +6866,10 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
                }
 
                if (delays) {
-                  *delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
-                  if (!*delays)
+                  int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers );
+                  if (!new_delays)
                      return stbi__load_gif_main_outofmem(&g, out, delays);
+                  *delays = new_delays;
                   delays_size = layers * sizeof(int);
                }
             } else {

From 17bc84e15dfb7c5c7ebcc5246e929678478b15a4 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 21:47:13 -0700
Subject: [PATCH 053/170] stb_image: stbi__bmp_info only rewind stream on error

To be consistent with the other info functions.

Fixes issue #892.
---
 stb_image.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index e68580f..32d5e88 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -7205,9 +7205,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 
    info.all_a = 255;
    p = stbi__bmp_parse_header(s, &info);
-   stbi__rewind( s );
-   if (p == NULL)
+   if (p == NULL) {
+      stbi__rewind( s );
       return 0;
+   }
    if (x) *x = s->img_x;
    if (y) *y = s->img_y;
    if (comp) {

From 1e82fd4a4ebe9ccc70dd86776a47a24ff02769fd Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 4 Jul 2021 22:17:57 -0700
Subject: [PATCH 054/170] stb_image: BMP v4/v5 header parsing fixes

As per MS's own docs, should ignore the r/g/b bitmasks in the
header unless BI_BITFIELDS compression is selected. Factor out
setting the default masks since that now exists in two branches.

Add some more checking for unsupported compression formats and
illegal bpp/compression combinations while I'm at it.

Fixes issue #783.
---
 stb_image.h | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 32d5e88..4f684e3 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -5320,6 +5320,32 @@ typedef struct
    int extra_read;
 } stbi__bmp_data;
 
+static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress)
+{
+   // BI_BITFIELDS specifies masks explicitly, don't override
+   if (compress == 3)
+      return 1;
+
+   if (compress == 0) {
+      if (info->bpp == 16) {
+         info->mr = 31u << 10;
+         info->mg = 31u <<  5;
+         info->mb = 31u <<  0;
+      } else if (info->bpp == 32) {
+         info->mr = 0xffu << 16;
+         info->mg = 0xffu <<  8;
+         info->mb = 0xffu <<  0;
+         info->ma = 0xffu << 24;
+         info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+      } else {
+         // otherwise, use defaults, which is all-0
+         info->mr = info->mg = info->mb = info->ma = 0;
+      }
+      return 1;
+   }
+   return 0; // error
+}
+
 static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
 {
    int hsz;
@@ -5347,6 +5373,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
    if (hsz != 12) {
       int compress = stbi__get32le(s);
       if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes
+      if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel
       stbi__get32le(s); // discard sizeof
       stbi__get32le(s); // discard hres
       stbi__get32le(s); // discard vres
@@ -5361,17 +5389,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
          }
          if (info->bpp == 16 || info->bpp == 32) {
             if (compress == 0) {
-               if (info->bpp == 32) {
-                  info->mr = 0xffu << 16;
-                  info->mg = 0xffu <<  8;
-                  info->mb = 0xffu <<  0;
-                  info->ma = 0xffu << 24;
-                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
-               } else {
-                  info->mr = 31u << 10;
-                  info->mg = 31u <<  5;
-                  info->mb = 31u <<  0;
-               }
+               stbi__bmp_set_mask_defaults(info, compress);
             } else if (compress == 3) {
                info->mr = stbi__get32le(s);
                info->mg = stbi__get32le(s);
@@ -5386,6 +5404,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
                return stbi__errpuc("bad BMP", "bad BMP");
          }
       } else {
+         // V4/V5 header
          int i;
          if (hsz != 108 && hsz != 124)
             return stbi__errpuc("bad BMP", "bad BMP");
@@ -5393,6 +5412,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
          info->mg = stbi__get32le(s);
          info->mb = stbi__get32le(s);
          info->ma = stbi__get32le(s);
+         if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs
+            stbi__bmp_set_mask_defaults(info, compress);
          stbi__get32le(s); // discard color space
          for (i=0; i < 12; ++i)
             stbi__get32le(s); // discard color space parameters

From a5d989c358a47e1f0245e11040904a9d7fbf9786 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 20:07:12 -0700
Subject: [PATCH 055/170] stb_truetype: Tabs->spaces

Whitespace only.
---
 stb_truetype.h | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index e06b867..8a704b6 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -4489,32 +4489,32 @@ static float stbtt__cuberoot( float x )
 // x^3 + c*x^2 + b*x + a = 0
 static int stbtt__solve_cubic(float a, float b, float c, float* r)
 {
-	float s = -a / 3;
-	float p = b - a*a / 3;
-	float q = a * (2*a*a - 9*b) / 27 + c;
+   float s = -a / 3;
+   float p = b - a*a / 3;
+   float q = a * (2*a*a - 9*b) / 27 + c;
    float p3 = p*p*p;
-	float d = q*q + 4*p3 / 27;
-	if (d >= 0) {
-		float z = (float) STBTT_sqrt(d);
-		float u = (-q + z) / 2;
-		float v = (-q - z) / 2;
-		u = stbtt__cuberoot(u);
-		v = stbtt__cuberoot(v);
-		r[0] = s + u + v;
-		return 1;
-	} else {
-	   float u = (float) STBTT_sqrt(-p/3);
-	   float v = (float) STBTT_acos(-STBTT_sqrt(-27/p3) * q / 2) / 3; // p3 must be negative, since d is negative
-	   float m = (float) STBTT_cos(v);
+   float d = q*q + 4*p3 / 27;
+   if (d >= 0) {
+      float z = (float) STBTT_sqrt(d);
+      float u = (-q + z) / 2;
+      float v = (-q - z) / 2;
+      u = stbtt__cuberoot(u);
+      v = stbtt__cuberoot(v);
+      r[0] = s + u + v;
+      return 1;
+   } else {
+      float u = (float) STBTT_sqrt(-p/3);
+      float v = (float) STBTT_acos(-STBTT_sqrt(-27/p3) * q / 2) / 3; // p3 must be negative, since d is negative
+      float m = (float) STBTT_cos(v);
       float n = (float) STBTT_cos(v-3.141592/2)*1.732050808f;
-	   r[0] = s + u * 2 * m;
-	   r[1] = s - u * (m + n);
-	   r[2] = s - u * (m - n);
+      r[0] = s + u * 2 * m;
+      r[1] = s - u * (m + n);
+      r[2] = s - u * (m - n);
 
       //STBTT_assert( STBTT_fabs(((r[0]+a)*r[0]+b)*r[0]+c) < 0.05f);  // these asserts may not be safe at all scales, though they're in bezier t parameter units so maybe?
       //STBTT_assert( STBTT_fabs(((r[1]+a)*r[1]+b)*r[1]+c) < 0.05f);
       //STBTT_assert( STBTT_fabs(((r[2]+a)*r[2]+b)*r[2]+c) < 0.05f);
-   	return 3;
+      return 3;
    }
 }
 

From ba5cc43d33fd06875c254d46a301ec188aa441a7 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 20:10:22 -0700
Subject: [PATCH 056/170] stb_truetype: Fix stbtt__solve_cubic comment

---
 stb_truetype.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 8a704b6..a58fc62 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -4486,7 +4486,7 @@ static float stbtt__cuberoot( float x )
       return  (float) STBTT_pow( x,1.0f/3.0f);
 }
 
-// x^3 + c*x^2 + b*x + a = 0
+// x^3 + a*x^2 + b*x + c = 0
 static int stbtt__solve_cubic(float a, float b, float c, float* r)
 {
    float s = -a / 3;

From e817b4a998abdd7dfdacab5d76384b310f54f4f0 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 20:17:12 -0700
Subject: [PATCH 057/170] stb_ds: Fix typos in docs.

---
 stb_ds.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_ds.h b/stb_ds.h
index cf6be15..c74164e 100644
--- a/stb_ds.h
+++ b/stb_ds.h
@@ -104,7 +104,7 @@ DOCUMENTATION
           moving the rest of the array over. Returns b.
 
       arrinsn:
-        void arrins(T* a, int p, int n);
+        void arrinsn(T* a, int p, int n);
           Inserts n uninitialized items into array a starting at a[p],
           moving the rest of the array over.
 
@@ -123,7 +123,7 @@ DOCUMENTATION
           Deletes the element at a[p], moving the rest of the array over.
 
       arrdeln:
-        void arrdel(T* a, int p, int n);
+        void arrdeln(T* a, int p, int n);
           Deletes n elements starting at a[p], moving the rest of the array over.
 
       arrdelswap:

From 76a0a00874a5075bf590d1fd2f40de1836aea567 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 20:19:51 -0700
Subject: [PATCH 058/170] stb.h: _MSC_VER in readdir_raw -> _WIN32

For MinGW.
---
 deprecated/stb.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/deprecated/stb.h b/deprecated/stb.h
index cd715e7..0a5fd55 100644
--- a/deprecated/stb.h
+++ b/deprecated/stb.h
@@ -6132,7 +6132,7 @@ static char **readdir_raw(char *dir, int return_subdirs, char *mask)
    char buffer[4096], with_slash[4096];
    size_t n;
 
-   #ifdef _MSC_VER
+   #ifdef WIN32
       stb__wchar *ws;
       struct _wfinddata_t data;
    #ifdef _WIN64
@@ -6142,7 +6142,7 @@ static char **readdir_raw(char *dir, int return_subdirs, char *mask)
       const long none = -1;
       long z;
    #endif
-   #else // !_MSC_VER
+   #else // !WIN32
       const DIR *none = NULL;
       DIR *z;
    #endif
@@ -6159,7 +6159,7 @@ static char **readdir_raw(char *dir, int return_subdirs, char *mask)
    if (!stb_strscpy(with_slash,buffer,sizeof(with_slash)))
       return NULL;
 
-   #ifdef _MSC_VER
+   #ifdef WIN32
       if (!stb_strscpy(buffer+n,"*.*",sizeof(buffer)-n))
          return NULL;
       ws = stb__from_utf8(buffer);
@@ -6170,7 +6170,7 @@ static char **readdir_raw(char *dir, int return_subdirs, char *mask)
 
    if (z != none) {
       int nonempty = STB_TRUE;
-      #ifndef _MSC_VER
+      #ifndef WIN32
       struct dirent *data = readdir(z);
       nonempty = (data != NULL);
       #endif
@@ -6179,7 +6179,7 @@ static char **readdir_raw(char *dir, int return_subdirs, char *mask)
 
          do {
             int is_subdir;
-            #ifdef _MSC_VER
+            #ifdef WIN32
             char *name = stb__to_utf8((stb__wchar *)data.name);
             if (name == NULL) {
                fprintf(stderr, "%s to convert '%S' to %s!\n", "Unable", data.name, "utf8");
@@ -6207,13 +6207,13 @@ static char **readdir_raw(char *dir, int return_subdirs, char *mask)
                }
             }
          }
-         #ifdef _MSC_VER
+         #ifdef WIN32
          while (0 == _wfindnext(z, &data));
          #else
          while ((data = readdir(z)) != NULL);
          #endif
       }
-      #ifdef _MSC_VER
+      #ifdef WIN32
          _findclose(z);
       #else
          closedir(z);

From 85bc8060be3f517477f2e785c31851c6214376f2 Mon Sep 17 00:00:00 2001
From: Louis Schnellbach <xipiryon@gmail.com>
Date: Thu, 19 Nov 2020 15:59:51 +0100
Subject: [PATCH 059/170] Support for Page Up/Down (changes from
 ocornut/imgui/commit/ec945f44)

---
 stb_textedit.h | 71 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/stb_textedit.h b/stb_textedit.h
index 41afb9e..54f4b16 100644
--- a/stb_textedit.h
+++ b/stb_textedit.h
@@ -142,6 +142,8 @@
 //    STB_TEXTEDIT_K_RIGHT       keyboard input to move cursor right
 //    STB_TEXTEDIT_K_UP          keyboard input to move cursor up
 //    STB_TEXTEDIT_K_DOWN        keyboard input to move cursor down
+//    STB_TEXTEDIT_K_PGUP        keyboard input to move cursor up a page
+//    STB_TEXTEDIT_K_PGDOWN      keyboard input to move cursor down a page
 //    STB_TEXTEDIT_K_LINESTART   keyboard input to move cursor to start of line  // e.g. HOME
 //    STB_TEXTEDIT_K_LINEEND     keyboard input to move cursor to end of line    // e.g. END
 //    STB_TEXTEDIT_K_TEXTSTART   keyboard input to move cursor to start of text  // e.g. ctrl-HOME
@@ -164,10 +166,6 @@
 //    STB_TEXTEDIT_K_TEXTSTART2          secondary keyboard input to move cursor to start of text
 //    STB_TEXTEDIT_K_TEXTEND2            secondary keyboard input to move cursor to end of text
 //
-// Todo:
-//    STB_TEXTEDIT_K_PGUP        keyboard input to move cursor up a page
-//    STB_TEXTEDIT_K_PGDOWN      keyboard input to move cursor down a page
-//
 // Keyboard input must be encoded as a single integer value; e.g. a character code
 // and some bitflags that represent shift states. to simplify the interface, SHIFT must
 // be a bitflag, so we can test the shifted state of cursor movements to allow selection,
@@ -331,6 +329,10 @@ typedef struct
    // each textfield keeps its own insert mode state. to keep an app-wide
    // insert mode, copy this value in/out of the app state
 
+   int row_count_per_page;
+   // page size in number of row.
+   // this value MUST be set to >0 for pageup or pagedown in multilines documents.
+
    /////////////////////
    //
    // private data
@@ -849,12 +851,16 @@ retry:
          break;
 
       case STB_TEXTEDIT_K_DOWN:
-      case STB_TEXTEDIT_K_DOWN | STB_TEXTEDIT_K_SHIFT: {
+      case STB_TEXTEDIT_K_DOWN | STB_TEXTEDIT_K_SHIFT:
+      case STB_TEXTEDIT_K_PGDOWN:
+      case STB_TEXTEDIT_K_PGDOWN | STB_TEXTEDIT_K_SHIFT: {
          StbFindState find;
          StbTexteditRow row;
-         int i, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0;
+         int i, j, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0;
+         int is_page = (key & ~STB_TEXTEDIT_K_SHIFT) == STB_TEXTEDIT_K_PGDOWN;
+         int row_count = is_page ? state->row_count_per_page : 1;
 
-         if (state->single_line) {
+         if (!is_page && state->single_line) {
             // on windows, up&down in single-line behave like left&right
             key = STB_TEXTEDIT_K_RIGHT | (key & STB_TEXTEDIT_K_SHIFT);
             goto retry;
@@ -863,17 +869,20 @@ retry:
          if (sel)
             stb_textedit_prep_selection_at_cursor(state);
          else if (STB_TEXT_HAS_SELECTION(state))
-            stb_textedit_move_to_last(str,state);
+            stb_textedit_move_to_last(str, state);
 
          // compute current position of cursor point
          stb_textedit_clamp(str, state);
          stb_textedit_find_charpos(&find, str, state->cursor, state->single_line);
 
-         // now find character position down a row
-         if (find.length) {
-            float goal_x = state->has_preferred_x ? state->preferred_x : find.x;
-            float x;
+         for (j = 0; j < row_count; ++j) {
+            float x, goal_x = state->has_preferred_x ? state->preferred_x : find.x;
             int start = find.first_char + find.length;
+
+            if (find.length == 0)
+               break;
+
+            // now find character position down a row
             state->cursor = start;
             STB_TEXTEDIT_LAYOUTROW(&row, str, state->cursor);
             x = row.x0;
@@ -895,17 +904,25 @@ retry:
 
             if (sel)
                state->select_end = state->cursor;
+
+            // go to next line
+            find.first_char = find.first_char + find.length;
+            find.length = row.num_chars;
          }
          break;
       }
 
       case STB_TEXTEDIT_K_UP:
-      case STB_TEXTEDIT_K_UP | STB_TEXTEDIT_K_SHIFT: {
+      case STB_TEXTEDIT_K_UP | STB_TEXTEDIT_K_SHIFT:
+      case STB_TEXTEDIT_K_PGUP:
+      case STB_TEXTEDIT_K_PGUP | STB_TEXTEDIT_K_SHIFT: {
          StbFindState find;
          StbTexteditRow row;
-         int i, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0;
+         int i, j, prev_scan, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0;
+         int is_page = (key & ~STB_TEXTEDIT_K_SHIFT) == STB_TEXTEDIT_K_PGUP;
+         int row_count = is_page ? state->row_count_per_page : 1;
 
-         if (state->single_line) {
+         if (!is_page && state->single_line) {
             // on windows, up&down become left&right
             key = STB_TEXTEDIT_K_LEFT | (key & STB_TEXTEDIT_K_SHIFT);
             goto retry;
@@ -920,11 +937,14 @@ retry:
          stb_textedit_clamp(str, state);
          stb_textedit_find_charpos(&find, str, state->cursor, state->single_line);
 
-         // can only go up if there's a previous row
-         if (find.prev_first != find.first_char) {
+         for (j = 0; j < row_count; ++j) {
+            float  x, goal_x = state->has_preferred_x ? state->preferred_x : find.x;
+
+            // can only go up if there's a previous row
+            if (find.prev_first == find.first_char)
+               break;
+
             // now find character position up a row
-            float goal_x = state->has_preferred_x ? state->preferred_x : find.x;
-            float x;
             state->cursor = find.prev_first;
             STB_TEXTEDIT_LAYOUTROW(&row, str, state->cursor);
             x = row.x0;
@@ -946,6 +966,14 @@ retry:
 
             if (sel)
                state->select_end = state->cursor;
+
+            // go to previous line
+            // (we need to scan previous line the hard way. maybe we could expose this as a new API function?)
+            prev_scan = find.prev_first > 0 ? find.prev_first - 1 : 0;
+            while (prev_scan > 0 && STB_TEXTEDIT_GETCHAR(str, prev_scan - 1) != STB_TEXTEDIT_NEWLINE)
+               --prev_scan;
+            find.first_char = find.prev_first;
+            find.prev_first = prev_scan;
          }
          break;
       }
@@ -1069,10 +1097,6 @@ retry:
          state->has_preferred_x = 0;
          break;
       }
-
-// @TODO:
-//    STB_TEXTEDIT_K_PGUP      - move cursor up a page
-//    STB_TEXTEDIT_K_PGDOWN    - move cursor down a page
    }
 }
 
@@ -1337,6 +1361,7 @@ static void stb_textedit_clear_state(STB_TexteditState *state, int is_single_lin
    state->initialized = 1;
    state->single_line = (unsigned char) is_single_line;
    state->insert_mode = 0;
+   state->row_count_per_page = 0;
 }
 
 // API initialize

From eb677dda6e6a495c903cd5428ec7ebc840674f49 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 20:25:16 -0700
Subject: [PATCH 060/170] stb_textedit: Update credit, version history

---
 stb_textedit.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/stb_textedit.h b/stb_textedit.h
index 54f4b16..90543b0 100644
--- a/stb_textedit.h
+++ b/stb_textedit.h
@@ -1,4 +1,4 @@
-// stb_textedit.h - v1.13  - public domain - Sean Barrett
+// stb_textedit.h - v1.14  - public domain - Sean Barrett
 // Development of this library was sponsored by RAD Game Tools
 //
 // This C header file implements the guts of a multi-line text-editing
@@ -29,6 +29,7 @@
 //
 // VERSION HISTORY
 //
+//   1.14 (          ) page up/down, various fixes
 //   1.13 (2019-02-07) fix bug in undo size management
 //   1.12 (2018-01-29) user can change STB_TEXTEDIT_KEYTYPE, fix redo to avoid crash
 //   1.11 (2017-03-03) fix HOME on last line, dragging off single-line textfield
@@ -52,6 +53,7 @@
 //   Ulf Winklemann: move-by-word in 1.1
 //   Fabian Giesen: secondary key inputs in 1.5
 //   Martins Mozeiko: STB_TEXTEDIT_memmove in 1.6
+//   Louis Schnellbach: page up/down in 1.14
 //
 //   Bugfixes:
 //      Scott Graham

From 44f046af0c6f1d79ef6f189d69ac670bbfba4ac4 Mon Sep 17 00:00:00 2001
From: ocornut <omarcornut@gmail.com>
Date: Mon, 5 Jul 2021 17:22:24 +0200
Subject: [PATCH 061/170] stb_textedit: Fix paste failure handling breaking
 undo stack

Could lead to freezes.

Fixes issue #734.
---
 stb_textedit.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/stb_textedit.h b/stb_textedit.h
index 90543b0..5bb3a57 100644
--- a/stb_textedit.h
+++ b/stb_textedit.h
@@ -712,9 +712,7 @@ static int stb_textedit_paste_internal(STB_TEXTEDIT_STRING *str, STB_TexteditSta
       state->has_preferred_x = 0;
       return 1;
    }
-   // remove the undo since we didn't actually insert the characters
-   if (state->undostate.undo_point)
-      --state->undostate.undo_point;
+   // note: paste failure will leave deleted selection, may be restored with an undo (see https://github.com/nothings/stb/issues/734 for details)
    return 0;
 }
 

From b691fc430594cbe3a99b32e53bf8a20decc89d97 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 20:39:35 -0700
Subject: [PATCH 062/170] stb_truetype: Remove dead assignments

Confirmed from the OpenType spec that there's nothing missing
here. (These were just annotations listing the total sizes of
the tables, but this is not used for anything here.)

Fixes issue #704.
---
 stb_truetype.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index a58fc62..c3c4f8e 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -2454,8 +2454,6 @@ static stbtt_int32  stbtt__GetGlyphClass(stbtt_uint8 *classDefTable, int glyph)
 
             if (glyph >= startGlyphID && glyph < startGlyphID + glyphCount)
                 return (stbtt_int32)ttUSHORT(classDef1ValueArray + 2 * (glyph - startGlyphID));
-
-            classDefTable = classDef1ValueArray + 2 * glyphCount;
         } break;
 
         case 2: {
@@ -2478,8 +2476,6 @@ static stbtt_int32  stbtt__GetGlyphClass(stbtt_uint8 *classDefTable, int glyph)
                 else
                     return (stbtt_int32)ttUSHORT(classRangeRecord + 4);
             }
-
-            classDefTable = classRangeRecords + 6 * classRangeCount;
         } break;
 
         default: {

From 2c8cd33e2eb331f94c27c0cbe79694216f434339 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20Vondru=C5=A1?= <mosra@centrum.cz>
Date: Mon, 6 Apr 2020 18:06:34 +0200
Subject: [PATCH 063/170] stb_image: make unpremultiply and de-iPhone flags
 thread_local.

Follows the change done for vertical flipping in
eb48fbdcede98f93dab3a5e42ce5b0072b739cf1 for these two options as well.
---
 stb_image.h | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 4f684e3..50c263c 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -518,6 +518,8 @@ STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
 // as above, but only applies to images loaded on the thread that calls the function
 // this function is only available if your compiler supports thread-local variables;
 // calling it will fail to link if your compiler doesn't
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply);
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert);
 STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip);
 
 // ZLIB client - used by PNG, available for other purposes
@@ -4927,19 +4929,46 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int
    return 1;
 }
 
-static int stbi__unpremultiply_on_load = 0;
-static int stbi__de_iphone_flag = 0;
+static int stbi__unpremultiply_on_load_global = 0;
+static int stbi__de_iphone_flag_global = 0;
 
 STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
 {
-   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
 }
 
 STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
 {
-   stbi__de_iphone_flag = flag_true_if_should_convert;
+   stbi__de_iphone_flag_global = flag_true_if_should_convert;
 }
 
+#ifndef STBI_THREAD_LOCAL
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
+#else
+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
+
+STBIDEF void stbi__unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
+}
+
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
+                                       ? stbi__unpremultiply_on_load_local      \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
+                                ? stbi__de_iphone_flag_local                    \
+                                : stbi__de_iphone_flag_global)
+#endif // STBI_THREAD_LOCAL
+
 static void stbi__de_iphone(stbi__png *z)
 {
    stbi__context *s = z->s;

From 7e10880f53572ad2c1a949a9311f813a41bedbdd Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 20:47:30 -0700
Subject: [PATCH 064/170] stb_image: Update credits

---
 stb_image.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 50c263c..760dc34 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -111,7 +111,7 @@ RECENT REVISION HISTORY:
     Josh Tobin                                 Matthew Gregan       github:poppolopoppo
     Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
     Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
-                            Brad Weinberger    Matvey Cherevko      [reserved]
+                            Brad Weinberger    Matvey Cherevko      github:mosra
     Luca Sas                Alexander Veselov  Zack Middleton       [reserved]
     Ryan C. Gordon          [reserved]                              [reserved]
                      DO NOT ADD YOUR NAME HERE

From b18c989dc2c7c2d3852b819271a848d8159dc749 Mon Sep 17 00:00:00 2001
From: Rob Loach <robloach@gmail.com>
Date: Fri, 16 Apr 2021 04:21:42 -0400
Subject: [PATCH 065/170] stb_tilemap_editor: Fix variable usage

---
 stb_tilemap_editor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_tilemap_editor.h b/stb_tilemap_editor.h
index 9b77364..a89a516 100644
--- a/stb_tilemap_editor.h
+++ b/stb_tilemap_editor.h
@@ -2996,7 +2996,7 @@ static void stbte__tile_paint(stbte_tilemap *tm, int sx, int sy, int mapx, int m
       i = layer;
       if (i == tm->solo_layer || (!tm->layerinfo[i].hidden && tm->solo_layer < 0))
          if (data[i] >= 0)
-            STBTE_DRAW_TILE(x0,y0, (unsigned short) data[i], 0, tm->props[mapy][mapx]);
+            STBTE_DRAW_TILE(sx,sy, (unsigned short) data[i], 0, tm->props[mapy][mapx]);
    }
 }
 
@@ -3496,7 +3496,7 @@ static void stbte__tile_in_palette(stbte_tilemap *tm, int x, int y, int slot)
    switch (stbte__ui.event) {
       case STBTE__paint:
          stbte__draw_rect(x,y,x+tm->palette_spacing_x-1,y+tm->palette_spacing_x-1, STBTE_COLOR_TILEPALETTE_BACKGROUND);
-         STBTE_DRAW_TILE(x,y,t->id, slot == tm->cur_tile,0);
+         STBTE_DRAW_TILE(x,y,id, slot == tm->cur_tile,0);
          if (slot == tm->cur_tile)
             stbte__draw_frame_delayed(x-1,y-1,x+tm->palette_spacing_x,y+tm->palette_spacing_y, STBTE_COLOR_TILEPALETTE_OUTLINE);
          break;

From 21bfcbbc3dfb3ed8a9c8f86d68de9dbb93eb5768 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 21:07:34 -0700
Subject: [PATCH 066/170] stb_tilemap_editor: Update version, credits

---
 stb_tilemap_editor.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/stb_tilemap_editor.h b/stb_tilemap_editor.h
index a89a516..180b5cd 100644
--- a/stb_tilemap_editor.h
+++ b/stb_tilemap_editor.h
@@ -1,4 +1,4 @@
-// stb_tilemap_editor.h - v0.41 - Sean Barrett - http://nothings.org/stb
+// stb_tilemap_editor.h - v0.42 - Sean Barrett - http://nothings.org/stb
 // placed in the public domain - not copyrighted - first released 2014-09
 //
 // Embeddable tilemap editor for C/C++
@@ -275,6 +275,7 @@
 //   either approach allows cut&pasting between levels.)
 //
 // REVISION HISTORY
+//   0.42  fix compilation errors
 //   0.41  fix warnings
 //   0.40  fix warning
 //   0.39  fix warning
@@ -317,6 +318,7 @@
 //   Bugfixes:
 //      Ryan Whitworth
 //      Eugene Opalev
+//      Rob Loach
 //
 // LICENSE
 //

From 77eedd4fcfb6d25eb8fd2f306384fdaf733103ad Mon Sep 17 00:00:00 2001
From: Werner Stoop <wstoop@gmail.com>
Date: Fri, 9 Apr 2021 20:03:01 +0200
Subject: [PATCH 067/170] stb_tilemap_editor: Several fixes.

Re-added calls to `stbte__hittest()`, fixed some compiler errors.
Also fixed some GCC warnings about unused variables when
STBTE__COLORPICKER and STBTE_ALLOW_LINK is not defined.
---
 stb_tilemap_editor.h | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/stb_tilemap_editor.h b/stb_tilemap_editor.h
index 180b5cd..d5bdd81 100644
--- a/stb_tilemap_editor.h
+++ b/stb_tilemap_editor.h
@@ -1823,6 +1823,8 @@ static int stbte__button(int colormode, const char *label, int x, int y, int tex
    int x0=x,y0=y, x1=x+width,y1=y+STBTE__BUTTON_HEIGHT;
    int s = STBTE__BUTTON_INTERNAL_SPACING;
 
+   if(!disabled) stbte__hittest(x0,y0,x1,y1,id);
+
    if (stbte__ui.event == STBTE__paint)
       stbte__draw_textbox(x0,y0,x1,y1, (char*) label,s+textoff,s, colormode, STBTE__INDEX_FOR_ID(id,disabled,toggled));
    if (disabled)
@@ -1835,6 +1837,8 @@ static int stbte__button_icon(int colormode, char ch, int x, int y, int width, i
    int x0=x,y0=y, x1=x+width,y1=y+STBTE__BUTTON_HEIGHT;
    int s = STBTE__BUTTON_INTERNAL_SPACING;
 
+   stbte__hittest(x0,y0,x1,y1,id);
+
    if (stbte__ui.event == STBTE__paint) {
       char label[2] = { ch, 0 };
       int pad = (9 - stbte__get_char_width(ch))/2;
@@ -1848,6 +1852,7 @@ static int stbte__button_icon(int colormode, char ch, int x, int y, int width, i
 static int stbte__minibutton(int colormode, int x, int y, int ch, int id)
 {
    int x0 = x, y0 = y, x1 = x+8, y1 = y+7;
+   stbte__hittest(x0,y0,x1,y1,id);
    if (stbte__ui.event == STBTE__paint) {
       char str[2] = { (char)ch, 0 };
       stbte__draw_textbox(x0,y0,x1,y1, str,1,0,colormode, STBTE__INDEX_FOR_ID(id,0,0));
@@ -1858,6 +1863,7 @@ static int stbte__minibutton(int colormode, int x, int y, int ch, int id)
 static int stbte__layerbutton(int x, int y, int ch, int id, int toggled, int disabled, int colormode)
 {
    int x0 = x, y0 = y, x1 = x+10, y1 = y+11;
+   if(!disabled) stbte__hittest(x0,y0,x1,y1,id);
    if (stbte__ui.event == STBTE__paint) {
       char str[2] = { (char)ch, 0 };
       int off = (9-stbte__get_char_width(ch))/2;
@@ -1871,6 +1877,7 @@ static int stbte__layerbutton(int x, int y, int ch, int id, int toggled, int dis
 static int stbte__microbutton(int x, int y, int size, int id, int colormode)
 {
    int x0 = x, y0 = y, x1 = x+size, y1 = y+size;
+   stbte__hittest(x0,y0,x1,y1,id);
    if (stbte__ui.event == STBTE__paint) {
       stbte__draw_box(x0,y0,x1,y1, colormode, STBTE__INDEX_FOR_ID(id,0,0));
    }
@@ -1880,6 +1887,7 @@ static int stbte__microbutton(int x, int y, int size, int id, int colormode)
 static int stbte__microbutton_dragger(int x, int y, int size, int id, int *pos)
 {
    int x0 = x, y0 = y, x1 = x+size, y1 = y+size;
+   stbte__hittest(x0,y0,x1,y1,id);
    switch (stbte__ui.event) {
       case STBTE__paint:
          stbte__draw_box(x0,y0,x1,y1, STBTE__cexpander, STBTE__INDEX_FOR_ID(id,0,0));
@@ -1910,6 +1918,8 @@ static int stbte__category_button(const char *label, int x, int y, int width, in
    int x0=x,y0=y, x1=x+width,y1=y+STBTE__BUTTON_HEIGHT;
    int s = STBTE__BUTTON_INTERNAL_SPACING;
 
+   stbte__hittest(x0,y0,x1,y1,id);
+
    if (stbte__ui.event == STBTE__paint)
       stbte__draw_textbox(x0,y0,x1,y1, (char*) label, s,s, STBTE__ccategory_button, STBTE__INDEX_FOR_ID(id,0,toggled));
 
@@ -1929,6 +1939,7 @@ static int stbte__slider(int x0, int w, int y, int range, int *value, int id)
 {
    int x1 = x0+w;
    int pos = *value * w / (range+1);
+   stbte__hittest(x0,y-2,x1,y+3,id);
    int event_mouse_move = STBTE__change;
    switch (stbte__ui.event) {
       case STBTE__paint:
@@ -1971,6 +1982,7 @@ static int stbte__float_control(int x0, int y0, int w, float minv, float maxv, f
 {
    int x1 = x0+w;
    int y1 = y0+11;
+   stbte__hittest(x0,y0,x1,y1,id);
    switch (stbte__ui.event) {
       case STBTE__paint: {
          char text[32];
@@ -1982,7 +1994,7 @@ static int stbte__float_control(int x0, int y0, int w, float minv, float maxv, f
       case STBTE__rightdown:
          if (STBTE__IS_HOT(id) && STBTE__INACTIVE())
             stbte__activate(id);
-            return STBTE__begin;
+         return STBTE__begin;
          break;
       case STBTE__leftup:
       case STBTE__rightup:
@@ -2022,7 +2034,6 @@ static int stbte__float_control(int x0, int y0, int w, float minv, float maxv, f
 
 static void stbte__scrollbar(int x, int y0, int y1, int *val, int v0, int v1, int num_vis, int id)
 {
-   int over;
    int thumbpos;
    if (v1 - v0 <= num_vis)
       return;
@@ -2031,7 +2042,7 @@ static void stbte__scrollbar(int x, int y0, int y1, int *val, int v0, int v1, in
    thumbpos = y0+2 + (y1-y0-4) * *val / (v1 - v0 - num_vis);
    if (thumbpos < y0) thumbpos = y0;
    if (thumbpos >= y1) thumbpos = y1;
-   over = stbte__hittest(x-1,y0,x+2,y1,id);
+   stbte__hittest(x-1,y0,x+2,y1,id);
    switch (stbte__ui.event) {
       case STBTE__paint:
          stbte__draw_rect(x,y0,x+1,y1, stbte__color_table[STBTE__cscrollbar][STBTE__text][STBTE__idle]);
@@ -2809,6 +2820,10 @@ static void stbte__drag_update(stbte_tilemap *tm, int mapx, int mapy, int copy_p
    int ox,oy,i,deleted=0,written=0;
    short temp[STBTE_MAX_LAYERS];
    short *data = NULL;
+
+   STBTE__NOTUSED(deleted);
+   STBTE__NOTUSED(written);
+
    if (!stbte__ui.shift) {
       ox = mapx - stbte__ui.drag_x;
       oy = mapy - stbte__ui.drag_y;
@@ -2930,6 +2945,9 @@ static void stbte__tile_paint(stbte_tilemap *tm, int sx, int sy, int mapx, int m
 {
    int i;
    int id = STBTE__IDMAP(mapx,mapy);
+   int x0=sx, y0=sy;
+   int x1=sx+tm->spacing_x, y1=sy+tm->spacing_y;
+   stbte__hittest(x0,y0,x1,y1, id);
    short *data = tm->data[mapy][mapx];
    short temp[STBTE_MAX_LAYERS];
 
@@ -3494,7 +3512,10 @@ static void stbte__categories(stbte_tilemap *tm, int x0, int y0, int w, int h)
 
 static void stbte__tile_in_palette(stbte_tilemap *tm, int x, int y, int slot)
 {
+   stbte__tileinfo *t = &tm->tiles[slot];
+   int x0=x, y0=y, x1 = x+tm->palette_spacing_x - 1, y1 = y+tm->palette_spacing_y;
    int id = STBTE__ID(STBTE__palette, slot);
+   stbte__hittest(x0,y0,x1,y1, id);
    switch (stbte__ui.event) {
       case STBTE__paint:
          stbte__draw_rect(x,y,x+tm->palette_spacing_x-1,y+tm->palette_spacing_x-1, STBTE_COLOR_TILEPALETTE_BACKGROUND);
@@ -3567,6 +3588,7 @@ static void stbte__props_panel(stbte_tilemap *tm, int x0, int y0, int w, int h)
    my = stbte__ui.select_y0;
    p = tm->props[my][mx];
    data = tm->data[my][mx];
+   STBTE__NOTUSED(data);
    for (i=0; i < STBTE_MAX_PROPERTIES; ++i) {
       unsigned int n = STBTE_PROP_TYPE(i, data, p);
       if (n) {
@@ -3646,8 +3668,9 @@ static void stbte__props_panel(stbte_tilemap *tm, int x0, int y0, int w, int h)
    }
 }
 
-static int stbte__cp_mode, stbte__cp_aspect, stbte__cp_state, stbte__cp_index, stbte__save, stbte__cp_altered, stbte__color_copy;
+static int stbte__cp_mode, stbte__cp_aspect, stbte__save, stbte__cp_altered;
 #ifdef STBTE__COLORPICKER
+static int stbte__cp_state, stbte__cp_index, stbte__color_copy;
 static void stbte__dump_colorstate(void)
 {
    int i,j,k;

From 15d2dc5c5159e0e48b06287d123961256ab67c27 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 21:25:41 -0700
Subject: [PATCH 068/170] stb_tilemap_editor: Update contributors

---
 stb_tilemap_editor.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/stb_tilemap_editor.h b/stb_tilemap_editor.h
index d5bdd81..fbd3388 100644
--- a/stb_tilemap_editor.h
+++ b/stb_tilemap_editor.h
@@ -319,6 +319,7 @@
 //      Ryan Whitworth
 //      Eugene Opalev
 //      Rob Loach
+//      github:wernsey
 //
 // LICENSE
 //

From 6ca560c9afba1bb850e07683988583a045c8dc78 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 6 Jul 2021 21:40:31 -0700
Subject: [PATCH 069/170] stb_image: Update documentation for de-iPhone flag

It's default off, not default on.

Fixes issue #651.
---
 stb_image.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 760dc34..7f6ccf3 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -310,11 +310,10 @@ RECENT REVISION HISTORY:
 //
 // iPhone PNG support:
 //
-// By default we convert iphone-formatted PNGs back to RGB, even though
-// they are internally encoded differently. You can disable this conversion
-// by calling stbi_convert_iphone_png_to_rgb(0), in which case
-// you will always just get the native iphone "format" through (which
-// is BGR stored in RGB).
+// We optionally support converting iPhone-formatted PNGs (which store
+// premultiplied BGRA) back to RGB, even though they're internally encoded
+// differently. To enable this conversion, call
+// stbi_convert_iphone_png_to_rgb(1).
 //
 // Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
 // pixel to remove any premultiplied alpha *only* if the image file explicitly

From 9f985460226d7939d7fd71f81b5e38dda4b06651 Mon Sep 17 00:00:00 2001
From: Devendranath Thadi <devendranath.thadi3@gmail.com>
Date: Wed, 25 Nov 2020 06:48:14 +0000
Subject: [PATCH 070/170] Travis-ci: added support for ppc64le

---
 .travis.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index 3b71802..c2ad947 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,4 +1,7 @@
 language: C
+arch:
+  - AMD64
+  - ppc64le
 install: true
 script:
   - cd tests

From b77192742d5ae5746131f379da8157443128be6d Mon Sep 17 00:00:00 2001
From: Randy <randy408@protonmail.com>
Date: Wed, 25 Nov 2020 14:04:52 +0100
Subject: [PATCH 071/170] oss-fuzz: integrate with CIFuzz

---
 .github/workflows/ci-fuzz.yml | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 .github/workflows/ci-fuzz.yml

diff --git a/.github/workflows/ci-fuzz.yml b/.github/workflows/ci-fuzz.yml
new file mode 100644
index 0000000..332fca9
--- /dev/null
+++ b/.github/workflows/ci-fuzz.yml
@@ -0,0 +1,23 @@
+name: CIFuzz
+on: [pull_request]
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Build Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'stb'
+        dry-run: false
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'stb'
+        fuzz-seconds: 900
+        dry-run: false
+    - name: Upload Crash
+      uses: actions/upload-artifact@v1
+      if: failure()
+      with:
+        name: artifacts
+        path: ./out/artifacts

From dcb1116f1351bc8edad059d9ca432e8f322ba922 Mon Sep 17 00:00:00 2001
From: Randy <randy408@protonmail.com>
Date: Thu, 26 Nov 2020 02:18:13 +0100
Subject: [PATCH 072/170] ossfuzz: improve code coverage

fix .gif pattern matching
add netpbm test images derived from the public domain pngsuite
added more image types (downloaded in Dockerfile)
---
 tests/ossfuzz.sh       |   8 ++++++--
 tests/pbm/basi0g16.pgm | Bin 0 -> 2064 bytes
 tests/pbm/basi2c16.ppm | Bin 0 -> 6160 bytes
 tests/pbm/cdfn2c08.ppm | Bin 0 -> 826 bytes
 tests/pbm/cdun2c08.ppm | Bin 0 -> 3086 bytes
 tests/pbm/comment.pgm  | Bin 0 -> 1083 bytes
 tests/pbm/ctfn0g04.pgm | Bin 0 -> 1038 bytes
 7 files changed, 6 insertions(+), 2 deletions(-)
 create mode 100644 tests/pbm/basi0g16.pgm
 create mode 100644 tests/pbm/basi2c16.ppm
 create mode 100644 tests/pbm/cdfn2c08.ppm
 create mode 100644 tests/pbm/cdun2c08.ppm
 create mode 100644 tests/pbm/comment.pgm
 create mode 100644 tests/pbm/ctfn0g04.pgm

diff --git a/tests/ossfuzz.sh b/tests/ossfuzz.sh
index 2af98f5..4b8fd27 100755
--- a/tests/ossfuzz.sh
+++ b/tests/ossfuzz.sh
@@ -17,9 +17,13 @@ cp $SRC/stb/tests/stb_png.dict $OUT/stb_png_read_fuzzer.dict
 
 tar xvzf $SRC/stb/jpg.tar.gz --directory $SRC/stb/tests
 tar xvzf $SRC/stb/gif.tar.gz --directory $SRC/stb/tests
+unzip    $SRC/stb/bmp.zip    -d $SRC/stb/tests
+unzip    $SRC/stb/tga.zip    -d $SRC/stb/tests
 
-find $SRC/stb/tests -name "*.png" -o -name "*.jpg" -o -name ".gif" | \
-     xargs zip $OUT/stbi_read_fuzzer_seed_corpus.zip
+find $SRC/stb/tests -name "*.png" -o -name "*.jpg" -o -name "*.gif" \
+                 -o -name "*.bmp" -o -name "*.tga" -o -name "*.TGA" \
+                 -o -name "*.ppm" -o -name "*.pgm" \
+    | xargs zip $OUT/stbi_read_fuzzer_seed_corpus.zip
 
 echo "" >> $SRC/stb/tests/gif.dict
 cat $SRC/stb/tests/gif.dict $SRC/stb/tests/stb_png.dict > $OUT/stbi_read_fuzzer.dict
diff --git a/tests/pbm/basi0g16.pgm b/tests/pbm/basi0g16.pgm
new file mode 100644
index 0000000000000000000000000000000000000000..72412431a9513fbced9f9bac62f912d2edf89aea
GIT binary patch
literal 2064
zcmbu=_g9Sp0EhACI8KKvD_cZX_UO9VB3t&}TefTw5fNDt*;`ij%HDf#*;_>R7GLN6
z;r;{f`#*fn^E7eTY<BOwc5j=H!(n&WAT@5dQ-Biq;7eVa(vI%*XE-5*6UjnW5z8(P
zahfZ{^OQuskjzKj(tf2N4VlSJAxcr6>eQz>9q7qGMlznM%waKW*vuY|aF%P_<2mp6
z#y=8uMZ1)ev}7d@g(*WtYS4g|bfOo77{vspF_)#RWh;9*#(8e=fS0`I2ht0j(-ygq
zp6qy16g!ovMI&0%g+2@+m@sA#&2rYWo&B8PBDZ)%0w4LsFFn!;tyd~Cl9T)t$3a!<
z(1f;hqaVW<%VcJ;fR$`yCkHvjW$y5VH+<#~UlgxHS|uBq$VEX)QjTiWqZ#e#!2m`u
zjw#G$5v$q6ZVq#XtK8)oZ~4k!l5|~hS|V4n;DHyVsem8;w4fsa1QJ9j5k#?s7`70{
zQO<Fl`@A5D?}!9l)OJNH9ofiB5xl8HO&ZdQ&h%z5qnXHb=CO=*Y-1nCxxh^x@`?}q
zBw0^&TCs{$26B*(Vw9x{wP{Qny3&`SjA0Tpna>I~u!94f<Px`e%xgaJoA0`>ql!_4
ztk_wuFj=v)P$9BnCrUxGVrPbi$cmkC^_LYpp$d={JHhHAD|UvfovhdiR12Byj$x;d
z8p&*T3_IObS7y6o*y$)gneC2Yr?o1}Y<CPhP30rA-7)O=tF+8^$FNgNMP;@-h8<rO
wklF4Sb}Gn2W}fA=V^?;Wd6v^o3AxG4vz&Ik<SH}Ia@z5fi_ARBX~$jYKODUj(EtDd

literal 0
HcmV?d00001

diff --git a/tests/pbm/basi2c16.ppm b/tests/pbm/basi2c16.ppm
new file mode 100644
index 0000000000000000000000000000000000000000..f2913bb9e2ddc2a4a41dcf7db79cf6f6e1e4ff60
GIT binary patch
literal 6160
zcmXBYA<(V1*2eM5OjTs$heUEay)#&3)Sbv!WF!)cj6`CQNF){+iNqq2$n)+0JnhVH
z+S8u1^KQsm&szWc@85s_^Pm6w=Rbe{_uqg2`S<TXzrX(a>pSor@oDk?@Y=Bqcs6*v
zxNo>6Tx(n&oL9^}P8E(f4hyD;y~B86n9&F9EVc!kjJ3zgpgYk{Xgd7sFZ}c8J2*eS
zBR(zOA6`3_0nY}H7xxXfglmn<gY$~H$Em{c#$mw}v3D3R3^V$GoyE3bld<+#8FVMw
z2~CHbnxEty`6qlwd|JFeyml-Do(&!^?i+3i*BX}x=M{60Q-$M=!-6Sd?=W5%X7m9&
zi*3OsW9_jr=uWf~n(iG<$9Z`kPs{o79r0=L{_xtd40twpytr?;C0uJ<9-LRqJx&#l
zHx3J?h`qyjVVKbe>@2ngn~b%`%Ah;ZPETt((><-7I4N({{c(PLe-E2q#rwl+$1>pA
z;PK+V;g)c%ad~iFG50uCINmrcm?HKL<Aq^HAF#977Hl%s9xH?Hyg$>~>Hh3wy8Ck{
zXXI{NJLkuD#HYpk!)wPf;Mw5u;=bXQaIJB9a9%O@I8`{_I4qbV_73BPVMZUYv)C4F
zGS(g|bM2;c=e66(PS@^E(_MRII2|{<44fa|5uX<C53e1|fM<ipi~ELK!nMZb!Fk2p
z<5b~z<FH_g*gK3Dh8cap&SG1z$yoa`n65Hqu#-E>;7&W0p)#f`L+v>ib?R(5KfWVA
zE#4nqJC*^@29Fo_4Y!1Ajmv}cin+(B!tuso!4$D~7%vPn`hcCqwqTRb#&oqm8#`6z
zY~1P2v#E@o&ZhRJJDVnRBC752a(;YAd|LdKy`R^PWx%t+<Hdc$E#X?@^5DE;?s2Mc
zym44CMeH5M3&V^)U}v!{$7{MKAFrKif4uIj%<)#noyS{yI~{M6O?SL);VkrwedGN2
zj`*~Ae|YU!20R-)Ufeg_60S8a56&y*9;XV&8;1o`#NOfWD*OyH`hcC?H`BFc-|W=n
zeRF5+_pLIO*|*xebKjb5r+sUS>GrL&oPy4`CC-oUh);|6hrbW`Sq3~CJYL*4+!C%e
zE)UKt<{qaC#~X(QQ^ek3yfDn@!<J0f*)7?rEnE7%V0lZGsr{B}Uzsg6*_~Tzi=DRA
zS<`LF2R@(vz1Ex`-w~e{?+>pX%YbKt$BX-hTf()*<-vKy+~ZW?c;m2Oir71h7lwJQ
zP3OZ}+o`i_?asEWb!D2o*0rzwS~s~eYuy%iu61YawAMbDZmq-2CzC>!hx6k*;?v^&
z;k9G=wI9xp$BX-hTf()*<-vKy+~ZW?c;m2Oir71h*YcPy%*$gZAC||Ron4;Fv}Ji}
z-{j?Ka_xU@uT*AvI_u8m@xe~Z6K2ybPrUd{vedkCetbuKTD(8Jb}R#)4IVG<8*T~L
z8kYy>6?2bMh2xFGf+=F}=GAoZnpZo8d0yS=e~+R1HK*Uh@NJpbCO3Ir+fw^^?QCV{
z)dzRZYnbgcukkY7y!MVyBN67F^W!_>)8hT%wPP9Z_fq~mUfeg_60S8a56&y*9;XV&
z8;1o`%)RM)mwP+KEB^{<m~&qlA97#&&gQ<!ZOMIGnw<O2);{+>R3`Ufc4zM6WvATt
z&UCp?k<TI5P8H|Jcf_Z~`@?I;GT_<Z@#4PWmT;|cd2n7a_c&EJ-Z(73GTx;)Rd(v#
zRJk)=Q&pKTPgU)Gn5riK!a>!RwoFxLn><xM)PAbMT$!ngmpiAbcXpbp6iqi(!@?(!
z&c~bc<2&Nh;{D;ZV;S&l@E25n?i+3i*BX}x=M{60{OO#3yiGSO<87xD$J?E~8}G`*
zYrKC~aUSm``!L>Z>FjuSwk_lBLzBll%(WlycvWV+d*{yaPSH-|JuIf<Pt|wx=MO*0
zci=nX)8hT%wPP9ZZ18w--*8K~*0?-4ugG)d{Q9I}vD2^&i#t;smdf;QSZW`yVQF%h
zhovn(3`=J_J1jo@!pjooCJ#%zYCkN!tIV*Z=uZ9&{v>@Lda6uHyh-1I?}$%}_lMVx
zWx%t+<HcWF__-xqYg``WMsa>UV|`!x>=Y}LVv4ozU5ZVPSBh;3bBdkyAw?fLn_`&T
zl487?oMP{4&+p*(>8IB-)SIaL#JtB%{tkRcd|JFeyml-Do(&!^?i+3i*BYs1oL{#~
zZ>pZUzCoR)cfTUwde@eC^{%sF?wt=l^e)Vu?OnXu!hh%g($A_VruR;_M>Rl|jH#7M
z`W^U=__TO`c<opQJR3Y-+&BC^gr9T~&ab+r8ln59cUSL$zEhnqUY(6synG0Ayu$1Q
z|HiNBC)4xMTc*3B8?U;h>Z=~Y>`f>84tz&^TD(8Jb}R#)4IVG<8`8fyzdDMlvnrjc
zhHjwVX1y8umUZGV>u>!WdK!ACbTf3nRi#vG)iKof)V<jtkR-kX-w~e{?+>pX%YbKt
z$BQH<&aYmszNj9fdac^0I-)zN_gn9ZzIVQLeiFSwx(B+=s!6J>>JRF2>eWgd%1!J+
zNOIqS?}$%}_lMVxWx%sRGArj-ic%s`cUPxW*HHymHB@EME!7*Zw}<zg=b^i+3Zy!!
zZlIo~&a8Z)l%%Yu#L6~@B>f%uj`*~Ae|YU!24q^`{K~J&g349ODC+m>o$5oX>8g*a
zHM+OlQEr0jjjE<<zdD%uu+oGwk&>PAsZzFP1$I(QGT(vkh);|6hu4nGX`Ek^gHpH>
zt<s^AmQsrnfV!zVld8Nbrm7Fsk^8JluYRSjtL&fzq#UO-s!XlPK=X~JG&XBYlHY;v
zh);|6hfK(vUo)HLAI%cV<;u9qi^^WgG0Ft$uj*B*`&4u)sk)PTtdfHAj#8VlrxLX0
z08KNR$uwDNK4w42B>x@wj`+05Zi4e`TGr&LDNYlSrVAx?rCudSr7<NNr37_cbuW4;
z{Xn@u=|&k%NmF@Qsb8~;CNRyBnuaygYcs)?lTFHZ;5#B)A<nNog=T)u&zeOw*J(!5
zyrJx_9IQ;Ke5S0U+(7@Q6D!LokttUy9V_E&QqjDnsZq19CV1^3v`x`Yh+Qq4tnWbf
zahzWpB5hl=xzH9s6SbyOO?sM|G=V71D;X<gDzPd3ki5tfN|DOKO7xmbG+k*%)TFC<
zU0Vn3PP75i9!uLeHqLAk*@SX_?dG)S(#}Zx7ws~%7toBYc~!HY<|xf1%J<63%AKS|
zlAMyS^1G%G%~F~OHP>o7*N#D(6774mwbJfQ8$|6vwKwPd+KOt!sBN7#U)myRBcttx
zHU*lxHNk2c)MTY8MTuYOSxHo>P6<hAL!Db)P#s0pS(Q#zLpM-wv)&AS%Q`XJ^Zy6h
Cv5{y1

literal 0
HcmV?d00001

diff --git a/tests/pbm/cdfn2c08.ppm b/tests/pbm/cdfn2c08.ppm
new file mode 100644
index 0000000000000000000000000000000000000000..1a9e0f0a9766add367295fd6a0279a630440489e
GIT binary patch
literal 826
zcmZ{iO-{ow6ohA;U2b5RH5DW(VoxN*qNtl307Qc-p@Pa!#0`9fT)|iH6>^1K!B=>U
z?T|)=N>5Spc`|;U=k--~rf&9|hmXyZdi++mcdNDf+U(!AuRB#NE-w~Eq1N{+_4e}l
ze7@af*QzYCV!6x&2$e%>NEP1`NzVq-#wuq^6eq|h|C;pted2tfEG-#`#bDuWcscCi
zF$h{gClD58)N6=`jhssqNEbO9@Ijo)k<*I$3WHDEu$8hRcjIwf;Iv~aO_?~qYS_RH
zxIB0*+EY*7pd<GkYJm|g)j-Zo6YnkEbxB_LiMKG+JJ+LcHGLwF2?TEG-WlC%-vs?H
z&m-?5cYk=&&pG`WhS9^34|xY)v-@e^)aPKemb?!Ak9{W25RJgi&Nb%Hfamn?kVB0=
z<nt%Dm~$@3rOHJ!q%r3>7kVGhg$Do5k=`CYWE`e^m^(QK#Zz(IsirG;>Rz3?hqU6f
P(p6*|{G1ovCTHLW)Hrd7

literal 0
HcmV?d00001

diff --git a/tests/pbm/cdun2c08.ppm b/tests/pbm/cdun2c08.ppm
new file mode 100644
index 0000000000000000000000000000000000000000..2d7202b89dafb02c9a57f18997a48916bec26dcd
GIT binary patch
literal 3086
zcmb7_+p(iC3_$&zKRVc<gfRD3Kmq$h3u%EC(n4BD3vPiHXn_{IM-m8Q9?8yV7(b7m
z7Jq+lx90w*x!>+TKerUZ2WdiTL+V0$EW((@>eIz*a;>yJq-#O-HD|wzVOV&L&3Ew&
zWA>cG6;9@NzaM|&2xs2MGQI;c=yy6fBZeK?Y>QYPLrNQ&Qh2BEOrhV(D}`ZuQZ7zf
z35P0(Ibq!LAs5(Uirz8T0v{JBt-T6M=)%WrEG%ee)725<qoOEXrk2Nr4{B=!HrWH<
z9Db6g6;Znv&M$K9)u{ZKLbviSo&ndyj7k%P%MyRHvv9!|W8fIZFMQzKnZh)MIE9zW
zdkTXt$`+L*e6EnJ4d1hn7krds3*S<%&OcOTmnsySkXosjfmj)zTUfE$KRc(f%<Q*P
zRQm~HFaE3(xxGGCDnR&sl`7zfxaBAYmO~GIAI2!h5Jf`j1IcLRlE#S85TiI+Kj7SC
z{Kjsvz2RF4D8|-@mA{nKGgYlbIe%oR5buFT<G02~_l8g8aUt_4<1bn1QzLwykul_p
z7vep`xA8mMdlr04@iqUMYT;X1%_`vI^k=@=e;pr^*ru1nhfKJcqdXvH!x!<%@;j~Z
zCvd8m$KT3x)s{))q=||;Nge!e@~dSyXCcn5QtOBC5j*A7sj5?3r>;&7?Z5<wq&^>B
z|Ht*YWKv*pj)L;d1${>zn>j+>o#8je=iwZ`ue^lH{y-Rr2VIGc7$~o(Ut5u{)RFPU
zo4kI!Kfw18X@4W+i~a1OBCCKoj#hJ$WaH1>BIiqn7GZ2Rly7edt1_u8=`&p=(dJ7`
zD&KAP{W&+@w3rJE9~B>eUY?5e7v}co#=94UA>&)cTe~yV{>$T#w(nwmdsdG10ZO&e
ztmpMQi7Wn<D8;+g?3}-kjPn_=`_%Wt6WYo23gd2ZbIf~w&Md}waaJ!@183iB%L^@D
pv1hb=9kUM;S}=<#SGaiX3%mbicA>(i)B_MaLY)cYvm)6M{sER_hvWbN

literal 0
HcmV?d00001

diff --git a/tests/pbm/comment.pgm b/tests/pbm/comment.pgm
new file mode 100644
index 0000000000000000000000000000000000000000..aa9dc71887f79ac4bd39c67cf3af312b332d3cea
GIT binary patch
literal 1083
zcmWGA<x*B~E=o--Nlj5ms#I|I^bJrbOD!tS%+FIW(la#BGcr;L@bgtD$SF<N&CKI6
zHc~J);xaNd<@%2VjDf%f2tt7%4G7AC07?2kt>l$}0GrtzKmf9z*76`b(ESavdnOQo
z<UoLy^0>@q00Mk=Q%fE>zG41e1q3kjX(^A-Y>@r5mWQQtWPk4lf-^t>vY*!Spge|d
LH^^QXptU>zF1GZB

literal 0
HcmV?d00001

diff --git a/tests/pbm/ctfn0g04.pgm b/tests/pbm/ctfn0g04.pgm
new file mode 100644
index 0000000000000000000000000000000000000000..284f8708977d076a30ee1ac63577121f46eac71a
GIT binary patch
literal 1038
zcmWGA<>E3nQZP2+GBP#g`i}&Rfxra_LV+L+2+Dx~N%}vn<duK`o7o*e0J5Lf@*q3V
z{SC5vCJ=z+K!BF=xXfk%0(^E;OCCACVg6nP1TgbyDUZ)=ko~llhoy65fA0o@Ge7{c
SpVsoAJce#J$X*zrwLAbZ{pP~}

literal 0
HcmV?d00001


From 95372dc4f881900e51284f85c63312d75aee8f17 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Wed, 7 Jul 2021 00:29:19 -0700
Subject: [PATCH 073/170] stb.h: fix bug in stb_readdir caused by change to
 stb_strncpy

---
 deprecated/stb.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deprecated/stb.h b/deprecated/stb.h
index 03aec96..06f3168 100644
--- a/deprecated/stb.h
+++ b/deprecated/stb.h
@@ -9230,6 +9230,7 @@ int stb__wildmatch_raw(char *expr, char *candidate, int search, int insensitive)
       // need to allow for non-writeable strings... assume they're small
       if (s - last < 256) {
          stb_strncpy(buffer, last, (int) (s-last+1));
+         buffer[s-last] = 0;
          z = stb__wildmatch_raw2(buffer, candidate, search, insensitive);
       } else {
          *s = 0;

From 84e7a4ae24fdea454c29ae629abf17d41bc4d697 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Wed, 7 Jul 2021 00:43:40 -0700
Subject: [PATCH 074/170] stb_c_lexer: fix NUL eof test so it compiles

---
 stb_c_lexer.h                  |  2 +-
 tests/test_cpp_compilation.cpp | 49 +++++++++++++++++++++++++++++++++-
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/stb_c_lexer.h b/stb_c_lexer.h
index be34f90..4676102 100644
--- a/stb_c_lexer.h
+++ b/stb_c_lexer.h
@@ -608,7 +608,7 @@ int stb_c_lexer_get_token(stb_lexer *lexer)
          // check for EOF
          STB_C_LEX_0_IS_EOF(
             if (*p == 0)
-               return stb__clex_eof(tok);
+               return stb__clex_eof(lexer);
          )
 
       single_char:
diff --git a/tests/test_cpp_compilation.cpp b/tests/test_cpp_compilation.cpp
index c396882..10f27b3 100644
--- a/tests/test_cpp_compilation.cpp
+++ b/tests/test_cpp_compilation.cpp
@@ -7,7 +7,6 @@
 #include "stb_image_write.h"
 #include "stb_perlin.h"
 #include "stb_dxt.h"
-#include "stb_c_lexer.h"
 #include "stb_divide.h"
 #include "stb_herringbone_wang_tile.h"
 #include "stb_ds.h"
@@ -183,3 +182,51 @@ void dummy3(void)
   stb_textedit_initialize_state(0,0);
   stb_textedit_paste(0,0,0,0);
 }
+
+#define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
+#define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
+#define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
+#define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)     CLEX_floatlit
+#define STB_C_LEX_C99_HEX_FLOATS    N   //  "0x{hex}+(.{hex}*)?[pP][-+]?{hex}+     CLEX_floatlit
+#define STB_C_LEX_C_IDENTIFIERS     Y   //  "[_a-zA-Z][_a-zA-Z0-9]*"               CLEX_id
+#define STB_C_LEX_C_DQ_STRINGS      Y   //  double-quote-delimited strings with escapes  CLEX_dqstring
+#define STB_C_LEX_C_SQ_STRINGS      N   //  single-quote-delimited strings with escapes  CLEX_ssstring
+#define STB_C_LEX_C_CHARS           Y   //  single-quote-delimited character with escape CLEX_charlits
+#define STB_C_LEX_C_COMMENTS        Y   //  "/* comment */"
+#define STB_C_LEX_CPP_COMMENTS      Y   //  "// comment to end of line\n"
+#define STB_C_LEX_C_COMPARISONS     Y   //  "==" CLEX_eq  "!=" CLEX_noteq   "<=" CLEX_lesseq  ">=" CLEX_greatereq
+#define STB_C_LEX_C_LOGICAL         Y   //  "&&"  CLEX_andand   "||"  CLEX_oror
+#define STB_C_LEX_C_SHIFTS          Y   //  "<<"  CLEX_shl      ">>"  CLEX_shr
+#define STB_C_LEX_C_INCREMENTS      Y   //  "++"  CLEX_plusplus "--"  CLEX_minusminus
+#define STB_C_LEX_C_ARROW           Y   //  "->"  CLEX_arrow
+#define STB_C_LEX_EQUAL_ARROW       N   //  "=>"  CLEX_eqarrow
+#define STB_C_LEX_C_BITWISEEQ       Y   //  "&="  CLEX_andeq    "|="  CLEX_oreq     "^="  CLEX_xoreq
+#define STB_C_LEX_C_ARITHEQ         Y   //  "+="  CLEX_pluseq   "-="  CLEX_minuseq
+                                        //  "*="  CLEX_muleq    "/="  CLEX_diveq    "%=" CLEX_modeq
+                                        //  if both STB_C_LEX_SHIFTS & STB_C_LEX_ARITHEQ:
+                                        //                      "<<=" CLEX_shleq    ">>=" CLEX_shreq
+
+#define STB_C_LEX_PARSE_SUFFIXES    N   // letters after numbers are parsed as part of those numbers, and must be in suffix list below
+#define STB_C_LEX_DECIMAL_SUFFIXES  ""  // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
+#define STB_C_LEX_HEX_SUFFIXES      ""  // e.g. "uUlL"
+#define STB_C_LEX_OCTAL_SUFFIXES    ""  // e.g. "uUlL"
+#define STB_C_LEX_FLOAT_SUFFIXES    ""  //
+
+#define STB_C_LEX_0_IS_EOF             Y  // if Y, ends parsing at '\0'; if N, returns '\0' as token
+#define STB_C_LEX_INTEGERS_AS_DOUBLES  N  // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_STDLIB==N
+#define STB_C_LEX_MULTILINE_DSTRINGS   N  // allow newlines in double-quoted strings
+#define STB_C_LEX_MULTILINE_SSTRINGS   N  // allow newlines in single-quoted strings
+#define STB_C_LEX_USE_STDLIB           Y  // use strtod,strtol for parsing #s; otherwise inaccurate hack
+#define STB_C_LEX_DOLLAR_IDENTIFIER    Y  // allow $ as an identifier character
+#define STB_C_LEX_FLOAT_NO_DECIMAL     Y  // allow floats that have no decimal point if they have an exponent
+
+#define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES  N   // if Y, all CLEX_ token names are defined, even if never returned
+                                              // leaving it as N should help you catch config bugs
+
+#define STB_C_LEX_DISCARD_PREPROCESSOR    Y   // discard C-preprocessor directives (e.g. after prepocess
+                                              // still have #line, #pragma, etc)
+
+//#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace
+
+#define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
+#include "stb_c_lexer.h"

From ca0e6d06d8ae055823a5b11fe64a0b05cb6ae3e3 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Wed, 7 Jul 2021 00:57:43 -0700
Subject: [PATCH 075/170] stb_c_lexer: don't define default macro definitions
 on non-implementation includes

---
 stb_c_lexer.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/stb_c_lexer.h b/stb_c_lexer.h
index 4676102..0cb9f39 100644
--- a/stb_c_lexer.h
+++ b/stb_c_lexer.h
@@ -42,6 +42,7 @@
 //
 //   See end of file for license information.
 
+#ifdef STB_C_LEXER_IMPLEMENTATION
 #ifndef STB_C_LEXER_DEFINITIONS
 // to change the default parsing rules, copy the following lines
 // into your C/C++ file *before* including this, and then replace
@@ -96,6 +97,7 @@
 #define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
 // --END--
 
+#endif
 #endif
 
 #ifndef INCLUDE_STB_C_LEXER_H

From fc5b791228c85cae0a4652254b78e35846504fe1 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Wed, 7 Jul 2021 01:21:37 -0700
Subject: [PATCH 076/170] stb_truetype: fix SDF handling of multiple move_to
 commands in a row

---
 stb_truetype.h        | 19 +++++++++++--------
 tests/test_truetype.c | 30 +++++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index c3c4f8e..ce32bb3 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -4587,18 +4587,17 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
             for (i=0; i < num_verts; ++i) {
                float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y;
 
-               // check against every point here rather than inside line/curve primitives -- @TODO: wrong if multiple 'moves' in a row produce a garbage point, and given culling, probably more efficient to do within line/curve
-               float dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
-               if (dist2 < min_dist*min_dist)
-                  min_dist = (float) STBTT_sqrt(dist2);
-
-               if (verts[i].type == STBTT_vline) {
+               if (verts[i].type == STBTT_vline && precompute[i] != 0.0f) {
                   float x1 = verts[i-1].x*scale_x, y1 = verts[i-1].y*scale_y;
 
+                  float dist,dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
+                  if (dist2 < min_dist*min_dist)
+                     min_dist = (float) STBTT_sqrt(dist2);
+
                   // coarse culling against bbox
                   //if (sx > STBTT_min(x0,x1)-min_dist && sx < STBTT_max(x0,x1)+min_dist &&
                   //    sy > STBTT_min(y0,y1)-min_dist && sy < STBTT_max(y0,y1)+min_dist)
-                  float dist = (float) STBTT_fabs((x1-x0)*(y0-sy) - (y1-y0)*(x0-sx)) * precompute[i];
+                  dist = (float) STBTT_fabs((x1-x0)*(y0-sy) - (y1-y0)*(x0-sx)) * precompute[i];
                   STBTT_assert(i != 0);
                   if (dist < min_dist) {
                      // check position along line
@@ -4625,7 +4624,7 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
                      float ax = x1-x0, ay = y1-y0;
                      float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
                      float mx = x0 - sx, my = y0 - sy;
-                     float res[3],px,py,t,it;
+                     float res[3],px,py,t,it,dist2;
                      float a_inv = precompute[i];
                      if (a_inv == 0.0) { // if a_inv is 0, it's 2nd degree so use quadratic formula
                         float a = 3*(ax*bx + ay*by);
@@ -4652,6 +4651,10 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
                         float d = (mx*ax+my*ay) * a_inv;
                         num = stbtt__solve_cubic(b, c, d, res);
                      }
+                     dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy);
+                     if (dist2 < min_dist*min_dist)
+                        min_dist = (float) STBTT_sqrt(dist2);
+
                      if (num >= 1 && res[0] >= 0.0f && res[0] <= 1.0f) {
                         t = res[0], it = 1.0f - t;
                         px = it*it*x0 + 2*t*it*x1 + t*t*x2;
diff --git a/tests/test_truetype.c b/tests/test_truetype.c
index 1ac147b..5f03d3b 100644
--- a/tests/test_truetype.c
+++ b/tests/test_truetype.c
@@ -42,9 +42,13 @@ int main(int argc, char **argv)
    //debug();
 
    // @TODO: why is minglui.ttc failing? 
-   fread(ttf_buffer, 1, 1<<25, fopen(argc > 3 ? argv[3] : "c:/windows/fonts/mingliu.ttc", "rb"));
+   //fread(ttf_buffer, 1, 1<<25, fopen(argc > 3 ? argv[3] : "c:/windows/fonts/mingliu.ttc", "rb"));
 
-   //fread(ttf_buffer, 1, 1<<25, fopen(argc > 3 ? argv[3] : "c:/x/DroidSansMono.ttf", "rb"));
+   fread(ttf_buffer, 1, 1<<25, fopen(argc > 3 ? argv[3] : "c:/windows/fonts/DejaVuSans.ttf", "rb"));
+
+   stbtt_InitFont(&font, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer,0));
+
+#if 0
    {
       static stbtt_pack_context pc;
       static stbtt_packedchar cd[256];
@@ -54,6 +58,27 @@ int main(int argc, char **argv)
       stbtt_PackFontRange(&pc, ttf_buffer, 0, 32.0, 0, 256, cd);
       stbtt_PackEnd(&pc);
    }
+#endif
+
+   {
+      static stbtt_pack_context pc;
+      static stbtt_packedchar cd[256];
+      static unsigned char atlas[1024*1024];
+      unsigned char *data;
+
+      stbtt_PackBegin(&pc, atlas, 1024,1024,1024,1,NULL);
+      stbtt_PackFontRange(&pc, ttf_buffer, 0, 32.0, 'u', 1, cd);
+      stbtt_PackEnd(&pc);
+
+      data = stbtt_GetCodepointSDF(&font, stbtt_ScaleForPixelHeight(&font,32.0), 'u', 4, 128, 128/4, &w,&h,&i,&j);
+      for (j=0; j < h; ++j) {
+         for (i=0; i < w; ++i) {
+            putchar(" .:ioVM@"[data[j*w+i]>>5]);
+         }
+         putchar('\n');
+      }
+      return 0;
+   }
 
 #if 0
    stbtt_BakeFontBitmap(ttf_buffer,stbtt_GetFontOffsetForIndex(ttf_buffer,0), 40.0, temp_bitmap[0],BITMAP_W,BITMAP_H, 32,96, cdata); // no guarantee this fits!
@@ -92,7 +117,6 @@ int main(int argc, char **argv)
    return 0;
 #endif
 
-   stbtt_InitFont(&font, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer,0));
    bitmap = stbtt_GetCodepointBitmap(&font, 0,stbtt_ScaleForPixelHeight(&font, (float)s), c, &w, &h, 0,0);
 
    for (j=0; j < h; ++j) {

From c38ec91d229cbe56167f8af865c05b3d92db705f Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 01:27:15 -0700
Subject: [PATCH 077/170] stb_divide: Fix integer overflow issues

We've established the signs of values before so we can carefully
jiggle the expressions to be guaranteed overflow-free; the tests
for <0 here were meant to check if the value was "still negative",
i.e. if the sum did not underflow below INT_MIN, and we can
rewrite that using algebra to be overflow-free.

We need an extra case in the Euclidean dvision for
INT_MIN / INT_MIN which is a bit annoying but trivial; finally,
for two's complement platforms, note that abs(x) = (x > 0) ? x : -x
does not work (since for x=INT_MIN, -x still gives INT_MIN),
but the equivalent formulation for _negative_ absolute value
(x < 0) ? x : -x is always in range and overflow-free, so
rewrite the relevant expressions using that negative absolute
value instead.

Fixes issue #741.
---
 stb_divide.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/stb_divide.h b/stb_divide.h
index f8b1f3e..c8a1d92 100644
--- a/stb_divide.h
+++ b/stb_divide.h
@@ -166,15 +166,15 @@ int stb_div_floor(int v1, int v2)
    return v1/v2;
    #else
    if (v1 >= 0 && v2 < 0) {
-      if ((-v1)+v2+1 < 0) // check if increasing v1's magnitude overflows
-         return -stb__div(-v1+v2+1,v2); // nope, so just compute it
+      if (v2 + 1 >= INT_MIN + v1) // check if increasing v1's magnitude overflows
+         return -stb__div((v2+1)-v1,v2); // nope, so just compute it
       else
          return -stb__div(-v1,v2) + ((-v1)%v2 ? -1 : 0);
    }
    if (v1 < 0 && v2 >= 0) {
       if (v1 != INT_MIN) {
-         if (v1-v2+1 < 0) // check if increasing v1's magnitude overflows
-            return -stb__div(v1-v2+1,-v2); // nope, so just compute it
+         if (v1 + 1 >= INT_MIN + v2) // check if increasing v1's magnitude overflows
+            return -stb__div((v1+1)-v2,-v2); // nope, so just compute it
          else
             return -stb__div(-v1,v2) + (stb__mod(v1,-v2) ? -1 : 0);
       } else // it must be possible to compute -(v1+v2) without overflowing
@@ -209,8 +209,10 @@ int stb_div_eucl(int v1, int v2)
    else // if v1 is INT_MIN, we have to move away from overflow place
       if (v2 >= 0)
          q = -stb__div(-(v1+v2),v2)-1, r = -stb__mod(-(v1+v2),v2);
-      else
+      else if (v2 != INT_MIN)
          q = stb__div(-(v1-v2),-v2)+1, r = -stb__mod(-(v1-v2),-v2);
+      else // for INT_MIN / INT_MIN, we need to be extra-careful to avoid overflow
+         q = 1, r = 0;
    #endif
    if (r >= 0)
       return q;
@@ -228,13 +230,13 @@ int stb_mod_trunc(int v1, int v2)
       if (r >= 0)
          return r;
       else
-         return r + (v2 > 0 ? v2 : -v2);
+         return r - (v2 < 0 ? v2 : -v2);
    } else {    // modulus result should always be negative
       int r = stb__mod(v1,v2);
       if (r <= 0)
          return r;
       else
-         return r - (v2 > 0 ? v2 : -v2);
+         return r + (v2 < 0 ? v2 : -v2);
    }
    #endif
 }
@@ -267,7 +269,7 @@ int stb_mod_eucl(int v1, int v2)
    if (r >= 0)
       return r;
    else
-      return r + (v2 > 0 ? v2 : -v2); // abs()
+      return r - (v2 < 0 ? v2 : -v2); // negative abs() [to avoid overflow]
 }
 
 #ifdef STB_DIVIDE_TEST

From e7f9f92c9d466fac3b42a2b4a3e6849df7bea51c Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 01:33:35 -0700
Subject: [PATCH 078/170] stb_divide: Update changelog

---
 stb_divide.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stb_divide.h b/stb_divide.h
index c8a1d92..455aacf 100644
--- a/stb_divide.h
+++ b/stb_divide.h
@@ -1,8 +1,9 @@
-// stb_divide.h - v0.93 - public domain - Sean Barrett, Feb 2010
+// stb_divide.h - v0.94 - public domain - Sean Barrett, Feb 2010
 // Three kinds of divide/modulus of signed integers.
 //
 // HISTORY
 //
+//   v0.94              Fix integer overflow issues 
 //   v0.93  2020-02-02  Write useful exit() value from main()
 //   v0.92  2019-02-25  Fix warning
 //   v0.91  2010-02-27  Fix euclidean division by INT_MIN for non-truncating C

From 14c224c84e51af5b4932f2a410e66a06f20c8cf7 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 01:45:29 -0700
Subject: [PATCH 079/170] stb_image_resize: Remove ill-advised asserts.

These mostly add very little and have caused problems for people,
nor does it make sense to require this when the underlying
computations are performed in floating-point arithmetic depending
on ratios of user-passed in image dimensions.

Arbitrary absolute epsilons here would just be garbage; we could
try and compute desired relative error bounds based on the
determined scale values, but this still leaves the questions of
what purpose this would even serve, which is unclear.

Leave the filter kernel asserts as comment for documentation
of what the behavior would be with exact math, but don't actually
bother asserting here.

Fixes issue #736.
---
 stb_image_resize.h | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/stb_image_resize.h b/stb_image_resize.h
index 42a8efb..17e46b7 100644
--- a/stb_image_resize.h
+++ b/stb_image_resize.h
@@ -1064,7 +1064,11 @@ static void stbir__calculate_coefficients_upsample(stbir_filter filter, float sc
         total_filter += coefficient_group[i];
     }
 
-    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(in_last_pixel + 1) + 0.5f - in_center_of_out, 1/scale) == 0);
+    // NOTE(fg): Not actually true in general, nor is there any reason to expect it should be.
+    // It would be true in exact math but is at best approximately true in floating-point math,
+    // and it would not make sense to try and put actual bounds on this here because it depends
+    // on the image aspect ratio which can get pretty extreme.
+    //STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(in_last_pixel + 1) + 0.5f - in_center_of_out, 1/scale) == 0);
 
     STBIR_ASSERT(total_filter > 0.9);
     STBIR_ASSERT(total_filter < 1.1f); // Make sure it's not way off.
@@ -1089,7 +1093,7 @@ static void stbir__calculate_coefficients_downsample(stbir_filter filter, float
 {
     int i;
 
-     STBIR_ASSERT(out_last_pixel - out_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(scale_ratio) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
+    STBIR_ASSERT(out_last_pixel - out_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(scale_ratio) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
 
     contributor->n0 = out_first_pixel;
     contributor->n1 = out_last_pixel;
@@ -1103,7 +1107,11 @@ static void stbir__calculate_coefficients_downsample(stbir_filter filter, float
         coefficient_group[i] = stbir__filter_info_table[filter].kernel(x, scale_ratio) * scale_ratio;
     }
 
-    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(out_last_pixel + 1) + 0.5f - out_center_of_in, scale_ratio) == 0);
+    // NOTE(fg): Not actually true in general, nor is there any reason to expect it should be.
+    // It would be true in exact math but is at best approximately true in floating-point math,
+    // and it would not make sense to try and put actual bounds on this here because it depends
+    // on the image aspect ratio which can get pretty extreme.
+    //STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(out_last_pixel + 1) + 0.5f - out_center_of_in, scale_ratio) == 0);
 
     for (i = out_last_pixel - out_first_pixel; i >= 0; i--)
     {
@@ -1552,7 +1560,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                 {
                     int out_pixel_index = k * 1;
                     float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                     output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                 }
             }
@@ -1573,7 +1580,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                 {
                     int out_pixel_index = k * 2;
                     float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                     output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                     output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                 }
@@ -1595,7 +1601,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                 {
                     int out_pixel_index = k * 3;
                     float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                     output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                     output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                     output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
@@ -1618,7 +1623,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                 {
                     int out_pixel_index = k * 4;
                     float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                     output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
                     output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
                     output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
@@ -1643,7 +1647,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float
                     int c;
                     int out_pixel_index = k * channels;
                     float coefficient = horizontal_coefficients[coefficient_group + k - n0];
-                    STBIR_ASSERT(coefficient != 0);
                     for (c = 0; c < channels; c++)
                         output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
                 }

From db6e91b7d867f33eb5e9c5012324e2702a1eeedd Mon Sep 17 00:00:00 2001
From: Andrew Kensler <andrew@eastfarthing.com>
Date: Sat, 3 Jul 2021 23:39:43 -0700
Subject: [PATCH 080/170] Store uncompressed if compression was worse

If the data was uncompressible and this deflate implementation expanded
by more than the overhead of simply storing it uncompressed, fall back
to deflate's uncompressed storage mode.  This bounds the maximum deflated
size at the original size plus an overhead of 6 fixed bytes with another
5 bytes per 32767 byte block.

Fixes issue #948.
---
 stb_image_write.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/stb_image_write.h b/stb_image_write.h
index 271d4ac..a909e82 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -140,6 +140,7 @@ CREDITS:
       Ivan Tikhonov
       github:ignotion
       Adam Schackart
+      Andrew Kensler
 
 LICENSE
 
@@ -969,6 +970,23 @@ STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, i
       (void) stbiw__sbfree(hash_table[i]);
    STBIW_FREE(hash_table);
 
+   // store uncompressed instead if compression was worse
+   if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) {
+      stbiw__sbn(out) = 2;  // truncate to DEFLATE 32K window and FLEVEL = 1
+      for (j = 0; j < data_len;) {
+         int blocklen = data_len - j;
+         if (blocklen > 32767) blocklen = 32767;
+         stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN
+         stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8));
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN
+         stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8));
+         memcpy(out+stbiw__sbn(out), data+j, blocklen);
+         stbiw__sbn(out) += blocklen;
+         j += blocklen;
+      }
+   }
+
    {
       // compute adler32 on input
       unsigned int s1=1, s2=0;
@@ -1599,6 +1617,8 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
 #endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
+      1.15  (          )
+             make Deflate code emit uncompressed blocks when it would otherwise expand
       1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
       1.13
       1.12

From e59050549285deb717d8a2245586c0b8c2a3d38a Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 02:45:41 -0700
Subject: [PATCH 081/170] stb_truetype: GCC warning fix

GCC warns about a potentially uninitialized variable here. It's
not (or at least I don't see how), but fix it anyway.
---
 stb_truetype.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index ce32bb3..8b3bd87 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -4624,7 +4624,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc
                      float ax = x1-x0, ay = y1-y0;
                      float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2;
                      float mx = x0 - sx, my = y0 - sy;
-                     float res[3],px,py,t,it,dist2;
+                     float res[3] = {0.f,0.f,0.f};
+                     float px,py,t,it,dist2;
                      float a_inv = precompute[i];
                      if (a_inv == 0.0) { // if a_inv is 0, it's 2nd degree so use quadratic formula
                         float a = 3*(ax*bx + ay*by);

From 1252a3e6415353bb8f82cc52fc392c4073634eca Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 02:58:28 -0700
Subject: [PATCH 082/170] stb_truetype: GPOS parsing fixes

Glyphs not assigned a glyph class should default to class 0 as
per the OpenType spec. Also, change the code to treat malformed
data as an error to be handled, not an assert, and change the
way the version checking works.

Fixes the issue mentioned in PR #1035. Also, this part of the
code is indented incorrectly; will fix that in a subsequent
commit.
---
 stb_truetype.h | 99 +++++++++++++++++++++++---------------------------
 1 file changed, 46 insertions(+), 53 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 8b3bd87..f959774 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -2479,12 +2479,13 @@ static stbtt_int32  stbtt__GetGlyphClass(stbtt_uint8 *classDefTable, int glyph)
         } break;
 
         default: {
-            // There are no other cases.
-            STBTT_assert(0);
+            // Unsupported defition type; return an error.
+            return -1;
         } break;
     }
 
-    return -1;
+    // "All glyphs not assigned to a class fall into class 0". (OpenType spec)
+    return 0;
 }
 
 // Define to STBTT_assert(x) if you want to break on unimplemented formats.
@@ -2533,75 +2534,67 @@ static stbtt_int32  stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, i
                             int straw, needle;
                             stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
                             stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
-                            stbtt_int32 valueRecordPairSizeInBytes = 2;
-                            stbtt_uint16 pairSetCount = ttUSHORT(table + 8);
-                            stbtt_uint16 pairPosOffset = ttUSHORT(table + 10 + 2 * coverageIndex);
-                            stbtt_uint8 *pairValueTable = table + pairPosOffset;
-                            stbtt_uint16 pairValueCount = ttUSHORT(pairValueTable);
-                            stbtt_uint8 *pairValueArray = pairValueTable + 2;
-                            // TODO: Support more formats.
-                            STBTT_GPOS_TODO_assert(valueFormat1 == 4);
-                            if (valueFormat1 != 4) return 0;
-                            STBTT_GPOS_TODO_assert(valueFormat2 == 0);
-                            if (valueFormat2 != 0) return 0;
+                            if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                                stbtt_int32 valueRecordPairSizeInBytes = 2;
+                                stbtt_uint16 pairSetCount = ttUSHORT(table + 8);
+                                stbtt_uint16 pairPosOffset = ttUSHORT(table + 10 + 2 * coverageIndex);
+                                stbtt_uint8 *pairValueTable = table + pairPosOffset;
+                                stbtt_uint16 pairValueCount = ttUSHORT(pairValueTable);
+                                stbtt_uint8 *pairValueArray = pairValueTable + 2;
 
-                            STBTT_assert(coverageIndex < pairSetCount);
-                            STBTT__NOTUSED(pairSetCount);
+                                if (coverageIndex >= pairSetCount) return 0;
 
-                            needle=glyph2;
-                            r=pairValueCount-1;
-                            l=0;
+                                needle=glyph2;
+                                r=pairValueCount-1;
+                                l=0;
 
-                            // Binary search.
-                            while (l <= r) {
-                                stbtt_uint16 secondGlyph;
-                                stbtt_uint8 *pairValue;
-                                m = (l + r) >> 1;
-                                pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m;
-                                secondGlyph = ttUSHORT(pairValue);
-                                straw = secondGlyph;
-                                if (needle < straw)
-                                    r = m - 1;
-                                else if (needle > straw)
-                                    l = m + 1;
-                                else {
-                                    stbtt_int16 xAdvance = ttSHORT(pairValue + 2);
-                                    return xAdvance;
+                                // Binary search.
+                                while (l <= r) {
+                                    stbtt_uint16 secondGlyph;
+                                    stbtt_uint8 *pairValue;
+                                    m = (l + r) >> 1;
+                                    pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m;
+                                    secondGlyph = ttUSHORT(pairValue);
+                                    straw = secondGlyph;
+                                    if (needle < straw)
+                                        r = m - 1;
+                                    else if (needle > straw)
+                                        l = m + 1;
+                                    else {
+                                        stbtt_int16 xAdvance = ttSHORT(pairValue + 2);
+                                        return xAdvance;
+                                    }
                                 }
-                            }
+                            } else
+                                return 0;
                         } break;
 
                         case 2: {
                             stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
                             stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+                            if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                                stbtt_uint16 classDef1Offset = ttUSHORT(table + 8);
+                                stbtt_uint16 classDef2Offset = ttUSHORT(table + 10);
+                                int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1);
+                                int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2);
 
-                            stbtt_uint16 classDef1Offset = ttUSHORT(table + 8);
-                            stbtt_uint16 classDef2Offset = ttUSHORT(table + 10);
-                            int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1);
-                            int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2);
+                                stbtt_uint16 class1Count = ttUSHORT(table + 12);
+                                stbtt_uint16 class2Count = ttUSHORT(table + 14);
 
-                            stbtt_uint16 class1Count = ttUSHORT(table + 12);
-                            stbtt_uint16 class2Count = ttUSHORT(table + 14);
-                            STBTT_assert(glyph1class < class1Count);
-                            STBTT_assert(glyph2class < class2Count);
+                                if (glyph1class < 0 || glyph1class >= class1Count) return 0; // malformed
+                                if (glyph2class < 0 || glyph2class >= class2Count) return 0; // malformed
 
-                            // TODO: Support more formats.
-                            STBTT_GPOS_TODO_assert(valueFormat1 == 4);
-                            if (valueFormat1 != 4) return 0;
-                            STBTT_GPOS_TODO_assert(valueFormat2 == 0);
-                            if (valueFormat2 != 0) return 0;
-
-                            if (glyph1class >= 0 && glyph1class < class1Count && glyph2class >= 0 && glyph2class < class2Count) {
                                 stbtt_uint8 *class1Records = table + 16;
                                 stbtt_uint8 *class2Records = class1Records + 2 * (glyph1class * class2Count);
                                 stbtt_int16 xAdvance = ttSHORT(class2Records + 2 * glyph2class);
                                 return xAdvance;
-                            }
+                            } else
+                                return 0;
                         } break;
 
                         default: {
-                            // There are no other cases.
-                            STBTT_assert(0);
+                            // Unsupported definition type
+                            return 0;
                             break;
                         };
                     }

From 9fe3b4bb52ff671809d72afa7e5dead88d292cb1 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 03:12:33 -0700
Subject: [PATCH 083/170] stb_truetype: GPOS handling formatting changes

Was using 4-character spaces and otherwise formatted unlike the
rest of the code, fix this. Also get rid of the outer switch in
GetGlyphGPOSInfoAdvance with just one case; just use an if.
No behavioral changes.
---
 stb_truetype.h | 363 ++++++++++++++++++++++++-------------------------
 1 file changed, 177 insertions(+), 186 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index f959774..783ecfa 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -2353,7 +2353,7 @@ STBTT_DEF int stbtt_GetKerningTable(const stbtt_fontinfo *info, stbtt_kerningent
    return length;
 }
 
-static int  stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+static int stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
 {
    stbtt_uint8 *data = info->data + info->kern;
    stbtt_uint32 needle, straw;
@@ -2383,232 +2383,223 @@ static int  stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph
    return 0;
 }
 
-static stbtt_int32  stbtt__GetCoverageIndex(stbtt_uint8 *coverageTable, int glyph)
+static stbtt_int32 stbtt__GetCoverageIndex(stbtt_uint8 *coverageTable, int glyph)
 {
-    stbtt_uint16 coverageFormat = ttUSHORT(coverageTable);
-    switch(coverageFormat) {
-        case 1: {
-            stbtt_uint16 glyphCount = ttUSHORT(coverageTable + 2);
+   stbtt_uint16 coverageFormat = ttUSHORT(coverageTable);
+   switch (coverageFormat) {
+      case 1: {
+         stbtt_uint16 glyphCount = ttUSHORT(coverageTable + 2);
 
-            // Binary search.
-            stbtt_int32 l=0, r=glyphCount-1, m;
-            int straw, needle=glyph;
-            while (l <= r) {
-                stbtt_uint8 *glyphArray = coverageTable + 4;
-                stbtt_uint16 glyphID;
-                m = (l + r) >> 1;
-                glyphID = ttUSHORT(glyphArray + 2 * m);
-                straw = glyphID;
-                if (needle < straw)
-                    r = m - 1;
-                else if (needle > straw)
-                    l = m + 1;
-                else {
-                     return m;
-                }
+         // Binary search.
+         stbtt_int32 l=0, r=glyphCount-1, m;
+         int straw, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *glyphArray = coverageTable + 4;
+            stbtt_uint16 glyphID;
+            m = (l + r) >> 1;
+            glyphID = ttUSHORT(glyphArray + 2 * m);
+            straw = glyphID;
+            if (needle < straw)
+               r = m - 1;
+            else if (needle > straw)
+               l = m + 1;
+            else {
+               return m;
             }
-        } break;
+         }
+         break;
+      }
 
-        case 2: {
-            stbtt_uint16 rangeCount = ttUSHORT(coverageTable + 2);
-            stbtt_uint8 *rangeArray = coverageTable + 4;
+      case 2: {
+         stbtt_uint16 rangeCount = ttUSHORT(coverageTable + 2);
+         stbtt_uint8 *rangeArray = coverageTable + 4;
 
-            // Binary search.
-            stbtt_int32 l=0, r=rangeCount-1, m;
-            int strawStart, strawEnd, needle=glyph;
-            while (l <= r) {
-                stbtt_uint8 *rangeRecord;
-                m = (l + r) >> 1;
-                rangeRecord = rangeArray + 6 * m;
-                strawStart = ttUSHORT(rangeRecord);
-                strawEnd = ttUSHORT(rangeRecord + 2);
-                if (needle < strawStart)
-                    r = m - 1;
-                else if (needle > strawEnd)
-                    l = m + 1;
-                else {
-                    stbtt_uint16 startCoverageIndex = ttUSHORT(rangeRecord + 4);
-                    return startCoverageIndex + glyph - strawStart;
-                }
+         // Binary search.
+         stbtt_int32 l=0, r=rangeCount-1, m;
+         int strawStart, strawEnd, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *rangeRecord;
+            m = (l + r) >> 1;
+            rangeRecord = rangeArray + 6 * m;
+            strawStart = ttUSHORT(rangeRecord);
+            strawEnd = ttUSHORT(rangeRecord + 2);
+            if (needle < strawStart)
+               r = m - 1;
+            else if (needle > strawEnd)
+               l = m + 1;
+            else {
+               stbtt_uint16 startCoverageIndex = ttUSHORT(rangeRecord + 4);
+               return startCoverageIndex + glyph - strawStart;
             }
-        } break;
+         }
+         break;
+      }
 
-        default: {
-            // There are no other cases.
-            STBTT_assert(0);
-        } break;
-    }
+      default: return -1; // unsupported
+   }
 
-    return -1;
+   return -1;
 }
 
 static stbtt_int32  stbtt__GetGlyphClass(stbtt_uint8 *classDefTable, int glyph)
 {
-    stbtt_uint16 classDefFormat = ttUSHORT(classDefTable);
-    switch(classDefFormat)
-    {
-        case 1: {
-            stbtt_uint16 startGlyphID = ttUSHORT(classDefTable + 2);
-            stbtt_uint16 glyphCount = ttUSHORT(classDefTable + 4);
-            stbtt_uint8 *classDef1ValueArray = classDefTable + 6;
+   stbtt_uint16 classDefFormat = ttUSHORT(classDefTable);
+   switch (classDefFormat)
+   {
+      case 1: {
+         stbtt_uint16 startGlyphID = ttUSHORT(classDefTable + 2);
+         stbtt_uint16 glyphCount = ttUSHORT(classDefTable + 4);
+         stbtt_uint8 *classDef1ValueArray = classDefTable + 6;
 
-            if (glyph >= startGlyphID && glyph < startGlyphID + glyphCount)
-                return (stbtt_int32)ttUSHORT(classDef1ValueArray + 2 * (glyph - startGlyphID));
-        } break;
+         if (glyph >= startGlyphID && glyph < startGlyphID + glyphCount)
+            return (stbtt_int32)ttUSHORT(classDef1ValueArray + 2 * (glyph - startGlyphID));
+         break;
+      }
 
-        case 2: {
-            stbtt_uint16 classRangeCount = ttUSHORT(classDefTable + 2);
-            stbtt_uint8 *classRangeRecords = classDefTable + 4;
+      case 2: {
+         stbtt_uint16 classRangeCount = ttUSHORT(classDefTable + 2);
+         stbtt_uint8 *classRangeRecords = classDefTable + 4;
 
-            // Binary search.
-            stbtt_int32 l=0, r=classRangeCount-1, m;
-            int strawStart, strawEnd, needle=glyph;
-            while (l <= r) {
-                stbtt_uint8 *classRangeRecord;
-                m = (l + r) >> 1;
-                classRangeRecord = classRangeRecords + 6 * m;
-                strawStart = ttUSHORT(classRangeRecord);
-                strawEnd = ttUSHORT(classRangeRecord + 2);
-                if (needle < strawStart)
-                    r = m - 1;
-                else if (needle > strawEnd)
-                    l = m + 1;
-                else
-                    return (stbtt_int32)ttUSHORT(classRangeRecord + 4);
-            }
-        } break;
+         // Binary search.
+         stbtt_int32 l=0, r=classRangeCount-1, m;
+         int strawStart, strawEnd, needle=glyph;
+         while (l <= r) {
+            stbtt_uint8 *classRangeRecord;
+            m = (l + r) >> 1;
+            classRangeRecord = classRangeRecords + 6 * m;
+            strawStart = ttUSHORT(classRangeRecord);
+            strawEnd = ttUSHORT(classRangeRecord + 2);
+            if (needle < strawStart)
+               r = m - 1;
+            else if (needle > strawEnd)
+               l = m + 1;
+            else
+               return (stbtt_int32)ttUSHORT(classRangeRecord + 4);
+         }
+         break;
+      }
 
-        default: {
-            // Unsupported defition type; return an error.
-            return -1;
-        } break;
-    }
+      default:
+         return -1; // Unsupported definition type, return an error.
+   }
 
-    // "All glyphs not assigned to a class fall into class 0". (OpenType spec)
-    return 0;
+   // "All glyphs not assigned to a class fall into class 0". (OpenType spec)
+   return 0;
 }
 
 // Define to STBTT_assert(x) if you want to break on unimplemented formats.
 #define STBTT_GPOS_TODO_assert(x)
 
-static stbtt_int32  stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
+static stbtt_int32 stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2)
 {
-    stbtt_uint16 lookupListOffset;
-    stbtt_uint8 *lookupList;
-    stbtt_uint16 lookupCount;
-    stbtt_uint8 *data;
-    stbtt_int32 i;
+   stbtt_uint16 lookupListOffset;
+   stbtt_uint8 *lookupList;
+   stbtt_uint16 lookupCount;
+   stbtt_uint8 *data;
+   stbtt_int32 i, sti;
 
-    if (!info->gpos) return 0;
+   if (!info->gpos) return 0;
 
-    data = info->data + info->gpos;
+   data = info->data + info->gpos;
 
-    if (ttUSHORT(data+0) != 1) return 0; // Major version 1
-    if (ttUSHORT(data+2) != 0) return 0; // Minor version 0
+   if (ttUSHORT(data+0) != 1) return 0; // Major version 1
+   if (ttUSHORT(data+2) != 0) return 0; // Minor version 0
 
-    lookupListOffset = ttUSHORT(data+8);
-    lookupList = data + lookupListOffset;
-    lookupCount = ttUSHORT(lookupList);
+   lookupListOffset = ttUSHORT(data+8);
+   lookupList = data + lookupListOffset;
+   lookupCount = ttUSHORT(lookupList);
 
-    for (i=0; i<lookupCount; ++i) {
-        stbtt_uint16 lookupOffset = ttUSHORT(lookupList + 2 + 2 * i);
-        stbtt_uint8 *lookupTable = lookupList + lookupOffset;
+   for (i=0; i<lookupCount; ++i) {
+      stbtt_uint16 lookupOffset = ttUSHORT(lookupList + 2 + 2 * i);
+      stbtt_uint8 *lookupTable = lookupList + lookupOffset;
 
-        stbtt_uint16 lookupType = ttUSHORT(lookupTable);
-        stbtt_uint16 subTableCount = ttUSHORT(lookupTable + 4);
-        stbtt_uint8 *subTableOffsets = lookupTable + 6;
-        switch(lookupType) {
-            case 2: { // Pair Adjustment Positioning Subtable
-                stbtt_int32 sti;
-                for (sti=0; sti<subTableCount; sti++) {
-                    stbtt_uint16 subtableOffset = ttUSHORT(subTableOffsets + 2 * sti);
-                    stbtt_uint8 *table = lookupTable + subtableOffset;
-                    stbtt_uint16 posFormat = ttUSHORT(table);
-                    stbtt_uint16 coverageOffset = ttUSHORT(table + 2);
-                    stbtt_int32 coverageIndex = stbtt__GetCoverageIndex(table + coverageOffset, glyph1);
-                    if (coverageIndex == -1) continue;
+      stbtt_uint16 lookupType = ttUSHORT(lookupTable);
+      stbtt_uint16 subTableCount = ttUSHORT(lookupTable + 4);
+      stbtt_uint8 *subTableOffsets = lookupTable + 6;
+      if (lookupType != 2) // Pair Adjustment Positioning Subtable
+         continue;
 
-                    switch (posFormat) {
-                        case 1: {
-                            stbtt_int32 l, r, m;
-                            int straw, needle;
-                            stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
-                            stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
-                            if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
-                                stbtt_int32 valueRecordPairSizeInBytes = 2;
-                                stbtt_uint16 pairSetCount = ttUSHORT(table + 8);
-                                stbtt_uint16 pairPosOffset = ttUSHORT(table + 10 + 2 * coverageIndex);
-                                stbtt_uint8 *pairValueTable = table + pairPosOffset;
-                                stbtt_uint16 pairValueCount = ttUSHORT(pairValueTable);
-                                stbtt_uint8 *pairValueArray = pairValueTable + 2;
+      for (sti=0; sti<subTableCount; sti++) {
+         stbtt_uint16 subtableOffset = ttUSHORT(subTableOffsets + 2 * sti);
+         stbtt_uint8 *table = lookupTable + subtableOffset;
+         stbtt_uint16 posFormat = ttUSHORT(table);
+         stbtt_uint16 coverageOffset = ttUSHORT(table + 2);
+         stbtt_int32 coverageIndex = stbtt__GetCoverageIndex(table + coverageOffset, glyph1);
+         if (coverageIndex == -1) continue;
 
-                                if (coverageIndex >= pairSetCount) return 0;
+         switch (posFormat) {
+            case 1: {
+               stbtt_int32 l, r, m;
+               int straw, needle;
+               stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
+               stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+               if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                  stbtt_int32 valueRecordPairSizeInBytes = 2;
+                  stbtt_uint16 pairSetCount = ttUSHORT(table + 8);
+                  stbtt_uint16 pairPosOffset = ttUSHORT(table + 10 + 2 * coverageIndex);
+                  stbtt_uint8 *pairValueTable = table + pairPosOffset;
+                  stbtt_uint16 pairValueCount = ttUSHORT(pairValueTable);
+                  stbtt_uint8 *pairValueArray = pairValueTable + 2;
 
-                                needle=glyph2;
-                                r=pairValueCount-1;
-                                l=0;
+                  if (coverageIndex >= pairSetCount) return 0;
 
-                                // Binary search.
-                                while (l <= r) {
-                                    stbtt_uint16 secondGlyph;
-                                    stbtt_uint8 *pairValue;
-                                    m = (l + r) >> 1;
-                                    pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m;
-                                    secondGlyph = ttUSHORT(pairValue);
-                                    straw = secondGlyph;
-                                    if (needle < straw)
-                                        r = m - 1;
-                                    else if (needle > straw)
-                                        l = m + 1;
-                                    else {
-                                        stbtt_int16 xAdvance = ttSHORT(pairValue + 2);
-                                        return xAdvance;
-                                    }
-                                }
-                            } else
-                                return 0;
-                        } break;
+                  needle=glyph2;
+                  r=pairValueCount-1;
+                  l=0;
 
-                        case 2: {
-                            stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
-                            stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
-                            if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
-                                stbtt_uint16 classDef1Offset = ttUSHORT(table + 8);
-                                stbtt_uint16 classDef2Offset = ttUSHORT(table + 10);
-                                int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1);
-                                int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2);
+                  // Binary search.
+                  while (l <= r) {
+                     stbtt_uint16 secondGlyph;
+                     stbtt_uint8 *pairValue;
+                     m = (l + r) >> 1;
+                     pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m;
+                     secondGlyph = ttUSHORT(pairValue);
+                     straw = secondGlyph;
+                     if (needle < straw)
+                        r = m - 1;
+                     else if (needle > straw)
+                        l = m + 1;
+                     else {
+                        stbtt_int16 xAdvance = ttSHORT(pairValue + 2);
+                        return xAdvance;
+                     }
+                  }
+               } else
+                  return 0;
+               break;
+            }
 
-                                stbtt_uint16 class1Count = ttUSHORT(table + 12);
-                                stbtt_uint16 class2Count = ttUSHORT(table + 14);
+            case 2: {
+               stbtt_uint16 valueFormat1 = ttUSHORT(table + 4);
+               stbtt_uint16 valueFormat2 = ttUSHORT(table + 6);
+               if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats?
+                  stbtt_uint16 classDef1Offset = ttUSHORT(table + 8);
+                  stbtt_uint16 classDef2Offset = ttUSHORT(table + 10);
+                  int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1);
+                  int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2);
 
-                                if (glyph1class < 0 || glyph1class >= class1Count) return 0; // malformed
-                                if (glyph2class < 0 || glyph2class >= class2Count) return 0; // malformed
+                  stbtt_uint16 class1Count = ttUSHORT(table + 12);
+                  stbtt_uint16 class2Count = ttUSHORT(table + 14);
 
-                                stbtt_uint8 *class1Records = table + 16;
-                                stbtt_uint8 *class2Records = class1Records + 2 * (glyph1class * class2Count);
-                                stbtt_int16 xAdvance = ttSHORT(class2Records + 2 * glyph2class);
-                                return xAdvance;
-                            } else
-                                return 0;
-                        } break;
+                  if (glyph1class < 0 || glyph1class >= class1Count) return 0; // malformed
+                  if (glyph2class < 0 || glyph2class >= class2Count) return 0; // malformed
 
-                        default: {
-                            // Unsupported definition type
-                            return 0;
-                            break;
-                        };
-                    }
-                }
-                break;
-            };
+                  stbtt_uint8 *class1Records = table + 16;
+                  stbtt_uint8 *class2Records = class1Records + 2 * (glyph1class * class2Count);
+                  stbtt_int16 xAdvance = ttSHORT(class2Records + 2 * glyph2class);
+                  return xAdvance;
+               } else
+                  return 0;
+               break;
+            }
 
             default:
-                // TODO: Implement other stuff.
-                break;
-        }
-    }
+               return 0; // Unsupported position format
+         }
+      }
+   }
 
-    return 0;
+   return 0;
 }
 
 STBTT_DEF int  stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int g1, int g2)

From 46c259ff903a4951f1acf0e129b9648a54eb8b75 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 03:32:04 -0700
Subject: [PATCH 084/170] stb_sprintf: PUBLICDEC on declarations, as intended

The code was using PUBLICDEF on both declarations and
definitions.

Fixes issue #458.
---
 stb_sprintf.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index 3152ff3..e055697 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -199,13 +199,13 @@ typedef char *STBSP_SPRINTFCB(const char *buf, void *user, int len);
 #define STB_SPRINTF_DECORATE(name) stbsp_##name // define this before including if you want to change the names
 #endif
 
-STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va);
-STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsnprintf)(char *buf, int count, char const *fmt, va_list va);
-STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(2,3);
-STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(3,4);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsnprintf)(char *buf, int count, char const *fmt, va_list va);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(2,3);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(3,4);
 
-STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va);
-STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char comma, char period);
+STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va);
+STBSP__PUBLICDEC void STB_SPRINTF_DECORATE(set_separators)(char comma, char period);
 
 #endif // STB_SPRINTF_H_INCLUDE
 

From afc9c16bfd5be9ce81e0e0e1255b47d38218c773 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 03:43:15 -0700
Subject: [PATCH 085/170] stb_truetype: Change spelling of fallthrough comment

To pacify GCC warnings at -Wimplicit-fallthrough=4. Why the
all-caps version works and the others don't, I'm not sure; the
GCC manual page lists regular expressions that GCC is checking for
but does not say which regular expression syntax variant it's using,
whether it's looking for a substring match or a full match for
the comment, and what exactly the text being matched against is
for a single-line comment. Sigh.

Fixes issue #507.
---
 stb_truetype.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 783ecfa..667da88 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -2136,7 +2136,7 @@ static int stbtt__run_charstring(const stbtt_fontinfo *info, int glyph_index, st
                subrs = stbtt__cid_get_glyph_subrs(info, glyph_index);
             has_subrs = 1;
          }
-         // fallthrough
+         // FALLTHROUGH
       case 0x1D: // callgsubr
          if (sp < 1) return STBTT__CSERR("call(g|)subr stack");
          v = (int) s[--sp];

From 8af4d40950e38301d3ce8c6cda380aa28ba8c710 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 03:58:30 -0700
Subject: [PATCH 086/170] stb_c_lexer: Move token enum defn to header portion

Required some tinkering, hope I didn't mess the logic up.

This now requires the config section to be set for the header
file, and different sites that include it should agree on what
the values are. This is kind of iffy but hard to avoid.

Fixes issue #647.
---
 stb_c_lexer.h | 126 +++++++++++++++++++++++++++-----------------------
 1 file changed, 67 insertions(+), 59 deletions(-)

diff --git a/stb_c_lexer.h b/stb_c_lexer.h
index 0cb9f39..f8da55d 100644
--- a/stb_c_lexer.h
+++ b/stb_c_lexer.h
@@ -42,13 +42,19 @@
 //
 //   See end of file for license information.
 
-#ifdef STB_C_LEXER_IMPLEMENTATION
 #ifndef STB_C_LEXER_DEFINITIONS
 // to change the default parsing rules, copy the following lines
 // into your C/C++ file *before* including this, and then replace
-// the Y's with N's for the ones you don't want.
+// the Y's with N's for the ones you don't want. This needs to be
+// set to the same values for every place in your program where
+// stb_c_lexer.h is included.
 // --BEGIN--
 
+#if defined(Y) || defined(N)
+#error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined"
+#endif
+
+
 #define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
 #define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
 #define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
@@ -97,7 +103,6 @@
 #define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
 // --END--
 
-#endif
 #endif
 
 #ifndef INCLUDE_STB_C_LEXER_H
@@ -167,35 +172,11 @@ extern void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where,
 }
 #endif
 
-#endif // INCLUDE_STB_C_LEXER_H
-
-#ifdef STB_C_LEXER_IMPLEMENTATION
-
-   #if defined(Y) || defined(N)
-   #error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined"
-   #endif
-
-
 // Hacky definitions so we can easily #if on them
 #define Y(x) 1
 #define N(x) 0
 
-#if STB_C_LEX_INTEGERS_AS_DOUBLES(x)
-typedef double     stb__clex_int;
-#define intfield   real_number
-#define STB__clex_int_as_double
-#else
-typedef long       stb__clex_int;
-#define intfield   int_number
-#endif
-
-// Convert these config options to simple conditional #defines so we can more
-// easily test them once we've change the meaning of Y/N
-
-#if STB_C_LEX_PARSE_SUFFIXES(x)
-#define STB__clex_parse_suffixes
-#endif
-
+// Config variable that influence which lexer tokens get declared need to go here
 #if STB_C_LEX_C_DECIMAL_INTS(x) || STB_C_LEX_C_HEX_INTS(x) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
 #define STB__clex_define_int
 #endif
@@ -204,35 +185,6 @@ typedef long       stb__clex_int;
 #define STB__clex_define_shifts
 #endif
 
-#if STB_C_LEX_C99_HEX_FLOATS(x)
-#define STB__clex_hex_floats
-#endif
-
-#if STB_C_LEX_C_HEX_INTS(x)
-#define STB__clex_hex_ints
-#endif
-
-#if STB_C_LEX_C_DECIMAL_INTS(x)
-#define STB__clex_decimal_ints
-#endif
-
-#if STB_C_LEX_C_OCTAL_INTS(x)
-#define STB__clex_octal_ints
-#endif
-
-#if STB_C_LEX_C_DECIMAL_FLOATS(x)
-#define STB__clex_decimal_floats
-#endif
-
-#if STB_C_LEX_DISCARD_PREPROCESSOR(x)
-#define STB__clex_discard_preprocessor
-#endif
-
-#if STB_C_LEX_USE_STDLIB(x) && (!defined(STB__clex_hex_floats) || __STDC_VERSION__ >= 199901L)
-#define STB__CLEX_use_stdlib
-#include <stdlib.h>
-#endif
-
 // Now pick a definition of Y/N that's conducive to
 // defining the enum of token names.
 #if STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) || defined(STB_C_LEXER_SELF_TEST)
@@ -287,12 +239,68 @@ enum
 
    CLEX_first_unused_token
 
-#undef Y
-#define Y(a) a
 };
 
+#undef Y
+#undef N
+
+#endif // INCLUDE_STB_C_LEXER_H
+
+#ifdef STB_C_LEXER_IMPLEMENTATION
+
+// Hacky definitions so we can easily #if on them
+#define Y(x) 1
+#define N(x) 0
+
+#if STB_C_LEX_INTEGERS_AS_DOUBLES(x)
+typedef double     stb__clex_int;
+#define intfield   real_number
+#define STB__clex_int_as_double
+#else
+typedef long       stb__clex_int;
+#define intfield   int_number
+#endif
+
+// Convert these config options to simple conditional #defines so we can more
+// easily test them once we've change the meaning of Y/N
+
+#if STB_C_LEX_PARSE_SUFFIXES(x)
+#define STB__clex_parse_suffixes
+#endif
+
+#if STB_C_LEX_C99_HEX_FLOATS(x)
+#define STB__clex_hex_floats
+#endif
+
+#if STB_C_LEX_C_HEX_INTS(x)
+#define STB__clex_hex_ints
+#endif
+
+#if STB_C_LEX_C_DECIMAL_INTS(x)
+#define STB__clex_decimal_ints
+#endif
+
+#if STB_C_LEX_C_OCTAL_INTS(x)
+#define STB__clex_octal_ints
+#endif
+
+#if STB_C_LEX_C_DECIMAL_FLOATS(x)
+#define STB__clex_decimal_floats
+#endif
+
+#if STB_C_LEX_DISCARD_PREPROCESSOR(x)
+#define STB__clex_discard_preprocessor
+#endif
+
+#if STB_C_LEX_USE_STDLIB(x) && (!defined(STB__clex_hex_floats) || __STDC_VERSION__ >= 199901L)
+#define STB__CLEX_use_stdlib
+#include <stdlib.h>
+#endif
+
 // Now for the rest of the file we'll use the basic definition where
 // where Y expands to its contents and N expands to nothing
+#undef  Y
+#define Y(a) a
 #undef N
 #define N(a)
 

From 315e979f34a1f8479bfc82481fb60fd22965fd1d Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 04:10:52 -0700
Subject: [PATCH 087/170] stb_truetype: Add protoype for stbtt_FindSVGDoc

Fixes issue #979.
---
 stb_truetype.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/stb_truetype.h b/stb_truetype.h
index 667da88..99ec365 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -855,6 +855,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
 STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices);
 // frees the data allocated above
 
+STBTT_DEF stbtt_uint8 *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl)
 STBTT_DEF int stbtt_GetCodepointSVG(const stbtt_fontinfo *info, int unicode_codepoint, const char **svg);
 STBTT_DEF int stbtt_GetGlyphSVG(const stbtt_fontinfo *info, int gl, const char **svg);
 // fills svg with the character's SVG data.

From b14a80784c689979b75f6e8929a7d7b4b4a86678 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 14:35:46 -0700
Subject: [PATCH 088/170] stb_image_write: Support writing BMPs with alpha

Fixes issue #1159.
---
 stb_image_write.h | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/stb_image_write.h b/stb_image_write.h
index a909e82..f1a1773 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -491,11 +491,22 @@ static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x,
 
 static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
 {
-   int pad = (-x*3) & 3;
-   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
-           "11 4 22 4" "4 44 22 444444",
-           'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
-            40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   if (comp != 4) {
+      // write RGB bitmap
+      int pad = (-x*3) & 3;
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
+              "11 4 22 4" "4 44 22 444444",
+              'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
+               40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+   } else {
+      // RGBA bitmaps need a v4 header
+      // use BI_BITFIELDS mode with 32bpp and alpha mask
+      // (straight BI_RGB with alpha mask doesn't work in most readers)
+      return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0,
+         "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444",
+         'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header
+         108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header
+   }
 }
 
 STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
@@ -1619,6 +1630,7 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
 /* Revision history
       1.15  (          )
              make Deflate code emit uncompressed blocks when it would otherwise expand
+             support writing BMPs with alpha channel
       1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
       1.13
       1.12

From 2af2e692197267092f737db562817231feb3b5be Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 14:59:13 -0700
Subject: [PATCH 089/170] stb_ds: Fix addn when n=0

In this case, the array pointer may still be 0 even after the
maybegrow.

Fixes issue #1015.
---
 stb_ds.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_ds.h b/stb_ds.h
index f6ec239..530f504 100644
--- a/stb_ds.h
+++ b/stb_ds.h
@@ -544,8 +544,8 @@ extern void * stbds_shmode_func(size_t elemsize, int mode);
 #define stbds_arrpush          stbds_arrput  // synonym
 #define stbds_arrpop(a)        (stbds_header(a)->length--, (a)[stbds_header(a)->length])
 #define stbds_arraddn(a,n)     ((void)(stbds_arraddnindex(a, n)))    // deprecated, use one of the following instead:
-#define stbds_arraddnptr(a,n)  (stbds_arrmaybegrow(a,n), stbds_header(a)->length += (n), &(a)[stbds_header(a)->length-(n)])
-#define stbds_arraddnindex(a,n)(stbds_arrmaybegrow(a,n), stbds_header(a)->length += (n), stbds_header(a)->length-(n))
+#define stbds_arraddnptr(a,n)  (stbds_arrmaybegrow(a,n), (n) ? (stbds_header(a)->length += (n), &(a)[stbds_header(a)->length-(n)]) : (a))
+#define stbds_arraddnindex(a,n)(stbds_arrmaybegrow(a,n), (n) ? (stbds_header(a)->length += (n), stbds_header(a)->length-(n)) : stbds_arrlen(a))
 #define stbds_arraddnoff       stbds_arraddnindex
 #define stbds_arrlast(a)       ((a)[stbds_header(a)->length-1])
 #define stbds_arrfree(a)       ((void) ((a) ? STBDS_FREE(NULL,stbds_header(a)) : (void)0), (a)=NULL)

From 013884b53b3e346c8d497ca65b438857ffefbad9 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 15:38:42 -0700
Subject: [PATCH 090/170] stb_sprintf: Fix string length calc

Factor out string length computation into helper func, comment
it a bit more, always use a limit to avoid 32b unsigned overflow,
and avoid reading past the bounds of non-0-terminated strings given
with specified precision.

Fixes issue #966.
---
 stb_sprintf.h | 74 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 43 insertions(+), 31 deletions(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index e055697..46369d5 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -300,6 +300,46 @@ static void stbsp__lead_sign(stbsp__uint32 fl, char *sign)
    }
 }
 
+static STBSP__ASAN stbsp__uint32 stbsp__strlen_limited(char const *s, stbsp__uint32 limit)
+{
+   char const * sn = s;
+
+   // get up to 4-byte alignment
+   for (;;) {
+      if (((stbsp__uintptr)sn & 3) == 0)
+         break;
+
+      if (!limit || *sn == 0)
+         return (stbsp__uint32)(sn - s);
+
+      ++sn;
+      --limit;
+   }
+
+   // scan over 4 bytes at a time to find terminating 0
+   // this will intentionally scan up to 3 bytes past the end of buffers,
+   // but becase it works 4B aligned, it will never cross page boundaries
+   // (hence the STBSP__ASAN markup; the over-read here is intentional
+   // and harmless)
+   while (limit >= 4) {
+      stbsp__uint32 v = *(stbsp__uint32 *)sn;
+      // bit hack to find if there's a 0 byte in there
+      if ((v - 0x01010101) & (~v) & 0x80808080UL)
+         break;
+
+      sn += 4;
+      limit -= 4;
+   }
+
+   // handle the last few characters to find actual size
+   while (limit && *sn) {
+      ++sn;
+      --limit;
+   }
+
+   return (stbsp__uint32)(sn - s);
+}
+
 STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va)
 {
    static char hex[] = "0123456789abcdefxp";
@@ -543,37 +583,9 @@ STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback,
          s = va_arg(va, char *);
          if (s == 0)
             s = (char *)"null";
-         // get the length
-         sn = s;
-         for (;;) {
-            if ((((stbsp__uintptr)sn) & 3) == 0)
-               break;
-         lchk:
-            if (sn[0] == 0)
-               goto ld;
-            ++sn;
-         }
-         n = 0xffffffff;
-         if (pr >= 0) {
-            n = (stbsp__uint32)(sn - s);
-            if (n >= (stbsp__uint32)pr)
-               goto ld;
-            n = ((stbsp__uint32)(pr - n)) >> 2;
-         }
-         while (n) {
-            stbsp__uint32 v = *(stbsp__uint32 *)sn;
-            if ((v - 0x01010101) & (~v) & 0x80808080UL)
-               goto lchk;
-            sn += 4;
-            --n;
-         }
-         goto lchk;
-      ld:
-
-         l = (stbsp__uint32)(sn - s);
-         // clamp to precision
-         if (l > (stbsp__uint32)pr)
-            l = pr;
+         // get the length, limited to desired precision
+         // always limit to ~0u chars since our counts are 32b
+         l = stbsp__strlen_limited(s, (pr >= 0) ? pr : ~0u);
          lead[0] = 0;
          tail[0] = 0;
          pr = 0;

From 3be65f1c0c5e605bcc3d75a6466328ddb9e68945 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 15:43:47 -0700
Subject: [PATCH 091/170] stb_sprintf: Small simplification

This was intentional but we might as well just set cs=0
directly here.

Fixes issue #997.
---
 stb_sprintf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index 46369d5..0271568 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -1022,7 +1022,7 @@ STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback,
             lead[0] = 0;
             if (pr == 0) {
                l = 0;
-               cs = (((l >> 4) & 15)) << 24;
+               cs = 0;
                goto scopy;
             }
          }

From 06b6009e3be9985a47829976c3c00711bb51d4f3 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Wed, 7 Jul 2021 15:51:56 -0700
Subject: [PATCH 092/170] stb_sprintf: Fix unused variable warning with
 STB_SPRINTF_NOFLOAT

Fixes issue #986.
---
 stb_sprintf.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index 0271568..e491425 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -187,6 +187,12 @@ PERFORMANCE vs MSVC 2008 32-/64-bit (GCC is even slower than MSVC):
 #define STBSP__ATTRIBUTE_FORMAT(fmt,va)
 #endif
 
+#ifdef _MSC_VER
+#define STBSP__NOTUSED(v)  (void)(v)
+#else
+#define STBSP__NOTUSED(v)  (void)sizeof(v)
+#endif
+
 #include <stdarg.h> // for va_arg(), va_list()
 #include <stddef.h> // size_t, ptrdiff_t
 
@@ -626,8 +632,8 @@ STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback,
          lead[0] = 0;
          tail[0] = 0;
          pr = 0;
-         dp = 0;
          cs = 0;
+         STBSP__NOTUSED(dp);
          goto scopy;
 #else
       case 'A': // hex float

From d2476c384527020cf145d359e8a2a5689e1c51cd Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 8 Jul 2021 00:21:08 -0700
Subject: [PATCH 093/170] stb_sprintf: GCC compilation fix

Fixes issue #1010.
---
 stb_sprintf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_sprintf.h b/stb_sprintf.h
index e491425..82358a5 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -154,8 +154,8 @@ PERFORMANCE vs MSVC 2008 32-/64-bit (GCC is even slower than MSVC):
    #endif
   #endif
  #endif
-#elif __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
- #if __SANITIZE_ADDRESS__
+#elif defined(__GNUC__) && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+ #if defined(__SANITIZE_ADDRESS__) && __SANITIZE_ADDRESS__
   #define STBSP__ASAN __attribute__((__no_sanitize_address__))
  #endif
 #endif

From 1e1efbdc75487a6c29d76314eb174a6a346a4d8e Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 8 Jul 2021 00:24:00 -0700
Subject: [PATCH 094/170] stb_image_write: Make globals extern "C" in C++

Fixes issue #921
---
 stb_image_write.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stb_image_write.h b/stb_image_write.h
index f1a1773..d7ed1d3 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -167,9 +167,9 @@ LICENSE
 #endif
 
 #ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
-extern int stbi_write_tga_with_rle;
-extern int stbi_write_png_compression_level;
-extern int stbi_write_force_png_filter;
+STBIWDEF int stbi_write_tga_with_rle;
+STBIWDEF int stbi_write_png_compression_level;
+STBIWDEF int stbi_write_force_png_filter;
 #endif
 
 #ifndef STBI_WRITE_NO_STDIO

From 2b667e4d30e8fd04aebec170763a2c118f635652 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Thu, 8 Jul 2021 00:32:38 -0700
Subject: [PATCH 095/170] stb_rect_pack: Several minor fixes

1. Always use large rects mode to avoid definition of stbrp_coord
   in header file depending on implementation #defines
2. Expose STBRP__MAXVAL to users
3. Fix value of STBRP__MAXVAL for large rect mode (stbrp_coord
   is a 32-bit int, so needs to be <=0x7fffffff, 0xffffffff
   doesn't work)
4. Add comment at the top about which #define to set to get the
   implementation.

Fixes issue #1143, or rather, replaces that pull request.
---
 stb_rect_pack.h | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/stb_rect_pack.h b/stb_rect_pack.h
index 5c848de..48362eb 100644
--- a/stb_rect_pack.h
+++ b/stb_rect_pack.h
@@ -1,9 +1,15 @@
-// stb_rect_pack.h - v1.00 - public domain - rectangle packing
+// stb_rect_pack.h - v1.01 - public domain - rectangle packing
 // Sean Barrett 2014
 //
 // Useful for e.g. packing rectangular textures into an atlas.
 // Does not do rotation.
 //
+// Before #including,
+//
+//    #define STB_RECT_PACK_IMPLEMENTATION
+//
+// in the file that you want to have the implementation.
+//
 // Not necessarily the awesomest packing method, but better than
 // the totally naive one in stb_truetype (which is primarily what
 // this is meant to replace).
@@ -35,6 +41,7 @@
 //
 // Version history:
 //
+//     1.01  (          )  always use large rect mode, expose STBRP__MAXVAL in public section
 //     1.00  (2019-02-25)  avoid small space waste; gracefully fail too-wide rectangles
 //     0.99  (2019-02-07)  warning fixes
 //     0.11  (2017-03-03)  return packing success/fail result
@@ -75,11 +82,10 @@ typedef struct stbrp_context stbrp_context;
 typedef struct stbrp_node    stbrp_node;
 typedef struct stbrp_rect    stbrp_rect;
 
-#ifdef STBRP_LARGE_RECTS
 typedef int            stbrp_coord;
-#else
-typedef unsigned short stbrp_coord;
-#endif
+
+#define STBRP__MAXVAL  0x7fffffff
+// Mostly for internal use, but this is the maximum supported coordinate value.
 
 STBRP_DEF int stbrp_pack_rects (stbrp_context *context, stbrp_rect *rects, int num_rects);
 // Assign packed locations to rectangles. The rectangles are of type
@@ -253,9 +259,6 @@ STBRP_DEF void stbrp_setup_allow_out_of_mem(stbrp_context *context, int allow_ou
 STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes)
 {
    int i;
-#ifndef STBRP_LARGE_RECTS
-   STBRP_ASSERT(width <= 0xffff && height <= 0xffff);
-#endif
 
    for (i=0; i < num_nodes-1; ++i)
       nodes[i].next = &nodes[i+1];
@@ -274,11 +277,7 @@ STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height,
    context->extra[0].y = 0;
    context->extra[0].next = &context->extra[1];
    context->extra[1].x = (stbrp_coord) width;
-#ifdef STBRP_LARGE_RECTS
    context->extra[1].y = (1<<30);
-#else
-   context->extra[1].y = 65535;
-#endif
    context->extra[1].next = NULL;
 }
 
@@ -538,12 +537,6 @@ static int rect_original_order(const void *a, const void *b)
    return (p->was_packed < q->was_packed) ? -1 : (p->was_packed > q->was_packed);
 }
 
-#ifdef STBRP_LARGE_RECTS
-#define STBRP__MAXVAL  0xffffffff
-#else
-#define STBRP__MAXVAL  0xffff
-#endif
-
 STBRP_DEF int stbrp_pack_rects(stbrp_context *context, stbrp_rect *rects, int num_rects)
 {
    int i, all_rects_packed = 1;

From 4bb337ddf0a4122bc1547a17f13f1932e228c1ad Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Fri, 9 Jul 2021 05:36:53 -0700
Subject: [PATCH 096/170] test including stb_c_lexer header independnet of
 implementation

---
 tests/test_cpp_compilation.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_cpp_compilation.cpp b/tests/test_cpp_compilation.cpp
index 10f27b3..f8d3d47 100644
--- a/tests/test_cpp_compilation.cpp
+++ b/tests/test_cpp_compilation.cpp
@@ -5,6 +5,7 @@
 #include "stb_rect_pack.h"
 #include "stb_truetype.h"
 #include "stb_image_write.h"
+#include "stb_c_lexer.h"
 #include "stb_perlin.h"
 #include "stb_dxt.h"
 #include "stb_divide.h"
@@ -46,7 +47,6 @@ void my_free(void *) { }
 #include "stb_image_write.h"
 #include "stb_perlin.h"
 #include "stb_dxt.h"
-#include "stb_c_lexer.h"
 #include "stb_divide.h"
 #include "stb_herringbone_wang_tile.h"
 #include "stb_ds.h"

From 5bf78b8f231cea48b6d7030aa86f910f6dd37d6b Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Fri, 9 Jul 2021 05:37:56 -0700
Subject: [PATCH 097/170] stb_truetype: fix bad declaration

---
 stb_truetype.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 99ec365..0dcf33b 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -855,7 +855,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s
 STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices);
 // frees the data allocated above
 
-STBTT_DEF stbtt_uint8 *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl)
+STBTT_DEF unsigned char *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl);
 STBTT_DEF int stbtt_GetCodepointSVG(const stbtt_fontinfo *info, int unicode_codepoint, const char **svg);
 STBTT_DEF int stbtt_GetGlyphSVG(const stbtt_fontinfo *info, int gl, const char **svg);
 // fills svg with the character's SVG data.

From c8e3d3086dd86e5abeb548e8b4f54ec82d627739 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Fri, 9 Jul 2021 18:49:15 -0700
Subject: [PATCH 098/170] stb_truetype: Small tweaks to make PVS Studio happy

No actual bugs as far as I can tell.

Fixes issue #1162.
---
 stb_truetype.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 0dcf33b..65ebbfb 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -1874,7 +1874,7 @@ static int stbtt__GetGlyphShapeTT(const stbtt_fontinfo *info, int glyph_index, s
                if (comp_verts) STBTT_free(comp_verts, info->userdata);
                return 0;
             }
-            if (num_vertices > 0) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
+            if (num_vertices > 0 && vertices) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
             STBTT_memcpy(tmp+num_vertices, comp_verts, comp_num_verts*sizeof(stbtt_vertex));
             if (vertices) STBTT_free(vertices, info->userdata);
             vertices = tmp;
@@ -2242,7 +2242,7 @@ static int stbtt__run_charstring(const stbtt_fontinfo *info, int glyph_index, st
       } break;
 
       default:
-         if (b0 != 255 && b0 != 28 && (b0 < 32 || b0 > 254))
+         if (b0 != 255 && b0 != 28 && b0 < 32)
             return STBTT__CSERR("reserved operator");
 
          // push immediate
@@ -4397,15 +4397,14 @@ static int stbtt__compute_crossings_x(float x, float y, int nverts, stbtt_vertex
    float y_frac;
    int winding = 0;
 
-   orig[0] = x;
-   orig[1] = y;
-
    // make sure y never passes through a vertex of the shape
    y_frac = (float) STBTT_fmod(y, 1.0f);
    if (y_frac < 0.01f)
       y += 0.01f;
    else if (y_frac > 0.99f)
       y -= 0.01f;
+
+   orig[0] = x;
    orig[1] = y;
 
    // test a ray from (-infinity,y) to (x,y)

From 818ac26785a77cd011403a21eeedc5725abdd591 Mon Sep 17 00:00:00 2001
From: ocornut <omarcornut@gmail.com>
Date: Fri, 9 Jul 2021 13:17:06 +0200
Subject: [PATCH 099/170] stb_rect_pack: making functions cdecl for msvc qsort

---
 stb_rect_pack.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/stb_rect_pack.h b/stb_rect_pack.h
index 48362eb..a0667a1 100644
--- a/stb_rect_pack.h
+++ b/stb_rect_pack.h
@@ -215,8 +215,10 @@ struct stbrp_context
 
 #ifdef _MSC_VER
 #define STBRP__NOTUSED(v)  (void)(v)
+#define STBRP__CDECL       __cdecl
 #else
 #define STBRP__NOTUSED(v)  (void)sizeof(v)
+#define STBRP__CDECL
 #endif
 
 enum
@@ -519,7 +521,7 @@ static stbrp__findresult stbrp__skyline_pack_rectangle(stbrp_context *context, i
    return res;
 }
 
-static int rect_height_compare(const void *a, const void *b)
+static int STBRP__CDECL rect_height_compare(const void *a, const void *b)
 {
    const stbrp_rect *p = (const stbrp_rect *) a;
    const stbrp_rect *q = (const stbrp_rect *) b;
@@ -530,7 +532,7 @@ static int rect_height_compare(const void *a, const void *b)
    return (p->w > q->w) ? -1 : (p->w < q->w);
 }
 
-static int rect_original_order(const void *a, const void *b)
+static int STBRP__CDECL rect_original_order(const void *a, const void *b)
 {
    const stbrp_rect *p = (const stbrp_rect *) a;
    const stbrp_rect *q = (const stbrp_rect *) b;

From a0a8653a7d33a2d43966d46601b20af0bfe00cf6 Mon Sep 17 00:00:00 2001
From: Randy <randy408@protonmail.com>
Date: Sat, 10 Jul 2021 15:29:39 +0200
Subject: [PATCH 100/170] attempt to fix issue with CIFuzz

---
 tests/ossfuzz.sh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/ossfuzz.sh b/tests/ossfuzz.sh
index 4b8fd27..104f033 100755
--- a/tests/ossfuzz.sh
+++ b/tests/ossfuzz.sh
@@ -15,10 +15,6 @@ find $SRC/stb/tests/pngsuite -name "*.png" | \
 
 cp $SRC/stb/tests/stb_png.dict $OUT/stb_png_read_fuzzer.dict
 
-tar xvzf $SRC/stb/jpg.tar.gz --directory $SRC/stb/tests
-tar xvzf $SRC/stb/gif.tar.gz --directory $SRC/stb/tests
-unzip    $SRC/stb/bmp.zip    -d $SRC/stb/tests
-unzip    $SRC/stb/tga.zip    -d $SRC/stb/tests
 
 find $SRC/stb/tests -name "*.png" -o -name "*.jpg" -o -name "*.gif" \
                  -o -name "*.bmp" -o -name "*.tga" -o -name "*.TGA" \

From af4c6731749c0b4a940d0055d254b5fd2f911062 Mon Sep 17 00:00:00 2001
From: Randy <randy408@protonmail.com>
Date: Sat, 10 Jul 2021 15:51:11 +0200
Subject: [PATCH 101/170] change directory for seed corpus downloads

---
 tests/ossfuzz.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/ossfuzz.sh b/tests/ossfuzz.sh
index 104f033..2db2060 100755
--- a/tests/ossfuzz.sh
+++ b/tests/ossfuzz.sh
@@ -15,11 +15,15 @@ find $SRC/stb/tests/pngsuite -name "*.png" | \
 
 cp $SRC/stb/tests/stb_png.dict $OUT/stb_png_read_fuzzer.dict
 
+tar xvzf $SRC/stbi/jpg.tar.gz --directory $SRC/stb/tests
+tar xvzf $SRC/stbi/gif.tar.gz --directory $SRC/stb/tests
+unzip    $SRC/stbi/bmp.zip    -d $SRC/stb/tests
+unzip    $SRC/stbi/tga.zip    -d $SRC/stb/tests
 
 find $SRC/stb/tests -name "*.png" -o -name "*.jpg" -o -name "*.gif" \
                  -o -name "*.bmp" -o -name "*.tga" -o -name "*.TGA" \
                  -o -name "*.ppm" -o -name "*.pgm" \
     | xargs zip $OUT/stbi_read_fuzzer_seed_corpus.zip
 
-echo "" >> $SRC/stb/tests/gif.dict
-cat $SRC/stb/tests/gif.dict $SRC/stb/tests/stb_png.dict > $OUT/stbi_read_fuzzer.dict
+echo "" >> $SRC/stbi/gif.dict
+cat $SRC/stbi/gif.dict $SRC/stb/tests/stb_png.dict > $OUT/stbi_read_fuzzer.dict

From e05ecc05ee556773f67c51fbf8a52b2a63cb91d2 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 11 Jul 2021 15:26:16 -0700
Subject: [PATCH 102/170] Fix compiling-as-C-pre-C99 issues

---
 stb_truetype.h                 | 8 +++++---
 tests/test_cpp_compilation.cpp | 3 +++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 65ebbfb..40ea142 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -2581,13 +2581,15 @@ static stbtt_int32 stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, in
 
                   stbtt_uint16 class1Count = ttUSHORT(table + 12);
                   stbtt_uint16 class2Count = ttUSHORT(table + 14);
+                  stbtt_uint8 *class1Records, *class2Records;
+                  stbtt_int16 xAdvance;
 
                   if (glyph1class < 0 || glyph1class >= class1Count) return 0; // malformed
                   if (glyph2class < 0 || glyph2class >= class2Count) return 0; // malformed
 
-                  stbtt_uint8 *class1Records = table + 16;
-                  stbtt_uint8 *class2Records = class1Records + 2 * (glyph1class * class2Count);
-                  stbtt_int16 xAdvance = ttSHORT(class2Records + 2 * glyph2class);
+                  class1Records = table + 16;
+                  class2Records = class1Records + 2 * (glyph1class * class2Count);
+                  xAdvance = ttSHORT(class2Records + 2 * glyph2class);
                   return xAdvance;
                } else
                   return 0;
diff --git a/tests/test_cpp_compilation.cpp b/tests/test_cpp_compilation.cpp
index f8d3d47..66f8f85 100644
--- a/tests/test_cpp_compilation.cpp
+++ b/tests/test_cpp_compilation.cpp
@@ -183,6 +183,7 @@ void dummy3(void)
   stb_textedit_paste(0,0,0,0);
 }
 
+#if 0
 #define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
 #define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
 #define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
@@ -229,4 +230,6 @@ void dummy3(void)
 //#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace
 
 #define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
+#endif
+
 #include "stb_c_lexer.h"

From 0be82e4814eaad3b9af6261ee1355608a0d04b0e Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 11 Jul 2021 16:26:02 -0700
Subject: [PATCH 103/170] stb_truetype: fix incorrect antialiasing computation
 in v2 rasterizer, and handle certain cases where math blew up

---
 stb_truetype.h        | 27 +++++++++++++++++++--------
 tests/stb.dsp         |  2 +-
 tests/test_truetype.c | 39 ++++++++++++++++++++++++++++++++-------
 3 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 40ea142..3327d20 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -3120,7 +3120,7 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
                scanline_fill[x] += e->direction * height; // everything right of this pixel is filled
             } else {
                int x,x1,x2;
-               float y_crossing, step, sign, area;
+               float y_crossing, y_final, step, sign, area;
                // covers 2+ pixels
                if (x_top > x_bottom) {
                   // flip scanline vertically; signed area is the same
@@ -3133,28 +3133,39 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
                   dy = -dy;
                   t = x0, x0 = xb, xb = t;
                }
+               assert(dy >= 0);
+               assert(dx >= 0);
 
                x1 = (int) x_top;
                x2 = (int) x_bottom;
                // compute intersection with y axis at x1+1
                y_crossing = (x1+1 - x0) * dy + y_top;
+               // if x2 is right at the right edge of x1, y_crossing can blow up, github #1057
+               if (y_crossing > y_bottom)
+                  y_crossing = y_bottom;
 
                sign = e->direction;
                // area of the rectangle covered from y0..y_crossing
                area = sign * (y_crossing-sy0);
                // area of the triangle (x_top,y0), (x+1,y0), (x+1,y_crossing)
-               scanline[x1] += area * (1-((x_top - x1)+(x1+1-x1))/2);
+               scanline[x1] += area * (x1+1 - x_top)/2;
 
-               step = sign * dy;
+               // check if final y_crossing is blown up; no test case for this
+               y_final = y_crossing + dy * (x2 - (x1+1)); // advance y by number of steps taken below
+               if (y_final > y_bottom) {
+                  y_final = y_bottom;
+                  dy = (y_final - y_crossing ) / (x2 - (x1+1)); // if denom=0, y_final = y_crossing, so y_final <= y_bottom
+               }
+
+               step = sign * dy * 1; // dy is dy/dx, change in y for every 1 change in x, which is also how much pixel area changes for each step in x
                for (x = x1+1; x < x2; ++x) {
-                  scanline[x] += area + step/2;
+                  scanline[x] += area + step/2; // area of parallelogram is step/2
                   area += step;
                }
-               y_crossing += dy * (x2 - (x1+1));
+               STBTT_assert(STBTT_fabs(area) <= 1.01f); // accumulated error from area += step unless we round step down
 
-               STBTT_assert(STBTT_fabs(area) <= 1.01f);
-
-               scanline[x2] += area + sign * (1-((x2-x2)+(x_bottom-x2))/2) * (sy1-y_crossing);
+               // area of the triangle (x2,y_crossing), (x_bottom,y1), (x2,y1)
+               scanline[x2] += area + sign * (x_bottom - x2)/2 * (sy1-y_crossing);
 
                scanline_fill[x2] += sign * (sy1-sy0);
             }
diff --git a/tests/stb.dsp b/tests/stb.dsp
index bc13a81..50fff38 100644
--- a/tests/stb.dsp
+++ b/tests/stb.dsp
@@ -66,7 +66,7 @@ LINK32=link.exe
 # PROP Ignore_Export_Lib 0
 # PROP Target_Dir ""
 # ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
-# ADD CPP /nologo /MTd /W3 /GX /Zi /Od /I ".." /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /D "DS_TEST" /FR /FD /GZ /c
+# ADD CPP /nologo /MTd /W3 /GX /Zi /Od /I ".." /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /D "TT_TEST" /FR /FD /GZ /c
 # SUBTRACT CPP /YX
 # ADD BASE RSC /l 0x409 /d "_DEBUG"
 # ADD RSC /l 0x409 /d "_DEBUG"
diff --git a/tests/test_truetype.c b/tests/test_truetype.c
index 5f03d3b..cb9b35f 100644
--- a/tests/test_truetype.c
+++ b/tests/test_truetype.c
@@ -37,7 +37,7 @@ int main(int argc, char **argv)
 {
    stbtt_fontinfo font;
    unsigned char *bitmap;
-   int w,h,i,j,c = (argc > 1 ? atoi(argv[1]) : 34807), s = (argc > 2 ? atoi(argv[2]) : 32);
+   int w,h,i,j,c = (argc > 1 ? atoi(argv[1]) : '@'), s = (argc > 2 ? atoi(argv[2]) : 32);
 
    //debug();
 
@@ -49,6 +49,25 @@ int main(int argc, char **argv)
    stbtt_InitFont(&font, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer,0));
 
 #if 0
+   {
+      stbtt__bitmap b;
+      stbtt__point p[2];
+      int wcount[2] = { 2,0 };
+      p[0].x = 0.2f;
+      p[0].y = 0.3f;
+      p[1].x = 3.8f;
+      p[1].y = 0.8f;
+      b.w = 16;
+      b.h = 2;
+      b.stride = 16;
+      b.pixels = malloc(b.w*b.h);
+      stbtt__rasterize(&b, p, wcount, 1, 1, 1, 0, 0, 0, 0, 0, NULL);
+      for (i=0; i < 8; ++i)
+         printf("%f\n", b.pixels[i]/255.0);
+   }
+#endif
+
+#if 1
    {
       static stbtt_pack_context pc;
       static stbtt_packedchar cd[256];
@@ -60,16 +79,14 @@ int main(int argc, char **argv)
    }
 #endif
 
+
+#if 1
    {
       static stbtt_pack_context pc;
       static stbtt_packedchar cd[256];
       static unsigned char atlas[1024*1024];
       unsigned char *data;
 
-      stbtt_PackBegin(&pc, atlas, 1024,1024,1024,1,NULL);
-      stbtt_PackFontRange(&pc, ttf_buffer, 0, 32.0, 'u', 1, cd);
-      stbtt_PackEnd(&pc);
-
       data = stbtt_GetCodepointSDF(&font, stbtt_ScaleForPixelHeight(&font,32.0), 'u', 4, 128, 128/4, &w,&h,&i,&j);
       for (j=0; j < h; ++j) {
          for (i=0; i < w; ++i) {
@@ -77,8 +94,8 @@ int main(int argc, char **argv)
          }
          putchar('\n');
       }
-      return 0;
    }
+#endif
 
 #if 0
    stbtt_BakeFontBitmap(ttf_buffer,stbtt_GetFontOffsetForIndex(ttf_buffer,0), 40.0, temp_bitmap[0],BITMAP_W,BITMAP_H, 32,96, cdata); // no guarantee this fits!
@@ -117,8 +134,16 @@ int main(int argc, char **argv)
    return 0;
 #endif
 
-   bitmap = stbtt_GetCodepointBitmap(&font, 0,stbtt_ScaleForPixelHeight(&font, (float)s), c, &w, &h, 0,0);
+   (void)stbtt_GetCodepointBitmapSubpixel(&font,
+					 0.4972374737262726f,
+					 0.4986416995525360f,
+					 0.2391788959503174f,
+					 0.1752119064331055f,
+					 'd',
+					 &w, &h,
+					 0,0);
 
+   bitmap = stbtt_GetCodepointBitmap(&font, 0,stbtt_ScaleForPixelHeight(&font, (float)s), c, &w, &h, 0,0);
    for (j=0; j < h; ++j) {
       for (i=0; i < w; ++i)
          putchar(" .:ioVM@"[bitmap[j*w+i]>>5]);

From fd7807e92d8ba5740615362b453bab2d9ecbc8a4 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 11 Jul 2021 16:37:34 -0700
Subject: [PATCH 104/170] stb_c_lexer: allow including stb_c_lexer.h without
 defining overrides (all tokens are always defined; token values have changed)

---
 stb_c_lexer.h                  | 93 ++++++++++------------------------
 tests/test_c_lexer.c           | 49 ++++++++++++++++++
 tests/test_cpp_compilation.cpp | 49 ------------------
 3 files changed, 77 insertions(+), 114 deletions(-)

diff --git a/stb_c_lexer.h b/stb_c_lexer.h
index f8da55d..8d4305f 100644
--- a/stb_c_lexer.h
+++ b/stb_c_lexer.h
@@ -42,6 +42,7 @@
 //
 //   See end of file for license information.
 
+#ifdef STB_C_LEXER_IMPLEMENTATION
 #ifndef STB_C_LEXER_DEFINITIONS
 // to change the default parsing rules, copy the following lines
 // into your C/C++ file *before* including this, and then replace
@@ -54,7 +55,6 @@
 #error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined"
 #endif
 
-
 #define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
 #define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
 #define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
@@ -102,7 +102,7 @@
 
 #define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
 // --END--
-
+#endif
 #endif
 
 #ifndef INCLUDE_STB_C_LEXER_H
@@ -172,78 +172,41 @@ extern void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where,
 }
 #endif
 
-// Hacky definitions so we can easily #if on them
-#define Y(x) 1
-#define N(x) 0
-
-// Config variable that influence which lexer tokens get declared need to go here
-#if STB_C_LEX_C_DECIMAL_INTS(x) || STB_C_LEX_C_HEX_INTS(x) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
-#define STB__clex_define_int
-#endif
-
-#if (STB_C_LEX_C_ARITHEQ(x) && STB_C_LEX_C_SHIFTS(x)) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x)
-#define STB__clex_define_shifts
-#endif
-
-// Now pick a definition of Y/N that's conducive to
-// defining the enum of token names.
-#if STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) || defined(STB_C_LEXER_SELF_TEST)
-  #undef  N
-  #define N(a) Y(a)
-#else
-  #undef  N
-  #define N(a)
-#endif
-
-#undef  Y
-#define Y(a) a,
-
 enum
 {
    CLEX_eof = 256,
    CLEX_parse_error,
-
-#ifdef STB__clex_define_int
-   CLEX_intlit,
-#endif
-
-   STB_C_LEX_C_DECIMAL_FLOATS( CLEX_floatlit    )
-   STB_C_LEX_C_IDENTIFIERS(  CLEX_id            )
-   STB_C_LEX_C_DQ_STRINGS(   CLEX_dqstring      )
-   STB_C_LEX_C_SQ_STRINGS(   CLEX_sqstring      )
-   STB_C_LEX_C_CHARS(        CLEX_charlit       )
-   STB_C_LEX_C_COMPARISONS(  CLEX_eq            )
-   STB_C_LEX_C_COMPARISONS(  CLEX_noteq         )
-   STB_C_LEX_C_COMPARISONS(  CLEX_lesseq        )
-   STB_C_LEX_C_COMPARISONS(  CLEX_greatereq     )
-   STB_C_LEX_C_LOGICAL(      CLEX_andand        )
-   STB_C_LEX_C_LOGICAL(      CLEX_oror          )
-   STB_C_LEX_C_SHIFTS(       CLEX_shl           )
-   STB_C_LEX_C_SHIFTS(       CLEX_shr           )
-   STB_C_LEX_C_INCREMENTS(   CLEX_plusplus      )
-   STB_C_LEX_C_INCREMENTS(   CLEX_minusminus    )
-   STB_C_LEX_C_ARITHEQ(      CLEX_pluseq        )
-   STB_C_LEX_C_ARITHEQ(      CLEX_minuseq       )
-   STB_C_LEX_C_ARITHEQ(      CLEX_muleq         )
-   STB_C_LEX_C_ARITHEQ(      CLEX_diveq         )
-   STB_C_LEX_C_ARITHEQ(      CLEX_modeq         )
-   STB_C_LEX_C_BITWISEEQ(    CLEX_andeq         )
-   STB_C_LEX_C_BITWISEEQ(    CLEX_oreq          )
-   STB_C_LEX_C_BITWISEEQ(    CLEX_xoreq         )
-   STB_C_LEX_C_ARROW(        CLEX_arrow         )
-   STB_C_LEX_EQUAL_ARROW(    CLEX_eqarrow       )
-
-#ifdef STB__clex_define_shifts
+   CLEX_intlit        ,
+   CLEX_floatlit      ,
+   CLEX_id            ,
+   CLEX_dqstring      ,
+   CLEX_sqstring      ,
+   CLEX_charlit       ,
+   CLEX_eq            ,
+   CLEX_noteq         ,
+   CLEX_lesseq        ,
+   CLEX_greatereq     ,
+   CLEX_andand        ,
+   CLEX_oror          ,
+   CLEX_shl           ,
+   CLEX_shr           ,
+   CLEX_plusplus      ,
+   CLEX_minusminus    ,
+   CLEX_pluseq        ,
+   CLEX_minuseq       ,
+   CLEX_muleq         ,
+   CLEX_diveq         ,
+   CLEX_modeq         ,
+   CLEX_andeq         ,
+   CLEX_oreq          ,
+   CLEX_xoreq         ,
+   CLEX_arrow         ,
+   CLEX_eqarrow       ,
    CLEX_shleq, CLEX_shreq,
-#endif
 
    CLEX_first_unused_token
 
 };
-
-#undef Y
-#undef N
-
 #endif // INCLUDE_STB_C_LEXER_H
 
 #ifdef STB_C_LEXER_IMPLEMENTATION
diff --git a/tests/test_c_lexer.c b/tests/test_c_lexer.c
index 579ed5d..cc11234 100644
--- a/tests/test_c_lexer.c
+++ b/tests/test_c_lexer.c
@@ -1 +1,50 @@
 #include "stb_c_lexer.h" 
+
+#define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
+#define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
+#define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
+#define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)     CLEX_floatlit
+#define STB_C_LEX_C99_HEX_FLOATS    N   //  "0x{hex}+(.{hex}*)?[pP][-+]?{hex}+     CLEX_floatlit
+#define STB_C_LEX_C_IDENTIFIERS     Y   //  "[_a-zA-Z][_a-zA-Z0-9]*"               CLEX_id
+#define STB_C_LEX_C_DQ_STRINGS      Y   //  double-quote-delimited strings with escapes  CLEX_dqstring
+#define STB_C_LEX_C_SQ_STRINGS      N   //  single-quote-delimited strings with escapes  CLEX_ssstring
+#define STB_C_LEX_C_CHARS           Y   //  single-quote-delimited character with escape CLEX_charlits
+#define STB_C_LEX_C_COMMENTS        Y   //  "/* comment */"
+#define STB_C_LEX_CPP_COMMENTS      Y   //  "// comment to end of line\n"
+#define STB_C_LEX_C_COMPARISONS     Y   //  "==" CLEX_eq  "!=" CLEX_noteq   "<=" CLEX_lesseq  ">=" CLEX_greatereq
+#define STB_C_LEX_C_LOGICAL         Y   //  "&&"  CLEX_andand   "||"  CLEX_oror
+#define STB_C_LEX_C_SHIFTS          Y   //  "<<"  CLEX_shl      ">>"  CLEX_shr
+#define STB_C_LEX_C_INCREMENTS      Y   //  "++"  CLEX_plusplus "--"  CLEX_minusminus
+#define STB_C_LEX_C_ARROW           Y   //  "->"  CLEX_arrow
+#define STB_C_LEX_EQUAL_ARROW       N   //  "=>"  CLEX_eqarrow
+#define STB_C_LEX_C_BITWISEEQ       Y   //  "&="  CLEX_andeq    "|="  CLEX_oreq     "^="  CLEX_xoreq
+#define STB_C_LEX_C_ARITHEQ         Y   //  "+="  CLEX_pluseq   "-="  CLEX_minuseq
+                                        //  "*="  CLEX_muleq    "/="  CLEX_diveq    "%=" CLEX_modeq
+                                        //  if both STB_C_LEX_SHIFTS & STB_C_LEX_ARITHEQ:
+                                        //                      "<<=" CLEX_shleq    ">>=" CLEX_shreq
+
+#define STB_C_LEX_PARSE_SUFFIXES    N   // letters after numbers are parsed as part of those numbers, and must be in suffix list below
+#define STB_C_LEX_DECIMAL_SUFFIXES  ""  // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
+#define STB_C_LEX_HEX_SUFFIXES      ""  // e.g. "uUlL"
+#define STB_C_LEX_OCTAL_SUFFIXES    ""  // e.g. "uUlL"
+#define STB_C_LEX_FLOAT_SUFFIXES    ""  //
+
+#define STB_C_LEX_0_IS_EOF             Y  // if Y, ends parsing at '\0'; if N, returns '\0' as token
+#define STB_C_LEX_INTEGERS_AS_DOUBLES  N  // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_STDLIB==N
+#define STB_C_LEX_MULTILINE_DSTRINGS   N  // allow newlines in double-quoted strings
+#define STB_C_LEX_MULTILINE_SSTRINGS   N  // allow newlines in single-quoted strings
+#define STB_C_LEX_USE_STDLIB           Y  // use strtod,strtol for parsing #s; otherwise inaccurate hack
+#define STB_C_LEX_DOLLAR_IDENTIFIER    Y  // allow $ as an identifier character
+#define STB_C_LEX_FLOAT_NO_DECIMAL     Y  // allow floats that have no decimal point if they have an exponent
+
+#define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES  N   // if Y, all CLEX_ token names are defined, even if never returned
+                                              // leaving it as N should help you catch config bugs
+
+#define STB_C_LEX_DISCARD_PREPROCESSOR    Y   // discard C-preprocessor directives (e.g. after prepocess
+                                              // still have #line, #pragma, etc)
+
+//#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace
+
+#define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
+
+#include "stb_c_lexer.h" 
diff --git a/tests/test_cpp_compilation.cpp b/tests/test_cpp_compilation.cpp
index 66f8f85..fd8c5b6 100644
--- a/tests/test_cpp_compilation.cpp
+++ b/tests/test_cpp_compilation.cpp
@@ -183,53 +183,4 @@ void dummy3(void)
   stb_textedit_paste(0,0,0,0);
 }
 
-#if 0
-#define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
-#define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
-#define STB_C_LEX_C_OCTAL_INTS      Y   //  "[0-7]+"                               CLEX_intlit
-#define STB_C_LEX_C_DECIMAL_FLOATS  Y   //  "[0-9]*(.[0-9]*([eE][-+]?[0-9]+)?)     CLEX_floatlit
-#define STB_C_LEX_C99_HEX_FLOATS    N   //  "0x{hex}+(.{hex}*)?[pP][-+]?{hex}+     CLEX_floatlit
-#define STB_C_LEX_C_IDENTIFIERS     Y   //  "[_a-zA-Z][_a-zA-Z0-9]*"               CLEX_id
-#define STB_C_LEX_C_DQ_STRINGS      Y   //  double-quote-delimited strings with escapes  CLEX_dqstring
-#define STB_C_LEX_C_SQ_STRINGS      N   //  single-quote-delimited strings with escapes  CLEX_ssstring
-#define STB_C_LEX_C_CHARS           Y   //  single-quote-delimited character with escape CLEX_charlits
-#define STB_C_LEX_C_COMMENTS        Y   //  "/* comment */"
-#define STB_C_LEX_CPP_COMMENTS      Y   //  "// comment to end of line\n"
-#define STB_C_LEX_C_COMPARISONS     Y   //  "==" CLEX_eq  "!=" CLEX_noteq   "<=" CLEX_lesseq  ">=" CLEX_greatereq
-#define STB_C_LEX_C_LOGICAL         Y   //  "&&"  CLEX_andand   "||"  CLEX_oror
-#define STB_C_LEX_C_SHIFTS          Y   //  "<<"  CLEX_shl      ">>"  CLEX_shr
-#define STB_C_LEX_C_INCREMENTS      Y   //  "++"  CLEX_plusplus "--"  CLEX_minusminus
-#define STB_C_LEX_C_ARROW           Y   //  "->"  CLEX_arrow
-#define STB_C_LEX_EQUAL_ARROW       N   //  "=>"  CLEX_eqarrow
-#define STB_C_LEX_C_BITWISEEQ       Y   //  "&="  CLEX_andeq    "|="  CLEX_oreq     "^="  CLEX_xoreq
-#define STB_C_LEX_C_ARITHEQ         Y   //  "+="  CLEX_pluseq   "-="  CLEX_minuseq
-                                        //  "*="  CLEX_muleq    "/="  CLEX_diveq    "%=" CLEX_modeq
-                                        //  if both STB_C_LEX_SHIFTS & STB_C_LEX_ARITHEQ:
-                                        //                      "<<=" CLEX_shleq    ">>=" CLEX_shreq
-
-#define STB_C_LEX_PARSE_SUFFIXES    N   // letters after numbers are parsed as part of those numbers, and must be in suffix list below
-#define STB_C_LEX_DECIMAL_SUFFIXES  ""  // decimal integer suffixes e.g. "uUlL" -- these are returned as-is in string storage
-#define STB_C_LEX_HEX_SUFFIXES      ""  // e.g. "uUlL"
-#define STB_C_LEX_OCTAL_SUFFIXES    ""  // e.g. "uUlL"
-#define STB_C_LEX_FLOAT_SUFFIXES    ""  //
-
-#define STB_C_LEX_0_IS_EOF             Y  // if Y, ends parsing at '\0'; if N, returns '\0' as token
-#define STB_C_LEX_INTEGERS_AS_DOUBLES  N  // parses integers as doubles so they can be larger than 'int', but only if STB_C_LEX_STDLIB==N
-#define STB_C_LEX_MULTILINE_DSTRINGS   N  // allow newlines in double-quoted strings
-#define STB_C_LEX_MULTILINE_SSTRINGS   N  // allow newlines in single-quoted strings
-#define STB_C_LEX_USE_STDLIB           Y  // use strtod,strtol for parsing #s; otherwise inaccurate hack
-#define STB_C_LEX_DOLLAR_IDENTIFIER    Y  // allow $ as an identifier character
-#define STB_C_LEX_FLOAT_NO_DECIMAL     Y  // allow floats that have no decimal point if they have an exponent
-
-#define STB_C_LEX_DEFINE_ALL_TOKEN_NAMES  N   // if Y, all CLEX_ token names are defined, even if never returned
-                                              // leaving it as N should help you catch config bugs
-
-#define STB_C_LEX_DISCARD_PREPROCESSOR    Y   // discard C-preprocessor directives (e.g. after prepocess
-                                              // still have #line, #pragma, etc)
-
-//#define STB_C_LEX_ISWHITE(str)    ... // return length in bytes of whitespace characters if first char is whitespace
-
-#define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
-#endif
-
 #include "stb_c_lexer.h"

From 1ee679ca2ef753a528db5ba6801e1067b40481b8 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 11 Jul 2021 17:07:54 -0700
Subject: [PATCH 105/170] update version numbers

---
 README.md            | 49 ++++++++++++++------------------------------
 stb_c_lexer.h        |  3 ++-
 stb_divide.h         |  2 +-
 stb_ds.h             |  2 +-
 stb_image.h          |  2 +-
 stb_image_resize.h   |  2 +-
 stb_image_write.h    |  5 +++--
 stb_rect_pack.h      |  2 +-
 stb_sprintf.h        |  2 +-
 stb_textedit.h       |  2 +-
 stb_truetype.h       |  3 ++-
 stb_vorbis.c         |  2 +-
 tests/test_c_lexer.c |  4 ++--
 tools/README.list    |  2 +-
 14 files changed, 33 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 731b632..4b986ef 100644
--- a/README.md
+++ b/README.md
@@ -20,54 +20,35 @@ by Jorge L. "VinoBS" Rodriguez, and stb_sprintf by Jeff Roberts.
 
 library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
-<<<<<<< HEAD
-**[stb_vorbis.c](stb_vorbis.c)** | 1.21 | audio | 5569 | decode ogg vorbis files from file/memory to float/16-bit signed output
-**[stb_image.h](stb_image.h)** | 2.26 | graphics | 7762 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC, PNM
-**[stb_truetype.h](stb_truetype.h)** | 1.24 | graphics | 5011 | parse, decode, and rasterize characters from truetype fonts
-**[stb_image_write.h](stb_image_write.h)** | 1.15 | graphics | 1690 | image writing to disk: PNG, TGA, BMP, JPG, HDR
-**[stb_image_resize.h](stb_image_resize.h)** | 0.96 | graphics | 2631 | resize images larger/smaller with good quality
-**[stb_rect_pack.h](stb_rect_pack.h)** | 1.00 | graphics | 628 | simple 2D rectangle packer with decent quality
-**[stb_ds.h](stb_ds.h)** | 0.66 | utility | 1893 | typesafe dynamic array and hash tables for C, will compile in C++
-**[stb_sprintf.h](stb_sprintf.h)** | 1.09 | utility | 1879 | fast sprintf, snprintf for C/C++
-**[stb_textedit.h](stb_textedit.h)** | 1.13 | user&nbsp;interface | 1404 | guts of a text editor for games etc implementing them from scratch
+**[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
+**[stb_image.h](stb_image.h)** | 2.27 | graphics | 7890 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_truetype.h](stb_truetype.h)** | 1.25 | graphics | 5011 | parse, decode, and rasterize characters from truetype fonts
+**[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
+**[stb_image_resize.h](stb_image_resize.h)** | 0.97 | graphics | 2634 | resize images larger/smaller with good quality
+**[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
+**[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
+**[stb_sprintf.h](stb_sprintf.h)** | 1.10 | utility | 1906 | fast sprintf, snprintf for C/C++
+**[stb_textedit.h](stb_textedit.h)** | 1.14 | user&nbsp;interface | 1429 | guts of a text editor for games etc implementing them from scratch
 **[stb_voxel_render.h](stb_voxel_render.h)** | 0.89 | 3D&nbsp;graphics | 3807 | Minecraft-esque voxel rendering "engine" with many more features
-**[stb_dxt.h](stb_dxt.h)** | 1.10 | 3D&nbsp;graphics | 753 | Fabian "ryg" Giesen's real-time DXT compressor
+**[stb_dxt.h](stb_dxt.h)** | 1.11 | 3D&nbsp;graphics | 718 | Fabian "ryg" Giesen's real-time DXT compressor
 **[stb_perlin.h](stb_perlin.h)** | 0.5 | 3D&nbsp;graphics | 428 | revised Perlin noise (3D input, 1D output)
 **[stb_easy_font.h](stb_easy_font.h)** | 1.1 | 3D&nbsp;graphics | 305 | quick-and-dirty easy-to-deploy bitmap font for printing frame rate, etc
-**[stb_tilemap_editor.h](stb_tilemap_editor.h)** | 0.41 | game&nbsp;dev | 4161 | embeddable tilemap editor
+**[stb_tilemap_editor.h](stb_tilemap_editor.h)** | 0.42 | game&nbsp;dev | 4187 | embeddable tilemap editor
 **[stb_herringbone_wa...](stb_herringbone_wang_tile.h)** | 0.7 | game&nbsp;dev | 1221 | herringbone Wang tile map generator
-**[stb_c_lexer.h](stb_c_lexer.h)** | 0.11 | parsing | 966 | simplify writing parsers for C-like languages
-**[stb_divide.h](stb_divide.h)** | 0.93 | math | 430 | more useful 32-bit modulus e.g. "euclidean divide"
+**[stb_c_lexer.h](stb_c_lexer.h)** | 0.12 | parsing | 940 | simplify writing parsers for C-like languages
+**[stb_divide.h](stb_divide.h)** | 0.94 | math | 433 | more useful 32-bit modulus e.g. "euclidean divide"
 **[stb_connected_comp...](stb_connected_components.h)** | 0.96 | misc | 1049 | incrementally compute reachability on grids
-**[stb.h](stb.h)** | 2.37 | misc | 14454 | helper functions for C, mostly redundant in C++; basically author's personal stuff
+**[stb.h](stb.h)** | 2.38 | misc | 14457 | _deprecated_ helper functions for C, mostly redundant in C++; basically author's personal stuff
 **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
 Total libraries: 21
-Total lines of C code: 56530
+Total lines of C code: 56730
 
 
 FAQ
 ---
 
-#### How do I use these libraries?
-
-The idea behind single-header file libraries is that they're easy to distribute and deploy
-because all the code is contained in a single file. By default, the .h files in here act as
-their own header files, i.e. they declare the functions contained in the file but don't
-actually result in any code getting compiled.
-
-So in addition, you should select _exactly one_ C/C++ source file that actually instantiates
-the code, preferably a file you're not editing frequently. This file should define a
-specific macro (this is documented per-library) to actually enable the function definitions.
-For example, to use stb_image, you should have exactly one C/C++ file that doesn't
-include stb_image.h regularly, but instead does
-
-    #define STB_IMAGE_IMPLEMENTATION
-    #include "stb_image.h"
-
-The right macro to define is pointed out right at the top of each of these libraries.
-
 #### What's the license?
 
 These libraries are in the public domain. You can do anything you
diff --git a/stb_c_lexer.h b/stb_c_lexer.h
index 8d4305f..bf89dca 100644
--- a/stb_c_lexer.h
+++ b/stb_c_lexer.h
@@ -1,4 +1,4 @@
-// stb_c_lexer.h - v0.11 - public domain Sean Barrett 2013
+// stb_c_lexer.h - v0.12 - public domain Sean Barrett 2013
 // lexer for making little C-like languages with recursive-descent parsers
 //
 // This file provides both the interface and the implementation.
@@ -10,6 +10,7 @@
 // suffixes on integer constants are not handled (you can override this).
 //
 // History:
+//     0.12 fix compilation bug for NUL support; better support separate inclusion
 //     0.11 fix clang static analysis warning
 //     0.10 fix warnings
 //     0.09 hex floats, no-stdlib fixes
diff --git a/stb_divide.h b/stb_divide.h
index 455aacf..6a51e3f 100644
--- a/stb_divide.h
+++ b/stb_divide.h
@@ -3,7 +3,7 @@
 //
 // HISTORY
 //
-//   v0.94              Fix integer overflow issues 
+//   v0.94              Fix integer overflow issues
 //   v0.93  2020-02-02  Write useful exit() value from main()
 //   v0.92  2019-02-25  Fix warning
 //   v0.91  2010-02-27  Fix euclidean division by INT_MIN for non-truncating C
diff --git a/stb_ds.h b/stb_ds.h
index 530f504..e84c82d 100644
--- a/stb_ds.h
+++ b/stb_ds.h
@@ -1,4 +1,4 @@
-/* stb_ds.h - v0.66 - public domain data structures - Sean Barrett 2019
+/* stb_ds.h - v0.67 - public domain data structures - Sean Barrett 2019
 
    This is a single-header-file library that provides easy-to-use
    dynamic arrays and hash tables for C (also works in C++).
diff --git a/stb_image.h b/stb_image.h
index e6207e5..0f8459a 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -48,7 +48,7 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
-      2.27  (          ) document stbi_info better, 16-bit PNM support, bug fixes
+      2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
       2.26  (2020-07-13) many minor fixes
       2.25  (2020-02-02) fix warnings
       2.24  (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically
diff --git a/stb_image_resize.h b/stb_image_resize.h
index 17e46b7..ef9e6fe 100644
--- a/stb_image_resize.h
+++ b/stb_image_resize.h
@@ -1,4 +1,4 @@
-/* stb_image_resize - v0.96 - public domain image resizing
+/* stb_image_resize - v0.97 - public domain image resizing
    by Jorge L Rodriguez (@VinoBS) - 2014
    http://github.com/nothings/stb
 
diff --git a/stb_image_write.h b/stb_image_write.h
index d7ed1d3..e4b32ed 100644
--- a/stb_image_write.h
+++ b/stb_image_write.h
@@ -1,4 +1,4 @@
-/* stb_image_write - v1.15 - public domain - http://nothings.org/stb
+/* stb_image_write - v1.16 - public domain - http://nothings.org/stb
    writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
                                      no warranty implied; use at your own risk
 
@@ -1628,9 +1628,10 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const
 #endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
-      1.15  (          )
+      1.16  (2021-07-11)
              make Deflate code emit uncompressed blocks when it would otherwise expand
              support writing BMPs with alpha channel
+      1.15  (2020-07-13) unknown
       1.14  (2020-02-02) updated JPEG writer to downsample chroma channels
       1.13
       1.12
diff --git a/stb_rect_pack.h b/stb_rect_pack.h
index a0667a1..6a633ce 100644
--- a/stb_rect_pack.h
+++ b/stb_rect_pack.h
@@ -41,7 +41,7 @@
 //
 // Version history:
 //
-//     1.01  (          )  always use large rect mode, expose STBRP__MAXVAL in public section
+//     1.01  (2021-07-11)  always use large rect mode, expose STBRP__MAXVAL in public section
 //     1.00  (2019-02-25)  avoid small space waste; gracefully fail too-wide rectangles
 //     0.99  (2019-02-07)  warning fixes
 //     0.11  (2017-03-03)  return packing success/fail result
diff --git a/stb_sprintf.h b/stb_sprintf.h
index 82358a5..ca432a6 100644
--- a/stb_sprintf.h
+++ b/stb_sprintf.h
@@ -1,4 +1,4 @@
-// stb_sprintf - v1.09 - public domain snprintf() implementation
+// stb_sprintf - v1.10 - public domain snprintf() implementation
 // originally by Jeff Roberts / RAD Game Tools, 2015/10/20
 // http://github.com/nothings/stb
 //
diff --git a/stb_textedit.h b/stb_textedit.h
index 5bb3a57..1442493 100644
--- a/stb_textedit.h
+++ b/stb_textedit.h
@@ -29,7 +29,7 @@
 //
 // VERSION HISTORY
 //
-//   1.14 (          ) page up/down, various fixes
+//   1.14 (2021-07-11) page up/down, various fixes
 //   1.13 (2019-02-07) fix bug in undo size management
 //   1.12 (2018-01-29) user can change STB_TEXTEDIT_KEYTYPE, fix redo to avoid crash
 //   1.11 (2017-03-03) fix HOME on last line, dragging off single-line textfield
diff --git a/stb_truetype.h b/stb_truetype.h
index 3327d20..c79968a 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -1,4 +1,4 @@
-// stb_truetype.h - v1.24 - public domain
+// stb_truetype.h - v1.25 - public domain
 // authored from 2009-2020 by Sean Barrett / RAD Game Tools
 //
 // =======================================================================
@@ -58,6 +58,7 @@
 //
 // VERSION HISTORY
 //
+//   1.25 (2020-07-11) many fixes
 //   1.24 (2020-02-05) fix warning
 //   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
 //   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
diff --git a/stb_vorbis.c b/stb_vorbis.c
index 534e689..3e5c250 100644
--- a/stb_vorbis.c
+++ b/stb_vorbis.c
@@ -36,7 +36,7 @@
 //    AnthoFoxo          github:morlat       Gabriel Ravier
 //
 // Partial history:
-//    1.22    -            - various small fixes
+//    1.22    - 2021-07-11 - various small fixes
 //    1.21    - 2021-07-02 - fix bug for files with no comments
 //    1.20    - 2020-07-11 - several small fixes
 //    1.19    - 2020-02-05 - warnings
diff --git a/tests/test_c_lexer.c b/tests/test_c_lexer.c
index cc11234..7921b1c 100644
--- a/tests/test_c_lexer.c
+++ b/tests/test_c_lexer.c
@@ -1,4 +1,4 @@
-#include "stb_c_lexer.h" 
+#include "stb_c_lexer.h"
 
 #define STB_C_LEX_C_DECIMAL_INTS    Y   //  "0|[1-9][0-9]*"                        CLEX_intlit
 #define STB_C_LEX_C_HEX_INTS        Y   //  "0x[0-9a-fA-F]+"                       CLEX_intlit
@@ -47,4 +47,4 @@
 
 #define STB_C_LEXER_DEFINITIONS         // This line prevents the header file from replacing your definitions
 
-#include "stb_c_lexer.h" 
+#include "stb_c_lexer.h"
diff --git a/tools/README.list b/tools/README.list
index 2464351..3daa276 100644
--- a/tools/README.list
+++ b/tools/README.list
@@ -16,6 +16,6 @@ stb_herringbone_wang_tile.h | game dev         | herringbone Wang tile map gener
 stb_c_lexer.h               | parsing          | simplify writing parsers for C-like languages
 stb_divide.h                | math             | more useful 32-bit modulus e.g. "euclidean divide"
 stb_connected_components.h  | misc             | incrementally compute reachability on grids
-stb.h                       | misc             | helper functions for C, mostly redundant in C++; basically author's personal stuff
+stb.h                       | misc             | _deprecated_ helper functions for C, mostly redundant in C++; basically author's personal stuff
 stb_leakcheck.h             | misc             | quick-and-dirty malloc/free leak-checking
 stb_include.h               | misc             | implement recursive #include support, particularly for GLSL

From 3cfb892cf3144eb0bd4d85f84695193fb6f007c0 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 11 Jul 2021 17:56:55 -0700
Subject: [PATCH 106/170] stb_dxt: Fix bug in table generator

Forgot to put the *100 back, oops.
---
 stb_dxt.h | 131 +++++++++++++++++++++++++++---------------------------
 1 file changed, 66 insertions(+), 65 deletions(-)

diff --git a/stb_dxt.h b/stb_dxt.h
index 02a06ae..ed8405a 100644
--- a/stb_dxt.h
+++ b/stb_dxt.h
@@ -10,6 +10,7 @@
 //     You can turn on dithering and "high quality" using mode.
 //
 // version history:
+//   v1.12  - (ryg) fix bug in single-color table generator
 //   v1.11  - (ryg) avoid racy global init, better single-color tables, remove dither
 //   v1.10  - (i.c) various small quality improvements
 //   v1.09  - (stb) update documentation re: surprising alpha channel requirement
@@ -92,72 +93,72 @@ STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *sr
 #endif
 
 static const unsigned char stb__OMatch5[256][2] = {
-   {  0,  0 }, {  0,  0 }, {  0,  0 }, {  0,  0 }, {  0,  0 }, {  1,  1 }, {  1,  1 }, {  1,  1 },
-   {  1,  1 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  2,  2 }, {  2,  2 }, {  2,  2 },
-   {  2,  2 }, {  2,  2 }, {  2,  2 }, {  2,  2 }, {  2,  2 }, {  3,  3 }, {  3,  3 }, {  3,  3 },
-   {  3,  3 }, {  3,  3 }, {  3,  3 }, {  3,  3 }, {  3,  3 }, {  4,  4 }, {  4,  4 }, {  4,  4 },
-   {  4,  4 }, {  4,  4 }, {  4,  4 }, {  4,  4 }, {  4,  4 }, {  4,  4 }, {  5,  5 }, {  5,  5 },
-   {  5,  5 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  6,  6 }, {  6,  6 },
-   {  6,  6 }, {  6,  6 }, {  6,  6 }, {  6,  6 }, {  6,  6 }, {  6,  6 }, {  7,  7 }, {  7,  7 },
-   {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  8,  8 }, {  8,  8 },
-   {  8,  8 }, {  8,  8 }, {  8,  8 }, {  8,  8 }, {  8,  8 }, {  8,  8 }, {  8,  8 }, {  9,  9 },
-   {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, { 10, 10 },
-   { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 11, 11 },
-   { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 12, 12 },
-   { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 }, { 12, 12 },
-   { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 },
-   { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 14 },
-   { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 },
-   { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 },
-   { 16, 16 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 },
-   { 17, 17 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 },
-   { 18, 18 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 },
-   { 19, 19 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 },
-   { 20, 20 }, { 20, 20 }, { 21, 21 }, { 21, 21 }, { 21, 21 }, { 21, 21 }, { 21, 21 }, { 21, 21 },
-   { 21, 21 }, { 21, 21 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 22 },
-   { 22, 22 }, { 22, 22 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 23 },
-   { 23, 23 }, { 23, 23 }, { 24, 24 }, { 24, 24 }, { 24, 24 }, { 24, 24 }, { 24, 24 }, { 24, 24 },
-   { 24, 24 }, { 24, 24 }, { 24, 24 }, { 25, 25 }, { 25, 25 }, { 25, 25 }, { 25, 25 }, { 25, 25 },
-   { 25, 25 }, { 25, 25 }, { 25, 25 }, { 26, 26 }, { 26, 26 }, { 26, 26 }, { 26, 26 }, { 26, 26 },
-   { 26, 26 }, { 26, 26 }, { 26, 26 }, { 27, 27 }, { 27, 27 }, { 27, 27 }, { 27, 27 }, { 27, 27 },
-   { 27, 27 }, { 27, 27 }, { 27, 27 }, { 28, 28 }, { 28, 28 }, { 28, 28 }, { 28, 28 }, { 28, 28 },
-   { 28, 28 }, { 28, 28 }, { 28, 28 }, { 28, 28 }, { 29, 29 }, { 29, 29 }, { 29, 29 }, { 29, 29 },
-   { 29, 29 }, { 29, 29 }, { 29, 29 }, { 29, 29 }, { 30, 30 }, { 30, 30 }, { 30, 30 }, { 30, 30 },
-   { 30, 30 }, { 30, 30 }, { 30, 30 }, { 30, 30 }, { 31, 31 }, { 31, 31 }, { 31, 31 }, { 31, 31 },
+   {  0,  0 }, {  0,  0 }, {  0,  1 }, {  0,  1 }, {  1,  0 }, {  1,  0 }, {  1,  0 }, {  1,  1 },
+   {  1,  1 }, {  1,  1 }, {  1,  2 }, {  0,  4 }, {  2,  1 }, {  2,  1 }, {  2,  1 }, {  2,  2 },
+   {  2,  2 }, {  2,  2 }, {  2,  3 }, {  1,  5 }, {  3,  2 }, {  3,  2 }, {  4,  0 }, {  3,  3 },
+   {  3,  3 }, {  3,  3 }, {  3,  4 }, {  3,  4 }, {  3,  4 }, {  3,  5 }, {  4,  3 }, {  4,  3 },
+   {  5,  2 }, {  4,  4 }, {  4,  4 }, {  4,  5 }, {  4,  5 }, {  5,  4 }, {  5,  4 }, {  5,  4 },
+   {  6,  3 }, {  5,  5 }, {  5,  5 }, {  5,  6 }, {  4,  8 }, {  6,  5 }, {  6,  5 }, {  6,  5 },
+   {  6,  6 }, {  6,  6 }, {  6,  6 }, {  6,  7 }, {  5,  9 }, {  7,  6 }, {  7,  6 }, {  8,  4 },
+   {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  8 }, {  7,  8 }, {  7,  8 }, {  7,  9 }, {  8,  7 },
+   {  8,  7 }, {  9,  6 }, {  8,  8 }, {  8,  8 }, {  8,  9 }, {  8,  9 }, {  9,  8 }, {  9,  8 },
+   {  9,  8 }, { 10,  7 }, {  9,  9 }, {  9,  9 }, {  9, 10 }, {  8, 12 }, { 10,  9 }, { 10,  9 },
+   { 10,  9 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 11 }, {  9, 13 }, { 11, 10 }, { 11, 10 },
+   { 12,  8 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 12 }, { 11, 12 }, { 11, 12 }, { 11, 13 },
+   { 12, 11 }, { 12, 11 }, { 13, 10 }, { 12, 12 }, { 12, 12 }, { 12, 13 }, { 12, 13 }, { 13, 12 },
+   { 13, 12 }, { 13, 12 }, { 14, 11 }, { 13, 13 }, { 13, 13 }, { 13, 14 }, { 12, 16 }, { 14, 13 },
+   { 14, 13 }, { 14, 13 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 15 }, { 13, 17 }, { 15, 14 },
+   { 15, 14 }, { 16, 12 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 16 }, { 15, 16 }, { 15, 16 },
+   { 15, 17 }, { 16, 15 }, { 16, 15 }, { 17, 14 }, { 16, 16 }, { 16, 16 }, { 16, 17 }, { 16, 17 },
+   { 17, 16 }, { 17, 16 }, { 17, 16 }, { 18, 15 }, { 17, 17 }, { 17, 17 }, { 17, 18 }, { 16, 20 },
+   { 18, 17 }, { 18, 17 }, { 18, 17 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 19 }, { 17, 21 },
+   { 19, 18 }, { 19, 18 }, { 20, 16 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 20 }, { 19, 20 },
+   { 19, 20 }, { 19, 21 }, { 20, 19 }, { 20, 19 }, { 21, 18 }, { 20, 20 }, { 20, 20 }, { 20, 21 },
+   { 20, 21 }, { 21, 20 }, { 21, 20 }, { 21, 20 }, { 22, 19 }, { 21, 21 }, { 21, 21 }, { 21, 22 },
+   { 20, 24 }, { 22, 21 }, { 22, 21 }, { 22, 21 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 23 },
+   { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 24 },
+   { 23, 24 }, { 23, 24 }, { 23, 25 }, { 24, 23 }, { 24, 23 }, { 25, 22 }, { 24, 24 }, { 24, 24 },
+   { 24, 25 }, { 24, 25 }, { 25, 24 }, { 25, 24 }, { 25, 24 }, { 26, 23 }, { 25, 25 }, { 25, 25 },
+   { 25, 26 }, { 24, 28 }, { 26, 25 }, { 26, 25 }, { 26, 25 }, { 26, 26 }, { 26, 26 }, { 26, 26 },
+   { 26, 27 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 27, 27 }, { 27, 27 }, { 27, 27 },
+   { 27, 28 }, { 27, 28 }, { 27, 28 }, { 27, 29 }, { 28, 27 }, { 28, 27 }, { 29, 26 }, { 28, 28 },
+   { 28, 28 }, { 28, 29 }, { 28, 29 }, { 29, 28 }, { 29, 28 }, { 29, 28 }, { 30, 27 }, { 29, 29 },
+   { 29, 29 }, { 29, 30 }, { 29, 30 }, { 30, 29 }, { 30, 29 }, { 30, 29 }, { 30, 30 }, { 30, 30 },
+   { 30, 30 }, { 30, 31 }, { 30, 31 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 },
 };
 static const unsigned char stb__OMatch6[256][2] = {
-   {  0,  0 }, {  0,  0 }, {  0,  0 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  1,  1 }, {  2,  2 },
-   {  2,  2 }, {  2,  2 }, {  2,  2 }, {  3,  3 }, {  3,  3 }, {  3,  3 }, {  3,  3 }, {  4,  4 },
-   {  4,  4 }, {  4,  4 }, {  4,  4 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  5,  5 }, {  6,  6 },
-   {  6,  6 }, {  6,  6 }, {  6,  6 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  7,  7 }, {  8,  8 },
-   {  8,  8 }, {  8,  8 }, {  8,  8 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, {  9,  9 }, { 10, 10 },
-   { 10, 10 }, { 10, 10 }, { 10, 10 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 12, 12 },
-   { 12, 12 }, { 12, 12 }, { 12, 12 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 13, 13 }, { 14, 14 },
-   { 14, 14 }, { 14, 14 }, { 14, 14 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 16, 16 },
-   { 16, 16 }, { 16, 16 }, { 16, 16 }, { 16, 16 }, { 17, 17 }, { 17, 17 }, { 17, 17 }, { 17, 17 },
-   { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 19 },
-   { 20, 20 }, { 20, 20 }, { 20, 20 }, { 20, 20 }, { 21, 21 }, { 21, 21 }, { 21, 21 }, { 21, 21 },
-   { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 23 },
-   { 24, 24 }, { 24, 24 }, { 24, 24 }, { 24, 24 }, { 25, 25 }, { 25, 25 }, { 25, 25 }, { 25, 25 },
-   { 26, 26 }, { 26, 26 }, { 26, 26 }, { 26, 26 }, { 27, 27 }, { 27, 27 }, { 27, 27 }, { 27, 27 },
-   { 28, 28 }, { 28, 28 }, { 28, 28 }, { 28, 28 }, { 29, 29 }, { 29, 29 }, { 29, 29 }, { 29, 29 },
-   { 30, 30 }, { 30, 30 }, { 30, 30 }, { 30, 30 }, { 31, 31 }, { 31, 31 }, { 31, 31 }, { 31, 31 },
-   { 32, 32 }, { 32, 32 }, { 32, 32 }, { 32, 32 }, { 32, 32 }, { 33, 33 }, { 33, 33 }, { 33, 33 },
-   { 33, 33 }, { 34, 34 }, { 34, 34 }, { 34, 34 }, { 34, 34 }, { 35, 35 }, { 35, 35 }, { 35, 35 },
-   { 35, 35 }, { 36, 36 }, { 36, 36 }, { 36, 36 }, { 36, 36 }, { 37, 37 }, { 37, 37 }, { 37, 37 },
-   { 37, 37 }, { 38, 38 }, { 38, 38 }, { 38, 38 }, { 38, 38 }, { 39, 39 }, { 39, 39 }, { 39, 39 },
-   { 39, 39 }, { 40, 40 }, { 40, 40 }, { 40, 40 }, { 40, 40 }, { 41, 41 }, { 41, 41 }, { 41, 41 },
-   { 41, 41 }, { 42, 42 }, { 42, 42 }, { 42, 42 }, { 42, 42 }, { 43, 43 }, { 43, 43 }, { 43, 43 },
-   { 43, 43 }, { 44, 44 }, { 44, 44 }, { 44, 44 }, { 44, 44 }, { 45, 45 }, { 45, 45 }, { 45, 45 },
-   { 45, 45 }, { 46, 46 }, { 46, 46 }, { 46, 46 }, { 46, 46 }, { 47, 47 }, { 47, 47 }, { 47, 47 },
-   { 47, 47 }, { 48, 48 }, { 48, 48 }, { 48, 48 }, { 48, 48 }, { 48, 48 }, { 49, 49 }, { 49, 49 },
-   { 49, 49 }, { 49, 49 }, { 50, 50 }, { 50, 50 }, { 50, 50 }, { 50, 50 }, { 51, 51 }, { 51, 51 },
-   { 51, 51 }, { 51, 51 }, { 52, 52 }, { 52, 52 }, { 52, 52 }, { 52, 52 }, { 53, 53 }, { 53, 53 },
-   { 53, 53 }, { 53, 53 }, { 54, 54 }, { 54, 54 }, { 54, 54 }, { 54, 54 }, { 55, 55 }, { 55, 55 },
-   { 55, 55 }, { 55, 55 }, { 56, 56 }, { 56, 56 }, { 56, 56 }, { 56, 56 }, { 57, 57 }, { 57, 57 },
-   { 57, 57 }, { 57, 57 }, { 58, 58 }, { 58, 58 }, { 58, 58 }, { 58, 58 }, { 59, 59 }, { 59, 59 },
-   { 59, 59 }, { 59, 59 }, { 60, 60 }, { 60, 60 }, { 60, 60 }, { 60, 60 }, { 61, 61 }, { 61, 61 },
-   { 61, 61 }, { 61, 61 }, { 62, 62 }, { 62, 62 }, { 62, 62 }, { 62, 62 }, { 63, 63 }, { 63, 63 },
+   {  0,  0 }, {  0,  1 }, {  1,  0 }, {  1,  1 }, {  1,  1 }, {  1,  2 }, {  2,  1 }, {  2,  2 },
+   {  2,  2 }, {  2,  3 }, {  3,  2 }, {  3,  3 }, {  3,  3 }, {  3,  4 }, {  4,  3 }, {  4,  4 },
+   {  4,  4 }, {  4,  5 }, {  5,  4 }, {  5,  5 }, {  5,  5 }, {  5,  6 }, {  6,  5 }, {  6,  6 },
+   {  6,  6 }, {  6,  7 }, {  7,  6 }, {  7,  7 }, {  7,  7 }, {  7,  8 }, {  8,  7 }, {  8,  8 },
+   {  8,  8 }, {  8,  9 }, {  9,  8 }, {  9,  9 }, {  9,  9 }, {  9, 10 }, { 10,  9 }, { 10, 10 },
+   { 10, 10 }, { 10, 11 }, { 11, 10 }, {  8, 16 }, { 11, 11 }, { 11, 12 }, { 12, 11 }, {  9, 17 },
+   { 12, 12 }, { 12, 13 }, { 13, 12 }, { 11, 16 }, { 13, 13 }, { 13, 14 }, { 14, 13 }, { 12, 17 },
+   { 14, 14 }, { 14, 15 }, { 15, 14 }, { 14, 16 }, { 15, 15 }, { 15, 16 }, { 16, 14 }, { 16, 15 },
+   { 17, 14 }, { 16, 16 }, { 16, 17 }, { 17, 16 }, { 18, 15 }, { 17, 17 }, { 17, 18 }, { 18, 17 },
+   { 20, 14 }, { 18, 18 }, { 18, 19 }, { 19, 18 }, { 21, 15 }, { 19, 19 }, { 19, 20 }, { 20, 19 },
+   { 20, 20 }, { 20, 20 }, { 20, 21 }, { 21, 20 }, { 21, 21 }, { 21, 21 }, { 21, 22 }, { 22, 21 },
+   { 22, 22 }, { 22, 22 }, { 22, 23 }, { 23, 22 }, { 23, 23 }, { 23, 23 }, { 23, 24 }, { 24, 23 },
+   { 24, 24 }, { 24, 24 }, { 24, 25 }, { 25, 24 }, { 25, 25 }, { 25, 25 }, { 25, 26 }, { 26, 25 },
+   { 26, 26 }, { 26, 26 }, { 26, 27 }, { 27, 26 }, { 24, 32 }, { 27, 27 }, { 27, 28 }, { 28, 27 },
+   { 25, 33 }, { 28, 28 }, { 28, 29 }, { 29, 28 }, { 27, 32 }, { 29, 29 }, { 29, 30 }, { 30, 29 },
+   { 28, 33 }, { 30, 30 }, { 30, 31 }, { 31, 30 }, { 30, 32 }, { 31, 31 }, { 31, 32 }, { 32, 30 },
+   { 32, 31 }, { 33, 30 }, { 32, 32 }, { 32, 33 }, { 33, 32 }, { 34, 31 }, { 33, 33 }, { 33, 34 },
+   { 34, 33 }, { 36, 30 }, { 34, 34 }, { 34, 35 }, { 35, 34 }, { 37, 31 }, { 35, 35 }, { 35, 36 },
+   { 36, 35 }, { 36, 36 }, { 36, 36 }, { 36, 37 }, { 37, 36 }, { 37, 37 }, { 37, 37 }, { 37, 38 },
+   { 38, 37 }, { 38, 38 }, { 38, 38 }, { 38, 39 }, { 39, 38 }, { 39, 39 }, { 39, 39 }, { 39, 40 },
+   { 40, 39 }, { 40, 40 }, { 40, 40 }, { 40, 41 }, { 41, 40 }, { 41, 41 }, { 41, 41 }, { 41, 42 },
+   { 42, 41 }, { 42, 42 }, { 42, 42 }, { 42, 43 }, { 43, 42 }, { 40, 48 }, { 43, 43 }, { 43, 44 },
+   { 44, 43 }, { 41, 49 }, { 44, 44 }, { 44, 45 }, { 45, 44 }, { 43, 48 }, { 45, 45 }, { 45, 46 },
+   { 46, 45 }, { 44, 49 }, { 46, 46 }, { 46, 47 }, { 47, 46 }, { 46, 48 }, { 47, 47 }, { 47, 48 },
+   { 48, 46 }, { 48, 47 }, { 49, 46 }, { 48, 48 }, { 48, 49 }, { 49, 48 }, { 50, 47 }, { 49, 49 },
+   { 49, 50 }, { 50, 49 }, { 52, 46 }, { 50, 50 }, { 50, 51 }, { 51, 50 }, { 53, 47 }, { 51, 51 },
+   { 51, 52 }, { 52, 51 }, { 52, 52 }, { 52, 52 }, { 52, 53 }, { 53, 52 }, { 53, 53 }, { 53, 53 },
+   { 53, 54 }, { 54, 53 }, { 54, 54 }, { 54, 54 }, { 54, 55 }, { 55, 54 }, { 55, 55 }, { 55, 55 },
+   { 55, 56 }, { 56, 55 }, { 56, 56 }, { 56, 56 }, { 56, 57 }, { 57, 56 }, { 57, 57 }, { 57, 57 },
+   { 57, 58 }, { 58, 57 }, { 58, 58 }, { 58, 58 }, { 58, 59 }, { 59, 58 }, { 59, 59 }, { 59, 59 },
+   { 59, 60 }, { 60, 59 }, { 60, 60 }, { 60, 60 }, { 60, 61 }, { 61, 60 }, { 61, 61 }, { 61, 61 },
+   { 61, 62 }, { 62, 61 }, { 62, 62 }, { 62, 62 }, { 62, 63 }, { 63, 62 }, { 63, 63 }, { 63, 63 },
 };
 
 static int stb__Mul8Bit(int a, int b)
@@ -649,7 +650,7 @@ int main()
             for (mx=0;mx<size;mx++) {
                int mine = (mn * dequant) >> 4;
                int maxe = (mx * dequant) >> 4;
-               int err = abs(stb__Lerp13(maxe, mine) - j);
+               int err = abs(stb__Lerp13(maxe, mine) - j) * 100;
 
                // DX10 spec says that interpolation must be within 3% of "correct" result,
                // add this as error term. Normally we'd expect a random distribution of

From 5324597dcb80510f71db05246b3932cb9876b9e4 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 11 Jul 2021 17:58:19 -0700
Subject: [PATCH 107/170] README: update stb_dxt version for fix

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4b986ef..72731b4 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ library    | lastest version | category | LoC | description
 **[stb_sprintf.h](stb_sprintf.h)** | 1.10 | utility | 1906 | fast sprintf, snprintf for C/C++
 **[stb_textedit.h](stb_textedit.h)** | 1.14 | user&nbsp;interface | 1429 | guts of a text editor for games etc implementing them from scratch
 **[stb_voxel_render.h](stb_voxel_render.h)** | 0.89 | 3D&nbsp;graphics | 3807 | Minecraft-esque voxel rendering "engine" with many more features
-**[stb_dxt.h](stb_dxt.h)** | 1.11 | 3D&nbsp;graphics | 718 | Fabian "ryg" Giesen's real-time DXT compressor
+**[stb_dxt.h](stb_dxt.h)** | 1.12 | 3D&nbsp;graphics | 718 | Fabian "ryg" Giesen's real-time DXT compressor
 **[stb_perlin.h](stb_perlin.h)** | 0.5 | 3D&nbsp;graphics | 428 | revised Perlin noise (3D input, 1D output)
 **[stb_easy_font.h](stb_easy_font.h)** | 1.1 | 3D&nbsp;graphics | 305 | quick-and-dirty easy-to-deploy bitmap font for printing frame rate, etc
 **[stb_tilemap_editor.h](stb_tilemap_editor.h)** | 0.42 | game&nbsp;dev | 4187 | embeddable tilemap editor

From 7023e273f1513b68b8d4086077c6faca555d50df Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 11 Jul 2021 18:06:46 -0700
Subject: [PATCH 108/170] fix stb_dxt version

---
 README.md | 6 +++---
 stb_dxt.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 72731b4..173b1ac 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@ library    | lastest version | category | LoC | description
 **[stb_sprintf.h](stb_sprintf.h)** | 1.10 | utility | 1906 | fast sprintf, snprintf for C/C++
 **[stb_textedit.h](stb_textedit.h)** | 1.14 | user&nbsp;interface | 1429 | guts of a text editor for games etc implementing them from scratch
 **[stb_voxel_render.h](stb_voxel_render.h)** | 0.89 | 3D&nbsp;graphics | 3807 | Minecraft-esque voxel rendering "engine" with many more features
-**[stb_dxt.h](stb_dxt.h)** | 1.12 | 3D&nbsp;graphics | 718 | Fabian "ryg" Giesen's real-time DXT compressor
+**[stb_dxt.h](stb_dxt.h)** | 1.12 | 3D&nbsp;graphics | 719 | Fabian "ryg" Giesen's real-time DXT compressor
 **[stb_perlin.h](stb_perlin.h)** | 0.5 | 3D&nbsp;graphics | 428 | revised Perlin noise (3D input, 1D output)
 **[stb_easy_font.h](stb_easy_font.h)** | 1.1 | 3D&nbsp;graphics | 305 | quick-and-dirty easy-to-deploy bitmap font for printing frame rate, etc
 **[stb_tilemap_editor.h](stb_tilemap_editor.h)** | 0.42 | game&nbsp;dev | 4187 | embeddable tilemap editor
@@ -38,12 +38,12 @@ library    | lastest version | category | LoC | description
 **[stb_c_lexer.h](stb_c_lexer.h)** | 0.12 | parsing | 940 | simplify writing parsers for C-like languages
 **[stb_divide.h](stb_divide.h)** | 0.94 | math | 433 | more useful 32-bit modulus e.g. "euclidean divide"
 **[stb_connected_comp...](stb_connected_components.h)** | 0.96 | misc | 1049 | incrementally compute reachability on grids
-**[stb.h](stb.h)** | 2.38 | misc | 14457 | _deprecated_ helper functions for C, mostly redundant in C++; basically author's personal stuff
+**[stb.h](stb.h)** | 2.37 | misc | 13105 | _deprecated_ helper functions for C, mostly redundant in C++; basically author's personal stuff
 **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
 Total libraries: 21
-Total lines of C code: 56730
+Total lines of C code: 55379
 
 
 FAQ
diff --git a/stb_dxt.h b/stb_dxt.h
index ed8405a..6150a87 100644
--- a/stb_dxt.h
+++ b/stb_dxt.h
@@ -1,4 +1,4 @@
-// stb_dxt.h - v1.11 - DXT1/DXT5 compressor - public domain
+// stb_dxt.h - v1.12 - DXT1/DXT5 compressor - public domain
 // original by fabian "ryg" giesen - ported to C by stb
 // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
 //

From c4ef8e1fdcc67d5c60501760e0068b9f83f53ca7 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 01:23:29 -0700
Subject: [PATCH 109/170] fix version date

---
 stb_truetype.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index c79968a..9414a63 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -1,5 +1,5 @@
 // stb_truetype.h - v1.25 - public domain
-// authored from 2009-2020 by Sean Barrett / RAD Game Tools
+// authored from 2009-2021 by Sean Barrett / RAD Game Tools
 //
 // =======================================================================
 //
@@ -58,7 +58,7 @@
 //
 // VERSION HISTORY
 //
-//   1.25 (2020-07-11) many fixes
+//   1.25 (2021-07-11) many fixes
 //   1.24 (2020-02-05) fix warning
 //   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
 //   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
@@ -4913,6 +4913,12 @@ STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const
 
 // FULL VERSION HISTORY
 //
+//   1.25 (2021-07-11) many fixes
+//   1.24 (2020-02-05) fix warning
+//   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
+//   1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined
+//   1.21 (2019-02-25) fix warning
+//   1.20 (2019-02-07) PackFontRange skips missing codepoints; GetScaleFontVMetrics()
 //   1.19 (2018-02-11) OpenType GPOS kerning (horizontal only), STBTT_fmod
 //   1.18 (2018-01-29) add missing function
 //   1.17 (2017-07-23) make more arguments const; doc fix

From 5a0bb8b1c1b1ca3f4e2485f4114c1c8ea021b781 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 01:39:27 -0700
Subject: [PATCH 110/170] update readme

---
 README.md              | 22 ++++++++++++++++++++--
 tools/README.footer.md | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 173b1ac..dfc1ebb 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
 **[stb_image.h](stb_image.h)** | 2.27 | graphics | 7890 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
-**[stb_truetype.h](stb_truetype.h)** | 1.25 | graphics | 5011 | parse, decode, and rasterize characters from truetype fonts
+**[stb_truetype.h](stb_truetype.h)** | 1.25 | graphics | 5017 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
 **[stb_image_resize.h](stb_image_resize.h)** | 0.97 | graphics | 2634 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
@@ -43,7 +43,7 @@ library    | lastest version | category | LoC | description
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
 Total libraries: 21
-Total lines of C code: 55379
+Total lines of C code: 55385
 
 
 FAQ
@@ -59,6 +59,24 @@ They are also licensed under the MIT open source license, if you have lawyers
 who are unhappy with public domain. Every source file includes an explicit
 dual-license for you to choose from.
 
+#### How do I use these libraries?
+
+The idea behind single-header file libraries is that they're easy to distribute and deploy
+because all the code is contained in a single file. By default, the .h files in here act as
+their own header files, i.e. they declare the functions contained in the file but don't
+actually result in any code getting compiled.
+
+So in addition, you should select _exactly one_ C/C++ source file that actually instantiates
+the code, preferably a file you're not editing frequently. This file should define a
+specific macro (this is documented per-library) to actually enable the function definitions.
+For example, to use stb_image, you should have exactly one C/C++ file that doesn't
+include stb_image.h regularly, but instead does
+
+    #define STB_IMAGE_IMPLEMENTATION
+    #include "stb_image.h"
+
+The right macro to define is pointed out right at the top of each of these libraries.
+
 #### <a name="other_libs"></a> Are there other single-file public-domain/open source libraries with minimal dependencies out there?
 
 [Yes.](https://github.com/nothings/single_file_libs)
diff --git a/tools/README.footer.md b/tools/README.footer.md
index 6ef5e1f..3eb8dc0 100644
--- a/tools/README.footer.md
+++ b/tools/README.footer.md
@@ -12,6 +12,24 @@ They are also licensed under the MIT open source license, if you have lawyers
 who are unhappy with public domain. Every source file includes an explicit
 dual-license for you to choose from.
 
+#### How do I use these libraries?
+
+The idea behind single-header file libraries is that they're easy to distribute and deploy
+because all the code is contained in a single file. By default, the .h files in here act as
+their own header files, i.e. they declare the functions contained in the file but don't
+actually result in any code getting compiled.
+
+So in addition, you should select _exactly one_ C/C++ source file that actually instantiates
+the code, preferably a file you're not editing frequently. This file should define a
+specific macro (this is documented per-library) to actually enable the function definitions.
+For example, to use stb_image, you should have exactly one C/C++ file that doesn't
+include stb_image.h regularly, but instead does
+
+    #define STB_IMAGE_IMPLEMENTATION
+    #include "stb_image.h"
+
+The right macro to define is pointed out right at the top of each of these libraries.
+
 #### <a name="other_libs"></a> Are there other single-file public-domain/open source libraries with minimal dependencies out there?
 
 [Yes.](https://github.com/nothings/single_file_libs)

From a0a939058c579ddefd4c5671b046f29d12aeae01 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 01:50:47 -0700
Subject: [PATCH 111/170] update README

---
 README.md         | 5 +++--
 tools/README.list | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index dfc1ebb..899e38c 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ by Jorge L. "VinoBS" Rodriguez, and stb_sprintf by Jeff Roberts.
 library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
+**[stb_hexwave.h](stb_hexwave.h)** | 0.5 | audio | 680 | audio waveform synthesizer
 **[stb_image.h](stb_image.h)** | 2.27 | graphics | 7890 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 **[stb_truetype.h](stb_truetype.h)** | 1.25 | graphics | 5017 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
@@ -42,8 +43,8 @@ library    | lastest version | category | LoC | description
 **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
-Total libraries: 21
-Total lines of C code: 55385
+Total libraries: 22
+Total lines of C code: 56065
 
 
 FAQ
diff --git a/tools/README.list b/tools/README.list
index 3daa276..c2ae379 100644
--- a/tools/README.list
+++ b/tools/README.list
@@ -1,4 +1,5 @@
 stb_vorbis.c                | audio            | decode ogg vorbis files from file/memory to float/16-bit signed output
+stb_hexwave.h               | audio            | audio waveform synthesizer
 stb_image.h                 | graphics         | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 stb_truetype.h              | graphics         | parse, decode, and rasterize characters from truetype fonts
 stb_image_write.h           | graphics         | image writing to disk: PNG, TGA, BMP

From cd6b6f70ec2be6ee80e142c6b7388d0d00d41b2f Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 15:57:39 -0700
Subject: [PATCH 112/170] fix compiling with NO_HDR NO_LINEAR

---
 stb_image.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 0f8459a..08020ac 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1032,7 +1032,7 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add)
 }
 
 // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 {
    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
@@ -1055,7 +1055,7 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add)
    return stbi__malloc(a*b*c + add);
 }
 
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
 {
    if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;

From 4adb57af424cbdd1a27937995a11509cc65e0217 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 15:58:08 -0700
Subject: [PATCH 113/170] clean up project file

---
 tests/stb.dsp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tests/stb.dsp b/tests/stb.dsp
index 50fff38..ba13ba1 100644
--- a/tests/stb.dsp
+++ b/tests/stb.dsp
@@ -154,14 +154,6 @@ SOURCE=..\stb_perlin.h
 # End Source File
 # Begin Source File
 
-SOURCE=..\stb_pg.h
-# End Source File
-# Begin Source File
-
-SOURCE=.\stb_query.h
-# End Source File
-# Begin Source File
-
 SOURCE=..\stb_rect_pack.h
 # End Source File
 # Begin Source File

From 3a1174060a7dd4eb652d4e6854bc4cd98c159200 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 21:27:12 -0700
Subject: [PATCH 114/170] opengl stb_truetype demo app

---
 tests/truetype_test_win32.c | 184 ++++++++++++++++++++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 tests/truetype_test_win32.c

diff --git a/tests/truetype_test_win32.c b/tests/truetype_test_win32.c
new file mode 100644
index 0000000..c9aed99
--- /dev/null
+++ b/tests/truetype_test_win32.c
@@ -0,0 +1,184 @@
+// tested in VC6 (1998) and VS 2019
+#define _CRT_SECURE_NO_WARNINGS
+#define WIN32_MEAN_AND_LEAN
+#include <windows.h>
+
+#include <stdio.h>
+#include <tchar.h>
+
+#define STB_TRUETYPE_IMPLEMENTATION
+#include "stb_truetype.h"
+
+#include <gl/gl.h>
+#include <gl/glu.h>
+
+int screen_x=1024, screen_y=768;
+GLuint tex;
+
+unsigned char ttf_buffer[1<<20];
+unsigned char temp_bitmap[1024*1024];
+stbtt_bakedchar cdata[96]; // ASCII 32..126 is 95 glyphs
+
+void init(void)
+{
+   fread(ttf_buffer, 1, 1<<20, fopen("c:/windows/fonts/times.ttf", "rb"));
+   stbtt_BakeFontBitmap(ttf_buffer,0, 64.0, temp_bitmap,1024,1024, 32,96, cdata);
+   glGenTextures(1, &tex);
+   glBindTexture(GL_TEXTURE_2D, tex);
+   glTexImage2D(GL_TEXTURE_2D, 0, GL_ALPHA, 1024,1024,0, GL_ALPHA, GL_UNSIGNED_BYTE, temp_bitmap);
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+}
+
+void print(float x, float y, char *text)
+{
+   // assume orthographic projection with units = screen pixels, origin at top left
+   glBindTexture(GL_TEXTURE_2D, tex);
+   glBegin(GL_QUADS);
+   while (*text) {
+      if (*text >= 32 && *text < 128) {
+         stbtt_aligned_quad q;
+         stbtt_GetBakedQuad(cdata, 1024,1024, *text-32, &x,&y,&q,1);//1=opengl & d3d10+,0=d3d9
+         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y0);
+         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y0);
+         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y1);
+         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y1);
+      }
+      ++text;
+   }
+   glEnd();
+}
+
+void draw(void)
+{
+   glViewport(0,0,screen_x,screen_y);
+   glClearColor(0.45f,0.45f,0.75f,0);
+   glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+   glDisable(GL_CULL_FACE);
+   glDisable(GL_DEPTH_TEST);
+   glDisable(GL_BLEND);
+
+   glMatrixMode(GL_PROJECTION);
+   glLoadIdentity();
+   glOrtho(0,screen_x,screen_y,0,-1,1);
+   glMatrixMode(GL_MODELVIEW);
+   glLoadIdentity();
+
+   glEnable(GL_TEXTURE_2D);
+   glEnable(GL_BLEND);
+   glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
+   glColor3f(1,1,1);
+
+   print(100,150, "This is a simple test!");
+
+   // show font bitmap
+   glBegin(GL_QUADS);
+      glTexCoord2f(0,0); glVertex2f(256,200+0);
+      glTexCoord2f(1,0); glVertex2f(768,200+0);
+      glTexCoord2f(1,1); glVertex2f(768,200+512);
+      glTexCoord2f(0,1); glVertex2f(256,200+512);
+   glEnd();
+}
+
+///////////////////////////////////////////////////////////////////////
+///
+///
+///  Windows OpenGL setup
+///
+///
+
+HINSTANCE app;
+HWND  window;
+HGLRC rc;
+HDC   dc;
+
+#pragma comment(lib, "opengl32.lib")
+#pragma comment(lib, "glu32.lib")
+#pragma comment(lib, "winmm.lib")
+
+int mySetPixelFormat(HWND win)
+{
+   PIXELFORMATDESCRIPTOR pfd = { sizeof(pfd), 1, PFD_SUPPORT_OPENGL | PFD_DRAW_TO_WINDOW | PFD_DOUBLEBUFFER, PFD_TYPE_RGBA };
+   int                   pixel_format;
+   pfd.dwLayerMask  = PFD_MAIN_PLANE;
+   pfd.cColorBits   = 24;
+   pfd.cAlphaBits   = 8;
+   pfd.cDepthBits   = 24;
+   pfd.cStencilBits = 8;
+   pixel_format = ChoosePixelFormat(dc, &pfd);
+   if (!pixel_format) return FALSE;
+   if (!DescribePixelFormat(dc, pixel_format, sizeof(PIXELFORMATDESCRIPTOR), &pfd))
+      return FALSE;
+   SetPixelFormat(dc, pixel_format, &pfd);
+   return TRUE;
+}
+
+static int WINAPI WinProc(HWND wnd, UINT msg, WPARAM wparam, LPARAM lparam)
+{
+   switch (msg) {
+      case WM_CREATE: {
+         LPCREATESTRUCT lpcs = (LPCREATESTRUCT) lparam;
+         dc = GetDC(wnd);
+         if (mySetPixelFormat(wnd)) {
+            rc = wglCreateContext(dc);
+            if (rc) {
+               wglMakeCurrent(dc, rc);
+               return 0;
+            }
+         }
+         return -1;
+      }
+
+      case WM_DESTROY:
+         wglMakeCurrent(NULL, NULL); 
+         if (rc) wglDeleteContext(rc);
+         PostQuitMessage (0);
+         return 0;
+
+      default:
+         return DefWindowProc (wnd, msg, wparam, lparam);
+   }
+
+   return DefWindowProc (wnd, msg, wparam, lparam);
+}
+
+int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow)
+{
+   DWORD dwstyle = WS_OVERLAPPED | WS_CAPTION | WS_SYSMENU | WS_MINIMIZEBOX;
+   WNDCLASSEX  wndclass;
+   wndclass.cbSize        = sizeof(wndclass);
+   wndclass.style         = CS_OWNDC;
+   wndclass.lpfnWndProc   = (WNDPROC) WinProc;
+   wndclass.cbClsExtra    = 0;
+   wndclass.cbWndExtra    = 0;
+   wndclass.hInstance     = hInstance;
+   wndclass.hIcon         = LoadIcon(hInstance, _T("appicon"));
+   wndclass.hCursor       = LoadCursor(NULL,IDC_ARROW);
+   wndclass.hbrBackground = GetStockObject(NULL_BRUSH);
+   wndclass.lpszMenuName  = _T("truetype-test");
+   wndclass.lpszClassName = _T("truetype-test");
+   wndclass.hIconSm       = NULL;
+   app = hInstance;
+
+   if (!RegisterClassEx(&wndclass))
+      return 0;
+
+   window = CreateWindow(_T("truetype-test"), _T("truetype test"), dwstyle,
+                      CW_USEDEFAULT,0, screen_x, screen_y,
+                      NULL, NULL, app,  NULL);
+   ShowWindow(window, SW_SHOWNORMAL);
+   init();
+
+   for(;;) {
+      MSG msg;
+      if (GetMessage(&msg, NULL, 0, 0)) {
+         TranslateMessage(&msg);
+         DispatchMessage(&msg);
+      } else {
+         return 1;   // WM_QUIT
+      }
+      wglMakeCurrent(dc, rc);
+      draw();
+      SwapBuffers(dc);
+   }
+   return 0;
+}

From be901954b254b020b0b99d0e7ff95c2cbdb08df0 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 23:47:20 -0700
Subject: [PATCH 115/170] stb_truetype: fix sample code drawing characters
 upside-down

---
 stb_truetype.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 9414a63..f7edfb7 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -271,8 +271,8 @@
 ////  SAMPLE PROGRAMS
 ////
 //
-//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless
-//
+//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless.
+//  See "tests/truetype_demo_win32.c" for a complete version.
 #if 0
 #define STB_TRUETYPE_IMPLEMENTATION  // force following include to generate implementation
 #include "stb_truetype.h"
@@ -307,10 +307,10 @@ void my_stbtt_print(float x, float y, char *text)
       if (*text >= 32 && *text < 128) {
          stbtt_aligned_quad q;
          stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl & d3d10+,0=d3d9
-         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y0);
-         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y0);
-         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y1);
-         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y1);
+         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y0);
+         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y0);
+         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y1);
+         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y1);
       }
       ++text;
    }

From 7c65d5621f00c3304a3d4dcc8c036b37cf8fec9b Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 15:57:39 -0700
Subject: [PATCH 116/170] fix compiling with NO_HDR NO_LINEAR

---
 stb_image.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 0f8459a..08020ac 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1032,7 +1032,7 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add)
 }
 
 // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
 {
    return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
@@ -1055,7 +1055,7 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add)
    return stbi__malloc(a*b*c + add);
 }
 
-#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM)
 static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
 {
    if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;

From 1401f2257d1c8b6417b442bdf59e7c185be7f9d9 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 15:58:08 -0700
Subject: [PATCH 117/170] clean up project file

---
 tests/stb.dsp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tests/stb.dsp b/tests/stb.dsp
index 50fff38..ba13ba1 100644
--- a/tests/stb.dsp
+++ b/tests/stb.dsp
@@ -154,14 +154,6 @@ SOURCE=..\stb_perlin.h
 # End Source File
 # Begin Source File
 
-SOURCE=..\stb_pg.h
-# End Source File
-# Begin Source File
-
-SOURCE=.\stb_query.h
-# End Source File
-# Begin Source File
-
 SOURCE=..\stb_rect_pack.h
 # End Source File
 # Begin Source File

From c404b789ca01a00cd244240484f7eccdd639ef06 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 12 Jul 2021 23:47:20 -0700
Subject: [PATCH 118/170] stb_truetype: fix sample code drawing characters
 upside-down

---
 stb_truetype.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/stb_truetype.h b/stb_truetype.h
index 9414a63..f7edfb7 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -271,8 +271,8 @@
 ////  SAMPLE PROGRAMS
 ////
 //
-//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless
-//
+//  Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless.
+//  See "tests/truetype_demo_win32.c" for a complete version.
 #if 0
 #define STB_TRUETYPE_IMPLEMENTATION  // force following include to generate implementation
 #include "stb_truetype.h"
@@ -307,10 +307,10 @@ void my_stbtt_print(float x, float y, char *text)
       if (*text >= 32 && *text < 128) {
          stbtt_aligned_quad q;
          stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl & d3d10+,0=d3d9
-         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y0);
-         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y0);
-         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y1);
-         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y1);
+         glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y0);
+         glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y0);
+         glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y1);
+         glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y1);
       }
       ++text;
    }

From 5ba0baaa269b3fd681828e0e3b3ac0f1472eaf40 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 25 Jul 2021 20:24:10 -0700
Subject: [PATCH 119/170] stb_image: Reject fractional JPEG component
 subsampling ratios

The component resamplers are not written to support this and I've
never seen it happen in a real (non-crafted) JPEG file so I'm
fine rejecting this as outright corrupt.

Fixes issue #1178.
---
 stb_image.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/stb_image.h b/stb_image.h
index 08020ac..d60371b 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -3267,6 +3267,13 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
       if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
    }
 
+   // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios
+   // and I've never seen a non-corrupted JPEG file actually use them
+   for (i=0; i < s->img_n; ++i) {
+      if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG");
+      if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG");
+   }
+
    // compute interleaved mcu info
    z->img_h_max = h_max;
    z->img_v_max = v_max;

From 59e7dec3e8bb0a8d4050d03c2dc32cf71ffa87c6 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Fri, 13 Aug 2021 13:57:23 -0700
Subject: [PATCH 120/170] delete file covered by patents

---
 stb_perlin.h | 428 ---------------------------------------------------
 1 file changed, 428 deletions(-)
 delete mode 100644 stb_perlin.h

diff --git a/stb_perlin.h b/stb_perlin.h
deleted file mode 100644
index 47cb9a4..0000000
--- a/stb_perlin.h
+++ /dev/null
@@ -1,428 +0,0 @@
-// stb_perlin.h - v0.5 - perlin noise
-// public domain single-file C implementation by Sean Barrett
-//
-// LICENSE
-//
-//   See end of file.
-//
-//
-// to create the implementation,
-//     #define STB_PERLIN_IMPLEMENTATION
-// in *one* C/CPP file that includes this file.
-//
-//
-// Documentation:
-//
-// float  stb_perlin_noise3( float x,
-//                           float y,
-//                           float z,
-//                           int   x_wrap=0,
-//                           int   y_wrap=0,
-//                           int   z_wrap=0)
-//
-// This function computes a random value at the coordinate (x,y,z).
-// Adjacent random values are continuous but the noise fluctuates
-// its randomness with period 1, i.e. takes on wholly unrelated values
-// at integer points. Specifically, this implements Ken Perlin's
-// revised noise function from 2002.
-//
-// The "wrap" parameters can be used to create wraparound noise that
-// wraps at powers of two. The numbers MUST be powers of two. Specify
-// 0 to mean "don't care". (The noise always wraps every 256 due
-// details of the implementation, even if you ask for larger or no
-// wrapping.)
-//
-// float  stb_perlin_noise3_seed( float x,
-//                                float y,
-//                                float z,
-//                                int   x_wrap=0,
-//                                int   y_wrap=0,
-//                                int   z_wrap=0,
-//                                int   seed)
-//
-// As above, but 'seed' selects from multiple different variations of the
-// noise function. The current implementation only uses the bottom 8 bits
-// of 'seed', but possibly in the future more bits will be used.
-//
-//
-// Fractal Noise:
-//
-// Three common fractal noise functions are included, which produce
-// a wide variety of nice effects depending on the parameters
-// provided. Note that each function will call stb_perlin_noise3
-// 'octaves' times, so this parameter will affect runtime.
-//
-// float stb_perlin_ridge_noise3(float x, float y, float z,
-//                               float lacunarity, float gain, float offset, int octaves)
-//
-// float stb_perlin_fbm_noise3(float x, float y, float z,
-//                             float lacunarity, float gain, int octaves)
-//
-// float stb_perlin_turbulence_noise3(float x, float y, float z,
-//                                    float lacunarity, float gain, int octaves)
-//
-// Typical values to start playing with:
-//     octaves    =   6     -- number of "octaves" of noise3() to sum
-//     lacunarity = ~ 2.0   -- spacing between successive octaves (use exactly 2.0 for wrapping output)
-//     gain       =   0.5   -- relative weighting applied to each successive octave
-//     offset     =   1.0?  -- used to invert the ridges, may need to be larger, not sure
-//
-//
-// Contributors:
-//    Jack Mott - additional noise functions
-//    Jordan Peck - seeded noise
-//
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-extern float stb_perlin_noise3(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap);
-extern float stb_perlin_noise3_seed(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, int seed);
-extern float stb_perlin_ridge_noise3(float x, float y, float z, float lacunarity, float gain, float offset, int octaves);
-extern float stb_perlin_fbm_noise3(float x, float y, float z, float lacunarity, float gain, int octaves);
-extern float stb_perlin_turbulence_noise3(float x, float y, float z, float lacunarity, float gain, int octaves);
-extern float stb_perlin_noise3_wrap_nonpow2(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed);
-#ifdef __cplusplus
-}
-#endif
-
-#ifdef STB_PERLIN_IMPLEMENTATION
-
-#include <math.h> // fabs()
-
-// not same permutation table as Perlin's reference to avoid copyright issues;
-// Perlin's table can be found at http://mrl.nyu.edu/~perlin/noise/
-static unsigned char stb__perlin_randtab[512] =
-{
-   23, 125, 161, 52, 103, 117, 70, 37, 247, 101, 203, 169, 124, 126, 44, 123,
-   152, 238, 145, 45, 171, 114, 253, 10, 192, 136, 4, 157, 249, 30, 35, 72,
-   175, 63, 77, 90, 181, 16, 96, 111, 133, 104, 75, 162, 93, 56, 66, 240,
-   8, 50, 84, 229, 49, 210, 173, 239, 141, 1, 87, 18, 2, 198, 143, 57,
-   225, 160, 58, 217, 168, 206, 245, 204, 199, 6, 73, 60, 20, 230, 211, 233,
-   94, 200, 88, 9, 74, 155, 33, 15, 219, 130, 226, 202, 83, 236, 42, 172,
-   165, 218, 55, 222, 46, 107, 98, 154, 109, 67, 196, 178, 127, 158, 13, 243,
-   65, 79, 166, 248, 25, 224, 115, 80, 68, 51, 184, 128, 232, 208, 151, 122,
-   26, 212, 105, 43, 179, 213, 235, 148, 146, 89, 14, 195, 28, 78, 112, 76,
-   250, 47, 24, 251, 140, 108, 186, 190, 228, 170, 183, 139, 39, 188, 244, 246,
-   132, 48, 119, 144, 180, 138, 134, 193, 82, 182, 120, 121, 86, 220, 209, 3,
-   91, 241, 149, 85, 205, 150, 113, 216, 31, 100, 41, 164, 177, 214, 153, 231,
-   38, 71, 185, 174, 97, 201, 29, 95, 7, 92, 54, 254, 191, 118, 34, 221,
-   131, 11, 163, 99, 234, 81, 227, 147, 156, 176, 17, 142, 69, 12, 110, 62,
-   27, 255, 0, 194, 59, 116, 242, 252, 19, 21, 187, 53, 207, 129, 64, 135,
-   61, 40, 167, 237, 102, 223, 106, 159, 197, 189, 215, 137, 36, 32, 22, 5,
-
-   // and a second copy so we don't need an extra mask or static initializer
-   23, 125, 161, 52, 103, 117, 70, 37, 247, 101, 203, 169, 124, 126, 44, 123,
-   152, 238, 145, 45, 171, 114, 253, 10, 192, 136, 4, 157, 249, 30, 35, 72,
-   175, 63, 77, 90, 181, 16, 96, 111, 133, 104, 75, 162, 93, 56, 66, 240,
-   8, 50, 84, 229, 49, 210, 173, 239, 141, 1, 87, 18, 2, 198, 143, 57,
-   225, 160, 58, 217, 168, 206, 245, 204, 199, 6, 73, 60, 20, 230, 211, 233,
-   94, 200, 88, 9, 74, 155, 33, 15, 219, 130, 226, 202, 83, 236, 42, 172,
-   165, 218, 55, 222, 46, 107, 98, 154, 109, 67, 196, 178, 127, 158, 13, 243,
-   65, 79, 166, 248, 25, 224, 115, 80, 68, 51, 184, 128, 232, 208, 151, 122,
-   26, 212, 105, 43, 179, 213, 235, 148, 146, 89, 14, 195, 28, 78, 112, 76,
-   250, 47, 24, 251, 140, 108, 186, 190, 228, 170, 183, 139, 39, 188, 244, 246,
-   132, 48, 119, 144, 180, 138, 134, 193, 82, 182, 120, 121, 86, 220, 209, 3,
-   91, 241, 149, 85, 205, 150, 113, 216, 31, 100, 41, 164, 177, 214, 153, 231,
-   38, 71, 185, 174, 97, 201, 29, 95, 7, 92, 54, 254, 191, 118, 34, 221,
-   131, 11, 163, 99, 234, 81, 227, 147, 156, 176, 17, 142, 69, 12, 110, 62,
-   27, 255, 0, 194, 59, 116, 242, 252, 19, 21, 187, 53, 207, 129, 64, 135,
-   61, 40, 167, 237, 102, 223, 106, 159, 197, 189, 215, 137, 36, 32, 22, 5,
-};
-
-
-// perlin's gradient has 12 cases so some get used 1/16th of the time
-// and some 2/16ths. We reduce bias by changing those fractions
-// to 5/64ths and 6/64ths
-
-// this array is designed to match the previous implementation
-// of gradient hash: indices[stb__perlin_randtab[i]&63]
-static unsigned char stb__perlin_randtab_grad_idx[512] =
-{
-    7, 9, 5, 0, 11, 1, 6, 9, 3, 9, 11, 1, 8, 10, 4, 7,
-    8, 6, 1, 5, 3, 10, 9, 10, 0, 8, 4, 1, 5, 2, 7, 8,
-    7, 11, 9, 10, 1, 0, 4, 7, 5, 0, 11, 6, 1, 4, 2, 8,
-    8, 10, 4, 9, 9, 2, 5, 7, 9, 1, 7, 2, 2, 6, 11, 5,
-    5, 4, 6, 9, 0, 1, 1, 0, 7, 6, 9, 8, 4, 10, 3, 1,
-    2, 8, 8, 9, 10, 11, 5, 11, 11, 2, 6, 10, 3, 4, 2, 4,
-    9, 10, 3, 2, 6, 3, 6, 10, 5, 3, 4, 10, 11, 2, 9, 11,
-    1, 11, 10, 4, 9, 4, 11, 0, 4, 11, 4, 0, 0, 0, 7, 6,
-    10, 4, 1, 3, 11, 5, 3, 4, 2, 9, 1, 3, 0, 1, 8, 0,
-    6, 7, 8, 7, 0, 4, 6, 10, 8, 2, 3, 11, 11, 8, 0, 2,
-    4, 8, 3, 0, 0, 10, 6, 1, 2, 2, 4, 5, 6, 0, 1, 3,
-    11, 9, 5, 5, 9, 6, 9, 8, 3, 8, 1, 8, 9, 6, 9, 11,
-    10, 7, 5, 6, 5, 9, 1, 3, 7, 0, 2, 10, 11, 2, 6, 1,
-    3, 11, 7, 7, 2, 1, 7, 3, 0, 8, 1, 1, 5, 0, 6, 10,
-    11, 11, 0, 2, 7, 0, 10, 8, 3, 5, 7, 1, 11, 1, 0, 7,
-    9, 0, 11, 5, 10, 3, 2, 3, 5, 9, 7, 9, 8, 4, 6, 5,
-
-    // and a second copy so we don't need an extra mask or static initializer
-    7, 9, 5, 0, 11, 1, 6, 9, 3, 9, 11, 1, 8, 10, 4, 7,
-    8, 6, 1, 5, 3, 10, 9, 10, 0, 8, 4, 1, 5, 2, 7, 8,
-    7, 11, 9, 10, 1, 0, 4, 7, 5, 0, 11, 6, 1, 4, 2, 8,
-    8, 10, 4, 9, 9, 2, 5, 7, 9, 1, 7, 2, 2, 6, 11, 5,
-    5, 4, 6, 9, 0, 1, 1, 0, 7, 6, 9, 8, 4, 10, 3, 1,
-    2, 8, 8, 9, 10, 11, 5, 11, 11, 2, 6, 10, 3, 4, 2, 4,
-    9, 10, 3, 2, 6, 3, 6, 10, 5, 3, 4, 10, 11, 2, 9, 11,
-    1, 11, 10, 4, 9, 4, 11, 0, 4, 11, 4, 0, 0, 0, 7, 6,
-    10, 4, 1, 3, 11, 5, 3, 4, 2, 9, 1, 3, 0, 1, 8, 0,
-    6, 7, 8, 7, 0, 4, 6, 10, 8, 2, 3, 11, 11, 8, 0, 2,
-    4, 8, 3, 0, 0, 10, 6, 1, 2, 2, 4, 5, 6, 0, 1, 3,
-    11, 9, 5, 5, 9, 6, 9, 8, 3, 8, 1, 8, 9, 6, 9, 11,
-    10, 7, 5, 6, 5, 9, 1, 3, 7, 0, 2, 10, 11, 2, 6, 1,
-    3, 11, 7, 7, 2, 1, 7, 3, 0, 8, 1, 1, 5, 0, 6, 10,
-    11, 11, 0, 2, 7, 0, 10, 8, 3, 5, 7, 1, 11, 1, 0, 7,
-    9, 0, 11, 5, 10, 3, 2, 3, 5, 9, 7, 9, 8, 4, 6, 5,
-};
-
-static float stb__perlin_lerp(float a, float b, float t)
-{
-   return a + (b-a) * t;
-}
-
-static int stb__perlin_fastfloor(float a)
-{
-    int ai = (int) a;
-    return (a < ai) ? ai-1 : ai;
-}
-
-// different grad function from Perlin's, but easy to modify to match reference
-static float stb__perlin_grad(int grad_idx, float x, float y, float z)
-{
-   static float basis[12][4] =
-   {
-      {  1, 1, 0 },
-      { -1, 1, 0 },
-      {  1,-1, 0 },
-      { -1,-1, 0 },
-      {  1, 0, 1 },
-      { -1, 0, 1 },
-      {  1, 0,-1 },
-      { -1, 0,-1 },
-      {  0, 1, 1 },
-      {  0,-1, 1 },
-      {  0, 1,-1 },
-      {  0,-1,-1 },
-   };
-
-   float *grad = basis[grad_idx];
-   return grad[0]*x + grad[1]*y + grad[2]*z;
-}
-
-float stb_perlin_noise3_internal(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed)
-{
-   float u,v,w;
-   float n000,n001,n010,n011,n100,n101,n110,n111;
-   float n00,n01,n10,n11;
-   float n0,n1;
-
-   unsigned int x_mask = (x_wrap-1) & 255;
-   unsigned int y_mask = (y_wrap-1) & 255;
-   unsigned int z_mask = (z_wrap-1) & 255;
-   int px = stb__perlin_fastfloor(x);
-   int py = stb__perlin_fastfloor(y);
-   int pz = stb__perlin_fastfloor(z);
-   int x0 = px & x_mask, x1 = (px+1) & x_mask;
-   int y0 = py & y_mask, y1 = (py+1) & y_mask;
-   int z0 = pz & z_mask, z1 = (pz+1) & z_mask;
-   int r0,r1, r00,r01,r10,r11;
-
-   #define stb__perlin_ease(a)   (((a*6-15)*a + 10) * a * a * a)
-
-   x -= px; u = stb__perlin_ease(x);
-   y -= py; v = stb__perlin_ease(y);
-   z -= pz; w = stb__perlin_ease(z);
-
-   r0 = stb__perlin_randtab[x0+seed];
-   r1 = stb__perlin_randtab[x1+seed];
-
-   r00 = stb__perlin_randtab[r0+y0];
-   r01 = stb__perlin_randtab[r0+y1];
-   r10 = stb__perlin_randtab[r1+y0];
-   r11 = stb__perlin_randtab[r1+y1];
-
-   n000 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z0], x  , y  , z   );
-   n001 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z1], x  , y  , z-1 );
-   n010 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z0], x  , y-1, z   );
-   n011 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z1], x  , y-1, z-1 );
-   n100 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z0], x-1, y  , z   );
-   n101 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z1], x-1, y  , z-1 );
-   n110 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z0], x-1, y-1, z   );
-   n111 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z1], x-1, y-1, z-1 );
-
-   n00 = stb__perlin_lerp(n000,n001,w);
-   n01 = stb__perlin_lerp(n010,n011,w);
-   n10 = stb__perlin_lerp(n100,n101,w);
-   n11 = stb__perlin_lerp(n110,n111,w);
-
-   n0 = stb__perlin_lerp(n00,n01,v);
-   n1 = stb__perlin_lerp(n10,n11,v);
-
-   return stb__perlin_lerp(n0,n1,u);
-}
-
-float stb_perlin_noise3(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap)
-{
-    return stb_perlin_noise3_internal(x,y,z,x_wrap,y_wrap,z_wrap,0);
-}
-
-float stb_perlin_noise3_seed(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, int seed)
-{
-    return stb_perlin_noise3_internal(x,y,z,x_wrap,y_wrap,z_wrap, (unsigned char) seed);
-}
-
-float stb_perlin_ridge_noise3(float x, float y, float z, float lacunarity, float gain, float offset, int octaves)
-{
-   int i;
-   float frequency = 1.0f;
-   float prev = 1.0f;
-   float amplitude = 0.5f;
-   float sum = 0.0f;
-
-   for (i = 0; i < octaves; i++) {
-      float r = stb_perlin_noise3_internal(x*frequency,y*frequency,z*frequency,0,0,0,(unsigned char)i);
-      r = offset - (float) fabs(r);
-      r = r*r;
-      sum += r*amplitude*prev;
-      prev = r;
-      frequency *= lacunarity;
-      amplitude *= gain;
-   }
-   return sum;
-}
-
-float stb_perlin_fbm_noise3(float x, float y, float z, float lacunarity, float gain, int octaves)
-{
-   int i;
-   float frequency = 1.0f;
-   float amplitude = 1.0f;
-   float sum = 0.0f;
-
-   for (i = 0; i < octaves; i++) {
-      sum += stb_perlin_noise3_internal(x*frequency,y*frequency,z*frequency,0,0,0,(unsigned char)i)*amplitude;
-      frequency *= lacunarity;
-      amplitude *= gain;
-   }
-   return sum;
-}
-
-float stb_perlin_turbulence_noise3(float x, float y, float z, float lacunarity, float gain, int octaves)
-{
-   int i;
-   float frequency = 1.0f;
-   float amplitude = 1.0f;
-   float sum = 0.0f;
-
-   for (i = 0; i < octaves; i++) {
-      float r = stb_perlin_noise3_internal(x*frequency,y*frequency,z*frequency,0,0,0,(unsigned char)i)*amplitude;
-      sum += (float) fabs(r);
-      frequency *= lacunarity;
-      amplitude *= gain;
-   }
-   return sum;
-}
-
-float stb_perlin_noise3_wrap_nonpow2(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed)
-{
-   float u,v,w;
-   float n000,n001,n010,n011,n100,n101,n110,n111;
-   float n00,n01,n10,n11;
-   float n0,n1;
-
-   int px = stb__perlin_fastfloor(x);
-   int py = stb__perlin_fastfloor(y);
-   int pz = stb__perlin_fastfloor(z);
-   int x_wrap2 = (x_wrap ? x_wrap : 256);
-   int y_wrap2 = (y_wrap ? y_wrap : 256);
-   int z_wrap2 = (z_wrap ? z_wrap : 256);
-   int x0 = px % x_wrap2, x1;
-   int y0 = py % y_wrap2, y1;
-   int z0 = pz % z_wrap2, z1;
-   int r0,r1, r00,r01,r10,r11;
-
-   if (x0 < 0) x0 += x_wrap2;
-   if (y0 < 0) y0 += y_wrap2;
-   if (z0 < 0) z0 += z_wrap2;
-   x1 = (x0+1) % x_wrap2;
-   y1 = (y0+1) % y_wrap2;
-   z1 = (z0+1) % z_wrap2;
-
-   #define stb__perlin_ease(a)   (((a*6-15)*a + 10) * a * a * a)
-
-   x -= px; u = stb__perlin_ease(x);
-   y -= py; v = stb__perlin_ease(y);
-   z -= pz; w = stb__perlin_ease(z);
-
-   r0 = stb__perlin_randtab[x0];
-   r0 = stb__perlin_randtab[r0+seed];
-   r1 = stb__perlin_randtab[x1];
-   r1 = stb__perlin_randtab[r1+seed];
-
-   r00 = stb__perlin_randtab[r0+y0];
-   r01 = stb__perlin_randtab[r0+y1];
-   r10 = stb__perlin_randtab[r1+y0];
-   r11 = stb__perlin_randtab[r1+y1];
-
-   n000 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z0], x  , y  , z   );
-   n001 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z1], x  , y  , z-1 );
-   n010 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z0], x  , y-1, z   );
-   n011 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z1], x  , y-1, z-1 );
-   n100 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z0], x-1, y  , z   );
-   n101 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z1], x-1, y  , z-1 );
-   n110 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z0], x-1, y-1, z   );
-   n111 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z1], x-1, y-1, z-1 );
-
-   n00 = stb__perlin_lerp(n000,n001,w);
-   n01 = stb__perlin_lerp(n010,n011,w);
-   n10 = stb__perlin_lerp(n100,n101,w);
-   n11 = stb__perlin_lerp(n110,n111,w);
-
-   n0 = stb__perlin_lerp(n00,n01,v);
-   n1 = stb__perlin_lerp(n10,n11,v);
-
-   return stb__perlin_lerp(n0,n1,u);
-}
-#endif  // STB_PERLIN_IMPLEMENTATION
-
-/*
-------------------------------------------------------------------------------
-This software is available under 2 licenses -- choose whichever you prefer.
-------------------------------------------------------------------------------
-ALTERNATIVE A - MIT License
-Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-------------------------------------------------------------------------------
-ALTERNATIVE B - Public Domain (www.unlicense.org)
-This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
-software, either in source code form or as a compiled binary, for any purpose,
-commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this
-software dedicate any and all copyright interest in the software to the public
-domain. We make this dedication for the benefit of the public at large and to
-the detriment of our heirs and successors. We intend this dedication to be an
-overt act of relinquishment in perpetuity of all present and future rights to
-this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-------------------------------------------------------------------------------
-*/

From ef861421e2bc467e01eafc3d13864d931cf5014d Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sat, 28 Aug 2021 03:20:04 -0700
Subject: [PATCH 121/170] stb.h: stb_splitpath correctly handles relative paths
 with explicit drive specifiers

---
 deprecated/stb.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/deprecated/stb.h b/deprecated/stb.h
index 80fa202..1633c3b 100644
--- a/deprecated/stb.h
+++ b/deprecated/stb.h
@@ -2423,6 +2423,12 @@ static char *stb__splitpath_raw(char *buffer, char *path, int flag)
    char *s = stb_strrchr2(path, '/', '\\');
    char *t = strrchr(path, '.');
    if (s && t && t < s) t = NULL;
+
+   if (!s) {
+      // check for drive
+      if (isalpha(path[0]) && path[1] == ':')
+         s = &path[1];
+   }
    if (s) ++s;
 
    if (flag == STB_EXT_NO_PERIOD)

From c0c982601f40183e74d84a61237e968dca08380e Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sat, 28 Aug 2021 04:52:41 -0700
Subject: [PATCH 122/170] stb_truetype 1.26: fix rendering glitches

---
 README.md           | 10 ++---
 stb_truetype.h      | 90 +++++++++++++++++++++++++++++++++++++--------
 tools/README.list   |  2 -
 tools/make_readme.c |  1 +
 4 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 899e38c..9fa5d36 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,8 @@ library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
 **[stb_hexwave.h](stb_hexwave.h)** | 0.5 | audio | 680 | audio waveform synthesizer
-**[stb_image.h](stb_image.h)** | 2.27 | graphics | 7890 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
-**[stb_truetype.h](stb_truetype.h)** | 1.25 | graphics | 5017 | parse, decode, and rasterize characters from truetype fonts
+**[stb_image.h](stb_image.h)** | 2.27 | graphics | 7897 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
 **[stb_image_resize.h](stb_image_resize.h)** | 0.97 | graphics | 2634 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
@@ -32,19 +32,17 @@ library    | lastest version | category | LoC | description
 **[stb_textedit.h](stb_textedit.h)** | 1.14 | user&nbsp;interface | 1429 | guts of a text editor for games etc implementing them from scratch
 **[stb_voxel_render.h](stb_voxel_render.h)** | 0.89 | 3D&nbsp;graphics | 3807 | Minecraft-esque voxel rendering "engine" with many more features
 **[stb_dxt.h](stb_dxt.h)** | 1.12 | 3D&nbsp;graphics | 719 | Fabian "ryg" Giesen's real-time DXT compressor
-**[stb_perlin.h](stb_perlin.h)** | 0.5 | 3D&nbsp;graphics | 428 | revised Perlin noise (3D input, 1D output)
 **[stb_easy_font.h](stb_easy_font.h)** | 1.1 | 3D&nbsp;graphics | 305 | quick-and-dirty easy-to-deploy bitmap font for printing frame rate, etc
 **[stb_tilemap_editor.h](stb_tilemap_editor.h)** | 0.42 | game&nbsp;dev | 4187 | embeddable tilemap editor
 **[stb_herringbone_wa...](stb_herringbone_wang_tile.h)** | 0.7 | game&nbsp;dev | 1221 | herringbone Wang tile map generator
 **[stb_c_lexer.h](stb_c_lexer.h)** | 0.12 | parsing | 940 | simplify writing parsers for C-like languages
 **[stb_divide.h](stb_divide.h)** | 0.94 | math | 433 | more useful 32-bit modulus e.g. "euclidean divide"
 **[stb_connected_comp...](stb_connected_components.h)** | 0.96 | misc | 1049 | incrementally compute reachability on grids
-**[stb.h](stb.h)** | 2.37 | misc | 13105 | _deprecated_ helper functions for C, mostly redundant in C++; basically author's personal stuff
 **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
-Total libraries: 22
-Total lines of C code: 56065
+Total libraries: 20
+Total lines of C code: 42599
 
 
 FAQ
diff --git a/stb_truetype.h b/stb_truetype.h
index f7edfb7..bbf2284 100644
--- a/stb_truetype.h
+++ b/stb_truetype.h
@@ -1,4 +1,4 @@
-// stb_truetype.h - v1.25 - public domain
+// stb_truetype.h - v1.26 - public domain
 // authored from 2009-2021 by Sean Barrett / RAD Game Tools
 //
 // =======================================================================
@@ -58,6 +58,7 @@
 //
 // VERSION HISTORY
 //
+//   1.26 (2021-08-28) fix broken rasterizer
 //   1.25 (2021-07-11) many fixes
 //   1.24 (2020-02-05) fix warning
 //   1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS)
@@ -3061,6 +3062,23 @@ static void stbtt__handle_clipped_edge(float *scanline, int x, stbtt__active_edg
    }
 }
 
+static float stbtt__sized_trapezoid_area(float height, float top_width, float bottom_width)
+{
+   STBTT_assert(top_width >= 0);
+   STBTT_assert(bottom_width >= 0);
+   return (top_width + bottom_width) / 2.0f * height;
+}
+
+static float stbtt__position_trapezoid_area(float height, float tx0, float tx1, float bx0, float bx1)
+{
+   return stbtt__sized_trapezoid_area(height, tx1 - tx0, bx1 - bx0);
+}
+
+static float stbtt__sized_triangle_area(float height, float width)
+{
+   return height * width / 2;
+}
+
 static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, int len, stbtt__active_edge *e, float y_top)
 {
    float y_bottom = y_top+1;
@@ -3115,10 +3133,10 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
                float height;
                // simple case, only spans one pixel
                int x = (int) x_top;
-               height = sy1 - sy0;
+               height = (sy1 - sy0) * e->direction;
                STBTT_assert(x >= 0 && x < len);
-               scanline[x] += e->direction * (1-((x_top - x) + (x_bottom-x))/2)  * height;
-               scanline_fill[x] += e->direction * height; // everything right of this pixel is filled
+               scanline[x]      += stbtt__position_trapezoid_area(height, x_top, x+1.0f, x_bottom, x+1.0f);
+               scanline_fill[x] += height; // everything right of this pixel is filled
             } else {
                int x,x1,x2;
                float y_crossing, y_final, step, sign, area;
@@ -3134,40 +3152,79 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
                   dy = -dy;
                   t = x0, x0 = xb, xb = t;
                }
-               assert(dy >= 0);
-               assert(dx >= 0);
+               STBTT_assert(dy >= 0);
+               STBTT_assert(dx >= 0);
 
                x1 = (int) x_top;
                x2 = (int) x_bottom;
                // compute intersection with y axis at x1+1
-               y_crossing = (x1+1 - x0) * dy + y_top;
+               y_crossing = y_top + dy * (x1+1 - x0);
+
+               // compute intersection with y axis at x2
+               y_final = y_top + dy * (x2 - x0);
+
+               //           x1    x_top                            x2    x_bottom
+               //     y_top  +------|-----+------------+------------+--------|---+------------+
+               //            |            |            |            |            |            |
+               //            |            |            |            |            |            |
+               //       sy0  |      Txxxxx|............|............|............|............|
+               // y_crossing |            *xxxxx.......|............|............|............|
+               //            |            |     xxxxx..|............|............|............|
+               //            |            |     /-   xx*xxxx........|............|............|
+               //            |            | dy <       |    xxxxxx..|............|............|
+               //   y_final  |            |     \-     |          xx*xxx.........|............|
+               //       sy1  |            |            |            |   xxxxxB...|............|
+               //            |            |            |            |            |            |
+               //            |            |            |            |            |            |
+               //  y_bottom  +------------+------------+------------+------------+------------+
+               //
+               // goal is to measure the area covered by '.' in each pixel
+
                // if x2 is right at the right edge of x1, y_crossing can blow up, github #1057
+               // @TODO: maybe test against sy1 rather than y_bottom?
                if (y_crossing > y_bottom)
                   y_crossing = y_bottom;
 
                sign = e->direction;
-               // area of the rectangle covered from y0..y_crossing
+
+               // area of the rectangle covered from sy0..y_crossing
                area = sign * (y_crossing-sy0);
-               // area of the triangle (x_top,y0), (x+1,y0), (x+1,y_crossing)
-               scanline[x1] += area * (x1+1 - x_top)/2;
+
+               // area of the triangle (x_top,sy0), (x1+1,sy0), (x1+1,y_crossing)
+               scanline[x1] += stbtt__sized_triangle_area(area, x1+1 - x_top);
 
                // check if final y_crossing is blown up; no test case for this
-               y_final = y_crossing + dy * (x2 - (x1+1)); // advance y by number of steps taken below
                if (y_final > y_bottom) {
                   y_final = y_bottom;
                   dy = (y_final - y_crossing ) / (x2 - (x1+1)); // if denom=0, y_final = y_crossing, so y_final <= y_bottom
                }
 
-               step = sign * dy * 1; // dy is dy/dx, change in y for every 1 change in x, which is also how much pixel area changes for each step in x
+               // in second pixel, area covered by line segment found in first pixel
+               // is always a rectangle 1 wide * the height of that line segment; this
+               // is exactly what the variable 'area' stores. it also gets a contribution
+               // from the line segment within it. the THIRD pixel will get the first
+               // pixel's rectangle contribution, the second pixel's rectangle contribution,
+               // and its own contribution. the 'own contribution' is the same in every pixel except
+               // the leftmost and rightmost, a trapezoid that slides down in each pixel.
+               // the second pixel's contribution to the third pixel will be the
+               // rectangle 1 wide times the height change in the second pixel, which is dy.
+
+               step = sign * dy * 1; // dy is dy/dx, change in y for every 1 change in x,
+               // which multiplied by 1-pixel-width is how much pixel area changes for each step in x
+               // so the area advances by 'step' every time
+
                for (x = x1+1; x < x2; ++x) {
-                  scanline[x] += area + step/2; // area of parallelogram is step/2
+                  scanline[x] += area + step/2; // area of trapezoid is 1*step/2
                   area += step;
                }
                STBTT_assert(STBTT_fabs(area) <= 1.01f); // accumulated error from area += step unless we round step down
+               STBTT_assert(sy1 > y_final-0.01f);
 
-               // area of the triangle (x2,y_crossing), (x_bottom,y1), (x2,y1)
-               scanline[x2] += area + sign * (x_bottom - x2)/2 * (sy1-y_crossing);
+               // area covered in the last pixel is the rectangle from all the pixels to the left,
+               // plus the trapezoid filled by the line segment in this pixel all the way to the right edge
+               scanline[x2] += area + sign * stbtt__position_trapezoid_area(sy1-y_final, (float) x2, x2+1.0f, x_bottom, x2+1.0f);
 
+               // the rest of the line is filled based on the total height of the line segment in this pixel
                scanline_fill[x2] += sign * (sy1-sy0);
             }
          } else {
@@ -3175,6 +3232,9 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill,
             // clipping logic. since this does not match the intended use
             // of this library, we use a different, very slow brute
             // force implementation
+            // note though that this does happen some of the time because
+            // x_top and x_bottom can be extrapolated at the top & bottom of
+            // the shape and actually lie outside the bounding box
             int x;
             for (x=0; x < len; ++x) {
                // cases:
diff --git a/tools/README.list b/tools/README.list
index c2ae379..6d9764b 100644
--- a/tools/README.list
+++ b/tools/README.list
@@ -10,13 +10,11 @@ stb_sprintf.h               | utility          | fast sprintf, snprintf for C/C+
 stb_textedit.h              | user interface   | guts of a text editor for games etc implementing them from scratch
 stb_voxel_render.h          | 3D graphics      | Minecraft-esque voxel rendering "engine" with many more features
 stb_dxt.h                   | 3D graphics      | Fabian "ryg" Giesen's real-time DXT compressor
-stb_perlin.h                | 3D graphics      | revised Perlin noise (3D input, 1D output)
 stb_easy_font.h             | 3D graphics      | quick-and-dirty easy-to-deploy bitmap font for printing frame rate, etc
 stb_tilemap_editor.h        | game dev         | embeddable tilemap editor
 stb_herringbone_wang_tile.h | game dev         | herringbone Wang tile map generator
 stb_c_lexer.h               | parsing          | simplify writing parsers for C-like languages
 stb_divide.h                | math             | more useful 32-bit modulus e.g. "euclidean divide"
 stb_connected_components.h  | misc             | incrementally compute reachability on grids
-stb.h                       | misc             | _deprecated_ helper functions for C, mostly redundant in C++; basically author's personal stuff
 stb_leakcheck.h             | misc             | quick-and-dirty malloc/free leak-checking
 stb_include.h               | misc             | implement recursive #include support, particularly for GLSL
diff --git a/tools/make_readme.c b/tools/make_readme.c
index 3fe2fc7..bca9d2b 100644
--- a/tools/make_readme.c
+++ b/tools/make_readme.c
@@ -20,6 +20,7 @@ int main(int argc, char  **argv)
       int num_lines;
       char **lines = stb_stringfile(stb_sprintf("../%s", tokens[0]), &num_lines);
       char *s1, *s2,*s3;
+      if (lines == NULL) stb_fatal("Couldn't open '%s'", tokens[0]);
       s1 = strchr(lines[0], '-');
       if (!s1) stb_fatal("Couldn't find '-' before version number in %s", tokens[0]); // stb_fatal -- print error message & exit
       s2 = strchr(s1+2, '-');

From b1826c9894c048e18cce45a289ee1db3fde7f40b Mon Sep 17 00:00:00 2001
From: Sean Barrett <sean2@nothings.org>
Date: Fri, 10 Sep 2021 00:41:32 -0700
Subject: [PATCH 123/170] Update issue templates

---
 .github/ISSUE_TEMPLATE/bug_report.md      | 24 +++++++++++++++++++++++
 .github/ISSUE_TEMPLATE/feature_request.md | 20 +++++++++++++++++++
 2 files changed, 44 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..8c84cd0
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,24 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..71c8763
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: 4 enhancement
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.

From b9e1d86edc4e526e494855bcb46d02f3269d58aa Mon Sep 17 00:00:00 2001
From: Sean Barrett <sean2@nothings.org>
Date: Fri, 10 Sep 2021 00:43:51 -0700
Subject: [PATCH 124/170] Create config.yml

---
 .github/ISSUE_TEMPLATE/config.yml | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/config.yml

diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000..85cbf6e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,4 @@
+contact_links:
+  - name: support forum
+    url: https://github.com/nothings/stb/discussions/categories/q-a
+    about: having trouble using an stb library? don't create an issue, post in the forum

From 37e21f17b2269fdc49cb0de83064d3af5f2962f6 Mon Sep 17 00:00:00 2001
From: Sean Barrett <sean2@nothings.org>
Date: Fri, 10 Sep 2021 00:47:35 -0700
Subject: [PATCH 125/170] Update bug_report.md

---
 .github/ISSUE_TEMPLATE/bug_report.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 8c84cd0..2b15438 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,6 +1,6 @@
 ---
 name: Bug report
-about: Create a report to help us improve
+about: if you're having trouble using a library, look at the bottom of this list
 title: ''
 labels: ''
 assignees: ''

From af1a5bc352164740c1cc1354942b1c6b72eacb8a Mon Sep 17 00:00:00 2001
From: Sean Barrett <sean2@nothings.org>
Date: Fri, 10 Sep 2021 00:48:29 -0700
Subject: [PATCH 126/170] Update bug_report.md

---
 .github/ISSUE_TEMPLATE/bug_report.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 2b15438..a5cb26f 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,6 +1,6 @@
 ---
 name: Bug report
-about: if you're having trouble using a library, look at the bottom of this list
+about: if you're having trouble using a library, try the support forum instead
 title: ''
 labels: ''
 assignees: ''

From 4af130e86341928e3003ba5657f3e9faec50c1dc Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Thu, 8 Sep 2022 09:35:48 -0700
Subject: [PATCH 127/170] stb_perlin.h: restore file now that patent is expired

---
 stb_perlin.h | 428 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 428 insertions(+)
 create mode 100644 stb_perlin.h

diff --git a/stb_perlin.h b/stb_perlin.h
new file mode 100644
index 0000000..47cb9a4
--- /dev/null
+++ b/stb_perlin.h
@@ -0,0 +1,428 @@
+// stb_perlin.h - v0.5 - perlin noise
+// public domain single-file C implementation by Sean Barrett
+//
+// LICENSE
+//
+//   See end of file.
+//
+//
+// to create the implementation,
+//     #define STB_PERLIN_IMPLEMENTATION
+// in *one* C/CPP file that includes this file.
+//
+//
+// Documentation:
+//
+// float  stb_perlin_noise3( float x,
+//                           float y,
+//                           float z,
+//                           int   x_wrap=0,
+//                           int   y_wrap=0,
+//                           int   z_wrap=0)
+//
+// This function computes a random value at the coordinate (x,y,z).
+// Adjacent random values are continuous but the noise fluctuates
+// its randomness with period 1, i.e. takes on wholly unrelated values
+// at integer points. Specifically, this implements Ken Perlin's
+// revised noise function from 2002.
+//
+// The "wrap" parameters can be used to create wraparound noise that
+// wraps at powers of two. The numbers MUST be powers of two. Specify
+// 0 to mean "don't care". (The noise always wraps every 256 due
+// details of the implementation, even if you ask for larger or no
+// wrapping.)
+//
+// float  stb_perlin_noise3_seed( float x,
+//                                float y,
+//                                float z,
+//                                int   x_wrap=0,
+//                                int   y_wrap=0,
+//                                int   z_wrap=0,
+//                                int   seed)
+//
+// As above, but 'seed' selects from multiple different variations of the
+// noise function. The current implementation only uses the bottom 8 bits
+// of 'seed', but possibly in the future more bits will be used.
+//
+//
+// Fractal Noise:
+//
+// Three common fractal noise functions are included, which produce
+// a wide variety of nice effects depending on the parameters
+// provided. Note that each function will call stb_perlin_noise3
+// 'octaves' times, so this parameter will affect runtime.
+//
+// float stb_perlin_ridge_noise3(float x, float y, float z,
+//                               float lacunarity, float gain, float offset, int octaves)
+//
+// float stb_perlin_fbm_noise3(float x, float y, float z,
+//                             float lacunarity, float gain, int octaves)
+//
+// float stb_perlin_turbulence_noise3(float x, float y, float z,
+//                                    float lacunarity, float gain, int octaves)
+//
+// Typical values to start playing with:
+//     octaves    =   6     -- number of "octaves" of noise3() to sum
+//     lacunarity = ~ 2.0   -- spacing between successive octaves (use exactly 2.0 for wrapping output)
+//     gain       =   0.5   -- relative weighting applied to each successive octave
+//     offset     =   1.0?  -- used to invert the ridges, may need to be larger, not sure
+//
+//
+// Contributors:
+//    Jack Mott - additional noise functions
+//    Jordan Peck - seeded noise
+//
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern float stb_perlin_noise3(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap);
+extern float stb_perlin_noise3_seed(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, int seed);
+extern float stb_perlin_ridge_noise3(float x, float y, float z, float lacunarity, float gain, float offset, int octaves);
+extern float stb_perlin_fbm_noise3(float x, float y, float z, float lacunarity, float gain, int octaves);
+extern float stb_perlin_turbulence_noise3(float x, float y, float z, float lacunarity, float gain, int octaves);
+extern float stb_perlin_noise3_wrap_nonpow2(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed);
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef STB_PERLIN_IMPLEMENTATION
+
+#include <math.h> // fabs()
+
+// not same permutation table as Perlin's reference to avoid copyright issues;
+// Perlin's table can be found at http://mrl.nyu.edu/~perlin/noise/
+static unsigned char stb__perlin_randtab[512] =
+{
+   23, 125, 161, 52, 103, 117, 70, 37, 247, 101, 203, 169, 124, 126, 44, 123,
+   152, 238, 145, 45, 171, 114, 253, 10, 192, 136, 4, 157, 249, 30, 35, 72,
+   175, 63, 77, 90, 181, 16, 96, 111, 133, 104, 75, 162, 93, 56, 66, 240,
+   8, 50, 84, 229, 49, 210, 173, 239, 141, 1, 87, 18, 2, 198, 143, 57,
+   225, 160, 58, 217, 168, 206, 245, 204, 199, 6, 73, 60, 20, 230, 211, 233,
+   94, 200, 88, 9, 74, 155, 33, 15, 219, 130, 226, 202, 83, 236, 42, 172,
+   165, 218, 55, 222, 46, 107, 98, 154, 109, 67, 196, 178, 127, 158, 13, 243,
+   65, 79, 166, 248, 25, 224, 115, 80, 68, 51, 184, 128, 232, 208, 151, 122,
+   26, 212, 105, 43, 179, 213, 235, 148, 146, 89, 14, 195, 28, 78, 112, 76,
+   250, 47, 24, 251, 140, 108, 186, 190, 228, 170, 183, 139, 39, 188, 244, 246,
+   132, 48, 119, 144, 180, 138, 134, 193, 82, 182, 120, 121, 86, 220, 209, 3,
+   91, 241, 149, 85, 205, 150, 113, 216, 31, 100, 41, 164, 177, 214, 153, 231,
+   38, 71, 185, 174, 97, 201, 29, 95, 7, 92, 54, 254, 191, 118, 34, 221,
+   131, 11, 163, 99, 234, 81, 227, 147, 156, 176, 17, 142, 69, 12, 110, 62,
+   27, 255, 0, 194, 59, 116, 242, 252, 19, 21, 187, 53, 207, 129, 64, 135,
+   61, 40, 167, 237, 102, 223, 106, 159, 197, 189, 215, 137, 36, 32, 22, 5,
+
+   // and a second copy so we don't need an extra mask or static initializer
+   23, 125, 161, 52, 103, 117, 70, 37, 247, 101, 203, 169, 124, 126, 44, 123,
+   152, 238, 145, 45, 171, 114, 253, 10, 192, 136, 4, 157, 249, 30, 35, 72,
+   175, 63, 77, 90, 181, 16, 96, 111, 133, 104, 75, 162, 93, 56, 66, 240,
+   8, 50, 84, 229, 49, 210, 173, 239, 141, 1, 87, 18, 2, 198, 143, 57,
+   225, 160, 58, 217, 168, 206, 245, 204, 199, 6, 73, 60, 20, 230, 211, 233,
+   94, 200, 88, 9, 74, 155, 33, 15, 219, 130, 226, 202, 83, 236, 42, 172,
+   165, 218, 55, 222, 46, 107, 98, 154, 109, 67, 196, 178, 127, 158, 13, 243,
+   65, 79, 166, 248, 25, 224, 115, 80, 68, 51, 184, 128, 232, 208, 151, 122,
+   26, 212, 105, 43, 179, 213, 235, 148, 146, 89, 14, 195, 28, 78, 112, 76,
+   250, 47, 24, 251, 140, 108, 186, 190, 228, 170, 183, 139, 39, 188, 244, 246,
+   132, 48, 119, 144, 180, 138, 134, 193, 82, 182, 120, 121, 86, 220, 209, 3,
+   91, 241, 149, 85, 205, 150, 113, 216, 31, 100, 41, 164, 177, 214, 153, 231,
+   38, 71, 185, 174, 97, 201, 29, 95, 7, 92, 54, 254, 191, 118, 34, 221,
+   131, 11, 163, 99, 234, 81, 227, 147, 156, 176, 17, 142, 69, 12, 110, 62,
+   27, 255, 0, 194, 59, 116, 242, 252, 19, 21, 187, 53, 207, 129, 64, 135,
+   61, 40, 167, 237, 102, 223, 106, 159, 197, 189, 215, 137, 36, 32, 22, 5,
+};
+
+
+// perlin's gradient has 12 cases so some get used 1/16th of the time
+// and some 2/16ths. We reduce bias by changing those fractions
+// to 5/64ths and 6/64ths
+
+// this array is designed to match the previous implementation
+// of gradient hash: indices[stb__perlin_randtab[i]&63]
+static unsigned char stb__perlin_randtab_grad_idx[512] =
+{
+    7, 9, 5, 0, 11, 1, 6, 9, 3, 9, 11, 1, 8, 10, 4, 7,
+    8, 6, 1, 5, 3, 10, 9, 10, 0, 8, 4, 1, 5, 2, 7, 8,
+    7, 11, 9, 10, 1, 0, 4, 7, 5, 0, 11, 6, 1, 4, 2, 8,
+    8, 10, 4, 9, 9, 2, 5, 7, 9, 1, 7, 2, 2, 6, 11, 5,
+    5, 4, 6, 9, 0, 1, 1, 0, 7, 6, 9, 8, 4, 10, 3, 1,
+    2, 8, 8, 9, 10, 11, 5, 11, 11, 2, 6, 10, 3, 4, 2, 4,
+    9, 10, 3, 2, 6, 3, 6, 10, 5, 3, 4, 10, 11, 2, 9, 11,
+    1, 11, 10, 4, 9, 4, 11, 0, 4, 11, 4, 0, 0, 0, 7, 6,
+    10, 4, 1, 3, 11, 5, 3, 4, 2, 9, 1, 3, 0, 1, 8, 0,
+    6, 7, 8, 7, 0, 4, 6, 10, 8, 2, 3, 11, 11, 8, 0, 2,
+    4, 8, 3, 0, 0, 10, 6, 1, 2, 2, 4, 5, 6, 0, 1, 3,
+    11, 9, 5, 5, 9, 6, 9, 8, 3, 8, 1, 8, 9, 6, 9, 11,
+    10, 7, 5, 6, 5, 9, 1, 3, 7, 0, 2, 10, 11, 2, 6, 1,
+    3, 11, 7, 7, 2, 1, 7, 3, 0, 8, 1, 1, 5, 0, 6, 10,
+    11, 11, 0, 2, 7, 0, 10, 8, 3, 5, 7, 1, 11, 1, 0, 7,
+    9, 0, 11, 5, 10, 3, 2, 3, 5, 9, 7, 9, 8, 4, 6, 5,
+
+    // and a second copy so we don't need an extra mask or static initializer
+    7, 9, 5, 0, 11, 1, 6, 9, 3, 9, 11, 1, 8, 10, 4, 7,
+    8, 6, 1, 5, 3, 10, 9, 10, 0, 8, 4, 1, 5, 2, 7, 8,
+    7, 11, 9, 10, 1, 0, 4, 7, 5, 0, 11, 6, 1, 4, 2, 8,
+    8, 10, 4, 9, 9, 2, 5, 7, 9, 1, 7, 2, 2, 6, 11, 5,
+    5, 4, 6, 9, 0, 1, 1, 0, 7, 6, 9, 8, 4, 10, 3, 1,
+    2, 8, 8, 9, 10, 11, 5, 11, 11, 2, 6, 10, 3, 4, 2, 4,
+    9, 10, 3, 2, 6, 3, 6, 10, 5, 3, 4, 10, 11, 2, 9, 11,
+    1, 11, 10, 4, 9, 4, 11, 0, 4, 11, 4, 0, 0, 0, 7, 6,
+    10, 4, 1, 3, 11, 5, 3, 4, 2, 9, 1, 3, 0, 1, 8, 0,
+    6, 7, 8, 7, 0, 4, 6, 10, 8, 2, 3, 11, 11, 8, 0, 2,
+    4, 8, 3, 0, 0, 10, 6, 1, 2, 2, 4, 5, 6, 0, 1, 3,
+    11, 9, 5, 5, 9, 6, 9, 8, 3, 8, 1, 8, 9, 6, 9, 11,
+    10, 7, 5, 6, 5, 9, 1, 3, 7, 0, 2, 10, 11, 2, 6, 1,
+    3, 11, 7, 7, 2, 1, 7, 3, 0, 8, 1, 1, 5, 0, 6, 10,
+    11, 11, 0, 2, 7, 0, 10, 8, 3, 5, 7, 1, 11, 1, 0, 7,
+    9, 0, 11, 5, 10, 3, 2, 3, 5, 9, 7, 9, 8, 4, 6, 5,
+};
+
+static float stb__perlin_lerp(float a, float b, float t)
+{
+   return a + (b-a) * t;
+}
+
+static int stb__perlin_fastfloor(float a)
+{
+    int ai = (int) a;
+    return (a < ai) ? ai-1 : ai;
+}
+
+// different grad function from Perlin's, but easy to modify to match reference
+static float stb__perlin_grad(int grad_idx, float x, float y, float z)
+{
+   static float basis[12][4] =
+   {
+      {  1, 1, 0 },
+      { -1, 1, 0 },
+      {  1,-1, 0 },
+      { -1,-1, 0 },
+      {  1, 0, 1 },
+      { -1, 0, 1 },
+      {  1, 0,-1 },
+      { -1, 0,-1 },
+      {  0, 1, 1 },
+      {  0,-1, 1 },
+      {  0, 1,-1 },
+      {  0,-1,-1 },
+   };
+
+   float *grad = basis[grad_idx];
+   return grad[0]*x + grad[1]*y + grad[2]*z;
+}
+
+float stb_perlin_noise3_internal(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed)
+{
+   float u,v,w;
+   float n000,n001,n010,n011,n100,n101,n110,n111;
+   float n00,n01,n10,n11;
+   float n0,n1;
+
+   unsigned int x_mask = (x_wrap-1) & 255;
+   unsigned int y_mask = (y_wrap-1) & 255;
+   unsigned int z_mask = (z_wrap-1) & 255;
+   int px = stb__perlin_fastfloor(x);
+   int py = stb__perlin_fastfloor(y);
+   int pz = stb__perlin_fastfloor(z);
+   int x0 = px & x_mask, x1 = (px+1) & x_mask;
+   int y0 = py & y_mask, y1 = (py+1) & y_mask;
+   int z0 = pz & z_mask, z1 = (pz+1) & z_mask;
+   int r0,r1, r00,r01,r10,r11;
+
+   #define stb__perlin_ease(a)   (((a*6-15)*a + 10) * a * a * a)
+
+   x -= px; u = stb__perlin_ease(x);
+   y -= py; v = stb__perlin_ease(y);
+   z -= pz; w = stb__perlin_ease(z);
+
+   r0 = stb__perlin_randtab[x0+seed];
+   r1 = stb__perlin_randtab[x1+seed];
+
+   r00 = stb__perlin_randtab[r0+y0];
+   r01 = stb__perlin_randtab[r0+y1];
+   r10 = stb__perlin_randtab[r1+y0];
+   r11 = stb__perlin_randtab[r1+y1];
+
+   n000 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z0], x  , y  , z   );
+   n001 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z1], x  , y  , z-1 );
+   n010 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z0], x  , y-1, z   );
+   n011 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z1], x  , y-1, z-1 );
+   n100 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z0], x-1, y  , z   );
+   n101 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z1], x-1, y  , z-1 );
+   n110 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z0], x-1, y-1, z   );
+   n111 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z1], x-1, y-1, z-1 );
+
+   n00 = stb__perlin_lerp(n000,n001,w);
+   n01 = stb__perlin_lerp(n010,n011,w);
+   n10 = stb__perlin_lerp(n100,n101,w);
+   n11 = stb__perlin_lerp(n110,n111,w);
+
+   n0 = stb__perlin_lerp(n00,n01,v);
+   n1 = stb__perlin_lerp(n10,n11,v);
+
+   return stb__perlin_lerp(n0,n1,u);
+}
+
+float stb_perlin_noise3(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap)
+{
+    return stb_perlin_noise3_internal(x,y,z,x_wrap,y_wrap,z_wrap,0);
+}
+
+float stb_perlin_noise3_seed(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, int seed)
+{
+    return stb_perlin_noise3_internal(x,y,z,x_wrap,y_wrap,z_wrap, (unsigned char) seed);
+}
+
+float stb_perlin_ridge_noise3(float x, float y, float z, float lacunarity, float gain, float offset, int octaves)
+{
+   int i;
+   float frequency = 1.0f;
+   float prev = 1.0f;
+   float amplitude = 0.5f;
+   float sum = 0.0f;
+
+   for (i = 0; i < octaves; i++) {
+      float r = stb_perlin_noise3_internal(x*frequency,y*frequency,z*frequency,0,0,0,(unsigned char)i);
+      r = offset - (float) fabs(r);
+      r = r*r;
+      sum += r*amplitude*prev;
+      prev = r;
+      frequency *= lacunarity;
+      amplitude *= gain;
+   }
+   return sum;
+}
+
+float stb_perlin_fbm_noise3(float x, float y, float z, float lacunarity, float gain, int octaves)
+{
+   int i;
+   float frequency = 1.0f;
+   float amplitude = 1.0f;
+   float sum = 0.0f;
+
+   for (i = 0; i < octaves; i++) {
+      sum += stb_perlin_noise3_internal(x*frequency,y*frequency,z*frequency,0,0,0,(unsigned char)i)*amplitude;
+      frequency *= lacunarity;
+      amplitude *= gain;
+   }
+   return sum;
+}
+
+float stb_perlin_turbulence_noise3(float x, float y, float z, float lacunarity, float gain, int octaves)
+{
+   int i;
+   float frequency = 1.0f;
+   float amplitude = 1.0f;
+   float sum = 0.0f;
+
+   for (i = 0; i < octaves; i++) {
+      float r = stb_perlin_noise3_internal(x*frequency,y*frequency,z*frequency,0,0,0,(unsigned char)i)*amplitude;
+      sum += (float) fabs(r);
+      frequency *= lacunarity;
+      amplitude *= gain;
+   }
+   return sum;
+}
+
+float stb_perlin_noise3_wrap_nonpow2(float x, float y, float z, int x_wrap, int y_wrap, int z_wrap, unsigned char seed)
+{
+   float u,v,w;
+   float n000,n001,n010,n011,n100,n101,n110,n111;
+   float n00,n01,n10,n11;
+   float n0,n1;
+
+   int px = stb__perlin_fastfloor(x);
+   int py = stb__perlin_fastfloor(y);
+   int pz = stb__perlin_fastfloor(z);
+   int x_wrap2 = (x_wrap ? x_wrap : 256);
+   int y_wrap2 = (y_wrap ? y_wrap : 256);
+   int z_wrap2 = (z_wrap ? z_wrap : 256);
+   int x0 = px % x_wrap2, x1;
+   int y0 = py % y_wrap2, y1;
+   int z0 = pz % z_wrap2, z1;
+   int r0,r1, r00,r01,r10,r11;
+
+   if (x0 < 0) x0 += x_wrap2;
+   if (y0 < 0) y0 += y_wrap2;
+   if (z0 < 0) z0 += z_wrap2;
+   x1 = (x0+1) % x_wrap2;
+   y1 = (y0+1) % y_wrap2;
+   z1 = (z0+1) % z_wrap2;
+
+   #define stb__perlin_ease(a)   (((a*6-15)*a + 10) * a * a * a)
+
+   x -= px; u = stb__perlin_ease(x);
+   y -= py; v = stb__perlin_ease(y);
+   z -= pz; w = stb__perlin_ease(z);
+
+   r0 = stb__perlin_randtab[x0];
+   r0 = stb__perlin_randtab[r0+seed];
+   r1 = stb__perlin_randtab[x1];
+   r1 = stb__perlin_randtab[r1+seed];
+
+   r00 = stb__perlin_randtab[r0+y0];
+   r01 = stb__perlin_randtab[r0+y1];
+   r10 = stb__perlin_randtab[r1+y0];
+   r11 = stb__perlin_randtab[r1+y1];
+
+   n000 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z0], x  , y  , z   );
+   n001 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r00+z1], x  , y  , z-1 );
+   n010 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z0], x  , y-1, z   );
+   n011 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r01+z1], x  , y-1, z-1 );
+   n100 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z0], x-1, y  , z   );
+   n101 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r10+z1], x-1, y  , z-1 );
+   n110 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z0], x-1, y-1, z   );
+   n111 = stb__perlin_grad(stb__perlin_randtab_grad_idx[r11+z1], x-1, y-1, z-1 );
+
+   n00 = stb__perlin_lerp(n000,n001,w);
+   n01 = stb__perlin_lerp(n010,n011,w);
+   n10 = stb__perlin_lerp(n100,n101,w);
+   n11 = stb__perlin_lerp(n110,n111,w);
+
+   n0 = stb__perlin_lerp(n00,n01,v);
+   n1 = stb__perlin_lerp(n10,n11,v);
+
+   return stb__perlin_lerp(n0,n1,u);
+}
+#endif  // STB_PERLIN_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/

From 8b5f1f37b5b75829fc72d38e7b5d4bcbf8a26d55 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Thu, 8 Sep 2022 09:40:02 -0700
Subject: [PATCH 128/170] update readme

---
 README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 9fa5d36..dd33a3b 100644
--- a/README.md
+++ b/README.md
@@ -22,12 +22,13 @@ library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
 **[stb_hexwave.h](stb_hexwave.h)** | 0.5 | audio | 680 | audio waveform synthesizer
-**[stb_image.h](stb_image.h)** | 2.27 | graphics | 7897 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
-**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
+**[stb_image.h](stb_image.h)** | 2.27x | graphics | 7901 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5084 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
 **[stb_image_resize.h](stb_image_resize.h)** | 0.97 | graphics | 2634 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
-**[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
+**[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds
+**[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1897 | typesafe dynamic array and hash tables for C, will compile in C++
 **[stb_sprintf.h](stb_sprintf.h)** | 1.10 | utility | 1906 | fast sprintf, snprintf for C/C++
 **[stb_textedit.h](stb_textedit.h)** | 1.14 | user&nbsp;interface | 1429 | guts of a text editor for games etc implementing them from scratch
 **[stb_voxel_render.h](stb_voxel_render.h)** | 0.89 | 3D&nbsp;graphics | 3807 | Minecraft-esque voxel rendering "engine" with many more features
@@ -41,8 +42,8 @@ library    | lastest version | category | LoC | description
 **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
-Total libraries: 20
-Total lines of C code: 42599
+Total libraries: 21
+Total lines of C code: 43040
 
 
 FAQ

From 2a02ff76b5f4d2a0b01f0eee289c47e0901c49b6 Mon Sep 17 00:00:00 2001
From: Neil Bickford <nbickford@nvidia.com>
Date: Thu, 7 Oct 2021 13:00:32 -0700
Subject: [PATCH 129/170] Fixes two stb_image issues that could occur with
 specially constructed HDR and PGM files.

Signed-off-by: Neil Bickford <nbickford@nvidia.com>
---
 stb_image.h | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index d60371b..8518c05 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -108,7 +108,7 @@ RECENT REVISION HISTORY:
     Cass Everitt            Ryamond Barbiero                        github:grim210
     Paul Du Bois            Engin Manap        Aldo Culquicondor    github:sammyhw
     Philipp Wiesemann       Dale Weiler        Oriol Ferrer Mesia   github:phprus
-    Josh Tobin                                 Matthew Gregan       github:poppolopoppo
+    Josh Tobin              Neil Bickford      Matthew Gregan       github:poppolopoppo
     Julian Raschke          Gregory Mullen     Christian Floisand   github:darealshinji
     Baldur Karlsson         Kevin Schmidt      JR Smith             github:Michaelangel007
                             Brad Weinberger    Matvey Cherevko      github:mosra
@@ -7187,12 +7187,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
                   // Run
                   value = stbi__get8(s);
                   count -= 128;
-                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = value;
                } else {
                   // Dump
-                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = stbi__get8(s);
                }
@@ -7446,10 +7446,17 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req
 
    out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8));
+   if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
+      STBI_FREE(out);
+      return stbi__errpuc("bad PNM", "PNM file truncated");
+   }
 
    if (req_comp && req_comp != s->img_n) {
-      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      if (ri->bits_per_channel == 16) {
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y);
+      } else {
+         out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+      }
       if (out == NULL) return out; // stbi__convert_format frees input on failure
    }
    return out;

From 84b94010a7b08003cc3fb93635582849398e7ae2 Mon Sep 17 00:00:00 2001
From: Neil Bickford <nbickford@nvidia.com>
Date: Tue, 22 Feb 2022 23:48:42 -0800
Subject: [PATCH 130/170] Add checks for PNM integer read overflows, add a 1GB
 limit on IDAT chunk sizes to fix an OOM issue, and check for a situation
 where a sequence of bad Huffman code reads could result in a left shift by a
 negative number.

---
 stb_image.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/stb_image.h b/stb_image.h
index d60371b..6321f5e 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -2283,6 +2283,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
             k += (r >> 4) & 15; // run
             s = r & 15; // combined length
             j->code_buffer <<= s;
+            if (s > j->code_bits) return stbi__err("bad huffman code","Combined length longer than code bits available");
             j->code_bits -= s;
             zig = stbi__jpeg_dezigzag[k++];
             data[zig] = (short) ((r >> 8) * (1 << shift));
@@ -5116,6 +5117,7 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (first) return stbi__err("first not IHDR", "Corrupt PNG");
             if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
             if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
+            if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
             if ((int)(ioff + c.length) < (int)ioff) return 0;
             if (ioff + c.length > idata_limit) {
                stbi__uint32 idata_limit_old = idata_limit;
@@ -7486,6 +7488,8 @@ static int      stbi__pnm_getinteger(stbi__context *s, char *c)
    while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
       value = value*10 + (*c - '0');
       *c = (char) stbi__get8(s);
+      if((value > 214748364) || (value == 214748364 && *c > '7'))
+          return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int");
    }
 
    return value;
@@ -7516,9 +7520,13 @@ static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
    stbi__pnm_skip_whitespace(s, &c);
 
    *x = stbi__pnm_getinteger(s, &c); // read width
+   if(*x == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
    stbi__pnm_skip_whitespace(s, &c);
 
    *y = stbi__pnm_getinteger(s, &c); // read height
+   if (*y == 0)
+       return stbi__err("invalid width", "PPM image header had zero or overflowing width");
    stbi__pnm_skip_whitespace(s, &c);
 
    maxv = stbi__pnm_getinteger(s, &c);  // read max value

From 96fe76c21308653d22672e986dd39506f6871421 Mon Sep 17 00:00:00 2001
From: Neil Bickford <nbickford@nvidia.com>
Date: Wed, 23 Feb 2022 00:53:34 -0800
Subject: [PATCH 131/170] Add range checks to fix a few crash issues in
 stb_image issues 1289 and 1291

---
 stb_image.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 6321f5e..800c83d 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1985,9 +1985,12 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
    int i,j,k=0;
    unsigned int code;
    // build size list for each symbol (from JPEG spec)
-   for (i=0; i < 16; ++i)
-      for (j=0; j < count[i]; ++j)
+   for (i=0; i < 16; ++i) {
+      for (j=0; j < count[i]; ++j) {
          h->size[k++] = (stbi_uc) (i+1);
+         if(k >= 257) return stbi__err("bad size list","Corrupt JPEG");
+      }
+   }
    h->size[k] = 0;
 
    // compute actual symbols (from jpeg spec)
@@ -2112,6 +2115,8 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
 
    // convert the huffman code to the symbol id
    c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   if(c < 0 || c >= 256) // symbol id out of bounds!
+       return -1;
    STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
 
    // convert the id to a symbol
@@ -3103,6 +3108,7 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
                sizes[i] = stbi__get8(z->s);
                n += sizes[i];
             }
+            if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values!
             L -= 17;
             if (tc == 0) {
                if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;

From 47164e4086c1349ef3042fb04e0f7f7ceaf1fcee Mon Sep 17 00:00:00 2001
From: Neil Bickford <nbickford@nvidia.com>
Date: Wed, 23 Feb 2022 23:48:49 -0800
Subject: [PATCH 132/170] Add checks for signed integer overflow; further guard
 against cases where stbi__grow_buffer_unsafe doesn't read all bits required.

---
 stb_image.h | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 800c83d..9d10099 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1063,6 +1063,23 @@ static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
 }
 #endif
 
+// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow.
+static int stbi__addints_valid(int a, int b)
+{
+   if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow
+   if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
+   return a <= INT_MAX - b;
+}
+
+// returns 1 if the product of two signed shorts is valid, 0 on overflow.
+static int stbi__mul2shorts_valid(short a, short b)
+{
+   if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
+   if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid
+   if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN
+   return a >= SHRT_MIN / b;
+}
+
 // stbi__err - error
 // stbi__errpf - error returning pointer to float
 // stbi__errpuc - error returning pointer to unsigned char
@@ -2135,6 +2152,7 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
    unsigned int k;
    int sgn;
    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
 
    sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
    k = stbi_lrot(j->code_buffer, n);
@@ -2149,6 +2167,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
 {
    unsigned int k;
    if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing
    k = stbi_lrot(j->code_buffer, n);
    j->code_buffer = k & ~stbi__bmask[n];
    k &= stbi__bmask[n];
@@ -2160,6 +2179,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
 {
    unsigned int k;
    if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing
    k = j->code_buffer;
    j->code_buffer <<= 1;
    --j->code_bits;
@@ -2197,8 +2217,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
    memset(data,0,64*sizeof(data[0]));
 
    diff = t ? stbi__extend_receive(j, t) : 0;
+   if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG");
    dc = j->img_comp[b].dc_pred + diff;
    j->img_comp[b].dc_pred = dc;
+   if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
    data[0] = (short) (dc * dequant[0]);
 
    // decode AC components, see JPEG spec
@@ -2212,6 +2234,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman
       if (r) { // fast-AC path
          k += (r >> 4) & 15; // run
          s = r & 15; // combined length
+         if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
          j->code_buffer <<= s;
          j->code_bits -= s;
          // decode into unzigzag'd location
@@ -2251,8 +2274,10 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__
       if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
       diff = t ? stbi__extend_receive(j, t) : 0;
 
+      if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG");
       dc = j->img_comp[b].dc_pred + diff;
       j->img_comp[b].dc_pred = dc;
+      if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
       data[0] = (short) (dc * (1 << j->succ_low));
    } else {
       // refinement scan for DC coefficient
@@ -2287,8 +2312,8 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__
          if (r) { // fast-AC path
             k += (r >> 4) & 15; // run
             s = r & 15; // combined length
+            if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available");
             j->code_buffer <<= s;
-            if (s > j->code_bits) return stbi__err("bad huffman code","Combined length longer than code bits available");
             j->code_bits -= s;
             zig = stbi__jpeg_dezigzag[k++];
             data[zig] = (short) ((r >> 8) * (1 << shift));

From 5cfc2a744ad7047cda2396cc67772f313a46093d Mon Sep 17 00:00:00 2001
From: Neil Bickford <nbickford@nvidia.com>
Date: Fri, 25 Feb 2022 14:27:31 -0800
Subject: [PATCH 133/170] Zero-initialize stbi__jpeg to avoid intermittent
 errors found by fuzz-testing

---
 stb_image.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/stb_image.h b/stb_image.h
index 9d10099..631e4e5 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -4008,6 +4008,7 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re
    unsigned char* result;
    stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
    if (!j) return stbi__errpuc("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
    STBI_NOTUSED(ri);
    j->s = s;
    stbi__setup_jpeg(j);
@@ -4021,6 +4022,7 @@ static int stbi__jpeg_test(stbi__context *s)
    int r;
    stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
    if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
    j->s = s;
    stbi__setup_jpeg(j);
    r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@@ -4046,6 +4048,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
    int result;
    stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
    if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg));
    j->s = s;
    result = stbi__jpeg_info_raw(j, x, y, comp);
    STBI_FREE(j);

From 9f22cc900810d93ed7808f31b90ade0c7c915e6a Mon Sep 17 00:00:00 2001
From: Neil Bickford <nbickford@nvidia.com>
Date: Fri, 15 Oct 2021 11:04:41 -0700
Subject: [PATCH 134/170] stb_image PNG: Checks for invalid DEFLATE codes.

Specifically, this rejects length codes 286 and 287, and distance codes 30 and 31.
This avoids a scenario in which a file could contain a table in which
0 corresponded to length code 287, which would result in writing 0 bits.

Signed-off-by: Neil Bickford <nbickford@nvidia.com>
---
 stb_image.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index d60371b..ab616c5 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -4256,11 +4256,12 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
             a->zout = zout;
             return 1;
          }
+         if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
          z -= 257;
          len = stbi__zlength_base[z];
          if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
          z = stbi__zhuffman_decode(a, &a->z_distance);
-         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG");
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data
          dist = stbi__zdist_base[z];
          if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
          if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");

From 4d160de463003e8e35660eeb1ed4891419dddbdb Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 22 Jan 2023 15:29:01 -0800
Subject: [PATCH 135/170] stb_image: Fix name of
 stbi_set_unpremultiply_on_load_thread impl

Bug reported by N-R-K. Fixes #1276.
---
 stb_image.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 8dd3cc4..a4b5d90 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -4956,7 +4956,7 @@ STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
 static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
 static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
 
-STBIDEF void stbi__unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
 {
    stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
    stbi__unpremultiply_on_load_set = 1;

From 038b6ab9ec485423c0231913b5b3bf1184e48f2a Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 22 Jan 2023 15:51:38 -0800
Subject: [PATCH 136/170] stb_image: Avoid stdint.h on 32-bit Symbian.

As suggested by "mupfdev" on Github. We don't have a way to test
here but seems simple enough.

Fixes issue #1321.
---
 stb_image.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index d230a78..720bffe 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -635,7 +635,7 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
    #endif
 #endif
 
-#ifdef _MSC_VER
+#if defined(_MSC_VER) || defined(__SYMBIAN32__)
 typedef unsigned short stbi__uint16;
 typedef   signed short stbi__int16;
 typedef unsigned int   stbi__uint32;

From 51d295e33e6ea9384653f52f1ffd3e99eff8df22 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 22 Jan 2023 16:16:02 -0800
Subject: [PATCH 137/170] stb_image: Add trailing semicolon to usage example

Fixes issue #1330 reported by lunasorcery.
---
 stb_image.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index 720bffe..9a032c7 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -140,7 +140,7 @@ RECENT REVISION HISTORY:
 //    // ... x = width, y = height, n = # 8-bit components per pixel ...
 //    // ... replace '0' with '1'..'4' to force that many components per pixel
 //    // ... but 'n' will always be the number that it would have been if you said 0
-//    stbi_image_free(data)
+//    stbi_image_free(data);
 //
 // Standard parameters:
 //    int *x                 -- outputs image width in pixels

From 1096389590d9dace29a7f4ce4f337d7a0905c58a Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 22 Jan 2023 18:52:04 -0800
Subject: [PATCH 138/170] stb_image: Accept some extra data between BMP header
 and payload

Limit to 1k, which is the maximum size of a 256-entry palette that
would ordinarily go there. It feels sensible to relax this a bit but
we don't want to go overboard.

Fixes issue #1325.
---
 stb_image.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 9a032c7..da378cb 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -5535,9 +5535,23 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
          psize = (info.offset - info.extra_read - info.hsz) >> 2;
    }
    if (psize == 0) {
-      if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) {
-        return stbi__errpuc("bad offset", "Corrupt BMP");
-      }
+	  // accept some number of extra bytes after the header, but if the offset points either to before
+	  // the header ends or implies a large amount of extra data, reject the file as malformed
+	  int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
+	  int header_limit = 1024; // max we actually read is below 256 bytes currently.
+	  int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
+	  if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
+		 return stbi__errpuc("bad header", "Corrupt BMP");
+	  }
+	  // we established that bytes_read_so_far is positive and sensible.
+	  // the first half of this test rejects offsets that are either too small positives, or
+	  // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
+	  // ensures the number computed in the second half of the test can't overflow.
+	  if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
+         return stbi__errpuc("bad offset", "Corrupt BMP");
+      } else {
+		 stbi__skip(s, info.offset - bytes_read_so_far);
+	  }
    }
 
    if (info.bpp == 24 && ma == 0xff000000)

From 0613e9c043cb356e2bdb66d9be520bf8e696a1b5 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 22 Jan 2023 19:42:55 -0800
Subject: [PATCH 139/170] stb_image: Tweak end-of-JPEG junk scanning loop.

Be a bit more picky about what we consider a valid marker.
0xff 0x00 (stuffed 0 byte) should not count.

Fixes issue #1409.
---
 stb_image.h | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index da378cb..91dc05c 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -3383,6 +3383,28 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
    return 1;
 }
 
+static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
+{
+   // some JPEGs have junk at end, skip over it but if we find what looks
+   // like a valid marker, resume there
+   while (!stbi__at_eof(j->s)) {
+      int x = stbi__get8(j->s);
+      while (x == 255) { // might be a marker
+         if (stbi__at_eof(j->s)) return STBI__MARKER_none;
+         x = stbi__get8(j->s);
+         if (x != 0x00 && x != 0xff) {
+            // not a stuffed zero or lead-in to another marker, looks
+            // like an actual marker, return it
+            return x;
+         }
+         // stuffed zero has x=0 now which ends the loop, meaning we go
+         // back to regular scan loop.
+         // repeated 0xff keeps trying to read the next byte of the marker.
+      }
+   }
+   return STBI__MARKER_none;
+}
+
 // decode image to YCbCr format
 static int stbi__decode_jpeg_image(stbi__jpeg *j)
 {
@@ -3399,14 +3421,7 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
          if (!stbi__process_scan_header(j)) return 0;
          if (!stbi__parse_entropy_coded_data(j)) return 0;
          if (j->marker == STBI__MARKER_none ) {
-            // handle 0s at the end of image data from IP Kamera 9060
-            while (!stbi__at_eof(j->s)) {
-               int x = stbi__get8(j->s);
-               if (x == 255) {
-                  j->marker = stbi__get8(j->s);
-                  break;
-               }
-            }
+         j->marker = stbi__skip_jpeg_junk_at_end(j);
             // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
          }
       } else if (stbi__DNL(m)) {

From 24fa1ff2e9877a14ec7ff7b3f49f422926fba279 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Sun, 22 Jan 2023 19:44:49 -0800
Subject: [PATCH 140/170] stb_image: Fix broken indenting introduced earlier

---
 stb_image.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 91dc05c..0844662 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -5550,23 +5550,23 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req
          psize = (info.offset - info.extra_read - info.hsz) >> 2;
    }
    if (psize == 0) {
-	  // accept some number of extra bytes after the header, but if the offset points either to before
-	  // the header ends or implies a large amount of extra data, reject the file as malformed
-	  int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
-	  int header_limit = 1024; // max we actually read is below 256 bytes currently.
-	  int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
-	  if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
-		 return stbi__errpuc("bad header", "Corrupt BMP");
-	  }
-	  // we established that bytes_read_so_far is positive and sensible.
-	  // the first half of this test rejects offsets that are either too small positives, or
-	  // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
-	  // ensures the number computed in the second half of the test can't overflow.
-	  if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
+      // accept some number of extra bytes after the header, but if the offset points either to before
+      // the header ends or implies a large amount of extra data, reject the file as malformed
+      int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original);
+      int header_limit = 1024; // max we actually read is below 256 bytes currently.
+      int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size.
+      if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) {
+         return stbi__errpuc("bad header", "Corrupt BMP");
+      }
+      // we established that bytes_read_so_far is positive and sensible.
+      // the first half of this test rejects offsets that are either too small positives, or
+      // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn
+      // ensures the number computed in the second half of the test can't overflow.
+      if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) {
          return stbi__errpuc("bad offset", "Corrupt BMP");
       } else {
-		 stbi__skip(s, info.offset - bytes_read_so_far);
-	  }
+         stbi__skip(s, info.offset - bytes_read_so_far);
+      }
    }
 
    if (info.bpp == 24 && ma == 0xff000000)

From 40eb865b3bd713ea5ece4b87f97ab1b469b894a4 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Mon, 23 Jan 2023 00:46:33 -0800
Subject: [PATCH 141/170] stb_image: header scan mode always needs to check for
 tRNS

For paletted images, header-scanning mode (used by stbi_info) kept
going after the image header to see if a tRNS (transparency) chunk
shows up to correctly determine the channel count, but we would
immediately return after IHDR for non-paletted images.

This is incorrect. We also change our reported channel count on
RGB images with a tRNS chunk, therefore non-paletted images also
have to resume scanning further chunks.

Update the logic to keep scanning regardless and report the
correct channel count from stbi_info for RGB images with tRNS
(constant alpha channel).

Fixes issue #1419.
---
 stb_image.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 0844662..9128cab 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -5115,14 +5115,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (!pal_img_n) {
                s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
                if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
-               if (scan == STBI__SCAN_header) return 1;
             } else {
                // if paletted, then pal_n is our final components, and
                // img_n is # components to decompress/filter.
                s->img_n = 1;
                if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
-               // if SCAN_header, have to scan to see if we have a tRNS
             }
+            // even with SCAN_header, have to scan to see if we have a tRNS
             break;
          }
 
@@ -5154,6 +5153,8 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
                if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
                has_trans = 1;
+               // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now.
+               if (scan == STBI__SCAN_header) { ++s->img_n; return 1; }
                if (z->depth == 16) {
                   for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
                } else {
@@ -5166,7 +5167,12 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
          case STBI__PNG_TYPE('I','D','A','T'): {
             if (first) return stbi__err("first not IHDR", "Corrupt PNG");
             if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
-            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
+            if (scan == STBI__SCAN_header) {
+               // header scan definitely stops at first IDAT
+               if (pal_img_n)
+                  s->img_n = pal_img_n;
+               return 1;
+            }
             if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes");
             if ((int)(ioff + c.length) < (int)ioff) return 0;
             if (ioff + c.length > idata_limit) {

From 18910607826071538e97c2401382726bd776e618 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 29 Jan 2023 06:17:45 -0800
Subject: [PATCH 142/170] a fix

---
 stb_image.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index d60371b..1e6d519 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.27x - public domain image loader - http://nothings.org/stb
                                   no warranty implied; use at your own risk
 
    Do this:
@@ -3377,15 +3377,19 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
             }
             // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
          }
+         m = stbi__get_marker(j);
+         if (STBI__RESTART(m))
+            m = stbi__get_marker(j);
       } else if (stbi__DNL(m)) {
          int Ld = stbi__get16be(j->s);
          stbi__uint32 NL = stbi__get16be(j->s);
          if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
          if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
+         m = stbi__get_marker(j);
       } else {
-         if (!stbi__process_marker(j, m)) return 0;
+         if (!stbi__process_marker(j, m)) return 1;
+         m = stbi__get_marker(j);
       }
-      m = stbi__get_marker(j);
    }
    if (j->progressive)
       stbi__jpeg_finish(j);

From 6199bf77130da41fd424722eeb7a8db4d766c4c6 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 29 Jan 2023 06:24:13 -0800
Subject: [PATCH 143/170] update stb_image to 2.28

---
 README.md            | 11 +++++------
 stb_image.h          |  2 +-
 tests/image_test.dsp |  5 +++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index dd33a3b..80efff9 100644
--- a/README.md
+++ b/README.md
@@ -22,13 +22,12 @@ library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
 **[stb_hexwave.h](stb_hexwave.h)** | 0.5 | audio | 680 | audio waveform synthesizer
-**[stb_image.h](stb_image.h)** | 2.27x | graphics | 7901 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
-**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5084 | parse, decode, and rasterize characters from truetype fonts
+**[stb_image.h](stb_image.h)** | 2.28 | graphics | 7986 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
 **[stb_image_resize.h](stb_image_resize.h)** | 0.97 | graphics | 2634 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
-**[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds
-**[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1897 | typesafe dynamic array and hash tables for C, will compile in C++
+**[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
 **[stb_sprintf.h](stb_sprintf.h)** | 1.10 | utility | 1906 | fast sprintf, snprintf for C/C++
 **[stb_textedit.h](stb_textedit.h)** | 1.14 | user&nbsp;interface | 1429 | guts of a text editor for games etc implementing them from scratch
 **[stb_voxel_render.h](stb_voxel_render.h)** | 0.89 | 3D&nbsp;graphics | 3807 | Minecraft-esque voxel rendering "engine" with many more features
@@ -42,8 +41,8 @@ library    | lastest version | category | LoC | description
 **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
-Total libraries: 21
-Total lines of C code: 43040
+Total libraries: 20
+Total lines of C code: 42688
 
 
 FAQ
diff --git a/stb_image.h b/stb_image.h
index b481426..8526d3e 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.27x - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
                                   no warranty implied; use at your own risk
 
    Do this:
diff --git a/tests/image_test.dsp b/tests/image_test.dsp
index e3f66b6..65f0aaf 100644
--- a/tests/image_test.dsp
+++ b/tests/image_test.dsp
@@ -39,9 +39,10 @@ RSC=rc.exe
 # PROP Use_Debug_Libraries 0
 # PROP Output_Dir "Release"
 # PROP Intermediate_Dir "Release"
+# PROP Ignore_Export_Lib 0
 # PROP Target_Dir ""
 # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
-# ADD CPP /nologo /W3 /GX /O2 /I ".." /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
+# ADD CPP /nologo /W3 /GX /Zi /O2 /I ".." /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /FD /c
 # SUBTRACT CPP /YX
 # ADD BASE RSC /l 0x409 /d "NDEBUG"
 # ADD RSC /l 0x409 /d "NDEBUG"
@@ -50,7 +51,7 @@ BSC32=bscmake.exe
 # ADD BSC32 /nologo
 LINK32=link.exe
 # ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
-# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386
 
 !ELSEIF  "$(CFG)" == "image_test - Win32 Debug"
 

From 3ecc60f25ae1391cf6434578ece782afa1458b56 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 29 Jan 2023 10:19:12 -0800
Subject: [PATCH 144/170] add version history update for stb_image

---
 stb_image.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/stb_image.h b/stb_image.h
index 8526d3e..5e807a0 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -48,6 +48,7 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
+      2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
       2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
       2.26  (2020-07-13) many minor fixes
       2.25  (2020-02-02) fix warnings

From 5736b15f7ea0ffb08dd38af21067c314d6a3aae9 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sun, 29 Jan 2023 10:46:04 -0800
Subject: [PATCH 145/170] re-add perlin noise again

---
 README.md         | 7 ++++---
 tools/README.list | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 80efff9..f228dac 100644
--- a/README.md
+++ b/README.md
@@ -22,11 +22,12 @@ library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
 **[stb_hexwave.h](stb_hexwave.h)** | 0.5 | audio | 680 | audio waveform synthesizer
-**[stb_image.h](stb_image.h)** | 2.28 | graphics | 7986 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_image.h](stb_image.h)** | 2.28 | graphics | 7987 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 **[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
 **[stb_image_resize.h](stb_image_resize.h)** | 0.97 | graphics | 2634 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
+**[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds
 **[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
 **[stb_sprintf.h](stb_sprintf.h)** | 1.10 | utility | 1906 | fast sprintf, snprintf for C/C++
 **[stb_textedit.h](stb_textedit.h)** | 1.14 | user&nbsp;interface | 1429 | guts of a text editor for games etc implementing them from scratch
@@ -41,8 +42,8 @@ library    | lastest version | category | LoC | description
 **[stb_leakcheck.h](stb_leakcheck.h)** | 0.6 | misc | 194 | quick-and-dirty malloc/free leak-checking
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
-Total libraries: 20
-Total lines of C code: 42688
+Total libraries: 21
+Total lines of C code: 43117
 
 
 FAQ
diff --git a/tools/README.list b/tools/README.list
index 6d9764b..294bf87 100644
--- a/tools/README.list
+++ b/tools/README.list
@@ -5,6 +5,7 @@ stb_truetype.h              | graphics         | parse, decode, and rasterize ch
 stb_image_write.h           | graphics         | image writing to disk: PNG, TGA, BMP
 stb_image_resize.h          | graphics         | resize images larger/smaller with good quality
 stb_rect_pack.h             | graphics         | simple 2D rectangle packer with decent quality
+stb_perlin.h                | graphics         | perlin's revised simplex noise w/ different seeds
 stb_ds.h                    | utility          | typesafe dynamic array and hash tables for C, will compile in C++
 stb_sprintf.h               | utility          | fast sprintf, snprintf for C/C++
 stb_textedit.h              | user interface   | guts of a text editor for games etc implementing them from scratch

From c4bbb6e75f688318b2df2b70c2df2d641c1a8481 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Mon, 9 Oct 2023 17:23:04 -0700
Subject: [PATCH 146/170] stb_image_resize2.h 2.00

---
 README.md                                     |     4 +-
 .../stb_image_resize.h                        |     0
 stb_image_resize2.h                           | 10303 ++++++++++++++++
 stb_image_resize_test/dotimings.c             |   224 +
 stb_image_resize_test/old_image_resize.h      |  2738 ++++
 stb_image_resize_test/oldir.c                 |    56 +
 stb_image_resize_test/stbirtest.c             |   992 ++
 stb_image_resize_test/vf_train.c              |   999 ++
 tests/resample_test.cpp                       |    13 +-
 tests/resize.dsp                              |     2 +-
 tests/stb.dsp                                 |     4 -
 tests/test_c_compilation.c                    |     5 +-
 tests/test_cpp_compilation.cpp                |     2 +-
 tools/README.list                             |     2 +-
 14 files changed, 15329 insertions(+), 15 deletions(-)
 rename stb_image_resize.h => deprecated/stb_image_resize.h (100%)
 create mode 100644 stb_image_resize2.h
 create mode 100644 stb_image_resize_test/dotimings.c
 create mode 100644 stb_image_resize_test/old_image_resize.h
 create mode 100644 stb_image_resize_test/oldir.c
 create mode 100644 stb_image_resize_test/stbirtest.c
 create mode 100644 stb_image_resize_test/vf_train.c

diff --git a/README.md b/README.md
index f228dac..cf7f162 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ library    | lastest version | category | LoC | description
 **[stb_image.h](stb_image.h)** | 2.28 | graphics | 7987 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 **[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
-**[stb_image_resize.h](stb_image_resize.h)** | 0.97 | graphics | 2634 | resize images larger/smaller with good quality
+**[stb_image_resize2.h](stb_image_resize2.h)** | 2.00 | graphics | 10303 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
 **[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds
 **[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
@@ -43,7 +43,7 @@ library    | lastest version | category | LoC | description
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
 Total libraries: 21
-Total lines of C code: 43117
+Total lines of C code: 50786
 
 
 FAQ
diff --git a/stb_image_resize.h b/deprecated/stb_image_resize.h
similarity index 100%
rename from stb_image_resize.h
rename to deprecated/stb_image_resize.h
diff --git a/stb_image_resize2.h b/stb_image_resize2.h
new file mode 100644
index 0000000..7aaeab0
--- /dev/null
+++ b/stb_image_resize2.h
@@ -0,0 +1,10303 @@
+/* stb_image_resize2 - v2.00 - public domain image resizing
+   
+   by Jeff Roberts (v2) and Jorge L Rodriguez 
+   http://github.com/nothings/stb
+
+   Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only 
+   scaling and translation is supported, no rotations or shears.
+
+   COMPILING & LINKING
+      In one C/C++ file that #includes this file, do this:
+         #define STB_IMAGE_RESIZE_IMPLEMENTATION
+      before the #include. That will create the implementation in that file.
+
+   PORTING FROM VERSION 1
+
+      The API has changed. You can continue to use the old version of stb_image_resize.h,
+      which is available in the "deprecated/" directory.
+
+      If you're using the old simple-to-use API, porting is straightforward.
+      (For more advanced APIs, read the documentation.)
+
+        stbir_resize_uint8():
+          - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout`
+
+        stbir_resize_float():
+          - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout`
+
+        stbir_resize_uint8_srgb():
+          - function name is unchanged
+          - cast channel count to `stbir_pixel_layout`
+          - above is sufficient unless your image has alpha and it's not RGBA/BGRA
+            - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode
+
+        stbir_resize_uint8_srgb_edgemode()
+          - switch to the "medium complexity" API
+          - stbir_resize(), very similar API but a few more parameters:
+            - pixel_layout: cast channel count to `stbir_pixel_layout`
+            - data_type:    STBIR_TYPE_UINT8_SRGB
+            - edge:         unchanged (STBIR_EDGE_WRAP, etc.)
+            - filter:       STBIR_FILTER_DEFAULT
+          - which channel is alpha is specified in stbir_pixel_layout, see enum for details
+
+   EASY API CALLS:
+     Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge.
+
+     stbir_resize_uint8_srgb( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                              output_pixels, output_w, output_h, output_stride_in_bytes,
+                              pixel_layout_enum )
+
+     stbir_resize_uint8_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                                output_pixels, output_w, output_h, output_stride_in_bytes,
+                                pixel_layout_enum )
+
+     stbir_resize_float_linear( input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                                output_pixels, output_w, output_h, output_stride_in_bytes,
+                                pixel_layout_enum )
+
+     If you pass NULL or zero for the output_pixels, we will allocate the output buffer
+     for you and return it from the function (free with free() or STBIR_FREE).
+     As a special case, XX_stride_in_bytes of 0 means packed continuously in memory.
+
+   API LEVELS
+      There are three levels of API - easy-to-use, medium-complexity and extended-complexity.
+
+      See the "header file" section of the source for API documentation.
+
+   ADDITIONAL DOCUMENTATION
+
+      MEMORY ALLOCATION
+         By default, we use malloc and free for memory allocation.  To override the 
+         memory allocation, before the implementation #include, add a:
+
+            #define STBIR_MALLOC(size,user_data) ...
+            #define STBIR_FREE(ptr,user_data)   ...
+
+         Each resize makes exactly one call to malloc/free (unless you use the 
+         extended API where you can do one allocation for many resizes). Under
+         address sanitizer, we do separate allocations to find overread/writes.
+
+      PERFORMANCE
+         This library was written with an emphasis on performance. When testing
+         stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with 
+         STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize
+         libs do by default). Also, make sure SIMD is turned on of course (default 
+         for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed.
+
+         This library also comes with profiling built-in. If you define STBIR_PROFILE,
+         you can use the advanced API and get low-level profiling information by 
+         calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info()
+         after a resize.
+
+      SIMD
+         Most of the routines have optimized SSE2, AVX, NEON and WASM versions. 
+
+         On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and 
+         ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or 
+         STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
+         or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2 
+         support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
+
+         On Linux, SSE2 and Neon is on by default for 64-bit x64 or ARM64. For 32-bit,
+         we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled
+         on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both
+         clang and GCC, but GCC also requires an additional -mfp16-format=ieee to 
+         automatically enable NEON.
+
+         On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions
+         for converting back and forth to half-floats. This is autoselected when we
+         are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses 
+         the built-in half float hardware NEON instructions. 
+
+         You can also tell us to use multiply-add instructions with STBIR_USE_FMA. 
+         Because x86 doesn't always have fma, we turn it off by default to maintain
+         determinism across all platforms. If you don't care about non-FMA determinism
+         and are willing to restrict yourself to more recent x86 CPUs (around the AVX 
+         timeframe), then fma will give you around a 15% speedup.
+
+         You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn
+         off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10%
+         to 40% faster, and AVX2 is generally another 12%.
+        
+      ALPHA CHANNEL
+         Most of the resizing functions provide the ability to control how the alpha 
+         channel of an image is processed.
+
+         When alpha represents transparency, it is important that when combining
+         colors with filtering, the pixels should not be treated equally; they
+         should use a weighted average based on their alpha values. For example,
+         if a pixel is 1% opaque bright green and another pixel is 99% opaque
+         black and you average them, the average will be 50% opaque, but the
+         unweighted average and will be a middling green color, while the weighted
+         average will be nearly black. This means the unweighted version introduced
+         green energy that didn't exist in the source image.
+
+         (If you want to know why this makes sense, you can work out the math for
+         the following: consider what happens if you alpha composite a source image
+         over a fixed color and then average the output, vs. if you average the
+         source image pixels and then composite that over the same fixed color.
+         Only the weighted average produces the same result as the ground truth
+         composite-then-average result.)
+
+         Therefore, it is in general best to "alpha weight" the pixels when applying
+         filters to them. This essentially means multiplying the colors by the alpha
+         values before combining them, and then dividing by the alpha value at the
+         end.
+
+         The computer graphics industry introduced a technique called "premultiplied
+         alpha" or "associated alpha" in which image colors are stored in image files
+         already multiplied by their alpha. This saves some math when compositing,
+         and also avoids the need to divide by the alpha at the end (which is quite
+         inefficient). However, while premultiplied alpha is common in the movie CGI
+         industry, it is not commonplace in other industries like videogames, and most
+         consumer file formats are generally expected to contain not-premultiplied
+         colors. For example, Photoshop saves PNG files "unpremultiplied", and web
+         browsers like Chrome and Firefox expect PNG images to be unpremultiplied.
+
+         Note that there are three possibilities that might describe your image
+         and resize expectation:
+
+             1. images are not premultiplied, alpha weighting is desired
+             2. images are not premultiplied, alpha weighting is not desired
+             3. images are premultiplied
+
+         Both case #2 and case #3 require the exact same math: no alpha weighting
+         should be applied or removed. Only case 1 requires extra math operations;
+         the other two cases can be handled identically.
+
+         stb_image_resize expects case #1 by default, applying alpha weighting to
+         images, expecting the input images to be unpremultiplied. This is what the
+         COLOR+ALPHA buffer types tell the resizer to do. 
+
+         When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB, 
+         STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are 
+         non-premultiplied. In these cases, the resizer will alpha weight the colors 
+         (effectively creating the premultiplied image), do the filtering, and then 
+         convert back to non-premult on exit.
+
+         When you use the pixel layouts STBIR_RGBA_PM, STBIR_RGBA_PM, STBIR_RGBA_PM,
+         STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels 
+         ARE premultiplied. In this case, the resizer doesn't have to do the 
+         premultipling - it can filter directly on the input. This about twice as 
+         fast as the non-premultiplied case, so it's the right option if your data is 
+         already setup correctly.
+
+         When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are 
+         telling us that there is no channel that represents transparency; it may be 
+         RGB and some unrelated fourth channel that has been stored in the alpha 
+         channel, but it is actually not alpha. No special processing will be 
+         performed. 
+
+         The difference between the generic 4 or 2 channel layouts, and the 
+         specialized _PM versions is with the _PM versions you are telling us that
+         the data *is* alpha, just don't premultiply it. That's important when
+         using SRGB pixel formats, we need to know where the alpha is, because
+         it is converted linearly (rather than with the SRGB converters).
+   
+         Because alpha weighting produces the same effect as premultiplying, you
+         even have the option with non-premultiplied inputs to let the resizer
+         produce a premultiplied output. Because the intially computed alpha-weighted
+         output image is effectively premultiplied, this is actually more performant
+         than the normal path which un-premultiplies the output image as a final step.
+
+         Finally, when converting both in and out of non-premulitplied space (for
+         example, when using STBIR_RGBA), we go to somewhat heroic measures to 
+         ensure that areas with zero alpha value pixels get something reasonable 
+         in the RGB values. If you don't care about the RGB values of zero alpha 
+         pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality() 
+         function - this runs a premultiplied resize about 25% faster. That said,
+         when you really care about speed, using premultiplied pixels for both in
+         and out (STBIR_RGBA_PM, etc) much faster than both of these premultiplied
+         options.
+
+      PIXEL LAYOUT CONVERSION
+         The resizer can convert from some pixel layouts to others. When using the
+         stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA
+         on input, and STBIR_ARGB on output, and it will re-organize the channels
+         during the resize. Currently, you can only convert between two pixel
+         layouts with the same number of channels.
+
+      DETERMINISM
+         We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc). 
+         This requires compiling with fast-math off (using at least /fp:precise). 
+         Also, you must turn off fp-contracting (which turns mult+adds into fmas)!
+         We attempt to do this with pragmas, but with Clang, you usually want to add 
+         -ffp-contract=off to the command line as well.
+
+         For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is, 
+         if the scalar x87 unit gets used at all, we immediately lose determinism. 
+         On Microsoft Visual Studio 2008 and earlier, from what we can tell there is
+         no way to be deterministic in 32-bit x86 (some x87 always leaks in, even 
+         with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and 
+         -fpmath=sse.
+
+         Note that we will not be deterministic with float data containing NaNs -
+         the NaNs will propagate differently on different SIMD and platforms. 
+
+         If you turn on STBIR_USE_FMA, then we will be deterministic with other 
+         fma targets, but we will differ from non-fma targets (this is unavoidable, 
+         because a fma isn't simply an add with a mult - it also introduces a 
+         rounding difference compared to non-fma instruction sequences. 
+
+      FLOAT PIXEL FORMAT RANGE
+         Any range of values can be used for the non-alpha float data that you pass 
+         in (0 to 1, -1 to 1, whatever). However, if you are inputting float values 
+         but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we 
+         scale back properly. The alpha channel must also be 0 to 1 for any format 
+         that does premultiplication prior to resizing. 
+
+         Note also that with float output, using filters with negative lobes, the 
+         output filtered values might go slightly out of range. You can define 
+         STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range 
+         to clamp to on output, if that's important. 
+
+      MAX/MIN SCALE FACTORS
+         The input pixel resolutions are in integers, and we do the internal pointer
+         resolution in size_t sized integers. However, the scale ratio from input
+         resolution to output resolution is calculated in float form. This means
+         the effective possible scale ratio is limited to 24 bits (or 16 million
+         to 1). As you get close to the size of the float resolution (again, 16
+         million pixels wide or high), you might start seeing float inaccuracy
+         issues in general in the pipeline. If you have to do extreme resizes,
+         you can usually do this is multiple stages (using float intermediate
+         buffers).
+
+      FLIPPED IMAGES
+         Stride is just the delta from one scanline to the next. This means you can 
+         use a negative stride to handle inverted images (point to the final 
+         scanline and use a negative stride). You can invert the input or output,
+         using negative strides.
+
+      DEFAULT FILTERS
+         For functions which don't provide explicit control over what filters to 
+         use, you can change the compile-time defaults with:
+
+            #define STBIR_DEFAULT_FILTER_UPSAMPLE     STBIR_FILTER_something
+            #define STBIR_DEFAULT_FILTER_DOWNSAMPLE   STBIR_FILTER_something
+
+         See stbir_filter in the header-file section for the list of filters.
+
+      NEW FILTERS
+         A number of 1D filter kernels are supplied. For a list of supported 
+         filters, see the stbir_filter enum. You can install your own filters by 
+         using the stbir_set_filter_callbacks function.
+
+      PROGRESS
+         For interactive use with slow resize operations, you can use the the 
+         scanline callbacks in the extended API. It would have to be a *very* large 
+         image resample to need progress though - we're very fast.
+
+      CEIL and FLOOR
+         In scalar mode, the only functions we use from math.h are ceilf and floorf, 
+         but if you have your own versions, you can define the STBIR_CEILF(v) and 
+         STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
+         our own versions.
+
+      ASSERT
+         Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
+
+      FUTURE TODOS
+        *  For polyphase integral filters, we just memcpy the coeffs to dupe
+           them, but we should indirect and use the same coeff memory.
+        *  Add pixel layout conversions for sensible different channel counts
+           (maybe, 1->3/4, 3->4, 4->1, 3->1).
+         * For SIMD encode and decode scanline routines, do any pre-aligning
+           for bad input/output buffer alignments and pitch?
+         * For very wide scanlines, we should we do vertical strips to stay within
+           L2 cache. Maybe do chunks of 1K pixels at a time. There would be 
+           some pixel reconversion, but probably dwarfed by things falling out
+           of cache. Probably also something possible with alternating between
+           scattering and gathering at high resize scales?
+         * Rewrite the coefficient generator to do many at once.
+         * AVX-512 vertical kernels - worried about downclocking here.
+         * Convert the reincludes to macros when we know they aren't changing.
+         * Experiment with pivoting the horizontal and always using the
+           vertical filters (which are faster, but perhaps not enough to overcome
+           the pivot cost and the extra memory touches). Need to buffer the whole
+           image so have to balance memory use.
+         * Most of our code is internally function pointers, should we compile
+           all the SIMD stuff always and dynamically dispatch? 
+
+   CONTRIBUTORS
+      Jeff Roberts: 2.0 implementation, optimizations, SIMD
+      Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer.
+      Fabian Giesen: half float and srgb converters
+      Sean Barrett: API design, optimizations
+      Jorge L Rodriguez: Original 1.0 implementation
+      Aras Pranckevicius: bugfixes for 1.0
+      Nathan Reed: warning fixes for 1.0
+
+   REVISIONS
+      2.00 (2022-02-20) mostly new source: new api, optimizations, simd, vertical-first, etc 
+                       (2x-5x faster without simd, 4x-12x faster with simd)
+                       (in some cases, 20x to 40x faster - resizing to very small for example)
+      0.96 (2019-03-04) fixed warnings
+      0.95 (2017-07-23) fixed warnings
+      0.94 (2017-03-18) fixed warnings
+      0.93 (2017-03-03) fixed bug with certain combinations of heights
+      0.92 (2017-01-02) fix integer overflow on large (>2GB) images
+      0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
+      0.90 (2014-09-17) first released version
+
+   LICENSE
+     See end of file for license information.
+*/
+
+#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS)   // for internal re-includes
+
+#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+#define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+
+#include <stddef.h>
+#ifdef _MSC_VER
+typedef unsigned char    stbir_uint8;
+typedef unsigned short   stbir_uint16;
+typedef unsigned int     stbir_uint32;
+typedef unsigned __int64 stbir_uint64;
+#else
+#include <stdint.h>
+typedef uint8_t  stbir_uint8;
+typedef uint16_t stbir_uint16;
+typedef uint32_t stbir_uint32;
+typedef uint64_t stbir_uint64;
+#endif
+
+#ifdef _M_IX86_FP
+#if ( _M_IX86_FP >= 1 )
+#ifndef STBIR_SSE
+#define STBIR_SSE
+#endif
+#endif
+#endif 
+
+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
+  #ifndef STBIR_SSE2
+    #define STBIR_SSE2
+  #endif
+  #if defined(__AVX__) || defined(STBIR_AVX2)
+    #ifndef STBIR_AVX
+      #ifndef STBIR_NO_AVX
+        #define STBIR_AVX
+      #endif
+    #endif
+  #endif
+  #if defined(__AVX2__) || defined(STBIR_AVX2)
+    #ifndef STBIR_NO_AVX2
+      #ifndef STBIR_AVX2  
+        #define STBIR_AVX2
+      #endif
+      #if defined( _MSC_VER ) && !defined(__clang__)
+        #ifndef STBIR_FP16C  // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c
+          #define STBIR_FP16C
+        #endif
+      #endif
+    #endif
+  #endif
+  #ifdef __F16C__
+    #ifndef STBIR_FP16C  // turn on FP16C instructions if the define is set (for clang and gcc)
+      #define STBIR_FP16C
+    #endif
+  #endif
+#endif
+
+#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(_M_ARM) || (__ARM_NEON_FP & 4) != 0 &&  __ARM_FP16_FORMAT_IEEE != 0
+#ifndef STBIR_NEON
+#define STBIR_NEON
+#endif
+#endif
+
+#if defined(_M_ARM)
+#ifdef STBIR_USE_FMA
+#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC 
+#endif
+#endif
+
+#if defined(__wasm__) && defined(__wasm_simd128__)
+#ifndef STBIR_WASM
+#define STBIR_WASM
+#endif
+#endif
+
+#ifndef STBIRDEF
+#ifdef STB_IMAGE_RESIZE_STATIC
+#define STBIRDEF static
+#else
+#ifdef __cplusplus
+#define STBIRDEF extern "C"
+#else
+#define STBIRDEF extern
+#endif
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+////   start "header file" ///////////////////////////////////////////////////
+//
+// Easy-to-use API:
+//
+//     * stride is the offset between successive rows of image data 
+//        in memory, in bytes. specify 0 for packed continuously in memory
+//     * colorspace is linear or sRGB as specified by function name
+//     * Uses the default filters
+//     * Uses edge mode clamped
+//     * returned result is 1 for success or 0 in case of an error.
+
+
+// stbir_pixel_layout specifies:
+//   number of channels
+//   order of channels
+//   whether color is premultiplied by alpha
+// for back compatibility, you can cast the old channel count to an stbir_pixel_layout
+typedef enum 
+{
+  STBIR_BGR      = 0,               // 3-chan, with order specified (for channel flipping)
+  STBIR_1CHANNEL = 1,              
+  STBIR_2CHANNEL = 2,
+  STBIR_RGB      = 3,               // 3-chan, with order specified (for channel flipping) 
+  STBIR_RGBA     = 4,               // alpha formats, alpha is NOT premultiplied into color channels
+
+  STBIR_4CHANNEL = 5,
+  STBIR_BGRA = 6,
+  STBIR_ARGB = 7,
+  STBIR_ABGR = 8,
+  STBIR_RA   = 9,
+  STBIR_AR   = 10,
+
+  STBIR_RGBA_PM = 11,               // alpha formats, alpha is premultiplied into color channels
+  STBIR_BGRA_PM = 12,
+  STBIR_ARGB_PM = 13,
+  STBIR_ABGR_PM = 14,
+  STBIR_RA_PM   = 15,
+  STBIR_AR_PM   = 16,
+} stbir_pixel_layout;
+
+//===============================================================
+//  Simple-complexity API
+//
+//    If output_pixels is NULL (0), then we will allocate the buffer and return it to you.
+//--------------------------------
+
+STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                        unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                        stbir_pixel_layout pixel_type );
+
+STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                          unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                          stbir_pixel_layout pixel_type );
+
+STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                  float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                  stbir_pixel_layout pixel_type );
+//===============================================================
+
+//===============================================================
+// Medium-complexity API
+//
+// This extends the easy-to-use API as follows:
+//
+//     * Can specify the datatype - U8, U8_SRGB, U16, FLOAT, HALF_FLOAT
+//     * Edge wrap can selected explicitly
+//     * Filter can be selected explicitly
+//--------------------------------
+
+typedef enum
+{
+  STBIR_EDGE_CLAMP   = 0,
+  STBIR_EDGE_REFLECT = 1,
+  STBIR_EDGE_WRAP    = 2,  // this edge mode is slower and uses more memory
+  STBIR_EDGE_ZERO    = 3,
+} stbir_edge;
+
+typedef enum
+{
+  STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
+  STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
+  STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
+  STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netrevalli with B=1,C=0), gaussian-esque
+  STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline
+  STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netrevalli filter with B=1/3, C=1/3
+  STBIR_FILTER_POINT_SAMPLE = 6,  // Simple point sampling
+  STBIR_FILTER_OTHER        = 7,  // User callback specified
+} stbir_filter;
+
+typedef enum
+{
+  STBIR_TYPE_UINT8            = 0,
+  STBIR_TYPE_UINT8_SRGB       = 1,
+  STBIR_TYPE_UINT8_SRGB_ALPHA = 2,  // alpha channel, when present, should also be SRGB (this is very unusual)
+  STBIR_TYPE_UINT16           = 3,
+  STBIR_TYPE_FLOAT            = 4,
+  STBIR_TYPE_HALF_FLOAT       = 5
+} stbir_datatype;
+
+// medium api
+STBIRDEF void *  stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                     void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                               stbir_pixel_layout pixel_layout, stbir_datatype data_type,
+                               stbir_edge edge, stbir_filter filter );
+//===============================================================
+
+
+
+//===============================================================
+// Extended-complexity API
+//
+// This API exposes all resize functionality.
+//
+//     * Separate filter types for each axis
+//     * Separate edge modes for each axis
+//     * Separate input and output data types
+//     * Can specify regions with subpixel correctness
+//     * Can specify alpha flags
+//     * Can specify a memory callback 
+//     * Can specify a callback data type for pixel input and output 
+//     * Can be threaded for a single resize
+//     * Can be used to resize many frames without recalculating the sampler info
+//
+//  Use this API as follows:
+//     1) Call the stbir_resize_init function on a local STBIR_RESIZE structure
+//     2) Call any of the stbir_set functions
+//     3) Optionally call stbir_build_samplers() if you are going to resample multiple times
+//        with the same input and output dimensions (like resizing video frames)
+//     4) Resample by calling stbir_resize_extended().
+//     5) Call stbir_free_samplers() if you called stbir_build_samplers()
+//--------------------------------
+
+
+// Types:
+
+// INPUT CALLBACK: this callback is used for input scanlines
+typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context );
+
+// OUTPUT CALLBACK: this callback is used for output scanlines
+typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context );
+
+// callbacks for user installed filters
+typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero
+typedef float stbir__support_callback( float scale, void * user_data );
+
+// internal structure with precomputed scaling
+typedef struct stbir__info stbir__info; 
+
+typedef struct STBIR_RESIZE  // use the stbir_resize_init and stbir_override functions to set these values for future compatibility
+{
+  void * user_data;
+  void const * input_pixels;
+  int input_w, input_h;
+  double input_s0, input_t0, input_s1, input_t1;
+  stbir_input_callback * input_cb;
+  void * output_pixels;
+  int output_w, output_h;
+  int output_subx, output_suby, output_subw, output_subh;
+  stbir_output_callback * output_cb;
+  int input_stride_in_bytes;
+  int output_stride_in_bytes;
+  int splits;
+  int fast_alpha;
+  int needs_rebuild;
+  int called_alloc;
+  stbir_pixel_layout input_pixel_layout_public;
+  stbir_pixel_layout output_pixel_layout_public;
+  stbir_datatype input_data_type;
+  stbir_datatype output_data_type;
+  stbir_filter horizontal_filter, vertical_filter;
+  stbir_edge horizontal_edge, vertical_edge;
+  stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
+  stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support;
+  stbir__info * samplers;      
+} STBIR_RESIZE;
+
+// extended complexity api
+
+
+// First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls!
+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
+                                 const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
+                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
+                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type );
+
+//===============================================================
+// You can update these parameters any time after resize_init and there is no cost
+//--------------------------------
+
+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );         
+STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb );   // no callbacks by default
+STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data );                                               // pass back STBIR_RESIZE* by default
+STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );
+
+//===============================================================
+
+
+//===============================================================
+// If you call any of these functions, you will trigger a sampler rebuild!
+//--------------------------------
+
+STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout );  // sets new buffer layouts
+STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge );       // CLAMP by default
+
+STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support ); 
+
+STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh );        // sets both sub-regions (full regions by default)
+STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 );    // sets input sub-region (full region by default)
+STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default)
+
+// when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique
+//   that fills the zero alpha pixel's RGB values with something plausible.  If you don't care about areas of
+//   zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA
+//   types of resizes.
+STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality );
+//===============================================================
+
+
+//===============================================================
+// You can call build_samplers to prebuild all the internal data we need to resample.
+//   Then, if you call resize_extended many times with the same resize, you only pay the
+//   cost once.
+// If you do call build_samplers, you MUST call free_samplers eventually.
+//--------------------------------
+
+// This builds the samplers and does one allocation
+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ); 
+
+// You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
+STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
+//===============================================================
+
+
+// And this is the main function to perform the resize synchronously on one thread.
+STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
+
+
+//===============================================================
+// Use these functions for multithreading.
+//   1) You call stbir_build_samplers_with_splits first on the main thread
+//   2) Then stbir_resize_with_split on each thread
+//   3) stbir_free_samplers when done on the main thread
+//--------------------------------
+
+// This will build samplers for threading.
+//   You can pass in the number of threads you'd like to use (try_splits).
+//   It returns the number of splits (threads) that you can call it with.
+///  It might be less if the image resize can't be split up that many ways.
+
+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );             
+
+// This function does a split of the resizing (you call this fuction for each
+// split, on multiple threads). A split is a piece of the output resize pixel space.
+
+// Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split!
+
+// Usually, you will always call stbir_resize_split with split_start as the thread_index
+//   and "1" for the split_count.
+// But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
+//   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the 
+//   split_count each time to turn in into a 4 thread resize. (This is unusual).
+
+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );         
+//===============================================================
+
+
+//===============================================================
+// Pixel Callbacks info:
+//--------------------------------
+
+//   The input callback is super flexible - it calls you with the input address
+//   (based on the stride and base pointer), it gives you an optional_output
+//   pointer that you can fill, or you can just return your own pointer into
+//   your own data. 
+//
+//   You can also do conversion from non-supported data types if necessary - in 
+//   this case, you ignore the input_ptr and just use the x and y parameters to 
+//   calculate your own input_ptr based on the size of each non-supported pixel.
+//   (Something like the third example below.)
+//
+//   You can also install just an input or just an output callback by setting the
+//   callback that you don't want to zero.
+//
+//     First example, progress: (getting a callback that you can monitor the progress):
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           percentage_done = y / input_height;
+//           return input_ptr;  // use buffer from call
+//        }
+//
+//     Next example, copying: (copy from some other buffer or stream):  
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
+//           return optional_output;  // return the optional buffer that we filled
+//        }
+//
+//     Third example, input another buffer without copying: (zero-copy from other buffer):  
+//        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
+//        {
+//           void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
+//           return pixels;       // return pointer to your data without copying
+//        }
+//
+//
+//   The output callback is considerably simpler - it just calls you so that you can dump
+//   out each scanline. You could even directly copy out to disk if you have a simple format
+//   like TGA or BMP. You can also convert to other output types here if you want.
+//
+//   Simple example:
+//        void const * my_output( void * output_ptr, int num_pixels, int y, void * context )
+//        {
+//           percentage_done = y / output_height;
+//           fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file );
+//        }
+//===============================================================
+
+
+
+
+//===============================================================
+// optional built-in profiling API
+//--------------------------------
+
+#ifdef STBIR_PROFILE
+
+typedef struct STBIR_PROFILE_INFO 
+{
+  stbir_uint64 total_clocks;
+
+  // how many clocks spent (of total_clocks) in the various resize routines, along with a string description
+  //    there are "resize_count" number of zones
+  stbir_uint64 clocks[ 8 ];
+  char const ** descriptions;
+  
+  // count of clocks and descriptions
+  stbir_uint32 count;
+} STBIR_PROFILE_INFO;
+
+// use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits)
+STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
+
+// use after calling stbir_resize_extended
+STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize );
+
+// use after calling stbir_resize_extended_split
+STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num );
+
+//===============================================================
+
+#endif
+
+
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H
+
+#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION)
+
+#ifndef STBIR_ASSERT
+#include <assert.h>
+#define STBIR_ASSERT(x) assert(x)
+#endif
+
+#ifndef STBIR_MALLOC
+#include <stdlib.h>
+#define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size))
+#define STBIR_FREE(ptr,user_data)    ((void)(user_data), free(ptr))
+// (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings)
+#endif
+
+#ifdef _MSC_VER
+
+#define stbir__inline __forceinline
+
+#else
+
+#define stbir__inline __inline__
+
+// Clang address sanitizer
+#if defined(__has_feature)
+  #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer)
+    #ifndef STBIR__SEPARATE_ALLOCATIONS
+      #define STBIR__SEPARATE_ALLOCATIONS
+    #endif
+  #endif
+#endif
+
+#endif
+
+// GCC and MSVC
+#if defined(__SANITIZE_ADDRESS__)
+  #ifndef STBIR__SEPARATE_ALLOCATIONS
+    #define STBIR__SEPARATE_ALLOCATIONS
+  #endif
+#endif
+
+// Always turn off automatic FMA use - use STBIR_USE_FMA if you want.
+// Otherwise, this is a determinism disaster.
+#ifndef STBIR_DONT_CHANGE_FP_CONTRACT  // override in case you don't want this behavior
+#if defined(_MSC_VER) && !defined(__clang__)
+#if _MSC_VER > 1200
+#pragma fp_contract(off)
+#endif
+#elif defined(__GNUC__) &&  !defined(__clang__)
+#pragma GCC optimize("fp-contract=off")
+#else
+#pragma STDC FP_CONTRACT OFF
+#endif
+#endif
+
+#ifdef _MSC_VER
+#define STBIR__UNUSED(v)  (void)(v)
+#else
+#define STBIR__UNUSED(v)  (void)sizeof(v)
+#endif
+
+#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
+
+
+#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
+#define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
+#endif
+
+#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
+#define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
+#endif
+
+
+#ifndef STBIR__HEADER_FILENAME
+#define STBIR__HEADER_FILENAME "stb_image_resize2.h"
+#endif
+
+// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
+//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible 
+typedef enum 
+{
+  STBIRI_1CHANNEL = 0,
+  STBIRI_2CHANNEL = 1,
+  STBIRI_RGB      = 2,
+  STBIRI_BGR      = 3,
+  STBIRI_4CHANNEL = 4,
+  
+  STBIRI_RGBA = 5,
+  STBIRI_BGRA = 6,
+  STBIRI_ARGB = 7,
+  STBIRI_ABGR = 8,
+  STBIRI_RA   = 9,
+  STBIRI_AR   = 10,
+
+  STBIRI_RGBA_PM = 11,
+  STBIRI_BGRA_PM = 12,
+  STBIRI_ARGB_PM = 13,
+  STBIRI_ABGR_PM = 14,
+  STBIRI_RA_PM   = 15,
+  STBIRI_AR_PM   = 16,
+} stbir_internal_pixel_layout;
+
+// define the public pixel layouts to not compile inside the implementation (to avoid accidental use)
+#define STBIR_BGR bad_dont_use_in_implementation
+#define STBIR_1CHANNEL STBIR_BGR
+#define STBIR_2CHANNEL STBIR_BGR
+#define STBIR_RGB STBIR_BGR
+#define STBIR_RGBA STBIR_BGR
+#define STBIR_4CHANNEL STBIR_BGR
+#define STBIR_BGRA STBIR_BGR
+#define STBIR_ARGB STBIR_BGR
+#define STBIR_ABGR STBIR_BGR
+#define STBIR_RA STBIR_BGR
+#define STBIR_AR STBIR_BGR
+#define STBIR_RGBA_PM STBIR_BGR
+#define STBIR_BGRA_PM STBIR_BGR
+#define STBIR_ARGB_PM STBIR_BGR
+#define STBIR_ABGR_PM STBIR_BGR
+#define STBIR_RA_PM STBIR_BGR
+#define STBIR_AR_PM STBIR_BGR
+
+// must match stbir_datatype
+static unsigned char stbir__type_size[] = {
+  1,1,1,2,4,2 // STBIR_TYPE_UINT8,STBIR_TYPE_UINT8_SRGB,STBIR_TYPE_UINT8_SRGB_ALPHA,STBIR_TYPE_UINT16,STBIR_TYPE_FLOAT,STBIR_TYPE_HALF_FLOAT
+};
+
+// When gathering, the contributors are which source pixels contribute.
+// When scattering, the contributors are which destination pixels are contributed to.
+typedef struct
+{
+  int n0; // First contributing pixel
+  int n1; // Last contributing pixel
+} stbir__contributors;
+
+typedef struct
+{
+  int lowest;    // First sample index for whole filter
+  int highest;   // Last sample index for whole filter
+  int widest;    // widest single set of samples for an output
+} stbir__filter_extent_info;
+
+typedef struct
+{
+  int n0; // First pixel of decode buffer to write to
+  int n1; // Last pixel of decode that will be written to
+  int pixel_offset_for_input;  // Pixel offset into input_scanline
+} stbir__span;
+
+typedef struct stbir__scale_info
+{
+  int input_full_size;
+  int output_sub_size;
+  float scale;
+  float inv_scale;
+  float pixel_shift; // starting shift in output pixel space (in pixels)
+  int scale_is_rational;
+  stbir_uint32 scale_numerator, scale_denominator;
+} stbir__scale_info;
+
+typedef struct
+{
+  stbir__contributors * contributors;
+  float* coefficients;
+  stbir__contributors * gather_prescatter_contributors;
+  float * gather_prescatter_coefficients;
+  stbir__scale_info scale_info;
+  float support;
+  stbir_filter filter_enum;
+  stbir__kernel_callback * filter_kernel;
+  stbir__support_callback * filter_support;
+  stbir_edge edge;
+  int coefficient_width;
+  int filter_pixel_width;
+  int filter_pixel_margin;
+  int num_contributors;
+  int contributors_size;
+  int coefficients_size;
+  stbir__filter_extent_info extent_info;
+  int is_gather;  // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1
+  int gather_prescatter_num_contributors;
+  int gather_prescatter_coefficient_width;
+  int gather_prescatter_contributors_size;
+  int gather_prescatter_coefficients_size;
+} stbir__sampler;
+
+typedef struct
+{
+  stbir__contributors conservative;
+  int edge_sizes[2];    // this can be less than filter_pixel_margin, if the filter and scaling falls off
+  stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP
+} stbir__extents;
+
+typedef struct 
+{
+#ifdef STBIR_PROFILE
+  union
+  {
+    struct { stbir_uint64 total, looping, vertical, horizontal, decode, encode, alpha, unalpha; } named;
+    stbir_uint64 array[8];
+  } profile;
+  stbir_uint64 * current_zone_excluded_ptr;
+#endif
+  float* decode_buffer;
+
+  int ring_buffer_first_scanline;
+  int ring_buffer_last_scanline;
+  int ring_buffer_begin_index;    // first_scanline is at this index in the ring buffer
+  int start_output_y, end_output_y;
+  int start_input_y, end_input_y;  // used in scatter only
+
+  #ifdef STBIR__SEPARATE_ALLOCATIONS
+    float** ring_buffers; // one pointer for each ring buffer
+  #else
+    float* ring_buffer;  // one big buffer that we index into
+  #endif
+
+  float* vertical_buffer;
+
+  char no_cache_straddle[64];
+} stbir__per_split_info;
+
+typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
+typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
+typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, 
+  stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
+typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels );
+typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode );
+
+struct stbir__info
+{
+#ifdef STBIR_PROFILE
+  union
+  {
+    struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named;
+    stbir_uint64 array[7];
+  } profile;
+  stbir_uint64 * current_zone_excluded_ptr;
+#endif
+  stbir__sampler horizontal;
+  stbir__sampler vertical;
+
+  void const * input_data;
+  void * output_data;
+
+  int input_stride_bytes;
+  int output_stride_bytes;
+  int ring_buffer_length_bytes;   // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
+  int ring_buffer_num_entries;    // Total number of entries in the ring buffer.
+
+  stbir_datatype input_type;
+  stbir_datatype output_type;
+
+  stbir_input_callback * in_pixels_cb;
+  void * user_data;
+  stbir_output_callback * out_pixels_cb;
+
+  stbir__extents scanline_extents;
+
+  void * alloced_mem;
+  stbir__per_split_info * split_info;  // by default 1, but there will be N of these allocated based on the thread init you did
+
+  stbir__decode_pixels_func * decode_pixels;
+  stbir__alpha_weight_func * alpha_weight;
+  stbir__horizontal_gather_channels_func * horizontal_gather_channels;
+  stbir__alpha_unweight_func * alpha_unweight;
+  stbir__encode_pixels_func * encode_pixels;
+  
+  int alloced_total;
+  int splits; // count of splits
+  
+  stbir_internal_pixel_layout input_pixel_layout_internal;
+  stbir_internal_pixel_layout output_pixel_layout_internal;
+
+  int input_color_and_type;
+  int offset_x, offset_y; // offset within output_data
+  int vertical_first;
+  int channels;
+  int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
+  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
+};
+
+
+#define stbir__max_uint8_as_float             255.0f
+#define stbir__max_uint16_as_float            65535.0f
+#define stbir__max_uint8_as_float_inverted    (1.0f/255.0f)
+#define stbir__max_uint16_as_float_inverted   (1.0f/65535.0f)
+#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
+
+// min/max friendly
+#define STBIR_CLAMP(x, xmin, xmax) do { \
+  if ( (x) < (xmin) ) (x) = (xmin);     \
+  if ( (x) > (xmax) ) (x) = (xmax);     \
+} while (0)
+
+static stbir__inline int stbir__min(int a, int b)
+{
+  return a < b ? a : b;
+}
+
+static stbir__inline int stbir__max(int a, int b)
+{
+  return a > b ? a : b;
+}
+
+static float stbir__srgb_uchar_to_linear_float[256] = {
+  0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
+  0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
+  0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
+  0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
+  0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
+  0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
+  0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
+  0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
+  0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
+  0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
+  0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
+  0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
+  0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
+  0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
+  0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
+  0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
+  0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
+  0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
+  0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
+  0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
+  0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
+  0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
+  0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
+  0.982251f, 0.991102f, 1.0f
+};
+
+typedef union
+{
+  unsigned int u;
+  float f;
+} stbir__FP32;
+
+// From https://gist.github.com/rygorous/2203834
+
+static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
+  0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+  0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+  0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+  0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+  0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+  0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+  0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+  0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+  0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+  0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+  0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+  0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+  0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+};
+ 
+static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
+{
+  static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
+  static const stbir__FP32 minval = { (127-13) << 23 };
+  stbir_uint32 tab,bias,scale,t;
+  stbir__FP32 f;
+
+  // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
+  // The tests are carefully written so that NaNs map to 0, same as in the reference
+  // implementation.
+  if (!(in > minval.f)) // written this way to catch NaNs
+      return 0;
+  if (in > almostone.f)
+      return 255;
+
+  // Do the table lookup and unpack bias, scale
+  f.f = in;
+  tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
+  bias = (tab >> 16) << 9;
+  scale = tab & 0xffff;
+
+  // Grab next-highest mantissa bits and perform linear interpolation
+  t = (f.u >> 12) & 0xff;
+  return (unsigned char) ((bias + scale*t) >> 16);
+}
+
+#ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT
+#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
+#endif
+
+// restrict pointers for the output pointers
+#if defined( _MSC_VER ) && !defined(__clang__)
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict
+  #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
+#elif defined(  __clang__ )
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
+  #define STBIR_NO_UNROLL( ptr ) asm (""::"r"(ptr))
+#elif defined(  __GNUC__ )
+  #define STBIR_STREAMOUT_PTR( star ) star __restrict__
+  #define STBIR_NO_UNROLL( ptr ) asm (""::"r"(ptr))
+#else
+  #define STBIR_STREAMOUT_PTR( star ) star
+  #define STBIR_NO_UNROLL( ptr )
+#endif
+
+#ifdef STBIR_NO_SIMD // force simd off for whatever reason
+
+// force simd off overrides everything else, so clear it all
+
+#ifdef STBIR_SSE2
+#undef STBIR_SSE2
+#endif
+
+#ifdef STBIR_AVX
+#undef STBIR_AVX
+#endif
+
+#ifdef STBIR_NEON
+#undef STBIR_NEON
+#endif
+
+#ifdef STBIR_AVX2
+#undef STBIR_AVX2
+#endif
+
+#ifdef STBIR_FP16C
+#undef STBIR_FP16C
+#endif
+
+#ifdef STBIR_WASM
+#undef STBIR_WASM
+#endif
+
+#ifdef STBIR_SIMD
+#undef STBIR_SIMD
+#endif
+
+#else // STBIR_SIMD
+
+#ifdef STBIR_SSE2
+  #include <emmintrin.h>
+  
+  #define stbir__simdf __m128
+  #define stbir__simdi __m128i
+
+  #define stbir_simdi_castf( reg ) _mm_castps_si128(reg)
+  #define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg)
+
+  #define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) ))
+  #define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) )  // top values must be zero
+  #define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar )
+  #define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar )
+  #define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) ))
+
+  #define stbir__simdf_zeroP() _mm_setzero_ps()
+  #define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps()
+
+  #define stbir__simdf_store( ptr, reg )  _mm_storeu_ps( (float*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg )
+  #define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) )
+  #define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) )
+
+  #define stbir__simdi_store( ptr, reg )  _mm_storeu_si128( (__m128i*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) )
+  #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )
+
+  #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )
+ 
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out2 = _mm_unpacklo_epi8( ireg, zero ); \
+    out3 = _mm_unpackhi_epi8( ireg, zero ); \
+    out0 = _mm_unpacklo_epi16( out2, zero ); \
+    out1 = _mm_unpackhi_epi16( out2, zero ); \
+    out2 = _mm_unpacklo_epi16( out3, zero ); \
+    out3 = _mm_unpackhi_epi16( out3, zero ); \
+  }
+
+#define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out = _mm_unpacklo_epi8( ireg, zero ); \
+    out = _mm_unpacklo_epi16( out, zero ); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    stbir__simdi zero = _mm_setzero_si128(); \
+    out0 = _mm_unpacklo_epi16( ireg, zero ); \
+    out1 = _mm_unpackhi_epi16( ireg, zero ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f)
+  #define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f)
+  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
+  #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))
+
+  #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i) 
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
+  #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) )
+
+  #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
+  #include <immintrin.h>
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add )
+  #else
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) )
+  #endif
+
+  #define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 )
+  #define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 )
+
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) )
+
+  static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f };
+  static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f };
+  #define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) )
+  #define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) )
+  #define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones )
+  #define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros )
+
+  #define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) )
+
+  #define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 )
+  #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 )
+
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    stbir__simdf af,bf; \
+    stbir__simdi a,b; \
+    af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); \
+    bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \
+    af = _mm_max_ps( af, _mm_setzero_ps() ); \
+    bf = _mm_max_ps( bf, _mm_setzero_ps() ); \
+    a = _mm_cvttps_epi32( af ); \
+    b = _mm_cvttps_epi32( bf ); \
+    a = _mm_packs_epi32( a, b ); \
+    out = _mm_packus_epi16( a, a ); \
+  }
+
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+      stbir__simdf_load( o0, (ptr) );    \
+      stbir__simdf_load( o1, (ptr)+4 );  \
+      stbir__simdf_load( o2, (ptr)+8 );  \
+      stbir__simdf_load( o3, (ptr)+12 ); \
+      {                                  \
+        __m128 tmp0, tmp1, tmp2, tmp3;   \
+        tmp0 = _mm_unpacklo_ps(o0, o1);  \
+        tmp2 = _mm_unpacklo_ps(o2, o3);  \
+        tmp1 = _mm_unpackhi_ps(o0, o1);  \
+        tmp3 = _mm_unpackhi_ps(o2, o3);  \
+        o0 = _mm_movelh_ps(tmp0, tmp2);  \
+        o1 = _mm_movehl_ps(tmp2, tmp0);  \
+        o2 = _mm_movelh_ps(tmp1, tmp3);  \
+        o3 = _mm_movehl_ps(tmp3, tmp1);  \
+      }
+
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+      r0 = _mm_packs_epi32( r0, r1 ); \
+      r2 = _mm_packs_epi32( r2, r3 ); \
+      r1 = _mm_unpacklo_epi16( r0, r2 ); \
+      r3 = _mm_unpackhi_epi16( r0, r2 ); \
+      r0 = _mm_unpacklo_epi16( r1, r3 ); \
+      r2 = _mm_unpackhi_epi16( r1, r3 ); \
+      r0 = _mm_packus_epi16( r0, r2 ); \
+      stbir__simdi_store( ptr, r0 ); \
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm )
+
+  #if defined(_MSC_VER) && !defined(__clang__)
+    // msvc inits with 8 bytes
+    #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255)
+    #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v )
+    #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 )
+  #else
+    // everything else inits with long long's
+    #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v)))
+    #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2)))
+  #endif
+
+  #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
+  #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) }
+  #define STBIR__CONSTF(var) (var)
+  #define STBIR__CONSTI(var) (var)
+
+  #if defined(STBIR_AVX) || defined(__SSE4_1__)
+    #include <smmintrin.h>
+    #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))
+  #else
+    STBIR__SIMDI_CONST(stbir__s32_32768, 32768);
+    STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768));
+
+    #define stbir__simdf_pack_to_8words(out,reg0,reg1) \
+      { \
+        stbir__simdi tmp0,tmp1; \
+        tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
+        tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \
+        tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); \
+        tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \
+        out = _mm_packs_epi32( tmp0, tmp1 ); \
+        out = _mm_sub_epi16( out, stbir__s16_32768 ); \
+      }
+
+  #endif
+
+  #define STBIR_SIMD
+
+  // if we detect AVX, set the simd8 defines
+  #ifdef STBIR_AVX
+    #include <immintrin.h>
+    #define STBIR_SIMD8
+    #define stbir__simdf8 __m256
+    #define stbir__simdi8 __m256i
+    #define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) )
+    #define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) )
+    #define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) )
+    #define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out )
+    #define stbir__simdi8_store( ptr, reg )  _mm256_storeu_si256( (__m256i*)(ptr), reg )
+    #define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval )
+
+    #define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 )
+    #define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 )
+
+    #define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) )
+    #define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
+    #define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) )
+    #define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b )
+    #define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr )
+    #define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr )  // avx load instruction
+
+    #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
+    #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
+  
+    #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
+    #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
+    
+    #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
+
+    #ifdef STBIR_AVX2
+
+    #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
+    { \
+      stbir__simdi8 a, zero  =_mm256_setzero_si256();\
+      a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); \
+      out0 = _mm256_unpacklo_epi16( a, zero ); \
+      out1 = _mm256_unpackhi_epi16( a, zero ); \
+    }
+
+    #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
+    { \
+      stbir__simdi8 t; \
+      stbir__simdf8 af,bf; \
+      stbir__simdi8 a,b; \
+      af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
+      bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
+      af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+      bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+      a = _mm256_cvttps_epi32( af ); \
+      b = _mm256_cvttps_epi32( bf ); \
+      t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
+      out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \
+    }
+
+    #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() ); 
+  
+    #define stbir__simdf8_pack_to_16words(out,aa,bb) \
+      { \
+        stbir__simdf8 af,bf; \
+        stbir__simdi8 a,b; \
+        af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
+        bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
+        af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+        bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+        a = _mm256_cvttps_epi32( af ); \
+        b = _mm256_cvttps_epi32( bf ); \
+        (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \
+      }
+
+    #else
+
+    #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \
+    { \
+      stbir__simdi a,zero = _mm_setzero_si128(); \
+      a = _mm_unpacklo_epi8( ireg, zero ); \
+      out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
+      a = _mm_unpackhi_epi8( ireg, zero ); \
+      out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
+    }
+  
+    #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
+    { \
+      stbir__simdi t; \
+      stbir__simdf8 af,bf; \
+      stbir__simdi8 a,b; \
+      af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \
+      bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \
+      af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+      bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+      a = _mm256_cvttps_epi32( af ); \
+      b = _mm256_cvttps_epi32( bf ); \
+      out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
+      out = _mm_packus_epi16( out, out ); \
+      t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
+      t = _mm_packus_epi16( t, t ); \
+      out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \
+    }
+  
+    #define stbir__simdi8_expand_u16_to_u32(out,ireg) \
+    { \
+      stbir__simdi a,b,zero = _mm_setzero_si128(); \
+      a = _mm_unpacklo_epi16( ireg, zero ); \
+      b = _mm_unpackhi_epi16( ireg, zero ); \
+      out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); \
+    }
+
+    #define stbir__simdf8_pack_to_16words(out,aa,bb) \
+      { \
+        stbir__simdi t0,t1; \
+        stbir__simdf8 af,bf; \
+        stbir__simdi8 a,b; \
+        af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \
+        bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \
+        af = _mm256_max_ps( af, _mm256_setzero_ps() ); \
+        bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \
+        a = _mm256_cvttps_epi32( af ); \
+        b = _mm256_cvttps_epi32( bf ); \
+        t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \
+        t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \
+        out = _mm256_setr_m128i( t0, t1 ); \
+      }
+
+    #endif
+
+    static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) };
+    #define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 )
+
+    static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) };
+    #define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 )
+
+    #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 )
+
+    #define stbir__simdf8_load2( out, ptr ) (out) = _mm256_castsi256_ps(_mm256_castsi128_si256( _mm_loadl_epi64( (__m128i*)(ptr)) )) // top values can be random (not denormal or nan for perf)
+    #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) )
+
+    static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) };
+    #define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 )
+    #define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8,  _mm256_castps128_ps256( b ) )
+
+    static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i(  0x80000000,  0x80000000, 0, 0 ) };
+    #define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 )
+
+    #define stbir__simdf8_0123to00000000( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) )
+    #define stbir__simdf8_0123to11111111( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) )
+    #define stbir__simdf8_0123to22222222( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) )
+    #define stbir__simdf8_0123to33333333( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) )
+    #define stbir__simdf8_0123to21032103( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) )
+    #define stbir__simdf8_0123to32103210( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) )
+    #define stbir__simdf8_0123to12301230( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) )
+    #define stbir__simdf8_0123to10321032( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) )
+    #define stbir__simdf8_0123to30123012( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) )
+
+    #define stbir__simdf8_0123to11331133( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) )
+    #define stbir__simdf8_0123to00220022( out, in ) (out) =  _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) )
+
+    #define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) )
+    #define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) )
+    #define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
+    #define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) )
+
+    #define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps()
+
+    #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
+    #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
+    #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
+    #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( _mm256_castps128_ps256( mul ), _mm256_castps128_ps256( _mm_loadu_ps( (float const*)(ptr) ) ), add )
+    #else
+    #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
+    #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
+    #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_castps128_ps256( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) ) )
+    #endif
+    #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val )
+
+  #endif
+
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x)  // martins floorf
+  {
+    #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
+    __m128 t = _mm_set_ss(x);
+    return _mm_cvtss_f32( _mm_floor_ss(t, t) );
+    #else
+    __m128 f = _mm_set_ss(x);
+    __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
+    __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f)));
+    return _mm_cvtss_f32(r);
+    #endif
+  }
+
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x)  // martins ceilf
+  {
+    #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41)
+    __m128 t = _mm_set_ss(x);
+    return _mm_cvtss_f32( _mm_ceil_ss(t, t) );
+    #else
+    __m128 f = _mm_set_ss(x);
+    __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f));
+    __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f)));
+    return _mm_cvtss_f32(r);
+    #endif
+  }
+
+#elif defined(STBIR_NEON)
+  
+  #include <arm_neon.h>
+
+  #define stbir__simdf float32x4_t
+  #define stbir__simdi uint32x4_t
+
+  #define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg)
+  #define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg)
+
+  #define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) )
+  #define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero
+  #define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar )
+  #define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar )
+  #define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) )  // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) )
+
+  #define stbir__simdf_zeroP() vdupq_n_f32(0)
+  #define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0)
+
+  #define stbir__simdf_store( ptr, reg )  vst1q_f32( (float*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0)
+  #define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) )
+  #define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) )
+
+  #define stbir__simdi_store( ptr, reg )  vst1q_u32( (uint32_t*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 )
+  #define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) )
+
+  #define stbir__prefetch( ptr )
+
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); \
+    uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); \
+    out0 = vmovl_u16( vget_low_u16 ( l ) ); \
+    out1 = vmovl_u16( vget_high_u16( l ) ); \
+    out2 = vmovl_u16( vget_low_u16 ( h ) ); \
+    out3 = vmovl_u16( vget_high_u16( h ) ); \
+  }
+
+  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \
+    out = vmovl_u16( vget_low_u16( tmp ) ); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \
+    out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \
+    out1 = vmovl_u16( vget_high_u16( tmp ) ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
+  #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
+  #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0) 
+  #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
+  #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
+  #define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) )
+
+  #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd)
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) )
+  #else
+  #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) )
+  #endif
+
+  #define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
+  #define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 )
+
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 )
+
+  #define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0]
+  #define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0]
+
+  #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+
+    #define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3)
+    #define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0)
+
+    #if defined( _MSC_VER ) && !defined(__clang__)
+      #define stbir_make16(a,b,c,d) vcombine_u8( \
+        vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
+          ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \
+        vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \
+          ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) )
+    #else
+      #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3}
+    #endif
+
+    #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
+  
+    #define stbir__simdi_16madd( out, reg0, reg1 ) \
+    { \
+      int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
+      int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
+      int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
+      int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
+      (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); \
+    }
+
+  #else
+
+    #define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3)
+    #define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0)
+
+    #if defined( _MSC_VER ) && !defined(__clang__)
+      static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg)
+      {
+        uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } };
+        return r;
+      }
+      #define stbir_make8(a,b) vcreate_u8( \
+        (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \
+        ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) )
+    #else
+      #define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }
+      #define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3}
+    #endif
+
+    #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \
+        vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \
+        vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) )
+
+    #define stbir__simdi_16madd( out, reg0, reg1 ) \
+    { \
+      int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
+      int16x8_t r1 = vreinterpretq_s16_u32(reg1); \
+      int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \
+      int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \
+      int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); \
+      int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); \
+      (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \
+    }
+
+  #endif
+
+  #define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 )
+
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
+    float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \
+    int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); \
+    int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \
+    uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); \
+    out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); \
+  }
+
+  #define stbir__simdf_pack_to_8words(out,aa,bb) \
+  { \
+    float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
+    float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \
+    int32x4_t ai = vcvtq_s32_f32( af ); \
+    int32x4_t bi = vcvtq_s32_f32( bf ); \
+    out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); \
+  }
+
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+  { \
+    int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); \
+    int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); \
+    uint8x8x2_t out = \
+    { { \
+      vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), \
+      vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), \
+    } }; \
+    vst2_u8(ptr, out); \
+  }
+
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+  { \
+    float32x4x4_t tmp = vld4q_f32(ptr); \
+    o0 = tmp.val[0]; \
+    o1 = tmp.val[1]; \
+    o2 = tmp.val[2]; \
+    o3 = tmp.val[3]; \
+  }
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm )
+
+  #if defined( _MSC_VER ) && !defined(__clang__)
+    #define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x }
+    #define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x }
+    #define STBIR__CONSTF(var) (*(const float32x4_t*)var)
+    #define STBIR__CONSTI(var) (*(const uint32x4_t*)var)
+  #else
+    #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x }
+    #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
+    #define STBIR__CONSTF(var) (var)
+    #define STBIR__CONSTI(var) (var)
+  #endif
+
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x)
+  {
+    #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+    return vget_lane_f32( vrndm_f32( vdup_n_f32(x) ), 0);
+    #else
+    float32x2_t f = vdup_n_f32(x);
+    float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
+    uint32x2_t a = vclt_f32(f, t);
+    uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f));
+    float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
+    return vget_lane_f32(r, 0);
+    #endif
+  }
+
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x)
+  {
+    #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ )
+    return vget_lane_f32( vrndp_f32( vdup_n_f32(x) ), 0);
+    #else
+    float32x2_t f = vdup_n_f32(x);
+    float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f));
+    uint32x2_t a = vclt_f32(t, f);
+    uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f));
+    float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b)));
+    return vget_lane_f32(r, 0);
+    #endif
+  }
+
+  #define STBIR_SIMD
+
+#elif defined(STBIR_WASM)
+
+  #include <wasm_simd128.h>
+
+  #define stbir__simdf v128_t
+  #define stbir__simdi v128_t
+
+  #define stbir_simdi_castf( reg ) (reg)
+  #define stbir_simdf_casti( reg ) (reg)
+
+  #define stbir__simdf_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
+  #define stbir__simdi_load( reg, ptr )             (reg) = wasm_v128_load( (void const*)(ptr) )
+  #define stbir__simdf_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdi_load1( out, ptr )            (out) = wasm_v128_load32_splat( (void const*)(ptr) )
+  #define stbir__simdf_load1z( out, ptr )           (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero
+  #define stbir__simdf_frep4( fvar )                wasm_f32x4_splat( fvar )
+  #define stbir__simdf_load1frep4( out, fvar )      (out) = wasm_f32x4_splat( fvar )
+  #define stbir__simdf_load2( out, ptr )            (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf)
+  #define stbir__simdf_load2z( out, ptr )           (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero
+  #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 )
+
+  #define stbir__simdf_zeroP() wasm_f32x4_const_splat(0)
+  #define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0)
+
+  #define stbir__simdf_store( ptr, reg )   wasm_v128_store( (void*)(ptr), reg )
+  #define stbir__simdf_store1( ptr, reg )  wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdf_store2( ptr, reg )  wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 )
+
+  #define stbir__simdi_store( ptr, reg )  wasm_v128_store( (void*)(ptr), reg )
+  #define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 )
+  #define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 )
+
+  #define stbir__prefetch( ptr )
+
+  #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
+  { \
+    v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \
+    v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \
+    out0 = wasm_u32x4_extend_low_u16x8 ( l ); \
+    out1 = wasm_u32x4_extend_high_u16x8( l ); \
+    out2 = wasm_u32x4_extend_low_u16x8 ( h ); \
+    out3 = wasm_u32x4_extend_high_u16x8( h ); \
+  }
+
+  #define stbir__simdi_expand_u8_to_1u32(out,ireg) \
+  { \
+    v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \
+    out = wasm_u32x4_extend_low_u16x8(tmp); \
+  }
+
+  #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \
+  { \
+    out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \
+    out1 = wasm_u32x4_extend_high_u16x8( ireg ); \
+  }
+
+  #define stbir__simdf_convert_float_to_i32( i, f )    (i) = wasm_i32x4_trunc_sat_f32x4(f)
+  #define stbir__simdf_convert_float_to_int( f )       wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
+  #define stbir__simdi_to_int( i )                     wasm_i32x4_extract_lane(i, 0) 
+  #define stbir__simdf_convert_float_to_uint8( f )     ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
+  #define stbir__simdf_convert_float_to_short( f )     ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
+  #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
+  #define stbir__simdf_add( out, reg0, reg1 )          (out) = wasm_f32x4_add( reg0, reg1 )
+  #define stbir__simdf_mult( out, reg0, reg1 )         (out) = wasm_f32x4_mul( reg0, reg1 )
+  #define stbir__simdf_mult_mem( out, reg, ptr )       (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) )
+  #define stbir__simdf_mult1_mem( out, reg, ptr )      (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
+  #define stbir__simdf_add_mem( out, reg, ptr )        (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) )
+  #define stbir__simdf_add1_mem( out, reg, ptr )       (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) )
+
+  #define stbir__simdf_madd( out, add, mul1, mul2 )    (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
+  #define stbir__simdf_madd1( out, add, mul1, mul2 )   (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) )
+  #define stbir__simdf_madd_mem( out, add, mul, ptr )  (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) )
+  #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) )
+
+  #define stbir__simdf_add1( out, reg0, reg1 )  (out) = wasm_f32x4_add( reg0, reg1 )
+  #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 )
+
+  #define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 )
+  #define stbir__simdf_or( out, reg0, reg1 )  (out) = wasm_v128_or( reg0, reg1 )
+
+  #define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
+  #define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
+  #define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 )
+  #define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 )
+
+  #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 )
+  #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 )
+
+  #define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4)
+  #define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0)
+  #define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4)
+  #define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2)
+
+  #define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four)
+
+  #define stbir__simdi_and( out, reg0, reg1 )    (out) = wasm_v128_and( reg0, reg1 )
+  #define stbir__simdi_or( out, reg0, reg1 )     (out) = wasm_v128_or( reg0, reg1 )
+  #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 )
+
+  #define stbir__simdf_pack_to_8bytes(out,aa,bb) \
+  { \
+    v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
+    v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \
+    v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
+    v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
+    v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \
+    out = wasm_u8x16_narrow_i16x8( out16, out16 ); \
+  }
+
+  #define stbir__simdf_pack_to_8words(out,aa,bb) \
+  { \
+    v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
+    v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \
+    v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \
+    v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \
+    out = wasm_u16x8_narrow_i32x4( ai, bi ); \
+  }
+
+  #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \
+  { \
+    v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \
+    v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \
+    v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \
+    tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \
+    wasm_v128_store( (void*)(ptr), tmp); \
+  }
+
+  #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \
+  { \
+    v128_t t0 = wasm_v128_load( ptr    ); \
+    v128_t t1 = wasm_v128_load( ptr+4  ); \
+    v128_t t2 = wasm_v128_load( ptr+8  ); \
+    v128_t t3 = wasm_v128_load( ptr+12 ); \
+    v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); \
+    v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); \
+    v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); \
+    v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); \
+    o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); \
+    o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); \
+    o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); \
+    o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); \
+  }
+
+  #define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm )
+
+  typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16)));
+  #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x }
+  #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x }
+  #define STBIR__CONSTF(var) (var)
+  #define STBIR__CONSTI(var) (var)
+
+  #ifdef STBIR_FLOORF
+  #undef STBIR_FLOORF
+  #endif
+  #define STBIR_FLOORF stbir_simd_floorf
+  static stbir__inline float stbir_simd_floorf(float x)
+  {
+    return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0);
+  }
+
+  #ifdef STBIR_CEILF
+  #undef STBIR_CEILF
+  #endif
+  #define STBIR_CEILF stbir_simd_ceilf
+  static stbir__inline float stbir_simd_ceilf(float x)
+  {
+    return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0);
+  }
+
+  #define STBIR_SIMD
+
+#endif  // SSE2/NEON/WASM
+
+#endif // NO SIMD
+
+#ifdef STBIR_SIMD8
+  #define stbir__simdfX stbir__simdf8
+  #define stbir__simdiX stbir__simdi8
+  #define stbir__simdfX_load stbir__simdf8_load
+  #define stbir__simdiX_load stbir__simdi8_load
+  #define stbir__simdfX_mult stbir__simdf8_mult
+  #define stbir__simdfX_add_mem stbir__simdf8_add_mem
+  #define stbir__simdfX_madd_mem stbir__simdf8_madd_mem
+  #define stbir__simdfX_store stbir__simdf8_store
+  #define stbir__simdiX_store stbir__simdi8_store
+  #define stbir__simdf_frepX  stbir__simdf8_frep8
+  #define stbir__simdfX_madd stbir__simdf8_madd
+  #define stbir__simdfX_min stbir__simdf8_min
+  #define stbir__simdfX_max stbir__simdf8_max
+  #define stbir__simdfX_aaa1 stbir__simdf8_aaa1
+  #define stbir__simdfX_1aaa stbir__simdf8_1aaa
+  #define stbir__simdfX_a1a1 stbir__simdf8_a1a1
+  #define stbir__simdfX_1a1a stbir__simdf8_1a1a
+  #define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32
+  #define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words
+  #define stbir__simdfX_zero stbir__simdf8_zero
+  #define STBIR_onesX STBIR_ones8
+  #define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8
+  #define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8
+  #define STBIR_simd_point5X STBIR_simd_point58
+  #define stbir__simdfX_float_count 8
+  #define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230
+  #define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103
+  static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted };
+  static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted };
+  static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 };
+  static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 };
+  static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float };
+  static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float };
+#else
+  #define stbir__simdfX stbir__simdf
+  #define stbir__simdiX stbir__simdi
+  #define stbir__simdfX_load stbir__simdf_load
+  #define stbir__simdiX_load stbir__simdi_load
+  #define stbir__simdfX_mult stbir__simdf_mult
+  #define stbir__simdfX_add_mem stbir__simdf_add_mem
+  #define stbir__simdfX_madd_mem stbir__simdf_madd_mem
+  #define stbir__simdfX_store stbir__simdf_store
+  #define stbir__simdiX_store stbir__simdi_store
+  #define stbir__simdf_frepX  stbir__simdf_frep4
+  #define stbir__simdfX_madd stbir__simdf_madd
+  #define stbir__simdfX_min stbir__simdf_min
+  #define stbir__simdfX_max stbir__simdf_max
+  #define stbir__simdfX_aaa1 stbir__simdf_aaa1
+  #define stbir__simdfX_1aaa stbir__simdf_1aaa
+  #define stbir__simdfX_a1a1 stbir__simdf_a1a1
+  #define stbir__simdfX_1a1a stbir__simdf_1a1a
+  #define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32
+  #define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words
+  #define stbir__simdfX_zero stbir__simdf_zero
+  #define STBIR_onesX STBIR__CONSTF(STBIR_ones)
+  #define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5)
+  #define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float)
+  #define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float)
+  #define stbir__simdfX_float_count 4
+  #define stbir__if_simdf8_cast_to_simdf4( val ) ( val )
+  #define stbir__simdfX_0123to1230 stbir__simdf_0123to1230
+  #define stbir__simdfX_0123to2103 stbir__simdf_0123to2103
+#endif
+
+
+#if defined(STBIR_NEON) && !defined(_M_ARM)
+
+  #if defined( _MSC_VER ) && !defined(__clang__)
+  typedef __int16 stbir__FP16;
+  #else
+  typedef float16_t stbir__FP16;
+  #endif
+
+#else // no NEON, or 32-bit ARM for MSVC
+
+  typedef union stbir__FP16
+  {
+    unsigned short u;
+  } stbir__FP16;
+
+#endif
+
+#if !defined(STBIR_NEON) && !defined(STBIR_FP16C) || defined(STBIR_NEON) && defined(_M_ARM)
+
+  // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
+  {
+    static const stbir__FP32 magic = { (254 - 15) << 23 };
+    static const stbir__FP32 was_infnan = { (127 + 16) << 23 };
+    stbir__FP32 o;
+
+    o.u = (h.u & 0x7fff) << 13;     // exponent/mantissa bits
+    o.f *= magic.f;                 // exponent adjust
+    if (o.f >= was_infnan.f)        // make sure Inf/NaN survive
+      o.u |= 255 << 23;
+    o.u |= (h.u & 0x8000) << 16;    // sign bit
+    return o.f;
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half(float val)
+  {
+    stbir__FP32 f32infty = { 255 << 23 };
+    stbir__FP32 f16max   = { (127 + 16) << 23 };
+    stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
+    unsigned int sign_mask = 0x80000000u;
+    stbir__FP16 o = { 0 };
+    stbir__FP32 f;
+    unsigned int sign; 
+
+    f.f = val;
+    sign = f.u & sign_mask;
+    f.u ^= sign;
+
+    if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
+      o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+    else // (De)normalized number or zero
+    {
+      if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero
+      {
+        // use a magic value to align our 10 mantissa bits at the bottom of
+        // the float. as long as FP addition is round-to-nearest-even this
+        // just works.
+        f.f += denorm_magic.f;
+        // and one integer subtract of the bias later, we have our final float!
+        o.u = (unsigned short) ( f.u - denorm_magic.u );
+      }
+      else
+      {
+        unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
+        // update exponent, rounding bias part 1
+        f.u = f.u + ((15u - 127) << 23) + 0xfff;
+        // rounding bias part 2
+        f.u += mant_odd;
+        // take the bits!
+        o.u = (unsigned short) ( f.u >> 13 );
+      }
+    }
+
+    o.u |= sign >> 16;
+    return o;
+  }
+
+#endif
+
+
+#if defined(STBIR_FP16C)
+
+  #include <immintrin.h>
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) );
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) );
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
+  {
+    return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) );
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
+  {
+    stbir__FP16 h;
+    h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) );
+    return h;
+  }
+
+#elif defined(STBIR_SSE2)
+
+  // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668
+  stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input)
+  {
+    static const STBIR__SIMDI_CONST(mask_nosign,      0x7fff);
+    static const STBIR__SIMDI_CONST(smallest_normal,  0x0400);
+    static const STBIR__SIMDI_CONST(infinity,         0x7c00);
+    static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23);
+    static const STBIR__SIMDI_CONST(magic_denorm,     113 << 23);
+
+    __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) );
+    __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() );
+    __m128i mnosign     = STBIR__CONSTI(mask_nosign);
+    __m128i eadjust     = STBIR__CONSTI(expadjust_normal);
+    __m128i smallest    = STBIR__CONSTI(smallest_normal);
+    __m128i infty       = STBIR__CONSTI(infinity);
+    __m128i expmant     = _mm_and_si128(mnosign, h);
+    __m128i justsign    = _mm_xor_si128(h, expmant);
+    __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
+    __m128i b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
+    __m128i shifted     = _mm_slli_epi32(expmant, 13);
+    __m128i adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
+    __m128i adjusted    = _mm_add_epi32(eadjust, shifted);
+    __m128i den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
+    __m128i adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
+    __m128  den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
+    __m128  adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
+    __m128  adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
+    __m128  adjusted5   = _mm_or_ps(adjusted3, adjusted4);
+    __m128i sign        = _mm_slli_epi32(justsign, 16);
+    __m128  final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
+    stbir__simdf_store( output + 0,  final );
+
+    h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() );
+    expmant     = _mm_and_si128(mnosign, h);
+    justsign    = _mm_xor_si128(h, expmant);
+    b_notinfnan = _mm_cmpgt_epi32(infty, expmant);
+    b_isdenorm  = _mm_cmpgt_epi32(smallest, expmant);
+    shifted     = _mm_slli_epi32(expmant, 13);
+    adj_infnan  = _mm_andnot_si128(b_notinfnan, eadjust);
+    adjusted    = _mm_add_epi32(eadjust, shifted);
+    den1        = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm));
+    adjusted2   = _mm_add_epi32(adjusted, adj_infnan);
+    den2        = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm);
+    adjusted3   = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm));
+    adjusted4   = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2));
+    adjusted5   = _mm_or_ps(adjusted3, adjusted4);
+    sign        = _mm_slli_epi32(justsign, 16);
+    final       = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));
+    stbir__simdf_store( output + 4,  final );
+
+    // ~38 SSE2 ops for 8 values
+  }
+
+  // Fabian's round-to-nearest-even float to half
+  // ~48 SSE2 ops for 8 output
+  stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input)
+  {
+    static const STBIR__SIMDI_CONST(mask_sign,      0x80000000u);
+    static const STBIR__SIMDI_CONST(c_f16max,       (127 + 16) << 23); // all FP32 values >=this round to +inf
+    static const STBIR__SIMDI_CONST(c_nanbit,        0x200);
+    static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00);
+    static const STBIR__SIMDI_CONST(c_min_normal,    (127 - 14) << 23); // smallest FP32 that yields a normalized FP16
+    static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23);
+    static const STBIR__SIMDI_CONST(c_normal_bias,    0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding
+
+    __m128  f           =  _mm_loadu_ps(input);
+    __m128  msign       = _mm_castsi128_ps(STBIR__CONSTI(mask_sign));
+    __m128  justsign    = _mm_and_ps(msign, f);
+    __m128  absf        = _mm_xor_ps(f, justsign);
+    __m128i absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
+    __m128i f16max      = STBIR__CONSTI(c_f16max);
+    __m128  b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
+    __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
+    __m128i nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit));
+    __m128i inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
+
+    __m128i min_normal  = STBIR__CONSTI(c_min_normal);
+    __m128i b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
+
+    // "result is subnormal" path
+    __m128  subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
+    __m128i subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
+
+    // "result is normal" path
+    __m128i mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
+    __m128i mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
+
+    __m128i round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
+    __m128i round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
+    __m128i normal      = _mm_srli_epi32(round2, 13); // rounded result
+
+    // combine the two non-specials
+    __m128i nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
+
+    // merge in specials as well
+    __m128i joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
+
+    __m128i sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
+    __m128i final2, final= _mm_or_si128(joined, sign_shift);
+
+    f           =  _mm_loadu_ps(input+4);
+    justsign    = _mm_and_ps(msign, f);
+    absf        = _mm_xor_ps(f, justsign);
+    absf_int    = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit)
+    b_isnan     = _mm_cmpunord_ps(absf, absf); // is this a NaN?
+    b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special?
+    nanbit      = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit);
+    inf_or_nan  = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials
+
+    b_issub     = _mm_cmpgt_epi32(min_normal, absf_int);
+
+    // "result is subnormal" path
+    subnorm1    = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa
+    subnorm2    = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias
+
+    // "result is normal" path
+    mantoddbit  = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign
+    mantodd     = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0
+
+    round1      = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias));
+    round2      = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE)
+    normal      = _mm_srli_epi32(round2, 13); // rounded result
+
+    // combine the two non-specials
+    nonspecial  = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal));
+
+    // merge in specials as well
+    joined      = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan));
+
+    sign_shift  = _mm_srai_epi32(_mm_castps_si128(justsign), 16);
+    final2      = _mm_or_si128(joined, sign_shift);
+    final       = _mm_packs_epi32(final, final2);
+    stbir__simdi_store( output,final );
+  }
+
+#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM)) // WASM or 32-bit ARM on MSVC/clang
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    for (int i=0; i<8; i++)
+    {
+      output[i] = stbir__half_to_float(input[i]);
+    }
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    for (int i=0; i<8; i++)
+    {
+      output[i] = stbir__float_to_half(input[i]);
+    }
+  }
+
+#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang)
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    float16x4_t in0 = vld1_f16(input + 0);
+    float16x4_t in1 = vld1_f16(input + 4);
+    vst1q_f32(output + 0, vcvt_f32_f16(in0));
+    vst1q_f32(output + 4, vcvt_f32_f16(in1));
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
+    float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
+    vst1_f16(output+0, out0);
+    vst1_f16(output+4, out1);
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
+  {
+    return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0);
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
+  {
+    return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0];
+  }
+
+#elif defined(STBIR_NEON) // 64-bit ARM
+
+  static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input)
+  {
+    float16x8_t in = vld1q_f16(input);
+    vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in)));
+    vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in)));
+  }
+
+  static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input)
+  {
+    float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0));
+    float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4));
+    vst1q_f16(output, vcombine_f16(out0, out1));
+  }
+
+  static stbir__inline float stbir__half_to_float( stbir__FP16 h )
+  {
+    return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0);
+  }
+
+  static stbir__inline stbir__FP16 stbir__float_to_half( float f )
+  {
+    return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0);
+  }
+
+#endif
+
+
+#ifdef STBIR_SIMD
+
+#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 )
+#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 )
+#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 )
+#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 )
+#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 )
+#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 )
+#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 )
+#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 )
+#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 )
+#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 )
+#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 )
+#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 )
+#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 )
+#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
+#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
+#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
+#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 ) 
+#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 ) 
+#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 ) 
+#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 ) 
+
+typedef union stbir__simdi_u32
+{
+  stbir_uint32 m128i_u32[4];
+  int m128i_i32[4];
+  stbir__simdi m128i_i128;
+} stbir__simdi_u32;
+
+static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 };
+
+static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float,           stbir__max_uint8_as_float);
+static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float,          stbir__max_uint16_as_float);
+static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted,  stbir__max_uint8_as_float_inverted);
+static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted);
+
+static const STBIR__SIMDF_CONST(STBIR_simd_point5,   0.5f);
+static const STBIR__SIMDF_CONST(STBIR_ones,          1.0f);
+static const STBIR__SIMDI_CONST(STBIR_almost_zero,   (127 - 13) << 23);
+static const STBIR__SIMDI_CONST(STBIR_almost_one,    0x3f7fffff);
+static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff);
+static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
+
+//   Basically, in simd mode, we unroll the proper amount, and we don't want
+//   the non-simd remnant loops to be unroll because they only run a few times
+//   Adding this switch saves about 5K on clang which is Captain Unroll the 3rd.
+#define STBIR_SIMD_STREAMOUT_PTR( star )  STBIR_STREAMOUT_PTR( star )
+#define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr)
+
+#ifdef STBIR_MEMCPY
+#undef STBIR_MEMCPY
+#define STBIR_MEMCPY stbir_simd_memcpy
+#endif
+
+// override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
+static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) 
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
+  ptrdiff_t ofs_to_src = (char*)src - (char*)dest;
+
+  // check overlaps
+  STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) );
+
+  if ( bytes < (16*stbir__simdfX_float_count) )
+  {
+    if ( bytes < 16 )
+    {
+      if ( bytes ) 
+      {
+        do
+        {
+          STBIR_SIMD_NO_UNROLL(d);
+          d[ 0 ] = d[ ofs_to_src ];
+          ++d;
+        } while ( d < d_end );
+      }
+    }
+    else
+    {
+      stbir__simdf x;
+      // do one unaligned to get us aligned for the stream out below
+      stbir__simdf_load( x, ( d + ofs_to_src ) );
+      stbir__simdf_store( d, x );
+      d = (char*)( ( ( (ptrdiff_t)d ) + 16 ) & ~15 );
+
+      for(;;)
+      {
+        STBIR_SIMD_NO_UNROLL(d);
+
+        if ( d > ( d_end - 16 ) )
+        {
+          if ( d == d_end )
+            return;
+          d = d_end - 16;
+        }
+
+        stbir__simdf_load( x, ( d + ofs_to_src ) );
+        stbir__simdf_store( d, x );
+        d += 16;
+      }
+    }
+  }
+  else
+  {
+    stbir__simdfX x0,x1,x2,x3;
+
+    // do one unaligned to get us aligned for the stream out below
+    stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
+    stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
+    stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
+    stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
+    stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
+    stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
+    stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
+    stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
+    d = (char*)( ( ( (ptrdiff_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
+
+    for(;;)
+    {
+      STBIR_SIMD_NO_UNROLL(d);
+  
+      if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
+      {
+        if ( d == d_end )
+          return;
+        d = d_end - (16*stbir__simdfX_float_count);
+      }
+
+      stbir__simdfX_load( x0, ( d + ofs_to_src ) +  0*stbir__simdfX_float_count );
+      stbir__simdfX_load( x1, ( d + ofs_to_src ) +  4*stbir__simdfX_float_count );
+      stbir__simdfX_load( x2, ( d + ofs_to_src ) +  8*stbir__simdfX_float_count );
+      stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count );
+      stbir__simdfX_store( d +  0*stbir__simdfX_float_count, x0 );
+      stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
+      stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
+      stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
+      d += (16*stbir__simdfX_float_count);
+    }
+  }
+}
+
+// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
+//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
+//   the diff between dest and src)
+static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) 
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
+  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
+
+  if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away?
+  {
+    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15);
+    do
+    {
+      stbir__simdf x;
+      STBIR_SIMD_NO_UNROLL(sd);
+      stbir__simdf_load( x, sd );
+      stbir__simdf_store(  ( sd + ofs_to_dest ), x );
+      sd += 16;
+    } while ( sd < s_end16 );
+
+    if ( sd == s_end )
+      return;
+  }
+
+  do
+  {
+    STBIR_SIMD_NO_UNROLL(sd);
+    *(int*)( sd + ofs_to_dest ) = *(int*) sd; 
+    sd += 4;
+  } while ( sd < s_end );
+}
+
+#else // no SSE2
+
+// when in scalar mode, we let unrolling happen, so this macro just does the __restrict
+#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
+#define STBIR_SIMD_NO_UNROLL(ptr) 
+
+#endif // SSE2
+
+
+#ifdef STBIR_PROFILE
+
+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
+
+#ifdef _MSC_VER
+
+  STBIRDEF stbir_uint64 __rdtsc();
+  #define STBIR_PROFILE_FUNC() __rdtsc()
+
+#else // non msvc
+
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC() 
+  {
+    stbir_uint32 lo, hi;
+    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
+    return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
+  }
+
+#endif  // msvc
+
+#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) 
+
+#if defined( _MSC_VER ) && !defined(__clang__)
+
+  #define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT)
+
+#else
+
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
+  {
+    stbir_uint64 tsc;
+    asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
+    return tsc;
+  }
+
+#endif
+
+#else // x64, arm
+
+#error Unknown platform for profiling.
+
+#endif  //x64 and   
+
+
+#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
+#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info
+
+#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info
+#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info
+
+// super light-weight micro profiler
+#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded; 
+#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
+#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
+#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }
+
+// for thread data
+#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh )
+#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh )
+#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh )
+#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count )
+
+// for build data
+#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh )
+#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; }
+
+#else  // no profile
+
+#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO
+#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO
+
+#define STBIR_ONLY_PROFILE_BUILD_GET_INFO
+#define STBIR_ONLY_PROFILE_BUILD_SET_INFO
+
+#define STBIR_PROFILE_START( wh )
+#define STBIR_PROFILE_END( wh )
+#define STBIR_PROFILE_FIRST_START( wh )
+#define STBIR_PROFILE_CLEAR_EXTRAS( )
+
+#define STBIR_PROFILE_BUILD_START( wh ) 
+#define STBIR_PROFILE_BUILD_END( wh ) 
+#define STBIR_PROFILE_BUILD_FIRST_START( wh )
+#define STBIR_PROFILE_BUILD_CLEAR( info )
+
+#endif  // stbir_profile
+
+#ifndef STBIR_CEILF
+#include <math.h>
+#if _MSC_VER <= 1200 // support VC6 for Sean
+#define STBIR_CEILF(x) ((float)ceil((float)(x)))
+#define STBIR_FLOORF(x) ((float)floor((float)(x)))
+#else
+#define STBIR_CEILF(x) ceilf(x)
+#define STBIR_FLOORF(x) floorf(x)
+#endif
+#endif
+
+#ifndef STBIR_MEMCPY
+// For memcpy
+#include <string.h>
+#define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len )
+#endif
+
+#ifndef STBIR_SIMD
+
+// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
+//   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
+//   the diff between dest and src)
+static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) 
+{
+  char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
+  char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
+  ptrdiff_t ofs_to_dest = (char*)dest - (char*)src;
+
+  if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away?
+  {
+    char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7);
+    do
+    {
+      STBIR_NO_UNROLL(sd);
+      *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd; 
+      sd += 8;
+    } while ( sd < s_end8 );
+
+    if ( sd == s_end )
+      return;
+  }
+
+  do
+  {
+    STBIR_NO_UNROLL(sd);
+    *(int*)( sd + ofs_to_dest ) = *(int*) sd; 
+    sd += 4;
+  } while ( sd < s_end );
+}
+
+#endif
+
+static float stbir__filter_trapezoid(float x, float scale, void * user_data)
+{
+  float halfscale = scale / 2;
+  float t = 0.5f + halfscale;
+  STBIR_ASSERT(scale <= 1);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x >= t)
+    return 0.0f;
+  else
+  {
+    float r = 0.5f - halfscale;
+    if (x <= r)
+      return 1.0f;
+    else
+      return (t - x) / scale;
+  }
+}
+
+static float stbir__support_trapezoid(float scale, void * user_data)
+{
+  STBIR__UNUSED(user_data);
+  return 0.5f + scale / 2.0f;
+}
+
+static float stbir__filter_triangle(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x <= 1.0f)
+    return 1.0f - x;
+  else
+    return 0.0f;
+}
+
+static float stbir__filter_point(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(x);
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  return 1.0f;
+}
+
+static float stbir__filter_cubic(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x < 1.0f)
+    return (4.0f + x*x*(3.0f*x - 6.0f))/6.0f;
+  else if (x < 2.0f)
+    return (8.0f + x*(-12.0f + x*(6.0f - x)))/6.0f;
+
+  return (0.0f);
+}
+
+static float stbir__filter_catmullrom(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x < 1.0f)
+    return 1.0f - x*x*(2.5f - 1.5f*x);
+  else if (x < 2.0f)
+    return 2.0f - x*(4.0f + x*(0.5f*x - 2.5f));
+
+  return (0.0f);
+}
+
+static float stbir__filter_mitchell(float x, float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+
+  if ( x < 0.0f ) x = -x;
+
+  if (x < 1.0f)
+    return (16.0f + x*x*(21.0f * x - 36.0f))/18.0f;
+  else if (x < 2.0f)
+    return (32.0f + x*(-60.0f + x*(36.0f - 7.0f*x)))/18.0f;
+
+  return (0.0f);
+}
+
+static float stbir__support_zero(float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+  return 0;
+}
+
+static float stbir__support_zeropoint5(float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+  return 0.5f;
+}
+
+static float stbir__support_one(float s, void * user_data)
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+  return 1;
+}
+
+static float stbir__support_two(float s, void * user_data) 
+{
+  STBIR__UNUSED(s);
+  STBIR__UNUSED(user_data);
+  return 2;
+}
+
+// This is the maximum number of input samples that can affect an output sample
+// with the given filter from the output pixel's perspective
+static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data)
+{
+  STBIR_ASSERT(support != 0);
+
+  if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale
+    return (int)STBIR_CEILF(support(1.0f/scale,user_data) * 2.0f);
+  else
+    return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale);
+}
+
+// this is how many coefficents per run of the filter (which is different 
+//   from the filter_pixel_width depending on if we are scattering or gathering)
+static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
+{
+  float scale = samp->scale_info.scale;
+  stbir__support_callback * support = samp->filter_support;
+
+  switch( is_gather )
+  {
+    case 1:
+      return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f);
+    case 2:
+      return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale);
+    case 0:
+      return (int)STBIR_CEILF(support(scale, user_data) * 2.0f);
+    default:
+      STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) );
+      return 0;
+  }
+}
+
+static int stbir__get_contributors(stbir__sampler * samp, int is_gather)  
+{
+  if (is_gather)
+      return samp->scale_info.output_sub_size;
+  else
+      return (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2);
+}
+
+static int stbir__edge_zero_full( int n, int max )
+{
+  STBIR__UNUSED(n);
+  STBIR__UNUSED(max);
+  return 0; // NOTREACHED
+}
+
+static int stbir__edge_clamp_full( int n, int max )
+{
+  if (n < 0)
+    return 0;
+
+  if (n >= max)
+    return max - 1;
+
+  return n; // NOTREACHED
+}
+
+static int stbir__edge_reflect_full( int n, int max )
+{
+  if (n < 0)
+  {
+    if (n > -max)    
+      return -n;
+    else
+      return max - 1;
+  }
+
+  if (n >= max)
+  {
+    int max2 = max * 2;
+    if (n >= max2)
+      return 0;
+    else
+      return max2 - n - 1;
+  }
+
+  return n; // NOTREACHED
+}
+
+static int stbir__edge_wrap_full( int n, int max )
+{
+  if (n >= 0)
+    return (n % max);
+  else
+  {
+    int m = (-n) % max;
+
+    if (m != 0)
+      m = max - m;
+
+    return (m);
+  }
+}
+
+typedef int stbir__edge_wrap_func( int n, int max );
+static stbir__edge_wrap_func * stbir__edge_wrap_slow[] =
+{
+  stbir__edge_clamp_full,    // STBIR_EDGE_CLAMP
+  stbir__edge_reflect_full,  // STBIR_EDGE_REFLECT
+  stbir__edge_wrap_full,     // STBIR_EDGE_WRAP
+  stbir__edge_zero_full,     // STBIR_EDGE_ZERO
+};
+
+stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
+{
+  // avoid per-pixel switch
+  if (n >= 0 && n < max)
+      return n;
+  return stbir__edge_wrap_slow[edge]( n, max );
+}
+
+#define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16
+
+// get information on the extents of a sampler
+static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents )
+{
+  int j, stop;
+  int left_margin, right_margin;
+  int min_n = 0x7fffffff, max_n = -0x7fffffff;
+  int min_left = 0x7fffffff, max_left = -0x7fffffff;
+  int min_right = 0x7fffffff, max_right = -0x7fffffff;
+  stbir_edge edge = samp->edge;
+  stbir__contributors* contributors = samp->contributors;
+  int output_sub_size = samp->scale_info.output_sub_size;
+  int input_full_size = samp->scale_info.input_full_size;
+  int filter_pixel_margin = samp->filter_pixel_margin;
+
+  STBIR_ASSERT( samp->is_gather );
+
+  stop = output_sub_size;
+  for (j = 0; j < stop; j++ )
+  {
+    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
+    if ( contributors[j].n0 < min_n )
+    {
+      min_n = contributors[j].n0;
+      stop = j + filter_pixel_margin;  // if we find a new min, only scan another filter width
+      if ( stop > output_sub_size ) stop = output_sub_size;
+    }
+  }
+
+  stop = 0;
+  for (j = output_sub_size - 1; j >= stop; j-- )
+  {
+    STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 );
+    if ( contributors[j].n1 > max_n )
+    {
+      max_n = contributors[j].n1;
+      stop = j - filter_pixel_margin;  // if we find a new max, only scan another filter width
+      if (stop<0) stop = 0;
+    }
+  }
+
+  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
+  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );
+
+  // now calculate how much into the margins we really read
+  left_margin = 0;
+  if ( min_n < 0 )
+  {
+    left_margin = -min_n;
+    min_n = 0;
+  }
+  
+  right_margin = 0;
+  if ( max_n >= input_full_size )
+  {
+    right_margin = max_n - input_full_size + 1;
+    max_n = input_full_size - 1;
+  }
+
+  // index 1 is margin pixel extents (how many pixels we hang over the edge)
+  scanline_extents->edge_sizes[0] = left_margin;
+  scanline_extents->edge_sizes[1] = right_margin;
+
+  // index 2 is pixels read from the input
+  scanline_extents->spans[0].n0 = min_n;
+  scanline_extents->spans[0].n1 = max_n;
+  scanline_extents->spans[0].pixel_offset_for_input = min_n;
+
+  // default to no other input range
+  scanline_extents->spans[1].n0 = 0;
+  scanline_extents->spans[1].n1 = -1;
+  scanline_extents->spans[1].pixel_offset_for_input = 0;
+
+  // don't have to do edge calc for zero clamp
+  if ( edge == STBIR_EDGE_ZERO )
+    return;
+  
+  // convert margin pixels to the pixels within the input (min and max)
+  for( j = -left_margin ; j < 0 ; j++ )
+  {
+      int p = stbir__edge_wrap( edge, j, input_full_size );
+      if ( p < min_left )
+        min_left = p;
+      if ( p > max_left )
+        max_left = p;
+  }
+
+  for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ )
+  {
+      int p = stbir__edge_wrap( edge, j, input_full_size );
+      if ( p < min_right )
+        min_right = p;
+      if ( p > max_right )
+        max_right = p;
+  }
+
+  // merge the left margin pixel region if it connects within 4 pixels of main pixel region
+  if ( min_left != 0x7fffffff )
+  {
+    if ( ( ( min_left <= min_n ) && ( ( max_left  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
+         ( ( min_n <= min_left ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) )
+    {
+      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left );
+      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left );
+      scanline_extents->spans[0].pixel_offset_for_input = min_n;
+      left_margin = 0;
+    }
+  }
+
+  // merge the right margin pixel region if it connects within 4 pixels of main pixel region
+  if ( min_right != 0x7fffffff )
+  {
+    if ( ( ( min_right <= min_n ) && ( ( max_right  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) ||
+         ( ( min_n <= min_right ) && ( ( max_n  + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) )
+    {
+      scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right );
+      scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right );
+      scanline_extents->spans[0].pixel_offset_for_input = min_n;
+      right_margin = 0;
+    }
+  }
+
+  STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n );
+  STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n );
+
+  // you get two ranges when you have the WRAP edge mode and you are doing just the a piece of the resize
+  //   so you need to get a second run of pixels from the opposite side of the scanline (which you
+  //   wouldn't need except for WRAP)
+
+
+  // if we can't merge the min_left range, add it as a second range
+  if ( ( left_margin ) && ( min_left != 0x7fffffff ) )
+  {
+    stbir__span * newspan = scanline_extents->spans + 1;
+    STBIR_ASSERT( right_margin == 0 );
+    if ( min_left < scanline_extents->spans[0].n0 )
+    {
+      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
+      --newspan;
+    }
+    newspan->pixel_offset_for_input = min_left;
+    newspan->n0 = -left_margin;
+    newspan->n1 = ( max_left - min_left ) - left_margin;
+    scanline_extents->edge_sizes[0] = 0;  // don't need to copy the left margin, since we are directly decoding into the margin
+    return;
+  }
+
+  // if we can't merge the min_left range, add it as a second range
+  if ( ( right_margin ) && ( min_right != 0x7fffffff ) )
+  {
+    stbir__span * newspan = scanline_extents->spans + 1;
+    if ( min_right < scanline_extents->spans[0].n0 )
+    {
+      scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0;
+      scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1;
+      --newspan;
+    }
+    newspan->pixel_offset_for_input = min_right;
+    newspan->n0 = scanline_extents->spans[1].n1 + 1;
+    newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right );
+    scanline_extents->edge_sizes[1] = 0;  // don't need to copy the right margin, since we are directly decoding into the margin
+    return;
+  }
+}
+
+static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge )
+{
+  int first, last;
+  float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
+  float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
+
+  float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale; 
+  float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale; 
+
+  first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
+  last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
+
+  if ( edge == STBIR_EDGE_WRAP )
+  {
+    if ( first <= -input_size )
+      first = -(input_size-1);
+    if ( last >= (input_size*2))
+      last = (input_size*2) - 1;
+  }
+  
+  *first_pixel = first;
+  *last_pixel = last;
+}
+
+static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data )
+{
+  int n, end;
+  float inv_scale = scale_info->inv_scale;
+  float out_shift = scale_info->pixel_shift;
+  int input_size  = scale_info->input_full_size;
+  int numerator = scale_info->scale_numerator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
+
+  // Looping through out pixels
+  end = num_contributors; if ( polyphase ) end = numerator;
+  for (n = 0; n < end; n++)
+  {
+    int i;
+    int last_non_zero;
+    float out_pixel_center = (float)n + 0.5f;
+    float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;  
+
+    int in_first_pixel, in_last_pixel;
+    
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
+
+    last_non_zero = -1;
+    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
+    {
+      float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
+      float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data);
+
+      // kill denormals
+      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
+      {
+        if ( i == 0 )  // if we're at the front, just eat zero contributors
+        { 
+          STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
+          ++in_first_pixel;
+          i--;
+          continue;
+        }
+        coeff = 0;  // make sure is fully zero (should keep denormals away)
+      }
+      else
+        last_non_zero = i;
+      
+      coefficient_group[i] = coeff;
+    }
+    
+    in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros
+    contributors->n0 = in_first_pixel;
+    contributors->n1 = in_last_pixel;
+
+    STBIR_ASSERT(contributors->n1 >= contributors->n0);
+
+    ++contributors;
+    coefficient_group += coefficient_width;
+  }
+}
+
+static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff )
+{
+  if ( new_pixel <= contribs->n1 )  // before the end
+  {
+    if ( new_pixel < contribs->n0 ) // before the front?
+    {
+      int j, o = contribs->n0 - new_pixel;
+      for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- )
+        coeffs[ j + o ] = coeffs[ j ];
+      for ( j = 1 ; j < o ; j-- )
+        coeffs[ j ] = coeffs[ 0 ];
+      coeffs[ 0 ] = new_coeff;
+      contribs->n0 = new_pixel;
+    }
+    else
+    {
+      coeffs[ new_pixel - contribs->n0 ] += new_coeff;
+    }
+  }
+  else
+  {
+    int j, e = new_pixel - contribs->n0;
+    for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any
+      coeffs[j] = 0;
+
+    coeffs[ e ] = new_coeff;
+    contribs->n1 = new_pixel;
+  }
+}
+
+static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size )
+{
+  float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius;
+  float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius;
+  float out_pixel_influence_lowerbound = in_pixel_influence_lowerbound * scale - out_shift;
+  float out_pixel_influence_upperbound = in_pixel_influence_upperbound * scale - out_shift;
+  int out_first_pixel = (int)(STBIR_FLOORF(out_pixel_influence_lowerbound + 0.5f));
+  int out_last_pixel = (int)(STBIR_FLOORF(out_pixel_influence_upperbound - 0.5f));
+
+  if ( out_first_pixel < 0 )
+    out_first_pixel = 0;
+  if ( out_last_pixel >= out_size )
+    out_last_pixel = out_size - 1;
+  *first_pixel = out_first_pixel;
+  *last_pixel = out_last_pixel;
+}
+
+static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data )
+{
+  int in_pixel;
+  int i;
+  int first_out_inited = -1;
+  float scale = scale_info->scale;
+  float out_shift = scale_info->pixel_shift;
+  int out_size = scale_info->output_sub_size;
+  int numerator = scale_info->scale_numerator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) );
+
+  STBIR__UNUSED(num_contributors);
+
+  // Loop through the input pixels
+  for (in_pixel = start; in_pixel < end; in_pixel++)
+  {
+    float in_pixel_center = (float)in_pixel + 0.5f;
+    float out_center_of_in = in_pixel_center * scale - out_shift;
+    int out_first_pixel, out_last_pixel;
+
+    stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size );
+
+    if ( out_first_pixel > out_last_pixel )
+      continue;
+
+    // clamp or exit if we are using polyphase filtering, and the limit is up
+    if ( polyphase )
+    {
+      // when polyphase, you only have to do coeffs up to the numerator count
+      if ( out_first_pixel == numerator )
+        break;
+
+      // don't do any extra work, clamp last pixel at numerator too
+      if ( out_last_pixel >= numerator )
+        out_last_pixel = numerator - 1;
+    }
+
+    for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
+    {
+      float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
+      float x = out_pixel_center - out_center_of_in;
+      float coeff = kernel(x, scale, user_data) * scale;
+
+      // kill the coeff if it's too small (avoid denormals)
+      if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
+        coeff = 0.0f;
+
+      {
+        int out = i + out_first_pixel;
+        float * coeffs = coefficient_group + out * coefficient_width;
+        stbir__contributors * contribs = contributors + out;
+
+        // is this the first time this output pixel has been seen?  Init it.
+        if ( out > first_out_inited ) 
+        {
+          STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time
+          first_out_inited = out;
+          contribs->n0 = in_pixel;
+          contribs->n1 = in_pixel;
+          coeffs[0]  = coeff;
+        }
+        else 
+        {
+          // insert on end (always in order)
+          if ( coeffs[0] == 0.0f )  // if the first coefficent is zero, then zap it for this coeffs
+          {
+            STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos
+            contribs->n0 = in_pixel;
+          }
+          contribs->n1 = in_pixel;
+          STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width );
+          coeffs[in_pixel - contribs->n0]  = coeff;
+        }
+      }
+    }
+  }
+}
+
+static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
+{
+  int input_size = scale_info->input_full_size;
+  int input_last_n1 = input_size - 1; 
+  int n, end;
+  int lowest = 0x7fffffff;
+  int highest = -0x7fffffff;
+  int widest = -1;
+  int numerator = scale_info->scale_numerator;
+  int denominator = scale_info->scale_denominator;
+  int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) );
+  float * coeffs;
+  stbir__contributors * contribs;
+
+  // weight all the coeffs for each sample
+  coeffs = coefficient_group;
+  contribs = contributors;
+  end = num_contributors; if ( polyphase ) end = numerator;
+  for (n = 0; n < end; n++)
+  {
+    int i;
+    float filter_scale, total_filter = 0;
+    int e;
+
+    // add all contribs
+    e = contribs->n1 - contribs->n0;
+    for( i = 0 ; i <= e ; i++ )
+    {
+      total_filter += coeffs[i];
+      STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f )  ); // check for wonky weights
+    }
+
+    // rescale
+    if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) )
+    {
+      // all coeffs are extremely small, just zero it
+      contribs->n1 = contribs->n0;
+      coeffs[0] = 0.0f;
+    }
+    else
+    {
+      // if the total isn't 1.0, rescale everything
+      if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
+      {
+        filter_scale = 1.0f / total_filter;
+        // scale them all
+        for (i = 0; i <= e; i++)
+          coeffs[i] *= filter_scale;
+      }
+    }
+    ++contribs;
+    coeffs += coefficient_width;
+  }
+
+  // if we have a rational for the scale, we can exploit the polyphaseness to not calculate
+  //   most of the coefficients, so we copy them here
+  if ( polyphase )
+  {
+    stbir__contributors * prev_contribs = contributors;
+    stbir__contributors * cur_contribs = contributors + numerator;
+
+    for( n = numerator ; n < num_contributors ; n++ )
+    {
+      cur_contribs->n0 = prev_contribs->n0 + denominator;
+      cur_contribs->n1 = prev_contribs->n1 + denominator;
+      ++cur_contribs;
+      ++prev_contribs;
+    }
+    stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) );
+  }
+
+  coeffs = coefficient_group;
+  contribs = contributors;
+  for (n = 0; n < num_contributors; n++)
+  {
+    int i;
+
+    // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now)
+    if ( edge == STBIR_EDGE_ZERO )
+    {
+      // shrink the right side if necessary
+      if ( contribs->n1 > input_last_n1 )
+        contribs->n1 = input_last_n1;
+
+      // shrink the left side
+      if ( contribs->n0 < 0 )
+      {
+        int j, left, skips = 0;
+
+        skips = -contribs->n0;
+        contribs->n0 = 0;
+
+        // now move down the weights
+        left = contribs->n1 - contribs->n0 + 1;
+        if ( left > 0 )
+        {
+          for( j = 0 ; j < left ; j++ )
+            coeffs[ j ] = coeffs[ j + skips ];
+        }
+      }
+    }
+    else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
+    {
+      // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight
+      
+      // right hand side first
+      if ( contribs->n1 > input_last_n1 )
+      {
+        int start = contribs->n0;
+        int endi = contribs->n1;
+        contribs->n1 = input_last_n1;  
+        for( i = input_size; i <= endi; i++ )
+          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start] );
+      }
+
+      // now check left hand edge
+      if ( contribs->n0 < 0 )
+      {
+        int save_n0;
+        float save_n0_coeff;
+        float * c = coeffs - ( contribs->n0 + 1 );
+        
+        // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
+        for( i = -1 ; i > contribs->n0 ; i-- ) 
+          stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c-- );
+        save_n0 = contribs->n0;
+        save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
+
+        // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
+        contribs->n0 = 0;  
+        for(i = 0 ; i <= contribs->n1 ; i++ )
+          coeffs[i] = coeffs[i-save_n0];
+        
+        // now that we have shrunk down the contribs, we insert the first one safely
+        stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff );
+      }
+    }
+
+    if ( contribs->n0 <= contribs->n1 )
+    {
+      int diff = contribs->n1 - contribs->n0 + 1;
+      while ( diff && ( coeffs[ diff-1 ] == 0.0f ) )
+        --diff;
+      contribs->n1 = contribs->n0 + diff - 1;
+
+      if ( contribs->n0 <= contribs->n1 )
+      {
+        if ( contribs->n0 < lowest )
+          lowest = contribs->n0;
+        if ( contribs->n1 > highest )
+          highest = contribs->n1;
+        if ( diff > widest )
+          widest = diff;
+      }
+
+      // re-zero out unused coefficients (if any)
+      for( i = diff ; i < coefficient_width ; i++ )
+        coeffs[i] = 0.0f;
+    }
+
+    ++contribs;
+    coeffs += coefficient_width;
+  }
+  filter_info->lowest = lowest;
+  filter_info->highest = highest;
+  filter_info->widest = widest;
+}
+
+static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row_width )
+{
+  #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
+  #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
+  #ifdef STBIR_SIMD
+  #define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); }
+  #else
+  #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
+  #endif
+  if ( coefficient_width != widest )
+  {
+    float * pc = coefficents;
+    float * coeffs = coefficents;
+    float * pc_end = coefficents + num_contributors * widest;
+    switch( widest )
+    {
+      case 1:
+        do {
+          STBIR_MOVE_1( pc, coeffs );
+          ++pc;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 2:
+        do {
+          STBIR_MOVE_2( pc, coeffs );
+          pc += 2;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 3:
+        do {
+          STBIR_MOVE_2( pc, coeffs );
+          STBIR_MOVE_1( pc+2, coeffs+2 );
+          pc += 3;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 4:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          pc += 4;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 5:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_1( pc+4, coeffs+4 );
+          pc += 5;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 6:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_2( pc+4, coeffs+4 );
+          pc += 6;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 7:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_2( pc+4, coeffs+4 );
+          STBIR_MOVE_1( pc+6, coeffs+6 );
+          pc += 7;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 8:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          pc += 8;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 9:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_1( pc+8, coeffs+8 );
+          pc += 9;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 10:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_2( pc+8, coeffs+8 );
+          pc += 10;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 11:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_2( pc+8, coeffs+8 );
+          STBIR_MOVE_1( pc+10, coeffs+10 );
+          pc += 11;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      case 12:
+        do {
+          STBIR_MOVE_4( pc, coeffs );
+          STBIR_MOVE_4( pc+4, coeffs+4 );
+          STBIR_MOVE_4( pc+8, coeffs+8 );
+          pc += 12;
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+      default:
+        do {
+          float * copy_end = pc + widest - 4;
+          float * c = coeffs;
+          do {
+            STBIR_NO_UNROLL( pc );
+            STBIR_MOVE_4( pc, c );
+            pc += 4;
+            c += 4;
+          } while ( pc <= copy_end );
+          copy_end += 4;
+          while ( pc < copy_end )
+          {
+            STBIR_MOVE_1( pc, c );
+            ++pc; ++c;
+          }
+          coeffs += coefficient_width;
+        } while ( pc < pc_end );
+        break;
+    }
+  }
+
+  // some horizontal routines read one float off the end (which is then masked off), so put in a sentinal so we don't read an snan or denormal
+  coefficents[ widest * num_contributors ] = 8888.0f;
+
+  // the minimum we might read for unrolled filters widths is 12. So, we need to
+  //   make sure we never read outside the decode buffer, by possibly moving 
+  //   the sample area back into the scanline, and putting zeros weights first.
+  // we start on the right edge and check until we're well past the possible
+  //   clip area (2*widest).
+  {
+    stbir__contributors * contribs = contributors + num_contributors - 1;
+    float * coeffs = coefficents + widest * ( num_contributors - 1 );
+
+    // go until no chance of clipping (this is usually less than 8 lops)
+    while ( ( ( contribs->n0 + widest*2 ) >= row_width ) && ( contribs >= contributors ) )
+    {
+      // might we clip??
+      if ( ( contribs->n0 + widest ) > row_width )
+      {
+        int stop_range = widest;
+      
+        // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length
+        //   of this contrib n1, instead of a fixed widest amount - so calculate this
+        if ( widest > 12 )
+        {
+          int mod;
+
+          // how far will be read in the n_coeff loop (which depends on the widest count mod4);
+          mod = widest & 3;
+          stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; 
+
+          // the n_coeff loops do a minimum amount of coeffs, so factor that in!
+          if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
+        }
+
+        // now see if we still clip with the refined range
+        if ( ( contribs->n0 + stop_range ) > row_width )
+        {
+          int new_n0 = row_width - stop_range;
+          int num = contribs->n1 - contribs->n0 + 1;
+          int backup = contribs->n0 - new_n0;
+          float * from_co = coeffs + num - 1;
+          float * to_co = from_co + backup;
+
+          STBIR_ASSERT( ( new_n0 >= 0 ) && ( new_n0 < contribs->n0 ) );
+
+          // move the coeffs over
+          while( num )
+          {
+            *to_co-- = *from_co--;
+            --num;
+          }
+          // zero new positions
+          while ( to_co >= coeffs )
+            *to_co-- = 0;
+          // set new start point
+          contribs->n0 = new_n0;
+          if ( widest > 12 )
+          {
+            int mod;
+
+            // how far will be read in the n_coeff loop (which depends on the widest count mod4);
+            mod = widest & 3;
+            stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; 
+
+            // the n_coeff loops do a minimum amount of coeffs, so factor that in!
+            if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
+          }
+        }
+      }
+      --contribs;
+      coeffs -= widest;
+    }
+  }
+
+  return widest;
+  #undef STBIR_MOVE_1
+  #undef STBIR_MOVE_2
+  #undef STBIR_MOVE_4
+}
+
+static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
+{
+  int n;
+  float scale = samp->scale_info.scale;
+  stbir__kernel_callback * kernel = samp->filter_kernel;
+  stbir__support_callback * support = samp->filter_support;
+  float inv_scale = samp->scale_info.inv_scale;
+  int input_full_size = samp->scale_info.input_full_size;
+  int gather_num_contributors = samp->num_contributors;
+  stbir__contributors* gather_contributors = samp->contributors;
+  float * gather_coeffs = samp->coefficients; 
+  int gather_coefficient_width = samp->coefficient_width;
+
+  switch ( samp->is_gather )
+  {
+    case 1: // gather upsample
+    {
+      float out_pixels_radius = support(inv_scale,user_data) * scale;
+
+      stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data );
+
+      STBIR_PROFILE_BUILD_START( cleanup );
+      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
+      STBIR_PROFILE_BUILD_END( cleanup );
+    }
+    break;
+
+    case 0: // scatter downsample (only on vertical)
+    case 2: // gather downsample  
+    {
+      float in_pixels_radius = support(scale,user_data) * inv_scale;
+      int filter_pixel_margin = samp->filter_pixel_margin;
+      int input_end = input_full_size + filter_pixel_margin;
+      
+      // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
+      if ( !samp->is_gather )
+      {
+        // check if we are using the same gather downsample on the horizontal as this vertical, 
+        //   if so, then we don't have to generate them, we can just pivot from the horizontal.
+        if ( other_axis_for_pivot )
+        {
+          gather_contributors = other_axis_for_pivot->contributors;
+          gather_coeffs = other_axis_for_pivot->coefficients;
+          gather_coefficient_width = other_axis_for_pivot->coefficient_width;
+          gather_num_contributors = other_axis_for_pivot->num_contributors;
+          samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest;
+          samp->extent_info.highest = other_axis_for_pivot->extent_info.highest;
+          samp->extent_info.widest = other_axis_for_pivot->extent_info.widest;
+          goto jump_right_to_pivot;
+        }
+
+        gather_contributors = samp->gather_prescatter_contributors;
+        gather_coeffs = samp->gather_prescatter_coefficients;
+        gather_coefficient_width = samp->gather_prescatter_coefficient_width;
+        gather_num_contributors = samp->gather_prescatter_num_contributors;
+      }
+
+      stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data );
+
+      STBIR_PROFILE_BUILD_START( cleanup );
+      stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width );
+      STBIR_PROFILE_BUILD_END( cleanup );
+
+      if ( !samp->is_gather )
+      {
+        // if this is a scatter (vertical only), then we need to pivot the coeffs
+        stbir__contributors * scatter_contributors;
+        int highest_set;
+
+        jump_right_to_pivot:
+
+        STBIR_PROFILE_BUILD_START( pivot );
+
+        highest_set = (-filter_pixel_margin) - 1;
+        for (n = 0; n < gather_num_contributors; n++)
+        {
+          int k;
+          int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1;
+          int scatter_coefficient_width = samp->coefficient_width;
+          float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width;
+          float * g_coeffs = gather_coeffs;
+          scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );
+          
+          for (k = gn0 ; k <= gn1 ; k++ )
+          {
+            float gc = *g_coeffs++;
+            if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
+            {
+              {
+                // if we are skipping over several contributors, we need to clear the skipped ones
+                stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
+                while ( clear_contributors < scatter_contributors )
+                {
+                  clear_contributors->n0 = 0; 
+                  clear_contributors->n1 = -1;
+                  ++clear_contributors;
+                }
+              }
+              scatter_contributors->n0 = n;
+              scatter_contributors->n1 = n;
+              scatter_coeffs[0]  = gc;
+              highest_set = k;
+            }
+            else
+            {
+              stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
+            }
+            ++scatter_contributors;
+            scatter_coeffs += scatter_coefficient_width;
+          }
+
+          ++gather_contributors;
+          gather_coeffs += gather_coefficient_width;
+        }
+
+        // now clear any unset contribs
+        {
+          stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
+          stbir__contributors * end_contributors = samp->contributors + samp->num_contributors;
+          while ( clear_contributors < end_contributors )
+          {
+            clear_contributors->n0 = 0;
+            clear_contributors->n1 = -1;
+            ++clear_contributors;
+          }
+        }
+
+        STBIR_PROFILE_BUILD_END( pivot );
+      }
+    }
+    break;
+  }
+}
+
+
+//========================================================================================================
+// scanline decoders and encoders
+
+#define stbir__coder_min_num 1
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+#define stbir__decode_suffix BGRA
+#define stbir__decode_swizzle
+#define stbir__decode_order0  2 
+#define stbir__decode_order1  1
+#define stbir__decode_order2  0
+#define stbir__decode_order3  3
+#define stbir__encode_order0  2 
+#define stbir__encode_order1  1
+#define stbir__encode_order2  0
+#define stbir__encode_order3  3
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+#define stbir__decode_suffix ARGB
+#define stbir__decode_swizzle
+#define stbir__decode_order0  1 
+#define stbir__decode_order1  2
+#define stbir__decode_order2  3
+#define stbir__decode_order3  0
+#define stbir__encode_order0  3 
+#define stbir__encode_order1  0
+#define stbir__encode_order2  1
+#define stbir__encode_order3  2
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+#define stbir__decode_suffix ABGR
+#define stbir__decode_swizzle
+#define stbir__decode_order0  3 
+#define stbir__decode_order1  2
+#define stbir__decode_order2  1
+#define stbir__decode_order3  0
+#define stbir__encode_order0  3 
+#define stbir__encode_order1  2
+#define stbir__encode_order2  1
+#define stbir__encode_order3  0
+#define stbir__coder_min_num 4
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+#define stbir__decode_suffix AR
+#define stbir__decode_swizzle
+#define stbir__decode_order0  1 
+#define stbir__decode_order1  0 
+#define stbir__decode_order2  3
+#define stbir__decode_order3  2
+#define stbir__encode_order0  1 
+#define stbir__encode_order1  0 
+#define stbir__encode_order2  3
+#define stbir__encode_order3  2
+#define stbir__coder_min_num 2
+#define STB_IMAGE_RESIZE_DO_CODERS
+#include STBIR__HEADER_FILENAME
+
+
+// fancy alpha means we expand to keep both premultipied and non-premultiplied color channels
+static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
+  float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7;  // decode buffer aligned to end of out_buffer
+  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
+
+  // fancy alpha is stored internally as R G B A Rpm Gpm Bpm
+
+  #ifdef STBIR_SIMD
+  
+  #ifdef STBIR_SIMD8
+  decode += 16;
+  while ( decode <= end_decode )
+  {
+    stbir__simdf8 d0,d1,a0,a1,p0,p1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdf8_load( d0, decode-16 );
+    stbir__simdf8_load( d1, decode-16+8 );
+    stbir__simdf8_0123to33333333( a0, d0 );
+    stbir__simdf8_0123to33333333( a1, d1 );
+    stbir__simdf8_mult( p0, a0, d0 );
+    stbir__simdf8_mult( p1, a1, d1 );
+    stbir__simdf8_bot4s( a0, d0, p0 );
+    stbir__simdf8_bot4s( a1, d1, p1 );
+    stbir__simdf8_top4s( d0, d0, p0 );
+    stbir__simdf8_top4s( d1, d1, p1 );
+    stbir__simdf8_store ( out, a0 );
+    stbir__simdf8_store ( out+7, d0 );
+    stbir__simdf8_store ( out+14, a1 );
+    stbir__simdf8_store ( out+21, d1 );
+    decode += 16;
+    out += 28;
+  }
+  decode -= 16;
+  #else  
+  decode += 8;
+  while ( decode <= end_decode )
+  {
+    stbir__simdf d0,a0,d1,a1,p0,p1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdf_load( d0, decode-8 );
+    stbir__simdf_load( d1, decode-8+4 );
+    stbir__simdf_0123to3333( a0, d0 );
+    stbir__simdf_0123to3333( a1, d1 );
+    stbir__simdf_mult( p0, a0, d0 );
+    stbir__simdf_mult( p1, a1, d1 );
+    stbir__simdf_store ( out, d0 );
+    stbir__simdf_store ( out+4, p0 );
+    stbir__simdf_store ( out+7, d1 );
+    stbir__simdf_store ( out+7+4, p1 );
+    decode += 8;
+    out += 14;
+  }
+  decode -= 8;
+  #endif
+
+  // might be one last odd pixel
+  #ifdef STBIR_SIMD8
+  while ( decode < end_decode )
+  #else
+  if ( decode < end_decode )
+  #endif
+  {
+    stbir__simdf d,a,p;
+    stbir__simdf_load( d, decode );
+    stbir__simdf_0123to3333( a, d );
+    stbir__simdf_mult( p, a, d );
+    stbir__simdf_store ( out, d );
+    stbir__simdf_store ( out+4, p );
+    decode += 4;
+    out += 7;
+  }
+
+  #else
+
+  while( decode < end_decode )
+  {
+    float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3];
+    out[0] = r;
+    out[1] = g;
+    out[2] = b;
+    out[3] = alpha;
+    out[4] = r * alpha;
+    out[5] = g * alpha;
+    out[6] = b * alpha;
+    out += 7;
+    decode += 4;
+  }
+
+  #endif
+}
+
+static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) out = out_buffer;
+  float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3;
+  float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels;
+
+  //  for fancy alpha, turns into: [X A Xpm][X A Xpm],etc
+
+  #ifdef STBIR_SIMD
+
+  decode += 8;
+  if ( decode <= end_decode )
+  {
+    do {
+      #ifdef STBIR_SIMD8
+      stbir__simdf8 d0,a0,p0;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdf8_load( d0, decode-8 );
+      stbir__simdf8_0123to11331133( p0, d0 );
+      stbir__simdf8_0123to00220022( a0, d0 );
+      stbir__simdf8_mult( p0, p0, a0 );
+ 
+      stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
+      stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
+      stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
+      
+      stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
+      stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
+      stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
+      #else
+      stbir__simdf d0,a0,d1,a1,p0,p1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdf_load( d0, decode-8 );
+      stbir__simdf_load( d1, decode-8+4 );
+      stbir__simdf_0123to1133( p0, d0 );
+      stbir__simdf_0123to1133( p1, d1 );
+      stbir__simdf_0123to0022( a0, d0 );
+      stbir__simdf_0123to0022( a1, d1 );
+      stbir__simdf_mult( p0, p0, a0 );
+      stbir__simdf_mult( p1, p1, a1 );
+
+      stbir__simdf_store2( out, d0 );
+      stbir__simdf_store( out+2, p0 );
+      stbir__simdf_store2h( out+3, d0 );
+
+      stbir__simdf_store2( out+6, d1 );
+      stbir__simdf_store( out+8, p1 );
+      stbir__simdf_store2h( out+9, d1 );
+      #endif
+      decode += 8;
+      out += 12;
+    } while ( decode <= end_decode );
+  }
+  decode -= 8;
+  #endif
+
+  while( decode < end_decode )
+  {
+    float x = decode[0], y = decode[1];
+    STBIR_SIMD_NO_UNROLL(decode);
+    out[0] = x;
+    out[1] = y;
+    out[2] = x * y;
+    out += 3;
+    decode += 2;
+  }
+}
+
+static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm
+
+  do {
+    float alpha = input[3];
+#ifdef STBIR_SIMD
+    stbir__simdf i,ia;
+    STBIR_SIMD_NO_UNROLL(encode);
+    if ( alpha < stbir__small_float )
+    {
+      stbir__simdf_load( i, input );
+      stbir__simdf_store( encode, i );
+    }
+    else
+    {
+      stbir__simdf_load1frep4( ia, 1.0f / alpha );
+      stbir__simdf_load( i, input+4 );
+      stbir__simdf_mult( i, i, ia );
+      stbir__simdf_store( encode, i );
+      encode[3] = alpha;
+    }
+#else
+    if ( alpha < stbir__small_float )
+    {
+      encode[0] = input[0];
+      encode[1] = input[1];
+      encode[2] = input[2];
+    }
+    else
+    {
+      float ialpha = 1.0f / alpha;
+      encode[0] = input[4] * ialpha;
+      encode[1] = input[5] * ialpha;
+      encode[2] = input[6] * ialpha;
+    }
+    encode[3] = alpha;
+#endif
+
+    input += 7;
+    encode += 4;
+  } while ( encode < end_output );
+}
+
+//  format: [X A Xpm][X A Xpm] etc
+static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  do {
+    float alpha = input[1];
+    encode[0] = input[0];
+    if ( alpha >= stbir__small_float )
+      encode[0] = input[2] / alpha;
+    encode[1] = alpha;
+
+    input += 3;
+    encode += 2;
+  } while ( encode < end_output );
+}
+
+static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  {
+    decode += 2 * stbir__simdfX_float_count;
+    while ( decode <= end_decode )
+    {
+      stbir__simdfX d0,a0,d1,a1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
+      stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
+      stbir__simdfX_aaa1( a0, d0, STBIR_onesX );
+      stbir__simdfX_aaa1( a1, d1, STBIR_onesX );
+      stbir__simdfX_mult( d0, d0, a0 );
+      stbir__simdfX_mult( d1, d1, a1 );
+      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
+      stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
+      decode += 2 * stbir__simdfX_float_count;
+    }
+    decode -= 2 * stbir__simdfX_float_count;
+
+    // few last pixels remnants
+    #ifdef STBIR_SIMD8
+    while ( decode < end_decode )
+    #else
+    if ( decode < end_decode )
+    #endif
+    {
+      stbir__simdf d,a;
+      stbir__simdf_load( d, decode );
+      stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) );
+      stbir__simdf_mult( d, d, a );
+      stbir__simdf_store ( decode, d );
+      decode += 4;
+    }
+  }
+
+  #else
+
+  while( decode < end_decode )
+  {
+    float alpha = decode[3];
+    decode[0] *= alpha;
+    decode[1] *= alpha;
+    decode[2] *= alpha;
+    decode += 4;
+  }
+
+  #endif
+}
+
+static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  decode += 2 * stbir__simdfX_float_count;
+  while ( decode <= end_decode )
+  {
+    stbir__simdfX d0,a0,d1,a1;
+    STBIR_NO_UNROLL(decode);
+    stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count );
+    stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count );
+    stbir__simdfX_a1a1( a0, d0, STBIR_onesX );
+    stbir__simdfX_a1a1( a1, d1, STBIR_onesX );
+    stbir__simdfX_mult( d0, d0, a0 );
+    stbir__simdfX_mult( d1, d1, a1 );
+    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 );
+    stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 );
+    decode += 2 * stbir__simdfX_float_count;
+  }
+  decode -= 2 * stbir__simdfX_float_count;
+  #endif
+
+  while( decode < end_decode )
+  {
+    float alpha = decode[1];
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0] *= alpha;
+    decode += 2;
+  }
+}
+
+static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  do {
+    float alpha = encode[3];
+
+#ifdef STBIR_SIMD
+    stbir__simdf i,ia;
+    STBIR_SIMD_NO_UNROLL(encode);
+    if ( alpha >= stbir__small_float )
+    {
+      stbir__simdf_load1frep4( ia, 1.0f / alpha );
+      stbir__simdf_load( i, encode );
+      stbir__simdf_mult( i, i, ia );
+      stbir__simdf_store( encode, i );
+      encode[3] = alpha;
+    }
+#else
+    if ( alpha >= stbir__small_float )
+    {
+      float ialpha = 1.0f / alpha;
+      encode[0] *= ialpha;
+      encode[1] *= ialpha;
+      encode[2] *= ialpha;
+    }
+#endif
+    encode += 4;
+  } while ( encode < end_output );
+}
+
+static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels )
+{
+  float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer;
+  float const * end_output = encode_buffer + width_times_channels;
+
+  do {
+    float alpha = encode[1];
+    if ( alpha >= stbir__small_float )
+      encode[0] /= alpha;
+    encode += 2;
+  } while ( encode < end_output );
+}
+
+
+// only used in RGB->BGR or BGR->RGB
+static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels )
+{
+  float STBIR_STREAMOUT_PTR(*) decode = decode_buffer;
+  float const * end_decode = decode_buffer + width_times_channels;
+
+  decode += 12;
+  while( decode <= end_decode )
+  {
+    float t0,t1,t2,t3;
+    STBIR_NO_UNROLL(decode);
+    t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9];
+    decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11];
+    decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3;
+    decode += 12;
+  }
+  decode -= 12;
+
+  while( decode < end_decode )
+  {
+    float t = decode[0];
+    STBIR_NO_UNROLL(decode);
+    decode[0] = decode[2];
+    decode[2] = t;
+    decode += 3;
+  }
+}
+
+
+
+static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{
+  int channels = stbir_info->channels;
+  int effective_channels = stbir_info->effective_channels;
+  int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels;
+  stbir_edge edge_horizontal = stbir_info->horizontal.edge;
+  stbir_edge edge_vertical = stbir_info->vertical.edge;
+  int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
+  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (ptrdiff_t)row * (ptrdiff_t) stbir_info->input_stride_bytes;
+  stbir__span const * spans = stbir_info->scanline_extents.spans;
+  float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
+
+  // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
+  STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
+
+  do 
+  {
+    float * decode_buffer;
+    void const * input_data;
+    float * end_decode;
+    int width_times_channels;
+    int width;
+
+    if ( spans->n1 < spans->n0 )    
+      break;
+
+    width = spans->n1 + 1 - spans->n0;
+    decode_buffer = full_decode_buffer + spans->n0 * effective_channels;
+    end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels;
+    width_times_channels = width * channels;
+
+    // read directly out of input plane by default
+    input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes;
+
+    // if we have an input callback, call it to get the input data
+    if ( stbir_info->in_pixels_cb )
+    {
+      // call the callback with a temp buffer (that they can choose to use or not).  the temp is just right aligned memory in the decode_buffer itself
+      input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
+    }
+    
+    STBIR_PROFILE_START( decode );
+    // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
+    stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
+    STBIR_PROFILE_END( decode );
+
+    if (stbir_info->alpha_weight)
+    {
+      STBIR_PROFILE_START( alpha );
+      stbir_info->alpha_weight( decode_buffer, width_times_channels );
+      STBIR_PROFILE_END( alpha );
+    }
+
+    ++spans;
+  } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) );
+
+  // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage)
+  // basically the idea here is that if we have the whole scanline in memory, we don't redecode the
+  //   wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions
+  if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) )
+  {
+    // this code only runs if we're in edge_wrap, and we're doing the entire scanline
+    int e, start_x[2];
+    int input_full_size = stbir_info->horizontal.scale_info.input_full_size;
+    
+    start_x[0] = -stbir_info->scanline_extents.edge_sizes[0];  // left edge start x
+    start_x[1] =  input_full_size;                             // right edge
+
+    for( e = 0; e < 2 ; e++ )
+    {
+      // do each margin
+      int margin = stbir_info->scanline_extents.edge_sizes[e];
+      if ( margin )
+      {
+        int x = start_x[e];
+        float * marg = full_decode_buffer + x * effective_channels;
+        float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels;
+        STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) );
+      }
+    }
+  }
+}
+
+
+//=================
+// Do 1 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()          \
+    stbir__simdf tot,c;                \
+    STBIR_SIMD_NO_UNROLL(decode);      \
+    stbir__simdf_load1( c, hc );       \
+    stbir__simdf_mult1_mem( tot, c, decode ); 
+
+#define stbir__2_coeff_only()          \
+    stbir__simdf tot,c,d;              \
+    STBIR_SIMD_NO_UNROLL(decode);      \
+    stbir__simdf_load2z( c, hc );      \
+    stbir__simdf_load2( d, decode );   \
+    stbir__simdf_mult( tot, c, d );    \
+    stbir__simdf_0123to1230( c, tot ); \
+    stbir__simdf_add1( tot, tot, c );          
+
+#define stbir__3_coeff_only()                  \
+    stbir__simdf tot,c,t;                      \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc );                \
+    stbir__simdf_mult_mem( tot, c, decode );   \
+    stbir__simdf_0123to1230( c, tot );         \
+    stbir__simdf_0123to2301( t, tot );         \
+    stbir__simdf_add1( tot, tot, c );          \
+    stbir__simdf_add1( tot, tot, t );    
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store1( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#define stbir__4_coeff_start()                 \
+    stbir__simdf tot,c;                        \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc );                \
+    stbir__simdf_mult_mem( tot, c, decode );   \
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    STBIR_SIMD_NO_UNROLL(decode);              \
+    stbir__simdf_load( c, hc + (ofs) );        \
+    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) ); 
+
+#define stbir__1_coeff_remnant( ofs )          \
+    { stbir__simdf d;                          \
+    stbir__simdf_load1z( c, hc + (ofs) );      \
+    stbir__simdf_load1( d, decode + (ofs) );   \
+    stbir__simdf_madd( tot, tot, d, c ); }
+
+#define stbir__2_coeff_remnant( ofs )          \
+    { stbir__simdf d;                          \
+    stbir__simdf_load2z( c, hc+(ofs) );        \
+    stbir__simdf_load2( d, decode+(ofs) );     \
+    stbir__simdf_madd( tot, tot, d, c ); }   
+
+#define stbir__3_coeff_setup()                 \
+    stbir__simdf mask;                         \
+    stbir__simdf_load( mask, STBIR_mask + 3 );
+
+#define stbir__3_coeff_remnant( ofs )                  \
+    stbir__simdf_load( c, hc+(ofs) );                  \
+    stbir__simdf_and( c, c, mask );                    \
+    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
+
+#define stbir__store_output()                     \
+    stbir__simdf_0123to2301( c, tot );            \
+    stbir__simdf_add( tot, tot, c );              \
+    stbir__simdf_0123to1230( c, tot );            \
+    stbir__simdf_add1( tot, tot, c );             \
+    stbir__simdf_store1( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#else
+
+#define stbir__1_coeff_only()  \
+    float tot;                 \
+    tot = decode[0]*hc[0];     
+
+#define stbir__2_coeff_only()  \
+    float tot;                 \
+    tot = decode[0] * hc[0];   \
+    tot += decode[1] * hc[1];    
+
+#define stbir__3_coeff_only()  \
+    float tot;                 \
+    tot = decode[0] * hc[0];   \
+    tot += decode[1] * hc[1];  \
+    tot += decode[2] * hc[2];    
+
+#define stbir__store_output_tiny()                \
+    output[0] = tot;                              \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#define stbir__4_coeff_start()  \
+    float tot0,tot1,tot2,tot3;  \
+    tot0 = decode[0] * hc[0];   \
+    tot1 = decode[1] * hc[1];   \
+    tot2 = decode[2] * hc[2];   \
+    tot3 = decode[3] * hc[3];     
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];     \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];     \
+    tot2 += decode[2+(ofs)] * hc[2+(ofs)];     \
+    tot3 += decode[3+(ofs)] * hc[3+(ofs)];     
+
+#define stbir__1_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   
+
+#define stbir__2_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
+
+#define stbir__3_coeff_remnant( ofs )        \
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
+    tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
+    tot2 += decode[2+(ofs)] * hc[2+(ofs)];   
+
+#define stbir__store_output()                     \
+    output[0] = (tot0+tot2)+(tot1+tot3);          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 1;
+
+#endif  
+
+#define STBIR__horizontal_channels 1
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+//=================
+// Do 2 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()         \
+    stbir__simdf tot,c,d;             \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load1z( c, hc );     \
+    stbir__simdf_0123to0011( c, c );  \
+    stbir__simdf_load2( d, decode );  \
+    stbir__simdf_mult( tot, d, c ); 
+
+#define stbir__2_coeff_only()         \
+    stbir__simdf tot,c;               \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load2( c, hc );      \
+    stbir__simdf_0123to0011( c, c );  \
+    stbir__simdf_mult_mem( tot, c, decode ); 
+
+#define stbir__3_coeff_only()                \
+    stbir__simdf tot,c,cs,d;                 \
+    STBIR_SIMD_NO_UNROLL(decode);            \
+    stbir__simdf_load( cs, hc );             \
+    stbir__simdf_0123to0011( c, cs );        \
+    stbir__simdf_mult_mem( tot, c, decode ); \
+    stbir__simdf_0123to2222( c, cs );        \
+    stbir__simdf_load2z( d, decode+4 );      \
+    stbir__simdf_madd( tot, tot, d, c );   
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_0123to2301( c, tot );            \
+    stbir__simdf_add( tot, tot, c );              \
+    stbir__simdf_store2( output, tot );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                    \
+    stbir__simdf8 tot0,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                 \
+    stbir__simdf8_load4b( cs, hc );               \
+    stbir__simdf8_0123to00112233( c, cs );        \
+    stbir__simdf8_mult_mem( tot0, c, decode );
+
+#define stbir__4_coeff_continue_from_4( ofs )        \
+    STBIR_SIMD_NO_UNROLL(decode);                    \
+    stbir__simdf8_load4b( cs, hc + (ofs) );          \
+    stbir__simdf8_0123to00112233( c, cs );           \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); 
+
+#define stbir__1_coeff_remnant( ofs )                \
+    { stbir__simdf t;                                \
+    stbir__simdf_load1z( t, hc + (ofs) );            \
+    stbir__simdf_0123to0011( t, t );                 \
+    stbir__simdf_mult_mem( t, t, decode+(ofs)*2 );   \
+    stbir__simdf8_add4( tot0, tot0, t ); }
+
+#define stbir__2_coeff_remnant( ofs )                \
+    { stbir__simdf t;                                \
+    stbir__simdf_load2( t, hc + (ofs) );             \
+    stbir__simdf_0123to0011( t, t );                 \
+    stbir__simdf_mult_mem( t, t, decode+(ofs)*2 );   \
+    stbir__simdf8_add4( tot0, tot0, t ); }
+
+#define stbir__3_coeff_remnant( ofs )                \
+    { stbir__simdf8 d;                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );          \
+    stbir__simdf8_0123to00112233( c, cs );           \
+    stbir__simdf8_load6z( d, decode+(ofs)*2 );       \
+    stbir__simdf8_madd( tot0, tot0, c, d ); }               
+
+#define stbir__store_output()                     \
+    { stbir__simdf t,c;                           \
+    stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );    \
+    stbir__simdf_0123to2301( c, t );              \
+    stbir__simdf_add( t, t, c );                  \
+    stbir__simdf_store2( output, t );             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2; }
+
+#else
+
+#define stbir__4_coeff_start()                   \
+    stbir__simdf tot0,tot1,c,cs;                 \
+    STBIR_SIMD_NO_UNROLL(decode);                \
+    stbir__simdf_load( cs, hc );                 \
+    stbir__simdf_0123to0011( c, cs );            \
+    stbir__simdf_mult_mem( tot0, c, decode );    \
+    stbir__simdf_0123to2233( c, cs );            \
+    stbir__simdf_mult_mem( tot1, c, decode+4 );   
+
+#define stbir__4_coeff_continue_from_4( ofs )                \
+    STBIR_SIMD_NO_UNROLL(decode);                            \
+    stbir__simdf_load( cs, hc + (ofs) );                     \
+    stbir__simdf_0123to0011( c, cs );                        \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );  \
+    stbir__simdf_0123to2233( c, cs );                        \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );   
+
+#define stbir__1_coeff_remnant( ofs )            \
+    { stbir__simdf d;                            \
+    stbir__simdf_load1z( cs, hc + (ofs) );       \
+    stbir__simdf_0123to0011( c, cs );            \
+    stbir__simdf_load2( d, decode + (ofs) * 2 ); \
+    stbir__simdf_madd( tot0, tot0, d, c ); }
+
+#define stbir__2_coeff_remnant( ofs )                      \
+    stbir__simdf_load2( cs, hc + (ofs) );                  \
+    stbir__simdf_0123to0011( c, cs );                      \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );       
+
+#define stbir__3_coeff_remnant( ofs )                       \
+    { stbir__simdf d;                                       \
+    stbir__simdf_load( cs, hc + (ofs) );                    \
+    stbir__simdf_0123to0011( c, cs );                       \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
+    stbir__simdf_0123to2222( c, cs );                       \
+    stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 );       \
+    stbir__simdf_madd( tot1, tot1, d, c ); }  
+
+#define stbir__store_output()                     \
+    stbir__simdf_add( tot0, tot0, tot1 );         \
+    stbir__simdf_0123to2301( c, tot0 );           \
+    stbir__simdf_add( tot0, tot0, c );            \
+    stbir__simdf_store2( output, tot0 );          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;     
+
+#define stbir__2_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;        \
+    c = hc[1];                 \
+    tota += decode[2]*c;       \
+    totb += decode[3]*c;     
+
+// this weird order of add matches the simd
+#define stbir__3_coeff_only()  \
+    float tota,totb,c;         \
+    c = hc[0];                 \
+    tota = decode[0]*c;        \
+    totb = decode[1]*c;        \
+    c = hc[2];                 \
+    tota += decode[4]*c;       \
+    totb += decode[5]*c;       \
+    c = hc[1];                 \
+    tota += decode[2]*c;       \
+    totb += decode[3]*c;     
+
+#define stbir__store_output_tiny()                \
+    output[0] = tota;                             \
+    output[1] = totb;                             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#define stbir__4_coeff_start()      \
+    float tota0,tota1,tota2,tota3,totb0,totb1,totb2,totb3,c;  \
+    c = hc[0];                      \
+    tota0 = decode[0]*c;            \
+    totb0 = decode[1]*c;            \
+    c = hc[1];                      \
+    tota1 = decode[2]*c;            \
+    totb1 = decode[3]*c;            \
+    c = hc[2];                      \
+    tota2 = decode[4]*c;            \
+    totb2 = decode[5]*c;            \
+    c = hc[3];                      \
+    tota3 = decode[6]*c;            \
+    totb3 = decode[7]*c;     
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    c = hc[0+(ofs)];                           \
+    tota0 += decode[0+(ofs)*2]*c;              \
+    totb0 += decode[1+(ofs)*2]*c;              \
+    c = hc[1+(ofs)];                           \
+    tota1 += decode[2+(ofs)*2]*c;              \
+    totb1 += decode[3+(ofs)*2]*c;              \
+    c = hc[2+(ofs)];                           \
+    tota2 += decode[4+(ofs)*2]*c;              \
+    totb2 += decode[5+(ofs)*2]*c;              \
+    c = hc[3+(ofs)];                           \
+    tota3 += decode[6+(ofs)*2]*c;              \
+    totb3 += decode[7+(ofs)*2]*c;     
+
+#define stbir__1_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;   
+
+#define stbir__2_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;    \
+    c = hc[1+(ofs)];                   \
+    tota1 += decode[2+(ofs)*2] * c;    \
+    totb1 += decode[3+(ofs)*2] * c;   
+
+#define stbir__3_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*2] * c;    \
+    totb0 += decode[1+(ofs)*2] * c;    \
+    c = hc[1+(ofs)];                   \
+    tota1 += decode[2+(ofs)*2] * c;    \
+    totb1 += decode[3+(ofs)*2] * c;    \
+    c = hc[2+(ofs)];                   \
+    tota2 += decode[4+(ofs)*2] * c;    \
+    totb2 += decode[5+(ofs)*2] * c;    
+
+#define stbir__store_output()                     \
+    output[0] = (tota0+tota2)+(tota1+tota3);      \
+    output[1] = (totb0+totb2)+(totb1+totb3);      \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 2;
+
+#endif  
+
+#define STBIR__horizontal_channels 2
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+//=================
+// Do 3 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()         \
+    stbir__simdf tot,c,d;             \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load1z( c, hc );     \
+    stbir__simdf_0123to0001( c, c );  \
+    stbir__simdf_load( d, decode );   \
+    stbir__simdf_mult( tot, d, c ); 
+
+#define stbir__2_coeff_only()         \
+    stbir__simdf tot,c,cs,d;          \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    stbir__simdf_load2( cs, hc );     \
+    stbir__simdf_0123to0000( c, cs ); \
+    stbir__simdf_load( d, decode );   \
+    stbir__simdf_mult( tot, d, c );   \
+    stbir__simdf_0123to1111( c, cs ); \
+    stbir__simdf_load( d, decode+3 ); \
+    stbir__simdf_madd( tot, tot, d, c ); 
+
+#define stbir__3_coeff_only()            \
+    stbir__simdf tot,c,d,cs;             \
+    STBIR_SIMD_NO_UNROLL(decode);        \
+    stbir__simdf_load( cs, hc );         \
+    stbir__simdf_0123to0000( c, cs );    \
+    stbir__simdf_load( d, decode );      \
+    stbir__simdf_mult( tot, d, c );      \
+    stbir__simdf_0123to1111( c, cs );    \
+    stbir__simdf_load( d, decode+3 );    \
+    stbir__simdf_madd( tot, tot, d, c ); \
+    stbir__simdf_0123to2222( c, cs );    \
+    stbir__simdf_load( d, decode+6 );    \
+    stbir__simdf_madd( tot, tot, d, c ); 
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store2( output, tot );           \
+    stbir__simdf_0123to2301( tot, tot );          \
+    stbir__simdf_store1( output+2, tot );         \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 3;
+
+#ifdef STBIR_SIMD8
+
+// we're loading from the XXXYYY decode by -1 to get the XXXYYY into different halves of the AVX reg fyi
+#define stbir__4_coeff_start()                     \
+    stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t;  \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00001111( c, cs );         \
+    stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
+    stbir__simdf8_0123to22223333( c, cs );         \
+    stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );    
+
+#define stbir__4_coeff_continue_from_4( ofs )      \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc + (ofs) );        \
+    stbir__simdf8_0123to00001111( c, cs );         \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
+    stbir__simdf8_0123to22223333( c, cs );         \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );    
+
+#define stbir__1_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load1rep4( t, hc + (ofs) );                   \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 ); 
+
+#define stbir__2_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
+    stbir__simdf8_0123to22223333( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );   
+ 
+ #define stbir__3_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                                \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                      \
+    stbir__simdf8_0123to00001111( c, cs );                       \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
+    stbir__simdf8_0123to2222( t, cs );                           \
+    stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 ); 
+
+#define stbir__store_output()                       \
+    stbir__simdf8_add( tot0, tot0, tot1 );          \
+    stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \
+    stbir__simdf8_add4halves( t, t, tot0 );         \
+    horizontal_coefficients += coefficient_width;   \
+    ++horizontal_contributors;                      \
+    output += 3;                                    \
+    if ( output < output_end )                      \
+    {                                               \
+      stbir__simdf_store( output-3, t );            \
+      continue;                                     \
+    }                                               \
+    { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \
+    stbir__simdf_store2( output-3, t );             \
+    stbir__simdf_store1( output+2-3, tt ); }        \
+    break;
+
+
+#else
+
+#define stbir__4_coeff_start()                  \
+    stbir__simdf tot0,tot1,tot2,c,cs;           \
+    STBIR_SIMD_NO_UNROLL(decode);               \
+    stbir__simdf_load( cs, hc );                \
+    stbir__simdf_0123to0001( c, cs );           \
+    stbir__simdf_mult_mem( tot0, c, decode );   \
+    stbir__simdf_0123to1122( c, cs );           \
+    stbir__simdf_mult_mem( tot1, c, decode+4 ); \
+    stbir__simdf_0123to2333( c, cs );           \
+    stbir__simdf_mult_mem( tot2, c, decode+8 ); 
+
+#define stbir__4_coeff_continue_from_4( ofs )                 \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load( cs, hc + (ofs) );                      \
+    stbir__simdf_0123to0001( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
+    stbir__simdf_0123to1122( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
+    stbir__simdf_0123to2333( c, cs );                         \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );   
+
+#define stbir__1_coeff_remnant( ofs )         \
+    STBIR_SIMD_NO_UNROLL(decode);             \
+    stbir__simdf_load1z( c, hc + (ofs) );     \
+    stbir__simdf_0123to0001( c, c );          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   
+
+#define stbir__2_coeff_remnant( ofs )                       \
+    { stbir__simdf d;                                       \
+    STBIR_SIMD_NO_UNROLL(decode);                           \
+    stbir__simdf_load2z( cs, hc + (ofs) );                  \
+    stbir__simdf_0123to0001( c, cs );                       \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
+    stbir__simdf_0123to1122( c, cs );                       \
+    stbir__simdf_load2z( d, decode+(ofs)*3+4 );             \
+    stbir__simdf_madd( tot1, tot1, c, d ); }                 
+
+#define stbir__3_coeff_remnant( ofs )                         \
+    { stbir__simdf d;                                         \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load( cs, hc + (ofs) );                      \
+    stbir__simdf_0123to0001( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   \
+    stbir__simdf_0123to1122( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
+    stbir__simdf_0123to2222( c, cs );                         \
+    stbir__simdf_load1z( d, decode+(ofs)*3+8 );               \
+    stbir__simdf_madd( tot2, tot2, c, d );  }                
+
+#define stbir__store_output()                       \
+    stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 );   \
+    stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 );  \
+    stbir__simdf_0123to1230( tot2, tot2 );          \
+    stbir__simdf_add( tot0, tot0, cs );             \
+    stbir__simdf_add( c, c, tot2 );                 \
+    stbir__simdf_add( tot0, tot0, c );              \
+    horizontal_coefficients += coefficient_width;   \
+    ++horizontal_contributors;                      \
+    output += 3;                                    \
+    if ( output < output_end )                      \
+    {                                               \
+      stbir__simdf_store( output-3, tot0 );         \
+      continue;                                     \
+    }                                               \
+    stbir__simdf_0123to2301( tot1, tot0 );          \
+    stbir__simdf_store2( output-3, tot0 );          \
+    stbir__simdf_store1( output+2-3, tot1 );        \
+    break;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;              
+
+#define stbir__2_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;        \
+    c = hc[1];                 \
+    tot0 += decode[3]*c;       \
+    tot1 += decode[4]*c;       \
+    tot2 += decode[5]*c;              
+
+#define stbir__3_coeff_only()  \
+    float tot0, tot1, tot2, c; \
+    c = hc[0];                 \
+    tot0 = decode[0]*c;        \
+    tot1 = decode[1]*c;        \
+    tot2 = decode[2]*c;        \
+    c = hc[1];                 \
+    tot0 += decode[3]*c;       \
+    tot1 += decode[4]*c;       \
+    tot2 += decode[5]*c;       \
+    c = hc[2];                 \
+    tot0 += decode[6]*c;       \
+    tot1 += decode[7]*c;       \
+    tot2 += decode[8]*c;              
+
+#define stbir__store_output_tiny()                \
+    output[0] = tot0;                             \
+    output[1] = tot1;                             \
+    output[2] = tot2;                             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 3;
+
+#define stbir__4_coeff_start()      \
+    float tota0,tota1,tota2,totb0,totb1,totb2,totc0,totc1,totc2,totd0,totd1,totd2,c;  \
+    c = hc[0];                      \
+    tota0 = decode[0]*c;            \
+    tota1 = decode[1]*c;            \
+    tota2 = decode[2]*c;            \
+    c = hc[1];                      \
+    totb0 = decode[3]*c;            \
+    totb1 = decode[4]*c;            \
+    totb2 = decode[5]*c;            \
+    c = hc[2];                      \
+    totc0 = decode[6]*c;            \
+    totc1 = decode[7]*c;            \
+    totc2 = decode[8]*c;            \
+    c = hc[3];                      \
+    totd0 = decode[9]*c;            \
+    totd1 = decode[10]*c;           \
+    totd2 = decode[11]*c;            
+
+#define stbir__4_coeff_continue_from_4( ofs )  \
+    c = hc[0+(ofs)];                           \
+    tota0 += decode[0+(ofs)*3]*c;              \
+    tota1 += decode[1+(ofs)*3]*c;              \
+    tota2 += decode[2+(ofs)*3]*c;              \
+    c = hc[1+(ofs)];                           \
+    totb0 += decode[3+(ofs)*3]*c;              \
+    totb1 += decode[4+(ofs)*3]*c;              \
+    totb2 += decode[5+(ofs)*3]*c;              \
+    c = hc[2+(ofs)];                           \
+    totc0 += decode[6+(ofs)*3]*c;              \
+    totc1 += decode[7+(ofs)*3]*c;              \
+    totc2 += decode[8+(ofs)*3]*c;              \
+    c = hc[3+(ofs)];                           \
+    totd0 += decode[9+(ofs)*3]*c;              \
+    totd1 += decode[10+(ofs)*3]*c;             \
+    totd2 += decode[11+(ofs)*3]*c;              
+
+#define stbir__1_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*3]*c;      \
+    tota1 += decode[1+(ofs)*3]*c;      \
+    tota2 += decode[2+(ofs)*3]*c;
+
+#define stbir__2_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*3]*c;      \
+    tota1 += decode[1+(ofs)*3]*c;      \
+    tota2 += decode[2+(ofs)*3]*c;      \
+    c = hc[1+(ofs)];                   \
+    totb0 += decode[3+(ofs)*3]*c;      \
+    totb1 += decode[4+(ofs)*3]*c;      \
+    totb2 += decode[5+(ofs)*3]*c;      \
+
+#define stbir__3_coeff_remnant( ofs )  \
+    c = hc[0+(ofs)];                   \
+    tota0 += decode[0+(ofs)*3]*c;      \
+    tota1 += decode[1+(ofs)*3]*c;      \
+    tota2 += decode[2+(ofs)*3]*c;      \
+    c = hc[1+(ofs)];                   \
+    totb0 += decode[3+(ofs)*3]*c;      \
+    totb1 += decode[4+(ofs)*3]*c;      \
+    totb2 += decode[5+(ofs)*3]*c;      \
+    c = hc[2+(ofs)];                   \
+    totc0 += decode[6+(ofs)*3]*c;      \
+    totc1 += decode[7+(ofs)*3]*c;      \
+    totc2 += decode[8+(ofs)*3]*c;              
+
+#define stbir__store_output()                     \
+    output[0] = (tota0+totc0)+(totb0+totd0);      \
+    output[1] = (tota1+totc1)+(totb1+totd1);      \
+    output[2] = (tota2+totc2)+(totb2+totd2);      \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 3;
+
+#endif  
+
+#define STBIR__horizontal_channels 3
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+//=================
+// Do 4 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()             \
+    stbir__simdf tot,c;                   \
+    STBIR_SIMD_NO_UNROLL(decode);         \
+    stbir__simdf_load1( c, hc );          \
+    stbir__simdf_0123to0000( c, c );      \
+    stbir__simdf_mult_mem( tot, c, decode ); 
+
+#define stbir__2_coeff_only()                       \
+    stbir__simdf tot,c,cs;                          \
+    STBIR_SIMD_NO_UNROLL(decode);                   \
+    stbir__simdf_load2( cs, hc );                   \
+    stbir__simdf_0123to0000( c, cs );               \
+    stbir__simdf_mult_mem( tot, c, decode );        \
+    stbir__simdf_0123to1111( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+4 ); 
+
+#define stbir__3_coeff_only()                       \
+    stbir__simdf tot,c,cs;                          \
+    STBIR_SIMD_NO_UNROLL(decode);                   \
+    stbir__simdf_load( cs, hc );                    \
+    stbir__simdf_0123to0000( c, cs );               \
+    stbir__simdf_mult_mem( tot, c, decode );        \
+    stbir__simdf_0123to1111( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \
+    stbir__simdf_0123to2222( c, cs );               \
+    stbir__simdf_madd_mem( tot, tot, c, decode+8 ); 
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store( output, tot );            \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;
+
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                     \
+    stbir__simdf8 tot0,c,cs; stbir__simdf t;  \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00001111( c, cs );         \
+    stbir__simdf8_mult_mem( tot0, c, decode );     \
+    stbir__simdf8_0123to22223333( c, cs );         \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );    
+
+#define stbir__4_coeff_continue_from_4( ofs )                  \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                    \
+    stbir__simdf8_0123to00001111( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
+    stbir__simdf8_0123to22223333( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );    
+
+#define stbir__1_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load1rep4( t, hc + (ofs) );                   \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 ); 
+
+#define stbir__2_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
+    stbir__simdf8_0123to22223333( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   
+ 
+ #define stbir__3_coeff_remnant( ofs )                         \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                    \
+    stbir__simdf8_0123to00001111( c, cs );                     \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
+    stbir__simdf8_0123to2222( t, cs );                         \
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 ); 
+
+#define stbir__store_output()                      \
+    stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );     \
+    stbir__simdf_store( output, t );               \
+    horizontal_coefficients += coefficient_width;  \
+    ++horizontal_contributors;                     \
+    output += 4;
+
+#else    
+
+#define stbir__4_coeff_start()                        \
+    stbir__simdf tot0,tot1,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                     \
+    stbir__simdf_load( cs, hc );                      \
+    stbir__simdf_0123to0000( c, cs );                 \
+    stbir__simdf_mult_mem( tot0, c, decode );         \
+    stbir__simdf_0123to1111( c, cs );                 \
+    stbir__simdf_mult_mem( tot1, c, decode+4 );       \
+    stbir__simdf_0123to2222( c, cs );                 \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
+    stbir__simdf_0123to3333( c, cs );                 \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+12 ); 
+
+#define stbir__4_coeff_continue_from_4( ofs )                  \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load( cs, hc + (ofs) );                       \
+    stbir__simdf_0123to0000( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
+    stbir__simdf_0123to1111( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
+    stbir__simdf_0123to2222( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );  \
+    stbir__simdf_0123to3333( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 ); 
+
+#define stbir__1_coeff_remnant( ofs )                       \
+    STBIR_SIMD_NO_UNROLL(decode);                           \
+    stbir__simdf_load1( c, hc + (ofs) );                    \
+    stbir__simdf_0123to0000( c, c );                        \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); 
+
+#define stbir__2_coeff_remnant( ofs )                         \
+    STBIR_SIMD_NO_UNROLL(decode);                             \
+    stbir__simdf_load2( cs, hc + (ofs) );                     \
+    stbir__simdf_0123to0000( c, cs );                         \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
+    stbir__simdf_0123to1111( c, cs );                         \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); 
+  
+#define stbir__3_coeff_remnant( ofs )                          \
+    STBIR_SIMD_NO_UNROLL(decode);                              \
+    stbir__simdf_load( cs, hc + (ofs) );                       \
+    stbir__simdf_0123to0000( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );    \
+    stbir__simdf_0123to1111( c, cs );                          \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );  \
+    stbir__simdf_0123to2222( c, cs );                          \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
+
+#define stbir__store_output()                     \
+    stbir__simdf_add( tot0, tot0, tot1 );         \
+    stbir__simdf_store( output, tot0 );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()         \
+    float p0,p1,p2,p3,c;              \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    p0 = decode[0] * c;               \
+    p1 = decode[1] * c;               \
+    p2 = decode[2] * c;               \
+    p3 = decode[3] * c;
+
+#define stbir__2_coeff_only()         \
+    float p0,p1,p2,p3,c;              \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    p0 = decode[0] * c;               \
+    p1 = decode[1] * c;               \
+    p2 = decode[2] * c;               \
+    p3 = decode[3] * c;               \
+    c = hc[1];                        \
+    p0 += decode[4] * c;              \
+    p1 += decode[5] * c;              \
+    p2 += decode[6] * c;              \
+    p3 += decode[7] * c;
+
+#define stbir__3_coeff_only()         \
+    float p0,p1,p2,p3,c;              \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    p0 = decode[0] * c;               \
+    p1 = decode[1] * c;               \
+    p2 = decode[2] * c;               \
+    p3 = decode[3] * c;               \
+    c = hc[1];                        \
+    p0 += decode[4] * c;              \
+    p1 += decode[5] * c;              \
+    p2 += decode[6] * c;              \
+    p3 += decode[7] * c;              \
+    c = hc[2];                        \
+    p0 += decode[8] * c;              \
+    p1 += decode[9] * c;              \
+    p2 += decode[10] * c;             \
+    p3 += decode[11] * c;
+
+#define stbir__store_output_tiny()                \
+    output[0] = p0;                               \
+    output[1] = p1;                               \
+    output[2] = p2;                               \
+    output[3] = p3;                               \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;
+
+#define stbir__4_coeff_start()        \
+    float x0,x1,x2,x3,y0,y1,y2,y3,c;  \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0];                        \
+    x0 = decode[0] * c;               \
+    x1 = decode[1] * c;               \
+    x2 = decode[2] * c;               \
+    x3 = decode[3] * c;               \
+    c = hc[1];                        \
+    y0 = decode[4] * c;               \
+    y1 = decode[5] * c;               \
+    y2 = decode[6] * c;               \
+    y3 = decode[7] * c;               \
+    c = hc[2];                        \
+    x0 += decode[8] * c;              \
+    x1 += decode[9] * c;              \
+    x2 += decode[10] * c;             \
+    x3 += decode[11] * c;             \
+    c = hc[3];                        \
+    y0 += decode[12] * c;             \
+    y1 += decode[13] * c;             \
+    y2 += decode[14] * c;             \
+    y3 += decode[15] * c;
+
+#define stbir__4_coeff_continue_from_4( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0+(ofs)];                  \
+    x0 += decode[0+(ofs)*4] * c;      \
+    x1 += decode[1+(ofs)*4] * c;      \
+    x2 += decode[2+(ofs)*4] * c;      \
+    x3 += decode[3+(ofs)*4] * c;      \
+    c = hc[1+(ofs)];                  \
+    y0 += decode[4+(ofs)*4] * c;      \
+    y1 += decode[5+(ofs)*4] * c;      \
+    y2 += decode[6+(ofs)*4] * c;      \
+    y3 += decode[7+(ofs)*4] * c;      \
+    c = hc[2+(ofs)];                  \
+    x0 += decode[8+(ofs)*4] * c;      \
+    x1 += decode[9+(ofs)*4] * c;      \
+    x2 += decode[10+(ofs)*4] * c;     \
+    x3 += decode[11+(ofs)*4] * c;     \
+    c = hc[3+(ofs)];                  \
+    y0 += decode[12+(ofs)*4] * c;     \
+    y1 += decode[13+(ofs)*4] * c;     \
+    y2 += decode[14+(ofs)*4] * c;     \
+    y3 += decode[15+(ofs)*4] * c;
+
+#define stbir__1_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0+(ofs)];                  \
+    x0 += decode[0+(ofs)*4] * c;      \
+    x1 += decode[1+(ofs)*4] * c;      \
+    x2 += decode[2+(ofs)*4] * c;      \
+    x3 += decode[3+(ofs)*4] * c;      
+
+#define stbir__2_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0+(ofs)];                  \
+    x0 += decode[0+(ofs)*4] * c;      \
+    x1 += decode[1+(ofs)*4] * c;      \
+    x2 += decode[2+(ofs)*4] * c;      \
+    x3 += decode[3+(ofs)*4] * c;      \
+    c = hc[1+(ofs)];                  \
+    y0 += decode[4+(ofs)*4] * c;      \
+    y1 += decode[5+(ofs)*4] * c;      \
+    y2 += decode[6+(ofs)*4] * c;      \
+    y3 += decode[7+(ofs)*4] * c;    
+  
+#define stbir__3_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);     \
+    c = hc[0+(ofs)];                  \
+    x0 += decode[0+(ofs)*4] * c;      \
+    x1 += decode[1+(ofs)*4] * c;      \
+    x2 += decode[2+(ofs)*4] * c;      \
+    x3 += decode[3+(ofs)*4] * c;      \
+    c = hc[1+(ofs)];                  \
+    y0 += decode[4+(ofs)*4] * c;      \
+    y1 += decode[5+(ofs)*4] * c;      \
+    y2 += decode[6+(ofs)*4] * c;      \
+    y3 += decode[7+(ofs)*4] * c;      \
+    c = hc[2+(ofs)];                  \
+    x0 += decode[8+(ofs)*4] * c;      \
+    x1 += decode[9+(ofs)*4] * c;      \
+    x2 += decode[10+(ofs)*4] * c;     \
+    x3 += decode[11+(ofs)*4] * c;     
+
+#define stbir__store_output()                     \
+    output[0] = x0 + y0;                          \
+    output[1] = x1 + y1;                          \
+    output[2] = x2 + y2;                          \
+    output[3] = x3 + y3;                          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 4;
+
+#endif  
+
+#define STBIR__horizontal_channels 4
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+
+//=================
+// Do 7 channel horizontal routines
+
+#ifdef STBIR_SIMD
+
+#define stbir__1_coeff_only()                   \
+    stbir__simdf tot0,tot1,c;                   \
+    STBIR_SIMD_NO_UNROLL(decode);               \
+    stbir__simdf_load1( c, hc );                \
+    stbir__simdf_0123to0000( c, c );            \
+    stbir__simdf_mult_mem( tot0, c, decode );   \
+    stbir__simdf_mult_mem( tot1, c, decode+3 ); 
+
+#define stbir__2_coeff_only()                         \
+    stbir__simdf tot0,tot1,c,cs;                      \
+    STBIR_SIMD_NO_UNROLL(decode);                     \
+    stbir__simdf_load2( cs, hc );                     \
+    stbir__simdf_0123to0000( c, cs );                 \
+    stbir__simdf_mult_mem( tot0, c, decode );         \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );       \
+    stbir__simdf_0123to1111( c, cs );                 \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
+    stbir__simdf_madd_mem( tot1, tot1, c,decode+10 ); 
+
+#define stbir__3_coeff_only()                           \
+    stbir__simdf tot0,tot1,c,cs;                        \
+    STBIR_SIMD_NO_UNROLL(decode);                       \
+    stbir__simdf_load( cs, hc );                        \
+    stbir__simdf_0123to0000( c, cs );                   \
+    stbir__simdf_mult_mem( tot0, c, decode );           \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );         \
+    stbir__simdf_0123to1111( c, cs );                   \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+7 );   \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+10 );  \
+    stbir__simdf_0123to2222( c, cs );                   \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  
+
+#define stbir__store_output_tiny()                \
+    stbir__simdf_store( output+3, tot1 );         \
+    stbir__simdf_store( output, tot0 );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;
+
+#ifdef STBIR_SIMD8
+
+#define stbir__4_coeff_start()                     \
+    stbir__simdf8 tot0,tot1,c,cs;                  \
+    STBIR_SIMD_NO_UNROLL(decode);                  \
+    stbir__simdf8_load4b( cs, hc );                \
+    stbir__simdf8_0123to00000000( c, cs );         \
+    stbir__simdf8_mult_mem( tot0, c, decode );     \
+    stbir__simdf8_0123to11111111( c, cs );         \
+    stbir__simdf8_mult_mem( tot1, c, decode+7 );   \
+    stbir__simdf8_0123to22222222( c, cs );         \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 );  \
+    stbir__simdf8_0123to33333333( c, cs );         \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );  
+
+#define stbir__4_coeff_continue_from_4( ofs )                   \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                     \
+    stbir__simdf8_0123to00000000( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
+    stbir__simdf8_0123to11111111( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
+    stbir__simdf8_0123to22222222( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
+    stbir__simdf8_0123to33333333( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 ); 
+
+#define stbir__1_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load1b( c, hc + (ofs) );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    
+
+#define stbir__2_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load1b( c, hc + (ofs) );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
+    stbir__simdf8_load1b( c, hc + (ofs)+1 );                    \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );   
+
+#define stbir__3_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf8_load4b( cs, hc + (ofs) );                     \
+    stbir__simdf8_0123to00000000( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
+    stbir__simdf8_0123to11111111( c, cs );                      \
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
+    stbir__simdf8_0123to22222222( c, cs );                      \
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); 
+
+#define stbir__store_output()                     \
+    stbir__simdf8_add( tot0, tot0, tot1 );        \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;                                  \
+    if ( output < output_end )                    \
+    {                                             \
+      stbir__simdf8_store( output-7, tot0 );      \
+      continue;                                   \
+    }                                             \
+    stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); \
+    stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) );           \
+    break;
+
+#else
+
+#define stbir__4_coeff_start()                    \
+    stbir__simdf tot0,tot1,tot2,tot3,c,cs;        \
+    STBIR_SIMD_NO_UNROLL(decode);                 \
+    stbir__simdf_load( cs, hc );                  \
+    stbir__simdf_0123to0000( c, cs );             \
+    stbir__simdf_mult_mem( tot0, c, decode );     \
+    stbir__simdf_mult_mem( tot1, c, decode+3 );   \
+    stbir__simdf_0123to1111( c, cs );             \
+    stbir__simdf_mult_mem( tot2, c, decode+7 );   \
+    stbir__simdf_mult_mem( tot3, c, decode+10 );  \
+    stbir__simdf_0123to2222( c, cs );             \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  \
+    stbir__simdf_0123to3333( c, cs );                   \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+21 );  \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );         
+
+#define stbir__4_coeff_continue_from_4( ofs )                   \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load( cs, hc + (ofs) );                        \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
+    stbir__simdf_0123to2222( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  \
+    stbir__simdf_0123to3333( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 );  \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );   
+
+#define stbir__1_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load1( c, hc + (ofs) );                        \
+    stbir__simdf_0123to0000( c, c );                            \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+
+#define stbir__2_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load2( cs, hc + (ofs) );                       \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  
+  
+#define stbir__3_coeff_remnant( ofs )                           \
+    STBIR_SIMD_NO_UNROLL(decode);                               \
+    stbir__simdf_load( cs, hc + (ofs) );                        \
+    stbir__simdf_0123to0000( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 );     \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
+    stbir__simdf_0123to1111( c, cs );                           \
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
+    stbir__simdf_0123to2222( c, cs );                           \
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  
+
+#define stbir__store_output()                     \
+    stbir__simdf_add( tot0, tot0, tot2 );         \
+    stbir__simdf_add( tot1, tot1, tot3 );         \
+    stbir__simdf_store( output+3, tot1 );         \
+    stbir__simdf_store( output, tot0 );           \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;
+
+#endif
+
+#else
+
+#define stbir__1_coeff_only()        \
+    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
+    c = hc[0];                       \
+    tot0 = decode[0]*c;              \
+    tot1 = decode[1]*c;              \
+    tot2 = decode[2]*c;              \
+    tot3 = decode[3]*c;              \
+    tot4 = decode[4]*c;              \
+    tot5 = decode[5]*c;              \
+    tot6 = decode[6]*c;              
+
+#define stbir__2_coeff_only()        \
+    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
+    c = hc[0];                       \
+    tot0 = decode[0]*c;              \
+    tot1 = decode[1]*c;              \
+    tot2 = decode[2]*c;              \
+    tot3 = decode[3]*c;              \
+    tot4 = decode[4]*c;              \
+    tot5 = decode[5]*c;              \
+    tot6 = decode[6]*c;              \
+    c = hc[1];                       \
+    tot0 += decode[7]*c;             \
+    tot1 += decode[8]*c;             \
+    tot2 += decode[9]*c;             \
+    tot3 += decode[10]*c;            \
+    tot4 += decode[11]*c;            \
+    tot5 += decode[12]*c;            \
+    tot6 += decode[13]*c;            \
+
+#define stbir__3_coeff_only()        \
+    float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
+    c = hc[0];                       \
+    tot0 = decode[0]*c;              \
+    tot1 = decode[1]*c;              \
+    tot2 = decode[2]*c;              \
+    tot3 = decode[3]*c;              \
+    tot4 = decode[4]*c;              \
+    tot5 = decode[5]*c;              \
+    tot6 = decode[6]*c;              \
+    c = hc[1];                       \
+    tot0 += decode[7]*c;             \
+    tot1 += decode[8]*c;             \
+    tot2 += decode[9]*c;             \
+    tot3 += decode[10]*c;            \
+    tot4 += decode[11]*c;            \
+    tot5 += decode[12]*c;            \
+    tot6 += decode[13]*c;            \
+    c = hc[2];                       \
+    tot0 += decode[14]*c;            \
+    tot1 += decode[15]*c;            \
+    tot2 += decode[16]*c;            \
+    tot3 += decode[17]*c;            \
+    tot4 += decode[18]*c;            \
+    tot5 += decode[19]*c;            \
+    tot6 += decode[20]*c;            \
+
+#define stbir__store_output_tiny()                \
+    output[0] = tot0;                             \
+    output[1] = tot1;                             \
+    output[2] = tot2;                             \
+    output[3] = tot3;                             \
+    output[4] = tot4;                             \
+    output[5] = tot5;                             \
+    output[6] = tot6;                             \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;
+
+#define stbir__4_coeff_start()    \
+    float x0,x1,x2,x3,x4,x5,x6,y0,y1,y2,y3,y4,y5,y6,c; \
+    STBIR_SIMD_NO_UNROLL(decode); \
+    c = hc[0];                    \
+    x0 = decode[0] * c;           \
+    x1 = decode[1] * c;           \
+    x2 = decode[2] * c;           \
+    x3 = decode[3] * c;           \
+    x4 = decode[4] * c;           \
+    x5 = decode[5] * c;           \
+    x6 = decode[6] * c;           \
+    c = hc[1];                    \
+    y0 = decode[7] * c;           \
+    y1 = decode[8] * c;           \
+    y2 = decode[9] * c;           \
+    y3 = decode[10] * c;          \
+    y4 = decode[11] * c;          \
+    y5 = decode[12] * c;          \
+    y6 = decode[13] * c;          \
+    c = hc[2];                    \
+    x0 += decode[14] * c;         \
+    x1 += decode[15] * c;         \
+    x2 += decode[16] * c;         \
+    x3 += decode[17] * c;         \
+    x4 += decode[18] * c;         \
+    x5 += decode[19] * c;         \
+    x6 += decode[20] * c;         \
+    c = hc[3];                    \
+    y0 += decode[21] * c;         \
+    y1 += decode[22] * c;         \
+    y2 += decode[23] * c;         \
+    y3 += decode[24] * c;         \
+    y4 += decode[25] * c;         \
+    y5 += decode[26] * c;         \
+    y6 += decode[27] * c; 
+
+#define stbir__4_coeff_continue_from_4( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+    c = hc[1+(ofs)];               \
+    y0 += decode[7+(ofs)*7] * c;   \
+    y1 += decode[8+(ofs)*7] * c;   \
+    y2 += decode[9+(ofs)*7] * c;   \
+    y3 += decode[10+(ofs)*7] * c;  \
+    y4 += decode[11+(ofs)*7] * c;  \
+    y5 += decode[12+(ofs)*7] * c;  \
+    y6 += decode[13+(ofs)*7] * c;  \
+    c = hc[2+(ofs)];               \
+    x0 += decode[14+(ofs)*7] * c;  \
+    x1 += decode[15+(ofs)*7] * c;  \
+    x2 += decode[16+(ofs)*7] * c;  \
+    x3 += decode[17+(ofs)*7] * c;  \
+    x4 += decode[18+(ofs)*7] * c;  \
+    x5 += decode[19+(ofs)*7] * c;  \
+    x6 += decode[20+(ofs)*7] * c;  \
+    c = hc[3+(ofs)];               \
+    y0 += decode[21+(ofs)*7] * c;  \
+    y1 += decode[22+(ofs)*7] * c;  \
+    y2 += decode[23+(ofs)*7] * c;  \
+    y3 += decode[24+(ofs)*7] * c;  \
+    y4 += decode[25+(ofs)*7] * c;  \
+    y5 += decode[26+(ofs)*7] * c;  \
+    y6 += decode[27+(ofs)*7] * c; 
+
+#define stbir__1_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+
+#define stbir__2_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+    c = hc[1+(ofs)];               \
+    y0 += decode[7+(ofs)*7] * c;   \
+    y1 += decode[8+(ofs)*7] * c;   \
+    y2 += decode[9+(ofs)*7] * c;   \
+    y3 += decode[10+(ofs)*7] * c;  \
+    y4 += decode[11+(ofs)*7] * c;  \
+    y5 += decode[12+(ofs)*7] * c;  \
+    y6 += decode[13+(ofs)*7] * c;  \
+  
+#define stbir__3_coeff_remnant( ofs ) \
+    STBIR_SIMD_NO_UNROLL(decode);  \
+    c = hc[0+(ofs)];               \
+    x0 += decode[0+(ofs)*7] * c;   \
+    x1 += decode[1+(ofs)*7] * c;   \
+    x2 += decode[2+(ofs)*7] * c;   \
+    x3 += decode[3+(ofs)*7] * c;   \
+    x4 += decode[4+(ofs)*7] * c;   \
+    x5 += decode[5+(ofs)*7] * c;   \
+    x6 += decode[6+(ofs)*7] * c;   \
+    c = hc[1+(ofs)];               \
+    y0 += decode[7+(ofs)*7] * c;   \
+    y1 += decode[8+(ofs)*7] * c;   \
+    y2 += decode[9+(ofs)*7] * c;   \
+    y3 += decode[10+(ofs)*7] * c;  \
+    y4 += decode[11+(ofs)*7] * c;  \
+    y5 += decode[12+(ofs)*7] * c;  \
+    y6 += decode[13+(ofs)*7] * c;  \
+    c = hc[2+(ofs)];               \
+    x0 += decode[14+(ofs)*7] * c;  \
+    x1 += decode[15+(ofs)*7] * c;  \
+    x2 += decode[16+(ofs)*7] * c;  \
+    x3 += decode[17+(ofs)*7] * c;  \
+    x4 += decode[18+(ofs)*7] * c;  \
+    x5 += decode[19+(ofs)*7] * c;  \
+    x6 += decode[20+(ofs)*7] * c;  \
+
+#define stbir__store_output()                     \
+    output[0] = x0 + y0;                          \
+    output[1] = x1 + y1;                          \
+    output[2] = x2 + y2;                          \
+    output[3] = x3 + y3;                          \
+    output[4] = x4 + y4;                          \
+    output[5] = x5 + y5;                          \
+    output[6] = x6 + y6;                          \
+    horizontal_coefficients += coefficient_width; \
+    ++horizontal_contributors;                    \
+    output += 7;
+
+#endif  
+
+#define STBIR__horizontal_channels 7
+#define STB_IMAGE_RESIZE_DO_HORIZONTALS
+#include STBIR__HEADER_FILENAME
+
+
+// include all of the vertical resamplers (both scatter and gather versions)
+
+#define STBIR__vertical_channels 1
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 1
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 2
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 2
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 3
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 3
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 4
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 4
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 5
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 5
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 6
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 6
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 7
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 7
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 8
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#include STBIR__HEADER_FILENAME
+
+#define STBIR__vertical_channels 8
+#define STB_IMAGE_RESIZE_DO_VERTICALS
+#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#include STBIR__HEADER_FILENAME
+
+typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end );
+
+static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] =
+{
+  stbir__vertical_gather_with_1_coeffs,stbir__vertical_gather_with_2_coeffs,stbir__vertical_gather_with_3_coeffs,stbir__vertical_gather_with_4_coeffs,stbir__vertical_gather_with_5_coeffs,stbir__vertical_gather_with_6_coeffs,stbir__vertical_gather_with_7_coeffs,stbir__vertical_gather_with_8_coeffs
+};
+
+static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] =
+{
+  stbir__vertical_gather_with_1_coeffs_cont,stbir__vertical_gather_with_2_coeffs_cont,stbir__vertical_gather_with_3_coeffs_cont,stbir__vertical_gather_with_4_coeffs_cont,stbir__vertical_gather_with_5_coeffs_cont,stbir__vertical_gather_with_6_coeffs_cont,stbir__vertical_gather_with_7_coeffs_cont,stbir__vertical_gather_with_8_coeffs_cont
+};
+
+typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end );
+
+static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] =
+{
+  stbir__vertical_scatter_with_1_coeffs,stbir__vertical_scatter_with_2_coeffs,stbir__vertical_scatter_with_3_coeffs,stbir__vertical_scatter_with_4_coeffs,stbir__vertical_scatter_with_5_coeffs,stbir__vertical_scatter_with_6_coeffs,stbir__vertical_scatter_with_7_coeffs,stbir__vertical_scatter_with_8_coeffs
+};
+
+static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] =
+{
+  stbir__vertical_scatter_with_1_coeffs_cont,stbir__vertical_scatter_with_2_coeffs_cont,stbir__vertical_scatter_with_3_coeffs_cont,stbir__vertical_scatter_with_4_coeffs_cont,stbir__vertical_scatter_with_5_coeffs_cont,stbir__vertical_scatter_with_6_coeffs_cont,stbir__vertical_scatter_with_7_coeffs_cont,stbir__vertical_scatter_with_8_coeffs_cont
+};
+
+
+static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row  STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{
+  int num_pixels = stbir_info->horizontal.scale_info.output_sub_size;
+  int channels = stbir_info->channels;
+  int width_times_channels = num_pixels * channels;
+  void * output_buffer;
+
+  // un-alpha weight if we need to
+  if ( stbir_info->alpha_unweight )
+  {
+    STBIR_PROFILE_START( unalpha );
+    stbir_info->alpha_unweight( encode_buffer, width_times_channels );
+    STBIR_PROFILE_END( unalpha );
+  }
+
+  // write directly into output by default
+  output_buffer = output_buffer_data;
+
+  // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback)
+  if ( stbir_info->out_pixels_cb )
+    output_buffer = encode_buffer;
+  
+  STBIR_PROFILE_START( encode );
+  // convert into the output buffer
+  stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
+  STBIR_PROFILE_END( encode );
+
+  // if we have an output callback, call it to send the data
+  if ( stbir_info->out_pixels_cb )
+    stbir_info->out_pixels_cb( output_buffer_data, num_pixels, row, stbir_info->user_data );
+}
+
+
+// Get the ring buffer pointer for an index
+static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index )
+{
+  STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries );
+
+  #ifdef STBIR__SEPARATE_ALLOCATIONS
+    return split_info->ring_buffers[ index ];
+  #else
+    return (float*) ( ( (char*) split_info->ring_buffer ) + ( index * stbir_info->ring_buffer_length_bytes ) );
+  #endif
+}
+
+// Get the specified scan line from the ring buffer
+static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline)
+{
+  int ring_buffer_index = (split_info->ring_buffer_begin_index + (get_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
+  return stbir__get_ring_buffer_entry( stbir_info, split_info, ring_buffer_index );
+}
+
+static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO )
+{
+  float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels );
+
+  STBIR_PROFILE_START( horizontal );
+  if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) )
+    STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels );
+  else
+    stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width );
+  STBIR_PROFILE_END( horizontal );
+}
+
+static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients )
+{
+  float* encode_buffer = split_info->vertical_buffer;
+  float* decode_buffer = split_info->decode_buffer;
+  int vertical_first = stbir_info->vertical_first;
+  int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size;
+  int width_times_channels = stbir_info->effective_channels * width;
+
+  STBIR_ASSERT( stbir_info->vertical.is_gather );
+
+  // loop over the contributing scanlines and scale into the buffer
+  STBIR_PROFILE_START( vertical );
+  {
+    int k = 0, total = contrib_n1 - contrib_n0 + 1;
+    STBIR_ASSERT( total > 0 );
+    do {
+      float const * inputs[8];
+      int i, cnt = total; if ( cnt > 8 ) cnt = 8;
+      for( i = 0 ; i < cnt ; i++ )
+        inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 );
+
+      // call the N scanlines at a time function (up to 8 scanlines of blending at once)
+      ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels );
+      k += cnt;
+      total -= cnt;
+    } while ( total );
+  }
+  STBIR_PROFILE_END( vertical );
+
+  if ( vertical_first )
+  {
+    // Now resample the gathered vertical data in the horizontal axis into the encode buffer
+    stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  }
+
+  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes), 
+                          encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+}
+
+static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n)
+{
+  int ring_buffer_index;
+  float* ring_buffer;
+
+  // Decode the nth scanline from the source image into the decode buffer.
+  stbir__decode_scanline( stbir_info, n, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // update new end scanline
+  split_info->ring_buffer_last_scanline = n;
+
+  // get ring buffer 
+  ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
+  ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
+
+  // Now resample it into the ring buffer.
+  stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+  // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
+}
+
+static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
+{
+  int y, start_output_y, end_output_y;
+  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
+  float const * vertical_coefficients = stbir_info->vertical.coefficients;
+
+  STBIR_ASSERT( stbir_info->vertical.is_gather );
+
+  start_output_y = split_info->start_output_y;
+  end_output_y = split_info[split_count-1].end_output_y;
+
+  vertical_contributors += start_output_y;
+  vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width;
+
+  // initialize the ring buffer for gathering
+  split_info->ring_buffer_begin_index = 0;
+  split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest;  
+  split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
+
+  for (y = start_output_y; y < end_output_y; y++)
+  {
+    int in_first_scanline, in_last_scanline;
+
+    in_first_scanline = vertical_contributors->n0;
+    in_last_scanline = vertical_contributors->n1;
+
+    // make sure the indexing hasn't broken
+    STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline );
+
+    // Load in new scanlines
+    while (in_last_scanline > split_info->ring_buffer_last_scanline)
+    {
+      STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries );
+
+      // make sure there was room in the ring buffer when we add new scanlines
+      if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries )
+      {
+        split_info->ring_buffer_first_scanline++;
+        split_info->ring_buffer_begin_index++;
+      }
+      
+      if ( stbir_info->vertical_first )
+      {
+        float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
+        // Decode the nth scanline from the source image into the decode buffer.
+        stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); 
+      }
+      else
+      {
+        stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1);
+      }
+    }
+
+    // Now all buffers should be ready to write a row of vertical sampling, so do it.
+    stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients );
+
+    ++vertical_contributors;
+    vertical_coefficients += stbir_info->vertical.coefficient_width;
+  }
+}
+
+#define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F
+#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ((ptr)[0]==STBIR__FLOAT_EMPTY_MARKER)
+
+static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
+{
+  // evict a scanline out into the output buffer
+  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
+  
+  // dump the scanline out
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  
+  // mark it as empty
+  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
+
+  // advance the first scanline
+  split_info->ring_buffer_first_scanline++;
+  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
+    split_info->ring_buffer_begin_index = 0;
+}
+
+static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info)
+{
+  // evict a scanline out into the output buffer
+
+  float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
+
+  // Now resample it into the buffer.
+  stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  
+  // dump the scanline out
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  
+  // mark it as empty
+  ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
+
+  // advance the first scanline
+  split_info->ring_buffer_first_scanline++;
+  if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries )
+    split_info->ring_buffer_begin_index = 0;
+}
+
+static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end )
+{
+  STBIR_ASSERT( !stbir_info->vertical.is_gather );
+
+  STBIR_PROFILE_START( vertical );
+  {
+    int k = 0, total = n1 - n0 + 1;
+    STBIR_ASSERT( total > 0 );
+    do {
+      float * outputs[8];
+      int i, n = total; if ( n > 8 ) n = 8;
+      for( i = 0 ; i < n ; i++ )
+      {
+        outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 );
+        if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type
+        {
+          n = i;
+          break;
+        }
+      }
+      // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once)
+      ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end );
+      k += n;
+      total -= n;
+    } while ( total );
+  }
+
+  STBIR_PROFILE_END( vertical );
+}
+
+typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info); 
+
+static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
+{
+  int y, start_output_y, end_output_y, start_input_y, end_input_y;
+  stbir__contributors* vertical_contributors = stbir_info->vertical.contributors;
+  float const * vertical_coefficients = stbir_info->vertical.coefficients;
+  stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter;
+  void * scanline_scatter_buffer;
+  void * scanline_scatter_buffer_end;
+  int on_first_input_y, last_input_y;
+
+  STBIR_ASSERT( !stbir_info->vertical.is_gather );
+
+  start_output_y = split_info->start_output_y;
+  end_output_y = split_info[split_count-1].end_output_y;  // may do multiple split counts
+
+  start_input_y = split_info->start_input_y;
+  end_input_y = split_info[split_count-1].end_input_y;
+
+  // adjust for starting offset start_input_y
+  y = start_input_y + stbir_info->vertical.filter_pixel_margin; 
+  vertical_contributors += y ;
+  vertical_coefficients += stbir_info->vertical.coefficient_width * y;
+
+  if ( stbir_info->vertical_first )
+  {
+    handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter;
+    scanline_scatter_buffer = split_info->decode_buffer;
+    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1);
+  }
+  else
+  {
+    handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter;
+    scanline_scatter_buffer = split_info->vertical_buffer;
+    scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size;
+  }
+
+  // initialize the ring buffer for scattering
+  split_info->ring_buffer_first_scanline = start_output_y;
+  split_info->ring_buffer_last_scanline = -1;
+  split_info->ring_buffer_begin_index = -1;
+
+  // mark all the buffers as empty to start
+  for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ )
+    stbir__get_ring_buffer_entry( stbir_info, split_info, y )[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter
+
+  // do the loop in input space
+  on_first_input_y = 1; last_input_y = start_input_y;
+  for (y = start_input_y ; y < end_input_y; y++)
+  {
+    int out_first_scanline, out_last_scanline;
+
+    out_first_scanline = vertical_contributors->n0;
+    out_last_scanline = vertical_contributors->n1;
+
+    STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
+
+    if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) )
+    {
+      float const * vc = vertical_coefficients;
+
+      // keep track of the range actually seen for the next resize
+      last_input_y = y;
+      if ( ( on_first_input_y ) && ( y > start_input_y ) )
+        split_info->start_input_y = y;
+      on_first_input_y = 0;
+
+      // clip the region 
+      if ( out_first_scanline < start_output_y )
+      {
+        vc += start_output_y - out_first_scanline;
+        out_first_scanline = start_output_y;
+      }
+
+      if ( out_last_scanline >= end_output_y )
+        out_last_scanline = end_output_y - 1;
+
+      // if very first scanline, init the index
+      if (split_info->ring_buffer_begin_index < 0)
+        split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
+      
+      STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
+
+      // Decode the nth scanline from the source image into the decode buffer.
+      stbir__decode_scanline( stbir_info, y, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); 
+
+      // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
+      if ( !stbir_info->vertical_first )
+        stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+
+      // Now it's sitting in the buffer ready to be distributed into the ring buffers.
+
+      // evict from the ringbuffer, if we need are full
+      if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
+           ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
+        handle_scanline_for_scatter( stbir_info, split_info );
+    
+      // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.
+      stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end );
+
+      // update the end of the buffer
+      if ( out_last_scanline > split_info->ring_buffer_last_scanline )
+        split_info->ring_buffer_last_scanline = out_last_scanline;
+    }
+    ++vertical_contributors;
+    vertical_coefficients += stbir_info->vertical.coefficient_width;
+  }
+
+  // now evict the scanlines that are left over in the ring buffer
+  while ( split_info->ring_buffer_first_scanline < end_output_y )
+    handle_scanline_for_scatter(stbir_info, split_info);
+
+  // update the end_input_y if we do multiple resizes with the same data
+  ++last_input_y;
+  for( y = 0 ; y < split_count; y++ )
+    if ( split_info[y].end_input_y > last_input_y )
+      split_info[y].end_input_y = last_input_y;
+}
+
+
+static stbir__kernel_callback * stbir__builtin_kernels[] =   { 0, stbir__filter_trapezoid,  stbir__filter_triangle, stbir__filter_cubic, stbir__filter_catmullrom, stbir__filter_mitchell, stbir__filter_point };
+static stbir__support_callback * stbir__builtin_supports[] = { 0, stbir__support_trapezoid, stbir__support_one,     stbir__support_two,  stbir__support_two,       stbir__support_two,     stbir__support_zeropoint5 };
+
+static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir__kernel_callback * kernel, stbir__support_callback * support, stbir_edge edge, stbir__scale_info * scale_info, int always_gather, void * user_data )
+{
+  // set filter
+  if (filter == 0)
+  {
+    filter = STBIR_DEFAULT_FILTER_DOWNSAMPLE; // default to downsample
+    if (scale_info->scale >= ( 1.0f - stbir__small_float ) )
+    {
+      if ( (scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) )
+        filter = STBIR_FILTER_POINT_SAMPLE;  
+      else
+        filter = STBIR_DEFAULT_FILTER_UPSAMPLE;
+    }
+  }
+  samp->filter_enum = filter;
+
+  STBIR_ASSERT(samp->filter_enum != 0);
+  STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER); 
+  samp->filter_kernel = stbir__builtin_kernels[ filter ];
+  samp->filter_support = stbir__builtin_supports[ filter ];
+
+  if ( kernel && support )
+  {
+    samp->filter_kernel = kernel;
+    samp->filter_support = support;
+    samp->filter_enum = STBIR_FILTER_OTHER;
+  }
+
+  samp->edge = edge;
+  samp->filter_pixel_width  = stbir__get_filter_pixel_width (samp->filter_support, scale_info->scale, user_data );
+  // Gather is always better, but in extreme downsamples, you have to most or all of the data in memory
+  //    For horizontal, we always have all the pixels, so we always use gather here (always_gather==1).
+  //    For vertical, we use gather if scaling up (which means we will have samp->filter_pixel_width
+  //    scanlines in memory at once).
+  samp->is_gather = 0;
+  if ( scale_info->scale >= ( 1.0f - stbir__small_float ) )
+    samp->is_gather = 1;
+  else if ( ( always_gather ) || ( samp->filter_pixel_width <= STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT ) )
+    samp->is_gather = 2;
+
+  // pre calculate stuff based on the above
+  samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
+
+  if ( edge == STBIR_EDGE_WRAP )
+    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 2 ) )  // this can only happen when shrinking to a single pixel
+      samp->filter_pixel_width = scale_info->input_full_size * 2;
+
+  // This is how much to expand buffers to account for filters seeking outside
+  // the image boundaries.
+  samp->filter_pixel_margin = samp->filter_pixel_width / 2;
+
+  samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
+  samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
+  samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding
+
+  samp->gather_prescatter_contributors = 0;
+  samp->gather_prescatter_coefficients = 0;
+  if ( samp->is_gather == 0 )
+  {
+    samp->gather_prescatter_coefficient_width = samp->filter_pixel_width;
+    samp->gather_prescatter_num_contributors  = stbir__get_contributors(samp, 2);
+    samp->gather_prescatter_contributors_size = samp->gather_prescatter_num_contributors * sizeof(stbir__contributors);
+    samp->gather_prescatter_coefficients_size = samp->gather_prescatter_num_contributors * samp->gather_prescatter_coefficient_width * sizeof(float);
+  }
+}
+
+static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contributors * range, void * user_data )
+{
+  float scale = samp->scale_info.scale;
+  float out_shift = samp->scale_info.pixel_shift;
+  stbir__support_callback * support = samp->filter_support;
+  int input_full_size = samp->scale_info.input_full_size;
+  stbir_edge edge = samp->edge;
+  float inv_scale = samp->scale_info.inv_scale;
+
+  STBIR_ASSERT( samp->is_gather != 0 );
+
+  if ( samp->is_gather == 1 )
+  {
+    int in_first_pixel, in_last_pixel;
+    float out_filter_radius = support(inv_scale, user_data) * scale;
+
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0.5, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
+    range->n0 = in_first_pixel;
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, ( (float)(samp->scale_info.output_sub_size-1) ) + 0.5f, out_filter_radius, inv_scale, out_shift, input_full_size, edge );
+    range->n1 = in_last_pixel;
+  }
+  else if ( samp->is_gather == 2 ) // downsample gather, refine
+  {
+    float in_pixels_radius = support(scale, user_data) * inv_scale;
+    int filter_pixel_margin = samp->filter_pixel_margin;
+    int output_sub_size = samp->scale_info.output_sub_size;
+    int input_end;
+    int n;
+    int in_first_pixel, in_last_pixel;
+
+    // get a conservative area of the input range
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0, 0, inv_scale, out_shift, input_full_size, edge );
+    range->n0 = in_first_pixel;
+    stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge );
+    range->n1 = in_last_pixel;
+     
+    // now go through the margin to the start of area to find bottom 
+    n = range->n0 + 1;
+    input_end = -filter_pixel_margin;
+    while( n >= input_end )
+    {
+      int out_first_pixel, out_last_pixel;
+      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
+      if ( out_first_pixel > out_last_pixel )
+        break;
+
+      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
+        range->n0 = n;
+      --n;
+    }
+
+    // now go through the end of the area through the margin to find top 
+    n = range->n1 - 1;
+    input_end = n + 1 + filter_pixel_margin;
+    while( n <= input_end )
+    {
+      int out_first_pixel, out_last_pixel;
+      stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size );
+      if ( out_first_pixel > out_last_pixel )
+        break;
+      if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) )
+        range->n1 = n;
+      ++n;
+    }
+  }
+
+  if ( samp->edge == STBIR_EDGE_WRAP )
+  {
+    // if we are wrapping, and we are very close to the image size (so the edges might merge), just use the scanline up to the edge
+    if ( ( range->n0 > 0 ) && ( range->n1 >= input_full_size ) )
+    {
+      int marg = range->n1 - input_full_size + 1;
+      if ( ( marg + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= range->n0 )
+        range->n0 = 0;
+    }
+    if ( ( range->n0 < 0 ) && ( range->n1 < (input_full_size-1) ) )
+    {
+      int marg = -range->n0;
+      if ( ( input_full_size - marg - STBIR__MERGE_RUNS_PIXEL_THRESHOLD - 1 ) <= range->n1 )
+        range->n1 = input_full_size - 1;
+    }
+  }
+  else
+  {
+    // for non-edge-wrap modes, we never read over the edge, so clamp
+    if ( range->n0 < 0 )
+      range->n0 = 0;
+    if ( range->n1 >= input_full_size )
+      range->n1 = input_full_size - 1;
+  }
+}
+
+static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height )
+{
+  int i, cur;
+  int left = output_height;
+
+  cur = 0;
+  for( i = 0 ; i < splits ; i++ )
+  {
+    int each; 
+    split_info[i].start_output_y = cur;
+    each = left / ( splits - i );
+    split_info[i].end_output_y = cur + each;
+    cur += each;
+    left -= each;
+
+    // scatter range (updated to minimum as you run it)
+    split_info[i].start_input_y = -vertical_pixel_margin;
+    split_info[i].end_input_y = input_full_height + vertical_pixel_margin;
+  }
+}
+
+static void stbir__free_internal_mem( stbir__info *info )
+{
+  #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data); } }
+  
+  if ( info )
+  {
+  #ifndef STBIR__SEPARATE_ALLOCATIONS
+    STBIR__FREE_AND_CLEAR( info->alloced_mem );
+  #else
+    int i,j;
+
+    if ( ( info->vertical.gather_prescatter_contributors ) && ( (void*)info->vertical.gather_prescatter_contributors != (void*)info->split_info[0].decode_buffer ) )
+    {
+      STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_coefficients );
+      STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_contributors );
+    }
+    for( i = 0 ; i < info->splits ; i++ )
+    {
+      for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ )
+      {
+        #ifdef STBIR_SIMD8
+        if ( info->effective_channels == 3 ) 
+          --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
+        #endif  
+        STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] );
+      }
+
+      #ifdef STBIR_SIMD8
+      if ( info->effective_channels == 3 ) 
+        --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
+      #endif  
+      STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer );
+      STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers );
+      STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer );
+    }
+    STBIR__FREE_AND_CLEAR( info->split_info );
+    if ( info->vertical.coefficients != info->horizontal.coefficients )
+    {
+      STBIR__FREE_AND_CLEAR( info->vertical.coefficients );
+      STBIR__FREE_AND_CLEAR( info->vertical.contributors );
+    }
+    STBIR__FREE_AND_CLEAR( info->horizontal.coefficients );
+    STBIR__FREE_AND_CLEAR( info->horizontal.contributors );
+    STBIR__FREE_AND_CLEAR( info->alloced_mem );
+    STBIR__FREE_AND_CLEAR( info );
+  #endif
+  }
+  
+  #undef STBIR__FREE_AND_CLEAR
+}
+
+static int stbir__get_max_split( int splits, int height )
+{
+  int i;
+  int max = 0;
+
+  for( i = 0 ; i < splits ; i++ )
+  {
+    int each = height / ( splits - i );
+    if ( each > max ) 
+      max = each;
+    height -= each;
+  }
+  return max;
+}
+
+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] = 
+{ 
+  0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs, stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0,0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs
+};
+
+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] = 
+{ 
+  0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs, stbir__horizontal_gather_4_channels_funcs, 0,0, stbir__horizontal_gather_7_channels_funcs
+};
+
+// there are six resize classifications: 0 == vertical scatter, 1 == vertical gather < 1x scale, 2 == vertical gather 1x-2x scale, 4 == vertical gather < 3x scale, 4 == vertical gather > 3x scale, 5 == <=4 pixel height, 6 == <=4 pixel wide column
+#define STBIR_RESIZE_CLASSIFICATIONS 8
+
+static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4]=  // 5 = 0=1chan, 1=2chan, 2=3chan, 3=4chan, 4=7chan
+{
+  {
+    { 1.00000f, 1.00000f, 0.31250f, 1.00000f },
+    { 0.56250f, 0.59375f, 0.00000f, 0.96875f },
+    { 1.00000f, 0.06250f, 0.00000f, 1.00000f },
+    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.03125f },
+  }, {
+    { 0.00000f, 0.84375f, 0.00000f, 0.03125f },
+    { 0.09375f, 0.93750f, 0.00000f, 0.78125f },
+    { 0.87500f, 0.21875f, 0.00000f, 0.96875f },
+    { 0.09375f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.53125f },
+  }, {
+    { 0.00000f, 0.53125f, 0.00000f, 0.03125f },
+    { 0.06250f, 0.96875f, 0.00000f, 0.53125f },
+    { 0.87500f, 0.18750f, 0.00000f, 0.93750f },
+    { 0.00000f, 0.09375f, 1.00000f, 1.00000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.03125f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.00000f, 0.56250f },
+  }, {
+    { 0.00000f, 0.50000f, 0.00000f, 0.71875f },
+    { 0.06250f, 0.84375f, 0.00000f, 0.87500f },
+    { 1.00000f, 0.50000f, 0.50000f, 0.96875f },
+    { 1.00000f, 0.09375f, 0.31250f, 0.50000f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 1.00000f, 0.03125f, 0.03125f, 0.53125f },
+    { 0.18750f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.03125f, 0.18750f },
+  }, {
+    { 0.00000f, 0.59375f, 0.00000f, 0.96875f },
+    { 0.06250f, 0.81250f, 0.06250f, 0.59375f },
+    { 0.75000f, 0.43750f, 0.12500f, 0.96875f },
+    { 0.87500f, 0.06250f, 0.18750f, 0.43750f },
+    { 1.00000f, 1.00000f, 1.00000f, 1.00000f },
+    { 0.15625f, 0.12500f, 1.00000f, 1.00000f },
+    { 0.06250f, 0.12500f, 0.00000f, 1.00000f },
+    { 0.00000f, 1.00000f, 0.03125f, 0.34375f },
+  }
+};
+
+// structure that allow us to query and override info for training the costs
+typedef struct STBIR__V_FIRST_INFO
+{
+  double v_cost, h_cost;
+  int control_v_first; // 0 = no control, 1 = force hori, 2 = force vert
+  int v_first;
+  int v_resize_classification;
+  int is_gather;
+} STBIR__V_FIRST_INFO;
+
+#ifdef STBIR__V_FIRST_INFO_BUFFER
+static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0};
+#define STBIR__V_FIRST_INFO_POINTER &STBIR__V_FIRST_INFO_BUFFER
+#else
+#define STBIR__V_FIRST_INFO_POINTER 0
+#endif
+
+// Figure out whether to scale along the horizontal or vertical first.
+//   This only *super* important when you are scaling by a massively 
+//   different amount in the vertical vs the horizontal (for example, if 
+//   you are scaling by 2x in the width, and 0.5x in the height, then you 
+//   want to do the vertical scale first, because it's around 3x faster 
+//   in that order.
+//
+//   In more normal circumstances, this makes a 20-40% differences, so 
+//     it's good to get right, but not critical. The normal way that you
+//     decide which direction goes first is just figuring out which 
+//     direction does more multiplies. But with modern CPUs with their 
+//     fancy caches and SIMD and high IPC abilities, so there's just a lot
+//     more that goes into it. 
+//
+//   My handwavy sort of solution is to have an app that does a whole 
+//     bunch of timing for both vertical and horizontal first modes,
+//     and then another app that can read lots of these timing files
+//     and try to search for the best weights to use. Dotimings.c
+//     is the app that does a bunch of timings, and vf_train.c is the
+//     app that solves for the best weights (and shows how well it 
+//     does currently).
+
+static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info )    
+{
+  double v_cost, h_cost;
+  float * weights;
+  int vertical_first;
+  int v_classification;
+
+  // categorize the resize into buckets
+  if ( ( vertical_output_size <= 4 ) || ( horizontal_output_size <= 4 ) )
+    v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7;
+  else if ( vertical_scale <= 1.0f )
+    v_classification = ( is_gather ) ? 1 : 0;
+  else if ( vertical_scale <= 2.0f) 
+    v_classification = 2;
+  else if ( vertical_scale <= 3.0f) 
+    v_classification = 3;
+  else if ( vertical_scale <= 4.0f) 
+    v_classification = 5;
+  else 
+    v_classification = 6;
+  
+  // use the right weights
+  weights = weights_table[ v_classification ];
+
+  // this is the costs when you don't take into account modern CPUs with high ipc and simd and caches - wish we had a better estimate
+  h_cost = (float)horizontal_filter_pixel_width * weights[0] + horizontal_scale * (float)vertical_filter_pixel_width * weights[1];
+  v_cost = (float)vertical_filter_pixel_width  * weights[2] + vertical_scale * (float)horizontal_filter_pixel_width * weights[3];
+
+  // use computation estimate to decide vertical first or not
+  vertical_first = ( v_cost <= h_cost ) ? 1 : 0;
+
+  // save these, if requested
+  if ( info )
+  {
+    info->h_cost = h_cost;
+    info->v_cost = v_cost;
+    info->v_resize_classification = v_classification;
+    info->v_first = vertical_first;
+    info->is_gather = is_gather;
+  }
+
+  // and this allows us to override everything for testing (see dotiming.c) 
+  if ( ( info ) && ( info->control_v_first ) ) 
+    vertical_first = ( info->control_v_first == 2 ) ? 1 : 0;
+  
+  return vertical_first;
+}
+
+// layout lookups - must match stbir_internal_pixel_layout
+static unsigned char stbir__pixel_channels[] = {
+  1,2,3,3,4,   // 1ch, 2ch, rgb, bgr, 4ch
+  4,4,4,4,2,2, // RGBA,BGRA,ARGB,ABGR,RA,AR
+  4,4,4,4,2,2, // RGBA_PM,BGRA_PM,ARGB_PM,ABGR_PM,RA_PM,AR_PM
+};
+
+// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
+//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible 
+static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = {
+  STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA, 
+  STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR,
+  STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM,
+};
+
+static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sampler * horizontal, stbir__sampler * vertical, stbir__contributors * conservative, stbir_pixel_layout input_pixel_layout_public, stbir_pixel_layout output_pixel_layout_public, int splits, int new_x, int new_y, int fast_alpha, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO )
+{
+  static char stbir_channel_count_index[8]={ 9,0,1,2, 3,9,9,4 };
+
+  stbir__info * info = 0;
+  void * alloced = 0;
+  int alloced_total = 0;
+  int vertical_first;
+  int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
+
+  int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy
+  int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size ); 
+  stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ];  
+  stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ];
+  int channels = stbir__pixel_channels[ input_pixel_layout ];      
+  int effective_channels = channels;
+  
+  // first figure out what type of alpha weighting to use (if any)
+  if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling
+  {
+    if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
+    {
+      if ( fast_alpha )
+      {
+        alpha_weighting_type = 4;
+      }
+      else
+      {
+        static int fancy_alpha_effective_cnts[6] = { 7, 7, 7, 7, 3, 3 };
+        alpha_weighting_type = 2;
+        effective_channels = fancy_alpha_effective_cnts[ input_pixel_layout - STBIRI_RGBA ];
+      }
+    }
+    else if ( ( input_pixel_layout >= STBIRI_RGBA_PM ) && ( input_pixel_layout <= STBIRI_AR_PM ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) )
+    {
+      // input premult, output non-premult
+      alpha_weighting_type = 3;
+    }
+    else if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA_PM ) && ( output_pixel_layout <= STBIRI_AR_PM ) )
+    {
+      // input non-premult, output premult
+      alpha_weighting_type = 1;
+    }
+  }
+
+  // channel in and out count must match currently
+  if ( channels != stbir__pixel_channels[ output_pixel_layout ] )
+    return 0;
+
+  // get vertical first
+  vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER );
+
+  // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect)
+  decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
+  
+#if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
+  if ( effective_channels == 3 )
+    decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
+#endif  
+
+  ring_buffer_length_bytes = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
+
+  // if we do vertical first, the ring buffer holds a whole decoded line
+  if ( vertical_first )
+    ring_buffer_length_bytes = ( decode_buffer_size + 15 ) & ~15;
+
+  if ( ( ring_buffer_length_bytes & 4095 ) == 0 ) ring_buffer_length_bytes += 64*3; // avoid 4k alias
+
+  // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
+  alloc_ring_buffer_num_entries = vertical->filter_pixel_width + 1;
+
+  // we never need more ring buffer entries than the scanlines we're outputting when in scatter mode
+  if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) )
+    alloc_ring_buffer_num_entries = conservative_split_output_size;
+
+  ring_buffer_size = alloc_ring_buffer_num_entries * ring_buffer_length_bytes;
+
+  // The vertical buffer is used differently, depending on whether we are scattering
+  //   the vertical scanlines, or gathering them.
+  //   If scattering, it's used at the temp buffer to accumulate each output.
+  //   If gathering, it's just the output buffer.
+  vertical_buffer_size = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float);  // extra float for padding
+
+  // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init
+  for(;;)
+  {
+    int i;
+    void * advance_mem = alloced;
+    int copy_horizontal = 0;
+    stbir__sampler * possibly_use_horizontal_for_pivot = 0;
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+    #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; }
+#else
+    #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = ((char*)advance_mem) + (size);
+#endif
+
+    STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info );      
+
+    STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info );      
+
+    if ( info )
+    {
+      static stbir__alpha_weight_func * fancy_alpha_weights[6]  =    { stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_4ch,   stbir__fancy_alpha_weight_2ch,   stbir__fancy_alpha_weight_2ch };
+      static stbir__alpha_unweight_func * fancy_alpha_unweights[6] = { stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_2ch, stbir__fancy_alpha_unweight_2ch };
+      static stbir__alpha_weight_func * simple_alpha_weights[6] = { stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_2ch, stbir__simple_alpha_weight_2ch };
+      static stbir__alpha_unweight_func * simple_alpha_unweights[6] = { stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_2ch, stbir__simple_alpha_unweight_2ch };
+
+      // initialize info fields
+      info->alloced_mem = alloced;
+      info->alloced_total = alloced_total;
+
+      info->channels = channels;
+      info->effective_channels = effective_channels;
+  
+      info->offset_x = new_x;
+      info->offset_y = new_y;
+      info->alloc_ring_buffer_num_entries = alloc_ring_buffer_num_entries;
+      info->ring_buffer_num_entries = 0;  
+      info->ring_buffer_length_bytes = ring_buffer_length_bytes;
+      info->splits = splits;
+      info->vertical_first = vertical_first;
+
+      info->input_pixel_layout_internal = input_pixel_layout;  
+      info->output_pixel_layout_internal = output_pixel_layout;
+
+      // setup alpha weight functions
+      info->alpha_weight = 0;
+      info->alpha_unweight = 0;
+    
+      // handle alpha weighting functions and overrides
+      if ( alpha_weighting_type == 2 )
+      {
+        // high quality alpha multiplying on the way in, dividing on the way out
+        info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];  
+        info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+      else if ( alpha_weighting_type == 4 )
+      {
+        // fast alpha multiplying on the way in, dividing on the way out
+        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];  
+        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+      else if ( alpha_weighting_type == 1 )
+      {
+        // fast alpha on the way in, leave in premultiplied form on way out
+        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; 
+      }
+      else if ( alpha_weighting_type == 3 )
+      {
+        // incoming is premultiplied, fast alpha dividing on the way out - non-premultiplied output
+        info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
+      }
+
+      // handle 3-chan color flipping, using the alpha weight path
+      if ( ( ( input_pixel_layout == STBIRI_RGB ) && ( output_pixel_layout == STBIRI_BGR ) ) ||
+           ( ( input_pixel_layout == STBIRI_BGR ) && ( output_pixel_layout == STBIRI_RGB ) ) )
+      {
+        // do the flipping on the smaller of the two ends
+        if ( horizontal->scale_info.scale < 1.0f )
+          info->alpha_unweight = stbir__simple_flip_3ch;
+        else
+          info->alpha_weight = stbir__simple_flip_3ch;
+      }
+
+    }        
+
+    // get all the per-split buffers
+    for( i = 0 ; i < splits ; i++ )
+    {
+      STBIR__NEXT_PTR( info->split_info[i].decode_buffer, decode_buffer_size, float );
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+
+      #ifdef STBIR_SIMD8
+      if ( ( info ) && ( effective_channels == 3 ) )
+        ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
+      #endif  
+
+      STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* );
+      {
+        int j;
+        for( j = 0 ; j < alloc_ring_buffer_num_entries ; j++ )
+        {
+          STBIR__NEXT_PTR( info->split_info[i].ring_buffers[j], ring_buffer_length_bytes, float );
+          #ifdef STBIR_SIMD8
+          if ( ( info ) && ( effective_channels == 3 ) )
+            ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
+          #endif  
+        }
+      }
+#else
+      STBIR__NEXT_PTR( info->split_info[i].ring_buffer, ring_buffer_size, float );
+#endif
+      STBIR__NEXT_PTR( info->split_info[i].vertical_buffer, vertical_buffer_size, float );
+    }
+
+    // alloc memory for to-be-pivoted coeffs (if necessary)
+    if ( vertical->is_gather == 0 )
+    {
+      int both;
+      int temp_mem_amt;
+
+      // when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after,
+      //   that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that
+      //   is too small, we just allocate extra memory to use as this temp.
+
+      both = vertical->gather_prescatter_contributors_size + vertical->gather_prescatter_coefficients_size;
+
+#ifdef STBIR__SEPARATE_ALLOCATIONS
+      temp_mem_amt = decode_buffer_size;
+#else
+      temp_mem_amt = ( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * splits;
+#endif
+      if ( temp_mem_amt >= both )
+      {
+        if ( info ) 
+        { 
+          vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer; 
+          vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size ); 
+        }
+      }
+      else
+      {
+        // ring+decode memory is too small, so allocate temp memory
+        STBIR__NEXT_PTR( vertical->gather_prescatter_contributors, vertical->gather_prescatter_contributors_size, stbir__contributors );
+        STBIR__NEXT_PTR( vertical->gather_prescatter_coefficients, vertical->gather_prescatter_coefficients_size, float );
+      }
+    }
+
+    STBIR__NEXT_PTR( horizontal->contributors, horizontal->contributors_size, stbir__contributors );
+    STBIR__NEXT_PTR( horizontal->coefficients, horizontal->coefficients_size, float );
+
+    // are the two filters identical?? (happens a lot with mipmap generation)
+    if ( ( horizontal->filter_kernel == vertical->filter_kernel ) && ( horizontal->filter_support == vertical->filter_support ) && ( horizontal->edge == vertical->edge ) && ( horizontal->scale_info.output_sub_size == vertical->scale_info.output_sub_size ) )
+    {
+      float diff_scale = horizontal->scale_info.scale - vertical->scale_info.scale;
+      float diff_shift = horizontal->scale_info.pixel_shift - vertical->scale_info.pixel_shift;
+      if ( diff_scale < 0.0f ) diff_scale = -diff_scale;
+      if ( diff_shift < 0.0f ) diff_shift = -diff_shift;
+      if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) )
+      {
+        if ( horizontal->is_gather == vertical->is_gather ) 
+        {
+          copy_horizontal = 1;
+          goto no_vert_alloc;
+        }
+        // everything matches, but vertical is scatter, horizontal is gather, use horizontal coeffs for vertical pivot coeffs
+        possibly_use_horizontal_for_pivot = horizontal;
+      }
+    }
+
+    STBIR__NEXT_PTR( vertical->contributors, vertical->contributors_size, stbir__contributors );
+    STBIR__NEXT_PTR( vertical->coefficients, vertical->coefficients_size, float );
+
+   no_vert_alloc:
+
+    if ( info )
+    {
+      STBIR_PROFILE_BUILD_START( horizontal );
+
+      stbir__calculate_filters( horizontal, 0, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+
+      // setup the horizontal gather functions
+      // start with defaulting to the n_coeffs functions (specialized on channels and remnant leftover)
+      info->horizontal_gather_channels = stbir__horizontal_gather_n_coeffs_funcs[ effective_channels ][ horizontal->extent_info.widest & 3 ];
+      // but if the number of coeffs <= 12, use another set of special cases. <=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size
+      if ( horizontal->extent_info.widest <= 12 )
+        info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ];
+      
+      info->scanline_extents.conservative.n0 = conservative->n0;
+      info->scanline_extents.conservative.n1 = conservative->n1;
+      
+      // get exact extents
+      stbir__get_extents( horizontal, &info->scanline_extents );
+
+      // pack the horizontal coeffs
+      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n1 + 1 );
+      
+      STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );
+
+      STBIR_PROFILE_BUILD_END( horizontal );
+
+      if ( copy_horizontal )
+      {
+        STBIR_MEMCPY( &info->vertical, horizontal, sizeof( stbir__sampler ) );
+      }
+      else
+      {
+        STBIR_PROFILE_BUILD_START( vertical );
+
+        stbir__calculate_filters( vertical, possibly_use_horizontal_for_pivot, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+        STBIR_MEMCPY( &info->vertical, vertical, sizeof( stbir__sampler ) );
+
+        STBIR_PROFILE_BUILD_END( vertical );
+      }
+
+      // setup the vertical split ranges
+      stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size );
+
+      // now we know precisely how many entries we need
+      info->ring_buffer_num_entries = info->vertical.extent_info.widest;
+
+      // we never need more ring buffer entries than the scanlines we're outputting
+      if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) )
+        info->ring_buffer_num_entries = conservative_split_output_size;
+      STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries );
+
+      // a few of the horizontal gather functions read one dword past the end (but mask it out), so put in a normal value so no snans or denormals accidentally sneak in
+      for( i = 0 ; i < splits ; i++ )
+      {
+        int width, ofs;
+        
+        // find the right most span
+        if ( info->scanline_extents.spans[0].n1 > info->scanline_extents.spans[1].n1 )
+          width = info->scanline_extents.spans[0].n1 - info->scanline_extents.spans[0].n0;
+        else
+          width = info->scanline_extents.spans[1].n1 - info->scanline_extents.spans[1].n0;
+        
+        // this calc finds the exact end of the decoded scanline for all filter modes.
+        //   usually this is just the width * effective channels.  But we have to account 
+        //   for the area to the left of the scanline for wrap filtering and alignment, this 
+        //   is stored as a negative value in info->scanline_extents.conservative.n0. Next,
+        //   we need to skip the exact size of the right hand size filter area (again for
+        //   wrap mode), this is in info->scanline_extents.edge_sizes[1]).
+        ofs = ( width + 1 - info->scanline_extents.conservative.n0 + info->scanline_extents.edge_sizes[1] ) * effective_channels;
+        
+        // place a known, but numerically valid value in the decode buffer
+        info->split_info[i].decode_buffer[ ofs ] = 9999.0f;
+
+        // if vertical filtering first, place a known, but numerically valid value in the all
+        //   of the ring buffer accumulators
+        if ( vertical_first )
+        {
+          int j;  
+          for( j = 0; j < info->ring_buffer_num_entries ; j++ )
+          {
+            stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ ofs ] = 9999.0f;
+          }
+        }
+      }
+    }
+
+    #undef STBIR__NEXT_PTR
+
+
+    // is this the first time through loop?
+    if ( info == 0 )
+    {
+      alloced_total = (int) ( 15 + (size_t)advance_mem );
+      alloced = STBIR_MALLOC( alloced_total, user_data );
+      if ( alloced == 0 )
+        return 0;
+    }
+    else
+      return info;  // success
+  }
+}
+
+static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count ) 
+{
+  stbir__per_split_info * split_info = info->split_info + split_start;
+
+  STBIR_PROFILE_CLEAR_EXTRAS();
+
+  STBIR_PROFILE_FIRST_START( looping );
+  if (info->vertical.is_gather)
+    stbir__vertical_gather_loop( info, split_info, split_count );
+  else
+    stbir__vertical_scatter_loop( info, split_info, split_count );
+  STBIR_PROFILE_END( looping );
+
+  return 1;
+}
+
+static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * resize )
+{
+  static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear, 
+  };
+
+  static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    { /* RGBA */ stbir__decode_uint8_srgb4_linearalpha,      stbir__decode_uint8_srgb,      0, stbir__decode_float_linear,      stbir__decode_half_float_linear },
+    { /* BGRA */ stbir__decode_uint8_srgb4_linearalpha_BGRA, stbir__decode_uint8_srgb_BGRA, 0, stbir__decode_float_linear_BGRA, stbir__decode_half_float_linear_BGRA },
+    { /* ARGB */ stbir__decode_uint8_srgb4_linearalpha_ARGB, stbir__decode_uint8_srgb_ARGB, 0, stbir__decode_float_linear_ARGB, stbir__decode_half_float_linear_ARGB },
+    { /* ABGR */ stbir__decode_uint8_srgb4_linearalpha_ABGR, stbir__decode_uint8_srgb_ABGR, 0, stbir__decode_float_linear_ABGR, stbir__decode_half_float_linear_ABGR },
+    { /* RA   */ stbir__decode_uint8_srgb2_linearalpha,      stbir__decode_uint8_srgb,      0, stbir__decode_float_linear,      stbir__decode_half_float_linear },
+    { /* AR   */ stbir__decode_uint8_srgb2_linearalpha_AR,   stbir__decode_uint8_srgb_AR,   0, stbir__decode_float_linear_AR,   stbir__decode_half_float_linear_AR },
+  };
+
+  static stbir__decode_pixels_func * decode_simple_scaled_or_not[2][2]=
+  {
+    { stbir__decode_uint8_linear_scaled,  stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear },
+  };
+
+  static stbir__decode_pixels_func * decode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
+  {
+    { /* RGBA */ { stbir__decode_uint8_linear_scaled,       stbir__decode_uint8_linear },      { stbir__decode_uint16_linear_scaled,      stbir__decode_uint16_linear } },
+    { /* BGRA */ { stbir__decode_uint8_linear_scaled_BGRA,  stbir__decode_uint8_linear_BGRA }, { stbir__decode_uint16_linear_scaled_BGRA, stbir__decode_uint16_linear_BGRA } },
+    { /* ARGB */ { stbir__decode_uint8_linear_scaled_ARGB,  stbir__decode_uint8_linear_ARGB }, { stbir__decode_uint16_linear_scaled_ARGB, stbir__decode_uint16_linear_ARGB } },
+    { /* ABGR */ { stbir__decode_uint8_linear_scaled_ABGR,  stbir__decode_uint8_linear_ABGR }, { stbir__decode_uint16_linear_scaled_ABGR, stbir__decode_uint16_linear_ABGR } },
+    { /* RA   */ { stbir__decode_uint8_linear_scaled,       stbir__decode_uint8_linear },      { stbir__decode_uint16_linear_scaled,      stbir__decode_uint16_linear } },
+    { /* AR   */ { stbir__decode_uint8_linear_scaled_AR,    stbir__decode_uint8_linear_AR },   { stbir__decode_uint16_linear_scaled_AR,   stbir__decode_uint16_linear_AR } }
+  };
+
+  static stbir__encode_pixels_func * encode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    /* 1ch-4ch */ stbir__encode_uint8_srgb, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear,
+  };
+
+  static stbir__encode_pixels_func * encode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
+  {
+    { /* RGBA */ stbir__encode_uint8_srgb4_linearalpha,      stbir__encode_uint8_srgb,      0, stbir__encode_float_linear,      stbir__encode_half_float_linear },
+    { /* BGRA */ stbir__encode_uint8_srgb4_linearalpha_BGRA, stbir__encode_uint8_srgb_BGRA, 0, stbir__encode_float_linear_BGRA, stbir__encode_half_float_linear_BGRA },
+    { /* ARGB */ stbir__encode_uint8_srgb4_linearalpha_ARGB, stbir__encode_uint8_srgb_ARGB, 0, stbir__encode_float_linear_ARGB, stbir__encode_half_float_linear_ARGB },
+    { /* ABGR */ stbir__encode_uint8_srgb4_linearalpha_ABGR, stbir__encode_uint8_srgb_ABGR, 0, stbir__encode_float_linear_ABGR, stbir__encode_half_float_linear_ABGR },
+    { /* RA   */ stbir__encode_uint8_srgb2_linearalpha,      stbir__encode_uint8_srgb,      0, stbir__encode_float_linear,      stbir__encode_half_float_linear },
+    { /* AR   */ stbir__encode_uint8_srgb2_linearalpha_AR,   stbir__encode_uint8_srgb_AR,   0, stbir__encode_float_linear_AR,   stbir__encode_half_float_linear_AR }
+  };
+
+  static stbir__encode_pixels_func * encode_simple_scaled_or_not[2][2]=
+  {
+    { stbir__encode_uint8_linear_scaled,  stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear },
+  };
+
+  static stbir__encode_pixels_func * encode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]=
+  {
+    { /* RGBA */ { stbir__encode_uint8_linear_scaled,       stbir__encode_uint8_linear },       { stbir__encode_uint16_linear_scaled,      stbir__encode_uint16_linear } },
+    { /* BGRA */ { stbir__encode_uint8_linear_scaled_BGRA,  stbir__encode_uint8_linear_BGRA },  { stbir__encode_uint16_linear_scaled_BGRA, stbir__encode_uint16_linear_BGRA } },
+    { /* ARGB */ { stbir__encode_uint8_linear_scaled_ARGB,  stbir__encode_uint8_linear_ARGB },  { stbir__encode_uint16_linear_scaled_ARGB, stbir__encode_uint16_linear_ARGB } },
+    { /* ABGR */ { stbir__encode_uint8_linear_scaled_ABGR,  stbir__encode_uint8_linear_ABGR },  { stbir__encode_uint16_linear_scaled_ABGR, stbir__encode_uint16_linear_ABGR } },
+    { /* RA   */ { stbir__encode_uint8_linear_scaled,       stbir__encode_uint8_linear },       { stbir__encode_uint16_linear_scaled,      stbir__encode_uint16_linear } },
+    { /* AR   */ { stbir__encode_uint8_linear_scaled_AR,    stbir__encode_uint8_linear_AR },    { stbir__encode_uint16_linear_scaled_AR,   stbir__encode_uint16_linear_AR } }
+  };
+
+  stbir__decode_pixels_func * decode_pixels = 0;
+  stbir__encode_pixels_func * encode_pixels = 0;
+  stbir_datatype input_type, output_type;
+
+  input_type = resize->input_data_type;
+  output_type = resize->output_data_type; 
+  info->input_data = resize->input_pixels;
+  info->input_stride_bytes = resize->input_stride_in_bytes;
+  info->output_stride_bytes = resize->output_stride_in_bytes;
+
+  // if we're completely point sampling, then we can turn off SRGB
+  if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) )
+  {
+    if ( ( ( input_type  == STBIR_TYPE_UINT8_SRGB ) || ( input_type  == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) && 
+         ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) )
+    {
+      input_type = STBIR_TYPE_UINT8;
+      output_type = STBIR_TYPE_UINT8;
+    }
+  }
+
+  // recalc the output and input strides  
+  if ( info->input_stride_bytes == 0 )
+    info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type];
+
+  if ( info->output_stride_bytes == 0 )
+    info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];
+
+  // calc offset
+  info->output_data = ( (char*) resize->output_pixels ) + ( (ptrdiff_t) info->offset_y * (ptrdiff_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
+
+  info->in_pixels_cb = resize->input_cb;
+  info->user_data = resize->user_data;
+  info->out_pixels_cb = resize->output_cb;
+
+  // setup the input format converters
+  if ( ( input_type == STBIR_TYPE_UINT8 ) || ( input_type == STBIR_TYPE_UINT16 ) )
+  {
+    int non_scaled = 0;
+
+    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
+    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight )  ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
+      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
+        non_scaled = 1;
+
+    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
+      decode_pixels = decode_simple_scaled_or_not[ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+    else
+      decode_pixels = decode_alphas_scaled_or_not[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+  }
+  else
+  {
+    if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL )
+      decode_pixels = decode_simple[ input_type - STBIR_TYPE_UINT8_SRGB ];
+    else
+      decode_pixels = decode_alphas[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type - STBIR_TYPE_UINT8_SRGB ];
+  }
+
+  // setup the output format converters
+  if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) )
+  {
+    int non_scaled = 0;
+    
+    // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
+    if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
+      if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
+        non_scaled = 1;
+
+    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
+      encode_pixels = encode_simple_scaled_or_not[ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+    else
+      encode_pixels = encode_alphas_scaled_or_not[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type == STBIR_TYPE_UINT16 ][ non_scaled ];
+  }
+  else
+  {
+    if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL )
+      encode_pixels = encode_simple[ output_type - STBIR_TYPE_UINT8_SRGB ];
+    else
+      encode_pixels = encode_alphas[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type - STBIR_TYPE_UINT8_SRGB ];
+  }
+
+  info->input_type = input_type;
+  info->output_type = output_type; 
+  info->decode_pixels = decode_pixels;
+  info->encode_pixels = encode_pixels; 
+}
+
+static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 )
+{
+  double per, adj;
+  int over;
+  
+  // do left/top edge
+  if ( *outx < 0 )
+  {
+    per = ( (double)*outx ) / ( (double)*outsubw ); // is negative
+    adj = per * ( *u1 - *u0 );
+    *u0 -= adj; // increases u0
+    *outx = 0;
+  }
+
+  // do right/bot edge
+  over = outw - ( *outx + *outsubw );
+  if ( over < 0 )
+  {
+    per = ( (double)over ) / ( (double)*outsubw ); // is negative
+    adj = per * ( *u1 - *u0 );
+    *u1 += adj; // decrease u1
+    *outsubw = outw - *outx;
+  }
+}    
+
+// converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so)
+static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom ) // limit_denom (1) or limit numer (0)
+{
+  double err;
+  stbir_uint64 top, bot;
+  stbir_uint64 numer_last = 0;
+  stbir_uint64 denom_last = 1;
+  stbir_uint64 numer_estimate = 1;
+  stbir_uint64 denom_estimate = 0;
+
+  // scale to past float error range
+  top = (stbir_uint64)( f * (double)(1 << 25) );
+  bot = 1 << 25;
+
+  // keep refining, but usually stops in a few loops - usually 5 for bad cases
+  for(;;)  
+  {
+    stbir_uint64 est, temp;
+
+    // hit limit, break out and do best full range estimate
+    if ( ( ( limit_denom ) ? denom_estimate : numer_estimate ) >= limit )
+      break;
+
+    // is the current error less than 1 bit of a float? if so, we're done
+    if ( denom_estimate )
+    {
+      err = ( (double)numer_estimate / (double)denom_estimate ) - f;
+      if ( err < 0.0 ) err = -err;
+      if ( err < ( 1.0 / (double)(1<<24) ) )
+      {
+        // yup, found it
+        *numer = (stbir_uint32) numer_estimate;
+        *denom = (stbir_uint32) denom_estimate;
+        return 1;
+      }
+    }
+
+    // no more refinement bits left? break out and do full range estimate
+    if ( bot == 0 )
+      break;
+
+    // gcd the estimate bits
+    est = top / bot;
+    temp = top % bot;
+    top = bot;
+    bot = temp;
+
+    // move remainders
+    temp = est * denom_estimate + denom_last; 
+    denom_last = denom_estimate; 
+    denom_estimate = temp;
+
+    // move remainders
+    temp = est * numer_estimate + numer_last; 
+    numer_last = numer_estimate; 
+    numer_estimate = temp;
+  }
+
+  // we didn't fine anything good enough for float, use a full range estimate
+  if ( limit_denom )
+  {
+    numer_estimate= (stbir_uint64)( f * (double)limit + 0.5 );
+    denom_estimate = limit;
+  }
+  else
+  {
+    numer_estimate = limit;
+    denom_estimate = (stbir_uint64)( ( (double)limit / f ) + 0.5 );
+  }
+
+  *numer = (stbir_uint32) numer_estimate;
+  *denom = (stbir_uint32) denom_estimate;
+
+  err = ( denom_estimate ) ? ( ( (double)(stbir_uint32)numer_estimate / (double)(stbir_uint32)denom_estimate ) - f ) : 1.0;
+  if ( err < 0.0 ) err = -err;
+  return ( err < ( 1.0 / (double)(1<<24) ) ) ? 1 : 0;
+}
+
+static int stbir__calculate_region_transform( stbir__scale_info * scale_info, int output_full_range, int * output_offset, int output_sub_range, int input_full_range, double input_s0, double input_s1 )
+{
+  double output_range, input_range, output_s, input_s, ratio, scale;
+
+  input_s = input_s1 - input_s0;
+
+  // null area
+  if ( ( output_full_range == 0 ) || ( input_full_range == 0 ) ||
+       ( output_sub_range == 0 ) || ( input_s <= stbir__small_float ) )
+    return 0;
+
+  // are either of the ranges completely out of bounds?
+  if ( ( *output_offset >= output_full_range ) || ( ( *output_offset + output_sub_range ) <= 0 ) || ( input_s0 >= (1.0f-stbir__small_float) ) || ( input_s1 <= stbir__small_float ) )
+    return 0;
+
+  output_range = (double)output_full_range;
+  input_range = (double)input_full_range;
+
+  output_s = ( (double)output_sub_range) / output_range;
+
+  // figure out the scaling to use 
+  ratio = output_s / input_s; 
+
+  // save scale before clipping
+  scale = ( output_range / input_range ) * ratio; 
+  scale_info->scale = (float)scale;
+  scale_info->inv_scale = (float)( 1.0 / scale );
+
+  // clip output area to left/right output edges (and adjust input area)
+  stbir__clip( output_offset, &output_sub_range, output_full_range, &input_s0, &input_s1 );
+
+  // recalc input area
+  input_s = input_s1 - input_s0;
+
+  // after clipping do we have zero input area?
+  if ( input_s <= stbir__small_float ) 
+    return 0;
+
+  // calculate and store the starting source offsets in output pixel space 
+  scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range ); 
+
+  scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) );
+
+  scale_info->input_full_size = input_full_range;
+  scale_info->output_sub_size = output_sub_range;
+
+  return 1;
+}
+
+
+static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layout pixel_layout, stbir_datatype data_type )
+{
+  resize->input_cb = 0;
+  resize->output_cb = 0;
+  resize->user_data = resize;
+  resize->samplers = 0;
+  resize->needs_rebuild = 1;
+  resize->called_alloc = 0;
+  resize->horizontal_filter = STBIR_FILTER_DEFAULT;
+  resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0;
+  resize->vertical_filter = STBIR_FILTER_DEFAULT;
+  resize->vertical_filter_kernel = 0; resize->vertical_filter_support = 0;
+  resize->horizontal_edge = STBIR_EDGE_CLAMP;
+  resize->vertical_edge = STBIR_EDGE_CLAMP;
+  resize->input_s0 = 0; resize->input_t0 = 0; resize->input_s1 = 1; resize->input_t1 = 1;
+  resize->output_subx = 0; resize->output_suby = 0; resize->output_subw = resize->output_w; resize->output_subh = resize->output_h;
+  resize->input_data_type = data_type;
+  resize->output_data_type = data_type;
+  resize->input_pixel_layout_public = pixel_layout;
+  resize->output_pixel_layout_public = pixel_layout;
+}
+
+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, 
+                                 const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
+                                       void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
+                                 stbir_pixel_layout pixel_layout, stbir_datatype data_type )
+{
+  resize->input_pixels = input_pixels;
+  resize->input_w = input_w;
+  resize->input_h = input_h;
+  resize->input_stride_in_bytes = input_stride_in_bytes;
+  resize->output_pixels = output_pixels;
+  resize->output_w = output_w;
+  resize->output_h = output_h;
+  resize->output_stride_in_bytes = output_stride_in_bytes;
+  resize->fast_alpha = 0;
+
+  stbir__init_and_set_layout( resize, pixel_layout, data_type );
+}
+
+// You can update parameters any time after resize_init
+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type )  // by default, datatype from resize_init
+{
+  resize->input_data_type = input_type;
+  resize->output_data_type = output_type;
+}
+
+STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb )   // no callbacks by default
+{
+  resize->input_cb = input_cb;
+  resize->output_cb = output_cb;
+}
+
+STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data )                                     // pass back STBIR_RESIZE* by default
+{
+  resize->user_data = user_data;
+}
+
+STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes )
+{
+  resize->input_pixels = input_pixels;
+  resize->input_stride_in_bytes = input_stride_in_bytes;
+  resize->output_pixels = output_pixels;
+  resize->output_stride_in_bytes = output_stride_in_bytes;
+}
+
+
+STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge )       // CLAMP by default
+{
+  resize->horizontal_edge = horizontal_edge;
+  resize->vertical_edge = vertical_edge;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ) // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
+{
+  resize->horizontal_filter = horizontal_filter;
+  resize->vertical_filter = vertical_filter;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support )
+{
+  resize->horizontal_filter_kernel = horizontal_filter; resize->horizontal_filter_support = horizontal_support;
+  resize->vertical_filter_kernel = vertical_filter; resize->vertical_filter_support = vertical_support;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout )   // sets new pixel layouts
+{
+  resize->input_pixel_layout_public = input_pixel_layout;
+  resize->output_pixel_layout_public = output_pixel_layout;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+
+STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality )   // sets alpha speed
+{
+  resize->fast_alpha = non_pma_alpha_speed_over_quality;
+  resize->needs_rebuild = 1;
+  return 1;
+}
+
+STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 )                 // sets input region (full region by default)
+{
+  resize->input_s0 = s0;
+  resize->input_t0 = t0;
+  resize->input_s1 = s1;
+  resize->input_t1 = t1;
+  resize->needs_rebuild = 1;
+
+  // are we inbounds?
+  if ( ( s1 < stbir__small_float ) || ( (s1-s0) < stbir__small_float ) ||
+       ( t1 < stbir__small_float ) || ( (t1-t0) < stbir__small_float ) ||
+       ( s0 > (1.0f-stbir__small_float) ) ||
+       ( t0 > (1.0f-stbir__small_float) ) )
+    return 0;
+
+  return 1;
+}
+
+STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )          // sets input region (full region by default)
+{
+  resize->output_subx = subx;
+  resize->output_suby = suby;
+  resize->output_subw = subw;
+  resize->output_subh = subh;
+  resize->needs_rebuild = 1;
+
+  // are we inbounds?
+  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
+    return 0;
+
+  return 1;
+}
+
+STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh )                 // sets both regions (full regions by default)
+{
+  double s0, t0, s1, t1;
+
+  s0 = ( (double)subx ) / ( (double)resize->output_w );
+  t0 = ( (double)suby ) / ( (double)resize->output_h );
+  s1 = ( (double)(subx+subw) ) / ( (double)resize->output_w );
+  t1 = ( (double)(suby+subh) ) / ( (double)resize->output_h );
+
+  resize->input_s0 = s0;
+  resize->input_t0 = t0;
+  resize->input_s1 = s1;
+  resize->input_t1 = t1;
+  resize->output_subx = subx;
+  resize->output_suby = suby;
+  resize->output_subw = subw;
+  resize->output_subh = subh;
+  resize->needs_rebuild = 1;
+
+  // are we inbounds?
+  if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) )
+    return 0;
+
+  return 1;
+}
+
+static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) 
+{
+  stbir__contributors conservative = { 0, 0 };
+  stbir__sampler horizontal, vertical;
+  int new_output_subx, new_output_suby;
+  stbir__info * out_info;
+  #ifdef STBIR_PROFILE
+  stbir__info profile_infod;  // used to contain building profile info before everything is allocated
+  stbir__info * profile_info = &profile_infod;
+  #endif
+
+  // have we already built the samplers?
+  if ( resize->samplers )
+    return 0;
+  
+  #define STBIR_RETURN_ERROR_AND_ASSERT( exp )  STBIR_ASSERT( !(exp) ); if (exp) return 0;
+  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER)
+  STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER)
+  #undef STBIR_RETURN_ERROR_AND_ASSERT
+
+  if ( splits <= 0 ) 
+    return 0;
+
+  STBIR_PROFILE_BUILD_FIRST_START( build );
+
+  new_output_subx = resize->output_subx;
+  new_output_suby = resize->output_suby;
+
+  // do horizontal clip and scale calcs
+  if ( !stbir__calculate_region_transform( &horizontal.scale_info, resize->output_w, &new_output_subx, resize->output_subw, resize->input_w, resize->input_s0, resize->input_s1 ) )
+    return 0;
+
+  // do vertical clip and scale calcs
+  if ( !stbir__calculate_region_transform( &vertical.scale_info, resize->output_h, &new_output_suby, resize->output_subh, resize->input_h, resize->input_t0, resize->input_t1 ) )
+    return 0;
+
+  // if nothing to do, just return
+  if ( ( horizontal.scale_info.output_sub_size == 0 ) || ( vertical.scale_info.output_sub_size == 0 ) )
+    return 0;
+
+  stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data );
+  stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data );
+  stbir__set_sampler(&vertical, resize->vertical_filter, resize->horizontal_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data );
+
+  if ( ( vertical.scale_info.output_sub_size / splits ) < 4 ) // each split should be a minimum of 4 scanlines (handwavey choice)
+  {
+    splits = vertical.scale_info.output_sub_size / 4;
+    if ( splits == 0 ) splits = 1;
+  }
+
+  STBIR_PROFILE_BUILD_START( alloc );
+  out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
+  STBIR_PROFILE_BUILD_END( alloc );
+  STBIR_PROFILE_BUILD_END( build );
+ 
+  if ( out_info )
+  {
+    resize->splits = splits;
+    resize->samplers = out_info;
+    resize->needs_rebuild = 0;
+    #ifdef STBIR_PROFILE
+      STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) );
+    #endif
+    return splits;
+  }
+
+  return 0;
+}
+
+void stbir_free_samplers( STBIR_RESIZE * resize )
+{
+  if ( resize->samplers )
+  {
+    stbir__free_internal_mem( resize->samplers );
+    resize->samplers = 0;
+    resize->called_alloc = 0;
+  }
+}
+
+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits )
+{
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+  {
+    if ( resize->samplers )
+      stbir_free_samplers( resize );
+
+    resize->called_alloc = 1;
+    return stbir__perform_build( resize, splits );
+  }
+
+  STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
+  
+  return 1;
+}
+
+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize )
+{
+  return stbir_build_samplers_with_splits( resize, 1 );
+}
+
+STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
+{
+  int result;
+  
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+  {
+    int alloc_state = resize->called_alloc;  // remember allocated state
+
+    if ( resize->samplers )
+    {
+      stbir__free_internal_mem( resize->samplers );
+      resize->samplers = 0;
+    }
+
+    if ( !stbir_build_samplers( resize ) )
+      return 0;
+    
+    resize->called_alloc = alloc_state;
+
+    // if build_samplers succeeded (above), but there are no samplers set, then 
+    //   the area to stretch into was zero pixels, so don't do anything and return
+    //   success
+    if ( resize->samplers == 0 )
+      return 1;
+  }
+  else
+  {
+    // didn't build anything - clear it
+    STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
+  }
+
+
+  // update anything that can be changed without recalcing samplers
+  stbir__update_info_from_resize( resize->samplers, resize );
+
+  // do resize
+  result = stbir__perform_resize( resize->samplers, 0, resize->splits );
+
+  // if we alloced, then free
+  if ( !resize->called_alloc )
+  {
+    stbir_free_samplers( resize );
+    resize->samplers = 0;
+  } 
+
+  return result;
+}
+
+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count )
+{
+  STBIR_ASSERT( resize->samplers );
+
+  // if we're just doing the whole thing, call full
+  if ( ( split_start == -1 ) || ( ( split_start == 0 ) && ( split_count == resize->splits ) ) )
+    return stbir_resize_extended( resize );
+
+  // you **must** build samplers first when using split resize
+  if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
+    return 0; 
+    
+  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
+    return 0;
+  
+  // update anything that can be changed without recalcing samplers
+  stbir__update_info_from_resize( resize->samplers, resize );
+ 
+  // do resize
+  return stbir__perform_resize( resize->samplers, split_start, split_count );
+}
+
+static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * output_pixels, int type_size, int output_w, int output_h, int output_stride_in_bytes, stbir_internal_pixel_layout pixel_layout )
+{
+  size_t size;
+  int pitch;
+  void * ptr;
+
+  pitch = output_w * type_size * stbir__pixel_channels[ pixel_layout ];
+  if ( pitch == 0 )
+    return 0;
+
+  if ( output_stride_in_bytes == 0 )
+    output_stride_in_bytes = pitch;
+
+  if ( output_stride_in_bytes < pitch )
+    return 0;
+
+  size = output_stride_in_bytes * output_h;
+  if ( size == 0 )
+    return 0;
+
+  *ret_ptr = 0;
+  *ret_pitch = output_stride_in_bytes;
+
+  if ( output_pixels == 0 )
+  {
+    ptr = STBIR_MALLOC( size, 0 );
+    if ( ptr == 0 )
+      return 0;
+
+    *ret_ptr = ptr;
+    *ret_pitch = pitch;
+  }
+
+  return 1;  
+}
+
+
+STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                          unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                          stbir_pixel_layout pixel_layout )
+{
+  STBIR_RESIZE resize;
+  unsigned char * optr;
+  int opitch;
+
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize, 
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+                     pixel_layout, STBIR_TYPE_UINT8 );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                        unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                        stbir_pixel_layout pixel_layout )
+{
+  STBIR_RESIZE resize;
+  unsigned char * optr;
+  int opitch;
+
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize, 
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+                     pixel_layout, STBIR_TYPE_UINT8_SRGB );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+
+STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                                  float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                                  stbir_pixel_layout pixel_layout )
+{
+  STBIR_RESIZE resize;
+  float * optr;
+  int opitch;
+
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( float ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize, 
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+                     pixel_layout, STBIR_TYPE_FLOAT );
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+
+STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
+                                    void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                              stbir_pixel_layout pixel_layout, stbir_datatype data_type, 
+                              stbir_edge edge, stbir_filter filter )
+{
+  STBIR_RESIZE resize;
+  float * optr;
+  int opitch;
+
+  if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, stbir__type_size[data_type], output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
+    return 0;
+
+  stbir_resize_init( &resize, 
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
+                     (optr) ? optr : output_pixels, output_w, output_h, output_stride_in_bytes, 
+                     pixel_layout, data_type );
+
+  resize.horizontal_edge = edge;
+  resize.vertical_edge = edge;
+  resize.horizontal_filter = filter;
+  resize.vertical_filter = filter;
+
+  if ( !stbir_resize_extended( &resize ) )
+  {
+    if ( optr )
+      STBIR_FREE( optr, 0 );
+    return 0;
+  }
+
+  return (optr) ? optr : output_pixels;
+}
+
+#ifdef STBIR_PROFILE
+
+STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
+{
+  static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient piovot" } ;
+  stbir__info* samp = resize->samplers;
+  int i;
+
+  typedef int testa[ (STBIR__ARRAY_SIZE( bdescriptions ) == (STBIR__ARRAY_SIZE( samp->profile.array )-1) )?1:-1];
+  typedef int testb[ (sizeof( samp->profile.array ) == (sizeof(samp->profile.named)) )?1:-1];
+  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(samp->profile.named)) )?1:-1];
+
+  for( i = 0 ; i < STBIR__ARRAY_SIZE( bdescriptions ) ; i++)
+    info->clocks[i] = samp->profile.array[i+1];
+
+  info->total_clocks = samp->profile.named.total;
+  info->descriptions = bdescriptions;
+  info->count = STBIR__ARRAY_SIZE( bdescriptions );
+}
+
+STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize, int split_start, int split_count )
+{
+  static char const * descriptions[7] = { "Looping", "Vertical sampling", "Horizontal sampling", "Scanline input", "Scanline output", "Alpha weighting", "Alpha unweighting" };
+  stbir__per_split_info * split_info;
+  int s, i;
+
+  typedef int testa[ (STBIR__ARRAY_SIZE( descriptions ) == (STBIR__ARRAY_SIZE( split_info->profile.array )-1) )?1:-1];
+  typedef int testb[ (sizeof( split_info->profile.array ) == (sizeof(split_info->profile.named)) )?1:-1];
+  typedef int testc[ (sizeof( info->clocks ) >= (sizeof(split_info->profile.named)) )?1:-1];
+
+  if ( split_start == -1 )
+  {
+    split_start = 0;
+    split_count = resize->samplers->splits;
+  }
+
+  if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
+  {
+    info->total_clocks = 0;
+    info->descriptions = 0;
+    info->count = 0;
+    return;
+  }
+
+  split_info = resize->samplers->split_info + split_start;
+
+  // sum up the profile from all the splits
+  for( i = 0 ; i < STBIR__ARRAY_SIZE( descriptions ) ; i++ )
+  {
+    stbir_uint64 sum = 0;
+    for( s = 0 ; s < split_count ; s++ )
+      sum += split_info[s].profile.array[i+1];
+    info->clocks[i] = sum;
+  }
+
+  info->total_clocks = split_info->profile.named.total;
+  info->descriptions = descriptions;
+  info->count = STBIR__ARRAY_SIZE( descriptions );
+}
+
+STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize )
+{
+  stbir_resize_split_profile_info( info, resize, -1, 0 );
+}
+
+#endif // STBIR_PROFILE
+
+#undef STBIR_BGR
+#undef STBIR_1CHANNEL
+#undef STBIR_2CHANNEL
+#undef STBIR_RGB
+#undef STBIR_RGBA
+#undef STBIR_4CHANNEL
+#undef STBIR_BGRA
+#undef STBIR_ARGB
+#undef STBIR_ABGR
+#undef STBIR_RA
+#undef STBIR_AR
+#undef STBIR_RGBA_PM
+#undef STBIR_BGRA_PM
+#undef STBIR_ARGB_PM
+#undef STBIR_ABGR_PM
+#undef STBIR_RA_PM
+#undef STBIR_AR_PM
+
+#endif // STB_IMAGE_RESIZE_IMPLEMENTATION
+
+#else  // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS
+
+// we reinclude the header file to define all the horizontal functions
+//   specializing each function for the number of coeffs is 20-40% faster *OVERALL* 
+
+// by including the header file again this way, we can still debug the functions
+
+#define STBIR_strs_join2( start, mid, end ) start##mid##end
+#define STBIR_strs_join1( start, mid, end ) STBIR_strs_join2( start, mid, end )
+
+#define STBIR_strs_join24( start, mid1, mid2, end ) start##mid1##mid2##end
+#define STBIR_strs_join14( start, mid1, mid2, end ) STBIR_strs_join24( start, mid1, mid2, end )
+
+#ifdef STB_IMAGE_RESIZE_DO_CODERS
+
+#ifdef stbir__decode_suffix
+#define STBIR__CODER_NAME( name ) STBIR_strs_join1( name, _, stbir__decode_suffix )
+#else
+#define STBIR__CODER_NAME( name ) name
+#endif
+
+#ifdef stbir__decode_swizzle
+#define stbir__decode_simdf8_flip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3),stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
+#define stbir__decode_simdf4_flip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg)
+#define stbir__encode_simdf8_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3),stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
+#define stbir__encode_simdf4_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg)
+#else
+#define stbir__decode_order0 0
+#define stbir__decode_order1 1
+#define stbir__decode_order2 2
+#define stbir__decode_order3 3
+#define stbir__encode_order0 0
+#define stbir__encode_order1 1
+#define stbir__encode_order2 2
+#define stbir__encode_order3 3
+#define stbir__decode_simdf8_flip(reg)
+#define stbir__decode_simdf4_flip(reg) 
+#define stbir__encode_simdf8_unflip(reg)
+#define stbir__encode_simdf4_unflip(reg) 
+#endif
+
+#ifdef STBIR_SIMD8
+#define stbir__encode_simdfX_unflip  stbir__encode_simdf8_unflip
+#else
+#define stbir__encode_simdfX_unflip  stbir__encode_simdf4_unflip
+#endif 
+
+static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const*)inputp;
+
+  #ifdef STBIR_SIMD
+  unsigned char const * end_input_m16 = input + width_times_channels - 16;
+  if ( width_times_channels >= 16 )
+  {
+    decode_end -= 16;
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o0,o1;
+      stbir__simdf8 of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
+      stbir__simdi8_convert_i32_to_float( of0, o0 );
+      stbir__simdi8_convert_i32_to_float( of1, o1 );
+      stbir__simdf8_mult( of0, of0, STBIR_max_uint8_as_float_inverted8);
+      stbir__simdf8_mult( of1, of1, STBIR_max_uint8_as_float_inverted8);
+      stbir__decode_simdf8_flip( of0 );
+      stbir__decode_simdf8_flip( of1 );
+      stbir__simdf8_store( decode + 0, of0 );
+      stbir__simdf8_store( decode + 8, of1 );
+      #else
+      stbir__simdi i, o0, o1, o2, o3;
+      stbir__simdf of0, of1, of2, of3;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdi_convert_i32_to_float( of2, o2 );
+      stbir__simdi_convert_i32_to_float( of3, o3 );
+      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of2, of2, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__simdf_mult( of3, of3, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) );
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__decode_simdf4_flip( of2 );
+      stbir__decode_simdf4_flip( of3 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      stbir__simdf_store( decode + 8,  of2 );
+      stbir__simdf_store( decode + 12, of3 );
+      #endif
+      decode += 16;
+      input += 16;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 16 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
+    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
+    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
+    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint8_as_float_inverted;
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted;
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted;
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted;
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
+  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= stbir__simdfX_float_count*2 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+    end_output -= stbir__simdfX_float_count*2;
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      stbir__simdi i;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode );
+      stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode+stbir__simdfX_float_count );
+      stbir__encode_simdfX_unflip( e0 );
+      stbir__encode_simdfX_unflip( e1 );
+      #ifdef STBIR_SIMD8
+      stbir__simdf8_pack_to_16bytes( i, e0, e1 ); 
+      stbir__simdi_store( output, i );
+      #else
+      stbir__simdf_pack_to_8bytes( i, e0, e1 ); 
+      stbir__simdi_store2( output, i );
+      #endif
+      encode += stbir__simdfX_float_count*2;
+      output += stbir__simdfX_float_count*2;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    stbir__simdi i0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+    stbir__simdf_madd( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), e0 );
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_pack_to_8bytes( i0, e0, e0 );  // only use first 4
+    *(int*)(output-4) = stbir__simdi_to_int( i0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    stbir__simdf e0; 
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 );
+    #if stbir__coder_min_num >= 2
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_uint8( e0 );
+    #endif
+    #if stbir__coder_min_num >= 3
+    stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_uint8( e0 );
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
+    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
+    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
+    f = encode[stbir__encode_order3] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const*)inputp;
+
+  #ifdef STBIR_SIMD
+  unsigned char const * end_input_m16 = input + width_times_channels - 16;
+  if ( width_times_channels >= 16 )
+  {
+    decode_end -= 16;
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o0,o1;
+      stbir__simdf8 of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u8_to_u32( o0, o1, i );
+      stbir__simdi8_convert_i32_to_float( of0, o0 );
+      stbir__simdi8_convert_i32_to_float( of1, o1 );
+      stbir__decode_simdf8_flip( of0 );
+      stbir__decode_simdf8_flip( of1 );
+      stbir__simdf8_store( decode + 0, of0 );
+      stbir__simdf8_store( decode + 8, of1 );
+      #else
+      stbir__simdi i, o0, o1, o2, o3;
+      stbir__simdf of0, of1, of2, of3;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i);
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdi_convert_i32_to_float( of2, o2 );
+      stbir__simdi_convert_i32_to_float( of3, o3 );
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__decode_simdf4_flip( of2 );
+      stbir__decode_simdf4_flip( of3 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      stbir__simdf_store( decode + 8,  of2 );
+      stbir__simdf_store( decode + 12, of3 );
+#endif
+      decode += 16;
+      input += 16;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 16 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0]));
+    decode[1-4] = ((float)(input[stbir__decode_order1]));
+    decode[2-4] = ((float)(input[stbir__decode_order2]));
+    decode[3-4] = ((float)(input[stbir__decode_order3]));
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0]));
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1]));
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2]));
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp;
+  unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= stbir__simdfX_float_count*2 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+    end_output -= stbir__simdfX_float_count*2;
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      stbir__simdi i;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
+      stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
+      stbir__encode_simdfX_unflip( e0 );
+      stbir__encode_simdfX_unflip( e1 );
+      #ifdef STBIR_SIMD8
+      stbir__simdf8_pack_to_16bytes( i, e0, e1 ); 
+      stbir__simdi_store( output, i );
+      #else
+      stbir__simdf_pack_to_8bytes( i, e0, e1 ); 
+      stbir__simdi_store2( output, i );
+      #endif
+      encode += stbir__simdfX_float_count*2;
+      output += stbir__simdfX_float_count*2;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    stbir__simdi i0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+    stbir__simdf_add( e0, STBIR__CONSTF(STBIR_simd_point5), e0 );
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_pack_to_8bytes( i0, e0, e0 );  // only use first 4
+    *(int*)(output-4) = stbir__simdi_to_int( i0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f;
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f;
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f;
+    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float const * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const *)inputp;
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
+    decode[1-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
+    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
+    decode[3-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order3 ] ];
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ];
+    #if stbir__coder_min_num >= 2
+    decode[1] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ];
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ];
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+#define stbir__min_max_shift20( i, f ) \
+    stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \
+    stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one  )) ); \
+    stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 ); 
+
+#define stbir__scale_and_convert( i, f ) \
+    stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \
+    stbir__simdf_max( f, f, stbir__simdf_zeroP() ); \
+    stbir__simdf_min( f, f, STBIR__CONSTF( STBIR_max_uint8_as_float ) ); \
+    stbir__simdf_convert_float_to_i32( i, f );
+
+#define stbir__linear_to_srgb_finish( i, f ) \
+{ \
+    stbir__simdi temp;  \
+    stbir__simdi_32shr( temp, stbir_simdi_castf( f ), 12 ) ; \
+    stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mastissa_mask) ); \
+    stbir__simdi_or( temp, temp, STBIR__CONSTI(STBIR_topscale) ); \
+    stbir__simdi_16madd( i, i, temp ); \
+    stbir__simdi_32shr( i, i, 16 ); \
+}
+
+#define stbir__simdi_table_lookup2( v0,v1, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+} 
+
+#define stbir__simdi_table_lookup3( v0,v1,v2, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1,temp2; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp2.m128i_i128 = v2; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+  v2 = temp2.m128i_i128; \
+}
+
+#define stbir__simdi_table_lookup4( v0,v1,v2,v3, table ) \
+{ \
+  stbir__simdi_u32 temp0,temp1,temp2,temp3; \
+  temp0.m128i_i128 = v0; \
+  temp1.m128i_i128 = v1; \
+  temp2.m128i_i128 = v2; \
+  temp3.m128i_i128 = v3; \
+  temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \
+  temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
+  temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \
+  temp3.m128i_u32[0] = table[temp3.m128i_i32[0]]; temp3.m128i_u32[1] = table[temp3.m128i_i32[1]]; temp3.m128i_u32[2] = table[temp3.m128i_i32[2]]; temp3.m128i_u32[3] = table[temp3.m128i_i32[3]]; \
+  v0 = temp0.m128i_i128; \
+  v1 = temp1.m128i_i128; \
+  v2 = temp2.m128i_i128; \
+  v3 = temp3.m128i_i128; \
+} 
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3; 
+      STBIR_SIMD_NO_UNROLL(encode);
+
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      stbir__min_max_shift20( i0, f0 );
+      stbir__min_max_shift20( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__min_max_shift20( i3, f3 );
+      
+      stbir__simdi_table_lookup4( i0, i1, i2, i3, to_srgb );
+     
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i1, f1 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+      stbir__linear_to_srgb_finish( i3, f3 );
+
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      encode += 16;
+      output += 16;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while ( output <= end_output )
+  {
+    STBIR_SIMD_NO_UNROLL(encode);
+
+    output[0-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
+    output[1-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
+    output[2-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
+    output[3-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order3] );
+
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output ) 
+  {
+    STBIR_NO_UNROLL(encode);
+    output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
+    #if stbir__coder_min_num >= 2
+    output[1] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] );
+    #endif
+    #if stbir__coder_min_num >= 3
+    output[2] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] );
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
+
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float const * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const *)inputp;
+  do {
+    decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
+    decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ];
+    decode[2] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order2] ];
+    decode[3] = ( (float) input[stbir__decode_order3] ) * stbir__max_uint8_as_float_inverted;
+    input += 4;
+    decode += 4;
+  } while( decode < decode_end );
+}
+
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3;
+
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      stbir__min_max_shift20( i0, f0 );
+      stbir__min_max_shift20( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__scale_and_convert( i3, f3 ); 
+      
+      stbir__simdi_table_lookup3( i0, i1, i2, to_srgb );
+     
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i1, f1 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      output += 16;
+      encode += 16;
+
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  do {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);                                        
+
+    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
+    output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] );
+    output[stbir__decode_order2] = stbir__linear_to_srgb_uchar( encode[2] );
+
+    f = encode[3] * stbir__max_uint8_as_float + 0.5f;
+    STBIR_CLAMP(f, 0, 255);
+    output[stbir__decode_order3] = (unsigned char) f;
+
+    output += 4;
+    encode += 4;
+  } while( output < end_output );
+}
+
+#endif
+
+#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) )
+
+static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float const * decode_end = (float*) decode + width_times_channels;
+  unsigned char const * input = (unsigned char const *)inputp;
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ];
+    decode[1-4] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
+    decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0+2] ];
+    decode[3-4] = ( (float) input[stbir__decode_order1+2] ) * stbir__max_uint8_as_float_inverted;
+    input += 4;
+    decode += 4;
+  }
+  decode -= 4;
+  if( decode < decode_end ) 
+  {
+    decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ];
+    decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
+  }
+}
+
+static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp;
+  unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
+
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_encode_m16 = encode + width_times_channels - 16;
+    end_output -= 16;
+    for(;;)
+    {
+      stbir__simdf f0, f1, f2, f3;
+      stbir__simdi i0, i1, i2, i3; 
+
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
+
+      stbir__min_max_shift20( i0, f0 );
+      stbir__scale_and_convert( i1, f1 );
+      stbir__min_max_shift20( i2, f2 );
+      stbir__scale_and_convert( i3, f3 );
+      
+      stbir__simdi_table_lookup2( i0, i2, to_srgb );
+     
+      stbir__linear_to_srgb_finish( i0, f0 );
+      stbir__linear_to_srgb_finish( i2, f2 );
+
+      stbir__interleave_pack_and_store_16_u8( output,  STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) );
+
+      output += 16;
+      encode += 16;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + 16 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m16;
+    }
+    return;
+  }
+  #endif
+
+  do {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+
+    output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
+
+    f = encode[1] * stbir__max_uint8_as_float + 0.5f;
+    STBIR_CLAMP(f, 0, 255);
+    output[stbir__decode_order1] = (unsigned char) f;
+
+    output += 2;
+    encode += 2;
+  } while( output < end_output );
+}
+
+#endif
+
+static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned short const * input = (unsigned short const *)inputp;
+
+  #ifdef STBIR_SIMD
+  unsigned short const * end_input_m8 = input + width_times_channels - 8;
+  if ( width_times_channels >= 8 )
+  {
+    decode_end -= 8;
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o;
+      stbir__simdf8 of;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u16_to_u32( o, i );
+      stbir__simdi8_convert_i32_to_float( of, o );
+      stbir__simdf8_mult( of, of, STBIR_max_uint16_as_float_inverted8);
+      stbir__decode_simdf8_flip( of );
+      stbir__simdf8_store( decode + 0, of );
+      #else
+      stbir__simdi i, o0, o1;
+      stbir__simdf of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u16_to_u32( o0,o1,i );
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted) );
+      stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted));
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      #endif
+      decode += 8;  
+      input += 8;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 8 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
+    decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
+    decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
+    decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint16_as_float_inverted;
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted;
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted;
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted;
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+
+static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
+  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  {
+    if ( width_times_channels >= stbir__simdfX_float_count*2 )
+    {
+      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+      end_output -= stbir__simdfX_float_count*2;
+      for(;;)
+      {
+        stbir__simdfX e0, e1;
+        stbir__simdiX i;
+        STBIR_SIMD_NO_UNROLL(encode);
+        stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode );
+        stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode+stbir__simdfX_float_count );
+        stbir__encode_simdfX_unflip( e0 );
+        stbir__encode_simdfX_unflip( e1 );
+        stbir__simdfX_pack_to_words( i, e0, e1 );
+        stbir__simdiX_store( output, i );
+        encode += stbir__simdfX_float_count*2;
+        output += stbir__simdfX_float_count*2;
+        if ( output <= end_output ) 
+          continue;
+        if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+          break;
+        output = end_output;     // backup and do last couple
+        encode = end_encode_m8;
+      }
+      return;
+    }
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e;
+    stbir__simdi i;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e, encode );
+    stbir__simdf_madd( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), e );
+    stbir__encode_simdf4_unflip( e );
+    stbir__simdf_pack_to_8words( i, e, e );  // only use first 4
+    stbir__simdi_store2( output-4, i );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    stbir__simdf e;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_short( e );
+    #if stbir__coder_min_num >= 2
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_short( e );
+    #endif
+    #if stbir__coder_min_num >= 3
+    stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_short( e );
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
+    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
+    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
+    f = encode[stbir__encode_order3] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  unsigned short const * input = (unsigned short const *)inputp;
+
+  #ifdef STBIR_SIMD
+  unsigned short const * end_input_m8 = input + width_times_channels - 8;
+  if ( width_times_channels >= 8 )
+  {
+    decode_end -= 8;
+    for(;;)
+    {
+      #ifdef STBIR_SIMD8
+      stbir__simdi i; stbir__simdi8 o;
+      stbir__simdf8 of;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi8_expand_u16_to_u32( o, i );
+      stbir__simdi8_convert_i32_to_float( of, o );
+      stbir__decode_simdf8_flip( of );
+      stbir__simdf8_store( decode + 0, of );
+      #else
+      stbir__simdi i, o0, o1;
+      stbir__simdf of0, of1;
+      STBIR_NO_UNROLL(decode);
+      stbir__simdi_load( i, input );
+      stbir__simdi_expand_u16_to_u32( o0, o1, i );
+      stbir__simdi_convert_i32_to_float( of0, o0 );
+      stbir__simdi_convert_i32_to_float( of1, o1 );
+      stbir__decode_simdf4_flip( of0 );
+      stbir__decode_simdf4_flip( of1 );
+      stbir__simdf_store( decode + 0,  of0 );
+      stbir__simdf_store( decode + 4,  of1 );
+      #endif
+      decode += 8;
+      input += 8;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 8 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = ((float)(input[stbir__decode_order0]));
+    decode[1-4] = ((float)(input[stbir__decode_order1]));
+    decode[2-4] = ((float)(input[stbir__decode_order2]));
+    decode[3-4] = ((float)(input[stbir__decode_order3]));
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = ((float)(input[stbir__decode_order0]));
+    #if stbir__coder_min_num >= 2
+    decode[1] = ((float)(input[stbir__decode_order1]));
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = ((float)(input[stbir__decode_order2]));
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode )
+{
+  unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp;
+  unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  {
+    if ( width_times_channels >= stbir__simdfX_float_count*2 )
+    {
+      float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2;
+      end_output -= stbir__simdfX_float_count*2;
+      for(;;)
+      {
+        stbir__simdfX e0, e1;
+        stbir__simdiX i;
+        STBIR_SIMD_NO_UNROLL(encode);
+        stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode );
+        stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count );
+        stbir__encode_simdfX_unflip( e0 );
+        stbir__encode_simdfX_unflip( e1 );
+        stbir__simdfX_pack_to_words( i, e0, e1 );
+        stbir__simdiX_store( output, i );
+        encode += stbir__simdfX_float_count*2;
+        output += stbir__simdfX_float_count*2;
+        if ( output <= end_output ) 
+          continue;
+        if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
+          break;
+        output = end_output; // backup and do last couple
+        encode = end_encode_m8;
+      }
+      return;
+    }
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e;
+    stbir__simdi i;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e, encode );
+    stbir__simdf_add( e, STBIR__CONSTF(STBIR_simd_point5), e );
+    stbir__encode_simdf4_unflip( e );
+    stbir__simdf_pack_to_8words( i, e, e );  // only use first 4
+    stbir__simdi_store2( output-4, i );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #else
+
+  // try to do blocks of 4 when you can
+  #if  stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float f;
+    STBIR_SIMD_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f;
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f;
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f;
+    f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float f;
+    STBIR_NO_UNROLL(encode);
+    f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f;
+    #if stbir__coder_min_num >= 2
+    f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f;
+    #endif
+    #if stbir__coder_min_num >= 3
+    f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  stbir__FP16 const * input = (stbir__FP16 const *)inputp;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= 8 )
+  {
+    stbir__FP16 const * end_input_m8 = input + width_times_channels - 8;
+    decode_end -= 8;
+    for(;;)
+    {
+      STBIR_NO_UNROLL(decode);
+
+      stbir__half_to_float_SIMD( decode, input );
+      #ifdef stbir__decode_swizzle
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of;
+        stbir__simdf8_load( of, decode );
+        stbir__decode_simdf8_flip( of );
+        stbir__simdf8_store( decode, of );
+      }
+      #else
+      {
+        stbir__simdf of0,of1;
+        stbir__simdf_load( of0, decode );
+        stbir__simdf_load( of1, decode+4 );
+        stbir__decode_simdf4_flip( of0 );
+        stbir__decode_simdf4_flip( of1 );
+        stbir__simdf_store( decode, of0 );
+        stbir__simdf_store( decode+4, of1 );
+      }
+      #endif
+      #endif
+      decode += 8;
+      input += 8;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 8 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = stbir__half_to_float(input[stbir__decode_order0]);
+    decode[1-4] = stbir__half_to_float(input[stbir__decode_order1]);
+    decode[2-4] = stbir__half_to_float(input[stbir__decode_order2]);
+    decode[3-4] = stbir__half_to_float(input[stbir__decode_order3]);
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = stbir__half_to_float(input[stbir__decode_order0]);
+    #if stbir__coder_min_num >= 2
+    decode[1] = stbir__half_to_float(input[stbir__decode_order1]);
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = stbir__half_to_float(input[stbir__decode_order2]);
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode )
+{
+  stbir__FP16 STBIR_SIMD_STREAMOUT_PTR( * ) output = (stbir__FP16*) outputp;
+  stbir__FP16 * end_output = ( (stbir__FP16*) output ) + width_times_channels;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= 8 )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - 8;
+    end_output -= 8;
+    for(;;)
+    {
+      STBIR_SIMD_NO_UNROLL(encode);
+      #ifdef stbir__decode_swizzle
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of;
+        stbir__simdf8_load( of, encode );
+        stbir__encode_simdf8_unflip( of );
+        stbir__float_to_half_SIMD( output, (float*)&of );
+      }
+      #else
+      {
+        stbir__simdf of[2];
+        stbir__simdf_load( of[0], encode );
+        stbir__simdf_load( of[1], encode+4 );
+        stbir__encode_simdf4_unflip( of[0] );
+        stbir__encode_simdf4_unflip( of[1] );
+        stbir__float_to_half_SIMD( output, (float*)of );
+      }
+      #endif
+      #else
+      stbir__float_to_half_SIMD( output, encode );
+      #endif
+      encode += 8;
+      output += 8;
+      if ( output <= end_output ) 
+        continue;
+      if ( output == ( end_output + 8 ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    STBIR_SIMD_NO_UNROLL(output);
+    output[0-4] = stbir__float_to_half(encode[stbir__encode_order0]);
+    output[1-4] = stbir__float_to_half(encode[stbir__encode_order1]);
+    output[2-4] = stbir__float_to_half(encode[stbir__encode_order2]);
+    output[3-4] = stbir__float_to_half(encode[stbir__encode_order3]);
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    STBIR_NO_UNROLL(output);
+    output[0] = stbir__float_to_half(encode[stbir__encode_order0]);
+    #if stbir__coder_min_num >= 2
+    output[1] = stbir__float_to_half(encode[stbir__encode_order1]);
+    #endif
+    #if stbir__coder_min_num >= 3
+    output[2] = stbir__float_to_half(encode[stbir__encode_order2]);
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+}
+
+static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp )
+{
+  #ifdef stbir__decode_swizzle
+  float STBIR_STREAMOUT_PTR( * ) decode = decodep;
+  float * decode_end = (float*) decode + width_times_channels;
+  float const * input = (float const *)inputp;
+
+  #ifdef STBIR_SIMD
+  if ( width_times_channels >= 16 )
+  {
+    float const * end_input_m16 = input + width_times_channels - 16;
+    decode_end -= 16;
+    for(;;)
+    {
+      STBIR_NO_UNROLL(decode);
+      #ifdef stbir__decode_swizzle
+      #ifdef STBIR_SIMD8
+      {
+        stbir__simdf8 of0,of1;
+        stbir__simdf8_load( of0, input );
+        stbir__simdf8_load( of1, input+8 );
+        stbir__decode_simdf8_flip( of0 );
+        stbir__decode_simdf8_flip( of1 );
+        stbir__simdf8_store( decode, of0 );
+        stbir__simdf8_store( decode+8, of1 );
+      }
+      #else
+      {
+        stbir__simdf of0,of1,of2,of3;
+        stbir__simdf_load( of0, input );
+        stbir__simdf_load( of1, input+4 );
+        stbir__simdf_load( of2, input+8 );
+        stbir__simdf_load( of3, input+12 );
+        stbir__decode_simdf4_flip( of0 );
+        stbir__decode_simdf4_flip( of1 );
+        stbir__decode_simdf4_flip( of2 );
+        stbir__decode_simdf4_flip( of3 );
+        stbir__simdf_store( decode, of0 );
+        stbir__simdf_store( decode+4, of1 );
+        stbir__simdf_store( decode+8, of2 );
+        stbir__simdf_store( decode+12, of3 );
+      }
+      #endif
+      #endif
+      decode += 16;
+      input += 16;
+      if ( decode <= decode_end ) 
+        continue;
+      if ( decode == ( decode_end + 16 ) )
+        break;
+      decode = decode_end; // backup and do last couple
+      input = end_input_m16;
+    }
+    return;
+  }
+  #endif
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  decode += 4;
+  while( decode <= decode_end )
+  {
+    STBIR_SIMD_NO_UNROLL(decode);
+    decode[0-4] = input[stbir__decode_order0];
+    decode[1-4] = input[stbir__decode_order1];
+    decode[2-4] = input[stbir__decode_order2];
+    decode[3-4] = input[stbir__decode_order3];
+    decode += 4;
+    input += 4;
+  }
+  decode -= 4;
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( decode < decode_end )
+  {
+    STBIR_NO_UNROLL(decode);
+    decode[0] = input[stbir__decode_order0];
+    #if stbir__coder_min_num >= 2
+    decode[1] = input[stbir__decode_order1];
+    #endif
+    #if stbir__coder_min_num >= 3
+    decode[2] = input[stbir__decode_order2];
+    #endif
+    decode += stbir__coder_min_num;
+    input += stbir__coder_min_num;
+  }
+  #endif
+
+  #else
+  
+  if ( (void*)decodep != inputp )
+    STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
+  
+  #endif
+}
+
+static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int width_times_channels, float const * encode )
+{
+  #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LO_CLAMP) && !defined(stbir__decode_swizzle)
+
+  if ( (void*)outputp != (void*) encode )
+    STBIR_MEMCPY( outputp, encode, width_times_channels * sizeof( float ) );
+
+  #else
+
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = (float*) outputp;
+  float * end_output = ( (float*) output ) + width_times_channels;
+
+  #ifdef STBIR_FLOAT_HIGH_CLAMP
+  #define stbir_scalar_hi_clamp( v ) if ( v > STBIR_FLOAT_HIGH_CLAMP ) v = STBIR_FLOAT_HIGH_CLAMP;
+  #else
+  #define stbir_scalar_hi_clamp( v )
+  #endif
+  #ifdef STBIR_FLOAT_LOW_CLAMP
+  #define stbir_scalar_lo_clamp( v ) if ( v < STBIR_FLOAT_LOW_CLAMP ) v = STBIR_FLOAT_LOW_CLAMP;
+  #else
+  #define stbir_scalar_lo_clamp( v )
+  #endif
+
+  #ifdef STBIR_SIMD
+
+  #ifdef STBIR_FLOAT_HIGH_CLAMP
+  const stbir__simdfX high_clamp = stbir__simdf_frepX(STBIR_FLOAT_HIGH_CLAMP);
+  #endif
+  #ifdef STBIR_FLOAT_LOW_CLAMP
+  const stbir__simdfX low_clamp = stbir__simdf_frepX(STBIR_FLOAT_LOW_CLAMP);
+  #endif
+
+  if ( width_times_channels >= ( stbir__simdfX_float_count * 2 ) )
+  {
+    float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 );
+    end_output -= ( stbir__simdfX_float_count * 2 );
+    for(;;)
+    {
+      stbir__simdfX e0, e1;
+      STBIR_SIMD_NO_UNROLL(encode);
+      stbir__simdfX_load( e0, encode );
+      stbir__simdfX_load( e1, encode+stbir__simdfX_float_count );
+#ifdef STBIR_FLOAT_HIGH_CLAMP
+      stbir__simdfX_min( e0, e0, high_clamp );
+      stbir__simdfX_min( e1, e1, high_clamp );
+#endif      
+#ifdef STBIR_FLOAT_LOW_CLAMP
+      stbir__simdfX_max( e0, e0, low_clamp );
+      stbir__simdfX_max( e1, e1, low_clamp );
+#endif      
+      stbir__encode_simdfX_unflip( e0 );
+      stbir__encode_simdfX_unflip( e1 );
+      stbir__simdfX_store( output, e0 );
+      stbir__simdfX_store( output+stbir__simdfX_float_count, e1 );
+      encode += stbir__simdfX_float_count * 2;
+      output += stbir__simdfX_float_count * 2;
+      if ( output < end_output ) 
+        continue;
+      if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) )
+        break;
+      output = end_output; // backup and do last couple
+      encode = end_encode_m8;
+    }
+    return;
+  }
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    stbir__simdf e0;
+    STBIR_NO_UNROLL(encode);
+    stbir__simdf_load( e0, encode );
+#ifdef STBIR_FLOAT_HIGH_CLAMP
+    stbir__simdf_min( e0, e0, high_clamp );
+#endif      
+#ifdef STBIR_FLOAT_LOW_CLAMP
+    stbir__simdf_max( e0, e0, low_clamp );
+#endif      
+    stbir__encode_simdf4_unflip( e0 );
+    stbir__simdf_store( output-4, e0 );
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+  #endif
+
+  #else
+
+  // try to do blocks of 4 when you can
+  #if stbir__coder_min_num != 3 // doesn't divide cleanly by four
+  output += 4;
+  while( output <= end_output )
+  {
+    float e;
+    STBIR_SIMD_NO_UNROLL(encode);
+    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0-4] = e;
+    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1-4] = e;
+    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2-4] = e;
+    e = encode[ stbir__encode_order3 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[3-4] = e;
+    output += 4;
+    encode += 4;
+  }
+  output -= 4;
+
+  #endif
+
+  #endif
+
+  // do the remnants
+  #if stbir__coder_min_num < 4
+  while( output < end_output )
+  {
+    float e;
+    STBIR_NO_UNROLL(encode);
+    e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0] = e;
+    #if stbir__coder_min_num >= 2
+    e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1] = e;
+    #endif
+    #if stbir__coder_min_num >= 3
+    e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2] = e;
+    #endif
+    output += stbir__coder_min_num;
+    encode += stbir__coder_min_num;
+  }
+  #endif
+  
+  #endif
+}
+
+#undef stbir__decode_suffix 
+#undef stbir__decode_simdf8_flip
+#undef stbir__decode_simdf4_flip
+#undef stbir__decode_order0 
+#undef stbir__decode_order1
+#undef stbir__decode_order2
+#undef stbir__decode_order3
+#undef stbir__encode_order0 
+#undef stbir__encode_order1
+#undef stbir__encode_order2
+#undef stbir__encode_order3
+#undef stbir__encode_simdf8_unflip
+#undef stbir__encode_simdf4_unflip
+#undef stbir__encode_simdfX_unflip
+#undef STBIR__CODER_NAME
+#undef stbir__coder_min_num
+#undef stbir__decode_swizzle
+#undef stbir_scalar_hi_clamp
+#undef stbir_scalar_lo_clamp
+#undef STB_IMAGE_RESIZE_DO_CODERS
+
+#elif defined( STB_IMAGE_RESIZE_DO_VERTICALS)
+
+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#define STBIR_chans( start, end ) STBIR_strs_join14(start,STBIR__vertical_channels,end,_cont)
+#else
+#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__vertical_channels,end)
+#endif
+
+#if STBIR__vertical_channels >= 1
+#define stbIF0( code ) code
+#else
+#define stbIF0( code )
+#endif
+#if STBIR__vertical_channels >= 2
+#define stbIF1( code ) code
+#else
+#define stbIF1( code )
+#endif
+#if STBIR__vertical_channels >= 3
+#define stbIF2( code ) code
+#else
+#define stbIF2( code )
+#endif
+#if STBIR__vertical_channels >= 4
+#define stbIF3( code ) code
+#else
+#define stbIF3( code )
+#endif
+#if STBIR__vertical_channels >= 5
+#define stbIF4( code ) code
+#else
+#define stbIF4( code )
+#endif
+#if STBIR__vertical_channels >= 6
+#define stbIF5( code ) code
+#else
+#define stbIF5( code )
+#endif
+#if STBIR__vertical_channels >= 7
+#define stbIF6( code ) code
+#else
+#define stbIF6( code )
+#endif
+#if STBIR__vertical_channels >= 8
+#define stbIF7( code ) code
+#else
+#define stbIF7( code )
+#endif
+
+static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** outputs, float const * vertical_coefficients, float const * input, float const * input_end )
+{
+  stbIF0( float STBIR_SIMD_STREAMOUT_PTR( * ) output0 = outputs[0]; float c0s = vertical_coefficients[0]; )
+  stbIF1( float STBIR_SIMD_STREAMOUT_PTR( * ) output1 = outputs[1]; float c1s = vertical_coefficients[1]; )
+  stbIF2( float STBIR_SIMD_STREAMOUT_PTR( * ) output2 = outputs[2]; float c2s = vertical_coefficients[2]; )
+  stbIF3( float STBIR_SIMD_STREAMOUT_PTR( * ) output3 = outputs[3]; float c3s = vertical_coefficients[3]; )
+  stbIF4( float STBIR_SIMD_STREAMOUT_PTR( * ) output4 = outputs[4]; float c4s = vertical_coefficients[4]; )
+  stbIF5( float STBIR_SIMD_STREAMOUT_PTR( * ) output5 = outputs[5]; float c5s = vertical_coefficients[5]; )
+  stbIF6( float STBIR_SIMD_STREAMOUT_PTR( * ) output6 = outputs[6]; float c6s = vertical_coefficients[6]; )
+  stbIF7( float STBIR_SIMD_STREAMOUT_PTR( * ) output7 = outputs[7]; float c7s = vertical_coefficients[7]; )
+
+  #ifdef STBIR_SIMD
+  {
+    stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
+    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
+    stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
+    stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
+    stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
+    stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
+    stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
+    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
+    while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) ) 
+    {
+      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
+      STBIR_SIMD_NO_UNROLL(output0);
+
+      stbir__simdfX_load( r0, input );               stbir__simdfX_load( r1, input+stbir__simdfX_float_count );     stbir__simdfX_load( r2, input+(2*stbir__simdfX_float_count) );      stbir__simdfX_load( r3, input+(3*stbir__simdfX_float_count) );
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdfX_load( o0, output0 );     stbir__simdfX_load( o1, output0+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );  stbir__simdfX_madd( o2, o2, r2, c0 );   stbir__simdfX_madd( o3, o3, r3, c0 );           
+              stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF1( stbir__simdfX_load( o0, output1 );     stbir__simdfX_load( o1, output1+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );  stbir__simdfX_madd( o2, o2, r2, c1 );   stbir__simdfX_madd( o3, o3, r3, c1 );             
+              stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF2( stbir__simdfX_load( o0, output2 );     stbir__simdfX_load( o1, output2+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );  stbir__simdfX_madd( o2, o2, r2, c2 );   stbir__simdfX_madd( o3, o3, r3, c2 );             
+              stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF3( stbir__simdfX_load( o0, output3 );     stbir__simdfX_load( o1, output3+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );  stbir__simdfX_madd( o2, o2, r2, c3 );   stbir__simdfX_madd( o3, o3, r3, c3 );             
+              stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF4( stbir__simdfX_load( o0, output4 );     stbir__simdfX_load( o1, output4+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );  stbir__simdfX_madd( o2, o2, r2, c4 );   stbir__simdfX_madd( o3, o3, r3, c4 );             
+              stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF5( stbir__simdfX_load( o0, output5 );     stbir__simdfX_load( o1, output5+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count));    stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );  stbir__simdfX_madd( o2, o2, r2, c5 );   stbir__simdfX_madd( o3, o3, r3, c5 );             
+              stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF6( stbir__simdfX_load( o0, output6 );     stbir__simdfX_load( o1, output6+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );  stbir__simdfX_madd( o2, o2, r2, c6 );   stbir__simdfX_madd( o3, o3, r3, c6 );             
+              stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF7( stbir__simdfX_load( o0, output7 );     stbir__simdfX_load( o1, output7+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );  stbir__simdfX_madd( o2, o2, r2, c7 );   stbir__simdfX_madd( o3, o3, r3, c7 );             
+              stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
+      #else
+      stbIF0( stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );      stbir__simdfX_mult( o2, r2, c0 );       stbir__simdfX_mult( o3, r3, c0 );  
+              stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF1( stbir__simdfX_mult( o0, r0, c1 );      stbir__simdfX_mult( o1, r1, c1 );      stbir__simdfX_mult( o2, r2, c1 );       stbir__simdfX_mult( o3, r3, c1 );  
+              stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF2( stbir__simdfX_mult( o0, r0, c2 );      stbir__simdfX_mult( o1, r1, c2 );      stbir__simdfX_mult( o2, r2, c2 );       stbir__simdfX_mult( o3, r3, c2 );  
+              stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF3( stbir__simdfX_mult( o0, r0, c3 );      stbir__simdfX_mult( o1, r1, c3 );      stbir__simdfX_mult( o2, r2, c3 );       stbir__simdfX_mult( o3, r3, c3 );  
+              stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF4( stbir__simdfX_mult( o0, r0, c4 );      stbir__simdfX_mult( o1, r1, c4 );      stbir__simdfX_mult( o2, r2, c4 );       stbir__simdfX_mult( o3, r3, c4 );  
+              stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF5( stbir__simdfX_mult( o0, r0, c5 );      stbir__simdfX_mult( o1, r1, c5 );      stbir__simdfX_mult( o2, r2, c5 );       stbir__simdfX_mult( o3, r3, c5 );  
+              stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF6( stbir__simdfX_mult( o0, r0, c6 );      stbir__simdfX_mult( o1, r1, c6 );      stbir__simdfX_mult( o2, r2, c6 );       stbir__simdfX_mult( o3, r3, c6 );  
+              stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
+      stbIF7( stbir__simdfX_mult( o0, r0, c7 );      stbir__simdfX_mult( o1, r1, c7 );      stbir__simdfX_mult( o2, r2, c7 );       stbir__simdfX_mult( o3, r3, c7 );  
+              stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
+      #endif
+
+      input += (4*stbir__simdfX_float_count);
+      stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
+    }
+    while ( ( (char*)input_end - (char*) input ) >= 16 ) 
+    {
+      stbir__simdf o0, r0;
+      STBIR_SIMD_NO_UNROLL(output0);
+
+      stbir__simdf_load( r0, input );
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdf_load( o0, output0 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) );  stbir__simdf_store( output0, o0 ); )
+      stbIF1( stbir__simdf_load( o0, output1 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) );  stbir__simdf_store( output1, o0 ); )
+      stbIF2( stbir__simdf_load( o0, output2 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) );  stbir__simdf_store( output2, o0 ); )
+      stbIF3( stbir__simdf_load( o0, output3 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) );  stbir__simdf_store( output3, o0 ); )
+      stbIF4( stbir__simdf_load( o0, output4 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) );  stbir__simdf_store( output4, o0 ); )
+      stbIF5( stbir__simdf_load( o0, output5 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) );  stbir__simdf_store( output5, o0 ); )
+      stbIF6( stbir__simdf_load( o0, output6 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );  stbir__simdf_store( output6, o0 ); )
+      stbIF7( stbir__simdf_load( o0, output7 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );  stbir__simdf_store( output7, o0 ); )
+      #else
+      stbIF0( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) );   stbir__simdf_store( output0, o0 ); )
+      stbIF1( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) );   stbir__simdf_store( output1, o0 ); )
+      stbIF2( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) );   stbir__simdf_store( output2, o0 ); )
+      stbIF3( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) );   stbir__simdf_store( output3, o0 ); )
+      stbIF4( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) );   stbir__simdf_store( output4, o0 ); )
+      stbIF5( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) );   stbir__simdf_store( output5, o0 ); )
+      stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );   stbir__simdf_store( output6, o0 ); )
+      stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );   stbir__simdf_store( output7, o0 ); )
+      #endif
+      
+      input += 4;
+      stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
+    }
+  }
+  #else
+  while ( ( (char*)input_end - (char*) input ) >= 16 ) 
+  {
+    float r0, r1, r2, r3;
+    STBIR_NO_UNROLL(input);
+
+    r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3];
+
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( output0[0] += ( r0 * c0s ); output0[1] += ( r1 * c0s ); output0[2] += ( r2 * c0s ); output0[3] += ( r3 * c0s ); )
+    stbIF1( output1[0] += ( r0 * c1s ); output1[1] += ( r1 * c1s ); output1[2] += ( r2 * c1s ); output1[3] += ( r3 * c1s ); )
+    stbIF2( output2[0] += ( r0 * c2s ); output2[1] += ( r1 * c2s ); output2[2] += ( r2 * c2s ); output2[3] += ( r3 * c2s ); )
+    stbIF3( output3[0] += ( r0 * c3s ); output3[1] += ( r1 * c3s ); output3[2] += ( r2 * c3s ); output3[3] += ( r3 * c3s ); )
+    stbIF4( output4[0] += ( r0 * c4s ); output4[1] += ( r1 * c4s ); output4[2] += ( r2 * c4s ); output4[3] += ( r3 * c4s ); )
+    stbIF5( output5[0] += ( r0 * c5s ); output5[1] += ( r1 * c5s ); output5[2] += ( r2 * c5s ); output5[3] += ( r3 * c5s ); )
+    stbIF6( output6[0] += ( r0 * c6s ); output6[1] += ( r1 * c6s ); output6[2] += ( r2 * c6s ); output6[3] += ( r3 * c6s ); )
+    stbIF7( output7[0] += ( r0 * c7s ); output7[1] += ( r1 * c7s ); output7[2] += ( r2 * c7s ); output7[3] += ( r3 * c7s ); )
+    #else
+    stbIF0( output0[0]  = ( r0 * c0s ); output0[1]  = ( r1 * c0s ); output0[2]  = ( r2 * c0s ); output0[3]  = ( r3 * c0s ); )
+    stbIF1( output1[0]  = ( r0 * c1s ); output1[1]  = ( r1 * c1s ); output1[2]  = ( r2 * c1s ); output1[3]  = ( r3 * c1s ); )
+    stbIF2( output2[0]  = ( r0 * c2s ); output2[1]  = ( r1 * c2s ); output2[2]  = ( r2 * c2s ); output2[3]  = ( r3 * c2s ); )
+    stbIF3( output3[0]  = ( r0 * c3s ); output3[1]  = ( r1 * c3s ); output3[2]  = ( r2 * c3s ); output3[3]  = ( r3 * c3s ); )
+    stbIF4( output4[0]  = ( r0 * c4s ); output4[1]  = ( r1 * c4s ); output4[2]  = ( r2 * c4s ); output4[3]  = ( r3 * c4s ); )
+    stbIF5( output5[0]  = ( r0 * c5s ); output5[1]  = ( r1 * c5s ); output5[2]  = ( r2 * c5s ); output5[3]  = ( r3 * c5s ); )
+    stbIF6( output6[0]  = ( r0 * c6s ); output6[1]  = ( r1 * c6s ); output6[2]  = ( r2 * c6s ); output6[3]  = ( r3 * c6s ); )
+    stbIF7( output7[0]  = ( r0 * c7s ); output7[1]  = ( r1 * c7s ); output7[2]  = ( r2 * c7s ); output7[3]  = ( r3 * c7s ); )
+    #endif
+
+    input += 4;
+    stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
+  }
+  #endif
+  while ( input < input_end ) 
+  {
+    float r = input[0];
+    STBIR_NO_UNROLL(output0);
+
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( output0[0] += ( r * c0s ); )
+    stbIF1( output1[0] += ( r * c1s ); )
+    stbIF2( output2[0] += ( r * c2s ); )
+    stbIF3( output3[0] += ( r * c3s ); )
+    stbIF4( output4[0] += ( r * c4s ); )
+    stbIF5( output5[0] += ( r * c5s ); )
+    stbIF6( output6[0] += ( r * c6s ); )
+    stbIF7( output7[0] += ( r * c7s ); )
+    #else
+    stbIF0( output0[0]  = ( r * c0s ); )
+    stbIF1( output1[0]  = ( r * c1s ); )
+    stbIF2( output2[0]  = ( r * c2s ); )
+    stbIF3( output3[0]  = ( r * c3s ); )
+    stbIF4( output4[0]  = ( r * c4s ); )
+    stbIF5( output5[0]  = ( r * c5s ); )
+    stbIF6( output6[0]  = ( r * c6s ); )
+    stbIF7( output7[0]  = ( r * c7s ); )
+    #endif
+
+    ++input;
+    stbIF0( ++output0; ) stbIF1( ++output1; ) stbIF2( ++output2; ) stbIF3( ++output3; ) stbIF4( ++output4; ) stbIF5( ++output5; ) stbIF6( ++output6; ) stbIF7( ++output7; )
+  }
+}
+
+static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, float const * vertical_coefficients, float const ** inputs, float const * input0_end )
+{
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = outputp;
+
+  stbIF0( float const * input0 = inputs[0]; float c0s = vertical_coefficients[0]; )
+  stbIF1( float const * input1 = inputs[1]; float c1s = vertical_coefficients[1]; )
+  stbIF2( float const * input2 = inputs[2]; float c2s = vertical_coefficients[2]; )
+  stbIF3( float const * input3 = inputs[3]; float c3s = vertical_coefficients[3]; )
+  stbIF4( float const * input4 = inputs[4]; float c4s = vertical_coefficients[4]; )
+  stbIF5( float const * input5 = inputs[5]; float c5s = vertical_coefficients[5]; )
+  stbIF6( float const * input6 = inputs[6]; float c6s = vertical_coefficients[6]; )
+  stbIF7( float const * input7 = inputs[7]; float c7s = vertical_coefficients[7]; )
+
+#if ( STBIR__vertical_channels == 1 ) && !defined(STB_IMAGE_RESIZE_VERTICAL_CONTINUE)
+  // check single channel one weight
+  if ( ( c0s >= (1.0f-0.000001f) ) && ( c0s <= (1.0f+0.000001f) ) )
+  {
+    STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 );
+    return;
+  }
+#endif  
+
+  #ifdef STBIR_SIMD
+  {
+    stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); )
+    stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); )
+    stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); )
+    stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); )
+    stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); )
+    stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
+    stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
+    stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
+    
+    while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) ) 
+    {
+      stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
+      STBIR_SIMD_NO_UNROLL(output);
+
+      // prefetch four loop iterations ahead (doesn't affect much for small resizes, but helps with big ones)
+      stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); ) 
+      stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); )
+      stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); )
+      stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); )
+      stbIF4( stbir__prefetch( input4 + (16*stbir__simdfX_float_count) ); )
+      stbIF5( stbir__prefetch( input5 + (16*stbir__simdfX_float_count) ); )
+      stbIF6( stbir__prefetch( input6 + (16*stbir__simdfX_float_count) ); )
+      stbIF7( stbir__prefetch( input7 + (16*stbir__simdfX_float_count) ); )
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdfX_load( o0, output );      stbir__simdfX_load( o1, output+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( o3, output+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_load( r0, input0 );      stbir__simdfX_load( r1, input0+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );                         stbir__simdfX_madd( o2, o2, r2, c0 );                             stbir__simdfX_madd( o3, o3, r3, c0 ); )
+      #else
+      stbIF0( stbir__simdfX_load( r0, input0 );      stbir__simdfX_load( r1, input0+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );                             stbir__simdfX_mult( o2, r2, c0 );                                 stbir__simdfX_mult( o3, r3, c0 );  )
+      #endif
+
+      stbIF1( stbir__simdfX_load( r0, input1 );      stbir__simdfX_load( r1, input1+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input1+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input1+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );                         stbir__simdfX_madd( o2, o2, r2, c1 );                             stbir__simdfX_madd( o3, o3, r3, c1 ); )
+      stbIF2( stbir__simdfX_load( r0, input2 );      stbir__simdfX_load( r1, input2+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input2+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input2+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );                         stbir__simdfX_madd( o2, o2, r2, c2 );                             stbir__simdfX_madd( o3, o3, r3, c2 ); )
+      stbIF3( stbir__simdfX_load( r0, input3 );      stbir__simdfX_load( r1, input3+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input3+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input3+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );                         stbir__simdfX_madd( o2, o2, r2, c3 );                             stbir__simdfX_madd( o3, o3, r3, c3 ); )
+      stbIF4( stbir__simdfX_load( r0, input4 );      stbir__simdfX_load( r1, input4+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input4+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input4+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );                         stbir__simdfX_madd( o2, o2, r2, c4 );                             stbir__simdfX_madd( o3, o3, r3, c4 ); )
+      stbIF5( stbir__simdfX_load( r0, input5 );      stbir__simdfX_load( r1, input5+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input5+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input5+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );                         stbir__simdfX_madd( o2, o2, r2, c5 );                             stbir__simdfX_madd( o3, o3, r3, c5 ); )
+      stbIF6( stbir__simdfX_load( r0, input6 );      stbir__simdfX_load( r1, input6+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input6+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input6+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );                         stbir__simdfX_madd( o2, o2, r2, c6 );                             stbir__simdfX_madd( o3, o3, r3, c6 ); )
+      stbIF7( stbir__simdfX_load( r0, input7 );      stbir__simdfX_load( r1, input7+stbir__simdfX_float_count );   stbir__simdfX_load( r2, input7+(2*stbir__simdfX_float_count) );   stbir__simdfX_load( r3, input7+(3*stbir__simdfX_float_count) );
+              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );                         stbir__simdfX_madd( o2, o2, r2, c7 );                             stbir__simdfX_madd( o3, o3, r3, c7 ); )
+
+      stbir__simdfX_store( output, o0 );             stbir__simdfX_store( output+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output+(2*stbir__simdfX_float_count), o2 );  stbir__simdfX_store( output+(3*stbir__simdfX_float_count), o3 );
+      output += (4*stbir__simdfX_float_count);
+      stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
+    }
+
+    while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) 
+    {
+      stbir__simdf o0, r0;
+      STBIR_SIMD_NO_UNROLL(output);
+
+      #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+      stbIF0( stbir__simdf_load( o0, output );   stbir__simdf_load( r0, input0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
+      #else
+      stbIF0( stbir__simdf_load( r0, input0 );  stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); )
+      #endif
+      stbIF1( stbir__simdf_load( r0, input1 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); )
+      stbIF2( stbir__simdf_load( r0, input2 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); )
+      stbIF3( stbir__simdf_load( r0, input3 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); )
+      stbIF4( stbir__simdf_load( r0, input4 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); )
+      stbIF5( stbir__simdf_load( r0, input5 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); )
+      stbIF6( stbir__simdf_load( r0, input6 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); )
+      stbIF7( stbir__simdf_load( r0, input7 );  stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); )
+
+      stbir__simdf_store( output, o0 );
+      output += 4;
+      stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
+    }
+  }
+  #else
+  while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) 
+  {
+    float o0, o1, o2, o3;
+    STBIR_NO_UNROLL(output);
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( o0 = output[0] + input0[0] * c0s; o1 = output[1] + input0[1] * c0s; o2 = output[2] + input0[2] * c0s; o3 = output[3] + input0[3] * c0s; )
+    #else
+    stbIF0( o0  = input0[0] * c0s; o1  = input0[1] * c0s; o2  = input0[2] * c0s; o3  = input0[3] * c0s; )
+    #endif
+    stbIF1( o0 += input1[0] * c1s; o1 += input1[1] * c1s; o2 += input1[2] * c1s; o3 += input1[3] * c1s; )
+    stbIF2( o0 += input2[0] * c2s; o1 += input2[1] * c2s; o2 += input2[2] * c2s; o3 += input2[3] * c2s; )
+    stbIF3( o0 += input3[0] * c3s; o1 += input3[1] * c3s; o2 += input3[2] * c3s; o3 += input3[3] * c3s; )
+    stbIF4( o0 += input4[0] * c4s; o1 += input4[1] * c4s; o2 += input4[2] * c4s; o3 += input4[3] * c4s; )
+    stbIF5( o0 += input5[0] * c5s; o1 += input5[1] * c5s; o2 += input5[2] * c5s; o3 += input5[3] * c5s; )
+    stbIF6( o0 += input6[0] * c6s; o1 += input6[1] * c6s; o2 += input6[2] * c6s; o3 += input6[3] * c6s; )
+    stbIF7( o0 += input7[0] * c7s; o1 += input7[1] * c7s; o2 += input7[2] * c7s; o3 += input7[3] * c7s; )
+    output[0] = o0; output[1] = o1; output[2] = o2; output[3] = o3;
+    output += 4;
+    stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
+  }
+  #endif
+  while ( input0 < input0_end ) 
+  {
+    float o0;
+    STBIR_NO_UNROLL(output);
+    #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+    stbIF0( o0 = output[0] + input0[0] * c0s; )
+    #else
+    stbIF0( o0  = input0[0] * c0s; )
+    #endif
+    stbIF1( o0 += input1[0] * c1s; )
+    stbIF2( o0 += input2[0] * c2s; )
+    stbIF3( o0 += input3[0] * c3s; )
+    stbIF4( o0 += input4[0] * c4s; )
+    stbIF5( o0 += input5[0] * c5s; )
+    stbIF6( o0 += input6[0] * c6s; )
+    stbIF7( o0 += input7[0] * c7s; )
+    output[0] = o0; 
+    ++output;
+    stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; )
+  }
+}
+
+#undef stbIF0
+#undef stbIF1
+#undef stbIF2
+#undef stbIF3
+#undef stbIF4
+#undef stbIF5
+#undef stbIF6
+#undef stbIF7
+#undef STB_IMAGE_RESIZE_DO_VERTICALS
+#undef STBIR__vertical_channels
+#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
+#undef STBIR_strs_join24
+#undef STBIR_strs_join14
+#undef STBIR_chans
+#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#undef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
+#endif
+
+#else // !STB_IMAGE_RESIZE_DO_VERTICALS
+
+#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__horizontal_channels,end)
+
+#ifndef stbir__2_coeff_only
+#define stbir__2_coeff_only()             \
+    stbir__1_coeff_only();                \
+    stbir__1_coeff_remnant(1);            
+#endif
+
+#ifndef stbir__2_coeff_remnant
+#define stbir__2_coeff_remnant( ofs )     \
+    stbir__1_coeff_remnant(ofs);          \
+    stbir__1_coeff_remnant((ofs)+1);      
+#endif
+    
+#ifndef stbir__3_coeff_only
+#define stbir__3_coeff_only()             \
+    stbir__2_coeff_only();                \
+    stbir__1_coeff_remnant(2);            
+#endif
+    
+#ifndef stbir__3_coeff_remnant
+#define stbir__3_coeff_remnant( ofs )     \
+    stbir__2_coeff_remnant(ofs);          \
+    stbir__1_coeff_remnant((ofs)+2);      
+#endif
+
+#ifndef stbir__3_coeff_setup
+#define stbir__3_coeff_setup()
+#endif
+
+#ifndef stbir__4_coeff_start
+#define stbir__4_coeff_start()            \
+    stbir__2_coeff_only();                \
+    stbir__2_coeff_remnant(2);            
+#endif
+    
+#ifndef stbir__4_coeff_continue_from_4
+#define stbir__4_coeff_continue_from_4( ofs )     \
+    stbir__2_coeff_remnant(ofs);                  \
+    stbir__2_coeff_remnant((ofs)+2);      
+#endif
+
+#ifndef stbir__store_output_tiny
+#define stbir__store_output_tiny stbir__store_output
+#endif
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__1_coeff_only();
+    stbir__store_output_tiny();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__2_coeff_only();
+    stbir__store_output_tiny();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__3_coeff_only();
+    stbir__store_output_tiny();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__1_coeff_remnant(4);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__2_coeff_remnant(4);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup();
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+  
+    stbir__4_coeff_start();
+    stbir__3_coeff_remnant(4);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__1_coeff_remnant(8);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__2_coeff_remnant(8);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup();
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__3_coeff_remnant(8);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * hc = horizontal_coefficients;
+    stbir__4_coeff_start();
+    stbir__4_coeff_continue_from_4(4);
+    stbir__4_coeff_continue_from_4(8);
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2; 
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    do {
+      hc += 4;
+      decode += STBIR__horizontal_channels * 4;
+      stbir__4_coeff_continue_from_4( 0 );
+      --n;
+    } while ( n > 0 );
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2; 
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    do {
+      hc += 4;
+      decode += STBIR__horizontal_channels * 4;
+      stbir__4_coeff_continue_from_4( 0 );
+      --n;
+    } while ( n > 0 );
+    stbir__1_coeff_remnant( 4 ); 
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2; 
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    do {
+      hc += 4;
+      decode += STBIR__horizontal_channels * 4;
+      stbir__4_coeff_continue_from_4( 0 );
+      --n;
+    } while ( n > 0 );
+    stbir__2_coeff_remnant( 4 ); 
+
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width )
+{
+  float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
+  float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
+  stbir__3_coeff_setup();
+  do {
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2; 
+    float const * hc = horizontal_coefficients;
+
+    stbir__4_coeff_start();
+    do {
+      hc += 4;
+      decode += STBIR__horizontal_channels * 4;
+      stbir__4_coeff_continue_from_4( 0 );
+      --n;
+    } while ( n > 0 );
+    stbir__3_coeff_remnant( 4 ); 
+
+    stbir__store_output();
+  } while ( output < output_end );
+}
+
+static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]=
+{
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3),  
+};
+
+static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]=
+{
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs),  
+};
+
+#undef STBIR__horizontal_channels
+#undef STB_IMAGE_RESIZE_DO_HORIZONTALS
+#undef stbir__1_coeff_only
+#undef stbir__1_coeff_remnant
+#undef stbir__2_coeff_only
+#undef stbir__2_coeff_remnant
+#undef stbir__3_coeff_only
+#undef stbir__3_coeff_remnant
+#undef stbir__3_coeff_setup
+#undef stbir__4_coeff_start
+#undef stbir__4_coeff_continue_from_4
+#undef stbir__store_output
+#undef stbir__store_output_tiny
+#undef STBIR_chans
+
+#endif  // HORIZONALS
+
+#undef STBIR_strs_join2
+#undef STBIR_strs_join1
+
+#endif // STB_IMAGE_RESIZE_DO_HORIZONTALS/VERTICALS/CODERS
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of 
+this software and associated documentation files (the "Software"), to deal in 
+the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
+of the Software, and to permit persons to whom the Software is furnished to do 
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
+software, either in source code form or as a compiled binary, for any purpose, 
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this 
+software dedicate any and all copyright interest in the software to the public 
+domain. We make this dedication for the benefit of the public at large and to 
+the detriment of our heirs and successors. We intend this dedication to be an 
+overt act of relinquishment in perpetuity of all present and future rights to 
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/stb_image_resize_test/dotimings.c b/stb_image_resize_test/dotimings.c
new file mode 100644
index 0000000..515c5d5
--- /dev/null
+++ b/stb_image_resize_test/dotimings.c
@@ -0,0 +1,224 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _MSC_VER
+
+#define stop() __debugbreak()
+#include <windows.h>
+#define int64 __int64
+#pragma warning(disable:4127)
+
+#define get_milliseconds GetTickCount
+
+#else
+
+#define stop() __builtin_trap()
+#define int64 long long
+
+typedef unsigned int U32;
+typedef unsigned long long U64;
+
+#include <time.h>
+static int get_milliseconds()
+{
+  struct timespec ts;
+  clock_gettime( CLOCK_MONOTONIC, &ts );
+  return (U32) ( ( ((U64)(U32)ts.tv_sec) * 1000LL ) + (U64)(((U32)ts.tv_nsec+500000)/1000000) );
+}
+
+#endif
+
+#if defined(TIME_SIMD)
+  // default for most platforms
+#elif defined(TIME_SCALAR)
+  #define STBIR_NO_SIMD
+#else
+  #error You must define TIME_SIMD or TIME_SCALAR when compiling this file.
+#endif
+
+#define STBIR_PROFILE
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#define STBIR__V_FIRST_INFO_BUFFER v_info
+#include "stb_image_resize2.h"  // new one!
+
+#if defined(TIME_SIMD)  && !defined(STBIR_SIMD)
+#error Timing SIMD, but scalar was ON!
+#endif
+
+#if defined(TIME_SCALAR)  && defined(STBIR_SIMD)
+#error Timing scalar, but SIMD was ON!
+#endif
+
+#define HEADER 32
+
+
+static int file_write( const char *filename, void * buffer, size_t size ) 
+{
+  FILE * f = fopen( filename, "wb" );
+  if ( f == 0 ) return 0;
+  if ( fwrite( buffer, 1, size, f) != size ) return 0;
+  fclose(f);
+  return 1;
+}
+
+int64 nresize( void * o, int ox, int oy, int op, void * i, int ix, int iy, int ip, int buf, int type, int edg, int flt )
+{
+  STBIR_RESIZE resize;
+  int t;
+  int64 b;
+
+  stbir_resize_init( &resize, i, ix, iy, ip, o, ox, oy, op, buf, type );
+  stbir_set_edgemodes( &resize, edg, edg );
+  stbir_set_filters( &resize, flt, flt );
+  
+  stbir_build_samplers_with_splits( &resize, 1 );
+
+  b = 0x7fffffffffffffffULL;
+  for( t = 0 ; t < 16 ; t++ )
+  {
+    STBIR_PROFILE_INFO profile;
+    int64 v;
+    if(!stbir_resize_extended( &resize ) )
+      stop();
+    stbir_resize_extended_profile_info( &profile, &resize );
+    v = profile.clocks[1]+profile.clocks[2];
+    if ( v < b )
+    {
+      b = v;
+      t = 0;
+    }
+  }
+
+  stbir_free_samplers( &resize );
+
+  return b;
+}
+
+
+#define INSIZES 5
+#define TYPESCOUNT 5
+#define NUM 64
+
+static const int sizes[INSIZES]={63,126,252,520,772};
+static const int types[TYPESCOUNT]={STBIR_1CHANNEL,STBIR_2CHANNEL,STBIR_RGB,STBIR_4CHANNEL,STBIR_RGBA};
+static const int effective[TYPESCOUNT]={1,2,3,4,7};
+
+int main( int argc, char ** argv )
+{
+  unsigned char * input;
+  unsigned char * output;
+  int dimensionx, dimensiony;
+  int scalex, scaley;
+  int totalms;
+  int timing_count;
+  int ir;
+  int * file;
+  int * ts;
+  int64 totalcycles;
+
+  if ( argc != 6 )
+  {
+    printf("command: dotimings x_samps y_samps x_scale y_scale outfilename\n");
+    exit(1);
+  }
+
+  input = malloc( 4*1200*1200 );
+  memset( input, 0x80, 4*1200*1200 );
+  output = malloc( 4*10000*10000ULL );
+  
+  dimensionx = atoi( argv[1] );
+  dimensiony = atoi( argv[2] );
+  scalex = atoi( argv[3] );
+  scaley = atoi( argv[4] );
+
+  timing_count = dimensionx * dimensiony * INSIZES * TYPESCOUNT;
+
+  file = malloc( sizeof(int) * ( 2 * timing_count + HEADER ) );
+  ts = file + HEADER;
+
+  totalms = get_milliseconds();  
+  totalcycles = STBIR_PROFILE_FUNC();
+  for( ir = 0 ; ir < INSIZES ; ir++ )
+  {
+    int ix, iy, ty;
+    ix = iy = sizes[ir];
+
+    for( ty = 0 ; ty < TYPESCOUNT ; ty++ )
+    {
+      int h, hh;
+
+      h = 1;
+      for( hh = 0 ; hh < dimensiony; hh++ )
+      {
+        int ww, w = 1;
+        for( ww = 0 ; ww < dimensionx; ww++ )
+        {
+          int64 VF, HF;
+          int good;
+        
+          v_info.control_v_first = 2; // vertical first
+          VF = nresize( output, w, h, (w*4*1)&~3, input, ix, iy, ix*4*1, types[ty], STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_MITCHELL );
+          v_info.control_v_first = 1; // horizonal first
+          HF = nresize( output, w, h, (w*4*1)&~3, input, ix, iy, ix*4*1, types[ty], STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_MITCHELL );
+
+          good = ( ((HF<=VF) && (!v_info.v_first)) || ((VF<=HF) && (v_info.v_first)));
+
+//          printf("\r%d,%d, %d,%d, %d, %I64d,%I64d, // Good: %c(%c-%d)  CompEst: %.1f %.1f\n", ix, iy, w, h, ty, VF, HF, good?'y':'n', v_info.v_first?'v':'h', v_info.v_resize_classification, v_info.v_cost,v_info.h_cost );
+          ts[0] = (int)VF;
+          ts[1] = (int)HF;
+
+          ts += 2;
+
+          w += scalex;
+        }
+        printf(".");
+        h += scaley;  
+      }
+    }
+  }
+  totalms = get_milliseconds() - totalms;  
+  totalcycles = STBIR_PROFILE_FUNC() - totalcycles;
+
+  printf("\n");
+
+  file[0] = 'VFT1';
+
+  #if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
+  file[1] = 1;  // x64
+  #elif defined( _M_AMD64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) || defined(__ARM_NEON) || defined(__arm__) || defined( _M_ARM )
+  file[1] = 2;  // arm
+  #else
+  file[1] = 99;  // who knows???
+  #endif
+  
+  #ifdef STBIR_SIMD8
+    file[2] = 2;  // simd-8
+  #elif defined( STBIR_SIMD )
+    file[2] = 1;  // simd-4
+  #else
+    file[2] = 0;  // nosimd
+  #endif
+  
+  file[3] = dimensionx; // dimx
+  file[4] = dimensiony; // dimy
+  file[5] = TYPESCOUNT;  // channel types
+  file[ 6] = types[0]; file[7] = types[1]; file[8] = types[2]; file[9] = types[3]; file[10] = types[4];  // buffer_type 
+  file[11] = effective[0]; file[12] = effective[1]; file[13] = effective[2]; file[14] = effective[3]; file[15] = effective[4];  // effective channels 
+  file[16] = INSIZES;  // resizes
+  file[17] = sizes[0]; file[18] = sizes[0]; // input sizes (w x h)
+  file[19] = sizes[1]; file[20] = sizes[1];
+  file[21] = sizes[2]; file[22] = sizes[2];
+  file[23] = sizes[3]; file[24] = sizes[3];
+  file[25] = sizes[4]; file[26] = sizes[4];
+  file[27] = scalex;   file[28] = scaley;  // scale the dimx and dimy amount ( for(i=0;i<dimx) outputx = 1 + i*scalex; )
+  file[29] = totalms;
+  ((int64*)(file+30))[0] = totalcycles;
+
+  if ( !file_write( argv[5], file, sizeof(int) * ( 2 * timing_count + HEADER ) ) )
+    printf( "Error writing file: %s\n", argv[5] );
+  else
+    printf( "Successfully wrote timing file: %s\n", argv[5] );
+
+  return 0;
+}
diff --git a/stb_image_resize_test/old_image_resize.h b/stb_image_resize_test/old_image_resize.h
new file mode 100644
index 0000000..caf0f50
--- /dev/null
+++ b/stb_image_resize_test/old_image_resize.h
@@ -0,0 +1,2738 @@
+/* stb_image_resize - v0.96 - public domain image resizing
+   by Jorge L Rodriguez (@VinoBS) - 2014
+   http://github.com/nothings/stb
+
+   Written with emphasis on usability, portability, and efficiency. (No
+   SIMD or threads, so it be easily outperformed by libs that use those.)
+   Only scaling and translation is supported, no rotations or shears.
+   Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation.
+
+   COMPILING & LINKING
+      In one C/C++ file that #includes this file, do this:
+         #define STB_IMAGE_RESIZE_IMPLEMENTATION
+      before the #include. That will create the implementation in that file.
+
+   QUICKSTART
+      stbir_resize_uint8(      input_pixels , in_w , in_h , 0,
+                               output_pixels, out_w, out_h, 0, num_channels)
+      stbir_resize_float(...)
+      stbir_resize_uint8_srgb( input_pixels , in_w , in_h , 0,
+                               output_pixels, out_w, out_h, 0,
+                               num_channels , alpha_chan  , 0)
+      stbir_resize_uint8_srgb_edgemode(
+                               input_pixels , in_w , in_h , 0, 
+                               output_pixels, out_w, out_h, 0, 
+                               num_channels , alpha_chan  , 0, STBIR_EDGE_CLAMP)
+                                                            // WRAP/REFLECT/ZERO
+
+   FULL API
+      See the "header file" section of the source for API documentation.
+
+   ADDITIONAL DOCUMENTATION
+
+      SRGB & FLOATING POINT REPRESENTATION
+         The sRGB functions presume IEEE floating point. If you do not have
+         IEEE floating point, define STBIR_NON_IEEE_FLOAT. This will use
+         a slower implementation.
+
+      MEMORY ALLOCATION
+         The resize functions here perform a single memory allocation using
+         malloc. To control the memory allocation, before the #include that
+         triggers the implementation, do:
+
+            #define STBIR_MALLOC(size,context) ...
+            #define STBIR_FREE(ptr,context)   ...
+
+         Each resize function makes exactly one call to malloc/free, so to use
+         temp memory, store the temp memory in the context and return that.
+
+      ASSERT
+         Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
+
+      OPTIMIZATION
+         Define STBIR_SATURATE_INT to compute clamp values in-range using
+         integer operations instead of float operations. This may be faster
+         on some platforms.
+
+      DEFAULT FILTERS
+         For functions which don't provide explicit control over what filters
+         to use, you can change the compile-time defaults with
+
+            #define STBIR_DEFAULT_FILTER_UPSAMPLE     STBIR_FILTER_something
+            #define STBIR_DEFAULT_FILTER_DOWNSAMPLE   STBIR_FILTER_something
+
+         See stbir_filter in the header-file section for the list of filters.
+
+      NEW FILTERS
+         A number of 1D filter kernels are used. For a list of
+         supported filters see the stbir_filter enum. To add a new filter,
+         write a filter function and add it to stbir__filter_info_table.
+
+      PROGRESS
+         For interactive use with slow resize operations, you can install
+         a progress-report callback:
+
+            #define STBIR_PROGRESS_REPORT(val)   some_func(val)
+
+         The parameter val is a float which goes from 0 to 1 as progress is made.
+
+         For example:
+
+            static void my_progress_report(float progress);
+            #define STBIR_PROGRESS_REPORT(val) my_progress_report(val)
+
+            #define STB_IMAGE_RESIZE_IMPLEMENTATION
+            #include "stb_image_resize.h"
+
+            static void my_progress_report(float progress)
+            {
+               printf("Progress: %f%%\n", progress*100);
+            }
+
+      MAX CHANNELS
+         If your image has more than 64 channels, define STBIR_MAX_CHANNELS
+         to the max you'll have.
+
+      ALPHA CHANNEL
+         Most of the resizing functions provide the ability to control how
+         the alpha channel of an image is processed. The important things
+         to know about this:
+
+         1. The best mathematically-behaved version of alpha to use is
+         called "premultiplied alpha", in which the other color channels
+         have had the alpha value multiplied in. If you use premultiplied
+         alpha, linear filtering (such as image resampling done by this
+         library, or performed in texture units on GPUs) does the "right
+         thing". While premultiplied alpha is standard in the movie CGI
+         industry, it is still uncommon in the videogame/real-time world.
+
+         If you linearly filter non-premultiplied alpha, strange effects
+         occur. (For example, the 50/50 average of 99% transparent bright green
+         and 1% transparent black produces 50% transparent dark green when
+         non-premultiplied, whereas premultiplied it produces 50%
+         transparent near-black. The former introduces green energy
+         that doesn't exist in the source image.)
+
+         2. Artists should not edit premultiplied-alpha images; artists
+         want non-premultiplied alpha images. Thus, art tools generally output
+         non-premultiplied alpha images.
+
+         3. You will get best results in most cases by converting images
+         to premultiplied alpha before processing them mathematically.
+
+         4. If you pass the flag STBIR_FLAG_ALPHA_PREMULTIPLIED, the
+         resizer does not do anything special for the alpha channel;
+         it is resampled identically to other channels. This produces
+         the correct results for premultiplied-alpha images, but produces
+         less-than-ideal results for non-premultiplied-alpha images.
+
+         5. If you do not pass the flag STBIR_FLAG_ALPHA_PREMULTIPLIED,
+         then the resizer weights the contribution of input pixels
+         based on their alpha values, or, equivalently, it multiplies
+         the alpha value into the color channels, resamples, then divides
+         by the resultant alpha value. Input pixels which have alpha=0 do
+         not contribute at all to output pixels unless _all_ of the input
+         pixels affecting that output pixel have alpha=0, in which case
+         the result for that pixel is the same as it would be without
+         STBIR_FLAG_ALPHA_PREMULTIPLIED. However, this is only true for
+         input images in integer formats. For input images in float format,
+         input pixels with alpha=0 have no effect, and output pixels
+         which have alpha=0 will be 0 in all channels. (For float images,
+         you can manually achieve the same result by adding a tiny epsilon
+         value to the alpha channel of every image, and then subtracting
+         or clamping it at the end.)
+
+         6. You can suppress the behavior described in #5 and make
+         all-0-alpha pixels have 0 in all channels by #defining
+         STBIR_NO_ALPHA_EPSILON.
+
+         7. You can separately control whether the alpha channel is
+         interpreted as linear or affected by the colorspace. By default
+         it is linear; you almost never want to apply the colorspace.
+         (For example, graphics hardware does not apply sRGB conversion
+         to the alpha channel.)
+
+   CONTRIBUTORS
+      Jorge L Rodriguez: Implementation
+      Sean Barrett: API design, optimizations
+      Aras Pranckevicius: bugfix
+      Nathan Reed: warning fixes
+
+   REVISIONS
+      0.96 (2019-03-04) fixed warnings
+      0.95 (2017-07-23) fixed warnings
+      0.94 (2017-03-18) fixed warnings
+      0.93 (2017-03-03) fixed bug with certain combinations of heights
+      0.92 (2017-01-02) fix integer overflow on large (>2GB) images
+      0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
+      0.90 (2014-09-17) first released version
+
+   LICENSE
+     See end of file for license information.
+
+   TODO
+      Don't decode all of the image data when only processing a partial tile
+      Don't use full-width decode buffers when only processing a partial tile
+      When processing wide images, break processing into tiles so data fits in L1 cache
+      Installable filters?
+      Resize that respects alpha test coverage
+         (Reference code: FloatImage::alphaTestCoverage and FloatImage::scaleAlphaToCoverage:
+         https://code.google.com/p/nvidia-texture-tools/source/browse/trunk/src/nvimage/FloatImage.cpp )
+*/
+
+#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE_H
+#define STBIR_INCLUDE_STB_IMAGE_RESIZE_H
+
+#ifdef _MSC_VER
+typedef unsigned char  stbir_uint8;
+typedef unsigned short stbir_uint16;
+typedef unsigned int   stbir_uint32;
+typedef unsigned __int64   stbir_uint64;
+#else
+#include <stdint.h>
+typedef uint8_t  stbir_uint8;
+typedef uint16_t stbir_uint16;
+typedef uint32_t stbir_uint32;
+typedef uint64_t stbir_uint64;
+#endif
+
+#ifndef STBIRDEF
+#ifdef STB_IMAGE_RESIZE_STATIC
+#define STBIRDEF static
+#else
+#ifdef __cplusplus
+#define STBIRDEF extern "C"
+#else
+#define STBIRDEF extern
+#endif
+#endif
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Easy-to-use API:
+//
+//     * "input pixels" points to an array of image data with 'num_channels' channels (e.g. RGB=3, RGBA=4)
+//     * input_w is input image width (x-axis), input_h is input image height (y-axis)
+//     * stride is the offset between successive rows of image data in memory, in bytes. you can
+//       specify 0 to mean packed continuously in memory
+//     * alpha channel is treated identically to other channels.
+//     * colorspace is linear or sRGB as specified by function name
+//     * returned result is 1 for success or 0 in case of an error.
+//       #define STBIR_ASSERT() to trigger an assert on parameter validation errors.
+//     * Memory required grows approximately linearly with input and output size, but with
+//       discontinuities at input_w == output_w and input_h == output_h.
+//     * These functions use a "default" resampling filter defined at compile time. To change the filter,
+//       you can change the compile-time defaults by #defining STBIR_DEFAULT_FILTER_UPSAMPLE
+//       and STBIR_DEFAULT_FILTER_DOWNSAMPLE, or you can use the medium-complexity API.
+
+STBIRDEF int stbir_resize_uint8(     const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels);
+
+STBIRDEF int stbir_resize_float(     const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels);
+
+
+// The following functions interpret image data as gamma-corrected sRGB. 
+// Specify STBIR_ALPHA_CHANNEL_NONE if you have no alpha channel,
+// or otherwise provide the index of the alpha channel. Flags value
+// of 0 will probably do the right thing if you're not sure what
+// the flags mean.
+
+#define STBIR_ALPHA_CHANNEL_NONE       -1
+
+// Set this flag if your texture has premultiplied alpha. Otherwise, stbir will
+// use alpha-weighted resampling (effectively premultiplying, resampling,
+// then unpremultiplying).
+#define STBIR_FLAG_ALPHA_PREMULTIPLIED    (1 << 0)
+// The specified alpha channel should be handled as gamma-corrected value even
+// when doing sRGB operations.
+#define STBIR_FLAG_ALPHA_USES_COLORSPACE  (1 << 1)
+
+#define STBIR_FLAG_ALPHA_OUT_PREMULTIPLIED (1 << 2)
+
+STBIRDEF int stbir_resize_uint8_srgb(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels, int alpha_channel, int flags);
+
+
+typedef enum
+{
+    STBIR_EDGE_CLAMP   = 1,
+    STBIR_EDGE_REFLECT = 2,
+    STBIR_EDGE_WRAP    = 3,
+    STBIR_EDGE_ZERO    = 4,
+} stbir_edge;
+
+// This function adds the ability to specify how requests to sample off the edge of the image are handled.
+STBIRDEF int stbir_resize_uint8_srgb_edgemode(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                                    unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                              int num_channels, int alpha_channel, int flags,
+                                              stbir_edge edge_wrap_mode);
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Medium-complexity API
+//
+// This extends the easy-to-use API as follows:
+//
+//     * Alpha-channel can be processed separately
+//       * If alpha_channel is not STBIR_ALPHA_CHANNEL_NONE
+//         * Alpha channel will not be gamma corrected (unless flags&STBIR_FLAG_GAMMA_CORRECT)
+//         * Filters will be weighted by alpha channel (unless flags&STBIR_FLAG_ALPHA_PREMULTIPLIED)
+//     * Filter can be selected explicitly
+//     * uint16 image type
+//     * sRGB colorspace available for all types
+//     * context parameter for passing to STBIR_MALLOC
+
+typedef enum
+{
+    STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
+    STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
+    STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
+    STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netrevalli with B=1,C=0), gaussian-esque
+    STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline
+    STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netrevalli filter with B=1/3, C=1/3
+} stbir_filter;
+
+typedef enum
+{
+    STBIR_COLORSPACE_LINEAR,
+    STBIR_COLORSPACE_SRGB,
+
+    STBIR_MAX_COLORSPACES,
+} stbir_colorspace;
+
+// The following functions are all identical except for the type of the image data
+
+STBIRDEF int stbir_resize_uint8_generic( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                               unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context);
+
+STBIRDEF int stbir_resize_uint16_generic(const stbir_uint16 *input_pixels  , int input_w , int input_h , int input_stride_in_bytes,
+                                               stbir_uint16 *output_pixels , int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context);
+
+STBIRDEF int stbir_resize_float_generic( const float *input_pixels         , int input_w , int input_h , int input_stride_in_bytes,
+                                               float *output_pixels        , int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context);
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Full-complexity API
+//
+// This extends the medium API as follows:
+//
+//       * uint32 image type
+//     * not typesafe
+//     * separate filter types for each axis
+//     * separate edge modes for each axis
+//     * can specify scale explicitly for subpixel correctness
+//     * can specify image source tile using texture coordinates
+
+typedef enum
+{
+    STBIR_TYPE_UINT8 ,
+    STBIR_TYPE_UINT16,
+    STBIR_TYPE_FLOAT ,
+    STBIR_TYPE_UINT32,
+
+    STBIR_MAX_TYPES
+} stbir_datatype;
+
+STBIRDEF int stbir_resize(         const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context);
+
+STBIRDEF int stbir_resize_subpixel(const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context,
+                                   float x_scale, float y_scale,
+                                   float x_offset, float y_offset);
+
+STBIRDEF int stbir_resize_region(  const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context,
+                                   float s0, float t0, float s1, float t1);
+// (s0, t0) & (s1, t1) are the top-left and bottom right corner (uv addressing style: [0, 1]x[0, 1]) of a region of the input image to use.
+
+//
+//
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE_H
+
+
+
+
+
+#ifdef STB_IMAGE_RESIZE_IMPLEMENTATION
+
+#ifndef STBIR_ASSERT
+#include <assert.h>
+#define STBIR_ASSERT(x) assert(x)
+#endif
+
+// For memset
+#include <string.h>
+
+#include <math.h>
+
+#ifndef STBIR_MALLOC
+#include <stdlib.h>
+// use comma operator to evaluate c, to avoid "unused parameter" warnings
+#define STBIR_MALLOC(size,c) ((void)(c), malloc(size))
+#define STBIR_FREE(ptr,c)    ((void)(c), free(ptr))
+#endif
+
+#ifndef _MSC_VER
+#ifdef __cplusplus
+#define stbir__inline inline
+#else
+#define stbir__inline
+#endif
+#else
+#define stbir__inline __forceinline
+#endif
+
+#ifdef STBIR_PROFILE
+
+union
+{
+  struct { stbir_uint64 total, setup, filters, looping, vertical, horizontal, decode, encode, alpha, unalpha; } named;
+  stbir_uint64 array[10];
+} oldprofile;
+stbir_uint64 * current_zone_excluded_ptr;
+
+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
+
+#ifdef _MSC_VER
+
+  STBIRDEF stbir_uint64 __rdtsc();
+  #define STBIR_PROFILE_FUNC() __rdtsc()
+
+#else // non msvc
+
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC() 
+  {
+    stbir_uint32 lo, hi;
+    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
+    return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo );
+  }
+
+#endif  // msvc
+
+#elif defined( _M_AMD64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#ifdef _MSC_VER
+
+  #error Not sure what the intrinsic for cntvct_el0 is on MSVC
+
+#else // no msvc
+
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
+  {
+    stbir_uint64 tsc;
+    asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
+    return tsc;
+  }
+
+#endif
+
+#elif // x64, arm
+
+#error Unknown platform for profiling.
+
+#endif  //x64 and   
+
+#define STBIR_PROFILE_START() { stbir_uint64 thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * save_parent_excluded_ptr = current_zone_excluded_ptr; stbir_uint64 current_zone_excluded = 0; current_zone_excluded_ptr = &current_zone_excluded; 
+#define STBIR_PROFILE_END( wh ) thiszonetime = STBIR_PROFILE_FUNC() - thiszonetime; oldprofile.named.wh += thiszonetime - current_zone_excluded; *save_parent_excluded_ptr += thiszonetime; current_zone_excluded_ptr = save_parent_excluded_ptr; }
+#define STBIR_PROFILE_FIRST_START() { int i; current_zone_excluded_ptr = &oldprofile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(oldprofile.array);i++) oldprofile.array[i]=0; } STBIR_PROFILE_START();
+
+#else
+
+#define STBIR_PROFILE_START()
+#define STBIR_PROFILE_END( wh )
+#define STBIR_PROFILE_FIRST_START()
+
+#endif
+
+// should produce compiler error if size is wrong
+typedef unsigned char stbir__validate_uint32[sizeof(stbir_uint32) == 4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBIR__NOTUSED(v)  (void)(v)
+#else
+#define STBIR__NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
+
+#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
+#define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
+#endif
+
+#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
+#define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
+#endif
+
+#ifndef STBIR_PROGRESS_REPORT
+#define STBIR_PROGRESS_REPORT(float_0_to_1)
+#endif
+
+#ifndef STBIR_MAX_CHANNELS
+#define STBIR_MAX_CHANNELS 64
+#endif
+
+#if STBIR_MAX_CHANNELS > 65536
+#error "Too many channels; STBIR_MAX_CHANNELS must be no more than 65536."
+// because we store the indices in 16-bit variables
+#endif
+
+// This value is added to alpha just before premultiplication to avoid
+// zeroing out color values. It is equivalent to 2^-80. If you don't want
+// that behavior (it may interfere if you have floating point images with
+// very small alpha values) then you can define STBIR_NO_ALPHA_EPSILON to
+// disable it.
+#ifndef STBIR_ALPHA_EPSILON
+#define STBIR_ALPHA_EPSILON ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
+#endif
+
+
+
+#ifdef _MSC_VER
+#define STBIR__UNUSED_PARAM(v)  (void)(v)
+#else
+#define STBIR__UNUSED_PARAM(v)  (void)sizeof(v)
+#endif
+
+// must match stbir_datatype
+static unsigned char stbir__type_size[] = {
+    1, // STBIR_TYPE_UINT8
+    2, // STBIR_TYPE_UINT16
+    4, // STBIR_TYPE_UINT32
+    4, // STBIR_TYPE_FLOAT
+};
+
+// Kernel function centered at 0
+typedef float (stbir__kernel_fn)(float x, float scale);
+typedef float (stbir__support_fn)(float scale);
+
+typedef struct
+{
+    stbir__kernel_fn* kernel;
+    stbir__support_fn* support;
+} stbir__filter_info;
+
+// When upsampling, the contributors are which source pixels contribute.
+// When downsampling, the contributors are which destination pixels are contributed to.
+typedef struct
+{
+    int n0; // First contributing pixel
+    int n1; // Last contributing pixel
+} stbir__contributors;
+
+typedef struct
+{
+    const void* input_data;
+    int input_w;
+    int input_h;
+    int input_stride_bytes;
+
+    void* output_data;
+    int output_w;
+    int output_h;
+    int output_stride_bytes;
+
+    float s0, t0, s1, t1;
+
+    float horizontal_shift; // Units: output pixels
+    float vertical_shift;   // Units: output pixels
+    float horizontal_scale;
+    float vertical_scale;
+
+    int channels;
+    int alpha_channel;
+    stbir_uint32 flags;
+    stbir_datatype type;
+    stbir_filter horizontal_filter;
+    stbir_filter vertical_filter;
+    stbir_edge edge_horizontal;
+    stbir_edge edge_vertical;
+    stbir_colorspace colorspace;
+
+    stbir__contributors* horizontal_contributors;
+    float* horizontal_coefficients;
+
+    stbir__contributors* vertical_contributors;
+    float* vertical_coefficients;
+
+    int decode_buffer_pixels;
+    float* decode_buffer;
+
+    float* horizontal_buffer;
+
+    // cache these because ceil/floor are inexplicably showing up in profile
+    int horizontal_coefficient_width;
+    int vertical_coefficient_width;
+    int horizontal_filter_pixel_width;
+    int vertical_filter_pixel_width;
+    int horizontal_filter_pixel_margin;
+    int vertical_filter_pixel_margin;
+    int horizontal_num_contributors;
+    int vertical_num_contributors;
+
+    int ring_buffer_length_bytes;   // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
+    int ring_buffer_num_entries;    // Total number of entries in the ring buffer.
+    int ring_buffer_first_scanline;
+    int ring_buffer_last_scanline;
+    int ring_buffer_begin_index;    // first_scanline is at this index in the ring buffer
+    float* ring_buffer;
+
+    float* encode_buffer; // A temporary buffer to store floats so we don't lose precision while we do multiply-adds.
+
+    int horizontal_contributors_size;
+    int horizontal_coefficients_size;
+    int vertical_contributors_size;
+    int vertical_coefficients_size;
+    int decode_buffer_size;
+    int horizontal_buffer_size;
+    int ring_buffer_size;
+    int encode_buffer_size;
+} ostbir__info;
+
+
+static const float stbir__max_uint8_as_float  = 255.0f;
+static const float stbir__max_uint16_as_float = 65535.0f;
+static const double stbir__max_uint32_as_float = 4294967295.0;
+
+
+static stbir__inline int stbir__min(int a, int b)
+{
+    return a < b ? a : b;
+}
+
+static stbir__inline float stbir__saturate(float x)
+{
+    if (x < 0)
+        return 0;
+
+    if (x > 1)
+        return 1;
+
+    return x;
+}
+
+#ifdef STBIR_SATURATE_INT
+static stbir__inline stbir_uint8 stbir__saturate8(int x)
+{
+    if ((unsigned int) x <= 255)
+        return (stbir_uint8) x;
+
+    if (x < 0)
+        return 0;
+
+    return 255;
+}
+
+static stbir__inline stbir_uint16 stbir__saturate16(int x)
+{
+    if ((unsigned int) x <= 65535)
+        return (stbir_uint16) x;
+
+    if (x < 0)
+        return 0;
+
+    return 65535;
+}
+#endif
+
+static float stbir__srgb_uchar_to_linear_float[256] = {
+    0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
+    0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
+    0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
+    0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
+    0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
+    0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
+    0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
+    0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
+    0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
+    0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
+    0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
+    0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
+    0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
+    0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
+    0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
+    0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
+    0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
+    0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
+    0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
+    0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
+    0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
+    0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
+    0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
+    0.982251f, 0.991102f, 1.0f
+};
+
+static float stbir__srgb_to_linear(float f)
+{
+    if (f <= 0.04045f)
+        return f / 12.92f;
+    else
+        return (float)pow((f + 0.055f) / 1.055f, 2.4f);
+}
+
+static float stbir__linear_to_srgb(float f)
+{
+    if (f <= 0.0031308f)
+        return f * 12.92f;
+    else
+        return 1.055f * (float)pow(f, 1 / 2.4f) - 0.055f;
+}
+
+#ifndef STBIR_NON_IEEE_FLOAT
+// From https://gist.github.com/rygorous/2203834
+
+typedef union
+{
+    stbir_uint32 u;
+    float f;
+} stbir__FP32;
+
+static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
+    0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+    0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+    0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+    0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+    0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+    0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+    0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+    0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+    0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+    0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+    0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+    0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+    0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+};
+ 
+static stbir_uint8 stbir__linear_to_srgb_uchar(float in)
+{
+    static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
+    static const stbir__FP32 minval = { (127-13) << 23 };
+    stbir_uint32 tab,bias,scale,t;
+    stbir__FP32 f;
+ 
+    // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively.
+    // The tests are carefully written so that NaNs map to 0, same as in the reference
+    // implementation.
+    if (!(in > minval.f)) // written this way to catch NaNs
+        in = minval.f;
+    if (in > almostone.f)
+        in = almostone.f;
+ 
+    // Do the table lookup and unpack bias, scale
+    f.f = in;
+    tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20];
+    bias = (tab >> 16) << 9;
+    scale = tab & 0xffff;
+ 
+    // Grab next-highest mantissa bits and perform linear interpolation
+    t = (f.u >> 12) & 0xff;
+    return (unsigned char) ((bias + scale*t) >> 16);
+}
+
+#else
+// sRGB transition values, scaled by 1<<28
+static int stbir__srgb_offset_to_linear_scaled[256] =
+{
+            0,     40738,    122216,    203693,    285170,    366648,    448125,    529603,
+       611080,    692557,    774035,    855852,    942009,   1033024,   1128971,   1229926,
+      1335959,   1447142,   1563542,   1685229,   1812268,   1944725,   2082664,   2226148,
+      2375238,   2529996,   2690481,   2856753,   3028870,   3206888,   3390865,   3580856,
+      3776916,   3979100,   4187460,   4402049,   4622919,   4850123,   5083710,   5323731,
+      5570236,   5823273,   6082892,   6349140,   6622065,   6901714,   7188133,   7481369,
+      7781466,   8088471,   8402427,   8723380,   9051372,   9386448,   9728650,  10078021,
+     10434603,  10798439,  11169569,  11548036,  11933879,  12327139,  12727857,  13136073,
+     13551826,  13975156,  14406100,  14844697,  15290987,  15745007,  16206795,  16676389,
+     17153826,  17639142,  18132374,  18633560,  19142734,  19659934,  20185196,  20718552,
+     21260042,  21809696,  22367554,  22933648,  23508010,  24090680,  24681686,  25281066,
+     25888850,  26505076,  27129772,  27762974,  28404716,  29055026,  29713942,  30381490,
+     31057708,  31742624,  32436272,  33138682,  33849884,  34569912,  35298800,  36036568,
+     36783260,  37538896,  38303512,  39077136,  39859796,  40651528,  41452360,  42262316,
+     43081432,  43909732,  44747252,  45594016,  46450052,  47315392,  48190064,  49074096,
+     49967516,  50870356,  51782636,  52704392,  53635648,  54576432,  55526772,  56486700,
+     57456236,  58435408,  59424248,  60422780,  61431036,  62449032,  63476804,  64514376,
+     65561776,  66619028,  67686160,  68763192,  69850160,  70947088,  72053992,  73170912,
+     74297864,  75434880,  76581976,  77739184,  78906536,  80084040,  81271736,  82469648,
+     83677792,  84896192,  86124888,  87363888,  88613232,  89872928,  91143016,  92423512,
+     93714432,  95015816,  96327688,  97650056,  98982952, 100326408, 101680440, 103045072,
+    104420320, 105806224, 107202800, 108610064, 110028048, 111456776, 112896264, 114346544,
+    115807632, 117279552, 118762328, 120255976, 121760536, 123276016, 124802440, 126339832,
+    127888216, 129447616, 131018048, 132599544, 134192112, 135795792, 137410592, 139036528,
+    140673648, 142321952, 143981456, 145652208, 147334208, 149027488, 150732064, 152447968,
+    154175200, 155913792, 157663776, 159425168, 161197984, 162982240, 164777968, 166585184,
+    168403904, 170234160, 172075968, 173929344, 175794320, 177670896, 179559120, 181458992,
+    183370528, 185293776, 187228736, 189175424, 191133888, 193104112, 195086128, 197079968,
+    199085648, 201103184, 203132592, 205173888, 207227120, 209292272, 211369392, 213458480,
+    215559568, 217672656, 219797792, 221934976, 224084240, 226245600, 228419056, 230604656,
+    232802400, 235012320, 237234432, 239468736, 241715280, 243974080, 246245120, 248528464,
+    250824112, 253132064, 255452368, 257785040, 260130080, 262487520, 264857376, 267239664,
+};
+
+static stbir_uint8 stbir__linear_to_srgb_uchar(float f)
+{
+    int x = (int) (f * (1 << 28)); // has headroom so you don't need to clamp
+    int v = 0;
+    int i;
+
+    // Refine the guess with a short binary search.
+    i = v + 128; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
+    i = v +  64; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
+    i = v +  32; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
+    i = v +  16; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
+    i = v +   8; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
+    i = v +   4; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
+    i = v +   2; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
+    i = v +   1; if (x >= stbir__srgb_offset_to_linear_scaled[i]) v = i;
+
+    return (stbir_uint8) v;
+}
+#endif
+
+static float stbir__filter_trapezoid(float x, float scale)
+{
+    float halfscale = scale / 2;
+    float t = 0.5f + halfscale;
+    STBIR_ASSERT(scale <= 1);
+
+    x = (float)fabs(x);
+
+    if (x >= t)
+        return 0;
+    else
+    {
+        float r = 0.5f - halfscale;
+        if (x <= r)
+            return 1;
+        else
+            return (t - x) / scale;
+    }
+}
+
+static float stbir__support_trapezoid(float scale)
+{
+    STBIR_ASSERT(scale <= 1);
+    return 0.5f + scale / 2;
+}
+
+static float stbir__filter_triangle(float x, float s)
+{
+    STBIR__UNUSED_PARAM(s);
+
+    x = (float)fabs(x);
+
+    if (x <= 1.0f)
+        return 1 - x;
+    else
+        return 0;
+}
+
+static float stbir__filter_cubic(float x, float s)
+{
+    STBIR__UNUSED_PARAM(s);
+
+    x = (float)fabs(x);
+
+    if (x < 1.0f)
+        return (4 + x*x*(3*x - 6))/6;
+    else if (x < 2.0f)
+        return (8 + x*(-12 + x*(6 - x)))/6;
+
+    return (0.0f);
+}
+
+static float stbir__filter_catmullrom(float x, float s)
+{
+    STBIR__UNUSED_PARAM(s);
+
+    x = (float)fabs(x);
+
+    if (x < 1.0f)
+        return 1 - x*x*(2.5f - 1.5f*x);
+    else if (x < 2.0f)
+        return 2 - x*(4 + x*(0.5f*x - 2.5f));
+
+    return (0.0f);
+}
+
+static float stbir__filter_mitchell(float x, float s)
+{
+    STBIR__UNUSED_PARAM(s);
+
+    x = (float)fabs(x);
+
+    if (x < 1.0f)
+        return (16 + x*x*(21 * x - 36))/18;
+    else if (x < 2.0f)
+        return (32 + x*(-60 + x*(36 - 7*x)))/18;
+
+    return (0.0f);
+}
+
+static float stbir__support_zero(float s)
+{
+    STBIR__UNUSED_PARAM(s);
+    return 0;
+}
+
+static float stbir__support_one(float s)
+{
+    STBIR__UNUSED_PARAM(s);
+    return 1;
+}
+
+static float stbir__support_two(float s)
+{
+    STBIR__UNUSED_PARAM(s);
+    return 2;
+}
+
+static stbir__filter_info stbir__filter_info_table[] = {
+        { NULL,                     stbir__support_zero },
+        { stbir__filter_trapezoid,  stbir__support_trapezoid },
+        { stbir__filter_triangle,   stbir__support_one },
+        { stbir__filter_cubic,      stbir__support_two },
+        { stbir__filter_catmullrom, stbir__support_two },
+        { stbir__filter_mitchell,   stbir__support_two },
+};
+
+stbir__inline static int stbir__use_upsampling(float ratio)
+{
+    return ratio > 1;
+}
+
+stbir__inline static int stbir__use_width_upsampling(ostbir__info* stbir_info)
+{
+    return stbir__use_upsampling(stbir_info->horizontal_scale);
+}
+
+stbir__inline static int stbir__use_height_upsampling(ostbir__info* stbir_info)
+{
+    return stbir__use_upsampling(stbir_info->vertical_scale);
+}
+
+// This is the maximum number of input samples that can affect an output sample
+// with the given filter
+static int stbir__get_filter_pixel_width(stbir_filter filter, float scale)
+{
+    STBIR_ASSERT(filter != 0);
+    STBIR_ASSERT(filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
+
+    if (stbir__use_upsampling(scale))
+        return (int)ceil(stbir__filter_info_table[filter].support(1/scale) * 2);
+    else
+        return (int)ceil(stbir__filter_info_table[filter].support(scale) * 2 / scale);
+}
+
+// This is how much to expand buffers to account for filters seeking outside
+// the image boundaries.
+static int stbir__get_filter_pixel_margin(stbir_filter filter, float scale)
+{
+    return stbir__get_filter_pixel_width(filter, scale) / 2;
+}
+
+static int stbir__get_coefficient_width(stbir_filter filter, float scale)
+{
+    if (stbir__use_upsampling(scale))
+        return (int)ceil(stbir__filter_info_table[filter].support(1 / scale) * 2);
+    else
+        return (int)ceil(stbir__filter_info_table[filter].support(scale) * 2);
+}
+
+static int stbir__get_contributors(float scale, stbir_filter filter, int input_size, int output_size)
+{
+    if (stbir__use_upsampling(scale))
+        return output_size;
+    else
+        return (input_size + stbir__get_filter_pixel_margin(filter, scale) * 2);
+}
+
+static int stbir__get_total_horizontal_coefficients(ostbir__info* info)
+{
+    return info->horizontal_num_contributors
+         * stbir__get_coefficient_width      (info->horizontal_filter, info->horizontal_scale);
+}
+
+static int stbir__get_total_vertical_coefficients(ostbir__info* info)
+{
+    return info->vertical_num_contributors
+         * stbir__get_coefficient_width      (info->vertical_filter, info->vertical_scale);
+}
+
+static stbir__contributors* stbir__get_contributor(stbir__contributors* contributors, int n)
+{
+    return &contributors[n];
+}
+
+// For perf reasons this code is duplicated in stbir__resample_horizontal_upsample/downsample,
+// if you change it here change it there too.
+static float* stbir__get_coefficient(float* coefficients, stbir_filter filter, float scale, int n, int c)
+{
+    int width = stbir__get_coefficient_width(filter, scale);
+    return &coefficients[width*n + c];
+}
+
+static int stbir__edge_wrap_slow(stbir_edge edge, int n, int max)
+{
+    switch (edge)
+    {
+    case STBIR_EDGE_ZERO:
+        return 0; // we'll decode the wrong pixel here, and then overwrite with 0s later
+
+    case STBIR_EDGE_CLAMP:
+        if (n < 0)
+            return 0;
+
+        if (n >= max)
+            return max - 1;
+
+        return n; // NOTREACHED
+
+    case STBIR_EDGE_REFLECT:
+    {
+        if (n < 0)
+        {
+            if (n > -max)
+                return -n;
+            else
+                return max - 1;
+        }
+
+        if (n >= max)
+        {
+            int max2 = max * 2;
+            if (n >= max2)
+                return 0;
+            else
+                return max2 - n - 1;
+        }
+
+        return n; // NOTREACHED
+    }
+
+    case STBIR_EDGE_WRAP:
+        if (n >= 0)
+            return (n % max);
+        else
+        {
+            int m = (-n) % max;
+
+            if (m != 0)
+                m = max - m;
+
+            return (m);
+        }
+        // NOTREACHED
+
+    default:
+        STBIR_ASSERT(!"Unimplemented edge type");
+        return 0;
+    }
+}
+
+stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
+{
+    // avoid per-pixel switch
+    if (n >= 0 && n < max)
+        return n;
+    return stbir__edge_wrap_slow(edge, n, max);
+}
+
+// What input pixels contribute to this output pixel?
+static void stbir__calculate_sample_range_upsample(int n, float out_filter_radius, float scale_ratio, float out_shift, int* in_first_pixel, int* in_last_pixel, float* in_center_of_out)
+{
+    float out_pixel_center = (float)n + 0.5f;
+    float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
+    float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
+
+    float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) / scale_ratio;
+    float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) / scale_ratio;
+
+    *in_center_of_out = (out_pixel_center + out_shift) / scale_ratio;
+    *in_first_pixel = (int)(floor(in_pixel_influence_lowerbound + 0.5));
+    *in_last_pixel = (int)(floor(in_pixel_influence_upperbound - 0.5));
+}
+
+// What output pixels does this input pixel contribute to?
+static void stbir__calculate_sample_range_downsample(int n, float in_pixels_radius, float scale_ratio, float out_shift, int* out_first_pixel, int* out_last_pixel, float* out_center_of_in)
+{
+    float in_pixel_center = (float)n + 0.5f;
+    float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius;
+    float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius;
+
+    float out_pixel_influence_lowerbound = in_pixel_influence_lowerbound * scale_ratio - out_shift;
+    float out_pixel_influence_upperbound = in_pixel_influence_upperbound * scale_ratio - out_shift;
+
+    *out_center_of_in = in_pixel_center * scale_ratio - out_shift;
+    *out_first_pixel = (int)(floor(out_pixel_influence_lowerbound + 0.5));
+    *out_last_pixel = (int)(floor(out_pixel_influence_upperbound - 0.5));
+}
+
+static void stbir__calculate_coefficients_upsample(stbir_filter filter, float scale, int in_first_pixel, int in_last_pixel, float in_center_of_out, stbir__contributors* contributor, float* coefficient_group)
+{
+    int i;
+    float total_filter = 0;
+    float filter_scale;
+
+    STBIR_ASSERT(in_last_pixel - in_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(1/scale) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
+
+    contributor->n0 = in_first_pixel;
+    contributor->n1 = in_last_pixel;
+
+    STBIR_ASSERT(contributor->n1 >= contributor->n0);
+
+    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
+    {
+        float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
+        coefficient_group[i] = stbir__filter_info_table[filter].kernel(in_center_of_out - in_pixel_center, 1 / scale);
+
+        // If the coefficient is zero, skip it. (Don't do the <0 check here, we want the influence of those outside pixels.)
+        if (i == 0 && !coefficient_group[i])
+        {
+            contributor->n0 = ++in_first_pixel;
+            i--;
+            continue;
+        }
+
+        total_filter += coefficient_group[i];
+    }
+
+    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(in_last_pixel + 1) + 0.5f - in_center_of_out, 1/scale) == 0);
+
+    STBIR_ASSERT(total_filter > 0.9);
+    STBIR_ASSERT(total_filter < 1.1f); // Make sure it's not way off.
+
+    // Make sure the sum of all coefficients is 1.
+    filter_scale = 1 / total_filter;
+
+    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
+        coefficient_group[i] *= filter_scale;
+
+    for (i = in_last_pixel - in_first_pixel; i >= 0; i--)
+    {
+        if (coefficient_group[i])
+            break;
+
+        // This line has no weight. We can skip it.
+        contributor->n1 = contributor->n0 + i - 1;
+    }
+}
+
+static void stbir__calculate_coefficients_downsample(stbir_filter filter, float scale_ratio, int out_first_pixel, int out_last_pixel, float out_center_of_in, stbir__contributors* contributor, float* coefficient_group)
+{
+    int i;
+
+     STBIR_ASSERT(out_last_pixel - out_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(scale_ratio) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
+
+    contributor->n0 = out_first_pixel;
+    contributor->n1 = out_last_pixel;
+
+    STBIR_ASSERT(contributor->n1 >= contributor->n0);
+
+    for (i = 0; i <= out_last_pixel - out_first_pixel; i++)
+    {
+        float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
+        float x = out_pixel_center - out_center_of_in;
+        coefficient_group[i] = stbir__filter_info_table[filter].kernel(x, scale_ratio) * scale_ratio;
+    }
+
+    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(out_last_pixel + 1) + 0.5f - out_center_of_in, scale_ratio) == 0);
+
+    for (i = out_last_pixel - out_first_pixel; i >= 0; i--)
+    {
+        if (coefficient_group[i])
+            break;
+
+        // This line has no weight. We can skip it.
+        contributor->n1 = contributor->n0 + i - 1;
+    }
+}
+
+static void stbir__normalize_downsample_coefficients(stbir__contributors* contributors, float* coefficients, stbir_filter filter, float scale_ratio, int input_size, int output_size)
+{
+    int num_contributors = stbir__get_contributors(scale_ratio, filter, input_size, output_size);
+    int num_coefficients = stbir__get_coefficient_width(filter, scale_ratio);
+    int i, j;
+    int skip;
+
+    for (i = 0; i < output_size; i++)
+    {
+        float scale;
+        float total = 0;
+
+        for (j = 0; j < num_contributors; j++)
+        {
+            if (i >= contributors[j].n0 && i <= contributors[j].n1)
+            {
+                float coefficient = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i - contributors[j].n0);
+                total += coefficient;
+            }
+            else if (i < contributors[j].n0)
+                break;
+        }
+
+        //STBIR_ASSERT(total > 0.9f);
+        //STBIR_ASSERT(total < 1.5f);
+
+        scale = 1 / total;
+
+        for (j = 0; j < num_contributors; j++)
+        {
+            if (i >= contributors[j].n0 && i <= contributors[j].n1)
+                *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i - contributors[j].n0) *= scale;
+            else if (i < contributors[j].n0)
+                break;
+        }
+    }
+
+    // Optimize: Skip zero coefficients and contributions outside of image bounds.
+    // Do this after normalizing because normalization depends on the n0/n1 values.
+    for (j = 0; j < num_contributors; j++)
+    {
+        int range, max, width;
+
+        skip = 0;
+        while (*stbir__get_coefficient(coefficients, filter, scale_ratio, j, skip) == 0)
+            skip++;
+
+        contributors[j].n0 += skip;
+
+        while (contributors[j].n0 < 0)
+        {
+            contributors[j].n0++;
+            skip++;
+        }
+
+        range = contributors[j].n1 - contributors[j].n0 + 1;
+        max = stbir__min(num_coefficients, range);
+
+        width = stbir__get_coefficient_width(filter, scale_ratio);
+        for (i = 0; i < max; i++)
+        {
+            if (i + skip >= width)
+                break;
+
+            *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i) = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i + skip);
+        }
+
+        continue;
+    }
+
+    // Using min to avoid writing into invalid pixels.
+    for (i = 0; i < num_contributors; i++)
+        contributors[i].n1 = stbir__min(contributors[i].n1, output_size - 1);
+}
+
+// Each scan line uses the same kernel values so we should calculate the kernel
+// values once and then we can use them for every scan line.
+static void stbir__calculate_filters(stbir__contributors* contributors, float* coefficients, stbir_filter filter, float scale_ratio, float shift, int input_size, int output_size)
+{
+    int n;
+    int total_contributors = stbir__get_contributors(scale_ratio, filter, input_size, output_size);
+
+    if (stbir__use_upsampling(scale_ratio))
+    {
+        float out_pixels_radius = stbir__filter_info_table[filter].support(1 / scale_ratio) * scale_ratio;
+
+        // Looping through out pixels
+        for (n = 0; n < total_contributors; n++)
+        {
+            float in_center_of_out; // Center of the current out pixel in the in pixel space
+            int in_first_pixel, in_last_pixel;
+
+            stbir__calculate_sample_range_upsample(n, out_pixels_radius, scale_ratio, shift, &in_first_pixel, &in_last_pixel, &in_center_of_out);
+
+            stbir__calculate_coefficients_upsample(filter, scale_ratio, in_first_pixel, in_last_pixel, in_center_of_out, stbir__get_contributor(contributors, n), stbir__get_coefficient(coefficients, filter, scale_ratio, n, 0));
+        }
+    }
+    else
+    {
+        float in_pixels_radius = stbir__filter_info_table[filter].support(scale_ratio) / scale_ratio;
+
+        // Looping through in pixels
+        for (n = 0; n < total_contributors; n++)
+        {
+            float out_center_of_in; // Center of the current out pixel in the in pixel space
+            int out_first_pixel, out_last_pixel;
+            int n_adjusted = n - stbir__get_filter_pixel_margin(filter, scale_ratio);
+
+            stbir__calculate_sample_range_downsample(n_adjusted, in_pixels_radius, scale_ratio, shift, &out_first_pixel, &out_last_pixel, &out_center_of_in);
+
+            stbir__calculate_coefficients_downsample(filter, scale_ratio, out_first_pixel, out_last_pixel, out_center_of_in, stbir__get_contributor(contributors, n), stbir__get_coefficient(coefficients, filter, scale_ratio, n, 0));
+        }
+
+        stbir__normalize_downsample_coefficients(contributors, coefficients, filter, scale_ratio, input_size, output_size);
+    }
+}
+
+static float* stbir__get_decode_buffer(ostbir__info* stbir_info)
+{
+    // The 0 index of the decode buffer starts after the margin. This makes
+    // it okay to use negative indexes on the decode buffer.
+    return &stbir_info->decode_buffer[stbir_info->horizontal_filter_pixel_margin * stbir_info->channels];
+}
+
+#define STBIR__DECODE(type, colorspace) ((type) * (STBIR_MAX_COLORSPACES) + (colorspace))
+
+static void stbir__decode_scanline(ostbir__info* stbir_info, int n)
+{
+    int c;
+    int channels = stbir_info->channels;
+    int alpha_channel = stbir_info->alpha_channel;
+    int type = stbir_info->type;
+    int colorspace = stbir_info->colorspace;
+    int input_w = stbir_info->input_w;
+    size_t input_stride_bytes = stbir_info->input_stride_bytes;
+    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
+    stbir_edge edge_horizontal = stbir_info->edge_horizontal;
+    stbir_edge edge_vertical = stbir_info->edge_vertical;
+    size_t in_buffer_row_offset = stbir__edge_wrap(edge_vertical, n, stbir_info->input_h) * input_stride_bytes;
+    const void* input_data = (char *) stbir_info->input_data + in_buffer_row_offset;
+    int max_x = input_w + stbir_info->horizontal_filter_pixel_margin;
+    int decode = STBIR__DECODE(type, colorspace);
+
+    int x = -stbir_info->horizontal_filter_pixel_margin;
+
+    // special handling for STBIR_EDGE_ZERO because it needs to return an item that doesn't appear in the input,
+    // and we want to avoid paying overhead on every pixel if not STBIR_EDGE_ZERO
+    if (edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->input_h))
+    {
+        for (; x < max_x; x++)
+            for (c = 0; c < channels; c++)
+                decode_buffer[x*channels + c] = 0;
+        return;
+    }
+
+    STBIR_PROFILE_START( );
+    switch (decode)
+    {
+    case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_LINEAR):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = ((float)((const unsigned char*)input_data)[input_pixel_index + c]) / stbir__max_uint8_as_float;
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_SRGB):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = stbir__srgb_uchar_to_linear_float[((const unsigned char*)input_data)[input_pixel_index + c]];
+
+            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                decode_buffer[decode_pixel_index + alpha_channel] = ((float)((const unsigned char*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint8_as_float;
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_LINEAR):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = ((float)((const unsigned short*)input_data)[input_pixel_index + c]) / stbir__max_uint16_as_float;
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_SRGB):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear(((float)((const unsigned short*)input_data)[input_pixel_index + c]) / stbir__max_uint16_as_float);
+
+            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                decode_buffer[decode_pixel_index + alpha_channel] = ((float)((const unsigned short*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint16_as_float;
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_LINEAR):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = (float)(((double)((const unsigned int*)input_data)[input_pixel_index + c]) / stbir__max_uint32_as_float);
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_SRGB):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear((float)(((double)((const unsigned int*)input_data)[input_pixel_index + c]) / stbir__max_uint32_as_float));
+
+            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                decode_buffer[decode_pixel_index + alpha_channel] = (float)(((double)((const unsigned int*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint32_as_float);
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_LINEAR):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = ((const float*)input_data)[input_pixel_index + c];
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_SRGB):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear(((const float*)input_data)[input_pixel_index + c]);
+
+            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                decode_buffer[decode_pixel_index + alpha_channel] = ((const float*)input_data)[input_pixel_index + alpha_channel];
+        }
+
+        break;
+
+    default:
+        STBIR_ASSERT(!"Unknown type/colorspace/channels combination.");
+        break;
+    }
+    STBIR_PROFILE_END( decode );
+
+    if (!(stbir_info->flags & STBIR_FLAG_ALPHA_PREMULTIPLIED))
+    {
+        STBIR_PROFILE_START();
+
+        for (x = -stbir_info->horizontal_filter_pixel_margin; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+
+            // If the alpha value is 0 it will clobber the color values. Make sure it's not.
+            float alpha = decode_buffer[decode_pixel_index + alpha_channel];
+#ifndef STBIR_NO_ALPHA_EPSILON
+            if (stbir_info->type != STBIR_TYPE_FLOAT) {
+                alpha += STBIR_ALPHA_EPSILON;
+                decode_buffer[decode_pixel_index + alpha_channel] = alpha;
+            }
+#endif
+            for (c = 0; c < channels; c++)
+            {
+                if (c == alpha_channel)
+                    continue;
+
+                decode_buffer[decode_pixel_index + c] *= alpha;
+            }
+        }
+        STBIR_PROFILE_END( alpha );
+    }
+
+    if (edge_horizontal == STBIR_EDGE_ZERO)
+    {
+        for (x = -stbir_info->horizontal_filter_pixel_margin; x < 0; x++)
+        {
+            for (c = 0; c < channels; c++)
+                decode_buffer[x*channels + c] = 0;
+        }
+        for (x = input_w; x < max_x; x++)
+        {
+            for (c = 0; c < channels; c++)
+                decode_buffer[x*channels + c] = 0;
+        }
+    }
+}
+
+static float* stbir__get_ring_buffer_entry(float* ring_buffer, int index, int ring_buffer_length)
+{
+    return &ring_buffer[index * ring_buffer_length];
+}
+
+static float* stbir__add_empty_ring_buffer_entry(ostbir__info* stbir_info, int n)
+{
+    int ring_buffer_index;
+    float* ring_buffer;
+
+    stbir_info->ring_buffer_last_scanline = n;
+
+    if (stbir_info->ring_buffer_begin_index < 0)
+    {
+        ring_buffer_index = stbir_info->ring_buffer_begin_index = 0;
+        stbir_info->ring_buffer_first_scanline = n;
+    }
+    else
+    {
+        ring_buffer_index = (stbir_info->ring_buffer_begin_index + (stbir_info->ring_buffer_last_scanline - stbir_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
+        STBIR_ASSERT(ring_buffer_index != stbir_info->ring_buffer_begin_index);
+    }
+
+    ring_buffer = stbir__get_ring_buffer_entry(stbir_info->ring_buffer, ring_buffer_index, stbir_info->ring_buffer_length_bytes / sizeof(float));
+    
+    memset(ring_buffer, 0, stbir_info->ring_buffer_length_bytes);
+
+    return ring_buffer;
+}
+
+
+static void stbir__resample_horizontal_upsample(ostbir__info* stbir_info, float* output_buffer)
+{
+    int x, k;
+    int output_w = stbir_info->output_w;
+    int channels = stbir_info->channels;
+    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
+    stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
+    float* horizontal_coefficients = stbir_info->horizontal_coefficients;
+    int coefficient_width = stbir_info->horizontal_coefficient_width;
+
+    STBIR_PROFILE_START( );
+    for (x = 0; x < output_w; x++)
+    {
+        int n0 = horizontal_contributors[x].n0;
+        int n1 = horizontal_contributors[x].n1;
+
+        int out_pixel_index = x * channels;
+        int coefficient_group = coefficient_width * x;
+        int coefficient_counter = 0;
+
+        STBIR_ASSERT(n1 >= n0);
+        STBIR_ASSERT(n0 >= -stbir_info->horizontal_filter_pixel_margin);
+        STBIR_ASSERT(n1 >= -stbir_info->horizontal_filter_pixel_margin);
+        STBIR_ASSERT(n0 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
+        STBIR_ASSERT(n1 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
+
+        switch (channels) {
+            case 1:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * 1;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                }
+                break;
+            case 2:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * 2;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                }
+                break;
+            case 3:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * 3;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
+                }
+                break;
+            case 4:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * 4;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
+                    output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
+                }
+                break;
+            default:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * channels;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    int c;
+                    STBIR_ASSERT(coefficient != 0);
+                    for (c = 0; c < channels; c++)
+                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
+                }
+                break;
+        }
+    }
+    STBIR_PROFILE_END( horizontal );
+}
+
+static void stbir__resample_horizontal_downsample(ostbir__info* stbir_info, float* output_buffer)
+{
+    int x, k;
+    int input_w = stbir_info->input_w;
+    int channels = stbir_info->channels;
+    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
+    stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
+    float* horizontal_coefficients = stbir_info->horizontal_coefficients;
+    int coefficient_width = stbir_info->horizontal_coefficient_width;
+    int filter_pixel_margin = stbir_info->horizontal_filter_pixel_margin;
+    int max_x = input_w + filter_pixel_margin * 2;
+
+    STBIR_ASSERT(!stbir__use_width_upsampling(stbir_info));
+
+    STBIR_PROFILE_START( );
+    switch (channels) {
+        case 1:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * 1;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int out_pixel_index = k * 1;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                }
+            }
+            break;
+
+        case 2:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * 2;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int out_pixel_index = k * 2;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                }
+            }
+            break;
+
+        case 3:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * 3;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int out_pixel_index = k * 3;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    //STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
+                }
+            }
+            break;
+
+        case 4:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * 4;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int out_pixel_index = k * 4;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
+                    output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
+                }
+            }
+            break;
+
+        default:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * channels;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int c;
+                    int out_pixel_index = k * channels;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    for (c = 0; c < channels; c++)
+                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
+                }
+            }
+            break;
+    }
+    STBIR_PROFILE_END( horizontal );
+}
+
+static void stbir__decode_and_resample_upsample(ostbir__info* stbir_info, int n)
+{
+    // Decode the nth scanline from the source image into the decode buffer.
+    stbir__decode_scanline(stbir_info, n);
+
+    // Now resample it into the ring buffer.
+    if (stbir__use_width_upsampling(stbir_info))
+        stbir__resample_horizontal_upsample(stbir_info, stbir__add_empty_ring_buffer_entry(stbir_info, n));
+    else
+        stbir__resample_horizontal_downsample(stbir_info, stbir__add_empty_ring_buffer_entry(stbir_info, n));
+
+    // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling.
+}
+
+static void stbir__decode_and_resample_downsample(ostbir__info* stbir_info, int n)
+{
+    // Decode the nth scanline from the source image into the decode buffer.
+    stbir__decode_scanline(stbir_info, n);
+
+    memset(stbir_info->horizontal_buffer, 0, stbir_info->output_w * stbir_info->channels * sizeof(float));
+
+    // Now resample it into the horizontal buffer.
+    if (stbir__use_width_upsampling(stbir_info))
+        stbir__resample_horizontal_upsample(stbir_info, stbir_info->horizontal_buffer);
+    else
+        stbir__resample_horizontal_downsample(stbir_info, stbir_info->horizontal_buffer);
+
+    // Now it's sitting in the horizontal buffer ready to be distributed into the ring buffers.
+}
+
+// Get the specified scan line from the ring buffer.
+static float* stbir__get_ring_buffer_scanline(int get_scanline, float* ring_buffer, int begin_index, int first_scanline, int ring_buffer_num_entries, int ring_buffer_length)
+{
+    int ring_buffer_index = (begin_index + (get_scanline - first_scanline)) % ring_buffer_num_entries;
+    return stbir__get_ring_buffer_entry(ring_buffer, ring_buffer_index, ring_buffer_length);
+}
+
+
+static void stbir__encode_scanline(ostbir__info* stbir_info, int num_pixels, void *output_buffer, float *encode_buffer, int channels, int alpha_channel, int decode)
+{
+    int x;
+    int n;
+    int num_nonalpha;
+    stbir_uint16 nonalpha[STBIR_MAX_CHANNELS];
+
+   if ((!(stbir_info->flags&STBIR_FLAG_ALPHA_OUT_PREMULTIPLIED))&&(alpha_channel!=-1))
+   {
+        STBIR_PROFILE_START( );
+
+        for (x=0; x < num_pixels; ++x)
+        {
+            int pixel_index = x*channels;
+
+            float alpha = encode_buffer[pixel_index + alpha_channel];
+            float reciprocal_alpha = alpha ? 1.0f / alpha : 0;
+
+            // unrolling this produced a 1% slowdown upscaling a large RGBA linear-space image on my machine - stb
+            for (n = 0; n < channels; n++)
+                if (n != alpha_channel)
+                    encode_buffer[pixel_index + n] *= reciprocal_alpha;
+
+            // We added in a small epsilon to prevent the color channel from being deleted with zero alpha.
+            // Because we only add it for integer types, it will automatically be discarded on integer
+            // conversion, so we don't need to subtract it back out (which would be problematic for
+            // numeric precision reasons).
+        }
+      STBIR_PROFILE_END( unalpha );
+    }
+
+    // build a table of all channels that need colorspace correction, so
+    // we don't perform colorspace correction on channels that don't need it.
+    for (x = 0, num_nonalpha = 0; x < channels; ++x)
+    {
+        if (x != alpha_channel || (stbir_info->flags & STBIR_FLAG_ALPHA_USES_COLORSPACE))
+        {
+            nonalpha[num_nonalpha++] = (stbir_uint16)x;
+        }
+    }
+
+    #define STBIR__ROUND_INT(f)    ((int)          ((f)+0.5))
+    #define STBIR__ROUND_UINT(f)   ((stbir_uint32) ((f)+0.5))
+
+    #ifdef STBIR__SATURATE_INT
+    #define STBIR__ENCODE_LINEAR8(f)   stbir__saturate8 (STBIR__ROUND_INT((f) * stbir__max_uint8_as_float ))
+    #define STBIR__ENCODE_LINEAR16(f)  stbir__saturate16(STBIR__ROUND_INT((f) * stbir__max_uint16_as_float))
+    #else
+    #define STBIR__ENCODE_LINEAR8(f)   (unsigned char ) STBIR__ROUND_INT(stbir__saturate(f) * stbir__max_uint8_as_float )
+    #define STBIR__ENCODE_LINEAR16(f)  (unsigned short) STBIR__ROUND_INT(stbir__saturate(f) * stbir__max_uint16_as_float)
+    #endif
+
+    STBIR_PROFILE_START( );
+
+    switch (decode)
+    {
+        case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_LINEAR):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < channels; n++)
+                {
+                    int index = pixel_index + n;
+                    ((unsigned char*)output_buffer)[index] = STBIR__ENCODE_LINEAR8(encode_buffer[index]);
+                }
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_SRGB):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < num_nonalpha; n++)
+                {
+                    int index = pixel_index + nonalpha[n];
+                    ((unsigned char*)output_buffer)[index] = stbir__linear_to_srgb_uchar(encode_buffer[index]);
+                }
+
+                if (!(stbir_info->flags & STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                    ((unsigned char *)output_buffer)[pixel_index + alpha_channel] = STBIR__ENCODE_LINEAR8(encode_buffer[pixel_index+alpha_channel]);
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_LINEAR):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < channels; n++)
+                {
+                    int index = pixel_index + n;
+                    ((unsigned short*)output_buffer)[index] = STBIR__ENCODE_LINEAR16(encode_buffer[index]);
+                }
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_SRGB):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < num_nonalpha; n++)
+                {
+                    int index = pixel_index + nonalpha[n];
+                    ((unsigned short*)output_buffer)[index] = (unsigned short)STBIR__ROUND_INT(stbir__linear_to_srgb(stbir__saturate(encode_buffer[index])) * stbir__max_uint16_as_float);
+                }
+
+                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                    ((unsigned short*)output_buffer)[pixel_index + alpha_channel] = STBIR__ENCODE_LINEAR16(encode_buffer[pixel_index + alpha_channel]);
+            }
+
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_LINEAR):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < channels; n++)
+                {
+                    int index = pixel_index + n;
+                    ((unsigned int*)output_buffer)[index] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__saturate(encode_buffer[index])) * stbir__max_uint32_as_float);
+                }
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_SRGB):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < num_nonalpha; n++)
+                {
+                    int index = pixel_index + nonalpha[n];
+                    ((unsigned int*)output_buffer)[index] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__linear_to_srgb(stbir__saturate(encode_buffer[index]))) * stbir__max_uint32_as_float);
+                }
+
+                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                    ((unsigned int*)output_buffer)[pixel_index + alpha_channel] = (unsigned int)STBIR__ROUND_INT(((double)stbir__saturate(encode_buffer[pixel_index + alpha_channel])) * stbir__max_uint32_as_float);
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_LINEAR):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < channels; n++)
+                {
+                    int index = pixel_index + n;
+                    ((float*)output_buffer)[index] = encode_buffer[index];
+                }
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_SRGB):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < num_nonalpha; n++)
+                {
+                    int index = pixel_index + nonalpha[n];
+                    float p = encode_buffer[index];
+                    if ( p <= 0 ) p = 0; if ( p >= 1.0 ) p = 1.0;
+                    ((float*)output_buffer)[index] = stbir__linear_to_srgb(p);
+                }
+
+                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                {
+                    float p = encode_buffer[pixel_index + alpha_channel];
+                    if ( p <= 0 ) p = 0; if ( p >= 1.0 ) p = 1.0;
+                    ((float*)output_buffer)[pixel_index + alpha_channel] = p;
+                }
+            }
+            break;
+
+        default:
+            STBIR_ASSERT(!"Unknown type/colorspace/channels combination.");
+            break;
+    }
+    STBIR_PROFILE_END( encode );
+}
+
+static void stbir__resample_vertical_upsample(ostbir__info* stbir_info, int n)
+{
+    int x, k;
+    int output_w = stbir_info->output_w;
+    stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
+    float* vertical_coefficients = stbir_info->vertical_coefficients;
+    int channels = stbir_info->channels;
+    int alpha_channel = stbir_info->alpha_channel;
+    int type = stbir_info->type;
+    int colorspace = stbir_info->colorspace;
+    int ring_buffer_entries = stbir_info->ring_buffer_num_entries;
+    void* output_data = stbir_info->output_data;
+    float* encode_buffer = stbir_info->encode_buffer;
+    int decode = STBIR__DECODE(type, colorspace);
+    int coefficient_width = stbir_info->vertical_coefficient_width;
+    int coefficient_counter;
+    int contributor = n;
+
+    float* ring_buffer = stbir_info->ring_buffer;
+    int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
+    int ring_buffer_first_scanline = stbir_info->ring_buffer_first_scanline;
+    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);
+
+    int n0,n1, output_row_start;
+    int coefficient_group = coefficient_width * contributor;
+
+    n0 = vertical_contributors[contributor].n0;
+    n1 = vertical_contributors[contributor].n1;
+
+    output_row_start = n * stbir_info->output_stride_bytes;
+
+    STBIR_ASSERT(stbir__use_height_upsampling(stbir_info));
+
+    STBIR_PROFILE_START( );
+
+    memset(encode_buffer, 0, output_w * sizeof(float) * channels);
+
+    // I tried reblocking this for better cache usage of encode_buffer
+    // (using x_outer, k, x_inner), but it lost speed. -- stb
+
+    coefficient_counter = 0;
+    switch (channels) {
+        case 1:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * 1;
+                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
+                }
+            }
+            break;
+        case 2:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * 2;
+                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
+                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
+                }
+            }
+            break;
+        case 3:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * 3;
+                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
+                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
+                    encode_buffer[in_pixel_index + 2] += ring_buffer_entry[in_pixel_index + 2] * coefficient;
+                }
+            }
+            break;
+        case 4:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * 4;
+                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
+                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
+                    encode_buffer[in_pixel_index + 2] += ring_buffer_entry[in_pixel_index + 2] * coefficient;
+                    encode_buffer[in_pixel_index + 3] += ring_buffer_entry[in_pixel_index + 3] * coefficient;
+                }
+            }
+            break;
+        default:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * channels;
+                    int c;
+                    for (c = 0; c < channels; c++)
+                        encode_buffer[in_pixel_index + c] += ring_buffer_entry[in_pixel_index + c] * coefficient;
+                }
+            }
+            break;
+    }
+    STBIR_PROFILE_END( vertical );
+    stbir__encode_scanline(stbir_info, output_w, (char *) output_data + output_row_start, encode_buffer, channels, alpha_channel, decode);
+}
+
+static void stbir__resample_vertical_downsample(ostbir__info* stbir_info, int n)
+{
+    int x, k;
+    int output_w = stbir_info->output_w;
+    stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
+    float* vertical_coefficients = stbir_info->vertical_coefficients;
+    int channels = stbir_info->channels;
+    int ring_buffer_entries = stbir_info->ring_buffer_num_entries;
+    float* horizontal_buffer = stbir_info->horizontal_buffer;
+    int coefficient_width = stbir_info->vertical_coefficient_width;
+    int contributor = n + stbir_info->vertical_filter_pixel_margin;
+
+    float* ring_buffer = stbir_info->ring_buffer;
+    int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
+    int ring_buffer_first_scanline = stbir_info->ring_buffer_first_scanline;
+    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);
+    int n0,n1;
+
+    n0 = vertical_contributors[contributor].n0;
+    n1 = vertical_contributors[contributor].n1;
+
+    STBIR_ASSERT(!stbir__use_height_upsampling(stbir_info));
+
+    STBIR_PROFILE_START( );
+    for (k = n0; k <= n1; k++)
+    {
+        int coefficient_index = k - n0;
+        int coefficient_group = coefficient_width * contributor;
+        float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+
+        float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+
+        switch (channels) {
+            case 1:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * 1;
+                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
+                }
+                break;
+            case 2:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * 2;
+                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
+                }
+                break;
+            case 3:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * 3;
+                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 2] += horizontal_buffer[in_pixel_index + 2] * coefficient;
+                }
+                break;
+            case 4:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * 4;
+                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 2] += horizontal_buffer[in_pixel_index + 2] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 3] += horizontal_buffer[in_pixel_index + 3] * coefficient;
+                }
+                break;
+            default:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * channels;
+
+                    int c;
+                    for (c = 0; c < channels; c++)
+                        ring_buffer_entry[in_pixel_index + c] += horizontal_buffer[in_pixel_index + c] * coefficient;
+                }
+                break;
+        }
+    }
+    STBIR_PROFILE_END( vertical );
+}
+
+static void stbir__buffer_loop_upsample(ostbir__info* stbir_info)
+{
+    int y;
+    float scale_ratio = stbir_info->vertical_scale;
+    float out_scanlines_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(1/scale_ratio) * scale_ratio;
+
+    STBIR_ASSERT(stbir__use_height_upsampling(stbir_info));
+
+    for (y = 0; y < stbir_info->output_h; y++)
+    {
+        float in_center_of_out = 0; // Center of the current out scanline in the in scanline space
+        int in_first_scanline = 0, in_last_scanline = 0;
+
+        stbir__calculate_sample_range_upsample(y, out_scanlines_radius, scale_ratio, stbir_info->vertical_shift, &in_first_scanline, &in_last_scanline, &in_center_of_out);
+
+        STBIR_ASSERT(in_last_scanline - in_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
+
+        if (stbir_info->ring_buffer_begin_index >= 0)
+        {
+            // Get rid of whatever we don't need anymore.
+            while (in_first_scanline > stbir_info->ring_buffer_first_scanline)
+            {
+                if (stbir_info->ring_buffer_first_scanline == stbir_info->ring_buffer_last_scanline)
+                {
+                    // We just popped the last scanline off the ring buffer.
+                    // Reset it to the empty state.
+                    stbir_info->ring_buffer_begin_index = -1;
+                    stbir_info->ring_buffer_first_scanline = 0;
+                    stbir_info->ring_buffer_last_scanline = 0;
+                    break;
+                }
+                else
+                {
+                    stbir_info->ring_buffer_first_scanline++;
+                    stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->ring_buffer_num_entries;
+                }
+            }
+        }
+
+        // Load in new ones.
+        if (stbir_info->ring_buffer_begin_index < 0)
+            stbir__decode_and_resample_upsample(stbir_info, in_first_scanline);
+
+        while (in_last_scanline > stbir_info->ring_buffer_last_scanline)
+            stbir__decode_and_resample_upsample(stbir_info, stbir_info->ring_buffer_last_scanline + 1);
+
+        // Now all buffers should be ready to write a row of vertical sampling.
+        stbir__resample_vertical_upsample(stbir_info, y);
+
+        STBIR_PROGRESS_REPORT((float)y / stbir_info->output_h);
+    }
+}
+
+static void stbir__empty_ring_buffer(ostbir__info* stbir_info, int first_necessary_scanline)
+{
+    int output_stride_bytes = stbir_info->output_stride_bytes;
+    int channels = stbir_info->channels;
+    int alpha_channel = stbir_info->alpha_channel;
+    int type = stbir_info->type;
+    int colorspace = stbir_info->colorspace;
+    int output_w = stbir_info->output_w;
+    void* output_data = stbir_info->output_data;
+    int decode = STBIR__DECODE(type, colorspace);
+
+    float* ring_buffer = stbir_info->ring_buffer;
+    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float);
+
+    if (stbir_info->ring_buffer_begin_index >= 0)
+    {
+        // Get rid of whatever we don't need anymore.
+        while (first_necessary_scanline > stbir_info->ring_buffer_first_scanline)
+        {
+            if (stbir_info->ring_buffer_first_scanline >= 0 && stbir_info->ring_buffer_first_scanline < stbir_info->output_h)
+            {
+                int output_row_start = stbir_info->ring_buffer_first_scanline * output_stride_bytes;
+                float* ring_buffer_entry = stbir__get_ring_buffer_entry(ring_buffer, stbir_info->ring_buffer_begin_index, ring_buffer_length);
+                stbir__encode_scanline(stbir_info, output_w, (char *) output_data + output_row_start, ring_buffer_entry, channels, alpha_channel, decode);
+                STBIR_PROGRESS_REPORT((float)stbir_info->ring_buffer_first_scanline / stbir_info->output_h);
+            }
+
+            if (stbir_info->ring_buffer_first_scanline == stbir_info->ring_buffer_last_scanline)
+            {
+                // We just popped the last scanline off the ring buffer.
+                // Reset it to the empty state.
+                stbir_info->ring_buffer_begin_index = -1;
+                stbir_info->ring_buffer_first_scanline = 0;
+                stbir_info->ring_buffer_last_scanline = 0;
+                break;
+            }
+            else
+            {
+                stbir_info->ring_buffer_first_scanline++;
+                stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->ring_buffer_num_entries;
+            }
+        }
+    }
+}
+
+static void stbir__buffer_loop_downsample(ostbir__info* stbir_info)
+{
+    int y;
+    float scale_ratio = stbir_info->vertical_scale;
+    int output_h = stbir_info->output_h;
+    float in_pixels_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(scale_ratio) / scale_ratio;
+    int pixel_margin = stbir_info->vertical_filter_pixel_margin;
+    int max_y = stbir_info->input_h + pixel_margin;
+
+    STBIR_ASSERT(!stbir__use_height_upsampling(stbir_info));
+
+    for (y = -pixel_margin; y < max_y; y++)
+    {
+        float out_center_of_in; // Center of the current out scanline in the in scanline space
+        int out_first_scanline, out_last_scanline;
+
+        stbir__calculate_sample_range_downsample(y, in_pixels_radius, scale_ratio, stbir_info->vertical_shift, &out_first_scanline, &out_last_scanline, &out_center_of_in);
+
+        STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
+
+        if (out_last_scanline < 0 || out_first_scanline >= output_h)
+            continue;
+
+        stbir__empty_ring_buffer(stbir_info, out_first_scanline);
+
+        stbir__decode_and_resample_downsample(stbir_info, y);
+
+        // Load in new ones.
+        if (stbir_info->ring_buffer_begin_index < 0)
+            stbir__add_empty_ring_buffer_entry(stbir_info, out_first_scanline);
+
+        while (out_last_scanline > stbir_info->ring_buffer_last_scanline)
+            stbir__add_empty_ring_buffer_entry(stbir_info, stbir_info->ring_buffer_last_scanline + 1);
+
+        // Now the horizontal buffer is ready to write to all ring buffer rows.
+        stbir__resample_vertical_downsample(stbir_info, y);
+    }
+
+    stbir__empty_ring_buffer(stbir_info, stbir_info->output_h);
+}
+
+static void stbir__setup(ostbir__info *info, int input_w, int input_h, int output_w, int output_h, int channels)
+{
+    info->input_w = input_w;
+    info->input_h = input_h;
+    info->output_w = output_w;
+    info->output_h = output_h;
+    info->channels = channels;
+}
+
+static void stbir__calculate_transform(ostbir__info *info, float s0, float t0, float s1, float t1, float *transform)
+{
+    info->s0 = s0;
+    info->t0 = t0;
+    info->s1 = s1;
+    info->t1 = t1;
+
+    if (transform)
+    {
+        info->horizontal_scale = transform[0];
+        info->vertical_scale   = transform[1];
+        info->horizontal_shift = transform[2];
+        info->vertical_shift   = transform[3];
+    }
+    else
+    {
+        info->horizontal_scale = ((float)info->output_w / info->input_w) / (s1 - s0);
+        info->vertical_scale = ((float)info->output_h / info->input_h) / (t1 - t0);
+
+        info->horizontal_shift = s0 * info->output_w / (s1 - s0);
+        info->vertical_shift = t0 * info->output_h / (t1 - t0);
+    }
+}
+
+static void stbir__choose_filter(ostbir__info *info, stbir_filter h_filter, stbir_filter v_filter)
+{
+    if (h_filter == 0)
+        h_filter = stbir__use_upsampling(info->horizontal_scale) ? STBIR_DEFAULT_FILTER_UPSAMPLE : STBIR_DEFAULT_FILTER_DOWNSAMPLE;
+    if (v_filter == 0)
+        v_filter = stbir__use_upsampling(info->vertical_scale)   ? STBIR_DEFAULT_FILTER_UPSAMPLE : STBIR_DEFAULT_FILTER_DOWNSAMPLE;
+    info->horizontal_filter = h_filter;
+    info->vertical_filter = v_filter;
+}
+
+static stbir_uint32 stbir__calculate_memory(ostbir__info *info)
+{
+    int pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
+    int filter_height = stbir__get_filter_pixel_width(info->vertical_filter, info->vertical_scale);
+
+    info->horizontal_num_contributors = stbir__get_contributors(info->horizontal_scale, info->horizontal_filter, info->input_w, info->output_w);
+    info->vertical_num_contributors   = stbir__get_contributors(info->vertical_scale  , info->vertical_filter  , info->input_h, info->output_h);
+
+    // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
+    info->ring_buffer_num_entries = filter_height + 1;
+
+    info->horizontal_contributors_size = info->horizontal_num_contributors * sizeof(stbir__contributors);
+    info->horizontal_coefficients_size = stbir__get_total_horizontal_coefficients(info) * sizeof(float);
+    info->vertical_contributors_size = info->vertical_num_contributors * sizeof(stbir__contributors);
+    info->vertical_coefficients_size = stbir__get_total_vertical_coefficients(info) * sizeof(float);
+    info->decode_buffer_size = (info->input_w + pixel_margin * 2) * info->channels * sizeof(float);
+    info->horizontal_buffer_size = info->output_w * info->channels * sizeof(float);
+    info->ring_buffer_size = info->output_w * info->channels * info->ring_buffer_num_entries * sizeof(float);
+    info->encode_buffer_size = info->output_w * info->channels * sizeof(float);
+
+    STBIR_ASSERT(info->horizontal_filter != 0);
+    STBIR_ASSERT(info->horizontal_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); // this now happens too late
+    STBIR_ASSERT(info->vertical_filter != 0);
+    STBIR_ASSERT(info->vertical_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); // this now happens too late
+
+    if (stbir__use_height_upsampling(info))
+        // The horizontal buffer is for when we're downsampling the height and we
+        // can't output the result of sampling the decode buffer directly into the
+        // ring buffers.
+        info->horizontal_buffer_size = 0;
+    else
+        // The encode buffer is to retain precision in the height upsampling method
+        // and isn't used when height downsampling.
+        info->encode_buffer_size = 0;
+
+    return info->horizontal_contributors_size + info->horizontal_coefficients_size
+        + info->vertical_contributors_size + info->vertical_coefficients_size
+        + info->decode_buffer_size + info->horizontal_buffer_size
+        + info->ring_buffer_size + info->encode_buffer_size;
+}
+
+static int stbir__resize_allocated(ostbir__info *info,
+    const void* input_data, int input_stride_in_bytes,
+    void* output_data, int output_stride_in_bytes,
+    int alpha_channel, stbir_uint32 flags, stbir_datatype type,
+    stbir_edge edge_horizontal, stbir_edge edge_vertical, stbir_colorspace colorspace,
+    void* tempmem, size_t tempmem_size_in_bytes)
+{
+    size_t memory_required = stbir__calculate_memory(info);
+
+    int width_stride_input = input_stride_in_bytes ? input_stride_in_bytes : info->channels * info->input_w * stbir__type_size[type];
+    int width_stride_output = output_stride_in_bytes ? output_stride_in_bytes : info->channels * info->output_w * stbir__type_size[type];
+
+#ifdef STBIR_DEBUG_OVERWRITE_TEST
+#define OVERWRITE_ARRAY_SIZE 8
+    unsigned char overwrite_output_before_pre[OVERWRITE_ARRAY_SIZE];
+    unsigned char overwrite_tempmem_before_pre[OVERWRITE_ARRAY_SIZE];
+    unsigned char overwrite_output_after_pre[OVERWRITE_ARRAY_SIZE];
+    unsigned char overwrite_tempmem_after_pre[OVERWRITE_ARRAY_SIZE];
+
+    size_t begin_forbidden = width_stride_output * (info->output_h - 1) + info->output_w * info->channels * stbir__type_size[type];
+    memcpy(overwrite_output_before_pre, &((unsigned char*)output_data)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE);
+    memcpy(overwrite_output_after_pre, &((unsigned char*)output_data)[begin_forbidden], OVERWRITE_ARRAY_SIZE);
+    memcpy(overwrite_tempmem_before_pre, &((unsigned char*)tempmem)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE);
+    memcpy(overwrite_tempmem_after_pre, &((unsigned char*)tempmem)[tempmem_size_in_bytes], OVERWRITE_ARRAY_SIZE);
+#endif
+
+    STBIR_ASSERT(info->channels >= 0);
+    STBIR_ASSERT(info->channels <= STBIR_MAX_CHANNELS);
+
+    if (info->channels < 0 || info->channels > STBIR_MAX_CHANNELS)
+        return 0;
+
+    STBIR_ASSERT(info->horizontal_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
+    STBIR_ASSERT(info->vertical_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
+
+    if (info->horizontal_filter >= STBIR__ARRAY_SIZE(stbir__filter_info_table))
+        return 0;
+    if (info->vertical_filter >= STBIR__ARRAY_SIZE(stbir__filter_info_table))
+        return 0;
+
+    if (alpha_channel < 0)
+        flags |= STBIR_FLAG_ALPHA_USES_COLORSPACE | STBIR_FLAG_ALPHA_PREMULTIPLIED;
+
+    if (!(flags&STBIR_FLAG_ALPHA_USES_COLORSPACE) || !(flags&STBIR_FLAG_ALPHA_PREMULTIPLIED)) {
+        STBIR_ASSERT(alpha_channel >= 0 && alpha_channel < info->channels);
+    }
+
+    if (alpha_channel >= info->channels)
+        return 0;
+
+    STBIR_ASSERT(tempmem);
+
+    if (!tempmem)
+        return 0;
+
+    STBIR_ASSERT(tempmem_size_in_bytes >= memory_required);
+
+    if (tempmem_size_in_bytes < memory_required)
+        return 0;
+
+    memset(tempmem, 0, tempmem_size_in_bytes);
+
+    info->input_data = input_data;
+    info->input_stride_bytes = width_stride_input;
+
+    info->output_data = output_data;
+    info->output_stride_bytes = width_stride_output;
+
+    info->alpha_channel = alpha_channel;
+    info->flags = flags;
+    info->type = type;
+    info->edge_horizontal = edge_horizontal;
+    info->edge_vertical = edge_vertical;
+    info->colorspace = colorspace;
+
+    STBIR_PROFILE_START();
+
+    info->horizontal_coefficient_width   = stbir__get_coefficient_width  (info->horizontal_filter, info->horizontal_scale);
+    info->vertical_coefficient_width     = stbir__get_coefficient_width  (info->vertical_filter  , info->vertical_scale  );
+    info->horizontal_filter_pixel_width  = stbir__get_filter_pixel_width (info->horizontal_filter, info->horizontal_scale);
+    info->vertical_filter_pixel_width    = stbir__get_filter_pixel_width (info->vertical_filter  , info->vertical_scale  );
+    info->horizontal_filter_pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
+    info->vertical_filter_pixel_margin   = stbir__get_filter_pixel_margin(info->vertical_filter  , info->vertical_scale  );
+
+    info->ring_buffer_length_bytes = info->output_w * info->channels * sizeof(float);
+    info->decode_buffer_pixels = info->input_w + info->horizontal_filter_pixel_margin * 2;
+
+#define STBIR__NEXT_MEMPTR(current, newtype) (newtype*)(((unsigned char*)current) + current##_size)
+
+    info->horizontal_contributors = (stbir__contributors *) tempmem;
+    info->horizontal_coefficients = STBIR__NEXT_MEMPTR(info->horizontal_contributors, float);
+    info->vertical_contributors = STBIR__NEXT_MEMPTR(info->horizontal_coefficients, stbir__contributors);
+    info->vertical_coefficients = STBIR__NEXT_MEMPTR(info->vertical_contributors, float);
+    info->decode_buffer = STBIR__NEXT_MEMPTR(info->vertical_coefficients, float);
+
+    if (stbir__use_height_upsampling(info))
+    {
+        info->horizontal_buffer = NULL;
+        info->ring_buffer = STBIR__NEXT_MEMPTR(info->decode_buffer, float);
+        info->encode_buffer = STBIR__NEXT_MEMPTR(info->ring_buffer, float);
+
+        STBIR_ASSERT((size_t)STBIR__NEXT_MEMPTR(info->encode_buffer, unsigned char) == (size_t)tempmem + tempmem_size_in_bytes);
+    }
+    else
+    {
+        info->horizontal_buffer = STBIR__NEXT_MEMPTR(info->decode_buffer, float);
+        info->ring_buffer = STBIR__NEXT_MEMPTR(info->horizontal_buffer, float);
+        info->encode_buffer = NULL;
+
+        STBIR_ASSERT((size_t)STBIR__NEXT_MEMPTR(info->ring_buffer, unsigned char) == (size_t)tempmem + tempmem_size_in_bytes);
+    }
+
+#undef STBIR__NEXT_MEMPTR
+
+    // This signals that the ring buffer is empty
+    info->ring_buffer_begin_index = -1;
+
+    stbir__calculate_filters(info->horizontal_contributors, info->horizontal_coefficients, info->horizontal_filter, info->horizontal_scale, info->horizontal_shift, info->input_w, info->output_w);
+    stbir__calculate_filters(info->vertical_contributors, info->vertical_coefficients, info->vertical_filter, info->vertical_scale, info->vertical_shift, info->input_h, info->output_h);
+    STBIR_PROFILE_END( filters );
+
+    STBIR_PROGRESS_REPORT(0);
+
+    STBIR_PROFILE_START();
+    if (stbir__use_height_upsampling(info))
+    {
+        stbir__buffer_loop_upsample(info);
+    }
+    else
+    {
+        stbir__buffer_loop_downsample(info);
+    }
+    STBIR_PROFILE_END( looping );
+
+
+    STBIR_PROGRESS_REPORT(1);
+
+#ifdef STBIR_DEBUG_OVERWRITE_TEST
+    STBIR_ASSERT(memcmp(overwrite_output_before_pre, &((unsigned char*)output_data)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE) == 0);
+    STBIR_ASSERT(memcmp(overwrite_output_after_pre, &((unsigned char*)output_data)[begin_forbidden], OVERWRITE_ARRAY_SIZE) == 0);
+    STBIR_ASSERT(memcmp(overwrite_tempmem_before_pre, &((unsigned char*)tempmem)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE) == 0);
+    STBIR_ASSERT(memcmp(overwrite_tempmem_after_pre, &((unsigned char*)tempmem)[tempmem_size_in_bytes], OVERWRITE_ARRAY_SIZE) == 0);
+#endif
+
+    return 1;
+}
+
+
+static int stbir__resize_arbitrary(
+    void *alloc_context,
+    const void* input_data, int input_w, int input_h, int input_stride_in_bytes,
+    void* output_data, int output_w, int output_h, int output_stride_in_bytes,
+    float s0, float t0, float s1, float t1, float *transform,
+    int channels, int alpha_channel, stbir_uint32 flags, stbir_datatype type,
+    stbir_filter h_filter, stbir_filter v_filter,
+    stbir_edge edge_horizontal, stbir_edge edge_vertical, stbir_colorspace colorspace)
+{
+    ostbir__info info;
+    int result;
+    size_t memory_required;
+    void* extra_memory;
+    
+    STBIR_PROFILE_FIRST_START();
+
+    stbir__setup(&info, input_w, input_h, output_w, output_h, channels);
+    stbir__calculate_transform(&info, s0,t0,s1,t1,transform);
+    stbir__choose_filter(&info, h_filter, v_filter);
+    memory_required = stbir__calculate_memory(&info);
+    extra_memory = STBIR_MALLOC(memory_required, alloc_context);
+
+    if (!extra_memory)
+    {
+        return 0;
+    }
+
+    result = stbir__resize_allocated(&info, input_data, input_stride_in_bytes,
+                                            output_data, output_stride_in_bytes, 
+                                            alpha_channel, flags, type,
+                                            edge_horizontal, edge_vertical,
+                                            colorspace, extra_memory, memory_required);
+
+    STBIR_PROFILE_END( setup);
+
+    STBIR_FREE(extra_memory, alloc_context);
+
+    return result;
+}
+
+STBIRDEF int stbir_resize_uint8(     const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels)
+{
+    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,-1,0, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
+        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_LINEAR);
+}
+
+STBIRDEF int stbir_resize_float(     const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels)
+{
+    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,-1,0, STBIR_TYPE_FLOAT, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
+        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_LINEAR);
+}
+
+STBIRDEF int stbir_resize_uint8_srgb(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels, int alpha_channel, int flags)
+{
+    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
+        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_SRGB);
+}
+
+STBIRDEF int stbir_resize_uint8_srgb_edgemode(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                                    unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                              int num_channels, int alpha_channel, int flags,
+                                              stbir_edge edge_wrap_mode)
+{
+    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
+        edge_wrap_mode, edge_wrap_mode, STBIR_COLORSPACE_SRGB);
+}
+
+STBIRDEF int stbir_resize_uint8_generic( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                               unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, filter, filter,
+        edge_wrap_mode, edge_wrap_mode, space);
+}
+
+STBIRDEF int stbir_resize_uint16_generic(const stbir_uint16 *input_pixels  , int input_w , int input_h , int input_stride_in_bytes,
+                                               stbir_uint16 *output_pixels , int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT16, filter, filter,
+        edge_wrap_mode, edge_wrap_mode, space);
+}
+
+
+STBIRDEF int stbir_resize_float_generic( const float *input_pixels         , int input_w , int input_h , int input_stride_in_bytes,
+                                               float *output_pixels        , int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_FLOAT, filter, filter,
+        edge_wrap_mode, edge_wrap_mode, space);
+}
+
+
+STBIRDEF int stbir_resize(         const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
+        edge_mode_horizontal, edge_mode_vertical, space);
+}
+
+
+STBIRDEF int stbir_resize_subpixel(const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context,
+                                   float x_scale, float y_scale,
+                                   float x_offset, float y_offset)
+{
+    float transform[4];
+    transform[0] = x_scale;
+    transform[1] = y_scale;
+    transform[2] = x_offset;
+    transform[3] = y_offset;
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,transform,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
+        edge_mode_horizontal, edge_mode_vertical, space);
+}
+
+STBIRDEF int stbir_resize_region(  const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context,
+                                   float s0, float t0, float s1, float t1)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        s0,t0,s1,t1,NULL,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
+        edge_mode_horizontal, edge_mode_vertical, space);
+}
+
+#endif // STB_IMAGE_RESIZE_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of 
+this software and associated documentation files (the "Software"), to deal in 
+the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
+of the Software, and to permit persons to whom the Software is furnished to do 
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
+software, either in source code form or as a compiled binary, for any purpose, 
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this 
+software dedicate any and all copyright interest in the software to the public 
+domain. We make this dedication for the benefit of the public at large and to 
+the detriment of our heirs and successors. We intend this dedication to be an 
+overt act of relinquishment in perpetuity of all present and future rights to 
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/stb_image_resize_test/oldir.c b/stb_image_resize_test/oldir.c
new file mode 100644
index 0000000..e1f3505
--- /dev/null
+++ b/stb_image_resize_test/oldir.c
@@ -0,0 +1,56 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef _MSC_VER
+#define stop() __debugbreak()
+#else
+#define stop() __builtin_trap()
+#endif
+
+//#define HEAVYTM
+#include "tm.h"
+
+#define STBIR_SATURATE_INT
+#define STB_IMAGE_RESIZE_STATIC
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#include "old_image_resize.h"  
+
+
+static int types[4] =    { STBIR_TYPE_UINT8, STBIR_TYPE_UINT8, STBIR_TYPE_UINT16, STBIR_TYPE_FLOAT };
+static int edges[4] =    { STBIR_EDGE_CLAMP, STBIR_EDGE_REFLECT, STBIR_EDGE_ZERO, STBIR_EDGE_WRAP };
+static int flts[5] =     { STBIR_FILTER_BOX, STBIR_FILTER_TRIANGLE, STBIR_FILTER_CUBICBSPLINE, STBIR_FILTER_CATMULLROM, STBIR_FILTER_MITCHELL };
+static int channels[20] = { 1, 2, 3, 4,      4,4,  2,2,  4,4, 2,2,  4,4, 2,2,  4,4, 2,2 }; 
+static int alphapos[20] = { -1, -1, -1, -1,  3,0,  1,0,   3,0,  1,0,   3,0,  1,0,   3,0,  1,0 }; 
+
+
+void oresize( void * o, int ox, int oy, int op, void * i, int ix, int iy, int ip, int buf, int type, int edg, int flt )
+{
+  int t = types[type];
+  int ic = channels[buf];
+  int alpha = alphapos[buf];
+  int e = edges[edg];
+  int f = flts[flt];
+  int space = ( type == 1 ) ? STBIR_COLORSPACE_SRGB : 0;
+  int flags = ( buf >= 16 ) ? STBIR_FLAG_ALPHA_PREMULTIPLIED : ( ( buf >= 12 ) ? STBIR_FLAG_ALPHA_OUT_PREMULTIPLIED : ( ( buf >= 8 ) ? (STBIR_FLAG_ALPHA_PREMULTIPLIED|STBIR_FLAG_ALPHA_OUT_PREMULTIPLIED) : 0 ) );
+  stbir_uint64 start;
+
+  ENTER( "Resize (old)" );
+  start = tmGetAccumulationStart( tm_mask );
+
+  if(!stbir_resize( i, ix, iy, ip, o, ox, oy, op, t, ic, alpha, flags, e, e, f, f, space, 0 ) )
+    stop();
+
+  #ifdef STBIR_PROFILE
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.setup, "Setup (old)" );
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.filters, "Filters (old)" );
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.looping, "Looping (old)" );
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.vertical, "Vertical (old)" );
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.horizontal, "Horizontal (old)" );
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.decode, "Scanline input (old)" );
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.encode, "Scanline output (old)" );
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.alpha, "Alpha weighting (old)" );
+  tmEmitAccumulationZone( 0, 0, (tm_uint64 *)&start, 0, oldprofile.named.unalpha, "Alpha unweighting (old)" );
+  #endif
+
+  LEAVE();
+}
diff --git a/stb_image_resize_test/stbirtest.c b/stb_image_resize_test/stbirtest.c
new file mode 100644
index 0000000..22e1b82
--- /dev/null
+++ b/stb_image_resize_test/stbirtest.c
@@ -0,0 +1,992 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+//#define HEAVYTM
+#include "tm.h"
+
+#ifdef RADUSETM3
+tm_api * g_tm_api;
+//#define PROFILE_MODE
+#endif
+
+#include <math.h>
+
+#ifdef _MSC_VER
+#define stop() __debugbreak()
+#include <windows.h>
+#define int64 __int64
+#define uint64 unsigned __int64
+#else
+#define stop() __builtin_trap()
+#define int64 long long
+#define uint64 unsigned long long
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(disable:4127)
+#endif
+
+//#define NOCOMP
+
+
+//#define PROFILE_NEW_ONLY
+//#define PROFILE_MODE
+
+
+#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ )
+
+#ifdef _MSC_VER
+
+  uint64 __rdtsc();
+  #define __cycles() __rdtsc()
+
+#else // non msvc
+
+  static inline uint64 __cycles() 
+  {
+    unsigned int lo, hi;
+    asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
+    return ( ( (uint64) hi ) << 32 ) | ( (uint64) lo );
+  }
+
+#endif  // msvc
+
+#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) 
+
+#ifdef _MSC_VER
+
+  #define __cycles() _ReadStatusReg(ARM64_CNTVCT)
+
+#else
+
+  static inline uint64 __cycles()
+  {
+    uint64 tsc;
+    asm volatile("mrs %0, cntvct_el0" : "=r" (tsc));
+    return tsc;
+  }
+
+#endif
+
+#else // x64, arm
+
+#error Unknown platform for timing.
+
+#endif  //x64 and   
+
+
+#ifdef PROFILE_MODE
+
+#define STBIR_ASSERT(cond)
+
+#endif
+
+#ifdef _DEBUG
+#undef STBIR_ASSERT
+#define STBIR_ASSERT(cond) { if (!(cond)) stop(); }
+#endif
+
+
+#define SHRINKBYW 2
+#define ZOOMBYW 2
+#define SHRINKBYH 2
+#define ZOOMBYH 2
+
+
+int mem_count = 0;
+
+#ifdef TEST_WITH_VALLOC
+
+#define STBIR__SEPARATE_ALLOCATIONS
+
+#if TEST_WITH_LIMIT_AT_FRONT
+
+  void * wmalloc(SIZE_T size)
+  {
+     static unsigned int pagesize=0;
+     void* p;
+     SIZE_T s;
+
+     // get the page size, if we haven't yet
+     if (pagesize==0)
+     {
+       SYSTEM_INFO si;
+       GetSystemInfo(&si);
+       pagesize=si.dwPageSize;
+     }
+
+     // we need room for the size, 8 bytes to hide the original pointer and a
+     //   validation dword, and enough data to completely fill one page
+     s=(size+(pagesize-1))&~(pagesize-1);
+
+     // allocate the size plus a page (for the guard)
+     p=VirtualAlloc(0,(SIZE_T)s,MEM_RESERVE|MEM_COMMIT,PAGE_READWRITE);
+     
+     return p;
+   }
+
+  void wfree(void * ptr)
+  {
+    if (ptr)
+    {
+      if ( ((ptrdiff_t)ptr) & 4095 ) stop();
+      if ( VirtualFree(ptr,0,MEM_RELEASE) == 0 ) stop();
+    }
+  }
+
+#else
+
+  void * wmalloc(SIZE_T size)
+  {
+     static unsigned int pagesize=0;
+     void* p;
+     SIZE_T s;
+
+     // get the page size, if we haven't yet
+     if (pagesize==0)
+     {
+       SYSTEM_INFO si;
+       GetSystemInfo(&si);
+       pagesize=si.dwPageSize;
+     }
+
+     // we need room for the size, 8 bytes to hide the original pointer and a
+     //   validation dword, and enough data to completely fill one page
+     s=(size+16+(pagesize-1))&~(pagesize-1);
+
+     // allocate the size plus a page (for the guard)
+     p=VirtualAlloc(0,(SIZE_T)(s+pagesize+pagesize),MEM_RESERVE|MEM_COMMIT,PAGE_READWRITE);
+
+     if (p)
+     {
+       DWORD oldprot;
+       void* orig=p;
+
+       // protect the first page
+       VirtualProtect(((char*)p),pagesize,PAGE_NOACCESS,&oldprot);
+
+       // protect the final page
+       VirtualProtect(((char*)p)+s+pagesize,pagesize,PAGE_NOACCESS,&oldprot);
+
+       // now move the returned pointer so that it bumps right up against the
+       //   the next (protected) page (this may result in unaligned return
+       //   addresses - pre-align the sizes if you always want aligned ptrs)
+//#define ERROR_ON_FRONT
+#ifdef ERROR_ON_FRONT
+       p=((char*)p)+pagesize+16;
+#else
+       p=((char*)p)+(s-size)+pagesize;
+#endif
+
+       // hide the validation value and the original pointer (which we'll
+       //   need used for freeing) right behind the returned pointer
+       ((unsigned int*)p)[-1]=0x98765432;
+       ((void**)p)[-2]=orig;
+       ++mem_count;
+//printf("aloc: %p bytes: %d\n",p,(int)size);
+       return(p);
+     }
+
+     return 0;
+  }
+
+  void wfree(void * ptr)
+  {
+    if (ptr)
+    {
+      int err=0;
+
+      // is this one of our allocations?
+      if (((((unsigned int*)ptr)[-1])!=0x98765432) || ((((void**)ptr)[-2])==0))
+      {
+        err=1;
+      }
+
+      if (err)
+      {
+        __debugbreak();
+      }
+      else
+      {
+
+        // back up to find the original pointer
+        void* p=((void**)ptr)[-2];
+
+        // clear the validation value and the original pointer
+        ((unsigned int*)ptr)[-1]=0;
+        ((void**)ptr)[-2]=0;
+
+//printf("free: %p\n",ptr);
+
+        --mem_count;
+
+        // now free the pages
+        if (p)
+          VirtualFree(p,0,MEM_RELEASE);
+
+      }
+    }
+  }
+
+#endif
+
+#define STBIR_MALLOC(size,user_data) ((void)(user_data), wmalloc(size))
+#define STBIR_FREE(ptr,user_data)    ((void)(user_data), wfree(ptr))
+
+#endif
+
+#define STBIR_PROFILE
+//#define STBIR_NO_SIMD
+//#define STBIR_AVX
+//#define STBIR_AVX2
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#include "stb_image_resize2.h"  // new one!
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "stb_image_write.h"
+
+int tsizes[5] =   { 1, 1, 2, 4, 2 };
+int ttypes[5] =   { STBIR_TYPE_UINT8, STBIR_TYPE_UINT8_SRGB, STBIR_TYPE_UINT16, STBIR_TYPE_FLOAT, STBIR_TYPE_HALF_FLOAT };
+
+int cedges[4] =   { STBIR_EDGE_CLAMP, STBIR_EDGE_REFLECT, STBIR_EDGE_ZERO, STBIR_EDGE_WRAP };
+int flts[5] =     { STBIR_FILTER_BOX, STBIR_FILTER_TRIANGLE, STBIR_FILTER_CUBICBSPLINE, STBIR_FILTER_CATMULLROM, STBIR_FILTER_MITCHELL };
+int buffers[20] = { STBIR_1CHANNEL, STBIR_2CHANNEL, STBIR_RGB, STBIR_4CHANNEL, 
+                    STBIR_BGRA, STBIR_ARGB, STBIR_RA, STBIR_AR,
+                    STBIR_RGBA_PM, STBIR_ARGB_PM, STBIR_RA_PM, STBIR_AR_PM,
+                    STBIR_RGBA, STBIR_ARGB, STBIR_RA, STBIR_AR,
+                    STBIR_RGBA_PM, STBIR_ARGB_PM, STBIR_RA_PM, STBIR_AR_PM,
+                  };
+int obuffers[20] = { STBIR_1CHANNEL, STBIR_2CHANNEL, STBIR_RGB, STBIR_4CHANNEL, 
+                     STBIR_BGRA, STBIR_ARGB, STBIR_RA, STBIR_AR,
+                     STBIR_RGBA_PM, STBIR_ARGB_PM, STBIR_RA_PM, STBIR_AR_PM,
+                     STBIR_RGBA_PM, STBIR_ARGB_PM, STBIR_RA_PM, STBIR_AR_PM,
+                     STBIR_RGBA, STBIR_ARGB, STBIR_RA, STBIR_AR,
+                   };
+
+int bchannels[20] = { 1, 2, 3, 4,  4,4, 2,2,  4,4, 2,2,  4,4, 2,2,  4,4, 2,2  }; 
+int alphapos[20] = { -1, -1, -1, -1,  3,0,  1,0,   3,0,  1,0,   3,0,  1,0,3,0,  1,0 }; 
+
+
+char const * buffstrs[20] = { "1ch", "2ch", "3ch", "4ch",  "RGBA", "ARGB", "RA", "AR",  "RGBA_both_pre", "ARGB_both_pre", "RA_both_pre", "AR_both_pre",  "RGBA_out_pre", "ARGB_out_pre", "RA_out_pre", "AR_out_pre",  "RGBA_in_pre", "ARGB_in_pre", "RA_in_pre", "AR_in_pre" };
+char const * typestrs[5] =  { "Bytes", "BytesSRGB", "Shorts", "Floats", "Half Floats"};
+char const * edgestrs[4] =  { "Clamp", "Reflect", "Zero", "Wrap" };
+char const * fltstrs[5] =   { "Box", "Triangle", "Cubic", "Catmullrom", "Mitchell" };
+
+#ifdef STBIR_PROFILE
+  static void do_acc_zones( STBIR_PROFILE_INFO * profile )
+  {
+    stbir_uint32 j;
+    stbir_uint64 start = tmGetAccumulationStart( tm_mask ); start=start;
+
+    for( j = 0 ; j < profile->count ; j++ )
+    {
+      if ( profile->clocks[j] )
+        tmEmitAccumulationZone( 0, 0, (tm_uint64*)&start, 0, profile->clocks[j], profile->descriptions[j] );
+    }
+  }
+#else
+  #define do_acc_zones(...)
+#endif
+
+int64 vert;
+
+//#define WINTHREADTEST
+#ifdef WINTHREADTEST
+
+static STBIR_RESIZE * thread_resize;
+static LONG which;
+static int threads_started = 0;
+static HANDLE threads[32];
+static HANDLE starts,stops;
+  
+static DWORD resize_shim( LPVOID p )
+{
+  for(;;)
+  {
+    LONG wh;
+
+    WaitForSingleObject( starts, INFINITE );
+
+    wh = InterlockedAdd( &which, 1 ) - 1;
+
+    ENTER( "Split %d", wh );
+    stbir_resize_split( thread_resize, wh, 1 );
+  #ifdef STBIR_PROFILE
+    { STBIR_PROFILE_INFO profile; stbir_resize_split_profile_info( &profile, thread_resize, wh, 1 ); do_acc_zones( &profile ); vert = profile.clocks[1]; }
+  #endif
+    LEAVE();
+
+    ReleaseSemaphore( stops, 1, 0 );
+  }
+}
+
+#endif
+
+void nresize( void * o, int ox, int oy, int op, void * i, int ix, int iy, int ip, int buf, int type, int edg, int flt )
+{
+  STBIR_RESIZE resize;
+ 
+  stbir_resize_init( &resize, i, ix, iy, ip, o, ox, oy, op, buffers[buf], ttypes[type] );
+  stbir_set_pixel_layouts( &resize, buffers[buf], obuffers[buf] );
+  stbir_set_edgemodes( &resize, cedges[edg], cedges[edg] );
+  stbir_set_filters( &resize, flts[flt], /*STBIR_FILTER_POINT_SAMPLE */ flts[flt] );
+  //stbir_set_input_subrect( &resize, 0.55f,0.333f,0.75f,0.50f);
+  //stbir_set_output_pixel_subrect( &resize, 00, 00, ox/2,oy/2);
+  //stbir_set_pixel_subrect(&resize, 1430,1361,30,30);
+  
+  ENTER( "Resize" );
+ 
+  #ifndef WINTHREADTEST
+
+  ENTER( "Filters" );
+  stbir_build_samplers_with_splits( &resize, 1 );
+  #ifdef STBIR_PROFILE
+    { STBIR_PROFILE_INFO profile; stbir_resize_build_profile_info( &profile, &resize ); do_acc_zones( &profile ); }
+  #endif
+  LEAVE();
+ 
+  ENTER( "Resize" );
+  if(!stbir_resize_extended( &resize ) )
+    stop();
+  #ifdef STBIR_PROFILE
+    { STBIR_PROFILE_INFO profile; stbir_resize_extended_profile_info( &profile, &resize ); do_acc_zones( &profile ); vert = profile.clocks[1]; }
+  #endif
+  LEAVE();
+
+  #else
+  {
+    int c, cnt;
+ 
+    ENTER( "Filters" );
+    cnt =  stbir_build_samplers_with_splits( &resize, 4 );
+    #ifdef STBIR_PROFILE
+      { STBIR_PROFILE_INFO profile; stbir_resize_build_profile_info( &profile, &resize ); do_acc_zones( &profile ); }
+    #endif
+    LEAVE();
+    
+    ENTER( "Thread start" );
+    if ( threads_started == 0 )
+    {
+      starts = CreateSemaphore( 0, 0, 32, 0 );
+      stops = CreateSemaphore( 0, 0, 32, 0 );
+    }
+    for( c = threads_started ; c < cnt ; c++ )
+      threads[ c ] = CreateThread( 0, 2048*1024, resize_shim, 0, 0, 0 );
+    
+    threads_started = cnt;
+    thread_resize = &resize;
+    which = 0;
+    LEAVE();
+
+    // starts the threads
+    ReleaseSemaphore( starts, cnt, 0 );
+
+    ENTER( "Wait" );
+    for( c = 0 ; c < cnt; c++ )
+      WaitForSingleObject( stops, INFINITE );
+    LEAVE();
+  }
+  #endif
+
+  ENTER( "Free" );
+  stbir_free_samplers( &resize );
+  LEAVE();
+  LEAVE();
+}
+
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+extern void oresize( void * o, int ox, int oy, int op, void * i, int ix, int iy, int ip, int buf, int type, int edg, int flt );
+
+
+
+#define TYPESTART 0
+#define TYPEEND   4
+
+#define LAYOUTSTART  0
+#define LAYOUTEND   19
+
+#define SIZEWSTART 0
+#define SIZEWEND   2
+
+#define SIZEHSTART 0
+#define SIZEHEND   2
+
+#define EDGESTART 0
+#define EDGEEND   3
+
+#define FILTERSTART 0
+#define FILTEREND   4
+
+#define HEIGHTSTART 0
+#define HEIGHTEND   2
+
+#define WIDTHSTART 0
+#define WIDTHEND   2
+
+
+
+
+static void * convert8to16( unsigned char * i, int w, int h, int c )
+{
+  unsigned short * ret;
+  int p;
+
+  ret = malloc( w*h*c*sizeof(short) );
+  for(p = 0 ; p < (w*h*c) ; p++ )
+  {
+    ret[p]=(short)((((int)i[p])<<8)+i[p]);
+  }
+
+  return ret;
+}
+
+static void * convert8tof( unsigned char * i, int w, int h, int c )
+{
+  float * ret;
+  int p;
+
+  ret = malloc( w*h*c*sizeof(float) );
+  for(p = 0 ; p < (w*h*c) ; p++ )
+  {
+    ret[p]=((float)i[p])*(1.0f/255.0f);
+  }
+
+  return ret;
+}
+
+static void * convert8tohf( unsigned char * i, int w, int h, int c )
+{
+  stbir__FP16 * ret;
+  int p;
+
+  ret = malloc( w*h*c*sizeof(stbir__FP16) );
+  for(p = 0 ; p < (w*h*c) ; p++ )
+  {
+    ret[p]=stbir__float_to_half(((float)i[p])*(1.0f/255.0f));
+  }
+
+  return ret;
+}
+
+static void * convert8tohff( unsigned char * i, int w, int h, int c )
+{
+  float * ret;
+  int p;
+
+  ret = malloc( w*h*c*sizeof(float) );
+  for(p = 0 ; p < (w*h*c) ; p++ )
+  {
+    ret[p]=stbir__half_to_float(stbir__float_to_half(((float)i[p])*(1.0f/255.0f)));
+  }
+
+  return ret;
+}
+
+static int isprime( int v )
+{
+  int i;
+
+  if ( v <= 3 )
+    return ( v > 1 );
+  if ( ( v & 1 ) == 0 )
+    return 0;
+  if ( ( v % 3 ) == 0 )
+    return 0;
+  i = 5;
+  while ( (i*i) <= v )
+  {
+    if ( ( v % i ) == 0 )
+      return 0;
+    if ( ( v % ( i + 2 ) ) == 0 )
+      return 0;
+    i += 6;
+  }
+
+  return 1;
+}
+
+static int getprime( int v )
+{
+  int i;
+  i = 0;
+  for(;;)
+  {
+    if ( i >= v )
+      return v;  // can't find any, just return orig
+    if (isprime(v - i))
+      return v - i;
+    if (isprime(v + i))
+      return v + i;
+    ++i;
+  }
+}
+
+
+int main( int argc, char ** argv )
+{
+  int ix, iy, ic;
+  unsigned char * input[6];
+  char * ir1;
+  char * ir2;
+  int szhs[3];
+  int szws[3];
+  int aw, ah, ac;
+  unsigned char * correctalpha;
+  int layouts, types, heights, widths, edges, filters;
+
+  if ( argc != 2 )
+  {
+    printf("command: stbirtest [imagefile]\n");
+    exit(1);
+  }
+
+  SetupTM( "127.0.0.1" );
+
+  correctalpha = stbi_load( "correctalpha.png", &aw, &ah, &ac, 0 );
+
+  input[0] = stbi_load( argv[1], &ix, &iy, &ic, 0 );
+  input[1] = input[0];
+  input[2] = convert8to16( input[0], ix, iy, ic );
+  input[3] = convert8tof( input[0], ix, iy, ic );
+  input[4] = convert8tohf( input[0], ix, iy, ic );
+  input[5] = convert8tohff( input[0], ix, iy, ic );
+
+  printf("Input %dx%d (%d channels)\n",ix,iy,ic);
+
+  ir1 = malloc( 4 * 4 * 3000 * 3000ULL );
+  ir2 = malloc( 4 * 4 * 3000 * 3000ULL );
+
+  szhs[0] = getprime( iy/SHRINKBYH );
+  szhs[1] = iy;
+  szhs[2] = getprime( iy*ZOOMBYH );
+
+  szws[0] = getprime( ix/SHRINKBYW );
+  szws[1] = ix;
+  szws[2] = getprime( ix*ZOOMBYW );
+
+  #if 1
+  for( types = TYPESTART ; types <= TYPEEND ; types++ )
+  #else
+  for( types = 1 ; types <= 1 ; types++ )
+  #endif
+  {
+    ENTER( "Test type: %s",typestrs[types]);
+    #if 1
+    for( layouts = LAYOUTSTART ; layouts <= LAYOUTEND ; layouts++ )
+    #else
+    for( layouts = 16; layouts <= 16 ; layouts++ )
+    #endif
+    {
+      ENTER( "Test layout: %s",buffstrs[layouts]);
+      
+      #if 0
+      for( heights = HEIGHTSTART ; heights <= HEIGHTEND ; heights++ )
+      {
+        int w, h = szhs[heights];
+      #else
+      for( heights = 0 ; heights <= 11 ; heights++ )
+      {
+        static int szhsz[12]={32, 200, 350, 400, 450, 509, 532, 624, 700, 824, 1023, 2053 };
+        int w, h = szhsz[heights];
+      #endif
+
+        ENTER( "Test height: %d %s %d",iy,(h<iy)?"Down":((h>iy)?"Up":"Same"),h);
+  
+        #if 0
+        for( widths = WIDTHSTART ; widths <= WIDTHEND ; widths++ )
+        {
+          w = szws[widths];
+        #else
+        for( widths = 0 ; widths <= 12 ; widths++ )
+        {
+          static int szwsz[13]={2, 32, 200, 350, 400, 450, 509, 532, 624, 700, 824, 1023, 2053 };
+          w = szwsz[widths];
+        #endif
+
+          ENTER( "Test width: %d %s %d",ix, (w<ix)?"Down":((w>ix)?"Up":"Same"), w);
+
+          #if 0
+          for( edges = EDGESTART ; edges <= EDGEEND ; edges++ )
+          #else
+          for( edges = 0 ; edges <= 0 ; edges++ )
+          #endif
+          {
+            ENTER( "Test edge: %s",edgestrs[edges]);
+            #if 0
+            for( filters = FILTERSTART ; filters <= FILTEREND ; filters++ )
+            #else
+            for( filters = 3 ; filters <= 3 ; filters++ )
+            #endif
+            {
+              int op, opw, np,npw, c, a;
+              #ifdef COMPARE_SAME                        
+              int oldtypes = types;
+              #else
+              int oldtypes = (types==4)?3:types;
+              #endif
+            
+              ENTER( "Test filter: %s",fltstrs[filters]);
+              {
+                c = bchannels[layouts];
+                a = alphapos[layouts];
+
+                op = w*tsizes[oldtypes]*c + 60;
+                opw = w*tsizes[oldtypes]*c;
+
+                np = w*tsizes[types]*c + 60;
+                npw = w*tsizes[types]*c;
+
+                printf( "%s:layout: %s  w: %d h: %d edge: %s filt: %s\n", typestrs[types],buffstrs[layouts], w, h, edgestrs[edges], fltstrs[filters] );
+
+
+                // clear pixel area to different, right edge to zero
+                #ifndef NOCLEAR
+                ENTER( "Test clear padding" );
+                {
+                  int d;
+                  for( d = 0 ; d < h ; d++ )
+                  {
+                    int oofs = d * op;
+                    int nofs = d * np;
+                    memset( ir1 + oofs, 192, opw );
+                    memset( ir1 + oofs+opw, 79, op-opw );
+                    memset( ir2 + nofs, 255, npw );
+                    memset( ir2 + nofs+npw, 79, np-npw );
+                  }
+                }
+                LEAVE();
+
+                #endif
+
+                #ifdef COMPARE_SAME                        
+                #define TIMINGS 1
+                #else 
+                #define TIMINGS 1
+                #endif
+                ENTER( "Test both" );
+                {
+                  #ifndef PROFILE_NEW_ONLY
+                    {
+                      int ttt, max = 0x7fffffff;
+                      ENTER( "Test old" );
+                      for( ttt = 0 ; ttt < TIMINGS ; ttt++ )
+                      {
+                        int64 m = __cycles();
+
+                        oresize( ir1, w, h, op, 
+                        #ifdef COMPARE_SAME                        
+                        input[types], 
+                        #else
+                        input[(types==4)?5:types], 
+                        #endif
+                        ix, iy, ix*ic*tsizes[oldtypes], layouts, oldtypes, edges, filters );
+
+                        m = __cycles() - m;
+                        if ( ( (int)m ) < max )
+                        max = (int) m;
+                      }
+                      LEAVE();
+                      printf("old: %d\n", max );
+                    }
+                  #endif
+     
+                  {
+                    int ttt, max = 0x7fffffff, maxv = 0x7fffffff;
+                    ENTER( "Test new" );
+                    for( ttt = 0 ; ttt < TIMINGS ; ttt++ )
+                    {
+                      int64 m = __cycles();
+
+                      nresize( ir2, w, h, np, input[types], ix, iy, ix*ic*tsizes[types], layouts, types, edges, filters );
+        
+                      m = __cycles() - m;
+                      if ( ( (int)m ) < max )
+                        max = (int) m;
+                      if ( ( (int)vert ) < maxv )
+                        maxv = (int) vert;
+                    }
+                    LEAVE(); // test new
+                    printf("new: %d (v: %d)\n", max, maxv );
+                  }
+                }
+                LEAVE();  // test both
+
+                if ( mem_count!= 0 )
+                  stop();
+
+              #ifndef NOCOMP
+                ENTER( "Test compare" );
+                {
+                  int x,y,ch;
+                  int nums = 0;
+                  for( y = 0 ; y < h ; y++ )
+                  {
+                    for( x = 0 ; x < w ; x++ )
+                    {
+                      switch(types)
+                      {
+                        case 0:
+                        case 1: //SRGB 
+                        {
+                          unsigned char * p1 = (unsigned char *)&ir1[y*op+x*c];
+                          unsigned char * p2 = (unsigned char *)&ir2[y*np+x*c];
+                          for( ch = 0 ; ch < c ; ch++ )
+                          {
+                            float pp1,pp2,d;
+                            float av = (a==-1)?1.0f:((float)p1[a]/255.0f);
+
+                            pp1 = p1[ch];
+                            pp2 = p2[ch];
+
+                            // compare in premult space
+                            #ifndef COMPARE_SAME                        
+                            if ( ( ( layouts >=4 ) && ( layouts <= 7 ) ) || ( ( layouts >=16 ) && ( layouts <= 19 ) ) )
+                            {
+                              pp1 *= av;
+                              pp2 *= av;
+                            }
+                            #endif
+
+                            d = pp1 - pp2;
+                            if ( d < 0 ) d = -d;
+
+                            #ifdef COMPARE_SAME                        
+                            if ( d > 0 ) 
+                            #else 
+                            if ( d > 1 )
+                            #endif
+                            {
+                              printf("Error at %d x %d (chan %d) (d: %g a: %g) [%d %d %d %d] [%d %d %d %d]\n",x,y,ch, d,av, p1[0],p1[1],p1[2],p1[3], p2[0],p2[1],p2[2],p2[3]);
+                              ++nums;
+                              if ( nums > 16 ) goto ex;
+                              //if (d) exit(1);
+                              //goto ex;
+                            }
+                          }  
+                        }
+                        break;
+
+                        case 2:
+                        {
+                          unsigned short * p1 = (unsigned short *)&ir1[y*op+x*c*sizeof(short)];
+                          unsigned short * p2 = (unsigned short *)&ir2[y*np+x*c*sizeof(short)];
+                          for( ch = 0 ; ch < c ; ch++ )
+                          {
+                            float thres,pp1,pp2,d;
+                            float av = (a==-1)?1.0f:((float)p1[a]/65535.0f);
+
+                            pp1 = p1[ch];
+                            pp2 = p2[ch];
+
+                            // compare in premult space
+                            #ifndef COMPARE_SAME                        
+                            if ( ( ( layouts >=4 ) && ( layouts <= 7 ) ) || ( ( layouts >= 16 ) && ( layouts <= 19 ) ) )
+                            {
+                              pp1 *= av;
+                              pp2 *= av;
+                            }
+                            #endif
+
+                            d = pp1 - pp2;
+                            if ( d < 0 ) d = -d;
+
+                            thres=((float)p1[ch]*0.007f)+2.0f;
+                            if (thres<4) thres = 4;
+  
+                            #ifdef COMPARE_SAME                        
+                            if ( d > 0 ) 
+                            #else 
+                            if ( d > thres)
+                            #endif
+                            {
+                              printf("Error at %d x %d (chan %d) %d %d [df: %g th: %g al: %g] (%d %d %d %d) (%d %d %d %d)\n",x,y,ch, p1[ch],p2[ch],d,thres,av,p1[0],p1[1],p1[2],p1[3],p2[0],p2[1],p2[2],p2[3]);
+                              ++nums;
+                              if ( nums > 16 ) goto ex;
+                              //if (d) exit(1);
+                              //goto ex;
+                            }
+                          }
+                        }
+                        break;
+
+                        case 3:
+                        {
+                          float * p1 = (float *)&ir1[y*op+x*c*sizeof(float)];
+                          float * p2 = (float *)&ir2[y*np+x*c*sizeof(float)];
+                          for( ch = 0 ; ch < c ; ch++ )
+                          {
+                            float pp1 = p1[ch], pp2 = p2[ch];
+                            float av = (a==-1)?1.0f:p1[a];
+                            float thres, d;
+
+                            // clamp
+                            if (pp1<=0.0f) pp1 = 0;
+                            if (pp2<=0.0f) pp2 = 0;
+                            if (av<=0.0f) av = 0;
+                            if (pp1>1.0f) pp1 = 1.0f;
+                            if (pp2>1.0f) pp2 = 1.0f;
+                            if (av>1.0f) av = 1.0f;
+
+                            // compare in premult space
+                            #ifndef COMPARE_SAME                        
+                            if ( ( ( layouts >=4 ) && ( layouts <= 7 ) ) || ( ( layouts >= 16 ) && ( layouts <= 19 ) ) )
+                            {
+                              pp1 *= av;
+                              pp2 *= av;
+                            }
+                            #endif
+
+                            d = pp1 - pp2;
+                            if ( d < 0 ) d = -d;
+
+                            thres=(p1[ch]*0.002f)+0.0002f;
+                            if ( thres < 0 ) thres = -thres;
+
+                            #ifdef COMPARE_SAME                        
+                            if ( d != 0.0f ) 
+                            #else 
+                            if ( d > thres )
+                            #endif
+                            {
+                              printf("Error at %d x %d (chan %d) %g %g [df: %g th: %g al: %g] (%g %g %g %g) (%g %g %g %g)\n",x,y,ch, p1[ch],p2[ch],d,thres,av,p1[0],p1[1],p1[2],p1[3],p2[0],p2[1],p2[2],p2[3]);
+                              ++nums;
+                              if ( nums > 16 ) goto ex;
+                              //if (d) exit(1);
+                              //goto ex;
+                            }
+                          }
+                        }
+                        break;
+
+                        case 4:
+                        {
+                          #ifdef COMPARE_SAME                        
+                          stbir__FP16 * p1 = (stbir__FP16 *)&ir1[y*op+x*c*sizeof(stbir__FP16)];
+                          #else
+                          float * p1 = (float *)&ir1[y*op+x*c*sizeof(float)];
+                          #endif
+                          stbir__FP16 * p2 = (stbir__FP16 *)&ir2[y*np+x*c*sizeof(stbir__FP16)];
+                          for( ch = 0 ; ch < c ; ch++ )
+                          {
+                            #ifdef COMPARE_SAME                        
+                            float pp1  = stbir__half_to_float(p1[ch]);
+                            float av = (a==-1)?1.0f:stbir__half_to_float(p1[a]);
+                            #else
+                            float pp1 = stbir__half_to_float(stbir__float_to_half(p1[ch]));
+                            float av = (a==-1)?1.0f:stbir__half_to_float(stbir__float_to_half(p1[a]));
+                            #endif
+                            float pp2 = stbir__half_to_float(p2[ch]);
+                            float d, thres;
+
+                            // clamp
+                            if (pp1<=0.0f) pp1 = 0;
+                            if (pp2<=0.0f) pp2 = 0;
+                            if (av<=0.0f) av = 0;
+                            if (pp1>1.0f) pp1 = 1.0f;
+                            if (pp2>1.0f) pp2 = 1.0f;
+                            if (av>1.0f) av = 1.0f;
+
+                            thres=(pp1*0.002f)+0.0002f;
+
+                            // compare in premult space
+                            #ifndef COMPARE_SAME                        
+                            if ( ( ( layouts >=4 ) && ( layouts <= 7 ) ) || ( ( layouts >= 16 ) && ( layouts <= 19 ) ) )
+                            {
+                              pp1 *= av;
+                              pp2 *= av;
+                            }
+                            #endif
+
+                            d = pp1 - pp2;
+                            if ( d < 0 ) d = -d;
+
+
+                            #ifdef COMPARE_SAME                        
+                            if ( d != 0.0f ) 
+                            #else 
+                            if ( d > thres )
+                            #endif
+                            {
+                              printf("Error at %d x %d (chan %d) %g %g [df: %g th: %g al: %g] (%g %g %g %g) (%g %g %g %g)\n",x,y,ch, 
+                              #ifdef COMPARE_SAME                        
+                              stbir__half_to_float(p1[ch]), 
+                              #else
+                              p1[ch],
+                              #endif
+                              stbir__half_to_float(p2[ch]), 
+                              d,thres,av,
+                              #ifdef COMPARE_SAME                        
+                              stbir__half_to_float(p1[0]),stbir__half_to_float(p1[1]),stbir__half_to_float(p1[2]),stbir__half_to_float(p1[3]),
+                              #else
+                              p1[0],p1[1],p1[2],p1[3],
+                              #endif
+                              stbir__half_to_float(p2[0]),stbir__half_to_float(p2[1]),stbir__half_to_float(p2[2]),stbir__half_to_float(p2[3]) );
+                              ++nums;
+                              if ( nums > 16 ) goto ex;
+                              //if (d) exit(1);
+                              //goto ex;
+                            }
+                          }
+                        }
+                        break;
+                      }
+                    }
+
+                    for( x = (w*c)*tsizes[oldtypes]; x < op; x++ )
+                    {
+                      if ( ir1[y*op+x] != 79 )
+                      {
+                        printf("Margin error at %d x %d %d (should be 79) OLD!\n",x,y,(unsigned char)ir1[y*op+x]);
+                        goto ex;
+                      }
+                    }
+
+                    for( x = (w*c)*tsizes[types]; x < np; x++ )
+                    {
+                      if ( ir2[y*np+x] != 79 )
+                      {
+                        printf("Margin error at %d x %d %d (should be 79) NEW\n",x,y,(unsigned char)ir2[y*np+x]);
+                        goto ex;
+                      }
+                    }
+                  }
+                
+                  ex:
+                  ENTER( "OUTPUT IMAGES" );
+                  printf("  tot pix: %d, errs: %d\n", w*h*c,nums );
+
+                  if (nums)
+                  {
+                    stbi_write_png("old.png", w, h, c, ir1, op);
+                    stbi_write_png("new.png", w, h, c, ir2, np);
+                    exit(1);
+                  }
+
+                  LEAVE(); // output images
+                }
+                LEAVE(); //test compare
+              #endif
+
+
+
+              }
+              LEAVE(); // test filter
+            }
+            LEAVE(); // test edge
+          }
+          LEAVE(); // test width
+        }
+        LEAVE(); // test height
+      }
+      LEAVE(); // test type
+    }
+    LEAVE();  // test layout
+  }
+
+  CloseTM();
+  return 0;
+}
diff --git a/stb_image_resize_test/vf_train.c b/stb_image_resize_test/vf_train.c
new file mode 100644
index 0000000..0fdbe27
--- /dev/null
+++ b/stb_image_resize_test/vf_train.c
@@ -0,0 +1,999 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define stop() __debugbreak()
+#include <windows.h>
+#define int64 __int64
+
+#pragma warning(disable:4127)
+
+#define STBIR__WEIGHT_TABLES  
+#define STBIR_PROFILE
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#include "stb_image_resize2.h"  
+
+static int * file_read( char const * filename )
+{
+  size_t s;
+  int * m;
+  FILE * f = fopen( filename, "rb" );
+  if ( f == 0 ) return 0;
+  
+  fseek( f, 0, SEEK_END);
+  s = ftell( f );
+  fseek( f, 0, SEEK_SET);
+  m = malloc( s + 4 );
+  m[0] = (int)s;
+  fread( m+1, 1, s, f);
+  fclose(f);
+
+  return( m );
+}
+
+typedef struct fileinfo
+{
+  int * timings;
+  int timing_count;
+  int dimensionx, dimensiony;
+  int numtypes;
+  int * types;
+  int * effective;
+  int cpu;
+  int simd;
+  int numinputrects;
+  int * inputrects;
+  int outputscalex, outputscaley;
+  int milliseconds;
+  int64 cycles;
+  double scale_time;
+  int bitmapx, bitmapy;
+  char const * filename;
+} fileinfo;
+
+int numfileinfo;
+fileinfo fi[256];
+unsigned char * bitmap;
+int bitmapw, bitmaph, bitmapp;
+
+static int use_timing_file( char const * filename, int index )
+{
+  int * base = file_read( filename );
+  int * file = base;
+
+  if ( base == 0 ) return 0;
+
+  ++file; // skip file image size;
+  if ( *file++ != 'VFT1' ) return 0;
+  fi[index].cpu = *file++;
+  fi[index].simd = *file++;
+  fi[index].dimensionx = *file++;
+  fi[index].dimensiony = *file++;
+  fi[index].numtypes = *file++;
+  fi[index].types = file; file += fi[index].numtypes;
+  fi[index].effective = file; file += fi[index].numtypes;
+  fi[index].numinputrects = *file++;
+  fi[index].inputrects = file; file += fi[index].numinputrects * 2;
+  fi[index].outputscalex = *file++;
+  fi[index].outputscaley = *file++;
+  fi[index].milliseconds = *file++;
+  fi[index].cycles = ((int64*)file)[0]; file += 2;
+  fi[index].filename = filename;
+
+  fi[index].timings = file;
+  fi[index].timing_count = (int) ( ( base[0] - ( ((char*)file - (char*)base - sizeof(int) ) ) ) / (sizeof(int)*2) );
+
+  fi[index].scale_time = (double)fi[index].milliseconds / (double)fi[index].cycles;
+
+  return 1;
+}
+
+static int vert_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int ox, int oy, int ix, int iy, int filter, STBIR__V_FIRST_INFO * v_info )
+{
+  float h_scale=(float)ox/(float)(ix);
+  float v_scale=(float)oy/(float)(iy);
+  stbir__support_callback * support = stbir__builtin_supports[filter];
+  int vertical_filter_width = stbir__get_filter_pixel_width(support,v_scale,0);
+  int vertical_gather = ( v_scale >= ( 1.0f - stbir__small_float ) ) || ( vertical_filter_width <= STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT );
+
+  return stbir__should_do_vertical_first( weights_table, stbir__get_filter_pixel_width(support,h_scale,0), h_scale, ox, vertical_filter_width, v_scale, oy, vertical_gather, v_info );
+} 
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#include "stb_image_write.h"
+
+static void alloc_bitmap()
+{
+  int findex;
+  int x = 0, y = 0;
+  int w = 0, h = 0;
+
+  for( findex = 0 ; findex < numfileinfo ; findex++ )
+  {
+    int nx, ny;
+    int thisw, thish;
+
+    thisw = ( fi[findex].dimensionx * fi[findex].numtypes ) + ( fi[findex].numtypes - 1 );
+    thish = ( fi[findex].dimensiony * fi[findex].numinputrects ) + ( fi[findex].numinputrects - 1 );
+
+    for(;;)
+    {
+      nx = x + ((x)?4:0) + thisw;
+      ny = y + ((y)?4:0) + thish;
+      if ( ( nx <= 3600 ) || ( x == 0 ) )
+      { 
+        fi[findex].bitmapx = x + ((x)?4:0);
+        fi[findex].bitmapy = y + ((y)?4:0);
+        x = nx;
+        if ( x > w ) w = x;
+        if ( ny > h ) h = ny;
+        break;
+      }
+      else
+      {
+        x = 0;
+        y = h;
+      }
+    }
+  }
+
+  w = (w+3) & ~3;
+  bitmapw = w;
+  bitmaph = h;
+  bitmapp = w * 3; // RGB
+  bitmap = malloc( bitmapp * bitmaph );
+
+  memset( bitmap, 0, bitmapp * bitmaph );
+}
+
+static void build_bitmap( float weights[STBIR_RESIZE_CLASSIFICATIONS][4], int do_channel_count_index, int findex )
+{
+  static int colors[STBIR_RESIZE_CLASSIFICATIONS];
+  STBIR__V_FIRST_INFO v_info = {0};
+
+  int * ts;
+  int ir;
+  unsigned char * bitm = bitmap + ( fi[findex].bitmapx*3 ) + ( fi[findex].bitmapy*bitmapp) ;
+
+  for( ir = 0; ir < STBIR_RESIZE_CLASSIFICATIONS ; ir++ ) colors[ ir ] = 127*ir/STBIR_RESIZE_CLASSIFICATIONS+128;
+
+  ts = fi[findex].timings;
+
+  for( ir = 0 ; ir < fi[findex].numinputrects ; ir++ )
+  {
+    int ix, iy, chanind;
+    ix = fi[findex].inputrects[ir*2];
+    iy = fi[findex].inputrects[ir*2+1];
+
+    for( chanind = 0 ; chanind < fi[findex].numtypes ; chanind++ )
+    {
+      int ofs, h, hh;
+
+      // just do the type that we're on
+      if ( chanind != do_channel_count_index )
+      {
+        ts += 2 * fi[findex].dimensionx * fi[findex].dimensiony;
+        continue;
+      }
+
+      // bitmap offset
+      ofs=chanind*(fi[findex].dimensionx+1)*3+ir*(fi[findex].dimensiony+1)*bitmapp;
+
+      h = 1;
+      for( hh = 0 ; hh < fi[findex].dimensiony; hh++ )
+      {
+        int ww, w = 1;
+        for( ww = 0 ; ww < fi[findex].dimensionx; ww++ )
+        {
+          int good, v_first, VF, HF;
+
+          VF = ts[0];
+          HF = ts[1];
+
+          v_first = vert_first( weights, w, h, ix, iy, STBIR_FILTER_MITCHELL, &v_info );
+
+          good = ( ((HF<=VF) && (!v_first)) || ((VF<=HF) && (v_first)));
+
+          if ( good )
+          {
+            bitm[ofs+2] = 0;
+            bitm[ofs+1] = (unsigned char)colors[v_info.v_resize_classification];
+          }
+          else
+          {
+            double r;
+
+            if ( HF < VF )
+              r = (double)(VF-HF)/(double)HF;
+            else
+              r = (double)(HF-VF)/(double)VF;
+            
+            if ( r > 0.4f) r = 0.4;
+            r *= 1.0f/0.4f;   
+
+            bitm[ofs+2] = (char)(255.0f*r);
+            bitm[ofs+1] = (char)(((float)colors[v_info.v_resize_classification])*(1.0f-r));
+          }
+          bitm[ofs] = 0;
+
+          ofs += 3;
+          ts += 2;
+          w += fi[findex].outputscalex;
+        }
+        ofs += bitmapp - fi[findex].dimensionx*3;
+        h += fi[findex].outputscaley;
+      }
+    }
+  }
+}
+
+static void build_comp_bitmap( float weights[STBIR_RESIZE_CLASSIFICATIONS][4], int do_channel_count_index )
+{
+  int * ts0;
+  int * ts1;
+  int ir;
+  unsigned char * bitm = bitmap + ( fi[0].bitmapx*3 ) + ( fi[0].bitmapy*bitmapp) ;
+
+  ts0 = fi[0].timings;
+  ts1 = fi[1].timings;
+
+  for( ir = 0 ; ir < fi[0].numinputrects ; ir++ )
+  {
+    int ix, iy, chanind;
+    ix = fi[0].inputrects[ir*2];
+    iy = fi[0].inputrects[ir*2+1];
+
+    for( chanind = 0 ; chanind < fi[0].numtypes ; chanind++ )
+    {
+      int ofs, h, hh;
+
+      // just do the type that we're on
+      if ( chanind != do_channel_count_index )
+      {
+        ts0 += 2 * fi[0].dimensionx * fi[0].dimensiony;
+        ts1 += 2 * fi[0].dimensionx * fi[0].dimensiony;
+        continue;
+      }
+
+      // bitmap offset
+      ofs=chanind*(fi[0].dimensionx+1)*3+ir*(fi[0].dimensiony+1)*bitmapp;
+
+      h = 1;
+      for( hh = 0 ; hh < fi[0].dimensiony; hh++ )
+      {
+        int ww, w = 1;
+        for( ww = 0 ; ww < fi[0].dimensionx; ww++ )
+        {
+          int v_first, time0, time1;
+
+          v_first = vert_first( weights, w, h, ix, iy, STBIR_FILTER_MITCHELL, 0 );
+
+          time0 = ( v_first ) ? ts0[0] : ts0[1];
+          time1 = ( v_first ) ? ts1[0] : ts1[1];
+
+          if ( time0 < time1 )
+          {
+            double r = (double)(time1-time0)/(double)time0;
+            if ( r > 0.4f) r = 0.4;
+            r *= 1.0f/0.4f;   
+            bitm[ofs+2] = 0;
+            bitm[ofs+1] = (char)(255.0f*r);
+            bitm[ofs] = (char)(64.0f*(1.0f-r));
+          }
+          else
+          {
+            double r = (double)(time0-time1)/(double)time1;
+            if ( r > 0.4f) r = 0.4;
+            r *= 1.0f/0.4f;   
+            bitm[ofs+2] = (char)(255.0f*r);
+            bitm[ofs+1] = 0;
+            bitm[ofs] = (char)(64.0f*(1.0f-r));
+          }
+          ofs += 3;
+          ts0 += 2;
+          ts1 += 2;
+          w += fi[0].outputscalex;
+        }
+        ofs += bitmapp - fi[0].dimensionx*3;
+        h += fi[0].outputscaley;
+      }
+    }
+  }
+}
+
+static void write_bitmap()
+{
+  stbi_write_png( "results.png", bitmapp / 3, bitmaph, 3|STB_IMAGE_BGR, bitmap, bitmapp );
+}
+
+
+static void calc_errors( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int * curtot, double * curerr, int do_channel_count_index )
+{
+  int th, findex;
+  STBIR__V_FIRST_INFO v_info = {0};
+
+  for(th=0;th<STBIR_RESIZE_CLASSIFICATIONS;th++)
+  {
+    curerr[th]=0;
+    curtot[th]=0;
+  }
+
+  for( findex = 0 ; findex < numfileinfo ; findex++ )
+  {
+    int * ts;
+    int ir;
+    ts = fi[findex].timings;
+
+    for( ir = 0 ; ir < fi[findex].numinputrects ; ir++ )
+    {
+      int ix, iy, chanind;
+      ix = fi[findex].inputrects[ir*2];
+      iy = fi[findex].inputrects[ir*2+1];
+
+      for( chanind = 0 ; chanind < fi[findex].numtypes ; chanind++ )
+      {
+        int h, hh;
+
+        // just do the type that we're on
+        if ( chanind != do_channel_count_index )
+        {
+          ts += 2 * fi[findex].dimensionx * fi[findex].dimensiony;
+          continue;
+        }
+
+        h = 1;
+        for( hh = 0 ; hh < fi[findex].dimensiony; hh++ )
+        {
+          int ww, w = 1;
+          for( ww = 0 ; ww < fi[findex].dimensionx; ww++ )
+          {
+            int good, v_first, VF, HF;
+
+            VF = ts[0];
+            HF = ts[1];
+
+            v_first = vert_first( weights_table, w, h, ix, iy, STBIR_FILTER_MITCHELL, &v_info );
+
+            good = ( ((HF<=VF) && (!v_first)) || ((VF<=HF) && (v_first)));
+
+            if ( !good )
+            {
+              double diff;
+              if ( VF < HF )
+                diff = ((double)HF-(double)VF) * fi[findex].scale_time;
+              else
+                diff = ((double)VF-(double)HF) * fi[findex].scale_time;
+
+              curtot[v_info.v_resize_classification] += 1;
+              curerr[v_info.v_resize_classification] += diff;
+            }
+
+            ts += 2;
+            w += fi[findex].outputscalex;
+          }
+          h += fi[findex].outputscaley;
+        }
+      }
+    }
+  }
+}
+
+#define TRIESPERWEIGHT 32
+#define MAXRANGE ((TRIESPERWEIGHT+1) * (TRIESPERWEIGHT+1) * (TRIESPERWEIGHT+1) * (TRIESPERWEIGHT+1) - 1)
+
+static void expand_to_floats( float * weights, int range )
+{
+  weights[0] = (float)( range % (TRIESPERWEIGHT+1) ) / (float)TRIESPERWEIGHT;
+  weights[1] = (float)( range/(TRIESPERWEIGHT+1) % (TRIESPERWEIGHT+1) ) / (float)TRIESPERWEIGHT;
+  weights[2] = (float)( range/(TRIESPERWEIGHT+1)/(TRIESPERWEIGHT+1) % (TRIESPERWEIGHT+1) ) / (float)TRIESPERWEIGHT;
+  weights[3] = (float)( range/(TRIESPERWEIGHT+1)/(TRIESPERWEIGHT+1)/(TRIESPERWEIGHT+1) % (TRIESPERWEIGHT+1) ) / (float)TRIESPERWEIGHT;
+}
+
+static char const * expand_to_string( int range )
+{
+  static char str[128];
+  int w0,w1,w2,w3;
+  w0 = range % (TRIESPERWEIGHT+1);
+  w1 = range/(TRIESPERWEIGHT+1) % (TRIESPERWEIGHT+1);
+  w2 = range/(TRIESPERWEIGHT+1)/(TRIESPERWEIGHT+1) % (TRIESPERWEIGHT+1);
+  w3 = range/(TRIESPERWEIGHT+1)/(TRIESPERWEIGHT+1)/(TRIESPERWEIGHT+1) % (TRIESPERWEIGHT+1);
+  sprintf( str, "[ %2d/%d %2d/%d %2d/%d %2d/%d ]",w0,TRIESPERWEIGHT,w1,TRIESPERWEIGHT,w2,TRIESPERWEIGHT,w3,TRIESPERWEIGHT );
+  return str;
+}
+
+static void print_weights( float weights[STBIR_RESIZE_CLASSIFICATIONS][4], int channel_count_index, int * tots, double * errs )
+{
+  int th;
+  printf("ChInd: %d  Weights:\n",channel_count_index);
+  for(th=0;th<STBIR_RESIZE_CLASSIFICATIONS;th++)
+  {
+    float * w = weights[th];
+    printf("  %d: [%1.5f %1.5f %1.5f %1.5f] (%d %.4f)\n",th, w[0], w[1], w[2], w[3], tots[th], errs[th] );
+  }
+  printf("\n");
+}
+
+static int windowranges[ 16 ];
+static int windowstatus = 0;
+static DWORD trainstart = 0;
+
+static void opt_channel( float best_output_weights[STBIR_RESIZE_CLASSIFICATIONS][4], int channel_count_index )
+{
+  int newbest = 0;
+  float weights[STBIR_RESIZE_CLASSIFICATIONS][4] = {0};
+  double besterr[STBIR_RESIZE_CLASSIFICATIONS];
+  int besttot[STBIR_RESIZE_CLASSIFICATIONS];
+  int best[STBIR_RESIZE_CLASSIFICATIONS]={0};
+
+  double curerr[STBIR_RESIZE_CLASSIFICATIONS];
+  int curtot[STBIR_RESIZE_CLASSIFICATIONS];
+  int th, range;
+  DWORD lasttick = 0;
+
+  for(th=0;th<STBIR_RESIZE_CLASSIFICATIONS;th++) 
+  {
+    besterr[th]=1000000000000.0;
+    besttot[th]=0x7fffffff;
+  }
+
+  newbest = 0;
+
+  // try the whole range  
+  range = MAXRANGE;
+  do
+  {
+    for(th=0;th<STBIR_RESIZE_CLASSIFICATIONS;th++)
+      expand_to_floats( weights[th], range );
+
+    calc_errors( weights, curtot, curerr, channel_count_index );
+
+    for(th=0;th<STBIR_RESIZE_CLASSIFICATIONS;th++)
+    {
+      if ( curerr[th] < besterr[th] )
+      {
+        besterr[th] = curerr[th];
+        besttot[th] = curtot[th];
+        best[th] = range;
+        expand_to_floats( best_output_weights[th], best[th] );
+        newbest = 1;
+      }
+    }
+
+    {
+      DWORD t = GetTickCount();
+      if ( range == 0 )
+        goto do_bitmap;
+
+      if ( newbest )
+      {
+        if ( ( GetTickCount() - lasttick ) > 200 )
+        {
+          int findex;
+        
+         do_bitmap:
+          lasttick = t;
+          newbest = 0;
+
+          for( findex = 0 ; findex < numfileinfo ; findex++ )
+            build_bitmap( best_output_weights, channel_count_index, findex );
+
+          lasttick = GetTickCount();
+        }
+      }
+    }
+   
+    windowranges[ channel_count_index ] = range;
+
+    // advance all the weights and loop
+    --range;
+  } while( ( range >= 0 ) && ( !windowstatus ) );
+
+  // if we hit here, then we tried all weights for this opt, so save them 
+}
+
+static void print_struct( float weight[5][STBIR_RESIZE_CLASSIFICATIONS][4], char const * name )
+{
+  printf("\n\nstatic float %s[5][STBIR_RESIZE_CLASSIFICATIONS][4]=\n{", name );
+  {
+    int i;
+    for(i=0;i<5;i++) 
+    { 
+      int th;
+      for(th=0;th<STBIR_RESIZE_CLASSIFICATIONS;th++)
+      {
+        int j;
+        printf("\n  "); 
+        for(j=0;j<4;j++) 
+          printf("%1.5ff, ", weight[i][th][j] ); 
+      }
+      printf("\n");
+    }
+    printf("\n};\n");
+  }
+}
+
+static float retrain_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4];
+
+static DWORD __stdcall retrain_shim( LPVOID p )
+{
+  int chanind = (int) (size_t)p;
+  opt_channel( retrain_weights[chanind], chanind );
+  return 0;
+}
+
+static char const * gettime( int ms )
+{
+  static char time[32];
+  if (ms > 60000)
+    sprintf( time, "%dm %ds",ms/60000, (ms/1000)%60 );
+  else  
+    sprintf( time, "%ds",ms/1000 );
+  return time;
+}
+
+static BITMAPINFOHEADER bmiHeader;
+static DWORD extrawindoww, extrawindowh;
+static HINSTANCE instance;
+static int curzoom = 1;
+
+static LRESULT WINAPI WindowProc( HWND   window,
+                                  UINT   message,
+                                  WPARAM wparam,
+                                  LPARAM lparam )
+{
+  switch( message )
+  {
+    case WM_CHAR:
+      if ( wparam != 27 )
+        break;
+      // falls through
+
+    case WM_CLOSE:
+    {
+      int i;
+      int max = 0;
+      
+      for( i = 0 ; i < fi[0].numtypes ; i++ )
+        if( windowranges[i] > max ) max = windowranges[i];
+   
+      if ( ( max == 0 ) || ( MessageBox( window, "Cancel before training is finished?", "Vertical First Training", MB_OKCANCEL|MB_ICONSTOP ) == IDOK ) )
+      {
+        for( i = 0 ; i < fi[0].numtypes ; i++ )
+          if( windowranges[i] > max ) max = windowranges[i];
+        if ( max )
+          windowstatus = 1;
+        DestroyWindow( window );
+      }
+    }
+    return 0;
+       
+    case WM_PAINT:
+    {
+      PAINTSTRUCT ps;
+      HDC dc;
+
+      dc = BeginPaint( window, &ps );
+      StretchDIBits( dc, 
+        0, 0, bitmapw*curzoom, bitmaph*curzoom,
+        0, 0, bitmapw, bitmaph,
+        bitmap, (BITMAPINFO*)&bmiHeader, DIB_RGB_COLORS, SRCCOPY );
+
+      PatBlt( dc, bitmapw*curzoom, 0, 4096, 4096, WHITENESS );
+      PatBlt( dc, 0, bitmaph*curzoom, 4096, 4096, WHITENESS );
+
+      SetTextColor( dc, RGB(0,0,0)  );
+      SetBkColor( dc, RGB(255,255,255) );
+      SetBkMode( dc, OPAQUE );
+
+      {
+        int i, l = 0, max = 0;
+        char buf[1024];
+        RECT rc;
+        POINT p;
+
+        for( i = 0 ; i < fi[0].numtypes ; i++ )
+        {
+          l += sprintf( buf + l, "channels: %d %s\n", fi[0].effective[i], windowranges[i] ? expand_to_string( windowranges[i] ) : "Done." );
+          if ( windowranges[i] > max ) max = windowranges[i];
+        }
+
+        rc.left = 32; rc.top = bitmaph*curzoom+10;
+        rc.right = 512; rc.bottom = rc.top + 512;
+        DrawText( dc, buf, -1, &rc, DT_TOP );
+
+        l = 0;
+        if ( max == 0 )
+        {
+          static DWORD traindone = 0;
+          if ( traindone == 0 ) traindone = GetTickCount();
+          l = sprintf( buf, "Finished in %s.", gettime( traindone - trainstart ) );
+        }
+        else if ( max != MAXRANGE )
+          l = sprintf( buf, "Done in %s...", gettime( (int) ( ( ( (int64)max * ( (int64)GetTickCount() - (int64)trainstart ) ) ) / (int64) ( MAXRANGE - max ) ) ) );
+
+        GetCursorPos( &p );
+        ScreenToClient( window, &p );
+
+        if ( ( p.x >= 0 ) && ( p.y >= 0 ) && ( p.x < (bitmapw*curzoom) ) && ( p.y < (bitmaph*curzoom) ) )
+        {
+          int findex;
+          int x, y, w, h, sx, sy, ix, iy, ox, oy;
+          int ir, chanind;
+          int * ts;
+          char badstr[64];
+          STBIR__V_FIRST_INFO v_info={0};
+
+          p.x /= curzoom;
+          p.y /= curzoom;
+
+          for( findex = 0 ; findex < numfileinfo ; findex++ )
+          {
+            x = fi[findex].bitmapx;
+            y = fi[findex].bitmapy;
+            w = x + ( fi[findex].dimensionx + 1 ) * fi[findex].numtypes;
+            h = y + ( fi[findex].dimensiony + 1 ) * fi[findex].numinputrects;
+
+            if ( ( p.x >= x ) && ( p.y >= y ) && ( p.x < w ) && ( p.y < h ) )
+              goto found;
+          }
+          goto nope;
+         
+         found:
+            
+          ir = ( p.y - y ) / ( fi[findex].dimensiony + 1 );
+          sy = ( p.y - y ) % ( fi[findex].dimensiony + 1 );
+          if ( sy >= fi[findex].dimensiony ) goto nope;
+
+          chanind = ( p.x - x ) / ( fi[findex].dimensionx + 1 );
+          sx = ( p.x - x ) % ( fi[findex].dimensionx + 1 );
+          if ( sx >= fi[findex].dimensionx ) goto nope;
+
+          ix = fi[findex].inputrects[ir*2];
+          iy = fi[findex].inputrects[ir*2+1];
+
+          ts = fi[findex].timings + ( ( fi[findex].dimensionx * fi[findex].dimensiony * fi[findex].numtypes * ir ) + ( fi[findex].dimensionx * fi[findex].dimensiony * chanind ) + ( fi[findex].dimensionx * sy ) + sx ) * 2;
+
+          ox = 1+fi[findex].outputscalex*sx;
+          oy = 1+fi[findex].outputscaley*sy;
+
+          if ( windowstatus != 2 )
+          {
+            int VF, HF, v_first, good;
+            VF = ts[0];
+            HF = ts[1];
+
+            v_first = vert_first( retrain_weights[chanind], ox, oy, ix, iy, STBIR_FILTER_MITCHELL, &v_info );
+
+            good = ( ((HF<=VF) && (!v_first)) || ((VF<=HF) && (v_first)));
+
+            if ( good )
+              badstr[0] = 0;
+            else
+            {
+              double r;
+
+              if ( HF < VF )
+                r = (double)(VF-HF)/(double)HF;
+              else
+                r = (double)(HF-VF)/(double)VF;
+              sprintf( badstr, " %.1f%% off", r*100 );
+            }
+            sprintf( buf + l, "\n\n%s\nCh: %d Resize: %dx%d to %dx%d\nV: %d H: %d  Order: %c (%s%s)\nClass: %d Scale: %.2f %s", fi[findex].filename,fi[findex].effective[chanind], ix,iy,ox,oy, VF, HF, v_first?'V':'H', good?"Good":"Wrong", badstr, v_info.v_resize_classification, (double)oy/(double)iy, v_info.is_gather ? "Gather" : "Scatter" );
+          }
+          else
+          {
+            int v_first, time0, time1;
+            float (* weights)[4] = stbir__compute_weights[chanind];
+            int * ts1;
+            char b0[32], b1[32];
+
+            ts1 = fi[1].timings + ( ts - fi[0].timings );
+
+            v_first = vert_first( weights, ox, oy, ix, iy, STBIR_FILTER_MITCHELL, &v_info );
+
+            time0 = ( v_first ) ? ts[0] : ts[1];
+            time1 = ( v_first ) ? ts1[0] : ts1[1];
+            
+            b0[0] = b1[0] = 0;
+            if ( time0 < time1 )
+              sprintf( b0," (%.f%% better)", ((double)time1-(double)time0)*100.0f/(double)time0);
+            else
+              sprintf( b1," (%.f%% better)", ((double)time0-(double)time1)*100.0f/(double)time1);
+
+            sprintf( buf + l, "\n\n0: %s\n1: %s\nCh: %d Resize: %dx%d to %dx%d\nClass: %d Scale: %.2f %s\nTime0: %d%s\nTime1: %d%s", fi[0].filename, fi[1].filename, fi[0].effective[chanind], ix,iy,ox,oy, v_info.v_resize_classification, (double)oy/(double)iy, v_info.is_gather ? "Gather" : "Scatter", time0, b0, time1, b1 );
+          }
+        }
+       nope:
+
+        rc.left = 32+320; rc.right = 512+320; 
+        SetTextColor( dc, RGB(0,0,128) );
+        DrawText( dc, buf, -1, &rc, DT_TOP );
+
+      }
+      EndPaint( window, &ps );
+      return 0;
+    }
+
+    case WM_TIMER:
+      InvalidateRect( window, 0, 0 );
+      return 0;
+
+    case WM_DESTROY:
+      PostQuitMessage( 0 );
+      return 0;
+  }
+   
+
+  return DefWindowProc( window, message, wparam, lparam );
+}
+
+static void SetHighDPI(void)
+{
+  typedef HRESULT WINAPI setdpitype(int v);
+  HMODULE h=LoadLibrary("Shcore.dll");
+  if (h)
+  {
+    setdpitype * sd = (setdpitype*)GetProcAddress(h,"SetProcessDpiAwareness");
+    if (sd )
+      sd(1);
+  }
+} 
+
+static void draw_window()
+{
+  WNDCLASS wc;
+  HWND w;
+  MSG msg;
+  
+  instance = GetModuleHandle(NULL);
+
+  wc.style = 0;
+  wc.lpfnWndProc = WindowProc;
+  wc.cbClsExtra = 0;
+  wc.cbWndExtra = 0;
+  wc.hInstance = instance;
+  wc.hIcon = 0;
+  wc.hCursor = LoadCursor(NULL, IDC_ARROW);
+  wc.hbrBackground = 0;
+  wc.lpszMenuName = 0;
+  wc.lpszClassName = "WHTrain";
+
+  if ( !RegisterClass( &wc ) )
+    exit(1);
+
+  SetHighDPI();
+
+  bmiHeader.biSize          =  sizeof(BITMAPINFOHEADER);
+  bmiHeader.biWidth         =  bitmapp/3;
+  bmiHeader.biHeight        =  -bitmaph;
+  bmiHeader.biPlanes        =  1;
+  bmiHeader.biBitCount      =  24;
+  bmiHeader.biCompression   =  BI_RGB;
+
+  w = CreateWindow( "WHTrain",
+                    "Vertical First Training",
+                    WS_CAPTION | WS_POPUP| WS_CLIPCHILDREN |
+                    WS_SYSMENU | WS_MINIMIZEBOX | WS_SIZEBOX,
+                    CW_USEDEFAULT,CW_USEDEFAULT,
+                    CW_USEDEFAULT,CW_USEDEFAULT,
+                    0, 0, instance, 0 );
+
+  {
+    RECT r, c;
+    GetWindowRect( w, &r );
+    GetClientRect( w, &c );
+    extrawindoww = ( r.right - r.left ) - ( c.right - c.left );
+    extrawindowh = ( r.bottom - r.top ) - ( c.bottom - c.top );
+    SetWindowPos( w, 0, 0, 0, bitmapw * curzoom + extrawindoww, bitmaph * curzoom + extrawindowh + 164, SWP_NOMOVE );
+  }
+
+  ShowWindow( w, SW_SHOWNORMAL );
+  SetTimer( w, 1, 250, 0 );
+
+  {
+    BOOL ret;
+    while( ( ret = GetMessage( &msg, w, 0, 0 ) ) != 0 )
+    { 
+      if ( ret == -1 )
+        break;
+      TranslateMessage( &msg ); 
+      DispatchMessage( &msg ); 
+    }
+  }
+}
+
+static void retrain()
+{
+  HANDLE threads[ 16 ];
+  int chanind;
+
+  trainstart = GetTickCount();
+  for( chanind = 0 ; chanind < fi[0].numtypes ; chanind++ )
+    threads[ chanind ] = CreateThread( 0, 2048*1024, retrain_shim, (LPVOID)(size_t)chanind, 0, 0 );
+
+  draw_window();
+
+  for( chanind = 0 ; chanind < fi[0].numtypes ; chanind++ )
+  {
+    WaitForSingleObject( threads[ chanind ], INFINITE );
+    CloseHandle( threads[ chanind ] );
+  }
+
+  write_bitmap();
+
+  print_struct( retrain_weights, "retained_weights" );
+  if ( windowstatus ) printf( "CANCELLED!\n" );
+}
+
+static void info()
+{
+  int findex;
+
+  // display info about each input file
+  for( findex = 0 ; findex < numfileinfo ; findex++ )
+  {
+    int i, h,m,s;
+    if ( findex ) printf( "\n" );
+    printf( "Timing file: %s\n", fi[findex].filename );
+    printf( "CPU type: %d  %s\n", fi[findex].cpu, fi[findex].simd?(fi[findex].simd==2?"SIMD8":"SIMD4"):"Scalar" );
+    h = fi[findex].milliseconds/3600000;
+    m = (fi[findex].milliseconds-h*3600000)/60000;
+    s = (fi[findex].milliseconds-h*3600000-m*60000)/1000;
+    printf( "Total time in test: %dh %dm %ds  Cycles/sec: %.f\n", h,m,s, 1000.0/fi[findex].scale_time );
+    printf( "Each tile of samples is %dx%d, and is scaled by %dx%d.\n", fi[findex].dimensionx,fi[findex].dimensiony, fi[findex].outputscalex,fi[findex].outputscaley );
+    printf( "So the x coords are: " );
+    for( i=0; i < fi[findex].dimensionx ; i++ ) printf( "%d ",1+i*fi[findex].outputscalex );
+    printf( "\n" );
+    printf( "And the y coords are: " );
+    for( i=0; i < fi[findex].dimensiony ; i++ ) printf( "%d ",1+i*fi[findex].outputscaley );
+    printf( "\n" );
+    printf( "There are %d channel counts and they are: ", fi[findex].numtypes );
+    for( i=0; i < fi[findex].numtypes ; i++ ) printf( "%d ",fi[findex].effective[i] );
+    printf( "\n" );
+    printf( "There are %d input rect sizes and they are: ", fi[findex].numinputrects );
+    for( i=0; i < fi[findex].numtypes ; i++ ) printf( "%dx%d ",fi[findex].inputrects[i*2],fi[findex].inputrects[i*2+1] );
+    printf( "\n" );
+  }
+}
+
+static void current( int do_win, int do_bitmap )
+{
+  int i, findex;
+
+  trainstart = GetTickCount();
+
+  // clear progress
+  memset( windowranges, 0, sizeof( windowranges ) );
+  // copy in appropriate weights
+  memcpy( retrain_weights, stbir__compute_weights, sizeof( retrain_weights ) );
+
+  // build and print current errors and build current bitmap
+  for( i = 0 ; i < fi[0].numtypes ; i++ )
+  {
+    double curerr[STBIR_RESIZE_CLASSIFICATIONS];
+    int curtot[STBIR_RESIZE_CLASSIFICATIONS];
+    float (* weights)[4] = retrain_weights[i];
+ 
+    calc_errors( weights, curtot, curerr, i );
+    if ( !do_bitmap )
+      print_weights( weights, i, curtot, curerr );
+
+    for( findex = 0 ; findex < numfileinfo ; findex++ )
+      build_bitmap( weights, i, findex );
+  }
+
+  if ( do_win )
+    draw_window();
+
+  if ( do_bitmap )
+    write_bitmap();
+}
+
+static void compare()
+{
+  int i;
+
+  trainstart = GetTickCount();
+  windowstatus = 2; // comp mode
+
+  // clear progress
+  memset( windowranges, 0, sizeof( windowranges ) );
+
+  if ( ( fi[0].numtypes != fi[1].numtypes ) || ( fi[0].numinputrects != fi[1].numinputrects ) ||
+       ( fi[0].dimensionx != fi[1].dimensionx ) || ( fi[0].dimensiony != fi[1].dimensiony ) || 
+       ( fi[0].outputscalex != fi[1].outputscalex ) || ( fi[0].outputscaley != fi[1].outputscaley ) )
+  {
+   err:
+    printf( "Timing files don't match.\n" );
+    exit(5);
+  }
+
+  for( i=0; i < fi[0].numtypes ; i++ )
+  {
+    if ( fi[0].effective[i]      != fi[1].effective[i] ) goto err;
+    if ( fi[0].inputrects[i*2]   != fi[1].inputrects[i*2] ) goto err;
+    if ( fi[0].inputrects[i*2+1] != fi[1].inputrects[i*2+1] ) goto err;
+  }
+    
+  alloc_bitmap( 1 );
+  
+  for( i = 0 ; i < fi[0].numtypes ; i++ )
+  {
+    float (* weights)[4] = stbir__compute_weights[i];
+    build_comp_bitmap( weights, i );
+  }
+
+  draw_window();
+}
+
+static void load_files( char ** args, int count )
+{
+  int i;
+
+  if ( count == 0 )
+  {
+    printf( "No timing files listed!" );
+    exit(3);
+  }
+
+  for ( i = 0 ; i < count ; i++ )
+  {
+    if ( !use_timing_file( args[i], i ) )
+    {
+      printf( "Bad timing file %s\n", args[i] );
+      exit(2);
+    }
+  }
+  numfileinfo = count;
+}  
+
+int main( int argc, char ** argv )
+{
+  int check;
+  if ( argc < 3 )
+  {
+   err:
+    printf( "vf_train retrain [timing_filenames....] - recalcs weights for all the files on the command line.\n");
+    printf( "vf_train info [timing_filenames....] - shows info about each timing file.\n");
+    printf( "vf_train check [timing_filenames...] - show results for the current weights for all files listed.\n");
+    printf( "vf_train compare <timing file1> <timing file2> - compare two timing files (must only be two files and same resolution).\n");
+    printf( "vf_train bitmap [timing_filenames...] - write out results.png, comparing against the current weights for all files listed.\n");
+    exit(1);
+  }
+  
+  check = ( strcmp( argv[1], "check" ) == 0 );
+  if ( ( check ) || ( strcmp( argv[1], "bitmap" ) == 0 ) )
+  {
+    load_files( argv + 2, argc - 2 );
+    alloc_bitmap( numfileinfo );
+    current( check, !check );
+  }
+  else if ( strcmp( argv[1], "info" ) == 0 ) 
+  {
+    load_files( argv + 2, argc - 2 );
+    info();
+  }
+  else if ( strcmp( argv[1], "compare" ) == 0 ) 
+  {
+    if ( argc != 4 )
+    {
+      printf( "You must specify two files to compare.\n" );
+      exit(4);
+    }
+
+    load_files( argv + 2, argc - 2 );
+    compare();
+  }
+  else if ( strcmp( argv[1], "retrain" ) == 0 ) 
+  {
+    load_files( argv + 2, argc - 2 );
+    alloc_bitmap( numfileinfo );
+    retrain();  
+  }
+  else
+  {
+    goto err;
+  }
+
+  return 0;
+}
diff --git a/tests/resample_test.cpp b/tests/resample_test.cpp
index 21f874f..6595e37 100644
--- a/tests/resample_test.cpp
+++ b/tests/resample_test.cpp
@@ -64,7 +64,7 @@ void stbir_progress(float p)
 #define STBIR_PROGRESS_REPORT stbir_progress
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
 #define STB_IMAGE_RESIZE_STATIC
-#include "stb_image_resize.h"
+#include "stb_image_resize2.h"
 
 #define STB_IMAGE_WRITE_IMPLEMENTATION
 #include "stb_image_write.h"
@@ -143,7 +143,7 @@ void resizer(int argc, char **argv)
 	out_h = h*3;
 	output_pixels = (unsigned char*) malloc(out_w*out_h*n);
 	//stbir_resize_uint8_srgb(input_pixels, w, h, 0, output_pixels, out_w, out_h, 0, n, -1,0);
-	stbir_resize_uint8(input_pixels, w, h, 0, output_pixels, out_w, out_h, 0, n);
+	stbir_resize_uint8_linear(input_pixels, w, h, 0, output_pixels, out_w, out_h, 0, (stbir_pixel_layout) n);
 	stbi_write_png("output.png", out_w, out_h, n, output_pixels, 0);
 	exit(0);
 }
@@ -171,9 +171,9 @@ void performance(int argc, char **argv)
 	output_pixels = (unsigned char*) malloc(out_w*out_h*n);
     for (i=0; i < count; ++i)
         if (srgb)
-	        stbir_resize_uint8_srgb(input_pixels, w, h, 0, output_pixels, out_w, out_h, 0, n,-1,0);
+	        stbir_resize_uint8_srgb(input_pixels, w, h, 0, output_pixels, out_w, out_h, 0, (stbir_pixel_layout) n);
         else
-	        stbir_resize(input_pixels, w, h, 0, output_pixels, out_w, out_h, 0, STBIR_TYPE_UINT8, n,-1, 0, STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT, STBIR_COLORSPACE_LINEAR, NULL);
+	        stbir_resize_uint8_linear(input_pixels, w, h, 0, output_pixels, out_w, out_h, 0, (stbir_pixel_layout) n);
 	exit(0);
 }
 
@@ -188,6 +188,7 @@ int main(int argc, char** argv)
 	return 0;
 }
 
+#if 0
 void resize_image(const char* filename, float width_percent, float height_percent, stbir_filter filter, stbir_edge edge, stbir_colorspace colorspace, const char* output_filename)
 {
 	int w, h, n;
@@ -1120,3 +1121,7 @@ void test_suite(int argc, char **argv)
 	resize_image("gamma_2.2.jpg", .5f, .5f, STBIR_FILTER_CATMULLROM, STBIR_EDGE_REFLECT, STBIR_COLORSPACE_SRGB, "test-output/gamma_2.2.jpg");
 	resize_image("gamma_dalai_lama_gray.jpg", .5f, .5f, STBIR_FILTER_CATMULLROM, STBIR_EDGE_REFLECT, STBIR_COLORSPACE_SRGB, "test-output/gamma_dalai_lama_gray.jpg");
 }
+#endif
+void test_suite(int argc, char **argv)
+{
+}
diff --git a/tests/resize.dsp b/tests/resize.dsp
index 0aa1bba..cfb9608 100644
--- a/tests/resize.dsp
+++ b/tests/resize.dsp
@@ -88,7 +88,7 @@ SOURCE=.\resample_test.cpp
 # End Source File
 # Begin Source File
 
-SOURCE=..\stb_image_resize.h
+SOURCE=..\stb_image_resize2.h
 # End Source File
 # End Target
 # End Project
diff --git a/tests/stb.dsp b/tests/stb.dsp
index ba13ba1..849b95b 100644
--- a/tests/stb.dsp
+++ b/tests/stb.dsp
@@ -130,10 +130,6 @@ SOURCE=..\stb_image.h
 # End Source File
 # Begin Source File
 
-SOURCE=..\stb_image_resize.h
-# End Source File
-# Begin Source File
-
 SOURCE=..\stb_image_write.h
 # End Source File
 # Begin Source File
diff --git a/tests/test_c_compilation.c b/tests/test_c_compilation.c
index 11f6023..8141d2b 100644
--- a/tests/test_c_compilation.c
+++ b/tests/test_c_compilation.c
@@ -1,3 +1,6 @@
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#include "stb_image_resize2.h"
+
 #define STB_SPRINTF_IMPLEMENTATION
 #include "stb_sprintf.h"
 
@@ -7,7 +10,6 @@
 #define STB_DIVIDE_IMPLEMENTATION
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_HERRINGBONE_WANG_TILE_IMEPLEMENTATIOn
-#define STB_IMAGE_RESIZE_IMPLEMENTATION
 #define STB_RECT_PACK_IMPLEMENTATION
 #define STB_VOXEL_RENDER_IMPLEMENTATION
 #define STB_EASY_FONT_IMPLEMENTATION
@@ -20,7 +22,6 @@
 #include "stb_perlin.h"
 #include "stb_c_lexer.h"
 #include "stb_divide.h"
-#include "stb_image_resize.h"
 #include "stb_rect_pack.h"
 #include "stb_dxt.h"
 #include "stb_include.h"
diff --git a/tests/test_cpp_compilation.cpp b/tests/test_cpp_compilation.cpp
index fd8c5b6..d1d10b8 100644
--- a/tests/test_cpp_compilation.cpp
+++ b/tests/test_cpp_compilation.cpp
@@ -70,7 +70,7 @@ void my_free(void *) { }
 #include "stb_leakcheck.h"
 
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
-#include "stb_image_resize.h"
+#include "stb_image_resize2.h"
 
 //#include "stretchy_buffer.h"  // deprecating
 
diff --git a/tools/README.list b/tools/README.list
index 294bf87..e858783 100644
--- a/tools/README.list
+++ b/tools/README.list
@@ -3,7 +3,7 @@ stb_hexwave.h               | audio            | audio waveform synthesizer
 stb_image.h                 | graphics         | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 stb_truetype.h              | graphics         | parse, decode, and rasterize characters from truetype fonts
 stb_image_write.h           | graphics         | image writing to disk: PNG, TGA, BMP
-stb_image_resize.h          | graphics         | resize images larger/smaller with good quality
+stb_image_resize2.h         | graphics         | resize images larger/smaller with good quality
 stb_rect_pack.h             | graphics         | simple 2D rectangle packer with decent quality
 stb_perlin.h                | graphics         | perlin's revised simplex noise w/ different seeds
 stb_ds.h                    | utility          | typesafe dynamic array and hash tables for C, will compile in C++

From e81f294b163f0e6110a80c1fc37b1cae779d1dd6 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Wed, 11 Oct 2023 17:28:56 -0700
Subject: [PATCH 147/170] stb_image_resize: switch GNU/clang from "asm" to
 "__asm__" for -c99 compatibility

---
 stb_image_resize2.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index 7aaeab0..e0c4282 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -1,4 +1,4 @@
-/* stb_image_resize2 - v2.00 - public domain image resizing
+/* stb_image_resize2 - v2.01 - public domain image resizing
    
    by Jeff Roberts (v2) and Jorge L Rodriguez 
    http://github.com/nothings/stb
@@ -1178,10 +1178,10 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
   #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop
 #elif defined(  __clang__ )
   #define STBIR_STREAMOUT_PTR( star ) star __restrict__
-  #define STBIR_NO_UNROLL( ptr ) asm (""::"r"(ptr))
+  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
 #elif defined(  __GNUC__ )
   #define STBIR_STREAMOUT_PTR( star ) star __restrict__
-  #define STBIR_NO_UNROLL( ptr ) asm (""::"r"(ptr))
+  #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr))
 #else
   #define STBIR_STREAMOUT_PTR( star ) star
   #define STBIR_NO_UNROLL( ptr )

From beebb24b945efdea3b9bba23affb8eb3ba8982e7 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Wed, 11 Oct 2023 17:32:00 -0700
Subject: [PATCH 148/170] README.md: fix reference to stb_image_resize2.h

---
 README.md              | 4 ++--
 tools/README.header.md | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index cf7f162..85b3a99 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ Noteworthy:
 
 * image loader: [stb_image.h](stb_image.h)
 * image writer: [stb_image_write.h](stb_image_write.h)
-* image resizer: [stb_image_resize.h](stb_image_resize.h)
+* image resizer: [stb_image_resize2.h](stb_image_resize2.h)
 * font text rasterizer: [stb_truetype.h](stb_truetype.h)
 * typesafe containers: [stb_ds.h](stb_ds.h)
 
@@ -25,7 +25,7 @@ library    | lastest version | category | LoC | description
 **[stb_image.h](stb_image.h)** | 2.28 | graphics | 7987 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 **[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
-**[stb_image_resize2.h](stb_image_resize2.h)** | 2.00 | graphics | 10303 | resize images larger/smaller with good quality
+**[stb_image_resize2.h](stb_image_resize2.h)** | 2.01 | graphics | 10303 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
 **[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds
 **[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
diff --git a/tools/README.header.md b/tools/README.header.md
index 03fa1c5..e49d67b 100644
--- a/tools/README.header.md
+++ b/tools/README.header.md
@@ -7,7 +7,7 @@ Noteworthy:
 
 * image loader: [stb_image.h](stb_image.h)
 * image writer: [stb_image_write.h](stb_image_write.h)
-* image resizer: [stb_image_resize.h](stb_image_resize.h)
+* image resizer: [stb_image_resize2.h](stb_image_resize2.h)
 * font text rasterizer: [stb_truetype.h](stb_truetype.h)
 * typesafe containers: [stb_ds.h](stb_ds.h)
 

From 1d878bd2a3f06d048afb359873a9108ac9500355 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Wed, 15 Nov 2023 09:35:21 -0800
Subject: [PATCH 149/170] security notice

---
 README.md              | 2 ++
 tools/README.header.md | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 85b3a99..2c3707f 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,8 @@ stb
 
 single-file public domain (or MIT licensed) libraries for C/C++
 
+# This project discusses security-relevant bugs in public in Github Issues and Pull Requests, and they make some time to be fixed. If this poses an unreasonable risk to your project, do not use stb libraries.
+
 Noteworthy:
 
 * image loader: [stb_image.h](stb_image.h)
diff --git a/tools/README.header.md b/tools/README.header.md
index e49d67b..4f81dcb 100644
--- a/tools/README.header.md
+++ b/tools/README.header.md
@@ -3,6 +3,8 @@ stb
 
 single-file public domain (or MIT licensed) libraries for C/C++
 
+# This project discusses security-relevant bugs in public in Github Issues and Pull Requests, and they make some time to be fixed. If this poses an unreasonable risk to your project, do not use stb libraries.
+
 Noteworthy:
 
 * image loader: [stb_image.h](stb_image.h)

From 03f50e343d796e492e6579a11143a085429d7f5d Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Wed, 15 Nov 2023 09:36:28 -0800
Subject: [PATCH 150/170] security notice

---
 README.md              | 2 +-
 tools/README.header.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2c3707f..48fb917 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ stb
 
 single-file public domain (or MIT licensed) libraries for C/C++
 
-# This project discusses security-relevant bugs in public in Github Issues and Pull Requests, and they make some time to be fixed. If this poses an unreasonable risk to your project, do not use stb libraries.
+# This project discusses security-relevant bugs in public in Github Issues and Pull Requests, and it may take significant time for security fixes to be implemented or merged. If this poses an unreasonable risk to your project, do not use stb libraries.
 
 Noteworthy:
 
diff --git a/tools/README.header.md b/tools/README.header.md
index 4f81dcb..0050967 100644
--- a/tools/README.header.md
+++ b/tools/README.header.md
@@ -3,7 +3,7 @@ stb
 
 single-file public domain (or MIT licensed) libraries for C/C++
 
-# This project discusses security-relevant bugs in public in Github Issues and Pull Requests, and they make some time to be fixed. If this poses an unreasonable risk to your project, do not use stb libraries.
+# This project discusses security-relevant bugs in public in Github Issues and Pull Requests, and it may take significant time for security fixes to be implemented or merged. If this poses an unreasonable risk to your project, do not use stb libraries.
 
 Noteworthy:
 

From d3736741151fd76cf8deccb9e94d4d83da3d3e4e Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Mon, 6 Mar 2023 01:03:05 -0800
Subject: [PATCH 151/170] stb_image: Fix zlib decoder end-of-stream handling

We speculatively try to fill our bit buffer to always contain
at least 16 bits for stbi__zhuffman_decode. It's not a sign of
a malformed stream for us to be reading past the end there,
because the contents of that bit buffer are speculative; it's
only a malformed stream if we actually _consume_ the extra bits.

This fix adds some extra logic where we the first time we hit
zeof, we add an explicit 16 extra zero bits at the top of the
bit buffer just so that for the purposes of the decoder, we have
16 bits in the buffer.

However, if at the end of stream, we have the "hit zeof once"
flag set and less than 16 bits remaining in the bit buffer, we
know some of those implicit zero bits got read, which indicates
we actually had a past-end-of-stream read. In that case, flag
it as an error.

While I'm in here, also rephrase the length-too-large check to
not do any potentially-overflowing pointer arithmetic.

Fixes issue #1456.
---
 stb_image.h | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 5e807a0..d4fd2fc 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -4176,6 +4176,7 @@ typedef struct
 {
    stbi_uc *zbuffer, *zbuffer_end;
    int num_bits;
+   int hit_zeof_once;
    stbi__uint32 code_buffer;
 
    char *zout;
@@ -4242,9 +4243,20 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
    int b,s;
    if (a->num_bits < 16) {
       if (stbi__zeof(a)) {
-         return -1;   /* report error for unexpected end of data. */
+         if (!a->hit_zeof_once) {
+            // This is the first time we hit eof, insert 16 extra padding btis
+            // to allow us to keep going; if we actually consume any of them
+            // though, that is invalid data. This is caught later.
+            a->hit_zeof_once = 1;
+            a->num_bits += 16; // add 16 implicit zero bits
+         } else {
+            // We already inserted our extra 16 padding bits and are again
+            // out, this stream is actually prematurely terminated.
+            return -1;
+         }
+      } else {
+         stbi__fill_bits(a);
       }
-      stbi__fill_bits(a);
    }
    b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
    if (b) {
@@ -4309,6 +4321,13 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
          int len,dist;
          if (z == 256) {
             a->zout = zout;
+            if (a->hit_zeof_once && a->num_bits < 16) {
+               // The first time we hit zeof, we inserted 16 extra zero bits into our bit
+               // buffer so the decoder can just do its speculative decoding. But if we
+               // actually consumed any of those bits (which is the case when num_bits < 16),
+               // the stream actually read past the end so it is malformed.
+               return stbi__err("unexpected end","Corrupt PNG");
+            }
             return 1;
          }
          if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data
@@ -4320,7 +4339,7 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
          dist = stbi__zdist_base[z];
          if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
          if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG");
-         if (zout + len > a->zout_end) {
+         if (len > a->zout_end - zout) {
             if (!stbi__zexpand(a, zout, len)) return 0;
             zout = a->zout;
          }
@@ -4464,6 +4483,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
       if (!stbi__parse_zlib_header(a)) return 0;
    a->num_bits = 0;
    a->code_buffer = 0;
+   a->hit_zeof_once = 0;
    do {
       final = stbi__zreceive(a,1);
       type = stbi__zreceive(a,2);

From e5f0e18d0f16f4e8e4975ee19848dbd9feda9268 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Mon, 1 May 2023 17:10:45 -0700
Subject: [PATCH 152/170] stb_image: Small PNG filter simplification

Paeth on the first scanline is not a separate filter, it
happens to be equivalent to the "sub" filter which is already
an option.
---
 stb_image.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index d4fd2fc..50aea70 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -4639,9 +4639,8 @@ enum {
    STBI__F_up=2,
    STBI__F_avg=3,
    STBI__F_paeth=4,
-   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
-   STBI__F_avg_first,
-   STBI__F_paeth_first
+   // synthetic filter used for first scanline to avoid needing a dummy row of 0s
+   STBI__F_avg_first
 };
 
 static stbi_uc first_row_filter[5] =
@@ -4650,7 +4649,7 @@ static stbi_uc first_row_filter[5] =
    STBI__F_sub,
    STBI__F_none,
    STBI__F_avg_first,
-   STBI__F_paeth_first
+   STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub
 };
 
 static int stbi__paeth(int a, int b, int c)
@@ -4719,9 +4718,8 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
             case STBI__F_sub        : cur[k] = raw[k]; break;
             case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
             case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
-            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
+            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; // prior[k] == stbi__paeth(0,prior[k],0)
             case STBI__F_avg_first  : cur[k] = raw[k]; break;
-            case STBI__F_paeth_first: cur[k] = raw[k]; break;
          }
       }
 
@@ -4753,13 +4751,12 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
                 for (k=0; k < nk; ++k)
          switch (filter) {
             // "none" filter turns into a memcpy here; make that explicit.
-            case STBI__F_none:         memcpy(cur, raw, nk); break;
+            case STBI__F_none:               memcpy(cur, raw, nk); break;
             STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
             STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
             STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
             STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
             STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
-            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
          }
          #undef STBI__CASE
          raw += nk;
@@ -4776,7 +4773,6 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
             STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
             STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
             STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
-            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
          }
          #undef STBI__CASE
 

From 45eb4ac1588796d053a485c5712a65e4c8763546 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Mon, 1 May 2023 22:46:32 -0700
Subject: [PATCH 153/170] global: very basic .gitignore for object files

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8cc774f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.o
+*.obj
+*.exe

From ed643334103597cb7234f088e377ffc5bd8b3b47 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Mon, 1 May 2023 22:47:45 -0700
Subject: [PATCH 154/170] tests: test_png_regress

I want to make some changes to the PNG loader, this is to get some
test coverage at least to make it easier not to break anything.

The two ref_results files that are "corrupt" files that stb_image
nevertheless loads without error are checksum failures; this is
by design, since stb_image does not verify checksums.
---
 tests/pngsuite/ref_results.csv | 259 +++++++++++++++++++++++++++++++++
 tests/test_png_regress.c       |  75 ++++++++++
 2 files changed, 334 insertions(+)
 create mode 100644 tests/pngsuite/ref_results.csv
 create mode 100644 tests/test_png_regress.c

diff --git a/tests/pngsuite/ref_results.csv b/tests/pngsuite/ref_results.csv
new file mode 100644
index 0000000..74dc2e6
--- /dev/null
+++ b/tests/pngsuite/ref_results.csv
@@ -0,0 +1,259 @@
+filename,width,height,ncomp,error,hash
+pngsuite/16bit/basi0g16.png,32,32,1,,0xfc8f2f99
+pngsuite/16bit/basi2c16.png,32,32,3,,0x65567ed5
+pngsuite/16bit/basi4a16.png,32,32,2,,0x198cf245
+pngsuite/16bit/basi6a16.png,32,32,4,,0x3016e9b5
+pngsuite/16bit/basn0g16.png,32,32,1,,0xfc8f2f99
+pngsuite/16bit/basn2c16.png,32,32,3,,0x65567ed5
+pngsuite/16bit/basn4a16.png,32,32,2,,0x198cf245
+pngsuite/16bit/basn6a16.png,32,32,4,,0x3016e9b5
+pngsuite/16bit/bgai4a16.png,32,32,2,,0x198cf245
+pngsuite/16bit/bgan6a16.png,32,32,4,,0x3016e9b5
+pngsuite/16bit/bggn4a16.png,32,32,2,,0x198cf245
+pngsuite/16bit/bgyn6a16.png,32,32,4,,0x3016e9b5
+pngsuite/16bit/oi1n0g16.png,32,32,1,,0xfc8f2f99
+pngsuite/16bit/oi1n2c16.png,32,32,3,,0x65567ed5
+pngsuite/16bit/oi2n0g16.png,32,32,1,,0xfc8f2f99
+pngsuite/16bit/oi2n2c16.png,32,32,3,,0x65567ed5
+pngsuite/16bit/oi4n0g16.png,32,32,1,,0xfc8f2f99
+pngsuite/16bit/oi4n2c16.png,32,32,3,,0x65567ed5
+pngsuite/16bit/oi9n0g16.png,32,32,1,,0xfc8f2f99
+pngsuite/16bit/oi9n2c16.png,32,32,3,,0x65567ed5
+pngsuite/16bit/tbbn2c16.png,32,32,4,,0xaa9bfe44
+pngsuite/16bit/tbgn2c16.png,32,32,4,,0xaa9bfe44
+pngsuite/16bit/tbwn0g16.png,32,32,2,,0x075e519a
+pngsuite/corrupt/xc1n0g08.png,32,32,2,bad ctype,0x00000000
+pngsuite/corrupt/xc9n2c08.png,32,32,2,bad ctype,0x00000000
+pngsuite/corrupt/xcrn0g04.png,32,32,2,unknown image type,0x00000000
+pngsuite/corrupt/xcsn0g01.png,32,32,1,,0x43b9891f
+pngsuite/corrupt/xd0n2c08.png,32,32,1,1/2/4/8/16-bit only,0x00000000
+pngsuite/corrupt/xd3n2c08.png,32,32,1,1/2/4/8/16-bit only,0x00000000
+pngsuite/corrupt/xd9n2c08.png,32,32,1,1/2/4/8/16-bit only,0x00000000
+pngsuite/corrupt/xdtn0g01.png,32,32,1,no IDAT,0x00000000
+pngsuite/corrupt/xhdn0g08.png,32,32,1,,0x414f1ca9
+pngsuite/corrupt/xlfn0g04.png,32,32,1,unknown image type,0x00000000
+pngsuite/corrupt/xs1n0g01.png,32,32,1,unknown image type,0x00000000
+pngsuite/corrupt/xs2n0g01.png,32,32,1,unknown image type,0x00000000
+pngsuite/corrupt/xs4n0g01.png,32,32,1,unknown image type,0x00000000
+pngsuite/corrupt/xs7n0g01.png,32,32,1,unknown image type,0x00000000
+pngsuite/iphone/iphone_basi0g01.png,32,32,4,,0x5fb33cfd
+pngsuite/iphone/iphone_basi0g02.png,32,32,4,,0x5bbe95c5
+pngsuite/iphone/iphone_basi3p02.png,32,32,4,,0x50ba29c5
+pngsuite/iphone/iphone_bgwn6a08.png,32,32,4,,0x45d8548a
+pngsuite/iphone/iphone_bgyn6a16.png,32,32,4,,0x4b2b7545
+pngsuite/iphone/iphone_tbyn3p08.png,32,32,4,,0x8ea9aaaf
+pngsuite/iphone/iphone_z06n2c08.png,32,32,4,,0xb5dd034b
+pngsuite/primary/basi0g01.png,32,32,1,,0x43b9891f
+pngsuite/primary/basi0g02.png,32,32,1,,0xaf0bb3c5
+pngsuite/primary/basi0g04.png,32,32,1,,0x6fbaeb45
+pngsuite/primary/basi0g08.png,32,32,1,,0x414f1ca9
+pngsuite/primary/basi2c08.png,32,32,3,,0x522345c5
+pngsuite/primary/basi3p01.png,32,32,3,,0x9c5b75c5
+pngsuite/primary/basi3p02.png,32,32,3,,0x46f26ec5
+pngsuite/primary/basi3p04.png,32,32,3,,0x35b2e4a5
+pngsuite/primary/basi3p08.png,32,32,3,,0xfe066865
+pngsuite/primary/basi4a08.png,32,32,2,,0x77cbbfa5
+pngsuite/primary/basi6a08.png,32,32,4,,0xb472197d
+pngsuite/primary/basn0g01.png,32,32,1,,0x43b9891f
+pngsuite/primary/basn0g02.png,32,32,1,,0xaf0bb3c5
+pngsuite/primary/basn0g04.png,32,32,1,,0x6fbaeb45
+pngsuite/primary/basn0g08.png,32,32,1,,0x414f1ca9
+pngsuite/primary/basn2c08.png,32,32,3,,0x522345c5
+pngsuite/primary/basn3p01.png,32,32,3,,0x9c5b75c5
+pngsuite/primary/basn3p02.png,32,32,3,,0x46f26ec5
+pngsuite/primary/basn3p04.png,32,32,3,,0x35b2e4a5
+pngsuite/primary/basn3p08.png,32,32,3,,0xfe066865
+pngsuite/primary/basn4a08.png,32,32,2,,0x77cbbfa5
+pngsuite/primary/basn6a08.png,32,32,4,,0xb472197d
+pngsuite/primary/bgai4a08.png,32,32,2,,0x77cbbfa5
+pngsuite/primary/bgan6a08.png,32,32,4,,0xb472197d
+pngsuite/primary/bgbn4a08.png,32,32,2,,0x77cbbfa5
+pngsuite/primary/bgwn6a08.png,32,32,4,,0xb472197d
+pngsuite/primary/s01i3p01.png,1,1,3,,0xafb003b6
+pngsuite/primary/s01n3p01.png,1,1,3,,0xafb003b6
+pngsuite/primary/s02i3p01.png,2,2,3,,0x96f3dd85
+pngsuite/primary/s02n3p01.png,2,2,3,,0x96f3dd85
+pngsuite/primary/s03i3p01.png,3,3,3,,0xb0cf1241
+pngsuite/primary/s03n3p01.png,3,3,3,,0xb0cf1241
+pngsuite/primary/s04i3p01.png,4,4,3,,0xbfcedd75
+pngsuite/primary/s04n3p01.png,4,4,3,,0xbfcedd75
+pngsuite/primary/s05i3p02.png,5,5,3,,0xc322cedd
+pngsuite/primary/s05n3p02.png,5,5,3,,0xc322cedd
+pngsuite/primary/s06i3p02.png,6,6,3,,0x46916799
+pngsuite/primary/s06n3p02.png,6,6,3,,0x46916799
+pngsuite/primary/s07i3p02.png,7,7,3,,0xfdabc297
+pngsuite/primary/s07n3p02.png,7,7,3,,0xfdabc297
+pngsuite/primary/s08i3p02.png,8,8,3,,0x8f036d09
+pngsuite/primary/s08n3p02.png,8,8,3,,0x8f036d09
+pngsuite/primary/s09i3p02.png,9,9,3,,0x16a46830
+pngsuite/primary/s09n3p02.png,9,9,3,,0x16a46830
+pngsuite/primary/s32i3p04.png,32,32,3,,0x4bd4fbd3
+pngsuite/primary/s32n3p04.png,32,32,3,,0x4bd4fbd3
+pngsuite/primary/s33i3p04.png,33,33,3,,0x51aa005e
+pngsuite/primary/s33n3p04.png,33,33,3,,0x51aa005e
+pngsuite/primary/s34i3p04.png,34,34,3,,0x84818775
+pngsuite/primary/s34n3p04.png,34,34,3,,0x84818775
+pngsuite/primary/s35i3p04.png,35,35,3,,0x6359ec75
+pngsuite/primary/s35n3p04.png,35,35,3,,0x6359ec75
+pngsuite/primary/s36i3p04.png,36,36,3,,0xe4878065
+pngsuite/primary/s36n3p04.png,36,36,3,,0xe4878065
+pngsuite/primary/s37i3p04.png,37,37,3,,0x3cefc423
+pngsuite/primary/s37n3p04.png,37,37,3,,0x3cefc423
+pngsuite/primary/s38i3p04.png,38,38,3,,0xffc55a2b
+pngsuite/primary/s38n3p04.png,38,38,3,,0xffc55a2b
+pngsuite/primary/s39i3p04.png,39,39,3,,0x0c790240
+pngsuite/primary/s39n3p04.png,39,39,3,,0x0c790240
+pngsuite/primary/s40i3p04.png,40,40,3,,0x951a316d
+pngsuite/primary/s40n3p04.png,40,40,3,,0x951a316d
+pngsuite/primary/tbbn0g04.png,32,32,2,,0x9c8410ea
+pngsuite/primary/tbbn3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary/tbgn3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary/tbrn2c08.png,32,32,4,,0xaa9bfe44
+pngsuite/primary/tbwn3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary/tbyn3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary/tm3n3p02.png,32,32,4,,0xf59745c5
+pngsuite/primary/tp0n0g08.png,32,32,1,,0xbac0864c
+pngsuite/primary/tp0n2c08.png,32,32,3,,0x82687c37
+pngsuite/primary/tp0n3p08.png,32,32,3,,0x61f54e37
+pngsuite/primary/tp1n3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary/z00n2c08.png,32,32,3,,0x65b4a72f
+pngsuite/primary/z03n2c08.png,32,32,3,,0x65b4a72f
+pngsuite/primary/z06n2c08.png,32,32,3,,0x65b4a72f
+pngsuite/primary/z09n2c08.png,32,32,3,,0x65b4a72f
+pngsuite/primary_check/basi0g01.png,32,32,4,,0x5fb33cfd
+pngsuite/primary_check/basi0g02.png,32,32,4,,0x5bbe95c5
+pngsuite/primary_check/basi0g04.png,32,32,4,,0x3468b9c5
+pngsuite/primary_check/basi0g08.png,32,32,4,,0x262ef46d
+pngsuite/primary_check/basi2c08.png,32,32,4,,0x1fc92bc5
+pngsuite/primary_check/basi3p01.png,32,32,4,,0x28a3e1c5
+pngsuite/primary_check/basi3p02.png,32,32,4,,0x803be5c5
+pngsuite/primary_check/basi3p04.png,32,32,4,,0xf3fc60e5
+pngsuite/primary_check/basi3p08.png,32,32,4,,0x30ef4f45
+pngsuite/primary_check/basi4a08.png,32,32,4,,0x23c8536d
+pngsuite/primary_check/basi6a08.png,32,32,4,,0xb472197d
+pngsuite/primary_check/basn0g01.png,32,32,4,,0x5fb33cfd
+pngsuite/primary_check/basn0g02.png,32,32,4,,0x5bbe95c5
+pngsuite/primary_check/basn0g04.png,32,32,4,,0x3468b9c5
+pngsuite/primary_check/basn0g08.png,32,32,4,,0x262ef46d
+pngsuite/primary_check/basn2c08.png,32,32,4,,0x1fc92bc5
+pngsuite/primary_check/basn3p01.png,32,32,4,,0x28a3e1c5
+pngsuite/primary_check/basn3p02.png,32,32,4,,0x803be5c5
+pngsuite/primary_check/basn3p04.png,32,32,4,,0xf3fc60e5
+pngsuite/primary_check/basn3p08.png,32,32,4,,0x30ef4f45
+pngsuite/primary_check/basn4a08.png,32,32,4,,0x23c8536d
+pngsuite/primary_check/basn6a08.png,32,32,4,,0xb472197d
+pngsuite/primary_check/bgai4a08.png,32,32,4,,0x23c8536d
+pngsuite/primary_check/bgan6a08.png,32,32,4,,0xb472197d
+pngsuite/primary_check/bgbn4a08.png,32,32,4,,0x23c8536d
+pngsuite/primary_check/bgwn6a08.png,32,32,4,,0xb472197d
+pngsuite/primary_check/s01i3p01.png,1,1,4,,0xdb152beb
+pngsuite/primary_check/s01n3p01.png,1,1,4,,0xdb152beb
+pngsuite/primary_check/s02i3p01.png,2,2,4,,0xa344a3a5
+pngsuite/primary_check/s02n3p01.png,2,2,4,,0xa344a3a5
+pngsuite/primary_check/s03i3p01.png,3,3,4,,0x594d3bfa
+pngsuite/primary_check/s03n3p01.png,3,3,4,,0x594d3bfa
+pngsuite/primary_check/s04i3p01.png,4,4,4,,0xd59d4605
+pngsuite/primary_check/s04n3p01.png,4,4,4,,0xd59d4605
+pngsuite/primary_check/s05i3p02.png,5,5,4,,0x41e58366
+pngsuite/primary_check/s05n3p02.png,5,5,4,,0x41e58366
+pngsuite/primary_check/s06i3p02.png,6,6,4,,0xcad1a885
+pngsuite/primary_check/s06n3p02.png,6,6,4,,0xcad1a885
+pngsuite/primary_check/s07i3p02.png,7,7,4,,0x09184108
+pngsuite/primary_check/s07n3p02.png,7,7,4,,0x09184108
+pngsuite/primary_check/s08i3p02.png,8,8,4,,0x4fd11cad
+pngsuite/primary_check/s08n3p02.png,8,8,4,,0x4fd11cad
+pngsuite/primary_check/s09i3p02.png,9,9,4,,0xc50dbecd
+pngsuite/primary_check/s09n3p02.png,9,9,4,,0xc50dbecd
+pngsuite/primary_check/s32i3p04.png,32,32,4,,0x95cbb1d3
+pngsuite/primary_check/s32n3p04.png,32,32,4,,0x95cbb1d3
+pngsuite/primary_check/s33i3p04.png,33,33,4,,0x6649fc5b
+pngsuite/primary_check/s33n3p04.png,33,33,4,,0x6649fc5b
+pngsuite/primary_check/s34i3p04.png,34,34,4,,0x35b98e15
+pngsuite/primary_check/s34n3p04.png,34,34,4,,0x35b98e15
+pngsuite/primary_check/s35i3p04.png,35,35,4,,0xc9ddf938
+pngsuite/primary_check/s35n3p04.png,35,35,4,,0xc9ddf938
+pngsuite/primary_check/s36i3p04.png,36,36,4,,0x7bb4e1cd
+pngsuite/primary_check/s36n3p04.png,36,36,4,,0x7bb4e1cd
+pngsuite/primary_check/s37i3p04.png,37,37,4,,0xee50001c
+pngsuite/primary_check/s37n3p04.png,37,37,4,,0xee50001c
+pngsuite/primary_check/s38i3p04.png,38,38,4,,0x51b76813
+pngsuite/primary_check/s38n3p04.png,38,38,4,,0x51b76813
+pngsuite/primary_check/s39i3p04.png,39,39,4,,0x42f23327
+pngsuite/primary_check/s39n3p04.png,39,39,4,,0x42f23327
+pngsuite/primary_check/s40i3p04.png,40,40,4,,0xf91b6a7d
+pngsuite/primary_check/s40n3p04.png,40,40,4,,0xf91b6a7d
+pngsuite/primary_check/tbbn0g04.png,32,32,4,,0x8a0117a4
+pngsuite/primary_check/tbbn3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary_check/tbgn3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary_check/tbrn2c08.png,32,32,4,,0xaa9bfe44
+pngsuite/primary_check/tbwn3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary_check/tbyn3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary_check/tm3n3p02.png,32,32,4,,0xf59745c5
+pngsuite/primary_check/tp0n0g08.png,32,32,4,,0xd405ad2e
+pngsuite/primary_check/tp0n2c08.png,32,32,4,,0x5a66ca09
+pngsuite/primary_check/tp0n3p08.png,32,32,4,,0x06e81adf
+pngsuite/primary_check/tp1n3p08.png,32,32,4,,0x82bf9a57
+pngsuite/primary_check/z00n2c08.png,32,32,4,,0xaa698493
+pngsuite/primary_check/z03n2c08.png,32,32,4,,0xaa698493
+pngsuite/primary_check/z06n2c08.png,32,32,4,,0xaa698493
+pngsuite/primary_check/z09n2c08.png,32,32,4,,0xaa698493
+pngsuite/unused/ccwn2c08.png,32,32,3,,0xbb576418
+pngsuite/unused/ccwn3p08.png,32,32,3,,0x5c4df060
+pngsuite/unused/cdfn2c08.png,8,32,3,,0xe30ed48f
+pngsuite/unused/cdhn2c08.png,32,8,3,,0x999321f5
+pngsuite/unused/cdsn2c08.png,8,8,3,,0x7f63fa01
+pngsuite/unused/cdun2c08.png,32,32,3,,0xbd325d71
+pngsuite/unused/ch1n3p04.png,32,32,3,,0x35b2e4a5
+pngsuite/unused/ch2n3p08.png,32,32,3,,0xfe066865
+pngsuite/unused/cm0n0g04.png,32,32,1,,0xe9f53e6c
+pngsuite/unused/cm7n0g04.png,32,32,1,,0xe9f53e6c
+pngsuite/unused/cm9n0g04.png,32,32,1,,0xe9f53e6c
+pngsuite/unused/cs3n2c16.png,32,32,3,,0x7f0fa2c5
+pngsuite/unused/cs3n3p08.png,32,32,3,,0x5533bac5
+pngsuite/unused/cs5n2c08.png,32,32,3,,0x8a80f8c5
+pngsuite/unused/cs5n3p08.png,32,32,3,,0x8a80f8c5
+pngsuite/unused/cs8n2c08.png,32,32,3,,0x7f0fa2c5
+pngsuite/unused/cs8n3p08.png,32,32,3,,0x7f0fa2c5
+pngsuite/unused/ct0n0g04.png,32,32,1,,0xe9f53e6c
+pngsuite/unused/ct1n0g04.png,32,32,1,,0xe9f53e6c
+pngsuite/unused/cten0g04.png,32,32,1,,0x1c073b45
+pngsuite/unused/ctfn0g04.png,32,32,1,,0xfa9fd205
+pngsuite/unused/ctgn0g04.png,32,32,1,,0xf28c8085
+pngsuite/unused/cthn0g04.png,32,32,1,,0x7c039595
+pngsuite/unused/ctjn0g04.png,32,32,1,,0xc520f455
+pngsuite/unused/ctzn0g04.png,32,32,1,,0xe9f53e6c
+pngsuite/unused/f00n0g08.png,32,32,1,,0x21db411b
+pngsuite/unused/f00n2c08.png,32,32,3,,0x1f25ded0
+pngsuite/unused/f01n0g08.png,32,32,1,,0x7437b32a
+pngsuite/unused/f01n2c08.png,32,32,3,,0x0d4507ae
+pngsuite/unused/f02n0g08.png,32,32,1,,0x6b633c7c
+pngsuite/unused/f02n2c08.png,32,32,3,,0x4b278986
+pngsuite/unused/f03n0g08.png,32,32,1,,0x2f31c08e
+pngsuite/unused/f03n2c08.png,32,32,3,,0x843ecc7e
+pngsuite/unused/f04n0g08.png,32,32,1,,0xfd3a0b73
+pngsuite/unused/f04n2c08.png,32,32,3,,0x557174bc
+pngsuite/unused/f99n0g04.png,32,32,1,,0xb79aa6e1
+pngsuite/unused/g03n0g16.png,32,32,1,,0xecd13817
+pngsuite/unused/g03n2c08.png,32,32,3,,0x242407a8
+pngsuite/unused/g03n3p04.png,32,32,3,,0xe801ecc8
+pngsuite/unused/g04n0g16.png,32,32,1,,0xc11bc972
+pngsuite/unused/g04n2c08.png,32,32,3,,0xdf843cc4
+pngsuite/unused/g04n3p04.png,32,32,3,,0x60e41f3b
+pngsuite/unused/g05n0g16.png,32,32,1,,0xbe6615a5
+pngsuite/unused/g05n2c08.png,32,32,3,,0x5c312116
+pngsuite/unused/g05n3p04.png,32,32,3,,0x2e0fbf86
+pngsuite/unused/g07n0g16.png,32,32,1,,0x2b54a398
+pngsuite/unused/g07n2c08.png,32,32,3,,0xf765fb10
+pngsuite/unused/g07n3p04.png,32,32,3,,0x9a8c3338
+pngsuite/unused/g10n0g16.png,32,32,1,,0xb08a92e1
+pngsuite/unused/g10n2c08.png,32,32,3,,0xa43f2291
+pngsuite/unused/g10n3p04.png,32,32,3,,0xb733194c
+pngsuite/unused/g25n0g16.png,32,32,1,,0xa6b1f5dd
+pngsuite/unused/g25n2c08.png,32,32,3,,0x767aee0c
+pngsuite/unused/g25n3p04.png,32,32,3,,0x4cf349a8
+pngsuite/unused/pp0n2c16.png,32,32,3,,0x65567ed5
+pngsuite/unused/pp0n6a08.png,32,32,4,,0x3188c645
+pngsuite/unused/ps1n0g08.png,32,32,1,,0x414f1ca9
+pngsuite/unused/ps1n2c16.png,32,32,3,,0x65567ed5
+pngsuite/unused/ps2n0g08.png,32,32,1,,0x414f1ca9
+pngsuite/unused/ps2n2c16.png,32,32,3,,0x65567ed5
diff --git a/tests/test_png_regress.c b/tests/test_png_regress.c
new file mode 100644
index 0000000..5ba6796
--- /dev/null
+++ b/tests/test_png_regress.c
@@ -0,0 +1,75 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#define STBI_WINDOWS_UTF8
+
+#ifdef _WIN32
+#define WIN32 // what stb.h checks
+#pragma comment(lib, "advapi32.lib")
+#endif
+
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb_image.h"
+
+#define STB_DEFINE
+#include "deprecated/stb.h"
+
+static unsigned int fnv1a_hash32(const stbi_uc *bytes, size_t len)
+{
+   unsigned int hash = 0x811c9dc5;
+   unsigned int mul = 0x01000193;
+   size_t i;
+
+   for (i = 0; i < len; ++i)
+      hash = (hash ^ bytes[i]) * mul;
+
+   return hash;
+}
+
+// The idea for this test is to leave pngsuite/ref_results.csv checked in,
+// and then you can run this test after making PNG loader changes. If the
+// ref results change (as per git diff), confirm that the change was
+// intentional. If so, commit them as well; if not, undo.
+int main()
+{
+   char **files;
+   FILE *csv_file;
+   int i;
+
+   files = stb_readdir_recursive("pngsuite", "*.png");
+   if (!files) {
+      fprintf(stderr, "pngsuite files not found!\n");
+      return 1;
+   }
+
+   // sort files by name
+   qsort(files, stb_arr_len(files), sizeof(char*), stb_qsort_strcmp(0));
+
+   csv_file = fopen("pngsuite/ref_results.csv", "w");
+   if (!csv_file) {
+      fprintf(stderr, "error opening ref results for writing!\n");
+      stb_readdir_free(files);
+      return 1;
+   }
+
+   fprintf(csv_file, "filename,width,height,ncomp,error,hash\n");
+   for (i = 0; i < stb_arr_len(files); ++i) {
+      char *filename = files[i];
+      int width, height, ncomp;
+      stbi_uc *pixels = stbi_load(filename, &width, &height, &ncomp, 0);
+      const char *error = "";
+      unsigned int hash = 0;
+
+      if (!pixels)
+         error = stbi_failure_reason();
+      else {
+         hash = fnv1a_hash32(pixels, width * height * ncomp);
+         stbi_image_free(pixels);
+      }
+
+      fprintf(csv_file, "%s,%d,%d,%d,%s,0x%08x\n", filename, width, height, ncomp, error, hash);
+   }
+
+   fclose(csv_file);
+   stb_readdir_free(files);
+}

From 07268cbf36f749c058e9dde1728fd10d4bec02d0 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 2 May 2023 01:17:05 -0700
Subject: [PATCH 155/170] stb_image: New Paeth filter

This formulation is equivalent to the original (reference)
implementation but runs _significantly_ faster - this speeds up
the filtering portion of a Paeth-heavy 8192x8192 16-bit/channel
image by a factor of more than 2 on a Zen2 CPU.

I'm investigating doing a more thorough restructuring of this pass,
but this seems like a good first step.
---
 stb_image.h            | 16 +++++++-------
 tests/test_png_paeth.c | 47 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 7 deletions(-)
 create mode 100644 tests/test_png_paeth.c

diff --git a/stb_image.h b/stb_image.h
index 50aea70..c59df5b 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -4654,13 +4654,15 @@ static stbi_uc first_row_filter[5] =
 
 static int stbi__paeth(int a, int b, int c)
 {
-   int p = a + b - c;
-   int pa = abs(p-a);
-   int pb = abs(p-b);
-   int pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return a;
-   if (pb <= pc) return b;
-   return c;
+   // This formulation looks very different from the reference in the PNG spec, but is
+   // actually equivalent and has favorable data dependencies and admits straightforward
+   // generation of branch-free code, which helps performance significantly.
+   int thresh = c*3 - (a + b);
+   int lo = a < b ? a : b;
+   int hi = a < b ? b : a;
+   int t0 = (hi <= thresh) ? lo : c;
+   int t1 = (thresh <= lo) ? hi : t0;
+   return t1;
 }
 
 static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
diff --git a/tests/test_png_paeth.c b/tests/test_png_paeth.c
new file mode 100644
index 0000000..69ba37f
--- /dev/null
+++ b/tests/test_png_paeth.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+// Reference Paeth filter as per PNG spec
+static int ref_paeth(int a, int b, int c)
+{
+   int p = a + b - c;
+   int pa = abs(p-a);
+   int pb = abs(p-b);
+   int pc = abs(p-c);
+   if (pa <= pb && pa <= pc) return a;
+   if (pb <= pc) return b;
+   return c;
+}
+
+// Optimized Paeth filter
+static int opt_paeth(int a, int b, int c)
+{
+   int thresh = c*3 - (a + b);
+   int lo = a < b ? a : b;
+   int hi = a < b ? b : a;
+   int t0 = (hi <= thresh) ? lo : c;
+   int t1 = (thresh <= lo) ? hi : t0;
+   return t1;
+}
+
+int main()
+{
+   // Exhaustively test the functions match for all byte inputs a, b,c in [0,255]
+   for (int i = 0; i < (1 << 24); ++i) {
+      int a = i & 0xff;
+      int b = (i >> 8) & 0xff;
+      int c = (i >> 16) & 0xff;
+
+      int ref = ref_paeth(a, b, c);
+      int opt = opt_paeth(a, b, c);
+      if (ref != opt) {
+         fprintf(stderr, "mismatch at a=%3d b=%3d c=%3d: ref=%3d opt=%3d\n", a, b, c, ref, opt);
+         return 1;
+      }
+   }
+
+   printf("all ok!\n");
+   return 0;
+}
+
+// vim:sw=3:sts=3:et

From c6d7c32e5d2ab9f26c9ab6b02ab45870756257f0 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 2 May 2023 17:19:02 -0700
Subject: [PATCH 156/170] stb_image: Two warning fixes

---
 stb_image.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index c59df5b..6a6bfdd 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1072,8 +1072,8 @@ static int stbi__addints_valid(int a, int b)
    return a <= INT_MAX - b;
 }
 
-// returns 1 if the product of two signed shorts is valid, 0 on overflow.
-static int stbi__mul2shorts_valid(short a, short b)
+// returns 1 if the product of two ints fits in a signed short, 0 on overflow.
+static int stbi__mul2shorts_valid(int a, int b)
 {
    if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow
    if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid
@@ -3384,13 +3384,13 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
    return 1;
 }
 
-static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
+static stbi_uc stbi__skip_jpeg_junk_at_end(stbi__jpeg *j)
 {
    // some JPEGs have junk at end, skip over it but if we find what looks
    // like a valid marker, resume there
    while (!stbi__at_eof(j->s)) {
-      int x = stbi__get8(j->s);
-      while (x == 255) { // might be a marker
+      stbi_uc x = stbi__get8(j->s);
+      while (x == 0xff) { // might be a marker
          if (stbi__at_eof(j->s)) return STBI__MARKER_none;
          x = stbi__get8(j->s);
          if (x != 0x00 && x != 0xff) {

From 4da08a1dbd3cdf7b3165da120d1d64dad03ed157 Mon Sep 17 00:00:00 2001
From: Fabian Giesen <rygorous@gmail.com>
Date: Tue, 2 May 2023 17:14:45 -0700
Subject: [PATCH 157/170] stb_image: create_png_image_raw restructuring

Allocate filter_buf for two scan lines that we do all the filter
processing in, then do all other conversions (bit depth,
endianness, inserting alpha=255 values) on the way out.

Separating the two concerns makes everything much clearer.
---
 stb_image.h | 283 ++++++++++++++++++++++++----------------------------
 1 file changed, 131 insertions(+), 152 deletions(-)

diff --git a/stb_image.h b/stb_image.h
index 6a6bfdd..a6d33fb 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -4667,13 +4667,38 @@ static int stbi__paeth(int a, int b, int c)
 
 static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
 
+// adds an extra all-255 alpha channel
+// dest == src is legal
+// img_n must be 1 or 3
+static void stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x, int img_n)
+{
+   int i;
+   // must process data backwards since we allow dest==src
+   if (img_n == 1) {
+      for (i=x-1; i >= 0; --i) {
+         dest[i*2+1] = 255;
+         dest[i*2+0] = src[i];
+      }
+   } else {
+      STBI_ASSERT(img_n == 3);
+      for (i=x-1; i >= 0; --i) {
+         dest[i*4+3] = 255;
+         dest[i*4+2] = src[i*3+2];
+         dest[i*4+1] = src[i*3+1];
+         dest[i*4+0] = src[i*3+0];
+      }
+   }
+}
+
 // create the png data from post-deflated data
 static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
 {
-   int bytes = (depth == 16? 2 : 1);
+   int bytes = (depth == 16 ? 2 : 1);
    stbi__context *s = a->s;
    stbi__uint32 i,j,stride = x*out_n*bytes;
    stbi__uint32 img_len, img_width_bytes;
+   stbi_uc *filter_buf;
+   int all_ok = 1;
    int k;
    int img_n = s->img_n; // copy it into a local for later
 
@@ -4685,8 +4710,11 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
    a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
    if (!a->out) return stbi__err("outofmem", "Out of memory");
 
+   // note: error exits here don't need to clean up a->out individually,
+   // stbi__do_png always does on error.
    if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
    img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+   if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) return stbi__err("too large", "Corrupt PNG");
    img_len = (img_width_bytes + 1) * y;
 
    // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
@@ -4694,186 +4722,137 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
    // so just check for raw_len < img_len always.
    if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
 
+   // Allocate two scan lines worth of filter workspace buffer.
+   filter_buf = (stbi_uc *) stbi__malloc_mad2(img_width_bytes, 2, 0);
+   if (!filter_buf) return stbi__err("outofmem", "Out of memory");
+
+   // Filtering for low-bit-depth images
+   if (depth < 8) {
+      filter_bytes = 1;
+      width = img_width_bytes;
+   }
+
    for (j=0; j < y; ++j) {
-      stbi_uc *cur = a->out + stride*j;
-      stbi_uc *prior;
+      // cur/prior filter buffers alternate
+      stbi_uc *cur = filter_buf + (j & 1)*img_width_bytes;
+      stbi_uc *prior = filter_buf + (~j & 1)*img_width_bytes;
+      stbi_uc *dest = a->out + stride*j;
+      int nk = width * filter_bytes;
       int filter = *raw++;
 
-      if (filter > 4)
-         return stbi__err("invalid filter","Corrupt PNG");
-
-      if (depth < 8) {
-         if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG");
-         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
-         filter_bytes = 1;
-         width = img_width_bytes;
+      // check filter type
+      if (filter > 4) {
+         all_ok = stbi__err("invalid filter","Corrupt PNG");
+         break;
       }
-      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
 
       // if first row, use special filter that doesn't sample previous row
       if (j == 0) filter = first_row_filter[filter];
 
-      // handle first byte explicitly
-      for (k=0; k < filter_bytes; ++k) {
-         switch (filter) {
-            case STBI__F_none       : cur[k] = raw[k]; break;
-            case STBI__F_sub        : cur[k] = raw[k]; break;
-            case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
-            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; // prior[k] == stbi__paeth(0,prior[k],0)
-            case STBI__F_avg_first  : cur[k] = raw[k]; break;
-         }
+      // perform actual filtering
+      switch (filter) {
+      case STBI__F_none:
+         memcpy(cur, raw, nk);
+         break;
+      case STBI__F_sub:
+         memcpy(cur, raw, filter_bytes);
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]);
+         break;
+      case STBI__F_up:
+         for (k = 0; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+         break;
+      case STBI__F_avg:
+         for (k = 0; k < filter_bytes; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1));
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1));
+         break;
+      case STBI__F_paeth:
+         for (k = 0; k < filter_bytes; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0)
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes], prior[k], prior[k-filter_bytes]));
+         break;
+      case STBI__F_avg_first:
+         memcpy(cur, raw, filter_bytes);
+         for (k = filter_bytes; k < nk; ++k)
+            cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1));
+         break;
       }
 
-      if (depth == 8) {
-         if (img_n != out_n)
-            cur[img_n] = 255; // first pixel
-         raw += img_n;
-         cur += out_n;
-         prior += out_n;
-      } else if (depth == 16) {
-         if (img_n != out_n) {
-            cur[filter_bytes]   = 255; // first pixel top byte
-            cur[filter_bytes+1] = 255; // first pixel bottom byte
-         }
-         raw += filter_bytes;
-         cur += output_bytes;
-         prior += output_bytes;
-      } else {
-         raw += 1;
-         cur += 1;
-         prior += 1;
-      }
+      raw += nk;
 
-      // this is a little gross, so that we don't switch per-pixel or per-component
-      if (depth < 8 || img_n == out_n) {
-         int nk = (width - 1)*filter_bytes;
-         #define STBI__CASE(f) \
-             case f:     \
-                for (k=0; k < nk; ++k)
-         switch (filter) {
-            // "none" filter turns into a memcpy here; make that explicit.
-            case STBI__F_none:               memcpy(cur, raw, nk); break;
-            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
-            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
-            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
-            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
-            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
-         }
-         #undef STBI__CASE
-         raw += nk;
-      } else {
-         STBI_ASSERT(img_n+1 == out_n);
-         #define STBI__CASE(f) \
-             case f:     \
-                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
-                   for (k=0; k < filter_bytes; ++k)
-         switch (filter) {
-            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
-            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
-            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
-            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
-            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
-            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
-         }
-         #undef STBI__CASE
-
-         // the loop above sets the high byte of the pixels' alpha, but for
-         // 16 bit png files we also need the low byte set. we'll do that here.
-         if (depth == 16) {
-            cur = a->out + stride*j; // start at the beginning of the row again
-            for (i=0; i < x; ++i,cur+=output_bytes) {
-               cur[filter_bytes+1] = 255;
-            }
-         }
-      }
-   }
-
-   // we make a separate pass to expand bits to pixels; for performance,
-   // this could run two scanlines behind the above code, so it won't
-   // intefere with filtering but will still be in the cache.
-   if (depth < 8) {
-      for (j=0; j < y; ++j) {
-         stbi_uc *cur = a->out + stride*j;
-         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
-         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
-         // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+      // expand decoded bits in cur to dest, also adding an extra alpha channel if desired
+      if (depth < 8) {
          stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+         stbi_uc *in = cur;
+         stbi_uc *out = dest;
+         stbi_uc inb = 0;
+         stbi__uint32 nsmp = x*img_n;
 
-         // note that the final byte might overshoot and write more data than desired.
-         // we can allocate enough data that this never writes out of memory, but it
-         // could also overwrite the next scanline. can it overwrite non-empty data
-         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
-         // so we need to explicitly clamp the final ones
-
+         // expand bits to bytes first
          if (depth == 4) {
-            for (k=x*img_n; k >= 2; k-=2, ++in) {
-               *cur++ = scale * ((*in >> 4)       );
-               *cur++ = scale * ((*in     ) & 0x0f);
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 1) == 0) inb = *in++;
+               *out++ = scale * (inb >> 4);
+               inb <<= 4;
             }
-            if (k > 0) *cur++ = scale * ((*in >> 4)       );
          } else if (depth == 2) {
-            for (k=x*img_n; k >= 4; k-=4, ++in) {
-               *cur++ = scale * ((*in >> 6)       );
-               *cur++ = scale * ((*in >> 4) & 0x03);
-               *cur++ = scale * ((*in >> 2) & 0x03);
-               *cur++ = scale * ((*in     ) & 0x03);
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 3) == 0) inb = *in++;
+               *out++ = scale * (inb >> 6);
+               inb <<= 2;
             }
-            if (k > 0) *cur++ = scale * ((*in >> 6)       );
-            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
-            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
-         } else if (depth == 1) {
-            for (k=x*img_n; k >= 8; k-=8, ++in) {
-               *cur++ = scale * ((*in >> 7)       );
-               *cur++ = scale * ((*in >> 6) & 0x01);
-               *cur++ = scale * ((*in >> 5) & 0x01);
-               *cur++ = scale * ((*in >> 4) & 0x01);
-               *cur++ = scale * ((*in >> 3) & 0x01);
-               *cur++ = scale * ((*in >> 2) & 0x01);
-               *cur++ = scale * ((*in >> 1) & 0x01);
-               *cur++ = scale * ((*in     ) & 0x01);
+         } else {
+            STBI_ASSERT(depth == 1);
+            for (i=0; i < nsmp; ++i) {
+               if ((i & 7) == 0) inb = *in++;
+               *out++ = scale * (inb >> 7);
+               inb <<= 1;
             }
-            if (k > 0) *cur++ = scale * ((*in >> 7)       );
-            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
-            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
-            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
-            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
-            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
-            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
          }
-         if (img_n != out_n) {
-            int q;
-            // insert alpha = 255
-            cur = a->out + stride*j;
+
+         // insert alpha=255 values if desired
+         if (img_n != out_n)
+            stbi__create_png_alpha_expand8(dest, dest, x, img_n);
+      } else if (depth == 8) {
+         if (img_n == out_n)
+            memcpy(dest, cur, x*img_n);
+         else
+            stbi__create_png_alpha_expand8(dest, cur, x, img_n);
+      } else if (depth == 16) {
+         // convert the image data from big-endian to platform-native
+         stbi__uint16 *dest16 = (stbi__uint16*)dest;
+         stbi__uint32 nsmp = x*img_n;
+
+         if (img_n == out_n) {
+            for (i = 0; i < nsmp; ++i, ++dest16, cur += 2)
+               *dest16 = (cur[0] << 8) | cur[1];
+         } else {
+            STBI_ASSERT(img_n+1 == out_n);
             if (img_n == 1) {
-               for (q=x-1; q >= 0; --q) {
-                  cur[q*2+1] = 255;
-                  cur[q*2+0] = cur[q];
+               for (i = 0; i < x; ++i, dest16 += 2, cur += 2) {
+                  dest16[0] = (cur[0] << 8) | cur[1];
+                  dest16[1] = 0xffff;
                }
             } else {
                STBI_ASSERT(img_n == 3);
-               for (q=x-1; q >= 0; --q) {
-                  cur[q*4+3] = 255;
-                  cur[q*4+2] = cur[q*3+2];
-                  cur[q*4+1] = cur[q*3+1];
-                  cur[q*4+0] = cur[q*3+0];
+               for (i = 0; i < x; ++i, dest16 += 4, cur += 6) {
+                  dest16[0] = (cur[0] << 8) | cur[1];
+                  dest16[1] = (cur[2] << 8) | cur[3];
+                  dest16[2] = (cur[4] << 8) | cur[5];
+                  dest16[3] = 0xffff;
                }
             }
          }
       }
-   } else if (depth == 16) {
-      // force the image data from big-endian to platform-native.
-      // this is done in a separate pass due to the decoding relying
-      // on the data being untouched, but could probably be done
-      // per-line during decode if care is taken.
-      stbi_uc *cur = a->out;
-      stbi__uint16 *cur16 = (stbi__uint16*)cur;
-
-      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
-         *cur16 = (cur[0] << 8) | cur[1];
-      }
    }
 
+   STBI_FREE(filter_buf);
+   if (!all_ok) return 0;
+
    return 1;
 }
 

From 9d924f8a47649ce650fb6943d2394e563eae39c8 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Thu, 14 Dec 2023 03:09:00 -0800
Subject: [PATCH 158/170] stb_image_resize: 2.04

---
 stb_image_resize2.h | 72 +++++++++++++++++++++++++++++----------------
 1 file changed, 47 insertions(+), 25 deletions(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index e0c4282..4b246f3 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -1,4 +1,4 @@
-/* stb_image_resize2 - v2.01 - public domain image resizing
+/* stb_image_resize2 - v2.04 - public domain image resizing
    
    by Jeff Roberts (v2) and Jorge L Rodriguez 
    http://github.com/nothings/stb
@@ -328,9 +328,11 @@
       Nathan Reed: warning fixes for 1.0
 
    REVISIONS
-      2.00 (2022-02-20) mostly new source: new api, optimizations, simd, vertical-first, etc 
-                       (2x-5x faster without simd, 4x-12x faster with simd)
-                       (in some cases, 20x to 40x faster - resizing to very small for example)
+      2.04 (2023-11-17) Fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
+      2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
+      2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
+                          (2x-5x faster without simd, 4x-12x faster with simd)
+                          (in some cases, 20x to 40x faster - resizing to very small for example)
       0.96 (2019-03-04) fixed warnings
       0.95 (2017-07-23) fixed warnings
       0.94 (2017-03-18) fixed warnings
@@ -450,25 +452,33 @@ typedef uint64_t stbir_uint64;
 // for back compatibility, you can cast the old channel count to an stbir_pixel_layout
 typedef enum 
 {
-  STBIR_BGR      = 0,               // 3-chan, with order specified (for channel flipping)
   STBIR_1CHANNEL = 1,              
   STBIR_2CHANNEL = 2,
   STBIR_RGB      = 3,               // 3-chan, with order specified (for channel flipping) 
-  STBIR_RGBA     = 4,               // alpha formats, alpha is NOT premultiplied into color channels
-
+  STBIR_BGR      = 0,               // 3-chan, with order specified (for channel flipping)
   STBIR_4CHANNEL = 5,
+
+  STBIR_RGBA = 4,                   // alpha formats, where alpha is NOT premultiplied into color channels
   STBIR_BGRA = 6,
   STBIR_ARGB = 7,
   STBIR_ABGR = 8,
   STBIR_RA   = 9,
   STBIR_AR   = 10,
 
-  STBIR_RGBA_PM = 11,               // alpha formats, alpha is premultiplied into color channels
+  STBIR_RGBA_PM = 11,               // alpha formats, where alpha is premultiplied into color channels
   STBIR_BGRA_PM = 12,
   STBIR_ARGB_PM = 13,
   STBIR_ABGR_PM = 14,
   STBIR_RA_PM   = 15,
   STBIR_AR_PM   = 16,
+
+  STBIR_RGBA_NO_AW = 11,            // alpha formats, where NO alpha weighting is applied at all!
+  STBIR_BGRA_NO_AW = 12,            //   these are just synonyms for the _PM flags (which also do
+  STBIR_ARGB_NO_AW = 13,            //   no alpha weighting). These names just make it more clear
+  STBIR_ABGR_NO_AW = 14,            //   for some folks).
+  STBIR_RA_NO_AW   = 15,
+  STBIR_AR_NO_AW   = 16,
+
 } stbir_pixel_layout;
 
 //===============================================================
@@ -1172,6 +1182,10 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 #define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
 #endif
 
+#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 
+#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
+#endif
+
 // restrict pointers for the output pointers
 #if defined( _MSC_VER ) && !defined(__clang__)
   #define STBIR_STREAMOUT_PTR( star ) star __restrict
@@ -1549,7 +1563,6 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 
     #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 )
 
-    #define stbir__simdf8_load2( out, ptr ) (out) = _mm256_castsi256_ps(_mm256_castsi128_si256( _mm_loadl_epi64( (__m128i*)(ptr)) )) // top values can be random (not denormal or nan for perf)
     #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) )
 
     static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) };
@@ -1582,11 +1595,11 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
     #ifdef STBIR_USE_FMA           // not on by default to maintain bit identical simd to non-simd
     #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add )
     #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add )
-    #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( _mm256_castps128_ps256( mul ), _mm256_castps128_ps256( _mm_loadu_ps( (float const*)(ptr) ) ), add )
+    #define stbir__simdf8_madd_mem4( out, add, mul, ptr )(out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add )
     #else
     #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) )
     #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) )
-    #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_castps128_ps256( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) ) )
+    #define stbir__simdf8_madd_mem4( out, add, mul, ptr )  (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) )
     #endif
     #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val )
 
@@ -3697,7 +3710,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
     float * coeffs = coefficents + widest * ( num_contributors - 1 );
 
     // go until no chance of clipping (this is usually less than 8 lops)
-    while ( ( ( contribs->n0 + widest*2 ) >= row_width ) && ( contribs >= contributors ) )
+    while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_width ) )
     {
       // might we clip??
       if ( ( contribs->n0 + widest ) > row_width )
@@ -4652,10 +4665,10 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_madd( tot0, tot0, c, d ); }               
 
 #define stbir__store_output()                     \
-    { stbir__simdf t,c;                           \
+    { stbir__simdf t,d;                           \
     stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );    \
-    stbir__simdf_0123to2301( c, t );              \
-    stbir__simdf_add( t, t, c );                  \
+    stbir__simdf_0123to2301( d, t );              \
+    stbir__simdf_add( t, t, d );                  \
     stbir__simdf_store2( output, t );             \
     horizontal_coefficients += coefficient_width; \
     ++horizontal_contributors;                    \
@@ -7389,7 +7402,6 @@ static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layou
   resize->output_cb = 0;
   resize->user_data = resize;
   resize->samplers = 0;
-  resize->needs_rebuild = 1;
   resize->called_alloc = 0;
   resize->horizontal_filter = STBIR_FILTER_DEFAULT;
   resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0;
@@ -7403,6 +7415,7 @@ static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layou
   resize->output_data_type = data_type;
   resize->input_pixel_layout_public = pixel_layout;
   resize->output_pixel_layout_public = pixel_layout;
+  resize->needs_rebuild = 1;
 }
 
 STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, 
@@ -7428,17 +7441,27 @@ STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_t
 {
   resize->input_data_type = input_type;
   resize->output_data_type = output_type;
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) 
+    stbir__update_info_from_resize( resize->samplers, resize );
 }
 
 STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb )   // no callbacks by default
 {
   resize->input_cb = input_cb;
   resize->output_cb = output_cb;
+
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) 
+  {
+    resize->samplers->in_pixels_cb = input_cb;
+    resize->samplers->out_pixels_cb = output_cb;
+  }
 }
 
 STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data )                                     // pass back STBIR_RESIZE* by default
 {
   resize->user_data = user_data;
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) 
+    resize->samplers->user_data = user_data;
 }
 
 STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes )
@@ -7447,6 +7470,8 @@ STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_p
   resize->input_stride_in_bytes = input_stride_in_bytes;
   resize->output_pixels = output_pixels;
   resize->output_stride_in_bytes = output_stride_in_bytes;
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) 
+    stbir__update_info_from_resize( resize->samplers, resize );
 }
 
 
@@ -7593,9 +7618,9 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
   stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data );
   stbir__set_sampler(&vertical, resize->vertical_filter, resize->horizontal_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data );
 
-  if ( ( vertical.scale_info.output_sub_size / splits ) < 4 ) // each split should be a minimum of 4 scanlines (handwavey choice)
+  if ( ( vertical.scale_info.output_sub_size / splits ) < STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS ) // each split should be a minimum of 4 scanlines (handwavey choice)
   {
-    splits = vertical.scale_info.output_sub_size / 4;
+    splits = vertical.scale_info.output_sub_size / STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS;
     if ( splits == 0 ) splits = 1;
   }
 
@@ -7612,6 +7637,10 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
     #ifdef STBIR_PROFILE
       STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) );
     #endif
+
+    // update anything that can be changed without recalcing samplers
+    stbir__update_info_from_resize( out_info, resize );
+ 
     return splits;
   }
 
@@ -7680,10 +7709,6 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
     STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
   }
 
-
-  // update anything that can be changed without recalcing samplers
-  stbir__update_info_from_resize( resize->samplers, resize );
-
   // do resize
   result = stbir__perform_resize( resize->samplers, 0, resize->splits );
 
@@ -7712,9 +7737,6 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start
   if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
     return 0;
   
-  // update anything that can be changed without recalcing samplers
-  stbir__update_info_from_resize( resize->samplers, resize );
- 
   // do resize
   return stbir__perform_resize( resize->samplers, split_start, split_count );
 }

From 0ca75da4ec740f74d98bddbc79e0b264c564ad42 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Thu, 14 Dec 2023 03:10:47 -0800
Subject: [PATCH 159/170] stb_image_resize2: remove whitespace

---
 stb_image_resize2.h | 1138 +++++++++++++++++++++----------------------
 1 file changed, 569 insertions(+), 569 deletions(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index 4b246f3..faf1b08 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -1,9 +1,9 @@
 /* stb_image_resize2 - v2.04 - public domain image resizing
-   
-   by Jeff Roberts (v2) and Jorge L Rodriguez 
+
+   by Jeff Roberts (v2) and Jorge L Rodriguez
    http://github.com/nothings/stb
 
-   Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only 
+   Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only
    scaling and translation is supported, no rotations or shears.
 
    COMPILING & LINKING
@@ -67,60 +67,60 @@
    ADDITIONAL DOCUMENTATION
 
       MEMORY ALLOCATION
-         By default, we use malloc and free for memory allocation.  To override the 
+         By default, we use malloc and free for memory allocation.  To override the
          memory allocation, before the implementation #include, add a:
 
             #define STBIR_MALLOC(size,user_data) ...
             #define STBIR_FREE(ptr,user_data)   ...
 
-         Each resize makes exactly one call to malloc/free (unless you use the 
+         Each resize makes exactly one call to malloc/free (unless you use the
          extended API where you can do one allocation for many resizes). Under
          address sanitizer, we do separate allocations to find overread/writes.
 
       PERFORMANCE
          This library was written with an emphasis on performance. When testing
-         stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with 
+         stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with
          STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize
-         libs do by default). Also, make sure SIMD is turned on of course (default 
+         libs do by default). Also, make sure SIMD is turned on of course (default
          for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed.
 
          This library also comes with profiling built-in. If you define STBIR_PROFILE,
-         you can use the advanced API and get low-level profiling information by 
+         you can use the advanced API and get low-level profiling information by
          calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info()
          after a resize.
 
       SIMD
-         Most of the routines have optimized SSE2, AVX, NEON and WASM versions. 
+         Most of the routines have optimized SSE2, AVX, NEON and WASM versions.
 
-         On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and 
-         ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or 
+         On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and
+         ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or
          STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX
-         or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2 
+         or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2
          support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2.
 
          On Linux, SSE2 and Neon is on by default for 64-bit x64 or ARM64. For 32-bit,
          we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled
          on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both
-         clang and GCC, but GCC also requires an additional -mfp16-format=ieee to 
+         clang and GCC, but GCC also requires an additional -mfp16-format=ieee to
          automatically enable NEON.
 
          On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions
          for converting back and forth to half-floats. This is autoselected when we
-         are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses 
-         the built-in half float hardware NEON instructions. 
+         are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses
+         the built-in half float hardware NEON instructions.
 
-         You can also tell us to use multiply-add instructions with STBIR_USE_FMA. 
+         You can also tell us to use multiply-add instructions with STBIR_USE_FMA.
          Because x86 doesn't always have fma, we turn it off by default to maintain
          determinism across all platforms. If you don't care about non-FMA determinism
-         and are willing to restrict yourself to more recent x86 CPUs (around the AVX 
+         and are willing to restrict yourself to more recent x86 CPUs (around the AVX
          timeframe), then fma will give you around a 15% speedup.
 
          You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn
          off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10%
          to 40% faster, and AVX2 is generally another 12%.
-        
+
       ALPHA CHANNEL
-         Most of the resizing functions provide the ability to control how the alpha 
+         Most of the resizing functions provide the ability to control how the alpha
          channel of an image is processed.
 
          When alpha represents transparency, it is important that when combining
@@ -167,33 +167,33 @@
 
          stb_image_resize expects case #1 by default, applying alpha weighting to
          images, expecting the input images to be unpremultiplied. This is what the
-         COLOR+ALPHA buffer types tell the resizer to do. 
+         COLOR+ALPHA buffer types tell the resizer to do.
 
-         When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB, 
-         STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are 
-         non-premultiplied. In these cases, the resizer will alpha weight the colors 
-         (effectively creating the premultiplied image), do the filtering, and then 
+         When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB,
+         STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are
+         non-premultiplied. In these cases, the resizer will alpha weight the colors
+         (effectively creating the premultiplied image), do the filtering, and then
          convert back to non-premult on exit.
 
          When you use the pixel layouts STBIR_RGBA_PM, STBIR_RGBA_PM, STBIR_RGBA_PM,
-         STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels 
-         ARE premultiplied. In this case, the resizer doesn't have to do the 
-         premultipling - it can filter directly on the input. This about twice as 
-         fast as the non-premultiplied case, so it's the right option if your data is 
+         STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels
+         ARE premultiplied. In this case, the resizer doesn't have to do the
+         premultipling - it can filter directly on the input. This about twice as
+         fast as the non-premultiplied case, so it's the right option if your data is
          already setup correctly.
 
-         When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are 
-         telling us that there is no channel that represents transparency; it may be 
-         RGB and some unrelated fourth channel that has been stored in the alpha 
-         channel, but it is actually not alpha. No special processing will be 
-         performed. 
+         When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are
+         telling us that there is no channel that represents transparency; it may be
+         RGB and some unrelated fourth channel that has been stored in the alpha
+         channel, but it is actually not alpha. No special processing will be
+         performed.
 
-         The difference between the generic 4 or 2 channel layouts, and the 
+         The difference between the generic 4 or 2 channel layouts, and the
          specialized _PM versions is with the _PM versions you are telling us that
          the data *is* alpha, just don't premultiply it. That's important when
          using SRGB pixel formats, we need to know where the alpha is, because
          it is converted linearly (rather than with the SRGB converters).
-   
+
          Because alpha weighting produces the same effect as premultiplying, you
          even have the option with non-premultiplied inputs to let the resizer
          produce a premultiplied output. Because the intially computed alpha-weighted
@@ -201,10 +201,10 @@
          than the normal path which un-premultiplies the output image as a final step.
 
          Finally, when converting both in and out of non-premulitplied space (for
-         example, when using STBIR_RGBA), we go to somewhat heroic measures to 
-         ensure that areas with zero alpha value pixels get something reasonable 
-         in the RGB values. If you don't care about the RGB values of zero alpha 
-         pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality() 
+         example, when using STBIR_RGBA), we go to somewhat heroic measures to
+         ensure that areas with zero alpha value pixels get something reasonable
+         in the RGB values. If you don't care about the RGB values of zero alpha
+         pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality()
          function - this runs a premultiplied resize about 25% faster. That said,
          when you really care about speed, using premultiplied pixels for both in
          and out (STBIR_RGBA_PM, etc) much faster than both of these premultiplied
@@ -218,38 +218,38 @@
          layouts with the same number of channels.
 
       DETERMINISM
-         We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc). 
-         This requires compiling with fast-math off (using at least /fp:precise). 
+         We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc).
+         This requires compiling with fast-math off (using at least /fp:precise).
          Also, you must turn off fp-contracting (which turns mult+adds into fmas)!
-         We attempt to do this with pragmas, but with Clang, you usually want to add 
+         We attempt to do this with pragmas, but with Clang, you usually want to add
          -ffp-contract=off to the command line as well.
 
-         For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is, 
-         if the scalar x87 unit gets used at all, we immediately lose determinism. 
+         For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is,
+         if the scalar x87 unit gets used at all, we immediately lose determinism.
          On Microsoft Visual Studio 2008 and earlier, from what we can tell there is
-         no way to be deterministic in 32-bit x86 (some x87 always leaks in, even 
-         with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and 
+         no way to be deterministic in 32-bit x86 (some x87 always leaks in, even
+         with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and
          -fpmath=sse.
 
          Note that we will not be deterministic with float data containing NaNs -
-         the NaNs will propagate differently on different SIMD and platforms. 
+         the NaNs will propagate differently on different SIMD and platforms.
 
-         If you turn on STBIR_USE_FMA, then we will be deterministic with other 
-         fma targets, but we will differ from non-fma targets (this is unavoidable, 
-         because a fma isn't simply an add with a mult - it also introduces a 
-         rounding difference compared to non-fma instruction sequences. 
+         If you turn on STBIR_USE_FMA, then we will be deterministic with other
+         fma targets, but we will differ from non-fma targets (this is unavoidable,
+         because a fma isn't simply an add with a mult - it also introduces a
+         rounding difference compared to non-fma instruction sequences.
 
       FLOAT PIXEL FORMAT RANGE
-         Any range of values can be used for the non-alpha float data that you pass 
-         in (0 to 1, -1 to 1, whatever). However, if you are inputting float values 
-         but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we 
-         scale back properly. The alpha channel must also be 0 to 1 for any format 
-         that does premultiplication prior to resizing. 
+         Any range of values can be used for the non-alpha float data that you pass
+         in (0 to 1, -1 to 1, whatever). However, if you are inputting float values
+         but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we
+         scale back properly. The alpha channel must also be 0 to 1 for any format
+         that does premultiplication prior to resizing.
 
-         Note also that with float output, using filters with negative lobes, the 
-         output filtered values might go slightly out of range. You can define 
-         STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range 
-         to clamp to on output, if that's important. 
+         Note also that with float output, using filters with negative lobes, the
+         output filtered values might go slightly out of range. You can define
+         STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range
+         to clamp to on output, if that's important.
 
       MAX/MIN SCALE FACTORS
          The input pixel resolutions are in integers, and we do the internal pointer
@@ -263,13 +263,13 @@
          buffers).
 
       FLIPPED IMAGES
-         Stride is just the delta from one scanline to the next. This means you can 
-         use a negative stride to handle inverted images (point to the final 
+         Stride is just the delta from one scanline to the next. This means you can
+         use a negative stride to handle inverted images (point to the final
          scanline and use a negative stride). You can invert the input or output,
          using negative strides.
 
       DEFAULT FILTERS
-         For functions which don't provide explicit control over what filters to 
+         For functions which don't provide explicit control over what filters to
          use, you can change the compile-time defaults with:
 
             #define STBIR_DEFAULT_FILTER_UPSAMPLE     STBIR_FILTER_something
@@ -278,18 +278,18 @@
          See stbir_filter in the header-file section for the list of filters.
 
       NEW FILTERS
-         A number of 1D filter kernels are supplied. For a list of supported 
-         filters, see the stbir_filter enum. You can install your own filters by 
+         A number of 1D filter kernels are supplied. For a list of supported
+         filters, see the stbir_filter enum. You can install your own filters by
          using the stbir_set_filter_callbacks function.
 
       PROGRESS
-         For interactive use with slow resize operations, you can use the the 
-         scanline callbacks in the extended API. It would have to be a *very* large 
+         For interactive use with slow resize operations, you can use the the
+         scanline callbacks in the extended API. It would have to be a *very* large
          image resample to need progress though - we're very fast.
 
       CEIL and FLOOR
-         In scalar mode, the only functions we use from math.h are ceilf and floorf, 
-         but if you have your own versions, you can define the STBIR_CEILF(v) and 
+         In scalar mode, the only functions we use from math.h are ceilf and floorf,
+         but if you have your own versions, you can define the STBIR_CEILF(v) and
          STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use
          our own versions.
 
@@ -304,7 +304,7 @@
          * For SIMD encode and decode scanline routines, do any pre-aligning
            for bad input/output buffer alignments and pitch?
          * For very wide scanlines, we should we do vertical strips to stay within
-           L2 cache. Maybe do chunks of 1K pixels at a time. There would be 
+           L2 cache. Maybe do chunks of 1K pixels at a time. There would be
            some pixel reconversion, but probably dwarfed by things falling out
            of cache. Probably also something possible with alternating between
            scattering and gathering at high resize scales?
@@ -316,7 +316,7 @@
            the pivot cost and the extra memory touches). Need to buffer the whole
            image so have to balance memory use.
          * Most of our code is internally function pointers, should we compile
-           all the SIMD stuff always and dynamically dispatch? 
+           all the SIMD stuff always and dynamically dispatch?
 
    CONTRIBUTORS
       Jeff Roberts: 2.0 implementation, optimizations, SIMD
@@ -370,7 +370,7 @@ typedef uint64_t stbir_uint64;
 #define STBIR_SSE
 #endif
 #endif
-#endif 
+#endif
 
 #if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2)
   #ifndef STBIR_SSE2
@@ -385,7 +385,7 @@ typedef uint64_t stbir_uint64;
   #endif
   #if defined(__AVX2__) || defined(STBIR_AVX2)
     #ifndef STBIR_NO_AVX2
-      #ifndef STBIR_AVX2  
+      #ifndef STBIR_AVX2
         #define STBIR_AVX2
       #endif
       #if defined( _MSC_VER ) && !defined(__clang__)
@@ -410,7 +410,7 @@ typedef uint64_t stbir_uint64;
 
 #if defined(_M_ARM)
 #ifdef STBIR_USE_FMA
-#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC 
+#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC
 #endif
 #endif
 
@@ -437,7 +437,7 @@ typedef uint64_t stbir_uint64;
 //
 // Easy-to-use API:
 //
-//     * stride is the offset between successive rows of image data 
+//     * stride is the offset between successive rows of image data
 //        in memory, in bytes. specify 0 for packed continuously in memory
 //     * colorspace is linear or sRGB as specified by function name
 //     * Uses the default filters
@@ -450,11 +450,11 @@ typedef uint64_t stbir_uint64;
 //   order of channels
 //   whether color is premultiplied by alpha
 // for back compatibility, you can cast the old channel count to an stbir_pixel_layout
-typedef enum 
+typedef enum
 {
-  STBIR_1CHANNEL = 1,              
+  STBIR_1CHANNEL = 1,
   STBIR_2CHANNEL = 2,
-  STBIR_RGB      = 3,               // 3-chan, with order specified (for channel flipping) 
+  STBIR_RGB      = 3,               // 3-chan, with order specified (for channel flipping)
   STBIR_BGR      = 0,               // 3-chan, with order specified (for channel flipping)
   STBIR_4CHANNEL = 5,
 
@@ -559,8 +559,8 @@ STBIRDEF void *  stbir_resize( const void *input_pixels , int input_w , int inpu
 //     * Separate input and output data types
 //     * Can specify regions with subpixel correctness
 //     * Can specify alpha flags
-//     * Can specify a memory callback 
-//     * Can specify a callback data type for pixel input and output 
+//     * Can specify a memory callback
+//     * Can specify a callback data type for pixel input and output
 //     * Can be threaded for a single resize
 //     * Can be used to resize many frames without recalculating the sampler info
 //
@@ -587,7 +587,7 @@ typedef float stbir__kernel_callback( float x, float scale, void * user_data );
 typedef float stbir__support_callback( float scale, void * user_data );
 
 // internal structure with precomputed scaling
-typedef struct stbir__info stbir__info; 
+typedef struct stbir__info stbir__info;
 
 typedef struct STBIR_RESIZE  // use the stbir_resize_init and stbir_override functions to set these values for future compatibility
 {
@@ -614,7 +614,7 @@ typedef struct STBIR_RESIZE  // use the stbir_resize_init and stbir_override fun
   stbir_edge horizontal_edge, vertical_edge;
   stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support;
   stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support;
-  stbir__info * samplers;      
+  stbir__info * samplers;
 } STBIR_RESIZE;
 
 // extended complexity api
@@ -630,7 +630,7 @@ STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
 // You can update these parameters any time after resize_init and there is no cost
 //--------------------------------
 
-STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );         
+STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type );
 STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb );   // no callbacks by default
 STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data );                                               // pass back STBIR_RESIZE* by default
 STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes );
@@ -646,7 +646,7 @@ STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout
 STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge );       // CLAMP by default
 
 STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default
-STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support ); 
+STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support );
 
 STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh );        // sets both sub-regions (full regions by default)
 STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 );    // sets input sub-region (full region by default)
@@ -668,7 +668,7 @@ STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, i
 //--------------------------------
 
 // This builds the samplers and does one allocation
-STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ); 
+STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize );
 
 // You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits
 STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize );
@@ -691,7 +691,7 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize );
 //   It returns the number of splits (threads) that you can call it with.
 ///  It might be less if the image resize can't be split up that many ways.
 
-STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );             
+STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits );
 
 // This function does a split of the resizing (you call this fuction for each
 // split, on multiple threads). A split is a piece of the output resize pixel space.
@@ -701,10 +701,10 @@ STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_sp
 // Usually, you will always call stbir_resize_split with split_start as the thread_index
 //   and "1" for the split_count.
 // But, if you have a weird situation where you MIGHT want 8 threads, but sometimes
-//   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the 
+//   only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the
 //   split_count each time to turn in into a 4 thread resize. (This is unusual).
 
-STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );         
+STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count );
 //===============================================================
 
 
@@ -715,10 +715,10 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start
 //   The input callback is super flexible - it calls you with the input address
 //   (based on the stride and base pointer), it gives you an optional_output
 //   pointer that you can fill, or you can just return your own pointer into
-//   your own data. 
+//   your own data.
 //
-//   You can also do conversion from non-supported data types if necessary - in 
-//   this case, you ignore the input_ptr and just use the x and y parameters to 
+//   You can also do conversion from non-supported data types if necessary - in
+//   this case, you ignore the input_ptr and just use the x and y parameters to
 //   calculate your own input_ptr based on the size of each non-supported pixel.
 //   (Something like the third example below.)
 //
@@ -732,14 +732,14 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start
 //           return input_ptr;  // use buffer from call
 //        }
 //
-//     Next example, copying: (copy from some other buffer or stream):  
+//     Next example, copying: (copy from some other buffer or stream):
 //        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
 //        {
 //           CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes );
 //           return optional_output;  // return the optional buffer that we filled
 //        }
 //
-//     Third example, input another buffer without copying: (zero-copy from other buffer):  
+//     Third example, input another buffer without copying: (zero-copy from other buffer):
 //        void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context )
 //        {
 //           void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes );
@@ -768,7 +768,7 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start
 
 #ifdef STBIR_PROFILE
 
-typedef struct STBIR_PROFILE_INFO 
+typedef struct STBIR_PROFILE_INFO
 {
   stbir_uint64 total_clocks;
 
@@ -776,7 +776,7 @@ typedef struct STBIR_PROFILE_INFO
   //    there are "resize_count" number of zones
   stbir_uint64 clocks[ 8 ];
   char const ** descriptions;
-  
+
   // count of clocks and descriptions
   stbir_uint32 count;
 } STBIR_PROFILE_INFO;
@@ -875,15 +875,15 @@ STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, ST
 #endif
 
 // the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
-//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible 
-typedef enum 
+//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
+typedef enum
 {
   STBIRI_1CHANNEL = 0,
   STBIRI_2CHANNEL = 1,
   STBIRI_RGB      = 2,
   STBIRI_BGR      = 3,
   STBIRI_4CHANNEL = 4,
-  
+
   STBIRI_RGBA = 5,
   STBIRI_BGRA = 6,
   STBIRI_ARGB = 7,
@@ -989,7 +989,7 @@ typedef struct
   stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP
 } stbir__extents;
 
-typedef struct 
+typedef struct
 {
 #ifdef STBIR_PROFILE
   union
@@ -1020,7 +1020,7 @@ typedef struct
 
 typedef void stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input );
 typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels );
-typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, 
+typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer,
   stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width );
 typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels );
 typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode );
@@ -1063,10 +1063,10 @@ struct stbir__info
   stbir__horizontal_gather_channels_func * horizontal_gather_channels;
   stbir__alpha_unweight_func * alpha_unweight;
   stbir__encode_pixels_func * encode_pixels;
-  
+
   int alloced_total;
   int splits; // count of splits
-  
+
   stbir_internal_pixel_layout input_pixel_layout_internal;
   stbir_internal_pixel_layout output_pixel_layout_internal;
 
@@ -1151,7 +1151,7 @@ static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
   0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
   0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
 };
- 
+
 static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 {
   static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps
@@ -1182,7 +1182,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 #define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win.
 #endif
 
-#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 
+#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS
 #define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split?
 #endif
 
@@ -1237,7 +1237,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 
 #ifdef STBIR_SSE2
   #include <emmintrin.h>
-  
+
   #define stbir__simdf __m128
   #define stbir__simdi __m128i
 
@@ -1268,7 +1268,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
   #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) )
 
   #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 )
- 
+
   #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \
   { \
     stbir__simdi zero = _mm_setzero_si128(); \
@@ -1299,7 +1299,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
   #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps()))))
   #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))))
 
-  #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i) 
+  #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i)
   #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg )
   #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 )
   #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 )
@@ -1454,10 +1454,10 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 
     #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg )
     #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f)
-  
+
     #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) )
     #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) )
-    
+
     #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1)
 
     #ifdef STBIR_AVX2
@@ -1485,8 +1485,8 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
       out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \
     }
 
-    #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() ); 
-  
+    #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() );
+
     #define stbir__simdf8_pack_to_16words(out,aa,bb) \
       { \
         stbir__simdf8 af,bf; \
@@ -1510,7 +1510,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
       a = _mm_unpackhi_epi8( ireg, zero ); \
       out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \
     }
-  
+
     #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \
     { \
       stbir__simdi t; \
@@ -1528,7 +1528,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
       t = _mm_packus_epi16( t, t ); \
       out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \
     }
-  
+
     #define stbir__simdi8_expand_u16_to_u32(out,ireg) \
     { \
       stbir__simdi a,b,zero = _mm_setzero_si128(); \
@@ -1640,7 +1640,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
   }
 
 #elif defined(STBIR_NEON)
-  
+
   #include <arm_neon.h>
 
   #define stbir__simdf float32x4_t
@@ -1699,7 +1699,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 
   #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) )
   #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0)
-  #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0) 
+  #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0)
   #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0))
   #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0))
   #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) )
@@ -1755,7 +1755,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
     #endif
 
     #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) )
-  
+
     #define stbir__simdi_16madd( out, reg0, reg1 ) \
     { \
       int16x8_t r0 = vreinterpretq_s16_u32(reg0); \
@@ -1955,7 +1955,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 
   #define stbir__simdf_convert_float_to_i32( i, f )    (i) = wasm_i32x4_trunc_sat_f32x4(f)
   #define stbir__simdf_convert_float_to_int( f )       wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0)
-  #define stbir__simdi_to_int( i )                     wasm_i32x4_extract_lane(i, 0) 
+  #define stbir__simdi_to_int( i )                     wasm_i32x4_extract_lane(i, 0)
   #define stbir__simdf_convert_float_to_uint8( f )     ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0))
   #define stbir__simdf_convert_float_to_short( f )     ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0))
   #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg)
@@ -2181,7 +2181,7 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
     unsigned int sign_mask = 0x80000000u;
     stbir__FP16 o = { 0 };
     stbir__FP32 f;
-    unsigned int sign; 
+    unsigned int sign;
 
     f.f = val;
     sign = f.u & sign_mask;
@@ -2475,10 +2475,10 @@ static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in)
 #define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 )
 #define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 )
 #define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 )
-#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 ) 
-#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 ) 
-#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 ) 
-#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 ) 
+#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 )
+#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 )
+#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 )
+#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 )
 
 typedef union stbir__simdi_u32
 {
@@ -2513,7 +2513,7 @@ static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
 #endif
 
 // override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
-static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) 
+static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
 {
   char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest;
   char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes;
@@ -2526,7 +2526,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
   {
     if ( bytes < 16 )
     {
-      if ( bytes ) 
+      if ( bytes )
       {
         do
         {
@@ -2579,7 +2579,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
     for(;;)
     {
       STBIR_SIMD_NO_UNROLL(d);
-  
+
       if ( d > ( d_end - (16*stbir__simdfX_float_count) ) )
       {
         if ( d == d_end )
@@ -2603,7 +2603,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
 // memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
 //   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
 //   the diff between dest and src)
-static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) 
+static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
 {
   char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
   char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
@@ -2628,7 +2628,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
   do
   {
     STBIR_SIMD_NO_UNROLL(sd);
-    *(int*)( sd + ofs_to_dest ) = *(int*) sd; 
+    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
     sd += 4;
   } while ( sd < s_end );
 }
@@ -2637,7 +2637,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 
 // when in scalar mode, we let unrolling happen, so this macro just does the __restrict
 #define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star )
-#define STBIR_SIMD_NO_UNROLL(ptr) 
+#define STBIR_SIMD_NO_UNROLL(ptr)
 
 #endif // SSE2
 
@@ -2653,7 +2653,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 
 #else // non msvc
 
-  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC() 
+  static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC()
   {
     stbir_uint32 lo, hi;
     asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) );
@@ -2662,7 +2662,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 
 #endif  // msvc
 
-#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) 
+#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__)
 
 #if defined( _MSC_VER ) && !defined(__clang__)
 
@@ -2683,7 +2683,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 
 #error Unknown platform for profiling.
 
-#endif  //x64 and   
+#endif  //x64 and
 
 
 #define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info
@@ -2693,7 +2693,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 #define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info
 
 // super light-weight micro profiler
-#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded; 
+#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded;
 #define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; }
 #define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;i<STBIR__ARRAY_SIZE(info->profile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh );
 #define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;i<STBIR__ARRAY_SIZE((info)->profile.array);i++) (info)[extra].profile.array[i]=0; } }
@@ -2723,8 +2723,8 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 #define STBIR_PROFILE_FIRST_START( wh )
 #define STBIR_PROFILE_CLEAR_EXTRAS( )
 
-#define STBIR_PROFILE_BUILD_START( wh ) 
-#define STBIR_PROFILE_BUILD_END( wh ) 
+#define STBIR_PROFILE_BUILD_START( wh )
+#define STBIR_PROFILE_BUILD_END( wh )
 #define STBIR_PROFILE_BUILD_FIRST_START( wh )
 #define STBIR_PROFILE_BUILD_CLEAR( info )
 
@@ -2752,7 +2752,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
 // memcpy that is specically intentionally overlapping (src is smaller then dest, so can be
 //   a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to
 //   the diff between dest and src)
-static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) 
+static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes )
 {
   char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src;
   char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes;
@@ -2764,7 +2764,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
     do
     {
       STBIR_NO_UNROLL(sd);
-      *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd; 
+      *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd;
       sd += 8;
     } while ( sd < s_end8 );
 
@@ -2775,7 +2775,7 @@ static void stbir_overlapping_memcpy( void * dest, void const * src, size_t byte
   do
   {
     STBIR_NO_UNROLL(sd);
-    *(int*)( sd + ofs_to_dest ) = *(int*) sd; 
+    *(int*)( sd + ofs_to_dest ) = *(int*) sd;
     sd += 4;
   } while ( sd < s_end );
 }
@@ -2897,7 +2897,7 @@ static float stbir__support_one(float s, void * user_data)
   return 1;
 }
 
-static float stbir__support_two(float s, void * user_data) 
+static float stbir__support_two(float s, void * user_data)
 {
   STBIR__UNUSED(s);
   STBIR__UNUSED(user_data);
@@ -2916,7 +2916,7 @@ static int stbir__get_filter_pixel_width(stbir__support_callback * support, floa
     return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale);
 }
 
-// this is how many coefficents per run of the filter (which is different 
+// this is how many coefficents per run of the filter (which is different
 //   from the filter_pixel_width depending on if we are scattering or gathering)
 static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data)
 {
@@ -2937,7 +2937,7 @@ static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, vo
   }
 }
 
-static int stbir__get_contributors(stbir__sampler * samp, int is_gather)  
+static int stbir__get_contributors(stbir__sampler * samp, int is_gather)
 {
   if (is_gather)
       return samp->scale_info.output_sub_size;
@@ -2967,7 +2967,7 @@ static int stbir__edge_reflect_full( int n, int max )
 {
   if (n < 0)
   {
-    if (n > -max)    
+    if (n > -max)
       return -n;
     else
       return max - 1;
@@ -3069,7 +3069,7 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline
     left_margin = -min_n;
     min_n = 0;
   }
-  
+
   right_margin = 0;
   if ( max_n >= input_full_size )
   {
@@ -3094,7 +3094,7 @@ static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline
   // don't have to do edge calc for zero clamp
   if ( edge == STBIR_EDGE_ZERO )
     return;
-  
+
   // convert margin pixels to the pixels within the input (min and max)
   for( j = -left_margin ; j < 0 ; j++ )
   {
@@ -3192,8 +3192,8 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel
   float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius;
   float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius;
 
-  float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale; 
-  float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale; 
+  float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale;
+  float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale;
 
   first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f));
   last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f));
@@ -3205,7 +3205,7 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel
     if ( last >= (input_size*2))
       last = (input_size*2) - 1;
   }
-  
+
   *first_pixel = first;
   *last_pixel = last;
 }
@@ -3226,10 +3226,10 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
     int i;
     int last_non_zero;
     float out_pixel_center = (float)n + 0.5f;
-    float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;  
+    float in_center_of_out = (out_pixel_center + out_shift) * inv_scale;
 
     int in_first_pixel, in_last_pixel;
-    
+
     stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge );
 
     last_non_zero = -1;
@@ -3242,7 +3242,7 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
       if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) )
       {
         if ( i == 0 )  // if we're at the front, just eat zero contributors
-        { 
+        {
           STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib
           ++in_first_pixel;
           i--;
@@ -3252,10 +3252,10 @@ static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_
       }
       else
         last_non_zero = i;
-      
+
       coefficient_group[i] = coeff;
     }
-    
+
     in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros
     contributors->n0 = in_first_pixel;
     contributors->n1 = in_last_pixel;
@@ -3367,7 +3367,7 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int
         stbir__contributors * contribs = contributors + out;
 
         // is this the first time this output pixel has been seen?  Init it.
-        if ( out > first_out_inited ) 
+        if ( out > first_out_inited )
         {
           STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time
           first_out_inited = out;
@@ -3375,7 +3375,7 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int
           contribs->n1 = in_pixel;
           coeffs[0]  = coeff;
         }
-        else 
+        else
         {
           // insert on end (always in order)
           if ( coeffs[0] == 0.0f )  // if the first coefficent is zero, then zap it for this coeffs
@@ -3395,7 +3395,7 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int
 static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
 {
   int input_size = scale_info->input_full_size;
-  int input_last_n1 = input_size - 1; 
+  int input_last_n1 = input_size - 1;
   int n, end;
   int lowest = 0x7fffffff;
   int highest = -0x7fffffff;
@@ -3496,13 +3496,13 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
     else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) )
     {
       // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight
-      
+
       // right hand side first
       if ( contribs->n1 > input_last_n1 )
       {
         int start = contribs->n0;
         int endi = contribs->n1;
-        contribs->n1 = input_last_n1;  
+        contribs->n1 = input_last_n1;
         for( i = input_size; i <= endi; i++ )
           stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start] );
       }
@@ -3513,18 +3513,18 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
         int save_n0;
         float save_n0_coeff;
         float * c = coeffs - ( contribs->n0 + 1 );
-        
+
         // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist)
-        for( i = -1 ; i > contribs->n0 ; i-- ) 
+        for( i = -1 ; i > contribs->n0 ; i-- )
           stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c-- );
         save_n0 = contribs->n0;
         save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)!
 
         // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib
-        contribs->n0 = 0;  
+        contribs->n0 = 0;
         for(i = 0 ; i <= contribs->n1 ; i++ )
           coeffs[i] = coeffs[i-save_n0];
-        
+
         // now that we have shrunk down the contribs, we insert the first one safely
         stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff );
       }
@@ -3701,7 +3701,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
   coefficents[ widest * num_contributors ] = 8888.0f;
 
   // the minimum we might read for unrolled filters widths is 12. So, we need to
-  //   make sure we never read outside the decode buffer, by possibly moving 
+  //   make sure we never read outside the decode buffer, by possibly moving
   //   the sample area back into the scanline, and putting zeros weights first.
   // we start on the right edge and check until we're well past the possible
   //   clip area (2*widest).
@@ -3716,7 +3716,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
       if ( ( contribs->n0 + widest ) > row_width )
       {
         int stop_range = widest;
-      
+
         // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length
         //   of this contrib n1, instead of a fixed widest amount - so calculate this
         if ( widest > 12 )
@@ -3725,7 +3725,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
 
           // how far will be read in the n_coeff loop (which depends on the widest count mod4);
           mod = widest & 3;
-          stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; 
+          stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
 
           // the n_coeff loops do a minimum amount of coeffs, so factor that in!
           if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
@@ -3759,7 +3759,7 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
 
             // how far will be read in the n_coeff loop (which depends on the widest count mod4);
             mod = widest & 3;
-            stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; 
+            stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod;
 
             // the n_coeff loops do a minimum amount of coeffs, so factor that in!
             if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod;
@@ -3787,7 +3787,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
   int input_full_size = samp->scale_info.input_full_size;
   int gather_num_contributors = samp->num_contributors;
   stbir__contributors* gather_contributors = samp->contributors;
-  float * gather_coeffs = samp->coefficients; 
+  float * gather_coeffs = samp->coefficients;
   int gather_coefficient_width = samp->coefficient_width;
 
   switch ( samp->is_gather )
@@ -3805,16 +3805,16 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
     break;
 
     case 0: // scatter downsample (only on vertical)
-    case 2: // gather downsample  
+    case 2: // gather downsample
     {
       float in_pixels_radius = support(scale,user_data) * inv_scale;
       int filter_pixel_margin = samp->filter_pixel_margin;
       int input_end = input_full_size + filter_pixel_margin;
-      
+
       // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after
       if ( !samp->is_gather )
       {
-        // check if we are using the same gather downsample on the horizontal as this vertical, 
+        // check if we are using the same gather downsample on the horizontal as this vertical,
         //   if so, then we don't have to generate them, we can just pivot from the horizontal.
         if ( other_axis_for_pivot )
         {
@@ -3859,7 +3859,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
           float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width;
           float * g_coeffs = gather_coeffs;
           scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin );
-          
+
           for (k = gn0 ; k <= gn1 ; k++ )
           {
             float gc = *g_coeffs++;
@@ -3870,7 +3870,7 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
                 stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
                 while ( clear_contributors < scatter_contributors )
                 {
-                  clear_contributors->n0 = 0; 
+                  clear_contributors->n0 = 0;
                   clear_contributors->n1 = -1;
                   ++clear_contributors;
                 }
@@ -3921,11 +3921,11 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
 
 #define stbir__decode_suffix BGRA
 #define stbir__decode_swizzle
-#define stbir__decode_order0  2 
+#define stbir__decode_order0  2
 #define stbir__decode_order1  1
 #define stbir__decode_order2  0
 #define stbir__decode_order3  3
-#define stbir__encode_order0  2 
+#define stbir__encode_order0  2
 #define stbir__encode_order1  1
 #define stbir__encode_order2  0
 #define stbir__encode_order3  3
@@ -3935,11 +3935,11 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
 
 #define stbir__decode_suffix ARGB
 #define stbir__decode_swizzle
-#define stbir__decode_order0  1 
+#define stbir__decode_order0  1
 #define stbir__decode_order1  2
 #define stbir__decode_order2  3
 #define stbir__decode_order3  0
-#define stbir__encode_order0  3 
+#define stbir__encode_order0  3
 #define stbir__encode_order1  0
 #define stbir__encode_order2  1
 #define stbir__encode_order3  2
@@ -3949,11 +3949,11 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
 
 #define stbir__decode_suffix ABGR
 #define stbir__decode_swizzle
-#define stbir__decode_order0  3 
+#define stbir__decode_order0  3
 #define stbir__decode_order1  2
 #define stbir__decode_order2  1
 #define stbir__decode_order3  0
-#define stbir__encode_order0  3 
+#define stbir__encode_order0  3
 #define stbir__encode_order1  2
 #define stbir__encode_order2  1
 #define stbir__encode_order3  0
@@ -3963,12 +3963,12 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
 
 #define stbir__decode_suffix AR
 #define stbir__decode_swizzle
-#define stbir__decode_order0  1 
-#define stbir__decode_order1  0 
+#define stbir__decode_order0  1
+#define stbir__decode_order1  0
 #define stbir__decode_order2  3
 #define stbir__decode_order3  2
-#define stbir__encode_order0  1 
-#define stbir__encode_order1  0 
+#define stbir__encode_order0  1
+#define stbir__encode_order1  0
 #define stbir__encode_order2  3
 #define stbir__encode_order3  2
 #define stbir__coder_min_num 2
@@ -3986,7 +3986,7 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c
   // fancy alpha is stored internally as R G B A Rpm Gpm Bpm
 
   #ifdef STBIR_SIMD
-  
+
   #ifdef STBIR_SIMD8
   decode += 16;
   while ( decode <= end_decode )
@@ -4011,7 +4011,7 @@ static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_c
     out += 28;
   }
   decode -= 16;
-  #else  
+  #else
   decode += 8;
   while ( decode <= end_decode )
   {
@@ -4090,11 +4090,11 @@ static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_c
       stbir__simdf8_0123to11331133( p0, d0 );
       stbir__simdf8_0123to00220022( a0, d0 );
       stbir__simdf8_mult( p0, p0, a0 );
- 
+
       stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) );
       stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) );
       stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) );
-      
+
       stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) );
       stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) );
       stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) );
@@ -4382,7 +4382,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
   // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed
   STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) );
 
-  do 
+  do
   {
     float * decode_buffer;
     void const * input_data;
@@ -4390,7 +4390,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     int width_times_channels;
     int width;
 
-    if ( spans->n1 < spans->n0 )    
+    if ( spans->n1 < spans->n0 )
       break;
 
     width = spans->n1 + 1 - spans->n0;
@@ -4407,7 +4407,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
       // call the callback with a temp buffer (that they can choose to use or not).  the temp is just right aligned memory in the decode_buffer itself
       input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data );
     }
-    
+
     STBIR_PROFILE_START( decode );
     // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channels<effective_channels, we are right justified in the buffer)
     stbir_info->decode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data );
@@ -4431,7 +4431,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     // this code only runs if we're in edge_wrap, and we're doing the entire scanline
     int e, start_x[2];
     int input_full_size = stbir_info->horizontal.scale_info.input_full_size;
-    
+
     start_x[0] = -stbir_info->scanline_extents.edge_sizes[0];  // left edge start x
     start_x[1] =  input_full_size;                             // right edge
 
@@ -4460,7 +4460,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf tot,c;                \
     STBIR_SIMD_NO_UNROLL(decode);      \
     stbir__simdf_load1( c, hc );       \
-    stbir__simdf_mult1_mem( tot, c, decode ); 
+    stbir__simdf_mult1_mem( tot, c, decode );
 
 #define stbir__2_coeff_only()          \
     stbir__simdf tot,c,d;              \
@@ -4469,7 +4469,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_load2( d, decode );   \
     stbir__simdf_mult( tot, c, d );    \
     stbir__simdf_0123to1230( c, tot ); \
-    stbir__simdf_add1( tot, tot, c );          
+    stbir__simdf_add1( tot, tot, c );
 
 #define stbir__3_coeff_only()                  \
     stbir__simdf tot,c,t;                      \
@@ -4479,7 +4479,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to1230( c, tot );         \
     stbir__simdf_0123to2301( t, tot );         \
     stbir__simdf_add1( tot, tot, c );          \
-    stbir__simdf_add1( tot, tot, t );    
+    stbir__simdf_add1( tot, tot, t );
 
 #define stbir__store_output_tiny()                \
     stbir__simdf_store1( output, tot );           \
@@ -4496,7 +4496,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
 #define stbir__4_coeff_continue_from_4( ofs )  \
     STBIR_SIMD_NO_UNROLL(decode);              \
     stbir__simdf_load( c, hc + (ofs) );        \
-    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) ); 
+    stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) );
 
 #define stbir__1_coeff_remnant( ofs )          \
     { stbir__simdf d;                          \
@@ -4508,7 +4508,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     { stbir__simdf d;                          \
     stbir__simdf_load2z( c, hc+(ofs) );        \
     stbir__simdf_load2( d, decode+(ofs) );     \
-    stbir__simdf_madd( tot, tot, d, c ); }   
+    stbir__simdf_madd( tot, tot, d, c ); }
 
 #define stbir__3_coeff_setup()                 \
     stbir__simdf mask;                         \
@@ -4533,18 +4533,18 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
 
 #define stbir__1_coeff_only()  \
     float tot;                 \
-    tot = decode[0]*hc[0];     
+    tot = decode[0]*hc[0];
 
 #define stbir__2_coeff_only()  \
     float tot;                 \
     tot = decode[0] * hc[0];   \
-    tot += decode[1] * hc[1];    
+    tot += decode[1] * hc[1];
 
 #define stbir__3_coeff_only()  \
     float tot;                 \
     tot = decode[0] * hc[0];   \
     tot += decode[1] * hc[1];  \
-    tot += decode[2] * hc[2];    
+    tot += decode[2] * hc[2];
 
 #define stbir__store_output_tiny()                \
     output[0] = tot;                              \
@@ -4557,16 +4557,16 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     tot0 = decode[0] * hc[0];   \
     tot1 = decode[1] * hc[1];   \
     tot2 = decode[2] * hc[2];   \
-    tot3 = decode[3] * hc[3];     
+    tot3 = decode[3] * hc[3];
 
 #define stbir__4_coeff_continue_from_4( ofs )  \
     tot0 += decode[0+(ofs)] * hc[0+(ofs)];     \
     tot1 += decode[1+(ofs)] * hc[1+(ofs)];     \
     tot2 += decode[2+(ofs)] * hc[2+(ofs)];     \
-    tot3 += decode[3+(ofs)] * hc[3+(ofs)];     
+    tot3 += decode[3+(ofs)] * hc[3+(ofs)];
 
 #define stbir__1_coeff_remnant( ofs )        \
-    tot0 += decode[0+(ofs)] * hc[0+(ofs)];   
+    tot0 += decode[0+(ofs)] * hc[0+(ofs)];
 
 #define stbir__2_coeff_remnant( ofs )        \
     tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
@@ -4575,7 +4575,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
 #define stbir__3_coeff_remnant( ofs )        \
     tot0 += decode[0+(ofs)] * hc[0+(ofs)];   \
     tot1 += decode[1+(ofs)] * hc[1+(ofs)];   \
-    tot2 += decode[2+(ofs)] * hc[2+(ofs)];   
+    tot2 += decode[2+(ofs)] * hc[2+(ofs)];
 
 #define stbir__store_output()                     \
     output[0] = (tot0+tot2)+(tot1+tot3);          \
@@ -4583,7 +4583,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     ++horizontal_contributors;                    \
     output += 1;
 
-#endif  
+#endif
 
 #define STBIR__horizontal_channels 1
 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
@@ -4601,14 +4601,14 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_load1z( c, hc );     \
     stbir__simdf_0123to0011( c, c );  \
     stbir__simdf_load2( d, decode );  \
-    stbir__simdf_mult( tot, d, c ); 
+    stbir__simdf_mult( tot, d, c );
 
 #define stbir__2_coeff_only()         \
     stbir__simdf tot,c;               \
     STBIR_SIMD_NO_UNROLL(decode);     \
     stbir__simdf_load2( c, hc );      \
     stbir__simdf_0123to0011( c, c );  \
-    stbir__simdf_mult_mem( tot, c, decode ); 
+    stbir__simdf_mult_mem( tot, c, decode );
 
 #define stbir__3_coeff_only()                \
     stbir__simdf tot,c,cs,d;                 \
@@ -4618,7 +4618,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_mult_mem( tot, c, decode ); \
     stbir__simdf_0123to2222( c, cs );        \
     stbir__simdf_load2z( d, decode+4 );      \
-    stbir__simdf_madd( tot, tot, d, c );   
+    stbir__simdf_madd( tot, tot, d, c );
 
 #define stbir__store_output_tiny()                \
     stbir__simdf_0123to2301( c, tot );            \
@@ -4641,7 +4641,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     STBIR_SIMD_NO_UNROLL(decode);                    \
     stbir__simdf8_load4b( cs, hc + (ofs) );          \
     stbir__simdf8_0123to00112233( c, cs );           \
-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); 
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
 
 #define stbir__1_coeff_remnant( ofs )                \
     { stbir__simdf t;                                \
@@ -4662,7 +4662,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_load4b( cs, hc + (ofs) );          \
     stbir__simdf8_0123to00112233( c, cs );           \
     stbir__simdf8_load6z( d, decode+(ofs)*2 );       \
-    stbir__simdf8_madd( tot0, tot0, c, d ); }               
+    stbir__simdf8_madd( tot0, tot0, c, d ); }
 
 #define stbir__store_output()                     \
     { stbir__simdf t,d;                           \
@@ -4683,7 +4683,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to0011( c, cs );            \
     stbir__simdf_mult_mem( tot0, c, decode );    \
     stbir__simdf_0123to2233( c, cs );            \
-    stbir__simdf_mult_mem( tot1, c, decode+4 );   
+    stbir__simdf_mult_mem( tot1, c, decode+4 );
 
 #define stbir__4_coeff_continue_from_4( ofs )                \
     STBIR_SIMD_NO_UNROLL(decode);                            \
@@ -4691,7 +4691,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to0011( c, cs );                        \
     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );  \
     stbir__simdf_0123to2233( c, cs );                        \
-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );   
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 );
 
 #define stbir__1_coeff_remnant( ofs )            \
     { stbir__simdf d;                            \
@@ -4703,7 +4703,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
 #define stbir__2_coeff_remnant( ofs )                      \
     stbir__simdf_load2( cs, hc + (ofs) );                  \
     stbir__simdf_0123to0011( c, cs );                      \
-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );       
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 );
 
 #define stbir__3_coeff_remnant( ofs )                       \
     { stbir__simdf d;                                       \
@@ -4712,7 +4712,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \
     stbir__simdf_0123to2222( c, cs );                       \
     stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 );       \
-    stbir__simdf_madd( tot1, tot1, d, c ); }  
+    stbir__simdf_madd( tot1, tot1, d, c ); }
 
 #define stbir__store_output()                     \
     stbir__simdf_add( tot0, tot0, tot1 );         \
@@ -4731,7 +4731,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     float tota,totb,c;         \
     c = hc[0];                 \
     tota = decode[0]*c;        \
-    totb = decode[1]*c;     
+    totb = decode[1]*c;
 
 #define stbir__2_coeff_only()  \
     float tota,totb,c;         \
@@ -4740,7 +4740,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     totb = decode[1]*c;        \
     c = hc[1];                 \
     tota += decode[2]*c;       \
-    totb += decode[3]*c;     
+    totb += decode[3]*c;
 
 // this weird order of add matches the simd
 #define stbir__3_coeff_only()  \
@@ -4753,7 +4753,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     totb += decode[5]*c;       \
     c = hc[1];                 \
     tota += decode[2]*c;       \
-    totb += decode[3]*c;     
+    totb += decode[3]*c;
 
 #define stbir__store_output_tiny()                \
     output[0] = tota;                             \
@@ -4775,7 +4775,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     totb2 = decode[5]*c;            \
     c = hc[3];                      \
     tota3 = decode[6]*c;            \
-    totb3 = decode[7]*c;     
+    totb3 = decode[7]*c;
 
 #define stbir__4_coeff_continue_from_4( ofs )  \
     c = hc[0+(ofs)];                           \
@@ -4789,12 +4789,12 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     totb2 += decode[5+(ofs)*2]*c;              \
     c = hc[3+(ofs)];                           \
     tota3 += decode[6+(ofs)*2]*c;              \
-    totb3 += decode[7+(ofs)*2]*c;     
+    totb3 += decode[7+(ofs)*2]*c;
 
 #define stbir__1_coeff_remnant( ofs )  \
     c = hc[0+(ofs)];                   \
     tota0 += decode[0+(ofs)*2] * c;    \
-    totb0 += decode[1+(ofs)*2] * c;   
+    totb0 += decode[1+(ofs)*2] * c;
 
 #define stbir__2_coeff_remnant( ofs )  \
     c = hc[0+(ofs)];                   \
@@ -4802,7 +4802,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     totb0 += decode[1+(ofs)*2] * c;    \
     c = hc[1+(ofs)];                   \
     tota1 += decode[2+(ofs)*2] * c;    \
-    totb1 += decode[3+(ofs)*2] * c;   
+    totb1 += decode[3+(ofs)*2] * c;
 
 #define stbir__3_coeff_remnant( ofs )  \
     c = hc[0+(ofs)];                   \
@@ -4813,7 +4813,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     totb1 += decode[3+(ofs)*2] * c;    \
     c = hc[2+(ofs)];                   \
     tota2 += decode[4+(ofs)*2] * c;    \
-    totb2 += decode[5+(ofs)*2] * c;    
+    totb2 += decode[5+(ofs)*2] * c;
 
 #define stbir__store_output()                     \
     output[0] = (tota0+tota2)+(tota1+tota3);      \
@@ -4822,7 +4822,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     ++horizontal_contributors;                    \
     output += 2;
 
-#endif  
+#endif
 
 #define STBIR__horizontal_channels 2
 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
@@ -4840,7 +4840,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_load1z( c, hc );     \
     stbir__simdf_0123to0001( c, c );  \
     stbir__simdf_load( d, decode );   \
-    stbir__simdf_mult( tot, d, c ); 
+    stbir__simdf_mult( tot, d, c );
 
 #define stbir__2_coeff_only()         \
     stbir__simdf tot,c,cs,d;          \
@@ -4851,7 +4851,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_mult( tot, d, c );   \
     stbir__simdf_0123to1111( c, cs ); \
     stbir__simdf_load( d, decode+3 ); \
-    stbir__simdf_madd( tot, tot, d, c ); 
+    stbir__simdf_madd( tot, tot, d, c );
 
 #define stbir__3_coeff_only()            \
     stbir__simdf tot,c,d,cs;             \
@@ -4865,7 +4865,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd( tot, tot, d, c ); \
     stbir__simdf_0123to2222( c, cs );    \
     stbir__simdf_load( d, decode+6 );    \
-    stbir__simdf_madd( tot, tot, d, c ); 
+    stbir__simdf_madd( tot, tot, d, c );
 
 #define stbir__store_output_tiny()                \
     stbir__simdf_store2( output, tot );           \
@@ -4885,7 +4885,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_0123to00001111( c, cs );         \
     stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \
     stbir__simdf8_0123to22223333( c, cs );         \
-    stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );    
+    stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 );
 
 #define stbir__4_coeff_continue_from_4( ofs )      \
     STBIR_SIMD_NO_UNROLL(decode);                  \
@@ -4893,26 +4893,26 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_0123to00001111( c, cs );         \
     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
     stbir__simdf8_0123to22223333( c, cs );         \
-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );    
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 );
 
 #define stbir__1_coeff_remnant( ofs )                          \
     STBIR_SIMD_NO_UNROLL(decode);                              \
     stbir__simdf_load1rep4( t, hc + (ofs) );                   \
-    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 ); 
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 );
 
 #define stbir__2_coeff_remnant( ofs )                          \
     STBIR_SIMD_NO_UNROLL(decode);                              \
     stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
     stbir__simdf8_0123to22223333( c, cs );                     \
-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );   
- 
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 );
+
  #define stbir__3_coeff_remnant( ofs )                           \
     STBIR_SIMD_NO_UNROLL(decode);                                \
     stbir__simdf8_load4b( cs, hc + (ofs) );                      \
     stbir__simdf8_0123to00001111( c, cs );                       \
     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \
     stbir__simdf8_0123to2222( t, cs );                           \
-    stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 ); 
+    stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 );
 
 #define stbir__store_output()                       \
     stbir__simdf8_add( tot0, tot0, tot1 );          \
@@ -4943,7 +4943,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to1122( c, cs );           \
     stbir__simdf_mult_mem( tot1, c, decode+4 ); \
     stbir__simdf_0123to2333( c, cs );           \
-    stbir__simdf_mult_mem( tot2, c, decode+8 ); 
+    stbir__simdf_mult_mem( tot2, c, decode+8 );
 
 #define stbir__4_coeff_continue_from_4( ofs )                 \
     STBIR_SIMD_NO_UNROLL(decode);                             \
@@ -4953,13 +4953,13 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to1122( c, cs );                         \
     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
     stbir__simdf_0123to2333( c, cs );                         \
-    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );   
+    stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 );
 
 #define stbir__1_coeff_remnant( ofs )         \
     STBIR_SIMD_NO_UNROLL(decode);             \
     stbir__simdf_load1z( c, hc + (ofs) );     \
     stbir__simdf_0123to0001( c, c );          \
-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );   
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 );
 
 #define stbir__2_coeff_remnant( ofs )                       \
     { stbir__simdf d;                                       \
@@ -4969,7 +4969,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \
     stbir__simdf_0123to1122( c, cs );                       \
     stbir__simdf_load2z( d, decode+(ofs)*3+4 );             \
-    stbir__simdf_madd( tot1, tot1, c, d ); }                 
+    stbir__simdf_madd( tot1, tot1, c, d ); }
 
 #define stbir__3_coeff_remnant( ofs )                         \
     { stbir__simdf d;                                         \
@@ -4981,7 +4981,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \
     stbir__simdf_0123to2222( c, cs );                         \
     stbir__simdf_load1z( d, decode+(ofs)*3+8 );               \
-    stbir__simdf_madd( tot2, tot2, c, d );  }                
+    stbir__simdf_madd( tot2, tot2, c, d );  }
 
 #define stbir__store_output()                       \
     stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 );   \
@@ -5012,7 +5012,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     c = hc[0];                 \
     tot0 = decode[0]*c;        \
     tot1 = decode[1]*c;        \
-    tot2 = decode[2]*c;              
+    tot2 = decode[2]*c;
 
 #define stbir__2_coeff_only()  \
     float tot0, tot1, tot2, c; \
@@ -5023,7 +5023,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     c = hc[1];                 \
     tot0 += decode[3]*c;       \
     tot1 += decode[4]*c;       \
-    tot2 += decode[5]*c;              
+    tot2 += decode[5]*c;
 
 #define stbir__3_coeff_only()  \
     float tot0, tot1, tot2, c; \
@@ -5038,7 +5038,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     c = hc[2];                 \
     tot0 += decode[6]*c;       \
     tot1 += decode[7]*c;       \
-    tot2 += decode[8]*c;              
+    tot2 += decode[8]*c;
 
 #define stbir__store_output_tiny()                \
     output[0] = tot0;                             \
@@ -5065,7 +5065,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     c = hc[3];                      \
     totd0 = decode[9]*c;            \
     totd1 = decode[10]*c;           \
-    totd2 = decode[11]*c;            
+    totd2 = decode[11]*c;
 
 #define stbir__4_coeff_continue_from_4( ofs )  \
     c = hc[0+(ofs)];                           \
@@ -5083,7 +5083,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     c = hc[3+(ofs)];                           \
     totd0 += decode[9+(ofs)*3]*c;              \
     totd1 += decode[10+(ofs)*3]*c;             \
-    totd2 += decode[11+(ofs)*3]*c;              
+    totd2 += decode[11+(ofs)*3]*c;
 
 #define stbir__1_coeff_remnant( ofs )  \
     c = hc[0+(ofs)];                   \
@@ -5113,7 +5113,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     c = hc[2+(ofs)];                   \
     totc0 += decode[6+(ofs)*3]*c;      \
     totc1 += decode[7+(ofs)*3]*c;      \
-    totc2 += decode[8+(ofs)*3]*c;              
+    totc2 += decode[8+(ofs)*3]*c;
 
 #define stbir__store_output()                     \
     output[0] = (tota0+totc0)+(totb0+totd0);      \
@@ -5123,7 +5123,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     ++horizontal_contributors;                    \
     output += 3;
 
-#endif  
+#endif
 
 #define STBIR__horizontal_channels 3
 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
@@ -5139,7 +5139,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     STBIR_SIMD_NO_UNROLL(decode);         \
     stbir__simdf_load1( c, hc );          \
     stbir__simdf_0123to0000( c, c );      \
-    stbir__simdf_mult_mem( tot, c, decode ); 
+    stbir__simdf_mult_mem( tot, c, decode );
 
 #define stbir__2_coeff_only()                       \
     stbir__simdf tot,c,cs;                          \
@@ -5148,7 +5148,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to0000( c, cs );               \
     stbir__simdf_mult_mem( tot, c, decode );        \
     stbir__simdf_0123to1111( c, cs );               \
-    stbir__simdf_madd_mem( tot, tot, c, decode+4 ); 
+    stbir__simdf_madd_mem( tot, tot, c, decode+4 );
 
 #define stbir__3_coeff_only()                       \
     stbir__simdf tot,c,cs;                          \
@@ -5159,7 +5159,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to1111( c, cs );               \
     stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \
     stbir__simdf_0123to2222( c, cs );               \
-    stbir__simdf_madd_mem( tot, tot, c, decode+8 ); 
+    stbir__simdf_madd_mem( tot, tot, c, decode+8 );
 
 #define stbir__store_output_tiny()                \
     stbir__simdf_store( output, tot );            \
@@ -5176,7 +5176,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_0123to00001111( c, cs );         \
     stbir__simdf8_mult_mem( tot0, c, decode );     \
     stbir__simdf8_0123to22223333( c, cs );         \
-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );    
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 );
 
 #define stbir__4_coeff_continue_from_4( ofs )                  \
     STBIR_SIMD_NO_UNROLL(decode);                              \
@@ -5184,26 +5184,26 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_0123to00001111( c, cs );                     \
     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
     stbir__simdf8_0123to22223333( c, cs );                     \
-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );    
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );
 
 #define stbir__1_coeff_remnant( ofs )                          \
     STBIR_SIMD_NO_UNROLL(decode);                              \
     stbir__simdf_load1rep4( t, hc + (ofs) );                   \
-    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 ); 
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 );
 
 #define stbir__2_coeff_remnant( ofs )                          \
     STBIR_SIMD_NO_UNROLL(decode);                              \
     stbir__simdf8_load4b( cs, hc + (ofs) - 2 );                \
     stbir__simdf8_0123to22223333( c, cs );                     \
-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   
- 
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
+
  #define stbir__3_coeff_remnant( ofs )                         \
     STBIR_SIMD_NO_UNROLL(decode);                              \
     stbir__simdf8_load4b( cs, hc + (ofs) );                    \
     stbir__simdf8_0123to00001111( c, cs );                     \
     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
     stbir__simdf8_0123to2222( t, cs );                         \
-    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 ); 
+    stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 );
 
 #define stbir__store_output()                      \
     stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 );     \
@@ -5212,7 +5212,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     ++horizontal_contributors;                     \
     output += 4;
 
-#else    
+#else
 
 #define stbir__4_coeff_start()                        \
     stbir__simdf tot0,tot1,c,cs;                      \
@@ -5225,7 +5225,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to2222( c, cs );                 \
     stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \
     stbir__simdf_0123to3333( c, cs );                 \
-    stbir__simdf_madd_mem( tot1, tot1, c, decode+12 ); 
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+12 );
 
 #define stbir__4_coeff_continue_from_4( ofs )                  \
     STBIR_SIMD_NO_UNROLL(decode);                              \
@@ -5237,13 +5237,13 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to2222( c, cs );                          \
     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 );  \
     stbir__simdf_0123to3333( c, cs );                          \
-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 ); 
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 );
 
 #define stbir__1_coeff_remnant( ofs )                       \
     STBIR_SIMD_NO_UNROLL(decode);                           \
     stbir__simdf_load1( c, hc + (ofs) );                    \
     stbir__simdf_0123to0000( c, c );                        \
-    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); 
+    stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );
 
 #define stbir__2_coeff_remnant( ofs )                         \
     STBIR_SIMD_NO_UNROLL(decode);                             \
@@ -5251,8 +5251,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_0123to0000( c, cs );                         \
     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 );   \
     stbir__simdf_0123to1111( c, cs );                         \
-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); 
-  
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 );
+
 #define stbir__3_coeff_remnant( ofs )                          \
     STBIR_SIMD_NO_UNROLL(decode);                              \
     stbir__simdf_load( cs, hc + (ofs) );                       \
@@ -5378,7 +5378,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     x0 += decode[0+(ofs)*4] * c;      \
     x1 += decode[1+(ofs)*4] * c;      \
     x2 += decode[2+(ofs)*4] * c;      \
-    x3 += decode[3+(ofs)*4] * c;      
+    x3 += decode[3+(ofs)*4] * c;
 
 #define stbir__2_coeff_remnant( ofs ) \
     STBIR_SIMD_NO_UNROLL(decode);     \
@@ -5391,8 +5391,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     y0 += decode[4+(ofs)*4] * c;      \
     y1 += decode[5+(ofs)*4] * c;      \
     y2 += decode[6+(ofs)*4] * c;      \
-    y3 += decode[7+(ofs)*4] * c;    
-  
+    y3 += decode[7+(ofs)*4] * c;
+
 #define stbir__3_coeff_remnant( ofs ) \
     STBIR_SIMD_NO_UNROLL(decode);     \
     c = hc[0+(ofs)];                  \
@@ -5409,7 +5409,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     x0 += decode[8+(ofs)*4] * c;      \
     x1 += decode[9+(ofs)*4] * c;      \
     x2 += decode[10+(ofs)*4] * c;     \
-    x3 += decode[11+(ofs)*4] * c;     
+    x3 += decode[11+(ofs)*4] * c;
 
 #define stbir__store_output()                     \
     output[0] = x0 + y0;                          \
@@ -5420,7 +5420,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     ++horizontal_contributors;                    \
     output += 4;
 
-#endif  
+#endif
 
 #define STBIR__horizontal_channels 4
 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
@@ -5439,7 +5439,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_load1( c, hc );                \
     stbir__simdf_0123to0000( c, c );            \
     stbir__simdf_mult_mem( tot0, c, decode );   \
-    stbir__simdf_mult_mem( tot1, c, decode+3 ); 
+    stbir__simdf_mult_mem( tot1, c, decode+3 );
 
 #define stbir__2_coeff_only()                         \
     stbir__simdf tot0,tot1,c,cs;                      \
@@ -5450,7 +5450,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_mult_mem( tot1, c, decode+3 );       \
     stbir__simdf_0123to1111( c, cs );                 \
     stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \
-    stbir__simdf_madd_mem( tot1, tot1, c,decode+10 ); 
+    stbir__simdf_madd_mem( tot1, tot1, c,decode+10 );
 
 #define stbir__3_coeff_only()                           \
     stbir__simdf tot0,tot1,c,cs;                        \
@@ -5464,7 +5464,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd_mem( tot1, tot1, c, decode+10 );  \
     stbir__simdf_0123to2222( c, cs );                   \
     stbir__simdf_madd_mem( tot0, tot0, c, decode+14 );  \
-    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );
 
 #define stbir__store_output_tiny()                \
     stbir__simdf_store( output+3, tot1 );         \
@@ -5486,7 +5486,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_0123to22222222( c, cs );         \
     stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 );  \
     stbir__simdf8_0123to33333333( c, cs );         \
-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );  
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 );
 
 #define stbir__4_coeff_continue_from_4( ofs )                   \
     STBIR_SIMD_NO_UNROLL(decode);                               \
@@ -5498,19 +5498,19 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_0123to22222222( c, cs );                      \
     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \
     stbir__simdf8_0123to33333333( c, cs );                      \
-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 ); 
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 );
 
 #define stbir__1_coeff_remnant( ofs )                           \
     STBIR_SIMD_NO_UNROLL(decode);                               \
     stbir__simdf8_load1b( c, hc + (ofs) );                      \
-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );
 
 #define stbir__2_coeff_remnant( ofs )                           \
     STBIR_SIMD_NO_UNROLL(decode);                               \
     stbir__simdf8_load1b( c, hc + (ofs) );                      \
     stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 );    \
     stbir__simdf8_load1b( c, hc + (ofs)+1 );                    \
-    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );   
+    stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );
 
 #define stbir__3_coeff_remnant( ofs )                           \
     STBIR_SIMD_NO_UNROLL(decode);                               \
@@ -5520,7 +5520,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf8_0123to11111111( c, cs );                      \
     stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 );  \
     stbir__simdf8_0123to22222222( c, cs );                      \
-    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); 
+    stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );
 
 #define stbir__store_output()                     \
     stbir__simdf8_add( tot0, tot0, tot1 );        \
@@ -5553,7 +5553,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd_mem( tot1, tot1, c, decode+17 );  \
     stbir__simdf_0123to3333( c, cs );                   \
     stbir__simdf_madd_mem( tot2, tot2, c, decode+21 );  \
-    stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );         
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+24 );
 
 #define stbir__4_coeff_continue_from_4( ofs )                   \
     STBIR_SIMD_NO_UNROLL(decode);                               \
@@ -5569,7 +5569,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  \
     stbir__simdf_0123to3333( c, cs );                           \
     stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 );  \
-    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );   
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 );
 
 #define stbir__1_coeff_remnant( ofs )                           \
     STBIR_SIMD_NO_UNROLL(decode);                               \
@@ -5586,8 +5586,8 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 );   \
     stbir__simdf_0123to1111( c, cs );                           \
     stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 );   \
-    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  
-  
+    stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );
+
 #define stbir__3_coeff_remnant( ofs )                           \
     STBIR_SIMD_NO_UNROLL(decode);                               \
     stbir__simdf_load( cs, hc + (ofs) );                        \
@@ -5599,7 +5599,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 );  \
     stbir__simdf_0123to2222( c, cs );                           \
     stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 );  \
-    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );  
+    stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 );
 
 #define stbir__store_output()                     \
     stbir__simdf_add( tot0, tot0, tot2 );         \
@@ -5623,7 +5623,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     tot3 = decode[3]*c;              \
     tot4 = decode[4]*c;              \
     tot5 = decode[5]*c;              \
-    tot6 = decode[6]*c;              
+    tot6 = decode[6]*c;
 
 #define stbir__2_coeff_only()        \
     float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \
@@ -5717,7 +5717,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     y3 += decode[24] * c;         \
     y4 += decode[25] * c;         \
     y5 += decode[26] * c;         \
-    y6 += decode[27] * c; 
+    y6 += decode[27] * c;
 
 #define stbir__4_coeff_continue_from_4( ofs ) \
     STBIR_SIMD_NO_UNROLL(decode);  \
@@ -5752,7 +5752,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     y3 += decode[24+(ofs)*7] * c;  \
     y4 += decode[25+(ofs)*7] * c;  \
     y5 += decode[26+(ofs)*7] * c;  \
-    y6 += decode[27+(ofs)*7] * c; 
+    y6 += decode[27+(ofs)*7] * c;
 
 #define stbir__1_coeff_remnant( ofs ) \
     STBIR_SIMD_NO_UNROLL(decode);  \
@@ -5783,7 +5783,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     y4 += decode[11+(ofs)*7] * c;  \
     y5 += decode[12+(ofs)*7] * c;  \
     y6 += decode[13+(ofs)*7] * c;  \
-  
+
 #define stbir__3_coeff_remnant( ofs ) \
     STBIR_SIMD_NO_UNROLL(decode);  \
     c = hc[0+(ofs)];               \
@@ -5823,7 +5823,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
     ++horizontal_contributors;                    \
     output += 7;
 
-#endif  
+#endif
 
 #define STBIR__horizontal_channels 7
 #define STB_IMAGE_RESIZE_DO_HORIZONTALS
@@ -5950,7 +5950,7 @@ static void stbir__encode_scanline( stbir__info const * stbir_info, void *output
   // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback)
   if ( stbir_info->out_pixels_cb )
     output_buffer = encode_buffer;
-  
+
   STBIR_PROFILE_START( encode );
   // convert into the output buffer
   stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer );
@@ -6028,7 +6028,7 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi
     stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
   }
 
-  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes), 
+  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes),
                           encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 }
 
@@ -6043,7 +6043,7 @@ static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info cons
   // update new end scanline
   split_info->ring_buffer_last_scanline = n;
 
-  // get ring buffer 
+  // get ring buffer
   ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
   ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index);
 
@@ -6069,7 +6069,7 @@ static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__
 
   // initialize the ring buffer for gathering
   split_info->ring_buffer_begin_index = 0;
-  split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest;  
+  split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest;
   split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
 
   for (y = start_output_y; y < end_output_y; y++)
@@ -6093,12 +6093,12 @@ static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__
         split_info->ring_buffer_first_scanline++;
         split_info->ring_buffer_begin_index++;
       }
-      
+
       if ( stbir_info->vertical_first )
       {
         float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline );
         // Decode the nth scanline from the source image into the decode buffer.
-        stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); 
+        stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
       }
       else
       {
@@ -6121,10 +6121,10 @@ static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_
 {
   // evict a scanline out into the output buffer
   float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
-  
+
   // dump the scanline out
   stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
-  
+
   // mark it as empty
   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
 
@@ -6142,10 +6142,10 @@ static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(st
 
   // Now resample it into the buffer.
   stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
-  
+
   // dump the scanline out
   stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
-  
+
   // mark it as empty
   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
 
@@ -6185,7 +6185,7 @@ static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stb
   STBIR_PROFILE_END( vertical );
 }
 
-typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info); 
+typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info);
 
 static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count )
 {
@@ -6206,7 +6206,7 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_
   end_input_y = split_info[split_count-1].end_input_y;
 
   // adjust for starting offset start_input_y
-  y = start_input_y + stbir_info->vertical.filter_pixel_margin; 
+  y = start_input_y + stbir_info->vertical.filter_pixel_margin;
   vertical_contributors += y ;
   vertical_coefficients += stbir_info->vertical.coefficient_width * y;
 
@@ -6253,7 +6253,7 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_
         split_info->start_input_y = y;
       on_first_input_y = 0;
 
-      // clip the region 
+      // clip the region
       if ( out_first_scanline < start_output_y )
       {
         vc += start_output_y - out_first_scanline;
@@ -6266,11 +6266,11 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_
       // if very first scanline, init the index
       if (split_info->ring_buffer_begin_index < 0)
         split_info->ring_buffer_begin_index = out_first_scanline - start_output_y;
-      
+
       STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline );
 
       // Decode the nth scanline from the source image into the decode buffer.
-      stbir__decode_scanline( stbir_info, y, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); 
+      stbir__decode_scanline( stbir_info, y, split_info->decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
       // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out
       if ( !stbir_info->vertical_first )
@@ -6282,7 +6282,7 @@ static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir_
       if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) &&
            ( out_last_scanline > split_info->ring_buffer_last_scanline ) )
         handle_scanline_for_scatter( stbir_info, split_info );
-    
+
       // Now the horizontal buffer is ready to write to all ring buffer rows, so do it.
       stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end );
 
@@ -6318,7 +6318,7 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
     if (scale_info->scale >= ( 1.0f - stbir__small_float ) )
     {
       if ( (scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) )
-        filter = STBIR_FILTER_POINT_SAMPLE;  
+        filter = STBIR_FILTER_POINT_SAMPLE;
       else
         filter = STBIR_DEFAULT_FILTER_UPSAMPLE;
     }
@@ -6326,7 +6326,7 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
   samp->filter_enum = filter;
 
   STBIR_ASSERT(samp->filter_enum != 0);
-  STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER); 
+  STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER);
   samp->filter_kernel = stbir__builtin_kernels[ filter ];
   samp->filter_support = stbir__builtin_supports[ filter ];
 
@@ -6410,8 +6410,8 @@ static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contr
     range->n0 = in_first_pixel;
     stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge );
     range->n1 = in_last_pixel;
-     
-    // now go through the margin to the start of area to find bottom 
+
+    // now go through the margin to the start of area to find bottom
     n = range->n0 + 1;
     input_end = -filter_pixel_margin;
     while( n >= input_end )
@@ -6426,7 +6426,7 @@ static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contr
       --n;
     }
 
-    // now go through the end of the area through the margin to find top 
+    // now go through the end of the area through the margin to find top
     n = range->n1 - 1;
     input_end = n + 1 + filter_pixel_margin;
     while( n <= input_end )
@@ -6475,7 +6475,7 @@ static void stbir__get_split_info( stbir__per_split_info* split_info, int splits
   cur = 0;
   for( i = 0 ; i < splits ; i++ )
   {
-    int each; 
+    int each;
     split_info[i].start_output_y = cur;
     each = left / ( splits - i );
     split_info[i].end_output_y = cur + each;
@@ -6491,7 +6491,7 @@ static void stbir__get_split_info( stbir__per_split_info* split_info, int splits
 static void stbir__free_internal_mem( stbir__info *info )
 {
   #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data); } }
-  
+
   if ( info )
   {
   #ifndef STBIR__SEPARATE_ALLOCATIONS
@@ -6509,16 +6509,16 @@ static void stbir__free_internal_mem( stbir__info *info )
       for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ )
       {
         #ifdef STBIR_SIMD8
-        if ( info->effective_channels == 3 ) 
+        if ( info->effective_channels == 3 )
           --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
-        #endif  
+        #endif
         STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] );
       }
 
       #ifdef STBIR_SIMD8
-      if ( info->effective_channels == 3 ) 
+      if ( info->effective_channels == 3 )
         --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
-      #endif  
+      #endif
       STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer );
       STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers );
       STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer );
@@ -6535,7 +6535,7 @@ static void stbir__free_internal_mem( stbir__info *info )
     STBIR__FREE_AND_CLEAR( info );
   #endif
   }
-  
+
   #undef STBIR__FREE_AND_CLEAR
 }
 
@@ -6547,20 +6547,20 @@ static int stbir__get_max_split( int splits, int height )
   for( i = 0 ; i < splits ; i++ )
   {
     int each = height / ( splits - i );
-    if ( each > max ) 
+    if ( each > max )
       max = each;
     height -= each;
   }
   return max;
 }
 
-static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] = 
-{ 
+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] =
+{
   0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs, stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0,0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs
 };
 
-static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] = 
-{ 
+static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] =
+{
   0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs, stbir__horizontal_gather_4_channels_funcs, 0,0, stbir__horizontal_gather_7_channels_funcs
 };
 
@@ -6635,28 +6635,28 @@ static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0};
 #endif
 
 // Figure out whether to scale along the horizontal or vertical first.
-//   This only *super* important when you are scaling by a massively 
-//   different amount in the vertical vs the horizontal (for example, if 
-//   you are scaling by 2x in the width, and 0.5x in the height, then you 
-//   want to do the vertical scale first, because it's around 3x faster 
+//   This only *super* important when you are scaling by a massively
+//   different amount in the vertical vs the horizontal (for example, if
+//   you are scaling by 2x in the width, and 0.5x in the height, then you
+//   want to do the vertical scale first, because it's around 3x faster
 //   in that order.
 //
-//   In more normal circumstances, this makes a 20-40% differences, so 
+//   In more normal circumstances, this makes a 20-40% differences, so
 //     it's good to get right, but not critical. The normal way that you
-//     decide which direction goes first is just figuring out which 
-//     direction does more multiplies. But with modern CPUs with their 
+//     decide which direction goes first is just figuring out which
+//     direction does more multiplies. But with modern CPUs with their
 //     fancy caches and SIMD and high IPC abilities, so there's just a lot
-//     more that goes into it. 
+//     more that goes into it.
 //
-//   My handwavy sort of solution is to have an app that does a whole 
+//   My handwavy sort of solution is to have an app that does a whole
 //     bunch of timing for both vertical and horizontal first modes,
 //     and then another app that can read lots of these timing files
 //     and try to search for the best weights to use. Dotimings.c
 //     is the app that does a bunch of timings, and vf_train.c is the
-//     app that solves for the best weights (and shows how well it 
+//     app that solves for the best weights (and shows how well it
 //     does currently).
 
-static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info )    
+static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info )
 {
   double v_cost, h_cost;
   float * weights;
@@ -6668,15 +6668,15 @@ static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLA
     v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7;
   else if ( vertical_scale <= 1.0f )
     v_classification = ( is_gather ) ? 1 : 0;
-  else if ( vertical_scale <= 2.0f) 
+  else if ( vertical_scale <= 2.0f)
     v_classification = 2;
-  else if ( vertical_scale <= 3.0f) 
+  else if ( vertical_scale <= 3.0f)
     v_classification = 3;
-  else if ( vertical_scale <= 4.0f) 
+  else if ( vertical_scale <= 4.0f)
     v_classification = 5;
-  else 
+  else
     v_classification = 6;
-  
+
   // use the right weights
   weights = weights_table[ v_classification ];
 
@@ -6697,10 +6697,10 @@ static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLA
     info->is_gather = is_gather;
   }
 
-  // and this allows us to override everything for testing (see dotiming.c) 
-  if ( ( info ) && ( info->control_v_first ) ) 
+  // and this allows us to override everything for testing (see dotiming.c)
+  if ( ( info ) && ( info->control_v_first ) )
     vertical_first = ( info->control_v_first == 2 ) ? 1 : 0;
-  
+
   return vertical_first;
 }
 
@@ -6712,9 +6712,9 @@ static unsigned char stbir__pixel_channels[] = {
 };
 
 // the internal pixel layout enums are in a different order, so we can easily do range comparisons of types
-//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible 
+//   the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible
 static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = {
-  STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA, 
+  STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA,
   STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR,
   STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM,
 };
@@ -6730,12 +6730,12 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
   int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
 
   int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy
-  int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size ); 
-  stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ];  
+  int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size );
+  stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ];
   stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ];
-  int channels = stbir__pixel_channels[ input_pixel_layout ];      
+  int channels = stbir__pixel_channels[ input_pixel_layout ];
   int effective_channels = channels;
-  
+
   // first figure out what type of alpha weighting to use (if any)
   if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling
   {
@@ -6773,11 +6773,11 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
 
   // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect)
   decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
-  
+
 #if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8)
   if ( effective_channels == 3 )
     decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations)
-#endif  
+#endif
 
   ring_buffer_length_bytes = horizontal->scale_info.output_sub_size * effective_channels * sizeof(float) + sizeof(float); // extra float for padding
 
@@ -6816,9 +6816,9 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
     #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = ((char*)advance_mem) + (size);
 #endif
 
-    STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info );      
+    STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info );
 
-    STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info );      
+    STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info );
 
     if ( info )
     {
@@ -6833,39 +6833,39 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
 
       info->channels = channels;
       info->effective_channels = effective_channels;
-  
+
       info->offset_x = new_x;
       info->offset_y = new_y;
       info->alloc_ring_buffer_num_entries = alloc_ring_buffer_num_entries;
-      info->ring_buffer_num_entries = 0;  
+      info->ring_buffer_num_entries = 0;
       info->ring_buffer_length_bytes = ring_buffer_length_bytes;
       info->splits = splits;
       info->vertical_first = vertical_first;
 
-      info->input_pixel_layout_internal = input_pixel_layout;  
+      info->input_pixel_layout_internal = input_pixel_layout;
       info->output_pixel_layout_internal = output_pixel_layout;
 
       // setup alpha weight functions
       info->alpha_weight = 0;
       info->alpha_unweight = 0;
-    
+
       // handle alpha weighting functions and overrides
       if ( alpha_weighting_type == 2 )
       {
         // high quality alpha multiplying on the way in, dividing on the way out
-        info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];  
+        info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
         info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
       }
       else if ( alpha_weighting_type == 4 )
       {
         // fast alpha multiplying on the way in, dividing on the way out
-        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];  
+        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
         info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ];
       }
       else if ( alpha_weighting_type == 1 )
       {
         // fast alpha on the way in, leave in premultiplied form on way out
-        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; 
+        info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ];
       }
       else if ( alpha_weighting_type == 3 )
       {
@@ -6884,7 +6884,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
           info->alpha_weight = stbir__simple_flip_3ch;
       }
 
-    }        
+    }
 
     // get all the per-split buffers
     for( i = 0 ; i < splits ; i++ )
@@ -6896,7 +6896,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
       #ifdef STBIR_SIMD8
       if ( ( info ) && ( effective_channels == 3 ) )
         ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer
-      #endif  
+      #endif
 
       STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* );
       {
@@ -6907,7 +6907,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
           #ifdef STBIR_SIMD8
           if ( ( info ) && ( effective_channels == 3 ) )
             ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer
-          #endif  
+          #endif
         }
       }
 #else
@@ -6935,10 +6935,10 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
 #endif
       if ( temp_mem_amt >= both )
       {
-        if ( info ) 
-        { 
-          vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer; 
-          vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size ); 
+        if ( info )
+        {
+          vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer;
+          vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size );
         }
       }
       else
@@ -6961,7 +6961,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
       if ( diff_shift < 0.0f ) diff_shift = -diff_shift;
       if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) )
       {
-        if ( horizontal->is_gather == vertical->is_gather ) 
+        if ( horizontal->is_gather == vertical->is_gather )
         {
           copy_horizontal = 1;
           goto no_vert_alloc;
@@ -6988,16 +6988,16 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
       // but if the number of coeffs <= 12, use another set of special cases. <=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size
       if ( horizontal->extent_info.widest <= 12 )
         info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ];
-      
+
       info->scanline_extents.conservative.n0 = conservative->n0;
       info->scanline_extents.conservative.n1 = conservative->n1;
-      
+
       // get exact extents
       stbir__get_extents( horizontal, &info->scanline_extents );
 
       // pack the horizontal coeffs
       horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n1 + 1 );
-      
+
       STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );
 
       STBIR_PROFILE_BUILD_END( horizontal );
@@ -7031,21 +7031,21 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
       for( i = 0 ; i < splits ; i++ )
       {
         int width, ofs;
-        
+
         // find the right most span
         if ( info->scanline_extents.spans[0].n1 > info->scanline_extents.spans[1].n1 )
           width = info->scanline_extents.spans[0].n1 - info->scanline_extents.spans[0].n0;
         else
           width = info->scanline_extents.spans[1].n1 - info->scanline_extents.spans[1].n0;
-        
+
         // this calc finds the exact end of the decoded scanline for all filter modes.
-        //   usually this is just the width * effective channels.  But we have to account 
-        //   for the area to the left of the scanline for wrap filtering and alignment, this 
+        //   usually this is just the width * effective channels.  But we have to account
+        //   for the area to the left of the scanline for wrap filtering and alignment, this
         //   is stored as a negative value in info->scanline_extents.conservative.n0. Next,
         //   we need to skip the exact size of the right hand size filter area (again for
         //   wrap mode), this is in info->scanline_extents.edge_sizes[1]).
         ofs = ( width + 1 - info->scanline_extents.conservative.n0 + info->scanline_extents.edge_sizes[1] ) * effective_channels;
-        
+
         // place a known, but numerically valid value in the decode buffer
         info->split_info[i].decode_buffer[ ofs ] = 9999.0f;
 
@@ -7053,7 +7053,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
         //   of the ring buffer accumulators
         if ( vertical_first )
         {
-          int j;  
+          int j;
           for( j = 0; j < info->ring_buffer_num_entries ; j++ )
           {
             stbir__get_ring_buffer_entry( info, info->split_info + i, j )[ ofs ] = 9999.0f;
@@ -7078,7 +7078,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
   }
 }
 
-static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count ) 
+static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count )
 {
   stbir__per_split_info * split_info = info->split_info + split_start;
 
@@ -7098,7 +7098,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
 {
   static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
   {
-    /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear, 
+    /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear,
   };
 
   static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]=
@@ -7161,7 +7161,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
   stbir_datatype input_type, output_type;
 
   input_type = resize->input_data_type;
-  output_type = resize->output_data_type; 
+  output_type = resize->output_data_type;
   info->input_data = resize->input_pixels;
   info->input_stride_bytes = resize->input_stride_in_bytes;
   info->output_stride_bytes = resize->output_stride_in_bytes;
@@ -7169,7 +7169,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
   // if we're completely point sampling, then we can turn off SRGB
   if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) )
   {
-    if ( ( ( input_type  == STBIR_TYPE_UINT8_SRGB ) || ( input_type  == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) && 
+    if ( ( ( input_type  == STBIR_TYPE_UINT8_SRGB ) || ( input_type  == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) &&
          ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) )
     {
       input_type = STBIR_TYPE_UINT8;
@@ -7177,7 +7177,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
     }
   }
 
-  // recalc the output and input strides  
+  // recalc the output and input strides
   if ( info->input_stride_bytes == 0 )
     info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type];
 
@@ -7218,7 +7218,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
   if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) )
   {
     int non_scaled = 0;
-    
+
     // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16)
     if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual)
       if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) )
@@ -7238,16 +7238,16 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
   }
 
   info->input_type = input_type;
-  info->output_type = output_type; 
+  info->output_type = output_type;
   info->decode_pixels = decode_pixels;
-  info->encode_pixels = encode_pixels; 
+  info->encode_pixels = encode_pixels;
 }
 
 static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 )
 {
   double per, adj;
   int over;
-  
+
   // do left/top edge
   if ( *outx < 0 )
   {
@@ -7266,7 +7266,7 @@ static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, doubl
     *u1 += adj; // decrease u1
     *outsubw = outw - *outx;
   }
-}    
+}
 
 // converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so)
 static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom ) // limit_denom (1) or limit numer (0)
@@ -7283,7 +7283,7 @@ static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32
   bot = 1 << 25;
 
   // keep refining, but usually stops in a few loops - usually 5 for bad cases
-  for(;;)  
+  for(;;)
   {
     stbir_uint64 est, temp;
 
@@ -7316,13 +7316,13 @@ static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32
     bot = temp;
 
     // move remainders
-    temp = est * denom_estimate + denom_last; 
-    denom_last = denom_estimate; 
+    temp = est * denom_estimate + denom_last;
+    denom_last = denom_estimate;
     denom_estimate = temp;
 
     // move remainders
-    temp = est * numer_estimate + numer_last; 
-    numer_last = numer_estimate; 
+    temp = est * numer_estimate + numer_last;
+    numer_last = numer_estimate;
     numer_estimate = temp;
   }
 
@@ -7366,11 +7366,11 @@ static int stbir__calculate_region_transform( stbir__scale_info * scale_info, in
 
   output_s = ( (double)output_sub_range) / output_range;
 
-  // figure out the scaling to use 
-  ratio = output_s / input_s; 
+  // figure out the scaling to use
+  ratio = output_s / input_s;
 
   // save scale before clipping
-  scale = ( output_range / input_range ) * ratio; 
+  scale = ( output_range / input_range ) * ratio;
   scale_info->scale = (float)scale;
   scale_info->inv_scale = (float)( 1.0 / scale );
 
@@ -7381,11 +7381,11 @@ static int stbir__calculate_region_transform( stbir__scale_info * scale_info, in
   input_s = input_s1 - input_s0;
 
   // after clipping do we have zero input area?
-  if ( input_s <= stbir__small_float ) 
+  if ( input_s <= stbir__small_float )
     return 0;
 
-  // calculate and store the starting source offsets in output pixel space 
-  scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range ); 
+  // calculate and store the starting source offsets in output pixel space
+  scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range );
 
   scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) );
 
@@ -7418,7 +7418,7 @@ static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layou
   resize->needs_rebuild = 1;
 }
 
-STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, 
+STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize,
                                  const void *input_pixels,  int input_w,  int input_h, int input_stride_in_bytes, // stride can be zero
                                        void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero
                                  stbir_pixel_layout pixel_layout, stbir_datatype data_type )
@@ -7441,7 +7441,7 @@ STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_t
 {
   resize->input_data_type = input_type;
   resize->output_data_type = output_type;
-  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) 
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
     stbir__update_info_from_resize( resize->samplers, resize );
 }
 
@@ -7450,7 +7450,7 @@ STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_call
   resize->input_cb = input_cb;
   resize->output_cb = output_cb;
 
-  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) 
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
   {
     resize->samplers->in_pixels_cb = input_cb;
     resize->samplers->out_pixels_cb = output_cb;
@@ -7460,7 +7460,7 @@ STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_call
 STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data )                                     // pass back STBIR_RESIZE* by default
 {
   resize->user_data = user_data;
-  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) 
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
     resize->samplers->user_data = user_data;
 }
 
@@ -7470,7 +7470,7 @@ STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_p
   resize->input_stride_in_bytes = input_stride_in_bytes;
   resize->output_pixels = output_pixels;
   resize->output_stride_in_bytes = output_stride_in_bytes;
-  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) 
+  if ( ( resize->samplers ) && ( !resize->needs_rebuild ) )
     stbir__update_info_from_resize( resize->samplers, resize );
 }
 
@@ -7574,7 +7574,7 @@ STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby,
   return 1;
 }
 
-static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) 
+static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
 {
   stbir__contributors conservative = { 0, 0 };
   stbir__sampler horizontal, vertical;
@@ -7588,13 +7588,13 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
   // have we already built the samplers?
   if ( resize->samplers )
     return 0;
-  
+
   #define STBIR_RETURN_ERROR_AND_ASSERT( exp )  STBIR_ASSERT( !(exp) ); if (exp) return 0;
   STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER)
   STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER)
   #undef STBIR_RETURN_ERROR_AND_ASSERT
 
-  if ( splits <= 0 ) 
+  if ( splits <= 0 )
     return 0;
 
   STBIR_PROFILE_BUILD_FIRST_START( build );
@@ -7628,7 +7628,7 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
   out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO );
   STBIR_PROFILE_BUILD_END( alloc );
   STBIR_PROFILE_BUILD_END( build );
- 
+
   if ( out_info )
   {
     resize->splits = splits;
@@ -7640,7 +7640,7 @@ static int stbir__perform_build( STBIR_RESIZE * resize, int splits )
 
     // update anything that can be changed without recalcing samplers
     stbir__update_info_from_resize( out_info, resize );
- 
+
     return splits;
   }
 
@@ -7669,7 +7669,7 @@ STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits
   }
 
   STBIR_PROFILE_BUILD_CLEAR( resize->samplers );
-  
+
   return 1;
 }
 
@@ -7681,7 +7681,7 @@ STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize )
 STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
 {
   int result;
-  
+
   if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
   {
     int alloc_state = resize->called_alloc;  // remember allocated state
@@ -7694,10 +7694,10 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
 
     if ( !stbir_build_samplers( resize ) )
       return 0;
-    
+
     resize->called_alloc = alloc_state;
 
-    // if build_samplers succeeded (above), but there are no samplers set, then 
+    // if build_samplers succeeded (above), but there are no samplers set, then
     //   the area to stretch into was zero pixels, so don't do anything and return
     //   success
     if ( resize->samplers == 0 )
@@ -7717,7 +7717,7 @@ STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize )
   {
     stbir_free_samplers( resize );
     resize->samplers = 0;
-  } 
+  }
 
   return result;
 }
@@ -7732,11 +7732,11 @@ STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start
 
   // you **must** build samplers first when using split resize
   if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) )
-    return 0; 
-    
+    return 0;
+
   if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) )
     return 0;
-  
+
   // do resize
   return stbir__perform_resize( resize->samplers, split_start, split_count );
 }
@@ -7774,7 +7774,7 @@ static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * o
     *ret_pitch = pitch;
   }
 
-  return 1;  
+  return 1;
 }
 
 
@@ -7789,9 +7789,9 @@ STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_p
   if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
     return 0;
 
-  stbir_resize_init( &resize, 
-                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
-                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+  stbir_resize_init( &resize,
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch,
                      pixel_layout, STBIR_TYPE_UINT8 );
 
   if ( !stbir_resize_extended( &resize ) )
@@ -7815,9 +7815,9 @@ STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pix
   if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( unsigned char ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
     return 0;
 
-  stbir_resize_init( &resize, 
-                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
-                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+  stbir_resize_init( &resize,
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch,
                      pixel_layout, STBIR_TYPE_UINT8_SRGB );
 
   if ( !stbir_resize_extended( &resize ) )
@@ -7842,9 +7842,9 @@ STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int inpu
   if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, sizeof( float ), output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
     return 0;
 
-  stbir_resize_init( &resize, 
-                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
-                     (optr) ? optr : output_pixels, output_w, output_h, opitch, 
+  stbir_resize_init( &resize,
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                     (optr) ? optr : output_pixels, output_w, output_h, opitch,
                      pixel_layout, STBIR_TYPE_FLOAT );
 
   if ( !stbir_resize_extended( &resize ) )
@@ -7860,7 +7860,7 @@ STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int inpu
 
 STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes,
                                     void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
-                              stbir_pixel_layout pixel_layout, stbir_datatype data_type, 
+                              stbir_pixel_layout pixel_layout, stbir_datatype data_type,
                               stbir_edge edge, stbir_filter filter )
 {
   STBIR_RESIZE resize;
@@ -7870,9 +7870,9 @@ STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input
   if ( !stbir__check_output_stuff( (void**)&optr, &opitch, output_pixels, stbir__type_size[data_type], output_w, output_h, output_stride_in_bytes, stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ) )
     return 0;
 
-  stbir_resize_init( &resize, 
-                     input_pixels,  input_w,  input_h,  input_stride_in_bytes, 
-                     (optr) ? optr : output_pixels, output_w, output_h, output_stride_in_bytes, 
+  stbir_resize_init( &resize,
+                     input_pixels,  input_w,  input_h,  input_stride_in_bytes,
+                     (optr) ? optr : output_pixels, output_w, output_h, output_stride_in_bytes,
                      pixel_layout, data_type );
 
   resize.horizontal_edge = edge;
@@ -7980,7 +7980,7 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB
 #else  // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS
 
 // we reinclude the header file to define all the horizontal functions
-//   specializing each function for the number of coeffs is 20-40% faster *OVERALL* 
+//   specializing each function for the number of coeffs is 20-40% faster *OVERALL*
 
 // by including the header file again this way, we can still debug the functions
 
@@ -8013,16 +8013,16 @@ STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STB
 #define stbir__encode_order2 2
 #define stbir__encode_order3 3
 #define stbir__decode_simdf8_flip(reg)
-#define stbir__decode_simdf4_flip(reg) 
+#define stbir__decode_simdf4_flip(reg)
 #define stbir__encode_simdf8_unflip(reg)
-#define stbir__encode_simdf4_unflip(reg) 
+#define stbir__encode_simdf4_unflip(reg)
 #endif
 
 #ifdef STBIR_SIMD8
 #define stbir__encode_simdfX_unflip  stbir__encode_simdf8_unflip
 #else
 #define stbir__encode_simdfX_unflip  stbir__encode_simdf4_unflip
-#endif 
+#endif
 
 static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp )
 {
@@ -8076,7 +8076,7 @@ static void STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * deco
       #endif
       decode += 16;
       input += 16;
-      if ( decode <= decode_end ) 
+      if ( decode <= decode_end )
         continue;
       if ( decode == ( decode_end + 16 ) )
         break;
@@ -8141,15 +8141,15 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
       stbir__encode_simdfX_unflip( e0 );
       stbir__encode_simdfX_unflip( e1 );
       #ifdef STBIR_SIMD8
-      stbir__simdf8_pack_to_16bytes( i, e0, e1 ); 
+      stbir__simdf8_pack_to_16bytes( i, e0, e1 );
       stbir__simdi_store( output, i );
       #else
-      stbir__simdf_pack_to_8bytes( i, e0, e1 ); 
+      stbir__simdf_pack_to_8bytes( i, e0, e1 );
       stbir__simdi_store2( output, i );
       #endif
       encode += stbir__simdfX_float_count*2;
       output += stbir__simdfX_float_count*2;
-      if ( output <= end_output ) 
+      if ( output <= end_output )
         continue;
       if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
         break;
@@ -8182,7 +8182,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
   #if stbir__coder_min_num < 4
   while( output < end_output )
   {
-    stbir__simdf e0; 
+    stbir__simdf e0;
     STBIR_NO_UNROLL(encode);
     stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 );
     #if stbir__coder_min_num >= 2
@@ -8195,7 +8195,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outpu
     encode += stbir__coder_min_num;
   }
   #endif
-  
+
   #else
 
   // try to do blocks of 4 when you can
@@ -8280,7 +8280,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int
 #endif
       decode += 16;
       input += 16;
-      if ( decode <= decode_end ) 
+      if ( decode <= decode_end )
         continue;
       if ( decode == ( decode_end + 16 ) )
         break;
@@ -8345,15 +8345,15 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int
       stbir__encode_simdfX_unflip( e0 );
       stbir__encode_simdfX_unflip( e1 );
       #ifdef STBIR_SIMD8
-      stbir__simdf8_pack_to_16bytes( i, e0, e1 ); 
+      stbir__simdf8_pack_to_16bytes( i, e0, e1 );
       stbir__simdi_store( output, i );
       #else
-      stbir__simdf_pack_to_8bytes( i, e0, e1 ); 
+      stbir__simdf_pack_to_8bytes( i, e0, e1 );
       stbir__simdi_store2( output, i );
       #endif
       encode += stbir__simdfX_float_count*2;
       output += stbir__simdfX_float_count*2;
-      if ( output <= end_output ) 
+      if ( output <= end_output )
         continue;
       if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
         break;
@@ -8463,7 +8463,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi
 #define stbir__min_max_shift20( i, f ) \
     stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \
     stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one  )) ); \
-    stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 ); 
+    stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 );
 
 #define stbir__scale_and_convert( i, f ) \
     stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \
@@ -8490,7 +8490,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi
   temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \
   v0 = temp0.m128i_i128; \
   v1 = temp1.m128i_i128; \
-} 
+}
 
 #define stbir__simdi_table_lookup3( v0,v1,v2, table ) \
 { \
@@ -8521,7 +8521,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int wi
   v1 = temp1.m128i_i128; \
   v2 = temp2.m128i_i128; \
   v3 = temp3.m128i_i128; \
-} 
+}
 
 static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode )
 {
@@ -8538,7 +8538,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
     for(;;)
     {
       stbir__simdf f0, f1, f2, f3;
-      stbir__simdi i0, i1, i2, i3; 
+      stbir__simdi i0, i1, i2, i3;
       STBIR_SIMD_NO_UNROLL(encode);
 
       stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
@@ -8547,9 +8547,9 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
       stbir__min_max_shift20( i1, f1 );
       stbir__min_max_shift20( i2, f2 );
       stbir__min_max_shift20( i3, f3 );
-      
+
       stbir__simdi_table_lookup4( i0, i1, i2, i3, to_srgb );
-     
+
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i1, f1 );
       stbir__linear_to_srgb_finish( i2, f2 );
@@ -8559,7 +8559,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
 
       encode += 16;
       output += 16;
-      if ( output <= end_output ) 
+      if ( output <= end_output )
         continue;
       if ( output == ( end_output + 16 ) )
         break;
@@ -8590,7 +8590,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
 
   // do the remnants
   #if stbir__coder_min_num < 4
-  while( output < end_output ) 
+  while( output < end_output )
   {
     STBIR_NO_UNROLL(encode);
     output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] );
@@ -8647,10 +8647,10 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
       stbir__min_max_shift20( i0, f0 );
       stbir__min_max_shift20( i1, f1 );
       stbir__min_max_shift20( i2, f2 );
-      stbir__scale_and_convert( i3, f3 ); 
-      
+      stbir__scale_and_convert( i3, f3 );
+
       stbir__simdi_table_lookup3( i0, i1, i2, to_srgb );
-     
+
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i1, f1 );
       stbir__linear_to_srgb_finish( i2, f2 );
@@ -8660,7 +8660,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
       output += 16;
       encode += 16;
 
-      if ( output <= end_output ) 
+      if ( output <= end_output )
         continue;
       if ( output == ( end_output + 16 ) )
         break;
@@ -8673,7 +8673,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
 
   do {
     float f;
-    STBIR_SIMD_NO_UNROLL(encode);                                        
+    STBIR_SIMD_NO_UNROLL(encode);
 
     output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] );
     output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] );
@@ -8708,7 +8708,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * de
     decode += 4;
   }
   decode -= 4;
-  if( decode < decode_end ) 
+  if( decode < decode_end )
   {
     decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ];
     decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted;
@@ -8730,7 +8730,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
     for(;;)
     {
       stbir__simdf f0, f1, f2, f3;
-      stbir__simdi i0, i1, i2, i3; 
+      stbir__simdi i0, i1, i2, i3;
 
       STBIR_SIMD_NO_UNROLL(encode);
       stbir__simdf_load4_transposed( f0, f1, f2, f3, encode );
@@ -8739,9 +8739,9 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
       stbir__scale_and_convert( i1, f1 );
       stbir__min_max_shift20( i2, f2 );
       stbir__scale_and_convert( i3, f3 );
-      
+
       stbir__simdi_table_lookup2( i0, i2, to_srgb );
-     
+
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i2, f2 );
 
@@ -8749,7 +8749,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
 
       output += 16;
       encode += 16;
-      if ( output <= end_output ) 
+      if ( output <= end_output )
         continue;
       if ( output == ( end_output + 16 ) )
         break;
@@ -8815,9 +8815,9 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decod
       stbir__simdf_store( decode + 0,  of0 );
       stbir__simdf_store( decode + 4,  of1 );
       #endif
-      decode += 8;  
+      decode += 8;
       input += 8;
-      if ( decode <= decode_end ) 
+      if ( decode <= decode_end )
         continue;
       if ( decode == ( decode_end + 8 ) )
         break;
@@ -8887,7 +8887,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
         stbir__simdiX_store( output, i );
         encode += stbir__simdfX_float_count*2;
         output += stbir__simdfX_float_count*2;
-        if ( output <= end_output ) 
+        if ( output <= end_output )
           continue;
         if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
           break;
@@ -8934,7 +8934,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * output
     encode += stbir__coder_min_num;
   }
   #endif
-  
+
   #else
 
   // try to do blocks of 4 when you can
@@ -9011,7 +9011,7 @@ static void STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int
       #endif
       decode += 8;
       input += 8;
-      if ( decode <= decode_end ) 
+      if ( decode <= decode_end )
         continue;
       if ( decode == ( decode_end + 8 ) )
         break;
@@ -9080,7 +9080,7 @@ static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int
         stbir__simdiX_store( output, i );
         encode += stbir__simdfX_float_count*2;
         output += stbir__simdfX_float_count*2;
-        if ( output <= end_output ) 
+        if ( output <= end_output )
           continue;
         if ( output == ( end_output + stbir__simdfX_float_count*2 ) )
           break;
@@ -9188,7 +9188,7 @@ static void STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep,
       #endif
       decode += 8;
       input += 8;
-      if ( decode <= decode_end ) 
+      if ( decode <= decode_end )
         continue;
       if ( decode == ( decode_end + 8 ) )
         break;
@@ -9269,7 +9269,7 @@ static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp
       #endif
       encode += 8;
       output += 8;
-      if ( output <= end_output ) 
+      if ( output <= end_output )
         continue;
       if ( output == ( end_output + 8 ) )
         break;
@@ -9360,7 +9360,7 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
       #endif
       decode += 16;
       input += 16;
-      if ( decode <= decode_end ) 
+      if ( decode <= decode_end )
         continue;
       if ( decode == ( decode_end + 16 ) )
         break;
@@ -9405,10 +9405,10 @@ static void STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int
   #endif
 
   #else
-  
+
   if ( (void*)decodep != inputp )
     STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) );
-  
+
   #endif
 }
 
@@ -9457,18 +9457,18 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
 #ifdef STBIR_FLOAT_HIGH_CLAMP
       stbir__simdfX_min( e0, e0, high_clamp );
       stbir__simdfX_min( e1, e1, high_clamp );
-#endif      
+#endif
 #ifdef STBIR_FLOAT_LOW_CLAMP
       stbir__simdfX_max( e0, e0, low_clamp );
       stbir__simdfX_max( e1, e1, low_clamp );
-#endif      
+#endif
       stbir__encode_simdfX_unflip( e0 );
       stbir__encode_simdfX_unflip( e1 );
       stbir__simdfX_store( output, e0 );
       stbir__simdfX_store( output+stbir__simdfX_float_count, e1 );
       encode += stbir__simdfX_float_count * 2;
       output += stbir__simdfX_float_count * 2;
-      if ( output < end_output ) 
+      if ( output < end_output )
         continue;
       if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) )
         break;
@@ -9488,10 +9488,10 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
     stbir__simdf_load( e0, encode );
 #ifdef STBIR_FLOAT_HIGH_CLAMP
     stbir__simdf_min( e0, e0, high_clamp );
-#endif      
+#endif
 #ifdef STBIR_FLOAT_LOW_CLAMP
     stbir__simdf_max( e0, e0, low_clamp );
-#endif      
+#endif
     stbir__encode_simdf4_unflip( e0 );
     stbir__simdf_store( output-4, e0 );
     output += 4;
@@ -9539,18 +9539,18 @@ static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int
     encode += stbir__coder_min_num;
   }
   #endif
-  
+
   #endif
 }
 
-#undef stbir__decode_suffix 
+#undef stbir__decode_suffix
 #undef stbir__decode_simdf8_flip
 #undef stbir__decode_simdf4_flip
-#undef stbir__decode_order0 
+#undef stbir__decode_order0
 #undef stbir__decode_order1
 #undef stbir__decode_order2
 #undef stbir__decode_order3
-#undef stbir__encode_order0 
+#undef stbir__encode_order0
 #undef stbir__encode_order1
 #undef stbir__encode_order2
 #undef stbir__encode_order3
@@ -9634,7 +9634,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
     stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
     stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
     stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
-    while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) ) 
+    while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) )
     {
       stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
       STBIR_SIMD_NO_UNROLL(output0);
@@ -9643,52 +9643,52 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
 
       #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE
       stbIF0( stbir__simdfX_load( o0, output0 );     stbir__simdfX_load( o1, output0+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) );
-              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );  stbir__simdfX_madd( o2, o2, r2, c0 );   stbir__simdfX_madd( o3, o3, r3, c0 );           
+              stbir__simdfX_madd( o0, o0, r0, c0 );  stbir__simdfX_madd( o1, o1, r1, c0 );  stbir__simdfX_madd( o2, o2, r2, c0 );   stbir__simdfX_madd( o3, o3, r3, c0 );
               stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
       stbIF1( stbir__simdfX_load( o0, output1 );     stbir__simdfX_load( o1, output1+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) );
-              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );  stbir__simdfX_madd( o2, o2, r2, c1 );   stbir__simdfX_madd( o3, o3, r3, c1 );             
+              stbir__simdfX_madd( o0, o0, r0, c1 );  stbir__simdfX_madd( o1, o1, r1, c1 );  stbir__simdfX_madd( o2, o2, r2, c1 );   stbir__simdfX_madd( o3, o3, r3, c1 );
               stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
       stbIF2( stbir__simdfX_load( o0, output2 );     stbir__simdfX_load( o1, output2+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) );
-              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );  stbir__simdfX_madd( o2, o2, r2, c2 );   stbir__simdfX_madd( o3, o3, r3, c2 );             
+              stbir__simdfX_madd( o0, o0, r0, c2 );  stbir__simdfX_madd( o1, o1, r1, c2 );  stbir__simdfX_madd( o2, o2, r2, c2 );   stbir__simdfX_madd( o3, o3, r3, c2 );
               stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
       stbIF3( stbir__simdfX_load( o0, output3 );     stbir__simdfX_load( o1, output3+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) );
-              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );  stbir__simdfX_madd( o2, o2, r2, c3 );   stbir__simdfX_madd( o3, o3, r3, c3 );             
+              stbir__simdfX_madd( o0, o0, r0, c3 );  stbir__simdfX_madd( o1, o1, r1, c3 );  stbir__simdfX_madd( o2, o2, r2, c3 );   stbir__simdfX_madd( o3, o3, r3, c3 );
               stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
       stbIF4( stbir__simdfX_load( o0, output4 );     stbir__simdfX_load( o1, output4+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) );
-              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );  stbir__simdfX_madd( o2, o2, r2, c4 );   stbir__simdfX_madd( o3, o3, r3, c4 );             
+              stbir__simdfX_madd( o0, o0, r0, c4 );  stbir__simdfX_madd( o1, o1, r1, c4 );  stbir__simdfX_madd( o2, o2, r2, c4 );   stbir__simdfX_madd( o3, o3, r3, c4 );
               stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
       stbIF5( stbir__simdfX_load( o0, output5 );     stbir__simdfX_load( o1, output5+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count));    stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) );
-              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );  stbir__simdfX_madd( o2, o2, r2, c5 );   stbir__simdfX_madd( o3, o3, r3, c5 );             
+              stbir__simdfX_madd( o0, o0, r0, c5 );  stbir__simdfX_madd( o1, o1, r1, c5 );  stbir__simdfX_madd( o2, o2, r2, c5 );   stbir__simdfX_madd( o3, o3, r3, c5 );
               stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
       stbIF6( stbir__simdfX_load( o0, output6 );     stbir__simdfX_load( o1, output6+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) );
-              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );  stbir__simdfX_madd( o2, o2, r2, c6 );   stbir__simdfX_madd( o3, o3, r3, c6 );             
+              stbir__simdfX_madd( o0, o0, r0, c6 );  stbir__simdfX_madd( o1, o1, r1, c6 );  stbir__simdfX_madd( o2, o2, r2, c6 );   stbir__simdfX_madd( o3, o3, r3, c6 );
               stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
       stbIF7( stbir__simdfX_load( o0, output7 );     stbir__simdfX_load( o1, output7+stbir__simdfX_float_count );   stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) );    stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) );
-              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );  stbir__simdfX_madd( o2, o2, r2, c7 );   stbir__simdfX_madd( o3, o3, r3, c7 );             
+              stbir__simdfX_madd( o0, o0, r0, c7 );  stbir__simdfX_madd( o1, o1, r1, c7 );  stbir__simdfX_madd( o2, o2, r2, c7 );   stbir__simdfX_madd( o3, o3, r3, c7 );
               stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
       #else
-      stbIF0( stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );      stbir__simdfX_mult( o2, r2, c0 );       stbir__simdfX_mult( o3, r3, c0 );  
+      stbIF0( stbir__simdfX_mult( o0, r0, c0 );      stbir__simdfX_mult( o1, r1, c0 );      stbir__simdfX_mult( o2, r2, c0 );       stbir__simdfX_mult( o3, r3, c0 );
               stbir__simdfX_store( output0, o0 );    stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); )
-      stbIF1( stbir__simdfX_mult( o0, r0, c1 );      stbir__simdfX_mult( o1, r1, c1 );      stbir__simdfX_mult( o2, r2, c1 );       stbir__simdfX_mult( o3, r3, c1 );  
+      stbIF1( stbir__simdfX_mult( o0, r0, c1 );      stbir__simdfX_mult( o1, r1, c1 );      stbir__simdfX_mult( o2, r2, c1 );       stbir__simdfX_mult( o3, r3, c1 );
               stbir__simdfX_store( output1, o0 );    stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); )
-      stbIF2( stbir__simdfX_mult( o0, r0, c2 );      stbir__simdfX_mult( o1, r1, c2 );      stbir__simdfX_mult( o2, r2, c2 );       stbir__simdfX_mult( o3, r3, c2 );  
+      stbIF2( stbir__simdfX_mult( o0, r0, c2 );      stbir__simdfX_mult( o1, r1, c2 );      stbir__simdfX_mult( o2, r2, c2 );       stbir__simdfX_mult( o3, r3, c2 );
               stbir__simdfX_store( output2, o0 );    stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); )
-      stbIF3( stbir__simdfX_mult( o0, r0, c3 );      stbir__simdfX_mult( o1, r1, c3 );      stbir__simdfX_mult( o2, r2, c3 );       stbir__simdfX_mult( o3, r3, c3 );  
+      stbIF3( stbir__simdfX_mult( o0, r0, c3 );      stbir__simdfX_mult( o1, r1, c3 );      stbir__simdfX_mult( o2, r2, c3 );       stbir__simdfX_mult( o3, r3, c3 );
               stbir__simdfX_store( output3, o0 );    stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); )
-      stbIF4( stbir__simdfX_mult( o0, r0, c4 );      stbir__simdfX_mult( o1, r1, c4 );      stbir__simdfX_mult( o2, r2, c4 );       stbir__simdfX_mult( o3, r3, c4 );  
+      stbIF4( stbir__simdfX_mult( o0, r0, c4 );      stbir__simdfX_mult( o1, r1, c4 );      stbir__simdfX_mult( o2, r2, c4 );       stbir__simdfX_mult( o3, r3, c4 );
               stbir__simdfX_store( output4, o0 );    stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); )
-      stbIF5( stbir__simdfX_mult( o0, r0, c5 );      stbir__simdfX_mult( o1, r1, c5 );      stbir__simdfX_mult( o2, r2, c5 );       stbir__simdfX_mult( o3, r3, c5 );  
+      stbIF5( stbir__simdfX_mult( o0, r0, c5 );      stbir__simdfX_mult( o1, r1, c5 );      stbir__simdfX_mult( o2, r2, c5 );       stbir__simdfX_mult( o3, r3, c5 );
               stbir__simdfX_store( output5, o0 );    stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); )
-      stbIF6( stbir__simdfX_mult( o0, r0, c6 );      stbir__simdfX_mult( o1, r1, c6 );      stbir__simdfX_mult( o2, r2, c6 );       stbir__simdfX_mult( o3, r3, c6 );  
+      stbIF6( stbir__simdfX_mult( o0, r0, c6 );      stbir__simdfX_mult( o1, r1, c6 );      stbir__simdfX_mult( o2, r2, c6 );       stbir__simdfX_mult( o3, r3, c6 );
               stbir__simdfX_store( output6, o0 );    stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); )
-      stbIF7( stbir__simdfX_mult( o0, r0, c7 );      stbir__simdfX_mult( o1, r1, c7 );      stbir__simdfX_mult( o2, r2, c7 );       stbir__simdfX_mult( o3, r3, c7 );  
+      stbIF7( stbir__simdfX_mult( o0, r0, c7 );      stbir__simdfX_mult( o1, r1, c7 );      stbir__simdfX_mult( o2, r2, c7 );       stbir__simdfX_mult( o3, r3, c7 );
               stbir__simdfX_store( output7, o0 );    stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 );  stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 );   stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); )
       #endif
 
       input += (4*stbir__simdfX_float_count);
       stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); )
     }
-    while ( ( (char*)input_end - (char*) input ) >= 16 ) 
+    while ( ( (char*)input_end - (char*) input ) >= 16 )
     {
       stbir__simdf o0, r0;
       STBIR_SIMD_NO_UNROLL(output0);
@@ -9714,13 +9714,13 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
       stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) );   stbir__simdf_store( output6, o0 ); )
       stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) );   stbir__simdf_store( output7, o0 ); )
       #endif
-      
+
       input += 4;
       stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
     }
   }
   #else
-  while ( ( (char*)input_end - (char*) input ) >= 16 ) 
+  while ( ( (char*)input_end - (char*) input ) >= 16 )
   {
     float r0, r1, r2, r3;
     STBIR_NO_UNROLL(input);
@@ -9751,7 +9751,7 @@ static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** output
     stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; )
   }
   #endif
-  while ( input < input_end ) 
+  while ( input < input_end )
   {
     float r = input[0];
     STBIR_NO_UNROLL(output0);
@@ -9801,7 +9801,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 );
     return;
   }
-#endif  
+#endif
 
   #ifdef STBIR_SIMD
   {
@@ -9813,14 +9813,14 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); )
     stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); )
     stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); )
-    
-    while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) ) 
+
+    while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) )
     {
       stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3;
       STBIR_SIMD_NO_UNROLL(output);
 
       // prefetch four loop iterations ahead (doesn't affect much for small resizes, but helps with big ones)
-      stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); ) 
+      stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); )
       stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); )
       stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); )
       stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); )
@@ -9858,7 +9858,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
       stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); )
     }
 
-    while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) 
+    while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
     {
       stbir__simdf o0, r0;
       STBIR_SIMD_NO_UNROLL(output);
@@ -9882,7 +9882,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     }
   }
   #else
-  while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) 
+  while ( ( (char*)input0_end - (char*) input0 ) >= 16 )
   {
     float o0, o1, o2, o3;
     STBIR_NO_UNROLL(output);
@@ -9903,7 +9903,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; )
   }
   #endif
-  while ( input0 < input0_end ) 
+  while ( input0 < input0_end )
   {
     float o0;
     STBIR_NO_UNROLL(output);
@@ -9919,7 +9919,7 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
     stbIF5( o0 += input5[0] * c5s; )
     stbIF6( o0 += input6[0] * c6s; )
     stbIF7( o0 += input7[0] * c7s; )
-    output[0] = o0; 
+    output[0] = o0;
     ++output;
     stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; )
   }
@@ -9950,25 +9950,25 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
 #ifndef stbir__2_coeff_only
 #define stbir__2_coeff_only()             \
     stbir__1_coeff_only();                \
-    stbir__1_coeff_remnant(1);            
+    stbir__1_coeff_remnant(1);
 #endif
 
 #ifndef stbir__2_coeff_remnant
 #define stbir__2_coeff_remnant( ofs )     \
     stbir__1_coeff_remnant(ofs);          \
-    stbir__1_coeff_remnant((ofs)+1);      
+    stbir__1_coeff_remnant((ofs)+1);
 #endif
-    
+
 #ifndef stbir__3_coeff_only
 #define stbir__3_coeff_only()             \
     stbir__2_coeff_only();                \
-    stbir__1_coeff_remnant(2);            
+    stbir__1_coeff_remnant(2);
 #endif
-    
+
 #ifndef stbir__3_coeff_remnant
 #define stbir__3_coeff_remnant( ofs )     \
     stbir__2_coeff_remnant(ofs);          \
-    stbir__1_coeff_remnant((ofs)+2);      
+    stbir__1_coeff_remnant((ofs)+2);
 #endif
 
 #ifndef stbir__3_coeff_setup
@@ -9978,13 +9978,13 @@ static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp,
 #ifndef stbir__4_coeff_start
 #define stbir__4_coeff_start()            \
     stbir__2_coeff_only();                \
-    stbir__2_coeff_remnant(2);            
+    stbir__2_coeff_remnant(2);
 #endif
-    
+
 #ifndef stbir__4_coeff_continue_from_4
 #define stbir__4_coeff_continue_from_4( ofs )     \
     stbir__2_coeff_remnant(ofs);                  \
-    stbir__2_coeff_remnant((ofs)+2);      
+    stbir__2_coeff_remnant((ofs)+2);
 #endif
 
 #ifndef stbir__store_output_tiny
@@ -9996,7 +9996,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( floa
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__1_coeff_only();
     stbir__store_output_tiny();
@@ -10008,7 +10008,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__2_coeff_only();
     stbir__store_output_tiny();
@@ -10020,7 +10020,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__3_coeff_only();
     stbir__store_output_tiny();
@@ -10032,7 +10032,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__4_coeff_start();
     stbir__store_output();
@@ -10044,7 +10044,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__4_coeff_start();
     stbir__1_coeff_remnant(4);
@@ -10057,7 +10057,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__4_coeff_start();
     stbir__2_coeff_remnant(4);
@@ -10071,9 +10071,9 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( flo
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
-  
+
     stbir__4_coeff_start();
     stbir__3_coeff_remnant(4);
     stbir__store_output();
@@ -10085,7 +10085,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__4_coeff_start();
     stbir__4_coeff_continue_from_4(4);
@@ -10098,7 +10098,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( flo
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__4_coeff_start();
     stbir__4_coeff_continue_from_4(4);
@@ -10112,7 +10112,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( fl
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__4_coeff_start();
     stbir__4_coeff_continue_from_4(4);
@@ -10127,7 +10127,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( fl
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__4_coeff_start();
     stbir__4_coeff_continue_from_4(4);
@@ -10141,7 +10141,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( fl
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
     float const * hc = horizontal_coefficients;
     stbir__4_coeff_start();
     stbir__4_coeff_continue_from_4(4);
@@ -10155,8 +10155,8 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
-    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
@@ -10175,8 +10175,8 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
-    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
@@ -10186,7 +10186,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1
       stbir__4_coeff_continue_from_4( 0 );
       --n;
     } while ( n > 0 );
-    stbir__1_coeff_remnant( 4 ); 
+    stbir__1_coeff_remnant( 4 );
     stbir__store_output();
   } while ( output < output_end );
 }
@@ -10196,8 +10196,8 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2
   float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels;
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
-    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
@@ -10207,7 +10207,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2
       stbir__4_coeff_continue_from_4( 0 );
       --n;
     } while ( n > 0 );
-    stbir__2_coeff_remnant( 4 ); 
+    stbir__2_coeff_remnant( 4 );
 
     stbir__store_output();
   } while ( output < output_end );
@@ -10219,8 +10219,8 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3
   float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer;
   stbir__3_coeff_setup();
   do {
-    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; 
-    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2; 
+    float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels;
+    int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2;
     float const * hc = horizontal_coefficients;
 
     stbir__4_coeff_start();
@@ -10230,7 +10230,7 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3
       stbir__4_coeff_continue_from_4( 0 );
       --n;
     } while ( n > 0 );
-    stbir__3_coeff_remnant( 4 ); 
+    stbir__3_coeff_remnant( 4 );
 
     stbir__store_output();
   } while ( output < output_end );
@@ -10238,26 +10238,26 @@ static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3
 
 static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]=
 {
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3),
 };
 
 static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]=
 {
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs),
   STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs),
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs),
   STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs),
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs),  
-  STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs),  
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs),
+  STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs),
 };
 
 #undef STBIR__horizontal_channels
@@ -10288,38 +10288,38 @@ This software is available under 2 licenses -- choose whichever you prefer.
 ------------------------------------------------------------------------------
 ALTERNATIVE A - MIT License
 Copyright (c) 2017 Sean Barrett
-Permission is hereby granted, free of charge, to any person obtaining a copy of 
-this software and associated documentation files (the "Software"), to deal in 
-the Software without restriction, including without limitation the rights to 
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
-of the Software, and to permit persons to whom the Software is furnished to do 
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
 so, subject to the following conditions:
-The above copyright notice and this permission notice shall be included in all 
+The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 ------------------------------------------------------------------------------
 ALTERNATIVE B - Public Domain (www.unlicense.org)
 This is free and unencumbered software released into the public domain.
-Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
-software, either in source code form or as a compiled binary, for any purpose, 
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
 commercial or non-commercial, and by any means.
-In jurisdictions that recognize copyright laws, the author or authors of this 
-software dedicate any and all copyright interest in the software to the public 
-domain. We make this dedication for the benefit of the public at large and to 
-the detriment of our heirs and successors. We intend this dedication to be an 
-overt act of relinquishment in perpetuity of all present and future rights to 
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
 this software under copyright law.
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
-AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ------------------------------------------------------------------------------
 */

From 0bc88af4de5fb022db643c2d8e549a0927749354 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Thu, 14 Dec 2023 03:12:50 -0800
Subject: [PATCH 160/170] stb_image: optimizations

---
 stb_image.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stb_image.h b/stb_image.h
index a6d33fb..a632d54 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -1,4 +1,4 @@
-/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.29 - public domain image loader - http://nothings.org/stb
                                   no warranty implied; use at your own risk
 
    Do this:
@@ -48,6 +48,7 @@ LICENSE
 
 RECENT REVISION HISTORY:
 
+      2.29  (2023-05-xx) optimizations
       2.28  (2023-01-29) many error fixes, security errors, just tons of stuff
       2.27  (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes
       2.26  (2020-07-13) many minor fixes

From a8a25e17b5fddf4976376f0e2024ffc824907679 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Thu, 14 Dec 2023 03:11:33 -0800
Subject: [PATCH 161/170] update readme version numbers

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 48fb917..d13eebf 100644
--- a/README.md
+++ b/README.md
@@ -24,10 +24,10 @@ library    | lastest version | category | LoC | description
 --------------------- | ---- | -------- | --- | --------------------------------
 **[stb_vorbis.c](stb_vorbis.c)** | 1.22 | audio | 5584 | decode ogg vorbis files from file/memory to float/16-bit signed output
 **[stb_hexwave.h](stb_hexwave.h)** | 0.5 | audio | 680 | audio waveform synthesizer
-**[stb_image.h](stb_image.h)** | 2.28 | graphics | 7987 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
+**[stb_image.h](stb_image.h)** | 2.29 | graphics | 7985 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 **[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
-**[stb_image_resize2.h](stb_image_resize2.h)** | 2.01 | graphics | 10303 | resize images larger/smaller with good quality
+**[stb_image_resize2.h](stb_image_resize2.h)** | 2.04 | graphics | 10325 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
 **[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds
 **[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
@@ -45,7 +45,7 @@ library    | lastest version | category | LoC | description
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
 Total libraries: 21
-Total lines of C code: 50786
+Total lines of C code: 50806
 
 
 FAQ

From f4a71b13373436a2866c5d68f8f80ac6f0bc1ffe Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Thu, 14 Dec 2023 10:59:23 -0800
Subject: [PATCH 162/170] README.md: tweak credits

---
 README.md              | 4 ++--
 tools/README.header.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index d13eebf..90f6d38 100644
--- a/README.md
+++ b/README.md
@@ -15,8 +15,8 @@ Noteworthy:
 * font text rasterizer: [stb_truetype.h](stb_truetype.h)
 * typesafe containers: [stb_ds.h](stb_ds.h)
 
-Most libraries by stb, except: stb_dxt by Fabian "ryg" Giesen, stb_image_resize
-by Jorge L. "VinoBS" Rodriguez, and stb_sprintf by Jeff Roberts.
+Most libraries by stb, except: stb_dxt by Fabian "ryg" Giesen, original stb_image_resize
+by Jorge L. "VinoBS" Rodriguez, and stb_image_resize2 and stb_sprintf by Jeff Roberts.
 
 <a name="stb_libs"></a>
 
diff --git a/tools/README.header.md b/tools/README.header.md
index 0050967..65e0408 100644
--- a/tools/README.header.md
+++ b/tools/README.header.md
@@ -13,8 +13,8 @@ Noteworthy:
 * font text rasterizer: [stb_truetype.h](stb_truetype.h)
 * typesafe containers: [stb_ds.h](stb_ds.h)
 
-Most libraries by stb, except: stb_dxt by Fabian "ryg" Giesen, stb_image_resize
-by Jorge L. "VinoBS" Rodriguez, and stb_sprintf by Jeff Roberts.
+Most libraries by stb, except: stb_dxt by Fabian "ryg" Giesen, original stb_image_resize
+by Jorge L. "VinoBS" Rodriguez, and stb_image_resize2 and stb_sprintf by Jeff Roberts.
 
 <a name="stb_libs"></a>
 

From 7a075fe7c79d88ef0150e03fe6e20c828782c0ab Mon Sep 17 00:00:00 2001
From: "Jeff Roberts (Bellevue)" <jeffr@radgametools.com>
Date: Sun, 4 Feb 2024 14:42:51 -0800
Subject: [PATCH 163/170] Fix 2 pixel to 1 pixel with wrap Fix output buffer
 for output callback

---
 stb_image_resize2.h | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index faf1b08..2c693c5 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -1,4 +1,4 @@
-/* stb_image_resize2 - v2.04 - public domain image resizing
+/* stb_image_resize2 - v2.05 - public domain image resizing
 
    by Jeff Roberts (v2) and Jorge L Rodriguez
    http://github.com/nothings/stb
@@ -324,11 +324,13 @@
       Fabian Giesen: half float and srgb converters
       Sean Barrett: API design, optimizations
       Jorge L Rodriguez: Original 1.0 implementation
-      Aras Pranckevicius: bugfixes for 1.0
+      Aras Pranckevicius: bugfixes
       Nathan Reed: warning fixes for 1.0
 
    REVISIONS
-      2.04 (2023-11-17) Fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
+      2.05 (2024-02-24) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras)
+                        fix for output callback (thanks Julien Koenen)
+      2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
       2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
       2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc
                           (2x-5x faster without simd, 4x-12x faster with simd)
@@ -5958,7 +5960,7 @@ static void stbir__encode_scanline( stbir__info const * stbir_info, void *output
 
   // if we have an output callback, call it to send the data
   if ( stbir_info->out_pixels_cb )
-    stbir_info->out_pixels_cb( output_buffer_data, num_pixels, row, stbir_info->user_data );
+    stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data );
 }
 
 
@@ -6352,15 +6354,24 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
   // pre calculate stuff based on the above
   samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
 
-  if ( edge == STBIR_EDGE_WRAP )
-    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 2 ) )  // this can only happen when shrinking to a single pixel
-      samp->filter_pixel_width = scale_info->input_full_size * 2;
+  // don't double reflect on edges (happens on 2 pixel to 1 pixel output)
+  if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
+    samp->filter_pixel_width = scale_info->input_full_size * 3;
 
   // This is how much to expand buffers to account for filters seeking outside
   // the image boundaries.
   samp->filter_pixel_margin = samp->filter_pixel_width / 2;
+  
+  // again, no double reflect
+  if ( samp->filter_pixel_margin > scale_info->input_full_size )
+    samp->filter_pixel_margin = scale_info->input_full_size;
 
   samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
+
+  // again, no double reflect
+  if ( samp->num_contributors > ( scale_info->input_full_size * 3 ) )
+    samp->num_contributors = scale_info->input_full_size * 3;
+
   samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
   samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding
 

From 7f7e3469cf21fa01b3dccc35649c0d7df6df73c5 Mon Sep 17 00:00:00 2001
From: "Jeff Roberts (Bellevue)" <jeffr@radgametools.com>
Date: Thu, 8 Feb 2024 10:36:54 -0800
Subject: [PATCH 164/170] clean up comments

---
 stb_image_resize2.h | 48 +++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index 2c693c5..ac2f065 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -3202,8 +3202,8 @@ static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel
 
   if ( edge == STBIR_EDGE_WRAP )
   {
-    if ( first <= -input_size )
-      first = -(input_size-1);
+    if ( first < -input_size )
+      first = -input_size;
     if ( last >= (input_size*2))
       last = (input_size*2) - 1;
   }
@@ -3562,7 +3562,7 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
   filter_info->widest = widest;
 }
 
-static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row_width )
+static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) 
 {
   #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
   #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; }
@@ -3571,6 +3571,9 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
   #else
   #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
   #endif
+  
+  int row_end = row1 + 1;
+
   if ( coefficient_width != widest )
   {
     float * pc = coefficents;
@@ -3712,10 +3715,10 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
     float * coeffs = coefficents + widest * ( num_contributors - 1 );
 
     // go until no chance of clipping (this is usually less than 8 lops)
-    while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_width ) )
+    while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) )
     {
       // might we clip??
-      if ( ( contribs->n0 + widest ) > row_width )
+      if ( ( contribs->n0 + widest ) > row_end )
       {
         int stop_range = widest;
 
@@ -3734,15 +3737,15 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
         }
 
         // now see if we still clip with the refined range
-        if ( ( contribs->n0 + stop_range ) > row_width )
+        if ( ( contribs->n0 + stop_range ) > row_end )
         {
-          int new_n0 = row_width - stop_range;
+          int new_n0 = row_end - stop_range;
           int num = contribs->n1 - contribs->n0 + 1;
           int backup = contribs->n0 - new_n0;
           float * from_co = coeffs + num - 1;
           float * to_co = from_co + backup;
 
-          STBIR_ASSERT( ( new_n0 >= 0 ) && ( new_n0 < contribs->n0 ) );
+          STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) );
 
           // move the coeffs over
           while( num )
@@ -6354,24 +6357,31 @@ static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir
   // pre calculate stuff based on the above
   samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data);
 
-  // don't double reflect on edges (happens on 2 pixel to 1 pixel output)
-  if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
-    samp->filter_pixel_width = scale_info->input_full_size * 3;
+  // filter_pixel_width is the conservative size in pixels of input that affect an output pixel.
+  //   In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the 
+  //   filter will extend before or after the scanline beyond just one extra entire copy of the 
+  //   scanline (we would hit the edge twice). We don't let you do that, so we clamp the total 
+  //   width to 3x the total of input pixel (once for the scanline, once for the left side 
+  //   overhang, and once for the right side). We only do this for edge mode, since the other 
+  //   modes can just re-edge clamp back in again.
+  if ( edge == STBIR_EDGE_WRAP )
+    if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) )
+      samp->filter_pixel_width = scale_info->input_full_size * 3;
 
   // This is how much to expand buffers to account for filters seeking outside
   // the image boundaries.
   samp->filter_pixel_margin = samp->filter_pixel_width / 2;
   
-  // again, no double reflect
-  if ( samp->filter_pixel_margin > scale_info->input_full_size )
-    samp->filter_pixel_margin = scale_info->input_full_size;
+  // filter_pixel_margin is the amount that this filter can overhang on just one side of either 
+  //   end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's 
+  //   worth of pixels, we clamp this one side of overhang to the input scanline size. Again, 
+  //   this clamping only happens in rare cases with the default filters (2 pix to 1 pix). 
+  if ( edge == STBIR_EDGE_WRAP )
+    if ( samp->filter_pixel_margin > scale_info->input_full_size )
+      samp->filter_pixel_margin = scale_info->input_full_size;
 
   samp->num_contributors = stbir__get_contributors(samp, samp->is_gather);
 
-  // again, no double reflect
-  if ( samp->num_contributors > ( scale_info->input_full_size * 3 ) )
-    samp->num_contributors = scale_info->input_full_size * 3;
-
   samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors);
   samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float); // extra sizeof(float) is padding
 
@@ -7007,7 +7017,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
       stbir__get_extents( horizontal, &info->scanline_extents );
 
       // pack the horizontal coeffs
-      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n1 + 1 );
+      horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 );
 
       STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) );
 

From c59da6729e0b80af931548dda43c9ea964613315 Mon Sep 17 00:00:00 2001
From: "Jeff Roberts (Bellevue)" <jeffr@radgametools.com>
Date: Thu, 8 Feb 2024 12:53:51 -0800
Subject: [PATCH 165/170] Mark row0 as unused

---
 stb_image_resize2.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index ac2f065..4a8bcb2 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -3571,7 +3571,8 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
   #else
   #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
   #endif
-  
+  STBIR__UNUSED( row0 ); // only used in an assert
+
   int row_end = row1 + 1;
 
   if ( coefficient_width != widest )

From b7cf1246284b49dfe7f1288e6f739b7a3a9d966b Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Thu, 8 Feb 2024 13:24:06 -0800
Subject: [PATCH 166/170] stb_image: fix VC6

---
 stb_image_resize2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index 4a8bcb2..35a334d 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -3571,9 +3571,9 @@ static int stbir__pack_coefficients( int num_contributors, stbir__contributors*
   #else
   #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; }
   #endif
-  STBIR__UNUSED( row0 ); // only used in an assert
 
   int row_end = row1 + 1;
+  STBIR__UNUSED( row0 ); // only used in an assert
 
   if ( coefficient_width != widest )
   {

From 1828f357dc8f18256618d32a1d3aa8dba5cd24df Mon Sep 17 00:00:00 2001
From: "Jeff Roberts (Bellevue)" <jeffr@radgametools.com>
Date: Mon, 12 Feb 2024 22:10:02 -0800
Subject: [PATCH 167/170] Fix bug in coeff generation on more than 3x
 downsamples with width and height scale equal

---
 stb_image_resize2.h | 60 +++++++++++++++++++++++++++++----------------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index 35a334d..19241c1 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -1,4 +1,4 @@
-/* stb_image_resize2 - v2.05 - public domain image resizing
+/* stb_image_resize2 - v2.06 - public domain image resizing
 
    by Jeff Roberts (v2) and Jorge L Rodriguez
    http://github.com/nothings/stb
@@ -328,7 +328,9 @@
       Nathan Reed: warning fixes for 1.0
 
    REVISIONS
-      2.05 (2024-02-24) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras)
+      2.06 (2024-02-10) fix for indentical width/height 3x or more down-scaling 
+                          undersampling a single row on rare resize ratios (about 1%)
+      2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras)
                         fix for output callback (thanks Julien Koenen)
       2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic).
       2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks.
@@ -3393,6 +3395,12 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int
     }
   }
 }
+#define STBIR_RENORMALIZE_IN_FLOAT
+#ifdef STBIR_RENORMALIZE_IN_FLOAT
+#define STBIR_RENORM_TYPE float
+#else
+#define STBIR_RENORM_TYPE double
+#endif
 
 static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width )
 {
@@ -3415,14 +3423,14 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
   for (n = 0; n < end; n++)
   {
     int i;
-    float filter_scale, total_filter = 0;
+    STBIR_RENORM_TYPE filter_scale, total_filter = 0;
     int e;
 
     // add all contribs
     e = contribs->n1 - contribs->n0;
     for( i = 0 ; i <= e ; i++ )
     {
-      total_filter += coeffs[i];
+      total_filter += (STBIR_RENORM_TYPE) coeffs[i];
       STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f )  ); // check for wonky weights
     }
 
@@ -3438,10 +3446,11 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
       // if the total isn't 1.0, rescale everything
       if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) )
       {
-        filter_scale = 1.0f / total_filter;
+        filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter;
+
         // scale them all
         for (i = 0; i <= e; i++)
-          coeffs[i] *= filter_scale;
+          coeffs[i] = (float) ( coeffs[i] * filter_scale );
       }
     }
     ++contribs;
@@ -3562,6 +3571,8 @@ static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter
   filter_info->widest = widest;
 }
 
+#undef STBIR_RENORM_TYPE 
+
 static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) 
 {
   #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; }
@@ -3869,26 +3880,33 @@ static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * ot
           for (k = gn0 ; k <= gn1 ; k++ )
           {
             float gc = *g_coeffs++;
-            if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
+            
+            // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width
+            //   (which happens when pivoting from horizontal, which might have dummy zeros)
+            if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) )
             {
+              if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) )
               {
-                // if we are skipping over several contributors, we need to clear the skipped ones
-                stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
-                while ( clear_contributors < scatter_contributors )
                 {
-                  clear_contributors->n0 = 0;
-                  clear_contributors->n1 = -1;
-                  ++clear_contributors;
+                  // if we are skipping over several contributors, we need to clear the skipped ones
+                  stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1);
+                  while ( clear_contributors < scatter_contributors )
+                  {
+                    clear_contributors->n0 = 0;
+                    clear_contributors->n1 = -1;
+                    ++clear_contributors;
+                  }
                 }
+                scatter_contributors->n0 = n;
+                scatter_contributors->n1 = n;
+                scatter_coeffs[0]  = gc;
+                highest_set = k;
               }
-              scatter_contributors->n0 = n;
-              scatter_contributors->n1 = n;
-              scatter_coeffs[0]  = gc;
-              highest_set = k;
-            }
-            else
-            {
-              stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
+              else
+              {
+                stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc );
+              }
+              STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width );
             }
             ++scatter_contributors;
             scatter_coeffs += scatter_coefficient_width;

From 2fb057af65b0951d063eafd3e052b06e8b4b365e Mon Sep 17 00:00:00 2001
From: "Jeff Roberts (Bellevue)" <jeffr@radgametools.com>
Date: Mon, 12 Feb 2024 22:14:27 -0800
Subject: [PATCH 168/170] remove test

---
 stb_image_resize2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index 19241c1..1cd379a 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -3395,7 +3395,7 @@ static void stbir__calculate_coefficients_for_gather_downsample( int start, int
     }
   }
 }
-#define STBIR_RENORMALIZE_IN_FLOAT
+
 #ifdef STBIR_RENORMALIZE_IN_FLOAT
 #define STBIR_RENORM_TYPE float
 #else

From 43201e7788f727bb307eb80cd8bd03c818d2c12d Mon Sep 17 00:00:00 2001
From: "Jeff Roberts (Bellevue)" <jeffr@radgametools.com>
Date: Fri, 24 May 2024 18:48:38 -0700
Subject: [PATCH 169/170] image resize 2.07

---
 stb_image_resize2.h | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/stb_image_resize2.h b/stb_image_resize2.h
index 1cd379a..f48c509 100644
--- a/stb_image_resize2.h
+++ b/stb_image_resize2.h
@@ -1,4 +1,4 @@
-/* stb_image_resize2 - v2.06 - public domain image resizing
+/* stb_image_resize2 - v2.07 - public domain image resizing
 
    by Jeff Roberts (v2) and Jorge L Rodriguez
    http://github.com/nothings/stb
@@ -328,7 +328,11 @@
       Nathan Reed: warning fixes for 1.0
 
    REVISIONS
-      2.06 (2024-02-10) fix for indentical width/height 3x or more down-scaling 
+      2.07 (2024-05-24) fix for slow final split during threaded conversions of very 
+                          wide scanlines when downsampling (caused by extra input 
+                          converting), fix for wide scanline resamples with many 
+                          splits (int overflow), fix GCC warning.
+      2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling 
                           undersampling a single row on rare resize ratios (about 1%)
       2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras)
                         fix for output callback (thanks Julien Koenen)
@@ -1068,7 +1072,7 @@ struct stbir__info
   stbir__alpha_unweight_func * alpha_unweight;
   stbir__encode_pixels_func * encode_pixels;
 
-  int alloced_total;
+  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
   int splits; // count of splits
 
   stbir_internal_pixel_layout input_pixel_layout_internal;
@@ -1079,7 +1083,7 @@ struct stbir__info
   int vertical_first;
   int channels;
   int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3)
-  int alloc_ring_buffer_num_entries;    // Number of entries in the ring buffer that will be allocated
+  size_t alloced_total;
 };
 
 
@@ -2513,8 +2517,8 @@ static const STBIR__SIMDI_CONST(STBIR_topscale,      0x02000000);
 
 #ifdef STBIR_MEMCPY
 #undef STBIR_MEMCPY
-#define STBIR_MEMCPY stbir_simd_memcpy
 #endif
+#define STBIR_MEMCPY stbir_simd_memcpy
 
 // override normal use of memcpy with much simpler copy (faster and smaller with our sized copies)
 static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
@@ -2546,7 +2550,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
       // do one unaligned to get us aligned for the stream out below
       stbir__simdf_load( x, ( d + ofs_to_src ) );
       stbir__simdf_store( d, x );
-      d = (char*)( ( ( (ptrdiff_t)d ) + 16 ) & ~15 );
+      d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 );
 
       for(;;)
       {
@@ -2578,7 +2582,7 @@ static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes )
     stbir__simdfX_store( d +  4*stbir__simdfX_float_count, x1 );
     stbir__simdfX_store( d +  8*stbir__simdfX_float_count, x2 );
     stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 );
-    d = (char*)( ( ( (ptrdiff_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
+    d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) );
 
     for(;;)
     {
@@ -4399,7 +4403,7 @@ static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float
   stbir_edge edge_horizontal = stbir_info->horizontal.edge;
   stbir_edge edge_vertical = stbir_info->vertical.edge;
   int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size);
-  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (ptrdiff_t)row * (ptrdiff_t) stbir_info->input_stride_bytes;
+  const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes;
   stbir__span const * spans = stbir_info->scanline_extents.spans;
   float* full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels;
 
@@ -6052,7 +6056,7 @@ static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbi
     stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
   }
 
-  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((ptrdiff_t)n * (ptrdiff_t)stbir_info->output_stride_bytes),
+  stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes),
                           encode_buffer, n  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 }
 
@@ -6093,7 +6097,7 @@ static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__
 
   // initialize the ring buffer for gathering
   split_info->ring_buffer_begin_index = 0;
-  split_info->ring_buffer_first_scanline = stbir_info->vertical.extent_info.lowest;
+  split_info->ring_buffer_first_scanline = vertical_contributors->n0;
   split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty"
 
   for (y = start_output_y; y < end_output_y; y++)
@@ -6147,7 +6151,7 @@ static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_
   float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index );
 
   // dump the scanline out
-  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // mark it as empty
   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
@@ -6168,7 +6172,7 @@ static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(st
   stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // dump the scanline out
-  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (ptrdiff_t)split_info->ring_buffer_first_scanline * (ptrdiff_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
+  stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline  STBIR_ONLY_PROFILE_SET_SPLIT_INFO );
 
   // mark it as empty
   ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER;
@@ -6765,7 +6769,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
 
   stbir__info * info = 0;
   void * alloced = 0;
-  int alloced_total = 0;
+  size_t alloced_total = 0;
   int vertical_first;
   int decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size, alloc_ring_buffer_num_entries;
 
@@ -7108,7 +7112,7 @@ static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sample
     // is this the first time through loop?
     if ( info == 0 )
     {
-      alloced_total = (int) ( 15 + (size_t)advance_mem );
+      alloced_total = ( 15 + (size_t)advance_mem );
       alloced = STBIR_MALLOC( alloced_total, user_data );
       if ( alloced == 0 )
         return 0;
@@ -7225,7 +7229,7 @@ static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * r
     info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type];
 
   // calc offset
-  info->output_data = ( (char*) resize->output_pixels ) + ( (ptrdiff_t) info->offset_y * (ptrdiff_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
+  info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] );
 
   info->in_pixels_cb = resize->input_cb;
   info->user_data = resize->user_data;
@@ -7797,7 +7801,7 @@ static int stbir__check_output_stuff( void ** ret_ptr, int * ret_pitch, void * o
   if ( output_stride_in_bytes < pitch )
     return 0;
 
-  size = output_stride_in_bytes * output_h;
+  size = (size_t)output_stride_in_bytes * (size_t)output_h;
   if ( size == 0 )
     return 0;
 
@@ -8569,7 +8573,6 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
@@ -8588,7 +8591,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int w
       stbir__min_max_shift20( i2, f2 );
       stbir__min_max_shift20( i3, f3 );
 
-      stbir__simdi_table_lookup4( i0, i1, i2, i3, to_srgb );
+      stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i1, f1 );
@@ -8670,7 +8673,6 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
@@ -8689,7 +8691,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * o
       stbir__min_max_shift20( i2, f2 );
       stbir__scale_and_convert( i3, f3 );
 
-      stbir__simdi_table_lookup3( i0, i1, i2, to_srgb );
+      stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i1, f1 );
@@ -8761,7 +8763,6 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
   unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels;
 
   #ifdef STBIR_SIMD
-  stbir_uint32 const * to_srgb = fp32_to_srgb8_tab4 - (127-13)*8;
 
   if ( width_times_channels >= 16 )
   {
@@ -8780,7 +8781,7 @@ static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * o
       stbir__min_max_shift20( i2, f2 );
       stbir__scale_and_convert( i3, f3 );
 
-      stbir__simdi_table_lookup2( i0, i2, to_srgb );
+      stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) );
 
       stbir__linear_to_srgb_finish( i0, f0 );
       stbir__linear_to_srgb_finish( i2, f2 );

From 449758bd74ce14b7d0ba9b24a3dbc4386702a0e0 Mon Sep 17 00:00:00 2001
From: Sean Barrett <seanb@radgametools.com>
Date: Sat, 25 May 2024 10:56:08 -0700
Subject: [PATCH 170/170] update stb_image_resize2.h

---
 README.md     | 4 ++--
 tests/stb.dsp | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 90f6d38..cba7a52 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ library    | lastest version | category | LoC | description
 **[stb_image.h](stb_image.h)** | 2.29 | graphics | 7985 | image loading/decoding from file/memory: JPG, PNG, TGA, BMP, PSD, GIF, HDR, PIC
 **[stb_truetype.h](stb_truetype.h)** | 1.26 | graphics | 5077 | parse, decode, and rasterize characters from truetype fonts
 **[stb_image_write.h](stb_image_write.h)** | 1.16 | graphics | 1724 | image writing to disk: PNG, TGA, BMP
-**[stb_image_resize2.h](stb_image_resize2.h)** | 2.04 | graphics | 10325 | resize images larger/smaller with good quality
+**[stb_image_resize2.h](stb_image_resize2.h)** | 2.07 | graphics | 10366 | resize images larger/smaller with good quality
 **[stb_rect_pack.h](stb_rect_pack.h)** | 1.01 | graphics | 623 | simple 2D rectangle packer with decent quality
 **[stb_perlin.h](stb_perlin.h)** | 0.5 | graphics | 428 | perlin's revised simplex noise w/ different seeds
 **[stb_ds.h](stb_ds.h)** | 0.67 | utility | 1895 | typesafe dynamic array and hash tables for C, will compile in C++
@@ -45,7 +45,7 @@ library    | lastest version | category | LoC | description
 **[stb_include.h](stb_include.h)** | 0.02 | misc | 295 | implement recursive #include support, particularly for GLSL
 
 Total libraries: 21
-Total lines of C code: 50806
+Total lines of C code: 50847
 
 
 FAQ
diff --git a/tests/stb.dsp b/tests/stb.dsp
index 849b95b..b7039c7 100644
--- a/tests/stb.dsp
+++ b/tests/stb.dsp
@@ -130,6 +130,10 @@ SOURCE=..\stb_image.h
 # End Source File
 # Begin Source File
 
+SOURCE=..\stb_image_resize2.h
+# End Source File
+# Begin Source File
+
 SOURCE=..\stb_image_write.h
 # End Source File
 # Begin Source File