diff --git a/stb_image.h b/stb_image.h
index 1831028..ef01a26 100644
--- a/stb_image.h
+++ b/stb_image.h
@@ -81,7 +81,8 @@
 
 // Limitations:
 //    - no jpeg progressive support
-//    - non-HDR formats support 8-bit samples only (jpeg, png)
+//    - no 16-bit-per-channel PNG
+//    - no 12-bit-per-channel jpeg
 //    - no delayed line count (jpeg) -- IJG doesn't support either
 //    - no 1-bit BMP
 //    - GIF always returns *comp=4
@@ -196,8 +197,12 @@
 //
 // SIMD support
 //
-// The JPEG decoder will automatically use SIMD kernels where supported,
-// replacing the STBI_SIMD-do-it-yourself interface from previous versions.
+// The JPEG decoder will automatically use SIMD kernels on x86 platforms
+// where supported.
+//
+// (The old do-it-yourself SIMD API is no longer supported in the current
+// code.)
+//
 // The code will automatically detect if the required SIMD instructions are
 // available, and fall back to the generic C version where they're not.
 //
@@ -402,16 +407,35 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #include <emmintrin.h>
 
 #ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
 #include <intrin.h> // __cpuid
+static int stbi__cpuid3(void) // returns element 3 of the __cpuid(info,1) result
+{
+   int info[4]; // output array filled by the __cpuid intrinsic
+   __cpuid(info,1); // query CPUID function 1 through the <intrin.h> intrinsic
+   return info[3]; // the same word the inline-asm variant of this function reads from edx
+}
+#else
+static int stbi__cpuid3(void) // _MSC_VER < 1400 path: runs cpuid with eax=1 via inline asm instead of the intrinsic
+{
+   int res; // receives edx after the cpuid instruction below
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res; // edx value captured by the asm block
+}
+#endif
+
 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
 
 static int stbi__sse2_available()
 {
-   int info[4];
-   __cpuid(info, 1);
-   return ((info[3] >> 26) & 1) != 0;
+   int info3 = stbi__cpuid3(); // CPUID function-1 word, obtained through the compiler-specific helper
+   return ((info3 >> 26) & 1) != 0; // bit 26 of that word is what this function reports as SSE2 availability
 }
-
 #else // assume GCC-style if not VC++
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
@@ -2117,6 +2141,35 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
    }
 }
 
+#define float2fixed2(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8) // x scaled by 4096 and rounded, then shifted left 8 (20 fractional bits total)
+
+static void stbi__YCbCr_to_RGB_backport(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) // scalar YCbCr->RGB for count pixels; writes 4 bytes per pixel and advances out by step
+{
+   int i;
+   for (i=0; i < count; ++i) {
+      int y_fixed = (y[i] << 20) + (1<<19); // luma in 20-bit fixed point, plus one half for rounding
+      int r,g,b;
+      int cr = pcr[i] - 128; // re-center chroma samples around zero
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr*float2fixed2(1.40200f);
+      g = y_fixed;
+      g += (cr*-float2fixed2(0.71414f)) & 0xffff0000; // each green term is masked to its upper 16 bits before it is added
+      g += (cb*-float2fixed2(0.34414f)) & 0xffff0000;
+      b = y_fixed + cb*float2fixed2(1.77200f);
+      r >>= 20; // drop the 20 fractional bits
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } // clamp to 0..255; the unsigned compare catches negatives as well as overflow
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255; // fourth byte is written unconditionally, regardless of step
+      out += step; // caller-supplied byte stride between output pixels
+   }
+}
+
 #ifdef STBI_SSE2
 static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
 {
@@ -2130,37 +2183,35 @@ static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc cons
    if (step == 4) {
       // this is a fairly straightforward implementation and not super-optimized.
       __m128i signflip = _mm_set1_epi8(-0x80);
-      __m128i cr_const0 = _mm_set1_epi16((short) ( 1.40200f*4096.0f));
-      __m128i cr_const1 = _mm_set1_epi16((short) (-0.71414f*4096.0f));
-      __m128i cb_const0 = _mm_set1_epi16((short) (-0.34414f*4096.0f));
-      __m128i cb_const1 = _mm_set1_epi16((short) ( 1.77200f*4096.0f));
-      __m128i y_bias = _mm_set1_epi16(8);
-      __m128i xw = _mm_set1_epi16(255);
+      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
+      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
+      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
+      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
+      __m128i y_bias = _mm_set1_epi8((char) 128);
+      __m128i xw = _mm_set1_epi16(255); // alpha channel
 
       for (; i+7 < count; i += 8) {
          // load
-         __m128i zero = _mm_setzero_si128();
          __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
          __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
          __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
-         __m128i cr_bias = _mm_xor_si128(cr_bytes, signflip); // -128
-         __m128i cb_bias = _mm_xor_si128(cb_bytes, signflip); // -128
+         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
 
          // unpack to short (and left-shift cr, cb by 8)
-         __m128i yw  = _mm_unpacklo_epi8(y_bytes, zero);
-         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_bias);
-         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_bias);
+         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
+         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
 
          // color transform
-         __m128i yws = _mm_slli_epi16(yw, 4);
+         __m128i yws = _mm_srli_epi16(yw, 4);
          __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
          __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
-         __m128i ywb = _mm_add_epi16(yws, y_bias);
          __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
          __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
-         __m128i rws = _mm_add_epi16(cr0, ywb);
-         __m128i gwt = _mm_add_epi16(cb0, ywb);
-         __m128i bws = _mm_add_epi16(ywb, cb1);
+         __m128i rws = _mm_add_epi16(cr0, yws);
+         __m128i gwt = _mm_add_epi16(cb0, yws);
+         __m128i bws = _mm_add_epi16(yws, cb1);
          __m128i gws = _mm_add_epi16(gwt, cr1);
 
          // descale
diff --git a/tests/image_test.c b/tests/image_test.c
index 5971704..b3c8e1d 100644
--- a/tests/image_test.c
+++ b/tests/image_test.c
@@ -9,9 +9,48 @@
 
 #define PNGSUITE_PRIMARY
 
+#if 1
+void test_ycbcr(void) // exhaustively compares stbi__YCbCr_to_RGB_sse2 against stbi__YCbCr_to_RGB_backport and prints mismatch counts
+{
+   STBI_SIMD_ALIGN(unsigned char, y[256]); // input planes, one 256-sample row each
+   STBI_SIMD_ALIGN(unsigned char, cb[256]);
+   STBI_SIMD_ALIGN(unsigned char, cr[256]);
+   STBI_SIMD_ALIGN(unsigned char, out1[256][4]); // output of the SSE2 routine
+   STBI_SIMD_ALIGN(unsigned char, out2[256][4]); // output of the scalar backport routine
+
+   int i,j,k;
+   int count = 0, bigcount=0; // count: pixels where any of bytes 0..2 differ; bigcount: those where byte 1 differs
+
+   for (i=0; i < 256; ++i) { // i supplies the cr value
+      for (j=0; j < 256; ++j) { // j supplies the cb value
+         for (k=0; k < 256; ++k) {
+            y[k] = k; // every row sweeps all 256 y values for one fixed (cb,cr) pair
+            cb[k] = j;
+            cr[k] = i;
+         }
+         stbi__YCbCr_to_RGB_sse2(out1[0], y, cb, cr, 256, 4);
+         stbi__YCbCr_to_RGB_backport(out2[0], y, cb, cr, 256, 4);
+         for (k=0; k < 256; ++k) {
+            if (out1[k][0] != out2[k][0] || out1[k][1] != out2[k][1] || out1[k][2] != out2[k][2]) { // byte 3 is not compared
+               int dist1 = abs(out1[k][0] - out2[k][0]); // NOTE(review): dist1 is never read in this function
+               int dist2 = abs(out1[k][1] - out2[k][1]);
+               int dist3 = abs(out1[k][2] - out2[k][2]); // NOTE(review): dist3 is never read in this function
+               ++count;
+               if (dist2) // only a difference in byte 1 is tallied as "big"
+                  ++bigcount;
+            }
+         }
+      }
+      printf("So far: %d (%d big)\n", count, bigcount); // running totals after each cr value
+   }
+   printf("Final: %d (%d big)\n", count, bigcount);
+}
+#endif
+
 int main(int argc, char **argv)
 {
    int w,h;
+   test_ycbcr();
    if (argc > 1) {
       int i;
       for (i=1; i < argc; ++i) {