mirror of
https://github.com/nothings/stb
synced 2025-01-05 22:34:23 +03:00
stb_image: Add SSE2 IDCT for JPEG decoder.
Also add SSE2 detection for MSVC++. Detection on GCC will follow later.
This commit is contained in:
parent
fb2c841bb8
commit
e5db25f637
201
stb_image.h
201
stb_image.h
@ -422,7 +422,16 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
|
||||
#include <emmintrin.h>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h> // __cpuid
|
||||
#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
|
||||
|
||||
static int stbi__sse2_available()
|
||||
{
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
return ((info[3] >> 26) & 1) != 0;
|
||||
}
|
||||
|
||||
#else // assume GCC-style if not VC++
|
||||
#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
|
||||
#endif
|
||||
@ -1311,7 +1320,7 @@ stbi_inline static stbi_uc stbi__clamp(int x)
|
||||
return (stbi_uc) x;
|
||||
}
|
||||
|
||||
#define stbi__f2f(x) (int) (((x) * 4096 + 0.5))
|
||||
#define stbi__f2f(x) ((int) (((x) * 4096 + 0.5)))
|
||||
#define stbi__fsh(x) ((x) << 12)
|
||||
|
||||
// derived from jidctint -- DCT_ISLOW
|
||||
@ -1421,17 +1430,191 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef STBI_SIMD
|
||||
static void stbi__idct_block_wrapper(stbi_uc *out, int out_stride, short data[64], unsigned short dequant[64])
|
||||
#ifdef STBI_SSE2
|
||||
|
||||
// sse2 integer IDCT. not the fastest possible implementation but it
|
||||
// produces bit-identical results to the generic C version so it's
|
||||
// fully "transparent".
|
||||
static void stbi__idct_sse2(stbi_uc *out, int out_stride, short data[64])
|
||||
{
|
||||
stbi__idct_block(out, out_stride, data);
|
||||
// This is constructed to match our regular (generic) integer IDCT exactly.
|
||||
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
|
||||
__m128i tmp;
|
||||
|
||||
// dot product constant: even elems=x, odd elems=y
|
||||
#define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
|
||||
|
||||
// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
|
||||
// out(1) = c1[even]*x + c1[odd]*y
|
||||
#define dct_rot(out0,out1, x,y,c0,c1) \
|
||||
__m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
|
||||
__m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
|
||||
__m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
|
||||
__m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
|
||||
__m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
|
||||
__m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
|
||||
|
||||
// out = in << 12 (in 16-bit, out 32-bit)
|
||||
#define dct_widen(out, in) \
|
||||
__m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
|
||||
__m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
|
||||
|
||||
// wide add
|
||||
#define dct_wadd(out, a, b) \
|
||||
__m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
|
||||
__m128i out##_h = _mm_add_epi32(a##_h, b##_h)
|
||||
|
||||
// wide sub
|
||||
#define dct_wsub(out, a, b) \
|
||||
__m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
|
||||
__m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
|
||||
|
||||
// butterfly a/b, add bias, then shift by "s" and pack
|
||||
#define dct_bfly32o(out0, out1, a,b,bias,s) \
|
||||
{ \
|
||||
__m128i abiased_l = _mm_add_epi32(a##_l, bias); \
|
||||
__m128i abiased_h = _mm_add_epi32(a##_h, bias); \
|
||||
dct_wadd(sum, abiased, b); \
|
||||
dct_wsub(dif, abiased, b); \
|
||||
out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
|
||||
out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
|
||||
}
|
||||
|
||||
// 8-bit interleave step (for transposes)
|
||||
#define dct_interleave8(a, b) \
|
||||
tmp = a; \
|
||||
a = _mm_unpacklo_epi8(a, b); \
|
||||
b = _mm_unpackhi_epi8(tmp, b)
|
||||
|
||||
// 16-bit interleave step (for transposes)
|
||||
#define dct_interleave16(a, b) \
|
||||
tmp = a; \
|
||||
a = _mm_unpacklo_epi16(a, b); \
|
||||
b = _mm_unpackhi_epi16(tmp, b)
|
||||
|
||||
#define dct_pass(bias,shift) \
|
||||
{ \
|
||||
/* even part */ \
|
||||
dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
|
||||
__m128i sum04 = _mm_add_epi16(row0, row4); \
|
||||
__m128i dif04 = _mm_sub_epi16(row0, row4); \
|
||||
dct_widen(t0e, sum04); \
|
||||
dct_widen(t1e, dif04); \
|
||||
dct_wadd(x0, t0e, t3e); \
|
||||
dct_wsub(x3, t0e, t3e); \
|
||||
dct_wadd(x1, t1e, t2e); \
|
||||
dct_wsub(x2, t1e, t2e); \
|
||||
/* odd part */ \
|
||||
dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
|
||||
dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
|
||||
__m128i sum17 = _mm_add_epi16(row1, row7); \
|
||||
__m128i sum35 = _mm_add_epi16(row3, row5); \
|
||||
dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
|
||||
dct_wadd(x4, y0o, y4o); \
|
||||
dct_wadd(x5, y1o, y5o); \
|
||||
dct_wadd(x6, y2o, y5o); \
|
||||
dct_wadd(x7, y3o, y4o); \
|
||||
dct_bfly32o(row0,row7, x0,x7,bias,shift); \
|
||||
dct_bfly32o(row1,row6, x1,x6,bias,shift); \
|
||||
dct_bfly32o(row2,row5, x2,x5,bias,shift); \
|
||||
dct_bfly32o(row3,row4, x3,x4,bias,shift); \
|
||||
}
|
||||
|
||||
__m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
|
||||
__m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
|
||||
__m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
|
||||
__m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
|
||||
__m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
|
||||
__m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
|
||||
__m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
|
||||
__m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
|
||||
|
||||
// rounding biases in column/row passes, see stbi__idct_block for explanation.
|
||||
__m128i bias_0 = _mm_set1_epi32(512);
|
||||
__m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
|
||||
|
||||
// load
|
||||
row0 = _mm_load_si128((const __m128i *) (data + 0*8));
|
||||
row1 = _mm_load_si128((const __m128i *) (data + 1*8));
|
||||
row2 = _mm_load_si128((const __m128i *) (data + 2*8));
|
||||
row3 = _mm_load_si128((const __m128i *) (data + 3*8));
|
||||
row4 = _mm_load_si128((const __m128i *) (data + 4*8));
|
||||
row5 = _mm_load_si128((const __m128i *) (data + 5*8));
|
||||
row6 = _mm_load_si128((const __m128i *) (data + 6*8));
|
||||
row7 = _mm_load_si128((const __m128i *) (data + 7*8));
|
||||
|
||||
// column pass
|
||||
dct_pass(bias_0, 10);
|
||||
|
||||
{
|
||||
// 16bit 8x8 transpose pass 1
|
||||
dct_interleave16(row0, row4);
|
||||
dct_interleave16(row1, row5);
|
||||
dct_interleave16(row2, row6);
|
||||
dct_interleave16(row3, row7);
|
||||
|
||||
// transpose pass 2
|
||||
dct_interleave16(row0, row2);
|
||||
dct_interleave16(row1, row3);
|
||||
dct_interleave16(row4, row6);
|
||||
dct_interleave16(row5, row7);
|
||||
|
||||
// transpose pass 3
|
||||
dct_interleave16(row0, row1);
|
||||
dct_interleave16(row2, row3);
|
||||
dct_interleave16(row4, row5);
|
||||
dct_interleave16(row6, row7);
|
||||
}
|
||||
|
||||
// row pass
|
||||
dct_pass(bias_1, 17);
|
||||
|
||||
{
|
||||
// pack
|
||||
__m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
|
||||
__m128i p1 = _mm_packus_epi16(row2, row3);
|
||||
__m128i p2 = _mm_packus_epi16(row4, row5);
|
||||
__m128i p3 = _mm_packus_epi16(row6, row7);
|
||||
|
||||
// 8bit 8x8 transpose pass 1
|
||||
dct_interleave8(p0, p2); // a0e0a1e1...
|
||||
dct_interleave8(p1, p3); // c0g0c1g1...
|
||||
|
||||
// transpose pass 2
|
||||
dct_interleave8(p0, p1); // a0c0e0g0...
|
||||
dct_interleave8(p2, p3); // b0d0f0h0...
|
||||
|
||||
// transpose pass 3
|
||||
dct_interleave8(p0, p2); // a0b0c0d0...
|
||||
dct_interleave8(p1, p3); // a4b4c4d4...
|
||||
|
||||
// store
|
||||
_mm_storel_epi64((__m128i *) out, p0); out += out_stride;
|
||||
_mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
|
||||
_mm_storel_epi64((__m128i *) out, p2); out += out_stride;
|
||||
_mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
|
||||
_mm_storel_epi64((__m128i *) out, p1); out += out_stride;
|
||||
_mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
|
||||
_mm_storel_epi64((__m128i *) out, p3); out += out_stride;
|
||||
_mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
|
||||
}
|
||||
|
||||
#undef dct_const
|
||||
#undef dct_rot
|
||||
#undef dct_widen
|
||||
#undef dct_wadd
|
||||
#undef dct_wsub
|
||||
#undef dct_bfly32o
|
||||
#undef dct_interleave8
|
||||
#undef dct_interleave16
|
||||
#undef dct_pass
|
||||
}
|
||||
|
||||
static stbi_idct_8x8 stbi__idct_installed = stbi__idct_block_wrapper;
|
||||
#endif // STBI_SSE2
|
||||
|
||||
#ifdef STBI_SIMD
|
||||
STBIDEF void stbi_install_idct(stbi_idct_8x8 func)
|
||||
{
|
||||
stbi__idct_installed = func;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -1887,6 +2070,12 @@ STBIDEF void stbi_install_YCbCr_to_RGB(stbi_YCbCr_to_RGB_run func)
|
||||
static void stbi__setup_jpeg(stbi__jpeg *j)
|
||||
{
|
||||
j->idct_block_kernel = stbi__idct_block;
|
||||
|
||||
#ifdef STBI_SSE2
|
||||
if (stbi__sse2_available()) {
|
||||
j->idct_block_kernel = stbi__idct_sse2;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// clean up the temporary component buffers
|
||||
|
Loading…
Reference in New Issue
Block a user