mirror of
https://github.com/nothings/stb
synced 2025-01-07 07:12:06 +03:00
Merge branch 'fastjpeg' into working
This commit is contained in:
commit
de385bb6bb
339
stb_image.h
339
stb_image.h
@ -27,7 +27,7 @@
|
|||||||
|
|
||||||
- decode from memory or through FILE (define STBI_NO_STDIO to remove code)
|
- decode from memory or through FILE (define STBI_NO_STDIO to remove code)
|
||||||
- decode from arbitrary I/O callbacks
|
- decode from arbitrary I/O callbacks
|
||||||
- SIMD acceleration on x86/x64
|
- SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
|
||||||
|
|
||||||
Full documentation under "DOCUMENTATION" below.
|
Full documentation under "DOCUMENTATION" below.
|
||||||
|
|
||||||
@ -213,14 +213,17 @@ License:
|
|||||||
//
|
//
|
||||||
// SIMD support
|
// SIMD support
|
||||||
//
|
//
|
||||||
// The JPEG decoder will automatically use SIMD kernels on x86 platforms
|
// The JPEG decoder will try to automatically use SIMD kernels when
|
||||||
// where supported.
|
// supported by the compiler.
|
||||||
//
|
//
|
||||||
// (The old do-it-yourself SIMD API is no longer supported in the current
|
// (The old do-it-yourself SIMD API is no longer supported in the current
|
||||||
// code.)
|
// code.)
|
||||||
//
|
//
|
||||||
// The code will automatically detect if the required SIMD instructions are
|
// On x86, SSE2 will automatically be used when available; if not, the
|
||||||
// available, and fall back to the generic C version where they're not.
|
// generic C versions are used as a fall-back. On ARM targets, the typical
|
||||||
|
// path is to have separate builds for NEON and non-NEON devices (at least
|
||||||
|
// this is true for iOS and Android). Therefore, the NEON support is
|
||||||
|
// toggled by a build flag: define STBI_NEON to get NEON loops.
|
||||||
//
|
//
|
||||||
// The output of the JPEG decoder is slightly different from versions before
|
// The output of the JPEG decoder is slightly different from versions before
|
||||||
// SIMD support was introduced (that is, for versions before 1.49). The
|
// SIMD support was introduced (that is, for versions before 1.49). The
|
||||||
@ -544,6 +547,16 @@ static int stbi__sse2_available()
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// ARM NEON
|
||||||
|
#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
|
||||||
|
#undef STBI_NEON
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef STBI_NEON
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef STBI_SIMD_ALIGN
|
#ifndef STBI_SIMD_ALIGN
|
||||||
#define STBI_SIMD_ALIGN(type, name) type name
|
#define STBI_SIMD_ALIGN(type, name) type name
|
||||||
#endif
|
#endif
|
||||||
@ -1890,6 +1903,214 @@ static void stbi__idct_sse2(stbi_uc *out, int out_stride, short data[64])
|
|||||||
|
|
||||||
#endif // STBI_SSE2
|
#endif // STBI_SSE2
|
||||||
|
|
||||||
|
#ifdef STBI_NEON
|
||||||
|
|
||||||
|
// NEON integer IDCT. should produce bit-identical
|
||||||
|
// results to the generic C version.
|
||||||
|
static void stbi__idct_neon(stbi_uc *out, int out_stride, short data[64])
|
||||||
|
{
|
||||||
|
int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
|
||||||
|
|
||||||
|
int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
|
||||||
|
int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
|
||||||
|
int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
|
||||||
|
int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
|
||||||
|
int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
|
||||||
|
int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
|
||||||
|
int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
|
||||||
|
int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
|
||||||
|
int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
|
||||||
|
int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
|
||||||
|
int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
|
||||||
|
int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
|
||||||
|
|
||||||
|
#define dct_long_mul(out, inq, coeff) \
|
||||||
|
int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
|
||||||
|
int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
|
||||||
|
|
||||||
|
#define dct_long_mac(out, acc, inq, coeff) \
|
||||||
|
int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
|
||||||
|
int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
|
||||||
|
|
||||||
|
#define dct_widen(out, inq) \
|
||||||
|
int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
|
||||||
|
int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
|
||||||
|
|
||||||
|
// wide add
|
||||||
|
#define dct_wadd(out, a, b) \
|
||||||
|
int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
|
||||||
|
int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
|
||||||
|
|
||||||
|
// wide sub
|
||||||
|
#define dct_wsub(out, a, b) \
|
||||||
|
int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
|
||||||
|
int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
|
||||||
|
|
||||||
|
// butterfly a/b, then shift using "shiftop" by "s" and pack
|
||||||
|
#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
|
||||||
|
{ \
|
||||||
|
dct_wadd(sum, a, b); \
|
||||||
|
dct_wsub(dif, a, b); \
|
||||||
|
out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
|
||||||
|
out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define dct_pass(shiftop, shift) \
|
||||||
|
{ \
|
||||||
|
/* even part */ \
|
||||||
|
int16x8_t sum26 = vaddq_s16(row2, row6); \
|
||||||
|
dct_long_mul(p1e, sum26, rot0_0); \
|
||||||
|
dct_long_mac(t2e, p1e, row6, rot0_1); \
|
||||||
|
dct_long_mac(t3e, p1e, row2, rot0_2); \
|
||||||
|
int16x8_t sum04 = vaddq_s16(row0, row4); \
|
||||||
|
int16x8_t dif04 = vsubq_s16(row0, row4); \
|
||||||
|
dct_widen(t0e, sum04); \
|
||||||
|
dct_widen(t1e, dif04); \
|
||||||
|
dct_wadd(x0, t0e, t3e); \
|
||||||
|
dct_wsub(x3, t0e, t3e); \
|
||||||
|
dct_wadd(x1, t1e, t2e); \
|
||||||
|
dct_wsub(x2, t1e, t2e); \
|
||||||
|
/* odd part */ \
|
||||||
|
int16x8_t sum15 = vaddq_s16(row1, row5); \
|
||||||
|
int16x8_t sum17 = vaddq_s16(row1, row7); \
|
||||||
|
int16x8_t sum35 = vaddq_s16(row3, row5); \
|
||||||
|
int16x8_t sum37 = vaddq_s16(row3, row7); \
|
||||||
|
int16x8_t sumodd = vaddq_s16(sum17, sum35); \
|
||||||
|
dct_long_mul(p5o, sumodd, rot1_0); \
|
||||||
|
dct_long_mac(p1o, p5o, sum17, rot1_1); \
|
||||||
|
dct_long_mac(p2o, p5o, sum35, rot1_2); \
|
||||||
|
dct_long_mul(p3o, sum37, rot2_0); \
|
||||||
|
dct_long_mul(p4o, sum15, rot2_1); \
|
||||||
|
dct_wadd(sump13o, p1o, p3o); \
|
||||||
|
dct_wadd(sump24o, p2o, p4o); \
|
||||||
|
dct_wadd(sump23o, p2o, p3o); \
|
||||||
|
dct_wadd(sump14o, p1o, p4o); \
|
||||||
|
dct_long_mac(x4, sump13o, row7, rot3_0); \
|
||||||
|
dct_long_mac(x5, sump24o, row5, rot3_1); \
|
||||||
|
dct_long_mac(x6, sump23o, row3, rot3_2); \
|
||||||
|
dct_long_mac(x7, sump14o, row1, rot3_3); \
|
||||||
|
dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
|
||||||
|
dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
|
||||||
|
dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
|
||||||
|
dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
|
||||||
|
}
|
||||||
|
|
||||||
|
// load
|
||||||
|
row0 = vld1q_s16(data + 0*8);
|
||||||
|
row1 = vld1q_s16(data + 1*8);
|
||||||
|
row2 = vld1q_s16(data + 2*8);
|
||||||
|
row3 = vld1q_s16(data + 3*8);
|
||||||
|
row4 = vld1q_s16(data + 4*8);
|
||||||
|
row5 = vld1q_s16(data + 5*8);
|
||||||
|
row6 = vld1q_s16(data + 6*8);
|
||||||
|
row7 = vld1q_s16(data + 7*8);
|
||||||
|
|
||||||
|
// add DC bias
|
||||||
|
row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
|
||||||
|
|
||||||
|
// column pass
|
||||||
|
dct_pass(vrshrn_n_s32, 10);
|
||||||
|
|
||||||
|
// 16bit 8x8 transpose
|
||||||
|
{
|
||||||
|
// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
|
||||||
|
// whether compilers actually get this is another story, sadly.
|
||||||
|
#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
|
||||||
|
#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
|
||||||
|
#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
|
||||||
|
|
||||||
|
// pass 1
|
||||||
|
dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
|
||||||
|
dct_trn16(row2, row3);
|
||||||
|
dct_trn16(row4, row5);
|
||||||
|
dct_trn16(row6, row7);
|
||||||
|
|
||||||
|
// pass 2
|
||||||
|
dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
|
||||||
|
dct_trn32(row1, row3);
|
||||||
|
dct_trn32(row4, row6);
|
||||||
|
dct_trn32(row5, row7);
|
||||||
|
|
||||||
|
// pass 3
|
||||||
|
dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
|
||||||
|
dct_trn64(row1, row5);
|
||||||
|
dct_trn64(row2, row6);
|
||||||
|
dct_trn64(row3, row7);
|
||||||
|
|
||||||
|
#undef dct_trn16
|
||||||
|
#undef dct_trn32
|
||||||
|
#undef dct_trn64
|
||||||
|
}
|
||||||
|
|
||||||
|
// row pass
|
||||||
|
// vrshrn_n_s32 only supports shifts up to 16, we need
|
||||||
|
// 17. so do a non-rounding shift of 16 first then follow
|
||||||
|
// up with a rounding shift by 1.
|
||||||
|
dct_pass(vshrn_n_s32, 16);
|
||||||
|
|
||||||
|
{
|
||||||
|
// pack and round
|
||||||
|
uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
|
||||||
|
uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
|
||||||
|
uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
|
||||||
|
uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
|
||||||
|
uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
|
||||||
|
uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
|
||||||
|
uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
|
||||||
|
uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
|
||||||
|
|
||||||
|
// again, these can translate into one instruction, but often don't.
|
||||||
|
#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
|
||||||
|
#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
|
||||||
|
#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
|
||||||
|
|
||||||
|
// sadly can't use interleaved stores here since we only write
|
||||||
|
// 8 bytes to each scan line!
|
||||||
|
|
||||||
|
// 8x8 8-bit transpose pass 1
|
||||||
|
dct_trn8_8(p0, p1);
|
||||||
|
dct_trn8_8(p2, p3);
|
||||||
|
dct_trn8_8(p4, p5);
|
||||||
|
dct_trn8_8(p6, p7);
|
||||||
|
|
||||||
|
// pass 2
|
||||||
|
dct_trn8_16(p0, p2);
|
||||||
|
dct_trn8_16(p1, p3);
|
||||||
|
dct_trn8_16(p4, p6);
|
||||||
|
dct_trn8_16(p5, p7);
|
||||||
|
|
||||||
|
// pass 3
|
||||||
|
dct_trn8_32(p0, p4);
|
||||||
|
dct_trn8_32(p1, p5);
|
||||||
|
dct_trn8_32(p2, p6);
|
||||||
|
dct_trn8_32(p3, p7);
|
||||||
|
|
||||||
|
// store
|
||||||
|
vst1_u8(out, p0); out += out_stride;
|
||||||
|
vst1_u8(out, p1); out += out_stride;
|
||||||
|
vst1_u8(out, p2); out += out_stride;
|
||||||
|
vst1_u8(out, p3); out += out_stride;
|
||||||
|
vst1_u8(out, p4); out += out_stride;
|
||||||
|
vst1_u8(out, p5); out += out_stride;
|
||||||
|
vst1_u8(out, p6); out += out_stride;
|
||||||
|
vst1_u8(out, p7);
|
||||||
|
|
||||||
|
#undef dct_trn8_8
|
||||||
|
#undef dct_trn8_16
|
||||||
|
#undef dct_trn8_32
|
||||||
|
}
|
||||||
|
|
||||||
|
#undef dct_long_mul
|
||||||
|
#undef dct_long_mac
|
||||||
|
#undef dct_widen
|
||||||
|
#undef dct_wadd
|
||||||
|
#undef dct_wsub
|
||||||
|
#undef dct_bfly32o
|
||||||
|
#undef dct_pass
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // STBI_NEON
|
||||||
|
|
||||||
#define STBI__MARKER_none 0xff
|
#define STBI__MARKER_none 0xff
|
||||||
// if there's a pending marker from the entropy stream, return that
|
// if there's a pending marker from the entropy stream, return that
|
||||||
// otherwise, fetch from the stream and get a marker. if there's no
|
// otherwise, fetch from the stream and get a marker. if there's no
|
||||||
@ -2405,12 +2626,11 @@ static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef STBI_SSE2
|
#if defined(STBI_SSE2) || defined(STBI_NEON)
|
||||||
static stbi_uc *stbi__resample_row_hv_2_sse2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
|
static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
|
||||||
{
|
{
|
||||||
// need to generate 2x2 samples for every one in input
|
// need to generate 2x2 samples for every one in input
|
||||||
int i=0,t0,t1;
|
int i=0,t0,t1;
|
||||||
__m128i bias = _mm_set1_epi16(8);
|
|
||||||
|
|
||||||
if (w == 1) {
|
if (w == 1) {
|
||||||
out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
|
out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
|
||||||
@ -2422,6 +2642,7 @@ static stbi_uc *stbi__resample_row_hv_2_sse2(stbi_uc *out, stbi_uc *in_near, stb
|
|||||||
// note we can't handle the last pixel in a row in this loop
|
// note we can't handle the last pixel in a row in this loop
|
||||||
// because we need to handle the filter boundary conditions.
|
// because we need to handle the filter boundary conditions.
|
||||||
for (; i < ((w-1) & ~7); i += 8) {
|
for (; i < ((w-1) & ~7); i += 8) {
|
||||||
|
#if defined(STBI_SSE2)
|
||||||
// load and perform the vertical filtering pass
|
// load and perform the vertical filtering pass
|
||||||
// this uses 3*x + y = 4*x + (y - x)
|
// this uses 3*x + y = 4*x + (y - x)
|
||||||
__m128i zero = _mm_setzero_si128();
|
__m128i zero = _mm_setzero_si128();
|
||||||
@ -2433,7 +2654,7 @@ static stbi_uc *stbi__resample_row_hv_2_sse2(stbi_uc *out, stbi_uc *in_near, stb
|
|||||||
__m128i nears = _mm_slli_epi16(nearw, 2);
|
__m128i nears = _mm_slli_epi16(nearw, 2);
|
||||||
__m128i curr = _mm_add_epi16(nears, diff); // current row
|
__m128i curr = _mm_add_epi16(nears, diff); // current row
|
||||||
|
|
||||||
// horizontal filter works the same based on shifted of current
|
// horizontal filter works the same based on shifted versions of current
|
||||||
// row. "prev" is current row shifted right by 1 pixel; we need to
|
// row. "prev" is current row shifted right by 1 pixel; we need to
|
||||||
// insert the previous pixel value (from t1).
|
// insert the previous pixel value (from t1).
|
||||||
// "next" is current row shifted left by 1 pixel, with first pixel
|
// "next" is current row shifted left by 1 pixel, with first pixel
|
||||||
@ -2447,6 +2668,7 @@ static stbi_uc *stbi__resample_row_hv_2_sse2(stbi_uc *out, stbi_uc *in_near, stb
|
|||||||
// even pixels = 3*cur + prev = cur*4 + (prev - cur)
|
// even pixels = 3*cur + prev = cur*4 + (prev - cur)
|
||||||
// odd pixels = 3*cur + next = cur*4 + (next - cur)
|
// odd pixels = 3*cur + next = cur*4 + (next - cur)
|
||||||
// note the shared term.
|
// note the shared term.
|
||||||
|
__m128i bias = _mm_set1_epi16(8);
|
||||||
__m128i curs = _mm_slli_epi16(curr, 2);
|
__m128i curs = _mm_slli_epi16(curr, 2);
|
||||||
__m128i prvd = _mm_sub_epi16(prev, curr);
|
__m128i prvd = _mm_sub_epi16(prev, curr);
|
||||||
__m128i nxtd = _mm_sub_epi16(next, curr);
|
__m128i nxtd = _mm_sub_epi16(next, curr);
|
||||||
@ -2463,6 +2685,41 @@ static stbi_uc *stbi__resample_row_hv_2_sse2(stbi_uc *out, stbi_uc *in_near, stb
|
|||||||
// pack and write output
|
// pack and write output
|
||||||
__m128i outv = _mm_packus_epi16(de0, de1);
|
__m128i outv = _mm_packus_epi16(de0, de1);
|
||||||
_mm_storeu_si128((__m128i *) (out + i*2), outv);
|
_mm_storeu_si128((__m128i *) (out + i*2), outv);
|
||||||
|
#elif defined(STBI_NEON)
|
||||||
|
// load and perform the vertical filtering pass
|
||||||
|
// this uses 3*x + y = 4*x + (y - x)
|
||||||
|
uint8x8_t farb = vld1_u8(in_far + i);
|
||||||
|
uint8x8_t nearb = vld1_u8(in_near + i);
|
||||||
|
int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
|
||||||
|
int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
|
||||||
|
int16x8_t curr = vaddq_s16(nears, diff); // current row
|
||||||
|
|
||||||
|
// horizontal filter works the same based on shifted versions of current
|
||||||
|
// row. "prev" is current row shifted right by 1 pixel; we need to
|
||||||
|
// insert the previous pixel value (from t1).
|
||||||
|
// "next" is current row shifted left by 1 pixel, with first pixel
|
||||||
|
// of next block of 8 pixels added in.
|
||||||
|
int16x8_t prv0 = vextq_s16(curr, curr, 7);
|
||||||
|
int16x8_t nxt0 = vextq_s16(curr, curr, 1);
|
||||||
|
int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
|
||||||
|
int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
|
||||||
|
|
||||||
|
// horizontal filter, polyphase implementation since it's convenient:
|
||||||
|
// even pixels = 3*cur + prev = cur*4 + (prev - cur)
|
||||||
|
// odd pixels = 3*cur + next = cur*4 + (next - cur)
|
||||||
|
// note the shared term.
|
||||||
|
int16x8_t curs = vshlq_n_s16(curr, 2);
|
||||||
|
int16x8_t prvd = vsubq_s16(prev, curr);
|
||||||
|
int16x8_t nxtd = vsubq_s16(next, curr);
|
||||||
|
int16x8_t even = vaddq_s16(curs, prvd);
|
||||||
|
int16x8_t odd = vaddq_s16(curs, nxtd);
|
||||||
|
|
||||||
|
// undo scaling and round, then store with even/odd phases interleaved
|
||||||
|
uint8x8x2_t o;
|
||||||
|
o.val[0] = vqrshrun_n_s16(even, 4);
|
||||||
|
o.val[1] = vqrshrun_n_s16(odd, 4);
|
||||||
|
vst2_u8(out + i*2, o);
|
||||||
|
#endif
|
||||||
|
|
||||||
// "previous" value for next iter
|
// "previous" value for next iter
|
||||||
t1 = 3*in_near[i+7] + in_far[i+7];
|
t1 = 3*in_near[i+7] + in_far[i+7];
|
||||||
@ -2555,16 +2812,15 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef STBI_SSE2
|
#if defined(STBI_SSE2) || defined(STBI_NEON)
|
||||||
static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
|
static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
|
||||||
{
|
{
|
||||||
int i = 0;
|
int i = 0;
|
||||||
|
|
||||||
|
#ifdef STBI_SSE2
|
||||||
// step == 3 is pretty ugly on the final interleave, and i'm not convinced
|
// step == 3 is pretty ugly on the final interleave, and i'm not convinced
|
||||||
// it's useful in practice (you wouldn't use it for textures, for example).
|
// it's useful in practice (you wouldn't use it for textures, for example).
|
||||||
// so just accelerate step == 4 case.
|
// so just accelerate step == 4 case.
|
||||||
//
|
|
||||||
// note: unlike the IDCT, this isn't bit-identical to the integer version.
|
|
||||||
if (step == 4) {
|
if (step == 4) {
|
||||||
// this is a fairly straightforward implementation and not super-optimized.
|
// this is a fairly straightforward implementation and not super-optimized.
|
||||||
__m128i signflip = _mm_set1_epi8(-0x80);
|
__m128i signflip = _mm_set1_epi8(-0x80);
|
||||||
@ -2620,6 +2876,53 @@ static void stbi__YCbCr_to_RGB_sse2(stbi_uc *out, stbi_uc const *y, stbi_uc cons
|
|||||||
out += 32;
|
out += 32;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef STBI_NEON
|
||||||
|
// in this version, step=3 support would be easy to add. but is there demand?
|
||||||
|
if (step == 4) {
|
||||||
|
// this is a fairly straightforward implementation and not super-optimized.
|
||||||
|
uint8x8_t signflip = vdup_n_u8(0x80);
|
||||||
|
int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
|
||||||
|
int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
|
||||||
|
int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
|
||||||
|
int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));
|
||||||
|
|
||||||
|
for (; i+7 < count; i += 8) {
|
||||||
|
// load
|
||||||
|
uint8x8_t y_bytes = vld1_u8(y + i);
|
||||||
|
uint8x8_t cr_bytes = vld1_u8(pcr + i);
|
||||||
|
uint8x8_t cb_bytes = vld1_u8(pcb + i);
|
||||||
|
int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
|
||||||
|
int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
|
||||||
|
|
||||||
|
// expand to s16
|
||||||
|
int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
|
||||||
|
int16x8_t crw = vshll_n_s8(cr_biased, 7);
|
||||||
|
int16x8_t cbw = vshll_n_s8(cb_biased, 7);
|
||||||
|
|
||||||
|
// color transform
|
||||||
|
int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
|
||||||
|
int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
|
||||||
|
int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
|
||||||
|
int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
|
||||||
|
int16x8_t rws = vaddq_s16(yws, cr0);
|
||||||
|
int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
|
||||||
|
int16x8_t bws = vaddq_s16(yws, cb1);
|
||||||
|
|
||||||
|
// undo scaling, round, convert to byte
|
||||||
|
uint8x8x4_t o;
|
||||||
|
o.val[0] = vqrshrun_n_s16(rws, 4);
|
||||||
|
o.val[1] = vqrshrun_n_s16(gws, 4);
|
||||||
|
o.val[2] = vqrshrun_n_s16(bws, 4);
|
||||||
|
o.val[3] = vdup_n_u8(255);
|
||||||
|
|
||||||
|
// store, interleaving r/g/b/a
|
||||||
|
vst4_u8(out, o);
|
||||||
|
out += 8*4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for (; i < count; ++i) {
|
for (; i < count; ++i) {
|
||||||
int y_fixed = (y[i] << 20) + (1<<19); // rounding
|
int y_fixed = (y[i] << 20) + (1<<19); // rounding
|
||||||
@ -2655,11 +2958,17 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
|
|||||||
if (stbi__sse2_available()) {
|
if (stbi__sse2_available()) {
|
||||||
j->idct_block_kernel = stbi__idct_sse2;
|
j->idct_block_kernel = stbi__idct_sse2;
|
||||||
#ifndef STBI_JPEG_OLD
|
#ifndef STBI_JPEG_OLD
|
||||||
j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_sse2;
|
j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
|
||||||
#endif
|
#endif
|
||||||
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_sse2;
|
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef STBI_NEON
|
||||||
|
j->idct_block_kernel = stbi__idct_neon;
|
||||||
|
j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
|
||||||
|
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// clean up the temporary component buffers
|
// clean up the temporary component buffers
|
||||||
|
Loading…
Reference in New Issue
Block a user