mirror of
https://github.com/nothings/stb
synced 2024-12-15 12:22:55 +03:00
stb_image: Add SSE2 h2v2 resampling kernel.
This commit is contained in:
parent
c625d24197
commit
42bb08b10b
86
stb_image.h
86
stb_image.h
@ -1073,6 +1073,7 @@ typedef struct
|
||||
// kernels
|
||||
void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
|
||||
void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
|
||||
stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
|
||||
} stbi__jpeg;
|
||||
|
||||
static int stbi__build_huffman(stbi__huffman *h, int *count)
|
||||
@ -1995,6 +1996,87 @@ static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc
|
||||
return out;
|
||||
}
|
||||
|
||||
#ifdef STBI_SSE2
|
||||
static stbi_uc *stbi__resample_row_hv_2_sse2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
{
   // SSE2 variant of the 2x2 upsampler: every input sample produces two
   // output samples in this row. Each column is first blended vertically as
   // 3*near + far, then neighbouring columns are blended horizontally with
   // the same 3:1 weights, giving a result scaled by 16.
   //
   //   out      destination row; 2*w bytes are written
   //   in_near  source row weighted 3 in the vertical blend
   //   in_far   source row weighted 1 in the vertical blend
   //   w        number of input samples per source row
   //   hs       not used by this kernel
   //
   // Returns out.
   int x = 0;
   int v_prev, v_cur;   // vertically blended values of two adjacent columns
   int simd_end;
   const __m128i round8 = _mm_set1_epi16(8);
   const __m128i zeros  = _mm_setzero_si128();

   STBI_NOTUSED(hs);

   if (w == 1) {
      // a single column has no horizontal neighbour: vertical blend only
      stbi_uc only = stbi__div4(3*in_near[0] + in_far[0] + 2);
      out[1] = only;
      out[0] = only;
      return out;
   }

   // Column 0 acts as its own left neighbour for the first vector block.
   v_cur = 3*in_near[0] + in_far[0];

   // Vector loop: 8 input columns (16 output bytes) per pass. The bound
   // (w-1) & ~7 keeps column x+8, which is read as the right neighbour of
   // lane 7, inside the row and leaves the final column to the scalar code
   // below, where the row-end boundary is handled.
   simd_end = (w - 1) & ~7;
   while (x < simd_end) {
      // vertical pass on 8 columns, widened to 16 bits:
      //   3*near + far == 4*near + (far - near)
      __m128i far8   = _mm_loadl_epi64((__m128i *) (in_far + x));
      __m128i near8  = _mm_loadl_epi64((__m128i *) (in_near + x));
      __m128i far16  = _mm_unpacklo_epi8(far8, zeros);
      __m128i near16 = _mm_unpacklo_epi8(near8, zeros);
      __m128i row    = _mm_add_epi16(_mm_slli_epi16(near16, 2),
                                     _mm_sub_epi16(far16, near16));

      // left_nbr: row moved up by one lane, with the column preceding this
      // block (carried in v_cur) placed in lane 0.
      // right_nbr: row moved down by one lane, with the first column of the
      // following block placed in lane 7.
      __m128i left_nbr  = _mm_insert_epi16(_mm_slli_si128(row, 2), v_cur, 0);
      __m128i right_nbr = _mm_insert_epi16(_mm_srli_si128(row, 2),
                                           3*in_near[x+8] + in_far[x+8], 7);

      // horizontal pass, two phases sharing the term 4*row + 8:
      //   even outputs = 3*row + left  = 4*row + (left  - row)
      //   odd  outputs = 3*row + right = 4*row + (right - row)
      __m128i base = _mm_add_epi16(_mm_slli_epi16(row, 2), round8);
      __m128i even = _mm_add_epi16(_mm_sub_epi16(left_nbr, row), base);
      __m128i odd  = _mm_add_epi16(_mm_sub_epi16(right_nbr, row), base);

      // interleave even/odd results, then shift right by 4 to remove the
      // factor of 16 introduced by the two filter passes
      __m128i lo = _mm_srli_epi16(_mm_unpacklo_epi16(even, odd), 4);
      __m128i hi = _mm_srli_epi16(_mm_unpackhi_epi16(even, odd), 4);

      // narrow to bytes and emit 16 output samples
      _mm_storeu_si128((__m128i *) (out + x*2), _mm_packus_epi16(lo, hi));

      // last column of this block becomes the left neighbour of the next
      v_cur = 3*in_near[x+7] + in_far[x+7];
      x += 8;
   }

   // Scalar remainder. The even output of column x is produced here; its
   // odd partner needs column x+1 and is written by the loop that follows.
   v_prev = v_cur;
   v_cur  = 3*in_near[x] + in_far[x];
   out[2*x] = stbi__div16(3*v_cur + v_prev + 8);
   ++x;

   while (x < w) {
      v_prev = v_cur;
      v_cur  = 3*in_near[x] + in_far[x];
      out[2*x - 1] = stbi__div16(3*v_prev + v_cur + 8);
      out[2*x]     = stbi__div16(3*v_cur + v_prev + 8);
      ++x;
   }

   // the last output sample has no right neighbour: vertical blend only
   out[2*w - 1] = stbi__div4(v_cur + 2);

   return out;
}
|
||||
#endif
|
||||
|
||||
static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
|
||||
{
|
||||
// resample with nearest-neighbor
|
||||
@ -2131,11 +2213,13 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
|
||||
{
|
||||
j->idct_block_kernel = stbi__idct_block;
|
||||
j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
|
||||
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;
|
||||
|
||||
#ifdef STBI_SSE2
|
||||
if (stbi__sse2_available()) {
|
||||
j->idct_block_kernel = stbi__idct_sse2;
|
||||
j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_sse2;
|
||||
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_sse2;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -2213,7 +2297,7 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
|
||||
if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
|
||||
else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
|
||||
else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
|
||||
else if (r->hs == 2 && r->vs == 2) r->resample = stbi__resample_row_hv_2;
|
||||
else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
|
||||
else r->resample = stbi__resample_row_generic;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user