mmx bicubic

This commit is contained in:
nothings.org 2007-06-29 10:18:49 +00:00
parent f734cfa7be
commit 7570e628ee
2 changed files with 284 additions and 132 deletions

414
imv.c
View File

@ -41,6 +41,8 @@
// trivial error handling // trivial error handling
void error(char *str) { MessageBox(NULL, str, "imv(stb) error", MB_OK); } void error(char *str) { MessageBox(NULL, str, "imv(stb) error", MB_OK); }
#define _DEBUG
// OutputDebugString with varargs, can be compiled out // OutputDebugString with varargs, can be compiled out
#ifdef _DEBUG #ifdef _DEBUG
int do_debug; int do_debug;
@ -695,6 +697,7 @@ struct
queued_size size; queued_size size;
Image *image; Image *image;
char *filename; char *filename;
ImageFile *image_c;
} pending_resize; } pending_resize;
// temporary structure for communicating across stb_workq() call // temporary structure for communicating across stb_workq() call
@ -706,13 +709,13 @@ typedef struct
} Resize; } Resize;
// threaded image resizer, uses work queue AND current thread // threaded image resizer, uses work queue AND current thread
static void image_resize(Image *dest, Image *src, ImageFile *cache); static void image_resize(Image *dest, Image *src);
// wrapper for image_resize() to be called via work queue // wrapper for image_resize() to be called via work queue
void * work_resize(void *p) void * work_resize(void *p)
{ {
Resize *r = (Resize *) p; Resize *r = (Resize *) p;
image_resize(&r->dest, r->src->image, r->src); image_resize(&r->dest, r->src->image);
return r->result; return r->result;
} }
@ -776,6 +779,7 @@ void queue_resize(int w, int h, ImageFile *src_c, int immediate)
src_c->status = LOAD_resizing; src_c->status = LOAD_resizing;
// store data to come back for later // store data to come back for later
pending_resize.image = NULL; pending_resize.image = NULL;
pending_resize.image_c = src_c;
pending_resize.filename = strdup(src_c->filename); pending_resize.filename = strdup(src_c->filename);
// run the resizer in the background (equivalent to the call below) // run the resizer in the background (equivalent to the call below)
stb_workq(resize_workers, work_resize, &res, &pending_resize.image); stb_workq(resize_workers, work_resize, &res, &pending_resize.image);
@ -1075,6 +1079,8 @@ void queue_disk_command(DiskCommand *dc, int which, int make_current)
// it's being loaded/decoded // it's being loaded/decoded
return; return;
} }
if (z->status == LOAD_reading_done)
return;
if (z->status == LOAD_available) { if (z->status == LOAD_available) {
if (make_current) if (make_current)
update_source((ImageFile *) z); update_source((ImageFile *) z);
@ -1777,14 +1783,16 @@ int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine
for(;;) { for(;;) {
// if we're not currently resizing, start a resize // if we're not currently resizing, start a resize
if (qs.w && pending_resize.size.w == 0) { if (qs.w && pending_resize.size.w == 0) {
if (cur_is_current() && (!cur || (qs.w == cur->x && qs.h >= cur->y) || (qs.h == cur->y && qs.w >= cur->x))) { if (source) {
// no resize necessary, just a variant of the current shape if (cur_is_current() && (!cur || (qs.w == cur->x && qs.h >= cur->y) || (qs.h == cur->y && qs.w >= cur->x))) {
MoveWindow(win, qs.x,qs.y,qs.w,qs.h, TRUE); // no resize necessary, just a variant of the current shape
InvalidateRect(win, NULL, FALSE); MoveWindow(win, qs.x,qs.y,qs.w,qs.h, TRUE);
} else { InvalidateRect(win, NULL, FALSE);
o(("Enqueueing resize\n")); } else {
pending_resize.size = qs; o(("Enqueueing resize\n"));
queue_resize(qs.w, qs.h, source_c, FALSE); pending_resize.size = qs;
queue_resize(qs.w, qs.h, source_c, FALSE);
}
} }
qs.w = 0; qs.w = 0;
} }
@ -1797,8 +1805,9 @@ o(("Enqueueing resize\n"));
Sleep(10); Sleep(10);
} else { } else {
HDC hdc; HDC hdc;
o(("Finished resize\n")); o(("Finished resize\n"));
imfree(cur); imfree(cur);
pending_resize.image_c->status = LOAD_available;
cur = pending_resize.image; cur = pending_resize.image;
display_error[0] = 0; display_error[0] = 0;
cur_filename = pending_resize.filename; cur_filename = pending_resize.filename;
@ -1847,7 +1856,6 @@ typedef struct
double temp; double temp;
Image *dest; Image *dest;
Image *src; Image *src;
ImageFile *src_c;
SplitPoint *p; SplitPoint *p;
int j0,j1; int j0,j1;
float dy; float dy;
@ -1942,6 +1950,65 @@ void *image_resize_work(ImageProcess *q)
return NULL; return NULL;
} }
// Resize `src` into the already-allocated `dest` using the per-column split
// table and the row worker image_resize_work().
//
//   dest - destination image; its x/y give the output size
//   src  - source image; must be frame 0 (asserted)
//
// The destination rows are divided into `resize_threads` contiguous bands.
// Band 0 runs on the calling thread, the others are pushed onto the
// `resize_workers` queue, and the function returns only after every band has
// set its `done` flag.
void image_resize_old(Image *dest, Image *src)
{
   ImageProcess proc_buffer[16], *q = stb_temp(proc_buffer, resize_threads * sizeof(*q));
   SplitPoint *p = stb_temp(point_buffer, dest->x * sizeof(*p));
   int i,j0,j1,k;
   float x,dx,dy;
   assert(src->frame == 0);

   // source step per destination pixel; a one-pixel-wide or one-pixel-tall
   // destination has no step, so avoid dividing by zero there
   dx = (dest->x > 1) ? (float) (src->x - 1) / (dest->x - 1) : 0.0f;
   dy = (dest->y > 1) ? (float) (src->y - 1) / (dest->y - 1) : 0.0f;

   // for every destination column: integer source column and 8-bit fraction
   x=0;
   for (i=0; i < dest->x; ++i) {
      p[i].i = (int) floor(x);
      p[i].f = (int) floor(255.9f*(x - p[i].i));
      // clamp so that column i+1 is still inside the source row
      if (p[i].i >= src->x-1) {
         p[i].i = src->x-2;
         p[i].f = 255;
      }
      x += dx;
      p[i].i *= BPP;   // convert column index to a byte offset
   }

   // within each CACHE_REBLOCK-sized group, turn absolute offsets into deltas
   // from the previous column (the first entry of a group stays absolute)
   for (k=0; k < dest->x; k += CACHE_REBLOCK) {
      int k2 = stb_min(k+CACHE_REBLOCK, dest->x);
      for (i=k2-1; i > k; --i) {
         p[i].i -= p[i-1].i;
      }
   }

   // hand each worker its own band of rows [j0,j1)
   j0 = 0;
   for (i=0; i < resize_threads; ++i) {
      j1 = dest->y * (i+1) / resize_threads;
      q[i].dest = dest;
      q[i].src = src;
      q[i].j0 = j0;
      q[i].j1 = j1;
      q[i].dy = dy;
      q[i].p = p;
      q[i].done = FALSE;
      // advance to the next band; the original had `j1 = j0`, which left j0
      // at 0 so every worker redundantly processed rows starting from 0
      j0 = j1;
   }

   if (resize_threads == 1) {
      image_resize_work(q);
   } else {
      // make the setup above visible before the workers start
      barrier();
      for (i=1; i < resize_threads; ++i)
         stb_workq(resize_workers, image_resize_work, q+i, NULL);
      image_resize_work(q);

      // poll until every queued band reports completion
      for(;;) {
         for (i=1; i < resize_threads; ++i)
            if (!q[i].done)
               break;
         if (i == resize_threads) break;
         Sleep(10);
      }
   }

   stb_tempfree(point_buffer, p);
   stb_tempfree(proc_buffer , q);
}
// //
@ -1990,63 +2057,195 @@ static int cubic(int x0, int x1, int x2, int x3, int lerp8)
return res; return res;
} }
static Color cubicRGBA(Color x0, Color x1, Color x2, Color x3, int lerp8) #if 1
#define SSE __declspec(align(16))
#define MMX __declspec(align(8))
// out = a * t^3 + b*t^2 + c*t + d
// out = (a*t+b)*t^2 + (c*t+d)*1
MMX int16 three[4] = { 3,3,3,3 };
// MMX cubic interpolation of a run of 32-bit pixels.
//
// For each of `len` iterations, reads one pixel from each of x0..x3, blends
// the four byte channels independently, and writes one pixel to dest. After
// every pixel the four source pointers advance by `step_src` BYTES and dest
// by `step_dest` BYTES (the adds below operate on raw addresses).
//
//   lerp8 - interpolation position t between x1 and x2 as an 8-bit fraction;
//           assumes 0..255 (callers visible in this file pass an 8-bit value)
//   len   - number of pixels to produce; len <= 0 returns immediately
//
// Per channel the active branch evaluates
//     out = (a*t^3 + b*t^2 + c*t + d) / 2
// with a = 3*(x1-x2) + (x3-x0), b = x0+x2-d-a, c = x2-x0, d = 2*x1
// (the Catmull-Rom coefficients scaled by 2), saturated to 0..255.
static void cubicRGBA(uint32 *dest, uint32 *x0, uint32 *x1, uint32 *x2, uint32 *x3, int lerp8, int step_dest, int step_src, int len)
//static uint32 cubicRGBA(uint32 x0, uint32 x1, uint32 x2, uint32 x3, int lerp8)
{ {
int r,g,b,a; if (len <= 0) return;
r = cubic(R(x0),R(x1),R(x2),R(x3),lerp8); __asm {
g = cubic(G(x0),G(x1),G(x2),G(x3),lerp8); // these save/restores shouldn't be necessary... but they seem to be in VC6 opt builds
b = cubic(B(x0),B(x1),B(x2),B(x3),lerp8); push eax
a = cubic(A(x0),A(x1),A(x2),A(x3),lerp8); push ebx
return RGBA(r,g,b,a); push ecx
push edx
push esi
push edi
// edi = output pointer, eax/ebx/ecx/edx = the four source pointers
mov edi,dest
mov eax,x0
mov ebx,x1
mov ecx,x2
mov edx,x3
// mm0 stays zero: used to widen bytes to 16-bit lanes
pxor mm0,mm0
movd mm7,lerp8
// esi = remaining pixel count
mov esi,len
punpcklbw mm7,mm7 // 0,0,0,0,0,0,lerp,lerp
punpcklbw mm7,mm7 // 0,0,0,0,lerp,lerp,lerp,lerp
punpcklbw mm7,mm7 // 8xlerp. (This makes each 16-bit lane lerp*257, an unsigned 0.16 fraction)
psrlw mm7,1 // slide away from the sign bit; 1.15 lerp
} looptop: __asm {
// load one pixel from each source and widen its 4 bytes to 4 words
movd mm1,[eax]
movd mm2,[ebx]
movd mm3,[ecx]
movd mm4,[edx]
punpcklbw mm1,mm0 // mm1 = x0
punpcklbw mm2,mm0 // mm2 = x1
punpcklbw mm3,mm0 // mm3 = x2
punpcklbw mm4,mm0 // mm4 = x3
#if 1
psubw mm4,mm1 // mm4 = x3-x0
movq mm5,mm2 // mm5 = x1
psubw mm5,mm3 // mm5 = x1-x2
pmullw mm5,three // mm5 = 3*(x1-x2)
paddw mm5,mm4 // mm5 = a
paddw mm2,mm2 // mm2 = d = 2*x1
movq mm6,mm3 // mm6 = x2
psubw mm6,mm1 // mm6 = c
paddw mm3,mm1 // mm3 = x0+x2
psubw mm3,mm2 // mm3 = x0+x2-d
psubw mm3,mm5 // mm3 = b
// each pmulhw by the 1.15 lerp yields value*t/2, so pre-shift by the
// number of multiplies that follow to cancel the halvings
psllw mm5,3 // mm5 = 8*a
pmulhw mm5,mm7 // mm5 = 4*a*t
pmulhw mm5,mm7 // mm5 = 2*a*t^2
pmulhw mm5,mm7 // mm5 = a*t^3
psllw mm3,2 // mm3 = 4*b
pmulhw mm3,mm7 // mm3 = 2*b*t
pmulhw mm3,mm7 // mm3 = b*t^2
psllw mm6,1 // mm6 = 2*c
pmulhw mm6,mm7 // mm6 = c*t
// sum the four terms, halve, saturate to bytes and store one pixel
paddw mm5,mm2
paddw mm5,mm3
paddw mm5,mm6
psraw mm5,1
packuswb mm5,mm5
movd [edi],mm5
#else
// unknown spline type from: http://local.wasp.uwa.edu.au/~pbourke/other/interpolation/
psubw mm4,mm3 // mm4 = x3-x2
psubw mm4,mm1 // mm4 = x3-x2-x0
paddw mm4,mm2 // mm4 = a0 = x3-x2-x0+x1
psubw mm3,mm1 // mm3 = a2 = x2-x0
psubw mm1,mm2 // mm1 = x0-x1
psubw mm1,mm4 // mm1 = a1 = x0-x1-a0
// mm2 = a3 = y1
psllw mm4,3
pmulhw mm4,mm7
pmulhw mm4,mm7
pmulhw mm4,mm7
psllw mm1,2
pmulhw mm1,mm7
pmulhw mm1,mm7
psllw mm3,1
pmulhw mm3,mm7
paddw mm1,mm2
paddw mm1,mm3
paddw mm1,mm4
packuswb mm1,mm1
movd [edi],mm1
#endif
// advance all pointers by their byte steps and loop until len pixels are done
add eax,step_src
add ebx,step_src
add ecx,step_src
add edx,step_src
add edi,step_dest
dec esi
jnz looptop
// clear MMX state once the whole run is finished
emms
pop edi
pop esi
pop edx
pop ecx
pop ebx
pop eax
}
} }
#else
// Portable C version of cubicRGBA: writes `len` pixels to dest, each one the
// per-channel cubic() blend of the pixels currently at x0..x3 with weight
// lerp8. step_src / step_dest are shifted right by 2 to get the per-pixel
// pointer increment (presumably byte strides with a 4-byte Color -- confirm).
static void cubicRGBA(Color *dest, Color *x0, Color *x1, Color *x2, Color *x3, int lerp8, int step_dest, int step_src, int len)
{
   const int src_advance = step_src>>2;
   const int dest_advance = step_dest>>2;
   int remaining;

   for (remaining = len; remaining > 0; --remaining) {
      // blend each channel separately, then repack into one pixel
      const int red   = cubic(R(*x0),R(*x1),R(*x2),R(*x3),lerp8);
      const int green = cubic(G(*x0),G(*x1),G(*x2),G(*x3),lerp8);
      const int blue  = cubic(B(*x0),B(*x1),B(*x2),B(*x3),lerp8);
      const int alpha = cubic(A(*x0),A(*x1),A(*x2),A(*x3),lerp8);
      *dest = RGBA(red,green,blue,alpha);

      // move every cursor on to the next pixel of the run
      x0 += src_advance;
      x1 += src_advance;
      x2 += src_advance;
      x3 += src_advance;
      dest += dest_advance;
   }
}
#endif
#define PLUS(x,y) ((uint32 *) ((uint8 *) (x) + (y)))
#define CUBIC_BLOCK 32
Image *tot;
Image *grCubicScaleBitmapX(Image *src, int out_w) Image *grCubicScaleBitmapX(Image *src, int out_w)
{ {
int x,dx,i,j; int x,dx,i,j,k;
Image *out = bmp_alloc(out_w, src->y); Image *out = bmp_alloc(out_w, src->y);
//tot = out;
if ((uint) out - 0x1000 >= 0x8000000) __asm int 3;
dx = (src->x-1)*65536 / (out_w-1); dx = (src->x-1)*65536 / (out_w-1);
for (j=0; j < out->y; ++j) { for (k=0; k < out->y; k += CUBIC_BLOCK) {
uint32 *data = (uint32 *) (src->pixels + j*src->stride); int k2 = stb_min(k+CUBIC_BLOCK, out->y);
uint32 *dest = (uint32 *) (out->pixels + j*out->stride);
x = 0; x = 0;
for (i=0; i < out_w; ++i) { for (i=0; i < out_w; ++i) {
uint32 *data = (uint32 *) (src->pixels + k*src->stride);
uint32 *dest = (uint32 *) (out->pixels + k*out->stride) + i;
int xp = (x >> 16); int xp = (x >> 16);
int xw = (x >> 8) & 255; int xw = (x >> 8) & 255;
if (xp == 0) if ((uint) dest - 0x1000 >= 0x8000000) __asm int 3;
dest[i] = cubicRGBA(data[xp],data[xp],data[xp+1],data[xp+2], xw); if (xp == 0) {
else if (xp >= src->x - 2) cubicRGBA(dest, data+xp,data+xp,data+xp+1,data+xp+2,xw,out->stride,src->stride,k2-k);
if (xp == src->x-1) } else if (xp >= src->x - 2) {
dest[i] = data[xp]; if (xp == src->x-1) {
else for (j=k; j < k2; ++j) {
dest[i] = cubicRGBA(data[xp-1], data[xp], data[xp+1], data[xp+1], xw); dest[0] = data[xp];
else data = PLUS(data, src->stride);
dest[i] = cubicRGBA(data[xp-1], data[xp], data[xp+1], data[xp+2], xw); dest = PLUS(dest , out->stride);
}
} else {
cubicRGBA(dest, data+xp-1,data+xp,data+xp+1,data+xp+1,xw,out->stride,src->stride,k2-k);
}
} else {
cubicRGBA(dest, data+xp-1,data+xp,data+xp+1,data+xp+2,xw,out->stride,src->stride,k2-k);
}
x += dx; x += dx;
} }
} }
return out; return out;
} }
#define PLUS(x,y) ((uint32 *) ((uint8 *) (x) + (y)))
Image *grCubicScaleBitmapY(Image *src, int out_h) Image *grCubicScaleBitmapY(Image *src, int out_h)
{ {
int y,dy,i,j; int y,dy,j;
Image *out = bmp_alloc(src->x, out_h); Image *out = bmp_alloc(src->x, out_h);
dy = ((src->y-1)*65536-1) / (out_h-1); dy = ((src->y-1)*65536-1) / (out_h-1);
y = 0; y = 0;
for (j=0; j < out_h; ++j,y+=dy) { for (j=0; j < out_h; ++j,y+=dy) {
uint32 *dest = (uint32 *) (out->pixels + j*out->stride);
int yp = (y >> 16); int yp = (y >> 16);
uint8 yw = (y >> 8); uint8 yw = (y >> 8);
uint32 *data1 = (uint32 *) (src->pixels + yp*src->stride); uint32 *data1 = (uint32 *) (src->pixels + yp*src->stride);
uint32 *data2 = PLUS(data1,src->stride); uint32 *data2 = PLUS(data1,src->stride);
uint32 *dest = (uint32 *) (out->pixels + j*out->stride);
uint32 *data0 = (yp > 0) ? PLUS(data1, - src->stride) : data1; uint32 *data0 = (yp > 0) ? PLUS(data1, - src->stride) : data1;
uint32 *data3 = (yp < src->y-2) ? PLUS(data2, src->stride) : data2; uint32 *data3 = (yp < src->y-2) ? PLUS(data2, src->stride) : data2;
for (i=0; i < out->x; ++i) { cubicRGBA(dest, data0, data1, data2, data3, yw, 4,4,out->x);
dest[i] = cubicRGBA(data0[i], data1[i], data2[i], data3[i], yw);
}
} }
return out; return out;
} }
@ -2163,34 +2362,38 @@ Image *grScaleBitmapTwoThirds(Image *src)
return res; return res;
} }
Image *grScaleBitmap(Image *src, int gx, int gy) Image *grScaleBitmap(Image *src, int gx, int gy, Image *dest)
{ {
Image *to_free, *res; Image *to_free, *res;
int upsample=FALSE;
to_free = NULL; to_free = NULL;
// if scaling up, just always use bicubic // check if we're scaling up
if ((gx > src->x || gy > src->y) && upsample_cubic) { if (gx > src->x || gy > src->y) {
upsample = TRUE;
#if 0
res = grCubicScaleBitmapY(src, gy); res = grCubicScaleBitmapY(src, gy);
to_free = res; to_free = res;
res = grCubicScaleBitmapX(res, gx); res = grCubicScaleBitmapX(res, gx);
imfree(to_free); imfree(to_free);
return res; return res;
} #endif
} else {
// maybe should do something smarter here, like find the
// nearest box size, instead of repetitive powers of two
while (gx <= (src->x >> 1) && gy <= (src->y >> 1)) {
src = grScaleBitmapOneHalf(src);
if (to_free) imfree(to_free);
to_free = src;
}
while (gx <= (src->x >> 1) && gy <= (src->y >> 1)) { if (gx < src->x * 0.666666f && gy < src->y * 0.666666f) {
src = grScaleBitmapOneHalf(src); src = grScaleBitmapTwoThirds(src);
if (to_free) imfree(to_free); if (to_free) imfree(to_free);
to_free = src; to_free = src;
}
} }
#if 1
if (gx < src->x * 0.666666f && gy < src->y * 0.666666f) {
src = grScaleBitmapTwoThirds(src);
if (to_free) imfree(to_free);
to_free = src;
}
#endif
if (gx == src->x && gy == src->y) { if (gx == src->x && gy == src->y) {
if (to_free) if (to_free)
res = src; res = src;
@ -2199,94 +2402,41 @@ Image *grScaleBitmap(Image *src, int gx, int gy)
memcpy(res->pixels, src->pixels, res->y * res->stride); memcpy(res->pixels, src->pixels, res->y * res->stride);
return res; return res;
} }
} else if (!downsample_cubic) { } else if (upsample ? upsample_cubic : downsample_cubic) {
res = grScaleBitmapX(src, gx);
if (to_free) imfree(to_free);
to_free = res;
res = grScaleBitmapY(res, gy);
imfree(to_free);
} else {
res = grCubicScaleBitmapY(src, gy); res = grCubicScaleBitmapY(src, gy);
if (to_free) imfree(to_free); if (to_free) imfree(to_free);
to_free = res; to_free = res;
res = grCubicScaleBitmapX(res, gx); res = grCubicScaleBitmapX(res, gx);
imfree(to_free); imfree(to_free);
} else {
#if 1
image_resize_old(dest, src);
if (to_free) imfree(to_free);
res = NULL;
#else
res = grScaleBitmapX(src, gx);
if (to_free) imfree(to_free);
to_free = res;
res = grScaleBitmapY(res, gy);
imfree(to_free);
#endif
} }
return res; return res;
} }
void image_resize(Image *dest, Image *src)
{
#if BPP==3 #if BPP==3
void image_resize(Image *dest, Image *src, ImageFile *src_c) image_resize_old(dest, src);
{
ImageProcess proc_buffer[16], *q = stb_temp(proc_buffer, resize_threads * sizeof(*q));
SplitPoint *p = stb_temp(point_buffer, dest->x * sizeof(*p));
int i,j0,j1,k;
float x,dx,dy;
assert(reentry == 0);
assert(src->frame == 0);
dx = (float) (src->x - 1) / (dest->x - 1);
dy = (float) (src->y - 1) / (dest->y - 1);
x=0;
for (i=0; i < dest->x; ++i) {
p[i].i = (int) floor(x);
p[i].f = (int) floor(255.9f*(x - p[i].i));
if (p[i].i >= src->x-1) {
p[i].i = src->x-2;
p[i].f = 255;
}
x += dx;
p[i].i *= BPP;
}
for (k=0; k < dest->x; k += CACHE_REBLOCK) {
int k2 = stb_min(k+CACHE_REBLOCK, dest->x);
for (i=k2-1; i > k; --i) {
p[i].i -= p[i-1].i;
}
}
j0 = 0;
for (i=0; i < resize_threads; ++i) {
j1 = dest->y * (i+1) / resize_threads;
q[i].dest = dest;
q[i].src = src;
q[i].j0 = j0;
q[i].j1 = j1;
q[i].dy = dy;
q[i].p = p;
q[i].done = FALSE;
q[i].src_c = src_c;
j1 = j0;
}
if (resize_threads == 1) {
image_resize_work(q);
} else {
barrier();
for (i=1; i < resize_threads; ++i)
stb_workq(resize_workers, image_resize_work, q+i, NULL);
image_resize_work(q);
for(;;) {
for (i=1; i < resize_threads; ++i)
if (!q[i].done)
break;
if (i == resize_threads) break;
Sleep(10);
}
}
stb_tempfree(point_buffer, p);
stb_tempfree(proc_buffer , q);
}
#else #else
void image_resize(Image *dest, Image *src, ImageFile *src_c)
{
int j; int j;
Image *temp; Image *temp;
temp = grScaleBitmap(src, dest->x, dest->y); temp = grScaleBitmap(src, dest->x, dest->y, dest);
for (j=0; j < dest->y; ++j) if (temp) {
memcpy(dest->pixels + j*dest->stride, temp->pixels + j*temp->stride, BPP*dest->x); for (j=0; j < dest->y; ++j)
imfree(temp); memcpy(dest->pixels + j*dest->stride, temp->pixels + j*temp->stride, BPP*dest->x);
if(src_c) src_c->status = LOAD_available; imfree(temp);
} }
#endif #endif
}

View File

@ -1,4 +1,6 @@
Version 0.57 Version 0.57
* feature: cubic image resampling
* bugfix: occasional error when advancing to image that was about to be decoded
* bugfix: comment code * bugfix: comment code
* bugfix: fix logic for fitting large images onscreen to not stop a few pixels short * bugfix: fix logic for fitting large images onscreen to not stop a few pixels short