mirror of
https://github.com/frida/tinycc
synced 2025-01-12 06:39:51 +03:00
Optimize vswap()
vswap() is called often enough and shows in profile and it was easy to hand optimize swapping vtop[-1] and vtop[0] - instead of large (28 bytes on i386) tmp variable and two memory to memory copies, let's swap areas by longs through registers with streamlined assembly. For $ ./tcc -B. -bench -DONE_SOURCE -DCONFIG_MULTIARCHDIR=\"i386-linux-gnu\" -c tcc.c before: # Overhead Command Shared Object Symbol # ........ ........... ................... .............................................. # 15.19% tcc tcc [.] next_nomacro1 5.19% tcc libc-2.13.so [.] _int_malloc 4.57% tcc tcc [.] next 3.36% tcc tcc [.] tok_str_add2 3.03% tcc tcc [.] macro_subst_tok 2.93% tcc tcc [.] macro_subst 2.53% tcc tcc [.] next_nomacro_spc 2.49% tcc tcc [.] vswap 2.36% tcc libc-2.13.so [.] _int_free │ ST_FUNC void vswap(void) │ { 1,96 │ push %edi 2,65 │ push %esi 1,08 │ sub $0x20,%esp │ SValue tmp; │ │ /* cannot let cpu flags if other instruction are generated. Also │ avoid leaving VT_JMP anywhere except on the top of the stack │ because it would complicate the code generator. */ │ if (vtop >= vstack) { 0,98 │ mov 0x8078cac,%eax │ cmp $0x8078d3c,%eax 1,18 │ ┌──jb 24 │ │ int v = vtop->r & VT_VALMASK; 1,08 │ │ mov 0x8(%eax),%edx 0,78 │ │ and $0x3f,%edx │ │ if (v == VT_CMP || (v & ~1) == VT_JMP) 0,78 │ │ cmp $0x33,%edx 0,69 │ │↓ je 54 0,59 │ │ and $0xfffffffe,%edx 0,49 │ │ cmp $0x34,%edx 0,29 │ │↓ je 54 │ │ gv(RC_INT); │ │ } │ │ tmp = vtop[0]; 1,08 │24:└─→lea 0x4(%esp),%edi 0,39 │ mov $0x7,%ecx │ mov %eax,%esi 14,41 │ rep movsl %ds:(%esi),%es:(%edi) │ vtop[0] = vtop[-1]; 9,51 │ lea -0x1c(%eax),%esi 1,96 │ mov $0x7,%cl │ mov %eax,%edi 17,06 │ rep movsl %ds:(%esi),%es:(%edi) │ vtop[-1] = tmp; 10,20 │ mov 0x8078cac,%edi 2,35 │ sub $0x1c,%edi 0,78 │ lea 0x4(%esp),%esi │ mov $0x7,%cl 15,20 │ rep movsl %ds:(%esi),%es:(%edi) │ } 9,90 │ add $0x20,%esp 2,25 │ pop %esi 1,67 │ pop %edi 0,69 │ ret after: # Overhead Command Shared Object Symbol # ........ ........... ................... 
.............................................. # 15.27% tcc tcc [.] next_nomacro1 5.08% tcc libc-2.13.so [.] _int_malloc 4.57% tcc tcc [.] next 3.17% tcc tcc [.] tok_str_add2 3.12% tcc tcc [.] macro_subst 2.99% tcc tcc [.] macro_subst_tok 2.43% tcc tcc [.] next_nomacro_spc 2.32% tcc libc-2.13.so [.] _int_free . . . 0.71% tcc tcc [.] vswap │ ST_FUNC void vswap(void) │ { 7,22 │ push %eax │ /* cannot let cpu flags if other instruction are generated. Also │ avoid leaving VT_JMP anywhere except on the top of the stack │ because it would complicate the code generator. */ │ if (vtop >= vstack) { 11,34 │ mov 0x8078cac,%eax 2,75 │ cmp $0x8078d3c,%eax 0,34 │ ┌──jb 20 │ │ int v = vtop->r & VT_VALMASK; 0,34 │ │ mov 0x8(%eax),%edx 8,93 │ │ and $0x3f,%edx │ │ if (v == VT_CMP || (v & ~1) == VT_JMP) 2,06 │ │ cmp $0x33,%edx 2,41 │ │↓ je 74 2,41 │ │ and $0xfffffffe,%edx 0,34 │ │ cmp $0x34,%edx 2,41 │ │↓ je 74 │ │ vtopl[-1*VSIZEL + i] = tmpl; \ │ │ } do {} while (0) │ │ │ │ VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12); │ │ VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8); │ │ VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4); 2,06 │20:└─→mov 0x18(%eax),%edx 1,37 │ mov -0x4(%eax),%ecx 2,06 │ mov %ecx,0x18(%eax) 1,37 │ mov %edx,-0x4(%eax) 2,06 │ mov 0x14(%eax),%edx 2,06 │ mov -0x8(%eax),%ecx 2,41 │ mov %ecx,0x14(%eax) 3,09 │ mov %edx,-0x8(%eax) 3,09 │ mov 0x10(%eax),%edx 1,72 │ mov -0xc(%eax),%ecx 2,75 │ mov %ecx,0x10(%eax) 1,72 │ mov %edx,-0xc(%eax) │ VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0); 2,41 │ mov 0xc(%eax),%edx 2,41 │ mov -0x10(%eax),%ecx 2,41 │ mov %ecx,0xc(%eax) 0,69 │ mov %edx,-0x10(%eax) 1,72 │ mov 0x8(%eax),%edx 0,69 │ mov -0x14(%eax),%ecx 1,03 │ mov %ecx,0x8(%eax) 1,37 │ mov %edx,-0x14(%eax) 1,37 │ mov 0x4(%eax),%edx 0,69 │ mov -0x18(%eax),%ecx 3,09 │ mov %ecx,0x4(%eax) 2,06 │ mov %edx,-0x18(%eax) 1,37 │ mov (%eax),%edx 2,41 │ mov -0x1c(%eax),%ecx 1,37 │ mov %ecx,(%eax) 4,12 │ mov %edx,-0x1c(%eax) │ } │ │ # undef VSWAPL │ # undef VSIZEL │ } 1,03 │ pop %eax 3,44 │ ret 
Overall speedup: # best of 5 runs before: 8268 idents, 47203 lines, 1526763 bytes, 0.148 s, 319217 lines/s, 10.3 MB/s after: 8273 idents, 47231 lines, 1527685 bytes, 0.146 s, 324092 lines/s, 10.5 MB/s Static ASSERT macro taken from CCAN's[1] build_assert[2] which is in the public domain. [1] http://ccodearchive.net/ [2] http://git.ozlabs.org/?p=ccan;a=blob;f=ccan/build_assert/build_assert.h;h=24e59c44cd930173178ac9b6e101b0af64a879e9;hb=HEAD
This commit is contained in:
parent
8eb92e6052
commit
63193d1794
4
tcc.h
4
tcc.h
@ -228,6 +228,10 @@
|
||||
#define true 1
|
||||
typedef int BOOL;
|
||||
|
||||
#ifndef _STATIC_ASSERT
|
||||
#define _STATIC_ASSERT(cond) do { (void) sizeof(char [1 - 2*!(cond)]); } while(0)
|
||||
#endif
|
||||
|
||||
#define INCLUDE_STACK_SIZE 32
|
||||
#define IFDEF_STACK_SIZE 64
|
||||
#define VSTACK_SIZE 256
|
||||
|
34
tccgen.c
34
tccgen.c
@ -458,8 +458,6 @@ static void vseti(int r, int v)
|
||||
|
||||
ST_FUNC void vswap(void)
|
||||
{
|
||||
SValue tmp;
|
||||
|
||||
/* cannot let cpu flags if other instruction are generated. Also
|
||||
avoid leaving VT_JMP anywhere except on the top of the stack
|
||||
because it would complicate the code generator. */
|
||||
@ -468,9 +466,35 @@ ST_FUNC void vswap(void)
|
||||
if (v == VT_CMP || (v & ~1) == VT_JMP)
|
||||
gv(RC_INT);
|
||||
}
|
||||
tmp = vtop[0];
|
||||
vtop[0] = vtop[-1];
|
||||
vtop[-1] = tmp;
|
||||
|
||||
/*
|
||||
* vtop[0], vtop[-1] = vtop[-1], vtop[0]
|
||||
*
|
||||
* vswap is called often and exchanging vtop[0] vs vtop[-1] is hot on
|
||||
* profile, so it is hand optimized
|
||||
*/
|
||||
unsigned long *vtopl = (unsigned long *)vtop;
|
||||
# define VSIZEL (sizeof(*vtop) / sizeof(*vtopl))
|
||||
|
||||
_STATIC_ASSERT( VSIZEL*sizeof(*vtopl) == sizeof(*vtop) );
|
||||
_STATIC_ASSERT( VSIZEL <= 16 ); /* should be enough */
|
||||
switch(VSIZEL) {
|
||||
# define VSWAPL(i) \
|
||||
case i+1: { \
|
||||
unsigned long tmpl; \
|
||||
tmpl = vtopl[i]; \
|
||||
vtopl[i] = vtopl[-1*VSIZEL + i]; \
|
||||
vtopl[-1*VSIZEL + i] = tmpl; \
|
||||
} do {} while (0)
|
||||
|
||||
VSWAPL(15); VSWAPL(14); VSWAPL(13); VSWAPL(12);
|
||||
VSWAPL(11); VSWAPL(10); VSWAPL( 9); VSWAPL( 8);
|
||||
VSWAPL( 7); VSWAPL( 6); VSWAPL( 5); VSWAPL( 4);
|
||||
VSWAPL( 3); VSWAPL( 2); VSWAPL( 1); VSWAPL( 0);
|
||||
}
|
||||
|
||||
# undef VSWAPL
|
||||
# undef VSIZEL
|
||||
}
|
||||
|
||||
ST_FUNC void vpushv(SValue *v)
|
||||
|
Loading…
Reference in New Issue
Block a user