2002-04-16 18:28:29 +04:00
|
|
|
In this directory, I'm developing some test cases to study
|
|
|
|
some different options for Bochs performance improvements.
|
|
|
|
|
|
|
|
ideas:
|
|
|
|
|
|
|
|
write a loop or nested loop that does some serious computing and takes
|
|
|
|
maybe 15 seconds to run
|
|
|
|
benchmark on several machines
|
|
|
|
compile with optimization and understand the assembly
|
|
|
|
reimplement with switch stmt, function pointers, etc.
|
|
|
|
benchmark again to measure the overhead of switch & function calls
|
|
|
|
test on several machines
|
|
|
|
reimplement with goto *label (gcc extension) style
|
|
|
|
benchmark again to compare against the overhead of switch & function calls
|
|
|
|
test on several machines
|
|
|
|
|
|
|
|
or, by working more directly with the bochs source, we can test the
|
|
|
|
performance benefit of the "cut-and-paste native code" method on
|
|
|
|
a small scale.
|
2002-04-16 20:47:17 +04:00
|
|
|
|
|
|
|
---------------------------------------------------
|
|
|
|
Performance measurements
|
|
|
|
|
|
|
|
blur.c 1.2
|
|
|
|
-----------
|
|
|
|
estimated 1.43 million instructions per iteration (-O2)
|
|
|
|
(see blur-1.2-performance.gnumeric spreadsheet for details)
|
|
|
|
on Athlon 750 with no optimization, 4.0622 ms per iteration
|
|
|
|
on Athlon 750 with -O2, 1.795138 ms per iteration
|
|
|
|
on P2 350 with no optimization, 6.162224 ms per iteration
|
|
|
|
on P2 350 with -O2, 2.5258 ms per iteration
|
|
|
|
on Bochs with -O2 on Athlon 750, 478 ms per iteration
|
|
|
|
|
|
|
|
Conclusion
|
|
|
|
Bochs is about 270x slower than native code.
|
|
|
|
|
|
|
|
Try replacing the innermost loop of blur() with a function call
|
|
|
|
that does the same thing. Continue to compile with -O2.
|
|
|
|
|
|
|
|
if innermost loop is "sum += array[x2][y2]", 1.796 ms per iter.
|
|
|
|
if innermost loop is "blur_func (&sum, &array[x2][y2])", 3.526978 ms per iter.
|
|
|
|
function with three arguments: 3.784
|
|
|
|
function with four arguments: 4.390
|
|
|
|
function with five arguments: 4.879
|
|
|
|
|
|
|
|
what is the overhead in terms of instructions?
|
|
|
|
func_overhead(N, ARGS) for N instructions in the function, ARGS arguments
|
|
|
|
func_overhead(N, ARGS) = ?
|
|
|
|
1.5*ARGS to push them onto the stack
|
|
|
|
1 for the call
|
|
|
|
1 to set %esp back to normal
|
|
|
|
1.5*ARGS to load it into a register and maybe save the old register value
|
|
|
|
2 for leave & ret
|
|
|
|
|
2002-04-16 21:32:55 +04:00
|
|
|
= 4 + 3*ARGS instructions
|
2002-04-16 20:47:17 +04:00
|
|
|
|
|
|
|
This is ignoring the fact that some CISC instructions are much more expensive
|
|
|
|
than others.
|
|
|
|
|
|
|
|
However when you measure the time cost of the function call, it is
|
|
|
|
much larger.
|
|
|
|
|
|
|
|
Compare blur with no function call to blur with a call to a function with
|
|
|
|
2 arguments. 1.796 ms per iter versus 3.526978 ms per iter. The function
|
|
|
|
was called 142884 times per iteration (126*126*9), so the overhead of each function call is
|
2002-04-16 21:32:55 +04:00
|
|
|
12.1 ns. (In 12.1 ns this machine can execute between 9 and 10 instructions.)
|
2002-04-16 20:47:17 +04:00
|
|
|
|
|
|
|
I briefly tested the overhead of function calls of 2,3,4,5 arguments.
|
|
|
|
This should measure the difference between NO function call (inline function)
|
|
|
|
and a function call with N arguments.
|
|
|
|
|
2002-04-16 21:32:55 +04:00
|
|
|
2 args: 12.1 ns more than no function call
|
|
|
|
3 args: 13.91 ns more than no function call
|
|
|
|
4 args: 18.15 ns more than no function call
|
|
|
|
5 args: 21.58 ns more than no function call
|
2002-04-16 20:47:17 +04:00
|
|
|
|
2002-04-16 21:32:55 +04:00
|
|
|
Based on measurements, maybe the func_overhead should be
|
|
|
|
func_overhead(ARGS) = 4 + 2.5*ARGS instructions
|
2002-04-16 20:47:17 +04:00
|
|
|
|
|
|
|
|
2002-04-16 21:32:55 +04:00
|
|
|
Measure the cost of a switch statement.
|
|
|
|
With no switch statement, around 1.84 ms per iteration.
|
|
|
|
With 2 case switch+default, 2.3867 ms.
|
|
|
|
With 3 case switch+default, 3.739635 ms.
|
|
|
|
With 4 case switch, 3.341807 ms. (successive compares)
|
|
|
|
With 8 case switch, 2.241196 ms. (jump table)
|
|
|
|
With 16 case switch, 3.122807 ms. (jump table)
|
|
|
|
With 32 case switch, 3.080882 ms. (jump table)
|
|
|
|
|
2002-04-16 22:16:58 +04:00
|
|
|
Once you have enough cases that gcc creates a jump table, the
|
|
|
|
cost of a switch statement is about 7 instructions.
|
|
|
|
|
|
|
|
Try with a bunch of function calls in a switch stmt.
|
|
|
|
|
|
|
|
|
|
|
|
blur-O2 1.798693 ms
|
|
|
|
blur-O2-func 3.309882 ms
|
|
|
|
blur-O2-switch 3.101132 ms
|
|
|
|
blur-O2-switch-call 2.667050 ms
|
|
|
|
blur-O2-fnptr-switch 8.669802 ms
|
|
|
|
blur-O2-fnptr-table 6.010503 ms
|
|
|
|
|
|
|
|
|
2002-04-16 20:47:17 +04:00
|
|
|
|
|
|
|
;;;;;;;;;;;blur.c revision 1.2, compiled with -O2 -S;;;;;;;;;;;;
|
|
|
|
;; with some annotations by Bryce to figure out what is what.
|
|
|
|
|
|
|
|
.file "blur.c"
|
|
|
|
.version "01.01"
|
|
|
|
gcc2_compiled.:
|
|
|
|
.text
|
|
|
|
.align 4
|
|
|
|
.globl blur
|
|
|
|
.type blur,@function
|
|
|
|
;; blur: for each x in 1..126 and each y in 1..126, add up the 3x3 block
;; array[x-1..x+1][y-1..y+1] and store the total in array2[x][y].
;; (x/y names follow the x2/y2 annotations below; presumably they match blur.c.)
;; Rows are 512 bytes apart (x<<9) and elements are 4 bytes wide.
;; Registers: %eax = x at the top of .L20, later scratch / element pointer;
;;   %edi = y; %ecx = x2; %edx = y2; %esi = sum; %ebx = scratch.
;; Stack slots: -4, -8, -16(%ebp) = x+1;  -12, -20(%ebp) = y+1;
;;   -24 = x<<9;  -28 = x-1;  -32 = (x-1)<<9;  -36 = x2<<9 (running row offset).
blur:
|
|
|
|
pushl %ebp ;; set up the frame pointer
|
|
|
|
movl %esp,%ebp
|
|
|
|
subl $36,%esp ;; 36 bytes of locals: slots -4 .. -36(%ebp)
|
|
|
|
pushl %edi ;; save %edi/%esi/%ebx; they are popped again before ret
|
|
|
|
pushl %esi
|
|
|
|
pushl %ebx
|
|
|
|
movl $1,%eax ;; x = 1
|
|
|
|
.p2align 4,,7
|
|
|
|
.L20: ;; outer loop over x (x is in %eax on entry)
|
|
|
|
movl $1,%edi ;; y = 1
|
|
|
|
leal -1(%eax),%ebx
|
|
|
|
movl %ebx,-28(%ebp) ;; -28 = x-1 (first x2)
|
|
|
|
leal 1(%eax),%ebx
|
|
|
|
movl %ebx,-16(%ebp) ;; -16 = x+1 (also the next value of x)
|
|
|
|
sall $9,%eax
|
|
|
|
movl %eax,-24(%ebp) ;; -24 = x<<9, byte offset of row x
|
|
|
|
movl %ebx,-8(%ebp) ;; -8 = x+1 (upper bound checked on x2 loop entry)
|
|
|
|
movl -28(%ebp),%ebx
|
|
|
|
movl %ebx,-32(%ebp)
|
|
|
|
sall $9,-32(%ebp) ;; -32 = (x-1)<<9, byte offset of the first x2 row
|
|
|
|
.p2align 4,,7
|
|
|
|
.L24: ;; loop over y (y is in %edi)
|
|
|
|
xorl %esi,%esi ; let %esi = sum
|
|
|
|
movl -28(%ebp),%ecx ;; let %ecx = x2, starting at x-1
|
|
|
|
leal 1(%edi),%ebx
|
|
|
|
movl %ebx,-20(%ebp) ;; -20 = y+1 (last y2, and the next value of y)
|
|
|
|
cmpl -8(%ebp),%ecx ;; entry guard for the x2 loop:
|
|
|
|
jg .L26 ;; skip it if x-1 > x+1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
movl %ebx,-12(%ebp) ;; -12 = y+1 (%ebx still holds y+1)
|
|
|
|
movl -16(%ebp),%ebx
|
|
|
|
movl %ebx,-4(%ebp) ;; -4 = x+1 (last x2)
|
|
|
|
movl -32(%ebp),%ebx
|
|
|
|
movl %ebx,-36(%ebp) ;; -36 = running row byte offset, starts at (x-1)<<9
|
|
|
|
.p2align 4,,7
|
|
|
|
.L28: ;; loop over x2 (in %ecx)
|
|
|
|
leal -1(%edi),%edx ; let %edx = y2, starting at y-1
|
|
|
|
cmpl -12(%ebp),%edx ; test if y2 > y+1; if so there is nothing to add
|
|
|
|
jg .L27
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;; build pointer in %eax to array[x2][y2]
|
|
|
|
movl -36(%ebp),%eax ;; row byte offset (x2<<9)
|
|
|
|
addl $array,%eax ;; + base address of array
|
|
|
|
leal (%eax,%edx,4),%eax ;; + y2*4
|
|
|
|
.p2align 4,,7
|
|
|
|
|
|
|
|
|
|
.L32:
|
|
|
|
;; innermost loop. it has precomputed the endpoint in -20(%ebp)
|
|
|
|
addl (%eax),%esi ;; sum += (%eax)
|
|
|
|
addl $4,%eax ;; point %eax to next value
|
|
|
|
incl %edx ;; y2++
|
|
|
|
cmpl -20(%ebp),%edx ;; if y2<=max (max = y+1), go around again
|
|
|
|
jle .L32
|
|
|
|
|
|
|
|
|
|
.L27: ;; end of one x2 row
|
|
|
|
addl $512,-36(%ebp) ;; advance the row byte offset to the next row
|
|
|
|
incl %ecx ;; x2++
|
|
|
|
cmpl -4(%ebp),%ecx ;; loop while x2 <= x+1
|
|
|
|
jle .L28
|
|
|
|
|
|
|
|
|
|
.L26: ;; store the finished sum
|
|
|
|
movl -24(%ebp),%ebx ;; %ebx = x<<9
|
|
|
|
leal (%ebx,%edi,4),%eax ;; %eax = (x<<9) + y*4
|
|
|
|
movl %esi,array2(%eax) ;; array2[x][y] = sum
|
|
|
|
movl -20(%ebp),%edi ;; y = y+1
|
|
|
|
cmpl $126,%edi
|
|
|
|
jle .L24 ;; loop while y <= 126
|
|
|
|
|
|
|
|
|
|
movl -16(%ebp),%eax ;; x = x+1
|
|
|
|
cmpl $126,%eax
|
|
|
|
jle .L20 ;; loop while x <= 126
|
|
|
|
|
|
|
|
|
|
leal -48(%ebp),%esp ;; %esp = %ebp - (36 locals + 3*4 saved regs)
|
|
|
|
popl %ebx ;; restore in reverse order of the pushes
|
|
|
|
popl %esi
|
|
|
|
popl %edi
|
|
|
|
leave
|
|
|
|
ret
|
|
|
|
.Lfe1:
|
|
|
|
.size blur,.Lfe1-blur
|
|
|
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
2002-04-17 06:31:58 +04:00
|
|
|
----------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
blur-opcode.c
|
|
|
|
|
|
|
|
In a past email, I described a way to use gcc and gas to produce
|
|
|
|
native code, so that Bochs would not need its own code generator for
|
|
|
|
every supported platform. I will attempt a proof of concept here.
|
|
|
|
|
|
|
|
I will try to create a system in which snippets of C++ code can be compiled
|
|
|
|
by gcc, then extracted to form a chunk of native binary code. The chunks
|
|
|
|
will be designed so that they can be pasted together efficiently.
|
|
|
|
|
|
|
|
I haven't tried anything like this before....
|