/*
 * $Id: blur-translate.c,v 1.4 2002-04-17 22:51:58 bdenney Exp $
 *
 * This function is a proof of concept for dynamic C code generation.  It
 * defines a set of simple opcodes which can be used to implement the blur.c
 * operation.  A block of opcodes can be emulated/interpreted, or translated
 * into C code, compiled as a dynamic library and loaded into the binary for
 * much faster execution.
 *
 * A CodeBlock structure holds a sequence of opcodes to be translated, and
 * has fields for the dynamic library handle and a function pointer to the
 * translated function.  When a block has not been translated, the function
 * pointer is NULL.  The block may be emulated/interpreted by passing its
 * opcode_list to the emulate_opcodes() function.  To translate the block,
 * you pass it into translate_block().
 *
 * The translation process begins by writing C code into a file.  One
 * CodeBlock is translated as one function.  There is some fixed code that
 * is always printed at the beginning and end of each function.  Each
 * opcode turns into one macro call in the function, like this:
 *   DO_MOVE_REL(-1,-1);
 *   DO_SET_ACCUM(0);
 *   DO_ADD_DATA();
 * The macros are defined in a static header file which is
 * #included at the top of the translated C code.
 *
 * Once the C code is generated, a helper script called buildshared is
 * used to compile the shared library.  The script uses GNU libtool to
 * compile the C code into a dynamic library.  When it's done, we can
 * dlopen() the dynamic library and fill in the missing CodeBlock fields:
 * the handle to the dynamic library (so that it could be closed some day)
 * and the function pointer to the translated code.
 *
 * The function execute_code_block() runs translated code.  In fact, if
 * the code block has not been translated, it is translated first and then
 * run.
 *
 * Performance:
 *
 * For these simple opcodes, I regularly see a 10x speedup in translated
 * code.
* * On a 750MHz Athlon: * emulated code: 10.1msec per 2D blur * translated code: 1.08msec per 2D blur * translation takes 484msec * * To do: * - deal with errors. for example if the compile fails, I could just mark * the CodeBlock as not compilable and always emulate it. * - there are currently two implementations of each opcode, one in * translate2-defs.h and one in blur-translate.c. I should be able to * get it down to just one implementation that is used in both contexts. * This will make it a lot easier to maintain. * */ #include #include #include #include #include #include // libtool library #include "blur-translate.h" #define BLUR_WINDOW_HALF 1 #define DEFAULT_TIMES 1000 int array[MAX_ARRAY][MAX_ARRAY]; int array2[MAX_ARRAY][MAX_ARRAY]; #define MAX_TIMERS 3 struct timeval start[MAX_TIMERS], stop[MAX_TIMERS]; #define start_timer(T) gettimeofday (&start[T], NULL); #define stop_timer(T) gettimeofday (&stop[T], NULL); void report_time (FILE *fp, int T, int iters) { int usec_duration = (stop[T].tv_sec*1000000 + stop[T].tv_usec) - (start[T].tv_sec*1000000 + start[T].tv_usec); double sec = (double)usec_duration / 1.0e3; double sec_per_iter = sec / (double)iters; fprintf (fp, "%f msec\n", sec); if (iters!=1) { fprintf (fp, "Iterations = %d\n", iters); fprintf (fp, "Time per iteration = %f msec\n", sec_per_iter); } } typedef enum { OP_MOVE_REL, // 2 args delta_x and delta_y OP_SET_ACCUM, // 1 arg, sets accum to that arg OP_ADD_DATA, // add data from *load_ptr OP_SUBTRACT_DATA, // sub data from *load_ptr OP_MULTIPLY_DATA, // mul data from *load_ptr OP_STORE_DATA, // store accum to *store_ptr OP_END, // stop N_OPS // must be last } Opcode; typedef void (*exec_func)(State *state); typedef struct { int n_opcodes; int *opcode_list; lt_dlhandle dlhandle; exec_func func; } CodeBlock; // this opcode sequence implements the blur filter, just like all the others. 
int blur_instructions[] = { OP_MOVE_REL, -1, -1, OP_SET_ACCUM, 0, OP_ADD_DATA, OP_MOVE_REL, 0, 1, OP_ADD_DATA, OP_MOVE_REL, 0, 1, OP_ADD_DATA, OP_MOVE_REL, 1, -2, OP_ADD_DATA, OP_MOVE_REL, 0, 1, OP_ADD_DATA, OP_MOVE_REL, 0, 1, OP_ADD_DATA, OP_MOVE_REL, 1, -2, OP_ADD_DATA, OP_MOVE_REL, 0, 1, OP_ADD_DATA, OP_MOVE_REL, 0, 1, OP_ADD_DATA, OP_MOVE_REL, -1, -1, OP_STORE_DATA, OP_END }; CodeBlock blur = { sizeof(blur_instructions), &blur_instructions[0], NULL, NULL }; void print_state (State *state) { printf ("state={x=%d, y=%d, accum=%d, load_ptr=%p, store_ptr=%p\n", state->x, state->y, state->accum, state->load_ptr, state->store_ptr); } // this is the original blur function from blur.c. I keep it around // for regression testing. void blur_reference() { int sum; int x,y,x2,y2; for (x=1; xfunc != NULL) return 0; block->dlhandle = NULL; block->func = NULL; start_timer(1); // generate C code sprintf (buffer, "translate%d.c", id); fprintf (stderr, "building translation function in %s\n", buffer); fp = fopen (buffer, "w"); assert (fp!=NULL); n = gen_header (fp); assert (n>=0); n = gen_function (block->opcode_list, fp, id); assert (n>=0); fclose (fp); // compile into a shared library fprintf (stderr, "compiling %s\n", buffer); sprintf (buffer, "./buildshared translate%d", id); if (system (buffer) < 0) { fprintf (stderr, "failed: %s\n", buffer); return -1; } sprintf (buffer, "translate%d.c", id); if (stat (buffer, &st) < 0) { fprintf (stderr, "stat failed\n"); return -1; } // open shared library and get the function pointer sprintf (buffer, "libtranslate%d.la", id); block->dlhandle = lt_dlopen (buffer); if (!block->dlhandle) { fprintf (stderr, "can't open the module %s!\n", buffer); fprintf (stderr, "error was: %s\n", lt_dlerror()); return -1; } sprintf (buffer, "translate%d", id); block->func = (void (*)(State *)) lt_dlsym(block->dlhandle, buffer); if (!block->func) { fprintf (stderr, "can't find symbol %s\n", buffer); return -1; } stop_timer(1); fprintf (stderr, "Loaded 
shared library.\n"); fprintf (stderr, "Translation took "); report_time(stderr, 1, 1); return 0; } void execute_code_block (CodeBlock *block) { State state; if (!block->func) { int n = translate_block (block); assert (n>=0); assert (block->func != NULL); } state.x = 1; state.y = 1; (*block->func)(&state); } void fill_array() { int x,y; for (x=0; x='0' && argv[arg][0]<='9') { sscanf (argv[arg], "%d", ×); fprintf (stderr, "Set iterations to %d\n", times); } else { usage(); } } if (lt_dlinit () != 0) { fprintf (stderr, "lt_dlinit() failed\n"); exit (1); } fill_array(); //dump_array (stderr); start_timer(0); for (i=0; i