diff --git a/src/add-ons/accelerants/nvidia/Acceleration.c b/src/add-ons/accelerants/nvidia/Acceleration.c index 205bf0f5cb..893de5150c 100644 --- a/src/add-ons/accelerants/nvidia/Acceleration.c +++ b/src/add-ons/accelerants/nvidia/Acceleration.c @@ -9,8 +9,8 @@ /* note: moved DMA acceleration 'top-level' routines to be integrated in the engine: - it is costly to call the engine for every single function within a loop!! - (BeRoMeter 1.2.6 benchmarked: P4 3.2Ghz increased 15%, ...) + it is costly to call the engine for every single function within a loop! + (measured with BeRoMeter 1.2.6: upto 15% speed increase on all CPU's.) Leaving PIO acceleration as it is for now, for the purpose of benchmarking :-) note also: diff --git a/src/add-ons/accelerants/nvidia/engine/nv_acc_dma.c b/src/add-ons/accelerants/nvidia/engine/nv_acc_dma.c index 3fbe4482ac..bb6a21d7b1 100644 --- a/src/add-ons/accelerants/nvidia/engine/nv_acc_dma.c +++ b/src/add-ons/accelerants/nvidia/engine/nv_acc_dma.c @@ -1052,14 +1052,19 @@ void nv_acc_assert_fifo_dma(void) /* note: moved acceleration 'top-level' routines to be integrated in the engine: - it is costly to call the engine for every single function within a loop!! - (BeRoMeter 1.2.6 benchmarked: P4 3.2Ghz increased 15%, ...) + it is costly to call the engine for every single function within a loop! + (measured with BeRoMeter 1.2.6: upto 15% speed increase on all CPU's.) + + note also: + splitting up each command list into sublists (see routines below) prevents + a lot more nested calls, further increasing the speed with upto 70%. */ /* screen to screen blit - i.e. move windows around and scroll within them. */ void SCREEN_TO_SCREEN_BLIT_DMA(engine_token *et, blit_params *list, uint32 count) { uint32 i = 0; + uint16 subcnt; /*** init acc engine for blit function ***/ /* ROP registers (Raster OPeration): @@ -1072,25 +1077,34 @@ void SCREEN_TO_SCREEN_BLIT_DMA(engine_token *et, blit_params *list, uint32 count /*** do each blit ***/ /* Note: * blit-copy direction is determined inside nvidia hardware: no setup needed */ - while (count--) + while (count) { - /* instruct engine what to blit: - * wait for room in fifo for blit cmd if needed. */ - if (nv_acc_fifofree_dma(4) != B_OK) return; - /* now setup blit (writing 4 32bit words) */ - nv_acc_cmd_dma(NV_IMAGE_BLIT, NV_IMAGE_BLIT_SOURCEORG, 3); - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - (((list[i].src_top) << 16) | (list[i].src_left)); /* SourceOrg */ - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - (((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */ - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - ((((list[i].height) + 1) << 16) | ((list[i].width) + 1)); /* HeightWidth */ + /* break up the list in sublists to minimize calls, while making sure long + * lists still get executed without trouble */ + subcnt = 32; + if (count < 32) subcnt = count; + count -= subcnt; + + /* wait for room in fifo for blit cmd if needed. */ + if (nv_acc_fifofree_dma(4 * subcnt) != B_OK) return; + + while (subcnt--) + { + /* now setup blit (writing 4 32bit words) */ + nv_acc_cmd_dma(NV_IMAGE_BLIT, NV_IMAGE_BLIT_SOURCEORG, 3); + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + (((list[i].src_top) << 16) | (list[i].src_left)); /* SourceOrg */ + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + (((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */ + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + ((((list[i].height) + 1) << 16) | ((list[i].width) + 1)); /* HeightWidth */ + + i++; + } /* tell the engine to fetch the commands in the DMA buffer that where not * executed before. */ nv_start_dma(); - - i++; } } @@ -1098,6 +1112,7 @@ void SCREEN_TO_SCREEN_BLIT_DMA(engine_token *et, blit_params *list, uint32 count void FILL_RECTANGLE_DMA(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count) { uint32 i = 0; + uint16 subcnt; /*** init acc engine for fill function ***/ /* ROP registers (Raster OPeration): @@ -1111,24 +1126,33 @@ void FILL_RECTANGLE_DMA(engine_token *et, uint32 colorIndex, fill_rect_params *l si->engine.dma.cmdbuffer[si->engine.dma.current++] = colorIndex; /* Color1A */ /*** draw each rectangle ***/ - while (count--) + while (count) { - /* instruct engine what to fill: - * wait for room in fifo for bitmap cmd if needed. */ - if (nv_acc_fifofree_dma(3) != B_OK) return; - /* now setup fill (writing 3 32bit words) */ - nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, 2); - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - (((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */ - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - (((((list[i].right)+1) - (list[i].left)) << 16) | - (((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */ + /* break up the list in sublists to minimize calls, while making sure long + * lists still get executed without trouble */ + subcnt = 32; + if (count < 32) subcnt = count; + count -= subcnt; + + /* wait for room in fifo for bitmap cmd if needed. */ + if (nv_acc_fifofree_dma(3 * subcnt) != B_OK) return; + + while (subcnt--) + { + /* now setup fill (writing 3 32bit words) */ + nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, 2); + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + (((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */ + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + (((((list[i].right)+1) - (list[i].left)) << 16) | + (((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */ + + i++; + } /* tell the engine to fetch the commands in the DMA buffer that where not * executed before. */ nv_start_dma(); - - i++; } } @@ -1136,6 +1160,7 @@ void FILL_RECTANGLE_DMA(engine_token *et, uint32 colorIndex, fill_rect_params *l void FILL_SPAN_DMA(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count) { uint32 i = 0; + uint16 subcnt; /*** init acc engine for fill function ***/ /* ROP registers (Raster OPeration): @@ -1149,23 +1174,32 @@ void FILL_SPAN_DMA(engine_token *et, uint32 colorIndex, uint16 *list, uint32 cou si->engine.dma.cmdbuffer[si->engine.dma.current++] = colorIndex; /* Color1A */ /*** draw each span ***/ - while (count--) + while (count) { - /* instruct engine what to fill: - * wait for room in fifo for bitmap cmd if needed. */ - if (nv_acc_fifofree_dma(3) != B_OK) return; - /* now setup fill (writing 3 32bit words) */ - nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, 2); - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - (((list[i+1]) << 16) | ((list[i]) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */ - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - ((((list[i+2]+1) - (list[i+1])) << 16) | 0x00000001); /* Unclipped Rect 0 WidthHeight */ + /* break up the list in sublists to minimize calls, while making sure long + * lists still get executed without trouble */ + subcnt = 32; + if (count < 32) subcnt = count; + count -= subcnt; + + /* wait for room in fifo for bitmap cmd if needed. */ + if (nv_acc_fifofree_dma(3 * subcnt) != B_OK) return; + + while (subcnt--) + { + /* now setup fill (writing 3 32bit words) */ + nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, 2); + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + (((list[i+1]) << 16) | ((list[i]) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */ + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + ((((list[i+2]+1) - (list[i+1])) << 16) | 0x00000001); /* Unclipped Rect 0 WidthHeight */ + + i+=3; + } /* tell the engine to fetch the commands in the DMA buffer that where not * executed before. */ nv_start_dma(); - - i+=3; } } @@ -1173,6 +1207,7 @@ void FILL_SPAN_DMA(engine_token *et, uint32 colorIndex, uint16 *list, uint32 cou void INVERT_RECTANGLE_DMA(engine_token *et, fill_rect_params *list, uint32 count) { uint32 i = 0; + uint16 subcnt; /*** init acc engine for invert function ***/ /* ROP registers (Raster OPeration): @@ -1186,23 +1221,32 @@ void INVERT_RECTANGLE_DMA(engine_token *et, fill_rect_params *list, uint32 count si->engine.dma.cmdbuffer[si->engine.dma.current++] = 0x00000000; /* Color1A */ /*** invert each rectangle ***/ - while (count--) + while (count) { - /* instruct engine what to fill: - * wait for room in fifo for bitmap cmd if needed. */ - if (nv_acc_fifofree_dma(3) != B_OK) return; - /* now setup fill (writing 3 32bit words) */ - nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, 2); - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - (((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */ - si->engine.dma.cmdbuffer[si->engine.dma.current++] = - (((((list[i].right)+1) - (list[i].left)) << 16) | - (((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */ + /* break up the list in sublists to minimize calls, while making sure long + * lists still get executed without trouble */ + subcnt = 32; + if (count < 32) subcnt = count; + count -= subcnt; + + /* wait for room in fifo for bitmap cmd if needed. */ + if (nv_acc_fifofree_dma(3 * subcnt) != B_OK) return; + + while (subcnt--) + { + /* now setup fill (writing 3 32bit words) */ + nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, 2); + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + (((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */ + si->engine.dma.cmdbuffer[si->engine.dma.current++] = + (((((list[i].right)+1) - (list[i].left)) << 16) | + (((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */ + + i++; + } /* tell the engine to fetch the commands in the DMA buffer that where not * executed before. */ nv_start_dma(); - - i++; } } diff --git a/src/add-ons/accelerants/nvidia/engine/nv_general.c b/src/add-ons/accelerants/nvidia/engine/nv_general.c index 5ea9cac1ff..3c50311ba2 100644 --- a/src/add-ons/accelerants/nvidia/engine/nv_general.c +++ b/src/add-ons/accelerants/nvidia/engine/nv_general.c @@ -90,7 +90,7 @@ status_t nv_general_powerup() { status_t status; - LOG(1,("POWERUP: Haiku nVidia Accelerant 0.39 running.\n")); + LOG(1,("POWERUP: Haiku nVidia Accelerant 0.40 running.\n")); /* preset no laptop */ si->ps.laptop = false;