diff --git a/headers/private/graphics/intel_extreme/intel_extreme.h b/headers/private/graphics/intel_extreme/intel_extreme.h
index c9e3fedd01..cc1e23a3d1 100644
--- a/headers/private/graphics/intel_extreme/intel_extreme.h
+++ b/headers/private/graphics/intel_extreme/intel_extreme.h
@@ -176,6 +176,7 @@ struct intel_free_graphics_memory {
 #define RING_BUFFER_START				0x8
 #define RING_BUFFER_CONTROL				0xc
 #define INTEL_RING_BUFFER_SIZE_MASK		0x000ff800
+#define INTEL_RING_BUFFER_HEAD_MASK		0x001ffffc
 #define INTEL_RING_BUFFER_ENABLED		1
 
 #define INTEL_DISPLAY_HTOTAL			0x60000
@@ -248,6 +249,7 @@ struct intel_free_graphics_memory {
 
 // 2D acceleration
 #define COMMAND_BLIT					0x54c00006
+#define COMMAND_BLIT_RGBA				0x00300000
 
 // overlay
 
diff --git a/src/add-ons/accelerants/intel_extreme/commands.h b/src/add-ons/accelerants/intel_extreme/commands.h
index f76457855e..0a689131aa 100644
--- a/src/add-ons/accelerants/intel_extreme/commands.h
+++ b/src/add-ons/accelerants/intel_extreme/commands.h
@@ -16,7 +16,6 @@ struct command {
 	uint32	opcode;
 
 	uint32 *Data() { return &opcode; }
-	virtual size_t Length() = 0;
 };
 
 class QueueCommands {
@@ -24,7 +23,7 @@ class QueueCommands {
 		QueueCommands(ring_buffer &ring);
 		~QueueCommands();
 
-		void Put(struct command &command);
+		void Put(struct command &command, size_t size);
 		void PutFlush();
 		void PutWaitFor(uint32 event);
 		void PutOverlayFlip(uint32 code, bool updateCoefficients);
@@ -40,9 +39,9 @@ class QueueCommands {
 // commands
 
 struct blit_command : command {
-	uint8	flags;
-	uint8	raster_operation;
 	uint16	dest_bytes_per_row;
+	uint8	raster_operation;
+	uint8	flags;
 	uint16	dest_left;
 	uint16	dest_top;
 	uint16	dest_right;
@@ -69,14 +68,14 @@ struct blit_command : command {
 				break;
 			case 32:
 				flags = 3;
+				opcode |= COMMAND_BLIT_RGBA;
 				break;
 		}
 		dest_base = source_base = gInfo->shared_info->frame_buffer_offset;
 		dest_bytes_per_row = source_bytes_per_row = gInfo->shared_info->bytes_per_row;
 		reserved = 0;
+		raster_operation = 0xcc;
 	}
-
-	virtual size_t Length() { return 6; }
 };
 
 #endif	// COMMANDS_H
diff --git a/src/add-ons/accelerants/intel_extreme/engine.cpp b/src/add-ons/accelerants/intel_extreme/engine.cpp
index 53f09d6074..2bc80d8e84 100644
--- a/src/add-ons/accelerants/intel_extreme/engine.cpp
+++ b/src/add-ons/accelerants/intel_extreme/engine.cpp
@@ -12,7 +12,7 @@
 #include "commands.h"
 
 
-#define TRACE_ENGINE
+//#define TRACE_ENGINE
 #ifdef TRACE_ENGINE
 extern "C" void _sPrintf(const char *format, ...);
 #	define TRACE(x) _sPrintf x
@@ -39,20 +39,23 @@ QueueCommands::~QueueCommands()
 		_Write(COMMAND_NOOP);
 	}
 
-	write32(fRingBuffer.register_base + RING_BUFFER_TAIL, fRingBuffer.position);
-
 	// We must make sure memory is written back in case the ring buffer
 	// is in write combining mode - releasing the lock does this, as the
 	// buffer is flushed on a locked memory operation (which is what this
-	// benaphore does)
+	// benaphore does), but it must happen before writing the new tail...
+	int32 flush = 0;	// dummy target for the locked operation below
+	atomic_add(&flush, 1);
+
+	write32(fRingBuffer.register_base + RING_BUFFER_TAIL, fRingBuffer.position);
+
 	release_lock(&fRingBuffer.lock);
 }
 
 
 void
-QueueCommands::Put(struct command &command)
+QueueCommands::Put(struct command &command, size_t size)
 {
-	uint32 count = command.Length();
+	uint32 count = size / sizeof(uint32);
 	uint32 *data = command.Data();
 
 	_MakeSpace(count);
@@ -174,6 +177,20 @@ void
 intel_wait_engine_idle(void)
 {
 	TRACE(("intel_wait_engine_idle()\n"));
+
+	// TODO: busy-polling the ring registers should only be a temporary
+	// solution! A better way to do this would be to acquire the engine's
+	// lock and sync to the latest token.
+
+	ring_buffer &ring = gInfo->shared_info->primary_ring_buffer;
+	uint32 head, tail;
+	do {
+		head = read32(ring.register_base + RING_BUFFER_HEAD) & INTEL_RING_BUFFER_HEAD_MASK;
+		tail = read32(ring.register_base + RING_BUFFER_TAIL) & INTEL_RING_BUFFER_HEAD_MASK;
+
+		//snooze(100);
+		// snooze() might be too slow: this is called *very* often in Haiku
+	} while (head != tail);
 }
 
 
@@ -189,6 +206,7 @@ status_t
 intel_sync_to_token(sync_token *syncToken)
 {
 	TRACE(("intel_sync_to_token()\n"));
+	intel_wait_engine_idle();
 	return B_OK;
 }
 
@@ -207,10 +225,10 @@ intel_screen_to_screen_blit(engine_token *token, blit_params *params, uint32 cou
 		blit.source_top = params[i].src_top;
 		blit.dest_left = params[i].dest_left;
 		blit.dest_top = params[i].dest_top;
-		blit.dest_right = params[i].dest_left + params[i].width;
-		blit.dest_bottom = params[i].dest_top + params[i].height;
+		blit.dest_right = params[i].dest_left + params[i].width + 1;
+		blit.dest_bottom = params[i].dest_top + params[i].height + 1;
 
-		queue.Put(blit);
+		queue.Put(blit, sizeof(blit));
 	}
 }
 
diff --git a/src/add-ons/accelerants/intel_extreme/hooks.cpp b/src/add-ons/accelerants/intel_extreme/hooks.cpp
index 5e03696af4..4412d0a107 100644
--- a/src/add-ons/accelerants/intel_extreme/hooks.cpp
+++ b/src/add-ons/accelerants/intel_extreme/hooks.cpp
@@ -83,9 +83,9 @@ get_accelerant_hook(uint32 feature, void *data)
 			return intel_sync_to_token;
 
 		/* 2D acceleration */
-/*
 		case B_SCREEN_TO_SCREEN_BLIT:
 			return intel_screen_to_screen_blit;
+/*
 		case B_FILL_RECTANGLE:
 			return intel_fill_rectangle;
 		case B_INVERT_RECTANGLE: