H.264: converting only clipping rects to XRGB

This commit is contained in:
erbth 2014-09-02 22:16:56 +02:00
parent dee50a8ca2
commit 25593c7250
9 changed files with 437 additions and 245 deletions

View File

@ -23,8 +23,6 @@
#include "xf_gfx.h"
#include <sys/time.h>
int xf_ResetGraphics(RdpgfxClientContext* context, RDPGFX_RESET_GRAPHICS_PDU* resetGraphics)
{
xfContext* xfc = (xfContext*) context->custom;
@ -350,19 +348,10 @@ int xf_SurfaceCommand_Planar(xfContext* xfc, RdpgfxClientContext* context, RDPGF
int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_SURFACE_COMMAND* cmd)
{
int status;
UINT32 i, j;
int nXDst, nYDst;
int nWidth, nHeight;
int nbUpdateRects;
UINT32 i;
BYTE* DstData = NULL;
RDPGFX_RECT16* rect;
H264_CONTEXT* h264;
xfGfxSurface* surface;
REGION16 updateRegion;
RECTANGLE_16 updateRect;
RECTANGLE_16* updateRects;
REGION16 clippingRects;
RECTANGLE_16 clippingRect;
RDPGFX_H264_METABLOCK* meta;
RDPGFX_H264_BITMAP_STREAM* bs;
@ -384,7 +373,7 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
DstData = surface->data;
status = h264_decompress(xfc->h264, bs->data, bs->length, &DstData,
PIXEL_FORMAT_XRGB32, surface->scanline, cmd->left, cmd->top, cmd->width, cmd->height);
PIXEL_FORMAT_XRGB32, surface->scanline , surface->height, meta->regionRects, meta->numRegionRects);
if (status < 0)
{
@ -392,54 +381,11 @@ int xf_SurfaceCommand_H264(xfContext* xfc, RdpgfxClientContext* context, RDPGFX_
return -1;
}
if (status < 0)
return -1;
region16_init(&clippingRects);
for (i = 0; i < meta->numRegionRects; i++)
{
rect = &(meta->regionRects[i]);
clippingRect.left = rect->left;
clippingRect.top = rect->top;
clippingRect.right = rect->right;
clippingRect.bottom = rect->bottom;
region16_union_rect(&clippingRects, &clippingRects, &clippingRect);
region16_union_rect(&(xfc->invalidRegion), &(xfc->invalidRegion), (RECTANGLE_16*) &(meta->regionRects[i]));
}
updateRect.left = cmd->left;
updateRect.top = cmd->top;
updateRect.right = cmd->right;
updateRect.bottom = cmd->bottom;
region16_init(&updateRegion);
region16_intersect_rect(&updateRegion, &clippingRects, &updateRect);
updateRects = (RECTANGLE_16*) region16_rects(&updateRegion, &nbUpdateRects);
for (j = 0; j < nbUpdateRects; j++)
{
nXDst = updateRects[j].left;
nYDst = updateRects[j].top;
nWidth = updateRects[j].right - updateRects[j].left;
nHeight = updateRects[j].bottom - updateRects[j].top;
/* update region from decoded H264 buffer */
freerdp_image_copy(surface->data, PIXEL_FORMAT_XRGB32, surface->scanline,
nXDst, nYDst, nWidth, nHeight,
h264->data, PIXEL_FORMAT_XRGB32, h264->scanline, nXDst, nYDst);
region16_union_rect(&(xfc->invalidRegion), &(xfc->invalidRegion), &updateRects[j]);
}
region16_uninit(&updateRegion);
region16_uninit(&clippingRects);
if (!xfc->inGfxFrame)
xf_OutputUpdate(xfc);

View File

@ -22,6 +22,7 @@
#include <freerdp/api.h>
#include <freerdp/types.h>
#include <freerdp/channels/rdpgfx.h>
#ifdef WITH_LIBAVCODEC
#ifdef WITH_OPENH264
@ -43,14 +44,16 @@ struct _H264_CONTEXT
{
BOOL Compressor;
BYTE* data;
UINT32 size;
//BYTE* data;
//UINT32 size;
UINT32 width;
UINT32 height;
int scanline;
//int scanline;
#ifdef WITH_OPENH264
ISVCDecoder* pDecoder;
BYTE* pYUVData[3];
int iStride[2];
#endif
#ifdef WITH_LIBAVCODEC
@ -69,7 +72,7 @@ extern "C" {
FREERDP_API int h264_compress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize, BYTE** ppDstData, UINT32* pDstSize);
FREERDP_API int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight);
BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRect);
FREERDP_API void h264_context_reset(H264_CONTEXT* h264);

View File

@ -32,7 +32,7 @@
#ifdef WITH_H264_SSSE3
extern int check_ssse3();
extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
extern int freerdp_image_yuv420p_to_xrgb(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int *iStride,int scanline);
#else
#ifdef WITH_H264_ASM
extern int freerdp_image_yuv_to_xrgb_asm(BYTE *pDstData,BYTE **pSrcData,int nWidth,int nHeight,int iStride0,int iStride1);
@ -204,6 +204,7 @@ void h264_dump_yuv_data(BYTE* yuv[], int width, int height, int stride[])
fclose(fp);
}
#ifdef WITH_LIBAVCODEC
int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height)
{
UINT32 size;
@ -224,6 +225,7 @@ int h264_prepare_rgb_buffer(H264_CONTEXT* h264, int width, int height)
return 1;
}
#endif
int freerdp_image_copy_yuv420p_to_xrgb(BYTE* pDstData, int nDstStep, int nXDst, int nYDst,
int nWidth, int nHeight, BYTE* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc)
@ -343,13 +345,11 @@ static void openh264_trace_callback(H264_CONTEXT* h264, int level, const char* m
printf("%d - %s\n", level, message);
}
static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
BYTE* pDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight)
static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize)
{
DECODING_STATE state;
SBufferInfo sBufferInfo;
SSysMEMBuffer* pSystemBuffer;
BYTE* pYUVData[3];
struct timeval T1,T2;
@ -360,9 +360,9 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
* Decompress the image. The RDP host only seems to send I420 format.
*/
pYUVData[0] = NULL;
pYUVData[1] = NULL;
pYUVData[2] = NULL;
h264->pYUVData[0] = NULL;
h264->pYUVData[1] = NULL;
h264->pYUVData[2] = NULL;
ZeroMemory(&sBufferInfo, sizeof(sBufferInfo));
@ -371,7 +371,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
h264->pDecoder,
pSrcData,
SrcSize,
pYUVData,
h264->pYUVData,
&sBufferInfo);
/**
@ -382,7 +382,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
*/
if (sBufferInfo.iBufferStatus != 1)
state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, pYUVData, &sBufferInfo);
state = (*h264->pDecoder)->DecodeFrame2(h264->pDecoder, NULL, 0, h264->pYUVData, &sBufferInfo);
gettimeofday(&T2,NULL);
printf("OpenH264: decoding took: %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
@ -391,7 +391,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
#if 0
printf("h264_decompress: state=%u, pYUVData=[%p,%p,%p], bufferStatus=%d, width=%d, height=%d, format=%d, stride=[%d,%d]\n",
state, pYUVData[0], pYUVData[1], pYUVData[2], sBufferInfo.iBufferStatus,
state, h264->pYUVData[0], h264->pYUVData[1], h264->pYUVData[2], sBufferInfo.iBufferStatus,
pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iFormat,
pSystemBuffer->iStride[0], pSystemBuffer->iStride[1]);
#endif
@ -399,7 +399,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
if (state != 0)
return -1;
if (!pYUVData[0] || !pYUVData[1] || !pYUVData[2])
if (!h264->pYUVData[0] || !h264->pYUVData[1] || !h264->pYUVData[2])
return -1;
if (sBufferInfo.iBufferStatus != 1)
@ -412,11 +412,18 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
if (g_H264DumpFrames)
{
h264_dump_yuv_data(pYUVData, pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iStride);
h264_dump_yuv_data(h264->pYUVData, pSystemBuffer->iWidth, pSystemBuffer->iHeight, pSystemBuffer->iStride);
}
g_H264FrameId++;
h264->iStride[0] = pSystemBuffer->iStride[0];
h264->iStride[1] = pSystemBuffer->iStride[1];
h264->width = pSystemBuffer->iWidth;
h264->height = pSystemBuffer->iHeight;
#if 0
if (h264_prepare_rgb_buffer(h264, pSystemBuffer->iWidth, pSystemBuffer->iHeight) < 0)
return -1;
@ -433,6 +440,7 @@ static int openh264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSiz
#endif
gettimeofday(&T2,NULL);
printf("\tconverting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
#endif
return 1;
}
@ -662,10 +670,20 @@ EXCEPTION:
int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nXDst, int nYDst, int nWidth, int nHeight)
BYTE** ppDstData, DWORD DstFormat, int nDstStep, int nDstHeight, RDPGFX_RECT16* regionRects, int numRegionRects)
{
UINT32 UncompressedSize;
BYTE* pDstData;
BYTE* pDstPoint;
BYTE** pYUVData;
BYTE* pYUVPoint[2];
RDPGFX_RECT16* rect;
int* iStride;
int ret, i, cx, cy;
struct timeval T1,T2;
if (!h264)
return -1;
@ -675,39 +693,27 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
#endif
#if 0
printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, nXDst=%d, nYDst=%d, nWidth=%d, nHeight=%d)\n",
pSrcData, SrcSize, *ppDstData, nDstStep, nXDst, nYDst, nWidth, nHeight);
printf("h264_decompress: pSrcData=%p, SrcSize=%u, pDstData=%p, nDstStep=%d, numRegionRects=%d\n",
pSrcData, SrcSize, *ppDstData, nDstStep, numRegionRects);
#endif
/* Allocate a destination buffer (if needed). */
UncompressedSize = nWidth * nHeight * 4;
if (UncompressedSize == 0)
if (!(pDstData = *ppDstData))
return -1;
pDstData = *ppDstData;
if (!pDstData)
{
pDstData = (BYTE*) malloc(UncompressedSize);
if (!pDstData)
return -1;
*ppDstData = pDstData;
}
if (g_H264DumpFrames)
{
h264_dump_h264_data(pSrcData, SrcSize);
}
#ifdef WITH_OPENH264
return openh264_decompress(
h264, pSrcData, SrcSize,
pDstData, DstFormat, nDstStep,
nXDst, nYDst, nWidth, nHeight);
ret = openh264_decompress(h264, pSrcData, SrcSize);
if (ret != 1)
return ret;
pYUVData = h264->pYUVData;
iStride = h264->iStride;
#endif
#ifdef WITH_LIBAVCODEC
@ -717,6 +723,38 @@ int h264_decompress(H264_CONTEXT* h264, BYTE* pSrcData, UINT32 SrcSize,
nXDst, nYDst, nWidth, nHeight);
#endif
/* Convert I420 (same as IYUV) to XRGB. */
UncompressedSize = h264->width * h264->height * 4;
if (UncompressedSize > (nDstStep * nDstHeight))
return -1;
gettimeofday(&T1,NULL);
for (i = 0; i < numRegionRects; i++){
rect = &(regionRects[i]);
cx = rect->right - rect->left;
cy = rect->bottom - rect->top;
pDstPoint = pDstData + rect->top * nDstStep + rect->left * 4;
pYUVPoint[0] = pYUVData[0] + rect->top * iStride[0] + rect->left;
ret = rect->top/2 * iStride[1] + rect->left/2;
pYUVPoint[1] = pYUVData[1] + ret;
pYUVPoint[2] = pYUVData[2] + ret;
#if 1
printf("regionRect: x: %d, y: %d, cx: %d, cy: %d\n",
rect->left, rect->top, cx, cy);
#endif
#ifdef WITH_H264_SSSE3
freerdp_image_yuv420p_to_xrgb(pDstPoint, pYUVPoint, cx, cy, iStride, nDstStep);
#endif
}
gettimeofday(&T2,NULL);
printf("converting took %u sec %u usec\n",(unsigned int)(T2.tv_sec-T1.tv_sec),(unsigned int)(T2.tv_usec-T1.tv_usec));
return 1;
}
@ -737,7 +775,7 @@ H264_CONTEXT* h264_context_new(BOOL Compressor)
#ifdef WITH_H264_SSSE3
if(check_ssse3()){
printf("SSSE3 seems to be not supported on this system, try without WITH_H264_ASM ...");
printf("SSSE3 seems to be not supported on this system, try without WITH_H264_SSSE3 ...");
return FALSE;
}
#endif
@ -772,14 +810,13 @@ void h264_context_free(H264_CONTEXT* h264)
{
if (h264)
{
_aligne_free(h264->data);
#ifdef WITH_OPENH264
openh264_free(h264);
#endif
#ifdef WITH_LIBAVCODEC
libavcodec_free(h264);
_aligned_free(h264->data);
#endif
free(h264);

View File

@ -73,7 +73,7 @@ freerdp_image_yuv420p_to_xrgb:
mov ebp,esp
;"local variables"
sub esp,318 ;res 8 -8,res 8 -16,res 8 -24,U 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_column 1 -41,res 1 -42,G 16 -58,B 16 -74,
sub esp,318 ;res 8 -8,res 8 -16,res 8 -24,U 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,res 1 -42,G 16 -58,B 16 -74,
;R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 4 -190,VaddUV 4 -194,stack offset 8 -202,
;cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,scanline 4 -318

View File

@ -1,7 +1,8 @@
; a entire function for converting YUV420p data to the RGB format (without any special upconverting)
; function for converting YUV420p data to the RGB format (but without any special upconverting)
; It's completely written in nasm-x86-assembly for intel processors supporting SSSE3 and higher.
; Restrictions are that output data has to be aligned to 16 byte (a question of REAL performance!)
; and the width of resolution must be divisible by four.
; The target scanline (6th parameter) must be a multiple of 16.
; iStride[0] must be (target scanline) / 4 or bigger and iStride[1] the next multiple of four
; of the half of iStride[0] or bigger
;
section .text
global check_ssse3
@ -48,7 +49,7 @@ check_ssse3_end:
ret
;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1)
;extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline)
global freerdp_image_yuv420p_to_xrgb
freerdp_image_yuv420p_to_xrgb:
push rbx
@ -79,11 +80,13 @@ freerdp_image_yuv420p_to_xrgb:
xor r14,r14
;"local variables"
sub rsp,316 ;pDstData 8 -8,Y 8 -16,U 8 -24,V 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_column 1 -41,res 1 -42,G 16 -58,B 16 -74,
;R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 2 -188,VaddUV 2 -190,res 12 -202,cmp:255 16 -218,
;cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,scanline 2 -316
sub rsp,338 ;pDstData 8 -8,Y 8 -16,U 8 -24,V 8 -32,nWidth 2 -34,nHeight 2 -36,iStride0 2 -38,iStride1 2 -40,last_line 1 -41,last_column 1 -42,
;G 16 -58,B 16 -74,R 16 -90,add:128 16 -106,sub:128 16 -122,mul:48 16 -138,mul:475 16 -154,mul:403 16 -170,mul:120 16 -186,VaddY 2 -188,VaddUV 2 -190,
;res 12 -202,cmp:255 16 -218,cmp:0 16 -234,shuflleR 16 -250,andG 16 -266,shuffleB 16 -280,shuffleY 16 -296,shuffleUV 16 -314,andRemainingColumns 16 -330,
;VddDst 8 -338
;last_line: if the last (U,V doubled) line should be skipped, set to 1B
;last_line: if the last (U,V doubled) line should be skipped, set to 10B
;last_column: if it's the last column in a line, set to 10B (for handling line endings that are not a multiple of four)
mov [rbp-8],rdi
@ -97,28 +100,46 @@ freerdp_image_yuv420p_to_xrgb:
mov [rbp-34],dx
mov r13w,cx
and r8,0FFFFH
mov [rbp-38],r8w
and r9,0FFFFH
mov [rbp-40],r9w
mov r10w,r9w
and r10,0FFFFH
shl r8w,1
sub r8w,dx
mov r11w,r8w
mov ecx,[r8]
mov [rbp-38],ecx
mov r12d,[r8+4]
mov [rbp-40],r12w
mov r10w,dx
shr dx,1
sub r9w,dx
mov r12w,r9w
mov [rbp-42],dl
and byte [rbp-42],11B
mov [rbp-338],r10
shr word [rbp-338],1
shl cx,1
mov r8w,[rbp-34]
shr r8w,2
shl r10w,2
add r8w,3
and r8w, 0FFFCH
sub [rbp-338],r8w
sub cx,r8w
shr r8w,1
mov dx,r8w
add dx,2
and dx,0FFFCH
sub r12w,dx
shl dword [rbp-338],2
mov r11w,cx
shr r8w,1
mov r9w,[rbp-38]
;and al,11B
;jz no_column_rest
@ -238,11 +259,40 @@ freerdp_image_yuv420p_to_xrgb:
mov eax,80038003H
mov [rbp-302],eax
;remaining columns and mask
cmp byte [rbp-42],0
je freerdp_image_yuv420p_to_xrgb_no_columns_remain
mov dl,[rbp-42]
xor ebx,ebx
xor ecx,ecx
xor esi,esi
mov eax,0FFFFFFFFH
cmp dl,1H
je freerdp_image_yuv420p_to_xrgb_write_columns_remain
mov ebx,0FFFFFFFFH
cmp dl,2H
je freerdp_image_yuv420p_to_xrgb_write_columns_remain
mov ecx,0FFFFFFFFH
freerdp_image_yuv420p_to_xrgb_write_columns_remain:
mov [rbp-330],eax
mov [rbp-326],ebx
mov [rbp-322],ecx
mov [rbp-318],esi
mov byte [rbp-42],1
freerdp_image_yuv420p_to_xrgb_no_columns_remain:
mov rsi,[rbp-16]
mov rax,[rbp-24]
mov rbx,[rbp-32]
;jmp freerdp_image_yuv420p_to_xrgb_end
freerdp_image_yuv420p_to_xrgb_hloop:
dec r13w
@ -254,7 +304,7 @@ not_last_line:
xor cx,cx
freerdp_image_yuv420p_to_xrgb_wloop:
;main loop
; Well, in the end it should look like this:
; C = Y;
; D = U - 128;
; E = V - 128;
@ -264,21 +314,31 @@ freerdp_image_yuv420p_to_xrgb_wloop:
; B = clip(( 256 * C + 475 * D + 128) >> 8);
test cx,1B
jnz load_yuv_data
jnz freerdp_image_yuv420p_to_xrgb_load_yuv_data
;prepare U data
; Y-, U- and V-data is stored in different arrays.
; We start with processing U-data.
; at first we fetch four U-values from its array and shuffle them like this:
; 0d0d 0c0c 0b0b 0a0a
; we've done two things: converting the values to signed words and duplicating
; each value, because always two pixel "share" the same U- (and V-) data
movd xmm0,[rax]
movdqa xmm5,[rbp-314]
pshufb xmm0,xmm5 ;but this is the omest instruction of all!!
pshufb xmm0,xmm5 ;but this is the awesomest instruction of all!!
add rax,4
; then we subtract 128 from each value, so we get D
movdqa xmm3,[rbp-122]
psubsw xmm0,xmm3
; we need to do two things with our D, so let's store it for later use
movdqa xmm2,xmm0
; now we can multiply our D with 48 and unpack it to xmm4:xmm0
; this is what we need to get G data later on
movdqa xmm4,xmm0
movdqa xmm7,[rbp-138]
pmullw xmm0,xmm7
@ -289,11 +349,16 @@ freerdp_image_yuv420p_to_xrgb_wloop:
punpckhwd xmm7,xmm4
movdqa xmm4,xmm7
; to complete this step, add (?) 128 to each value (rounding ?!)
; yeah, add. in the end this will be subtracted from something,
; because it's part of G: 256*C - (48*D + 120*E - 128), 48*D-128 !
; by the way, our values have become signed dwords during multiplication!
movdqa xmm6,[rbp-106]
psubd xmm0,xmm6
psubd xmm4,xmm6
; to get B data, we need to prepare a second value, D*475+128
movdqa xmm1,xmm2
movdqa xmm7,[rbp-154]
pmullw xmm1,xmm7
@ -306,10 +371,14 @@ freerdp_image_yuv420p_to_xrgb_wloop:
paddd xmm1,xmm6
paddd xmm7,xmm6
; so we got something like this: xmm7:xmm1
; this pair contains values for 16 pixel:
; aabbccdd
; aabbccdd, but we can only work on four pixel at once, so we need to save upper values
movdqa [rbp-74],xmm7
;prepare V data
; Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients.
movd xmm2,[rbx]
pshufb xmm2,xmm5
@ -319,6 +388,7 @@ freerdp_image_yuv420p_to_xrgb_wloop:
movdqa xmm5,xmm2
; this is also known as E*403+128, we need it to convert R data
movdqa xmm3,xmm2
movdqa xmm7,[rbp-170]
pmullw xmm2,xmm7
@ -331,9 +401,11 @@ freerdp_image_yuv420p_to_xrgb_wloop:
paddd xmm2,xmm6
paddd xmm7,xmm6
; and preserve upper four values for future ...
movdqa [rbp-90],xmm7
; doing this step: E*120
movdqa xmm3,xmm5
movdqa xmm7,[rbp-186]
pmullw xmm3,xmm7
@ -343,59 +415,128 @@ freerdp_image_yuv420p_to_xrgb_wloop:
punpcklwd xmm3,xmm5
punpckhwd xmm7,xmm5
; now we complete what we've begun above:
; (48*D-128) + (120*E) = (48*D +120*E -128)
paddd xmm0,xmm3
paddd xmm4,xmm7
; and store to memory !
movdqa [rbp-58],xmm4
jmp valid_yuv_data
; real assembly programmers do not only produce best results between 0 and 5 o'clock,
; but are also kangaroos!
jmp freerdp_image_yuv420p_to_xrgb_valid_yuv_data
load_yuv_data:
freerdp_image_yuv420p_to_xrgb_load_yuv_data:
; maybe you've wondered about the conditional jump to this label above ?
; Well, we prepared UV data for eight pixels in each line, but can only process four
; per loop. So we need to load the upper four pixels' data from memory every second loop!
movdqa xmm1,[rbp-74]
movdqa xmm2,[rbp-90]
movdqa xmm0,[rbp-58]
valid_yuv_data:
freerdp_image_yuv420p_to_xrgb_valid_yuv_data:
inc cx
cmp cx,r8w
jne freerdp_image_yuv420p_to_xrgb_not_last_columns
shl byte [rbp-42],1
;Y data processing
freerdp_image_yuv420p_to_xrgb_not_last_columns:
; We didn't produce any output yet, so let's do so!
; Ok, fetch four pixel from the Y-data array and shuffle them like this:
; 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256
movd xmm4,[rsi]
pshufb xmm4,[rbp-298]
movdqa xmm5,xmm4
movdqa xmm6,xmm4
; now we can perform the "real" conversion itself and produce output!
paddd xmm4,xmm2
psubd xmm5,xmm0
paddd xmm6,xmm1
; in the end, we only need bytes for RGB values.
; So, what do we do? right! shifting left makes values bigger and thats always good.
; before we had dwords of data, and by shifting left and treating the result
; as packed words, we get not only signed words, but do also divide by 256
; imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least
; significant byte, that we don't need anymore, because we've done some rounding
pslld xmm4,8
pslld xmm5,8
pslld xmm6,8
; one thing we still have to face is the clip() function ...
; we have still signed words, and there are those min/max instructions in SSE2 ...
; the max instruction takes always the bigger of the two operands and stores it in the first one,
; and it operates with signs !
; if we feed it with our values and zeros, it takes the zeros if our values are smaller than
; zero and otherwise our values
movdqa xmm7,[rbp-234]
pmaxsw xmm4,xmm7 ;what an awesome instruction!
pmaxsw xmm5,xmm7
pmaxsw xmm6,xmm7
; the same thing just completely different can be used to limit our values to 255,
; but now using the min instruction and 255s
movdqa xmm7,[rbp-218]
pminsw xmm4,xmm7
pminsw xmm5,xmm7
pminsw xmm6,xmm7
; Now we got our bytes.
; the moment has come to assemble the three channels R,G and B to the xrgb dwords
; on the Red channel we just have to AND each future dword with 00FF0000H
pand xmm4,[rbp-250]
; on Green channel we have to shuffle somehow, so we get something like this:
; 00d0 00c0 00b0 00a0
pshufb xmm5,[rbp-266]
; and on Blue channel that one:
; 000d 000c 000b 000a
pshufb xmm6,[rbp-282]
; and at last we or it together and get this one:
; xrgb xrgb xrgb xrgb
por xmm4,xmm5
por xmm4,xmm6
movdqa [rdi],xmm4
; The only thing left to do now is writing data to memory, but this gets a bit more
; complicated if the width is not a multiple of four and it is the last column in the line.
; but otherwise just play the kangaroo
test byte [rbp-42],2
je freerdp_image_yuv420p_to_xrgb_column_process_complete
; let's say, we need to only convert six pixel in width
; Ok, the first 4 pixel will be converted just like every 4 pixel else, but
; if it's the last loop in line, [rbp-42] is shifted left by one (curious? have a look above),
; and we land here. Through initialisation a mask was prepared. In this case it looks like
; 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH
movdqa xmm6,[rbp-330]
; we and our output data with this mask to get only the valid pixel
pand xmm4,xmm6
; then we fetch memory from the destination array ...
movdqu xmm5,[rdi]
; ... and and it with the inverse mask. We get only those pixel, which should not be updated
pandn xmm6,xmm5
; we only have to or the two values together and write it back to the destination array,
; and only the pixel that should be updated really get changed.
por xmm4,xmm6
freerdp_image_yuv420p_to_xrgb_column_process_complete:
movdqu [rdi],xmm4
;Y data processing in second line
; Because UV data is the same for two lines, we can process the second line just here,
; in the same loop. Only thing we need to do is to add some offsets to the Y- and destination
; pointer. These offsets are iStride[0] and the target scanline.
; But if we don't need to process the second line, like if we are in the last line of processing nine lines,
; we just skip all this.
test r14b,2
jnz skip_last_line1
jnz freerdp_yuv420p_to_xrgb_skip_last_line
movd xmm4,[rsi+r9]
pshufb xmm4,[rbp-298]
@ -429,21 +570,46 @@ valid_yuv_data:
por xmm4,xmm5
por xmm4,xmm6
movdqa [rdi+r10],xmm4
test byte [rbp-42],2
je freerdp_image_yuv420p_to_xrgb_column_process_complete2
skip_last_line1:
movdqa xmm6,[rbp-330]
pand xmm4,xmm6
movdqu xmm5,[rdi+r10]
pandn xmm6,xmm5
por xmm4,xmm6
; only thing is, we should shift [rbp-42] back here, because we have processed the last column,
; and this "special condition" can be released
shr byte [rbp-42],1
freerdp_image_yuv420p_to_xrgb_column_process_complete2:
movdqu [rdi+r10],xmm4
freerdp_yuv420p_to_xrgb_skip_last_line:
; after all we have to increase the destination- and Y-data pointer by four pixel
add rdi,16
add rsi,4
inc cx
cmp cx,r8w
jne freerdp_image_yuv420p_to_xrgb_wloop
freerdp_image_yuv420p_to_xrgb_wloop_end:
add rdi,r10
; after each line we have to add the scanline to the destination pointer, because
; we are processing two lines at once, but only increasing the destination pointer
; in the first line. Well, we only have one pointer, so it's the easiest way to access
; the second line with the one pointer and an offset (scanline)
; if we're not converting the full width of the scanline, like only 64 pixel, but the
; output buffer was "designed" for 1920p HD, we have to add the remaining length for each line,
; to get into the next line.
add rdi,[rbp-338]
; same thing has to be done for Y-data, but with iStride[0] instead of the target scanline
add rsi,r11
; and again for UV data, but here it's enough to add the remaining length, because
; UV data is the same for two lines and there exists only one "UV line" on two "real lines"
add rax,r12
add rbx,r12
;mov eax,r12d

View File

@ -67,14 +67,17 @@ YUV_to_RGB_asm31:
ret
;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight);
;extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline);
global freerdp_image_yuv_to_xrgb_asm
freerdp_image_yuv_to_xrgb_asm:
push rbx
push rbp
mov rbp, rsp
;cWidth: cx
sub rsp,72 ;pDstData,pSrcData[3],nWidth,nHeight,cHeight,scanline,iStride[1]
push rbx
sub rsp,82 ;pDstData -8,pSrcData[3] -32,nWidth -40,nHeight -48,cHeight -56,scanline -64,iStride[0] -72,VaddDst -80,last_column 1 -81,last_line 1 -82
;last_column: set to 10B, if last column should be skipped ('cause UV data is the same for two columns and two columns are processed at once)
;last_line: set to 10B, if last line should be skipped ('cause UV data is the same for two lines and two lines are processed at once)
mov [rbp-8],rdi
@ -86,126 +89,160 @@ freerdp_image_yuv_to_xrgb_asm:
mov rax,[rsi+16]
mov [rbp-32],rax
mov [rbp-40],rdx
and rdx,0FFFFH
;mov [rbp-40],rdx
shr rcx,1 ;/2
mov [rbp-48],rcx
shl rdx,2
mov [rbp-64],rdx
and r9,0FFFFH
mov [rbp-64],r9
shr r9d,1
sub r9d,edx
shl r9d,2
mov [rbp-80],r9
mov rax,[rbp-48]
mov [rbp-56],rax
mov [rbp-72],r8
mov rax,[rbp-40]
mov rcx,[r8]
and rcx,0FFFFH
mov [rbp-72],rcx
shl dword [rbp-72],1
sub [rbp-72],rax
sub [rbp-72],rdx
mov r9,[r8+4]
mov r8,rcx
and r9,0FFFFH
shr rax,1
sub r9,rax
mov al,dl
and al,1B
mov [rbp-81],al
inc dx
shr edx,1
mov [rbp-40],rdx
freerdp_image_yuv_to_xrgb_asm_loopH:
mov rcx,[rbp-40]
shr rcx,1
mov cx,[rbp-40]
freerdp_image_yuv_to_xrgb_asm_loopW:
mov rax,[rbp-16]
mov edi,[rax]
and edi,0xFF
mov rax,[rbp-24]
mov esi,[rax]
and esi,0xFF
mov rax,[rbp-32]
mov edx,[rax]
and edx,0xFF
call YUV_to_RGB_asm
mov rbx,[rbp-8]
mov [rbx],eax
mov rax,[rbp-16]
mov edi,[rax+r8]
inc rax
mov [rbp-16],rax
and edi,0xFF
mov rax,[rbp-24]
mov esi,[rax]
and esi,0xFF
mov rax,[rbp-32]
mov edx,[rax]
and edx,0xFF
call YUV_to_RGB_asm
mov rbx,[rbp-8]
mov rdx,[rbp-64]
mov [rbx+rdx],eax
add rbx,4
mov [rbp-8],rbx
mov rax,[rbp-16]
mov edi,[rax]
and edi,0xFF
mov rax,[rbp-24]
mov esi,[rax]
and esi,0xFF
mov rax,[rbp-32]
mov edx,[rax]
and edx,0xFF
call YUV_to_RGB_asm
mov rbx,[rbp-8]
mov [rbx],eax
mov rax,[rbp-16]
mov edi,[rax+r8]
inc rax
mov [rbp-16],rax
and edi,0xFF
mov rax,[rbp-24]
mov esi,[rax]
inc rax
mov [rbp-24],rax
and esi,0xFF
mov rax,[rbp-32]
mov edx,[rax]
inc rax
mov [rbp-32],rax
and edx,0xFF
call YUV_to_RGB_asm
mov rbx,[rbp-8]
mov rdx,[rbp-64]
mov [rbx+rdx],eax
add rbx,4
mov [rbp-8],rbx
dec cx
jne freerdp_image_yuv_to_xrgb_asm_not_last_column
shl byte [rbp-81],1
freerdp_image_yuv_to_xrgb_asm_not_last_column:
mov rax,[rbp-16]
mov edi,[rax]
and edi,0xFF
mov rax,[rbp-24]
mov esi,[rax]
and esi,0xFF
mov rax,[rbp-32]
mov edx,[rax]
and edx,0xFF
call YUV_to_RGB_asm
mov rbx,[rbp-8]
mov [rbx],eax
test byte [rbp-81],2
jne freerdp_image_yuv_to_xrgb_asm_skip_last_column
mov rax,[rbp-16]
mov edi,[rax+r8]
and edi,0xFF
mov rax,[rbp-24]
mov esi,[rax]
and esi,0xFF
mov rax,[rbp-32]
mov edx,[rax]
and edx,0xFF
call YUV_to_RGB_asm
mov rbx,[rbp-8]
mov rdx,[rbp-64]
mov [rbx+rdx],eax
freerdp_image_yuv_to_xrgb_asm_skip_last_column:
add qword [rbp-8],4
inc qword [rbp-16]
mov rax,[rbp-16]
mov edi,[rax]
and edi,0xFF
mov rax,[rbp-24]
mov esi,[rax]
and esi,0xFF
mov rax,[rbp-32]
mov edx,[rax]
and edx,0xFF
call YUV_to_RGB_asm
mov rbx,[rbp-8]
mov [rbx],eax
test byte [rbp-81],2
jne freerdp_image_yuv_to_xrgb_asm_skip_last_column2
mov rax,[rbp-16]
mov edi,[rax+r8]
and edi,0xFF
mov rax,[rbp-24]
mov esi,[rax]
and esi,0xFF
mov rax,[rbp-32]
mov edx,[rax]
and edx,0xFF
call YUV_to_RGB_asm
;shr [rbp-81],1
mov rbx,[rbp-8]
mov rdx,[rbp-64]
mov [rbx+rdx],eax
freerdp_image_yuv_to_xrgb_asm_skip_last_column2:
add qword [rbp-8],4
inc qword [rbp-16]
inc qword [rbp-24]
inc qword [rbp-32]
test cx,0FFFFH
jne freerdp_image_yuv_to_xrgb_asm_loopW
jmp END
mov rax,[rbp-8]
add rax,[rbp-64]
add rax,[rbp-80]
mov [rbp-8],rax
mov rax,[rbp-16]
@ -226,7 +263,7 @@ freerdp_image_yuv_to_xrgb_asm_loopW:
;END
mov rax,0
END:
pop rbx
mov rsp,rbp
pop rbp
pop rbx
ret

View File

@ -19,7 +19,7 @@ int main(void){
int nSrcStep[2];
#if SSSE3
if(check_ssse3()){
if(freerdp_check_ssse3()){
fprintf(stderr,"ssse3 not supported!\n");
return EXIT_FAILURE;
}
@ -30,8 +30,11 @@ int main(void){
pSrcData[0]=malloc(1984*HEIGHT*sizeof(char));
pSrcData[1]=malloc(1984*HEIGHT/4*sizeof(char));
pSrcData[2]=malloc(1984*HEIGHT/4*sizeof(char));
pDstData_asm=_aligned_malloc(WIDTH*HEIGHT*4*sizeof(char),16);
pDstData_c=malloc(WIDTH*HEIGHT*4*sizeof(char));
pDstData_asm=_aligned_malloc(WIDTH*(HEIGHT+1)*4*sizeof(char),16);
pDstData_c=malloc(WIDTH*(HEIGHT+1)*4*sizeof(char));
memset(pDstData_asm,0xFF,WIDTH*(HEIGHT+1)*4*sizeof(char));
memset(pDstData_c,0xFF,WIDTH*(HEIGHT+1)*4*sizeof(char));
for(i=0;i<WIDTH*HEIGHT;i++){
pSrcData[0][i]=i%255;
@ -44,9 +47,9 @@ int main(void){
gettimeofday(&t1,NULL);
#if SSSE3
ret=freerdp_image_yuv420p_to_xrgb(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep[0],nSrcStep[1]);
ret=freerdp_image_yuv420p_to_xrgb_ssse3(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep,WIDTH*4);
#else
ret=freerdp_image_yuv_to_xrgb_asm(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep[0],nSrcStep[1]);
ret=freerdp_image_yuv_to_xrgb_asm(pDstData_asm,pSrcData,WIDTH,HEIGHT,nSrcStep,WIDTH*4);
#endif
gettimeofday(&t2,NULL);
freerdp_image_copy_yuv420p_to_xrgb(pDstData_c,WIDTH*4,0,0,WIDTH,HEIGHT,pSrcData,nSrcStep,0,0);

View File

@ -1,7 +1,7 @@
int freerdp_image_copy_yuv420p_to_xrgb(unsigned char* pDstData, int nDstStep, int nXDst, int nYDst,
int nWidth, int nHeight, unsigned char* pSrcData[3], int nSrcStep[2], int nXSrc, int nYSrc);
extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1);
extern int freerdp_image_yuv_to_xrgb_asm(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline);
extern int check_ssse3();
extern int freerdp_image_yuv420p_to_xrgb(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int istride0,int istride1);
extern int freerdp_check_ssse3();
extern int freerdp_image_yuv420p_to_xrgb_ssse3(unsigned char *pDstData,unsigned char **pSrcData,int nWidth,int nHeight,int *istride,int scanline);