optimize idwt extrapolate decoder with neon
This commit is contained in:
parent
859b1b5f0b
commit
6e5f74e462
@ -835,6 +835,15 @@ static INLINE void progressive_rfx_dwt_2d_decode_block(INT16* buffer, INT16* tem
|
||||
nBandL + nBandH);
|
||||
}
|
||||
|
||||
void rfx_dwt_2d_extrapolate_decode(INT16* buffer, INT16* temp)
|
||||
{
|
||||
WINPR_ASSERT(buffer);
|
||||
WINPR_ASSERT(temp);
|
||||
progressive_rfx_dwt_2d_decode_block(&buffer[3807], temp, 3);
|
||||
progressive_rfx_dwt_2d_decode_block(&buffer[3007], temp, 2);
|
||||
progressive_rfx_dwt_2d_decode_block(&buffer[0], temp, 1);
|
||||
}
|
||||
|
||||
static INLINE int progressive_rfx_dwt_2d_decode(PROGRESSIVE_CONTEXT* progressive, INT16* buffer,
|
||||
INT16* current, BOOL coeffDiff, BOOL extrapolate,
|
||||
BOOL reverse)
|
||||
@ -862,9 +871,8 @@ static INLINE int progressive_rfx_dwt_2d_decode(PROGRESSIVE_CONTEXT* progressive
|
||||
}
|
||||
else
|
||||
{
|
||||
progressive_rfx_dwt_2d_decode_block(&buffer[3807], temp, 3);
|
||||
progressive_rfx_dwt_2d_decode_block(&buffer[3007], temp, 2);
|
||||
progressive_rfx_dwt_2d_decode_block(&buffer[0], temp, 1);
|
||||
WINPR_ASSERT(progressive->rfx_context->dwt_2d_extrapolate_decode);
|
||||
progressive->rfx_context->dwt_2d_extrapolate_decode(buffer, temp);
|
||||
}
|
||||
BufferPool_Return(progressive->bufferPool, temp);
|
||||
return 1;
|
||||
@ -1910,11 +1918,11 @@ static INLINE BOOL progressive_write_region(PROGRESSIVE_CONTEXT* progressive, wS
|
||||
Stream_Write_UINT8(s, 64); /* tileSize (1 byte) */
|
||||
Stream_Write_UINT16(s, msg->numRects); /* numRects (2 bytes) */
|
||||
WINPR_ASSERT(msg->numQuant <= UINT8_MAX);
|
||||
Stream_Write_UINT8(s, (UINT8)msg->numQuant); /* numQuant (1 byte) */
|
||||
Stream_Write_UINT8(s, 0); /* numProgQuant (1 byte) */
|
||||
Stream_Write_UINT8(s, 0); /* flags (1 byte) */
|
||||
Stream_Write_UINT16(s, msg->numTiles); /* numTiles (2 bytes) */
|
||||
Stream_Write_UINT32(s, tilesDataSize); /* tilesDataSize (4 bytes) */
|
||||
Stream_Write_UINT8(s, (UINT8)msg->numQuant); /* numQuant (1 byte) */
|
||||
Stream_Write_UINT8(s, 0); /* numProgQuant (1 byte) */
|
||||
Stream_Write_UINT8(s, 0); /* flags (1 byte) */
|
||||
Stream_Write_UINT16(s, msg->numTiles); /* numTiles (2 bytes) */
|
||||
Stream_Write_UINT32(s, tilesDataSize); /* tilesDataSize (4 bytes) */
|
||||
|
||||
for (UINT16 i = 0; i < msg->numRects; i++)
|
||||
{
|
||||
|
@ -332,6 +332,7 @@ RFX_CONTEXT* rfx_context_new_ex(BOOL encoder, UINT32 ThreadingFlags)
|
||||
context->quantization_decode = rfx_quantization_decode;
|
||||
context->quantization_encode = rfx_quantization_encode;
|
||||
context->dwt_2d_decode = rfx_dwt_2d_decode;
|
||||
context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode;
|
||||
context->dwt_2d_encode = rfx_dwt_2d_encode;
|
||||
context->rlgr_decode = rfx_rlgr_decode;
|
||||
context->rlgr_encode = rfx_rlgr_encode;
|
||||
|
@ -25,5 +25,6 @@
|
||||
|
||||
FREERDP_LOCAL void rfx_dwt_2d_decode(INT16* buffer, INT16* dwt_buffer);
|
||||
FREERDP_LOCAL void rfx_dwt_2d_encode(INT16* buffer, INT16* dwt_buffer);
|
||||
FREERDP_LOCAL void rfx_dwt_2d_extrapolate_decode(INT16* buffer, INT16* dwt_buffer);
|
||||
|
||||
#endif /* FREERDP_LIB_CODEC_RFX_DWT_H */
|
||||
|
@ -225,6 +225,294 @@ static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
|
||||
rfx_dwt_2d_decode_block_NEON(buffer, dwt_buffer, 32);
|
||||
}
|
||||
|
||||
static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
|
||||
const INT16* restrict pHighBand,
|
||||
size_t nHighStep, INT16* restrict pDstBand,
|
||||
size_t nDstStep, size_t nLowCount,
|
||||
size_t nHighCount, size_t nDstCount)
|
||||
{
|
||||
WINPR_ASSERT(pLowBand);
|
||||
WINPR_ASSERT(pHighBand);
|
||||
WINPR_ASSERT(pDstBand);
|
||||
|
||||
size_t n;
|
||||
INT16* l_ptr = pLowBand;
|
||||
const INT16* h_ptr = pHighBand;
|
||||
INT16* dst_ptr = pDstBand;
|
||||
size_t batchSize = (nLowCount + nHighCount) >> 1;
|
||||
|
||||
for (size_t y = 0; y < nDstCount; y++)
|
||||
{
|
||||
/* Even coefficients */
|
||||
for (n = 0; n < batchSize; n += 8)
|
||||
{
|
||||
// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
|
||||
int16x8_t l_n = vld1q_s16(l_ptr);
|
||||
int16x8_t h_n = vld1q_s16(h_ptr);
|
||||
int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
|
||||
|
||||
if (n == 0)
|
||||
{
|
||||
int16_t first = vgetq_lane_s16(h_n_m, 1);
|
||||
h_n_m = vsetq_lane_s16(first, h_n_m, 0);
|
||||
}
|
||||
else if (n == 24)
|
||||
h_n = vsetq_lane_s16(0, h_n, 7);
|
||||
|
||||
int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
|
||||
tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
|
||||
tmp_n = vshrq_n_s16(tmp_n, 1);
|
||||
int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
|
||||
vst1q_s16(l_ptr, dst_n);
|
||||
l_ptr += 8;
|
||||
h_ptr += 8;
|
||||
}
|
||||
if (n < 32)
|
||||
*l_ptr -= *(h_ptr - 1);
|
||||
|
||||
l_ptr -= batchSize;
|
||||
h_ptr -= batchSize;
|
||||
|
||||
/* Odd coefficients */
|
||||
for (n = 0; n < batchSize; n += 8)
|
||||
{
|
||||
// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
|
||||
int16x8_t h_n = vld1q_s16(h_ptr);
|
||||
h_n = vshlq_n_s16(h_n, 1);
|
||||
int16x8x2_t dst_n;
|
||||
dst_n.val[0] = vld1q_s16(l_ptr);
|
||||
int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
|
||||
|
||||
if (n == 24)
|
||||
h_n = vsetq_lane_s16(0, h_n, 7);
|
||||
|
||||
dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
|
||||
dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
|
||||
dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
|
||||
vst2q_s16(dst_ptr, dst_n);
|
||||
l_ptr += 8;
|
||||
h_ptr += 8;
|
||||
dst_ptr += 16;
|
||||
}
|
||||
if (n == 32)
|
||||
{
|
||||
h_ptr -= 1;
|
||||
l_ptr += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
*dst_ptr = *l_ptr;
|
||||
l_ptr += 1;
|
||||
dst_ptr += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
|
||||
const INT16* restrict pHighBand, size_t nHighStep,
|
||||
INT16* restrict pDstBand, size_t nDstStep,
|
||||
size_t nLowCount, size_t nHighCount,
|
||||
size_t nDstCount)
|
||||
{
|
||||
WINPR_ASSERT(pLowBand);
|
||||
WINPR_ASSERT(pHighBand);
|
||||
WINPR_ASSERT(pDstBand);
|
||||
|
||||
const INT16* l_ptr = pLowBand;
|
||||
const INT16* h_ptr = pHighBand;
|
||||
INT16* dst_ptr = pDstBand;
|
||||
size_t batchSize = (nDstCount >> 3) << 3;
|
||||
size_t forceBandSize = (nLowCount + nHighCount) >> 1;
|
||||
|
||||
/* Even coefficients */
|
||||
for (size_t n = 0; n < forceBandSize; n++)
|
||||
{
|
||||
for (size_t x = 0; x < batchSize; x += 8)
|
||||
{
|
||||
// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
|
||||
int16x8_t l_n = vld1q_s16(l_ptr);
|
||||
int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
|
||||
int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
|
||||
|
||||
if (n == 0)
|
||||
tmp_n = vaddq_s16(tmp_n, h_n);
|
||||
else if (n < 31)
|
||||
{
|
||||
int16x8_t h_n_m = vld1q_s16((h_ptr - nHighStep));
|
||||
tmp_n = vaddq_s16(tmp_n, h_n_m);
|
||||
}
|
||||
|
||||
tmp_n = vshrq_n_s16(tmp_n, 1);
|
||||
int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
|
||||
vst1q_s16(dst_ptr, dst_n);
|
||||
l_ptr += 8;
|
||||
h_ptr += 8;
|
||||
dst_ptr += 8;
|
||||
}
|
||||
|
||||
if (nDstCount > batchSize)
|
||||
{
|
||||
int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
|
||||
int16_t tmp_n = h_n + 1;
|
||||
if (n == 0)
|
||||
tmp_n += h_n;
|
||||
else if (n < 31)
|
||||
tmp_n += *(h_ptr - nHighStep);
|
||||
tmp_n >>= 1;
|
||||
*dst_ptr = *l_ptr - tmp_n;
|
||||
l_ptr += 1;
|
||||
h_ptr += 1;
|
||||
dst_ptr += 1;
|
||||
}
|
||||
|
||||
dst_ptr += nDstStep;
|
||||
}
|
||||
|
||||
if (forceBandSize < 32)
|
||||
{
|
||||
for (size_t x = 0; x < batchSize; x += 8)
|
||||
{
|
||||
int16x8_t l_n = vld1q_s16(l_ptr);
|
||||
int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
|
||||
int16x8_t tmp_n = vsubq_s16(l_n, h_n);
|
||||
vst1q_s16(dst_ptr, tmp_n);
|
||||
l_ptr += 8;
|
||||
h_ptr += 8;
|
||||
dst_ptr += 8;
|
||||
}
|
||||
|
||||
if (nDstCount > batchSize)
|
||||
{
|
||||
*dst_ptr = *l_ptr - *(h_ptr - nHighStep);
|
||||
l_ptr += 1;
|
||||
h_ptr += 1;
|
||||
dst_ptr += 1;
|
||||
}
|
||||
}
|
||||
|
||||
h_ptr = pHighBand;
|
||||
dst_ptr = pDstBand + nDstStep;
|
||||
|
||||
/* Odd coefficients */
|
||||
for (size_t n = 0; n < forceBandSize; n++)
|
||||
{
|
||||
for (size_t x = 0; x < batchSize; x += 8)
|
||||
{
|
||||
// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
|
||||
int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);
|
||||
if (n == 31)
|
||||
{
|
||||
int16x8_t dst_n_p = vld1q_s16(l_ptr);
|
||||
l_ptr += 8;
|
||||
tmp_n = vaddq_s16(tmp_n, dst_n_p);
|
||||
tmp_n = vshrq_n_s16(tmp_n, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
|
||||
tmp_n = vaddq_s16(tmp_n, dst_n_p);
|
||||
tmp_n = vshrq_n_s16(tmp_n, 1);
|
||||
int16x8_t h_n = vld1q_s16(h_ptr);
|
||||
h_n = vshlq_n_s16(h_n, 1);
|
||||
tmp_n = vaddq_s16(tmp_n, h_n);
|
||||
}
|
||||
vst1q_s16(dst_ptr, tmp_n);
|
||||
h_ptr += 8;
|
||||
dst_ptr += 8;
|
||||
}
|
||||
|
||||
if (nDstCount > batchSize)
|
||||
{
|
||||
int16_t tmp_n = *(dst_ptr - nDstStep);
|
||||
if (n == 31)
|
||||
{
|
||||
int16_t dst_n_p = *l_ptr;
|
||||
l_ptr += 1;
|
||||
tmp_n += dst_n_p;
|
||||
tmp_n >>= 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
int16_t dst_n_p = *(dst_ptr + nDstStep);
|
||||
tmp_n += dst_n_p;
|
||||
tmp_n >>= 1;
|
||||
int16_t h_n = *h_ptr;
|
||||
h_n <<= 1;
|
||||
tmp_n += h_n;
|
||||
}
|
||||
*dst_ptr = tmp_n;
|
||||
h_ptr += 1;
|
||||
dst_ptr += 1;
|
||||
}
|
||||
|
||||
dst_ptr += nDstStep;
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE size_t prfx_get_band_l_count(size_t level)
|
||||
{
|
||||
return (64 >> level) + 1;
|
||||
}
|
||||
|
||||
static INLINE size_t prfx_get_band_h_count(size_t level)
|
||||
{
|
||||
if (level == 1)
|
||||
return (64 >> 1) - 1;
|
||||
else
|
||||
return (64 + (1 << (level - 1))) >> level;
|
||||
}
|
||||
|
||||
static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
|
||||
size_t level)
|
||||
{
|
||||
size_t nDstStepX;
|
||||
size_t nDstStepY;
|
||||
INT16 *HL, *LH;
|
||||
INT16 *HH, *LL;
|
||||
INT16 *L, *H, *LLx;
|
||||
|
||||
const size_t nBandL = prfx_get_band_l_count(level);
|
||||
const size_t nBandH = prfx_get_band_h_count(level);
|
||||
size_t offset = 0;
|
||||
|
||||
WINPR_ASSERT(buffer);
|
||||
WINPR_ASSERT(temp);
|
||||
|
||||
HL = &buffer[offset];
|
||||
offset += (nBandH * nBandL);
|
||||
LH = &buffer[offset];
|
||||
offset += (nBandL * nBandH);
|
||||
HH = &buffer[offset];
|
||||
offset += (nBandH * nBandH);
|
||||
LL = &buffer[offset];
|
||||
nDstStepX = (nBandL + nBandH);
|
||||
nDstStepY = (nBandL + nBandH);
|
||||
offset = 0;
|
||||
L = &temp[offset];
|
||||
offset += (nBandL * nDstStepX);
|
||||
H = &temp[offset];
|
||||
LLx = &buffer[0];
|
||||
|
||||
/* horizontal (LL + HL -> L) */
|
||||
rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStepX, nBandL, nBandH, nBandL);
|
||||
|
||||
/* horizontal (LH + HH -> H) */
|
||||
rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStepX, nBandL, nBandH, nBandH);
|
||||
|
||||
/* vertical (L + H -> LL) */
|
||||
rfx_idwt_extrapolate_vert_neon(L, nDstStepX, H, nDstStepX, LLx, nDstStepY, nBandL, nBandH,
|
||||
nBandL + nBandH);
|
||||
}
|
||||
|
||||
static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
|
||||
{
|
||||
WINPR_ASSERT(buffer);
|
||||
WINPR_ASSERT(temp);
|
||||
rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3807], temp, 3);
|
||||
rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[3007], temp, 2);
|
||||
rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[0], temp, 1);
|
||||
}
|
||||
|
||||
void rfx_init_neon(RFX_CONTEXT* context)
|
||||
{
|
||||
if (IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
|
||||
@ -236,6 +524,7 @@ void rfx_init_neon(RFX_CONTEXT* context)
|
||||
PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");
|
||||
context->quantization_decode = rfx_quantization_decode_NEON;
|
||||
context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
|
||||
context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -168,6 +168,7 @@ struct S_RFX_CONTEXT
|
||||
void (*quantization_decode)(INT16* buffer, const UINT32* quantization_values);
|
||||
void (*quantization_encode)(INT16* buffer, const UINT32* quantization_values);
|
||||
void (*dwt_2d_decode)(INT16* buffer, INT16* dwt_buffer);
|
||||
void (*dwt_2d_extrapolate_decode)(INT16* src, INT16* temp);
|
||||
void (*dwt_2d_encode)(INT16* buffer, INT16* dwt_buffer);
|
||||
int (*rlgr_decode)(RLGR_MODE mode, const BYTE* data, UINT32 data_size, INT16* buffer,
|
||||
UINT32 buffer_size);
|
||||
|
Loading…
Reference in New Issue
Block a user