2011-11-10 15:18:07 +04:00
|
|
|
/*
|
2012-10-09 07:02:04 +04:00
|
|
|
FreeRDP: A Remote Desktop Protocol Implementation
|
2011-11-10 15:18:07 +04:00
|
|
|
RemoteFX Codec Library - NEON Optimizations
|
|
|
|
|
2013-12-04 14:37:57 +04:00
|
|
|
Copyright 2011 Martin Fleisz <martin.fleisz@thincast.com>
|
2011-11-10 15:18:07 +04:00
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
2022-02-16 13:20:38 +03:00
|
|
|
#include <freerdp/config.h>
|
2012-08-15 01:09:01 +04:00
|
|
|
|
2022-05-03 18:22:11 +03:00
|
|
|
#if defined(WITH_NEON)
|
2011-11-10 15:18:07 +04:00
|
|
|
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <arm_neon.h>
|
2013-02-27 19:04:45 +04:00
|
|
|
#include <winpr/sysinfo.h>
|
2011-11-10 15:18:07 +04:00
|
|
|
|
|
|
|
#include "rfx_types.h"
|
|
|
|
#include "rfx_neon.h"
|
|
|
|
|
2013-01-19 02:32:58 +04:00
|
|
|
/* rfx_decode_YCbCr_to_RGB_NEON code now resides in the primitives library. */
|
2011-11-10 15:18:07 +04:00
|
|
|
|
|
|
|
/**
 * Dequantize one sub-band in place by shifting every coefficient.
 *
 * @param buffer      first coefficient of the sub-band; modified in place
 * @param buffer_size number of INT16 coefficients. They are processed in
 *                    groups of 8, so this is expected to be a multiple of 8.
 * @param factor      shift count applied to every coefficient
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_quantization_decode_block_NEON(INT16* buffer, const size_t buffer_size, const UINT32 factor)
{
	/* vshlq_s16 takes a signed per-lane shift count, hence the narrowing to INT16. */
	const int16x8_t quantFactors = vdupq_n_s16((INT16)factor);
	INT16* cur = buffer;
	const INT16* end = buffer + buffer_size;

	/* Pre-checked loop: an empty sub-band must not load or store anything. */
	while (cur < end)
	{
		const int16x8_t val = vld1q_s16(cur);
		vst1q_s16(cur, vshlq_s16(val, quantFactors));
		cur += 8;
	}
}
|
|
|
|
|
2023-10-11 12:08:58 +03:00
|
|
|
/**
 * Dequantize all ten sub-bands of a coefficient buffer in place.
 *
 * @param buffer    coefficient buffer; elements 0..4095 are modified
 * @param quantVals quantization values; entries 0..9 are read and each
 *                  sub-band is shifted by (value - 1)
 */
static void rfx_quantization_decode_NEON(INT16* buffer, const UINT32* WINPR_RESTRICT quantVals)
{
	/* Sub-band layout inside buffer, listed in memory order. */
	static const struct
	{
		size_t offset;   /* index of the first coefficient */
		size_t count;    /* number of coefficients */
		size_t quantIdx; /* index into quantVals */
	} bands[] = {
		{ 0, 1024, 8 },   /* HL1 */
		{ 1024, 1024, 7 }, /* LH1 */
		{ 2048, 1024, 9 }, /* HH1 */
		{ 3072, 256, 5 },  /* HL2 */
		{ 3328, 256, 4 },  /* LH2 */
		{ 3584, 256, 6 },  /* HH2 */
		{ 3840, 64, 2 },   /* HL3 */
		{ 3904, 64, 1 },   /* LH3 */
		{ 3968, 64, 3 },   /* HH3 */
		{ 4032, 64, 0 }    /* LL3 */
	};

	WINPR_ASSERT(buffer);
	WINPR_ASSERT(quantVals);

	for (size_t i = 0; i < sizeof(bands) / sizeof(bands[0]); i++)
	{
		rfx_quantization_decode_block_NEON(&buffer[bands[i].offset], bands[i].count,
		                                   quantVals[bands[i].quantIdx] - 1);
	}
}
|
|
|
|
|
|
|
|
/**
 * Horizontal inverse DWT of one low/high sub-band pair.
 *
 * Each of the subband_width rows is processed in two passes:
 *  - even outputs  dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *    are written back over the low band row,
 *  - odd outputs   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *    are computed from them and both are stored interleaved into dst.
 * At the row edges the missing neighbour is replaced by the nearest existing
 * one (h[-1] := h[0], dst[2 * width] := dst[2 * width - 2]).
 *
 * @param l             low band, subband_width * subband_width values;
 *                      overwritten with the even output coefficients
 * @param h             high band, subband_width * subband_width values
 * @param dst           output, 2 * subband_width values per row
 * @param subband_width row length and row count of the input bands; rows are
 *                      handled in blocks of 8 coefficients
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_horiz_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                   INT16* WINPR_RESTRICT dst, size_t subband_width)
{
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	const int16x8_t one = vdupq_n_s16(1);

	for (size_t y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (size_t n = 0; n < subband_width; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			const int16x8_t l_n = vld1q_s16(l_ptr);
			const int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t h_n_m;

			if (n == 0)
			{
				/* Row start: h[-1] is not part of this row (and lies in front of
				 * the band for the first row), so do not load it. Rotate h_n by
				 * one lane and duplicate h[0] into lane 0 instead. */
				h_n_m = vextq_s16(h_n, h_n, 7);
				h_n_m = vsetq_lane_s16(vgetq_lane_s16(h_n, 0), h_n_m, 0);
			}
			else
			{
				h_n_m = vld1q_s16(h_ptr - 1);
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, one);
			tmp_n = vshrq_n_s16(tmp_n, 1);
			vst1q_s16(l_ptr, vsubq_s16(l_n, tmp_n));
			l_ptr += 8;
			h_ptr += 8;
		}

		/* Rewind to the start of the row for the second pass. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (size_t n = 0; n < subband_width; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			const int16x8_t h_n = vshlq_n_s16(vld1q_s16(h_ptr), 1);
			int16x8x2_t dst_n;
			int16x8_t dst_n_p;

			dst_n.val[0] = vld1q_s16(l_ptr);

			if (n + 8 >= subband_width)
			{
				/* Row end: the element after the row is not part of it (and lies
				 * behind the band for the last row), so do not load it. Rotate
				 * the even values by one lane and duplicate the last one. */
				dst_n_p = vextq_s16(dst_n.val[0], dst_n.val[0], 1);
				dst_n_p = vsetq_lane_s16(vgetq_lane_s16(dst_n.val[0], 7), dst_n_p, 7);
			}
			else
			{
				dst_n_p = vld1q_s16(l_ptr + 1);
			}

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* Interleave even (val[0]) and odd (val[1]) results. */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
|
|
|
|
|
|
|
|
/**
 * Vertical inverse DWT of the two intermediate bands produced by the
 * horizontal pass.
 *
 * Rows are 2 * subband_width coefficients wide. Even output rows are written
 * first, then odd output rows are derived from their even neighbours.
 *
 * @param l             low band rows (subband_width rows)
 * @param h             high band rows (subband_width rows)
 * @param dst           output, 2 * subband_width rows
 * @param subband_width number of rows in each input band
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_vert_NEON(INT16* WINPR_RESTRICT l, INT16* WINPR_RESTRICT h,
                                  INT16* WINPR_RESTRICT dst, size_t subband_width)
{
	const size_t total_width = subband_width + subband_width;
	const int16x8_t rounding = vdupq_n_s16(1);
	INT16* low = l;
	INT16* high = h;
	INT16* out = dst;

	/* Pass 1, even rows: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1) */
	for (size_t row = 0; row < subband_width; row++)
	{
		for (size_t col = 0; col < total_width; col += 8)
		{
			const int16x8_t low_cur = vld1q_s16(low);
			const int16x8_t high_cur = vld1q_s16(high);
			int16x8_t high_prev;

			/* The first row has no predecessor; the current row stands in. */
			if (row == 0)
			{
				high_prev = high_cur;
			}
			else
			{
				high_prev = vld1q_s16(high - total_width);
			}

			int16x8_t avg = vaddq_s16(vaddq_s16(high_cur, rounding), high_prev);
			avg = vshrq_n_s16(avg, 1);
			vst1q_s16(out, vsubq_s16(low_cur, avg));

			low += 8;
			high += 8;
			out += 8;
		}

		/* Skip the odd row, it is filled in by the second pass. */
		out += total_width;
	}

	high = h;
	out = dst + total_width;

	/* Pass 2, odd rows: dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1) */
	for (size_t row = 0; row < subband_width; row++)
	{
		for (size_t col = 0; col < total_width; col += 8)
		{
			const int16x8_t doubled_high = vshlq_n_s16(vld1q_s16(high), 1);
			const int16x8_t above = vld1q_s16(out - total_width);
			int16x8_t below;

			/* The last odd row has no even row below it; reuse the one above. */
			if (row == subband_width - 1)
			{
				below = above;
			}
			else
			{
				below = vld1q_s16(out + total_width);
			}

			const int16x8_t avg = vshrq_n_s16(vaddq_s16(above, below), 1);
			vst1q_s16(out, vaddq_s16(avg, doubled_high));

			high += 8;
			out += 8;
		}

		/* Skip the even row that was written by the first pass. */
		out += total_width;
	}
}
|
|
|
|
|
|
|
|
/**
 * Inverse 2D DWT of one decomposition level.
 *
 * buffer holds four sub-bands of subband_width * subband_width coefficients
 * each, stored in HL(0), LH(1), HH(2), LL(3) order.
 *
 * @param buffer        input sub-bands; receives the reconstructed block
 * @param idwt          scratch buffer for the horizontal pass results
 * @param subband_width width and height of one sub-band
 */
static __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
rfx_dwt_2d_decode_block_NEON(INT16* WINPR_RESTRICT buffer, INT16* WINPR_RESTRICT idwt,
                             size_t subband_width)
{
	const size_t band_len = subband_width * subband_width;

	INT16* const hl = buffer;
	INT16* const lh = buffer + band_len;
	INT16* const hh = buffer + band_len * 2;
	INT16* const ll = buffer + band_len * 3;

	/* The horizontal pass leaves two bands in idwt, L first, then H. */
	INT16* const l_dst = idwt;
	INT16* const h_dst = idwt + band_len * 2;

	/* The lower part L uses LL(3) and HL(0). */
	rfx_dwt_2d_decode_block_horiz_NEON(ll, hl, l_dst, subband_width);

	/* The higher part H uses LH(1) and HH(2). */
	rfx_dwt_2d_decode_block_horiz_NEON(lh, hh, h_dst, subband_width);

	/* Inverse DWT in vertical direction, results are stored in original buffer. */
	rfx_dwt_2d_decode_block_vert_NEON(l_dst, h_dst, buffer, subband_width);
}
|
|
|
|
|
2023-10-11 14:56:23 +03:00
|
|
|
/**
 * Run the inverse 2D DWT over three levels, smallest sub-band width first.
 *
 * @param buffer     coefficient buffer, reconstructed in place
 * @param dwt_buffer scratch buffer shared by all levels
 */
static void rfx_dwt_2d_decode_NEON(INT16* buffer, INT16* dwt_buffer)
{
	/* Start offset inside buffer and sub-band width, in processing order. */
	static const struct
	{
		size_t offset;
		size_t width;
	} levels[] = { { 3840, 8 }, { 3072, 16 }, { 0, 32 } };

	for (size_t i = 0; i < sizeof(levels) / sizeof(levels[0]); i++)
	{
		rfx_dwt_2d_decode_block_NEON(buffer + levels[i].offset, dwt_buffer, levels[i].width);
	}
}
|
|
|
|
|
2023-08-14 09:37:11 +03:00
|
|
|
/**
 * Horizontal inverse DWT for the "extrapolate" band layout.
 *
 * Processes nDstCount rows. Per row, (nLowCount + nHighCount) / 2 coefficients
 * are handled in vector blocks of 8:
 *  - even outputs  dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *    are written back over the low band,
 *  - odd outputs   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *    are stored interleaved with the even ones into the destination.
 * The constants 24 and 32 special-case rows with a batch size of 32: lane 7 of
 * the block starting at 24 (the 32nd high coefficient) is treated as zero.
 * NOTE(review): this appears to match level 1, where the callers pass 33 low
 * and 31 high coefficients per row - confirm before changing band sizes.
 *
 * @param pLowBand   low band; overwritten with the even output coefficients
 * @param nLowStep   unused
 * @param pHighBand  high band (read only)
 * @param nHighStep  unused
 * @param pDstBand   destination rows, written consecutively
 * @param nDstStep   unused
 * @param nLowCount  low coefficients per row
 * @param nHighCount high coefficients per row
 * @param nDstCount  number of rows to process
 */
static INLINE void rfx_idwt_extrapolate_horiz_neon(INT16* restrict pLowBand, size_t nLowStep,
                                                   const INT16* restrict pHighBand,
                                                   size_t nHighStep, INT16* restrict pDstBand,
                                                   size_t nDstStep, size_t nLowCount,
                                                   size_t nHighCount, size_t nDstCount)
{
	WINPR_ASSERT(pLowBand);
	WINPR_ASSERT(pHighBand);
	WINPR_ASSERT(pDstBand);

	/* The bands are walked with running pointers; the steps are not needed. */
	(void)nLowStep;
	(void)nHighStep;
	(void)nDstStep;

	INT16* l_ptr = pLowBand;
	const INT16* h_ptr = pHighBand;
	INT16* dst_ptr = pDstBand;
	const size_t batchSize = (nLowCount + nHighCount) >> 1;
	const int16x8_t one = vdupq_n_s16(1);

	for (size_t y = 0; y < nDstCount; y++)
	{
		/* Even coefficients */
		size_t n = 0;
		for (; n < batchSize; n += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			const int16x8_t l_n = vld1q_s16(l_ptr);
			int16x8_t h_n = vld1q_s16(h_ptr);
			int16x8_t h_n_m;

			if (n == 0)
			{
				/* Row start: h[-1] is replaced by h[0]. Build the shifted vector
				 * from h_n instead of loading from h_ptr - 1, which lies in
				 * front of the high band for the first row. */
				h_n_m = vextq_s16(h_n, h_n, 7);
				h_n_m = vsetq_lane_s16(vgetq_lane_s16(h_n, 0), h_n_m, 0);
			}
			else
			{
				h_n_m = vld1q_s16(h_ptr - 1);

				/* Last block of a 32 wide batch: the 32nd high value counts as 0. */
				if (n == 24)
				{
					h_n = vsetq_lane_s16(0, h_n, 7);
				}
			}

			int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
			tmp_n = vaddq_s16(tmp_n, one);
			tmp_n = vshrq_n_s16(tmp_n, 1);
			vst1q_s16(l_ptr, vsubq_s16(l_n, tmp_n));
			l_ptr += 8;
			h_ptr += 8;
		}

		/* Batches shorter than 32 have one more low coefficient after the
		 * vector blocks; it is reduced by the last high coefficient. */
		if (n < 32)
		{
			*l_ptr -= *(h_ptr - 1);
		}

		/* Rewind to the start of the row for the second pass. */
		l_ptr -= batchSize;
		h_ptr -= batchSize;

		/* Odd coefficients */
		n = 0;
		for (; n < batchSize; n += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t h_n = vld1q_s16(h_ptr);
			h_n = vshlq_n_s16(h_n, 1);

			int16x8x2_t dst_n;
			dst_n.val[0] = vld1q_s16(l_ptr);
			const int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);

			if (n == 24)
			{
				h_n = vsetq_lane_s16(0, h_n, 7);
			}

			dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
			dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
			dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);
			/* Interleave even (val[0]) and odd (val[1]) results. */
			vst2q_s16(dst_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}

		if (n == 32)
		{
			/* 32 wide batch: step back over the high value that was zeroed
			 * and skip the extra low coefficient of this row. */
			h_ptr -= 1;
			l_ptr += 1;
		}
		else
		{
			/* Shorter batch: the trailing low coefficient is copied through. */
			*dst_ptr = *l_ptr;
			l_ptr += 1;
			dst_ptr += 1;
		}
	}
}
|
|
|
|
|
|
|
|
/**
 * Vertical inverse DWT for the "extrapolate" band layout.
 *
 * Rows are nDstCount coefficients wide. Each row is handled with vector blocks
 * of 8 plus one scalar coefficient when nDstCount is not a multiple of 8.
 * Even output rows are written first (every second destination row), odd
 * output rows are then derived from their even neighbours.
 * Row index 31 is special-cased: in the even pass only the previous high row
 * contributes, in the odd pass the next row of the low band is used as the
 * lower neighbour and no high row is added.
 * NOTE(review): this appears to match level 1, where the callers pass 33 low
 * and 31 high rows - confirm before changing band sizes.
 *
 * @param pLowBand   low band rows (read only)
 * @param nLowStep   unused
 * @param pHighBand  high band rows (read only)
 * @param nHighStep  distance between two high band rows, in coefficients
 * @param pDstBand   destination
 * @param nDstStep   distance between two destination rows, in coefficients
 * @param nLowCount  number of low band rows
 * @param nHighCount number of high band rows
 * @param nDstCount  coefficients per row
 */
static INLINE void rfx_idwt_extrapolate_vert_neon(const INT16* restrict pLowBand, size_t nLowStep,
                                                  const INT16* restrict pHighBand, size_t nHighStep,
                                                  INT16* restrict pDstBand, size_t nDstStep,
                                                  size_t nLowCount, size_t nHighCount,
                                                  size_t nDstCount)
{
	WINPR_ASSERT(pLowBand);
	WINPR_ASSERT(pHighBand);
	WINPR_ASSERT(pDstBand);

	/* The low band is walked with a running pointer; its step is not needed. */
	(void)nLowStep;

	const INT16* l_ptr = pLowBand;
	const INT16* h_ptr = pHighBand;
	INT16* dst_ptr = pDstBand;
	const size_t batchSize = (nDstCount >> 3) << 3;
	const size_t forceBandSize = (nLowCount + nHighCount) >> 1;

	/* Even coefficients */
	for (size_t n = 0; n < forceBandSize; n++)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			// dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
			const int16x8_t l_n = vld1q_s16(l_ptr);
			/* Row 31: only the previous high row is used. */
			const int16x8_t h_n = vld1q_s16((n == 31) ? (h_ptr - nHighStep) : h_ptr);
			int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));

			if (n == 0)
			{
				/* No previous row: the current one counts twice. */
				tmp_n = vaddq_s16(tmp_n, h_n);
			}
			else if (n < 31)
			{
				const int16x8_t h_n_m = vld1q_s16(h_ptr - nHighStep);
				tmp_n = vaddq_s16(tmp_n, h_n_m);
			}

			tmp_n = vshrq_n_s16(tmp_n, 1);
			vst1q_s16(dst_ptr, vsubq_s16(l_n, tmp_n));
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Scalar version of the block above for the one left-over column. */
		if (nDstCount > batchSize)
		{
			int16_t h_n = (n == 31) ? *(h_ptr - nHighStep) : *h_ptr;
			int16_t tmp_n = h_n + 1;

			if (n == 0)
			{
				tmp_n += h_n;
			}
			else if (n < 31)
			{
				tmp_n += *(h_ptr - nHighStep);
			}

			tmp_n >>= 1;
			*dst_ptr = *l_ptr - tmp_n;
			l_ptr += 1;
			h_ptr += 1;
			dst_ptr += 1;
		}

		/* Skip the odd row, it is filled in by the second pass. */
		dst_ptr += nDstStep;
	}

	/* Fewer than 32 rows handled so far: one more even row is produced from
	 * the next low row minus the last high row. */
	if (forceBandSize < 32)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			const int16x8_t l_n = vld1q_s16(l_ptr);
			const int16x8_t h_n = vld1q_s16(h_ptr - nHighStep);
			vst1q_s16(dst_ptr, vsubq_s16(l_n, h_n));
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 8;
		}

		if (nDstCount > batchSize)
		{
			*dst_ptr = *l_ptr - *(h_ptr - nHighStep);
			l_ptr += 1;
			h_ptr += 1;
			dst_ptr += 1;
		}
	}

	h_ptr = pHighBand;
	dst_ptr = pDstBand + nDstStep;

	/* Odd coefficients */
	for (size_t n = 0; n < forceBandSize; n++)
	{
		for (size_t x = 0; x < batchSize; x += 8)
		{
			// dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
			int16x8_t tmp_n = vld1q_s16(dst_ptr - nDstStep);

			if (n == 31)
			{
				/* The lower neighbour comes from the low band row that the
				 * even pass did not consume; no high row is added. */
				const int16x8_t dst_n_p = vld1q_s16(l_ptr);
				l_ptr += 8;
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
				tmp_n = vshrq_n_s16(tmp_n, 1);
			}
			else
			{
				const int16x8_t dst_n_p = vld1q_s16(dst_ptr + nDstStep);
				tmp_n = vaddq_s16(tmp_n, dst_n_p);
				tmp_n = vshrq_n_s16(tmp_n, 1);
				int16x8_t h_n = vld1q_s16(h_ptr);
				h_n = vshlq_n_s16(h_n, 1);
				tmp_n = vaddq_s16(tmp_n, h_n);
			}

			vst1q_s16(dst_ptr, tmp_n);
			h_ptr += 8;
			dst_ptr += 8;
		}

		/* Scalar version of the block above for the one left-over column. */
		if (nDstCount > batchSize)
		{
			int16_t tmp_n = *(dst_ptr - nDstStep);

			if (n == 31)
			{
				const int16_t dst_n_p = *l_ptr;
				l_ptr += 1;
				tmp_n += dst_n_p;
				tmp_n >>= 1;
			}
			else
			{
				const int16_t dst_n_p = *(dst_ptr + nDstStep);
				tmp_n += dst_n_p;
				tmp_n >>= 1;
				/* Doubling by multiplication: left-shifting a negative value
				 * is undefined in C, the multiplication is not and yields the
				 * same 16 bit result as the vector path. */
				const int16_t h_n = (int16_t)(*h_ptr * 2);
				tmp_n += h_n;
			}

			*dst_ptr = tmp_n;
			h_ptr += 1;
			dst_ptr += 1;
		}

		/* Skip the even row that was written by the first pass. */
		dst_ptr += nDstStep;
	}
}
|
|
|
|
|
|
|
|
/**
 * Number of low band coefficients per row/column at a decomposition level.
 *
 * @param level decomposition level
 * @return (64 >> level) + 1
 */
static INLINE size_t prfx_get_band_l_count(size_t level)
{
	const size_t count = (64 >> level) + 1;
	return count;
}
|
|
|
|
|
|
|
|
/**
 * Number of high band coefficients per row/column at a decomposition level.
 *
 * @param level decomposition level
 * @return 31 for level 1, otherwise (64 + 2^(level - 1)) >> level
 */
static INLINE size_t prfx_get_band_h_count(size_t level)
{
	if (level != 1)
	{
		return (64 + (1 << (level - 1))) >> level;
	}

	/* Level 1 does not follow the general formula. */
	return (64 >> 1) - 1;
}
|
|
|
|
|
2023-10-11 14:56:23 +03:00
|
|
|
/**
 * Inverse 2D DWT of one level in the "extrapolate" band layout.
 *
 * buffer holds the sub-bands in HL, LH, HH, LL order; their sizes follow from
 * the low/high coefficient counts of the level. Two horizontal passes write
 * the intermediate L and H bands into temp, the vertical pass writes the
 * result back to the start of buffer.
 *
 * @param buffer sub-bands of this level; receives the reconstructed block
 * @param temp   scratch buffer for the intermediate L and H bands
 * @param level  decomposition level
 */
static INLINE void rfx_dwt_2d_decode_extrapolate_block_neon(INT16* buffer, INT16* temp,
                                                            size_t level)
{
	WINPR_ASSERT(buffer);
	WINPR_ASSERT(temp);

	const size_t nBandL = prfx_get_band_l_count(level);
	const size_t nBandH = prfx_get_band_h_count(level);
	/* Horizontal and vertical destination strides are the same. */
	const size_t nDstStep = nBandL + nBandH;

	/* Input sub-bands, back to back inside buffer. */
	INT16* const HL = buffer;
	INT16* const LH = HL + (nBandH * nBandL);
	INT16* const HH = LH + (nBandL * nBandH);
	INT16* const LL = HH + (nBandH * nBandH);

	/* Intermediate bands inside temp. */
	INT16* const L = temp;
	INT16* const H = temp + (nBandL * nDstStep);

	/* horizontal (LL + HL -> L) */
	rfx_idwt_extrapolate_horiz_neon(LL, nBandL, HL, nBandH, L, nDstStep, nBandL, nBandH, nBandL);

	/* horizontal (LH + HH -> H) */
	rfx_idwt_extrapolate_horiz_neon(LH, nBandL, HH, nBandH, H, nDstStep, nBandL, nBandH, nBandH);

	/* vertical (L + H -> LL) */
	rfx_idwt_extrapolate_vert_neon(L, nDstStep, H, nDstStep, buffer, nDstStep, nBandL, nBandH,
	                               nBandL + nBandH);
}
|
|
|
|
|
2023-10-11 14:56:23 +03:00
|
|
|
/**
 * Run the "extrapolate" inverse 2D DWT for levels 3, 2 and 1, in that order.
 *
 * @param buffer coefficient buffer, reconstructed in place
 * @param temp   scratch buffer shared by all levels
 */
static void rfx_dwt_2d_extrapolate_decode_neon(INT16* buffer, INT16* temp)
{
	/* Start of each level's sub-bands inside buffer, indexed by level - 1. */
	static const size_t level_offset[] = { 0, 3007, 3807 };

	WINPR_ASSERT(buffer);
	WINPR_ASSERT(temp);

	for (size_t level = 3; level >= 1; level--)
	{
		rfx_dwt_2d_decode_extrapolate_block_neon(&buffer[level_offset[level - 1]], temp, level);
	}
}
|
|
|
|
|
2018-11-15 19:52:43 +03:00
|
|
|
/**
 * Install the NEON implementations into an RFX context.
 *
 * Nothing is changed when the processor does not report NEON support.
 *
 * @param context RFX context whose decode callbacks are replaced
 */
void rfx_init_neon(RFX_CONTEXT* context)
{
	if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
	{
		return;
	}

	DEBUG_RFX("Using NEON optimizations");

	/* Give the profiler entries the names of the NEON variants. */
	PROFILER_RENAME(context->priv->prof_rfx_ycbcr_to_rgb, "rfx_decode_YCbCr_to_RGB_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_quantization_decode,
	                "rfx_quantization_decode_NEON");
	PROFILER_RENAME(context->priv->prof_rfx_dwt_2d_decode, "rfx_dwt_2d_decode_NEON");

	context->quantization_decode = rfx_quantization_decode_NEON;
	context->dwt_2d_decode = rfx_dwt_2d_decode_NEON;
	context->dwt_2d_extrapolate_decode = rfx_dwt_2d_extrapolate_decode_neon;
}
|
|
|
|
|
2022-05-03 18:22:11 +03:00
|
|
|
#endif // WITH_NEON
|