* Imported optimized ppc and sparc architecture code from ffmpeg 0.5.
* Added Jamfiles to build libavcodec_ppc.a and libavcodec_sparc.a. UNTESTED.

git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@30187 a95241bf-73f2-0310-859d-f6bbb57e9c96
Parent: 464e95c43f
Commit: 8a3c8a66b3

src/add-ons/media/plugins/avcodec/libavcodec/ppc/Jamfile
@@ -0,0 +1,62 @@
SubDir HAIKU_TOP src add-ons media plugins avcodec libavcodec ppc ;

SubDirHdrs [ FDirName $(SUBDIR) .. ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../.. ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../../libavutil ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../../libswscale ] ;

# filter warnings we don't want here
TARGET_WARNING_CCFLAGS = [ FFilter $(TARGET_WARNING_CCFLAGS)
    : -Wall -Wmissing-prototypes -Wsign-compare -Wpointer-arith ] ;

if $(HAIKU_GCC_VERSION[1]) >= 3 {
    SubDirCcFlags -fomit-frame-pointer -fno-pic ;
} else {
    SubDirCcFlags -fomit-frame-pointer -DPIC ;
}

local defines ;
defines = HAVE_AV_CONFIG_H=1 ;

if $(TARGET_ARCH) = x86 {
    defines += ARCH_X86=1 ARCH_X86_32=1 ARCH_PPC=0 ARCH_SPARC=0 ;
    defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
    defines += HAVE_MMX=1 HAVE_MMX2=1 HAVE_SSE=0 HAVE_SSE3=1 ;
    defines += HAVE_ALTIVEC=0 ;
    defines += HAVE_VIS=0 ;
} else if $(TARGET_ARCH) = ppc {
    defines += ARCH_X86=0 ARCH_X86_32=0 ARCH_PPC=1 ARCH_SPARC=0 ;
    defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
    defines += HAVE_MMX=0 HAVE_MMX2=0 HAVE_SSE=0 HAVE_SSE3=0 ;
    defines += HAVE_ALTIVEC=1 ;
    defines += HAVE_VIS=0 ;
} else if $(TARGET_ARCH) = sparc {
    defines += ARCH_X86=0 ARCH_X86_32=0 ARCH_PPC=0 ARCH_SPARC=1 ;
    defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
    defines += HAVE_MMX=0 HAVE_MMX2=0 HAVE_SSE=0 HAVE_SSE3=0 ;
    defines += HAVE_ALTIVEC=0 ;
    defines += HAVE_VIS=1 ;
}

defines = [ FDefines $(defines) ] ;
SubDirCcFlags $(defines) ;
SubDirC++Flags $(defines) ;

StaticLibrary libavcodec_ppc.a :
    check_altivec.c
    float_altivec.c
    int_altivec.c
    dsputil_altivec.c
    dsputil_ppc.c
    fdct_altivec.c
    fft_altivec.c
    idct_altivec.c
    gmc_altivec.c
    imgresample_altivec.c
    h264_altivec.c
#    h264_template_altivec.c
    mpegvideo_altivec.c
    vc1dsp_altivec.c
    snow_altivec.c
;

src/add-ons/media/plugins/avcodec/libavcodec/ppc/check_altivec.c
@@ -0,0 +1,84 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


/**
 * @file libavcodec/ppc/check_altivec.c
 * Checks for AltiVec presence.
 */

#ifdef __APPLE__
#undef _POSIX_C_SOURCE
#include <sys/sysctl.h>
#elif defined(__OpenBSD__)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#elif defined(__AMIGAOS4__)
#include <exec/exec.h>
#include <interfaces/exec.h>
#include <proto/exec.h>
#endif /* __APPLE__ */

/**
 * This function MAY rely on signal() or fork() in order to make sure AltiVec
 * is present.
 */

int has_altivec(void)
{
#ifdef __AMIGAOS4__
    ULONG result = 0;
    extern struct ExecIFace *IExec;

    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
    if (result == VECTORTYPE_ALTIVEC) return 1;
    return 0;
#elif defined(__APPLE__) || defined(__OpenBSD__)
#ifdef __OpenBSD__
    int sels[2] = {CTL_MACHDEP, CPU_ALTIVEC};
#else
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
#endif
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return has_vu != 0;
    return 0;
#elif defined(RUNTIME_CPUDETECT)
    int proc_ver;
    // Support of mfspr PVR emulation added in Linux 2.6.17.
    __asm__ volatile("mfspr %0, 287" : "=r" (proc_ver));
    proc_ver >>= 16;
    if (proc_ver  & 0x8000 ||
        proc_ver == 0x000c ||
        proc_ver == 0x0039 || proc_ver == 0x003c ||
        proc_ver == 0x0044 || proc_ver == 0x0045 ||
        proc_ver == 0x0070)
        return 1;
    return 0;
#else
    // Since we were compiled for AltiVec, just assume we have it
    // until someone comes up with a proper way (not involving signal hacks).
    return 1;
#endif /* __AMIGAOS4__ */
}
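
A minimal caller-side sketch of how this probe drives runtime dispatch; only has_altivec() above comes from the diff, the function-pointer pair and pick_clear_blocks() are hypothetical illustrations:

/* Hypothetical dispatch sketch -- only has_altivec() is real. */
#include "dsputil_altivec.h"

typedef void (*clear_fn)(short *blocks);

extern void clear_blocks_c(short *blocks);        /* portable fallback (assumed) */
extern void clear_blocks_altivec(short *blocks);  /* SIMD path (assumed) */

static clear_fn pick_clear_blocks(void)
{
    /* Probe once at init time; has_altivec() may use sysctl()/fork(). */
    return has_altivec() ? clear_blocks_altivec : clear_blocks_c;
}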

(File diff suppressed because it is too large.)

src/add-ons/media/plugins/avcodec/libavcodec/ppc/dsputil_altivec.h
@@ -0,0 +1,34 @@
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H
#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H

#include <stdint.h>

int has_altivec(void);

void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);

void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);

#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */

src/add-ons/media/plugins/avcodec/libavcodec/ppc/dsputil_ppc.c
@@ -0,0 +1,306 @@
/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"

#include "dsputil_ppc.h"

#include "dsputil_altivec.h"

void fdct_altivec(int16_t *block);
void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
                  int x16, int y16, int rounder);
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);

void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx);
void float_init_altivec(DSPContext* c, AVCodecContext *avctx);
void int_init_altivec(DSPContext* c, AVCodecContext *avctx);

int mm_flags = 0;

int mm_support(void)
{
    int result = 0;
#if HAVE_ALTIVEC
    if (has_altivec()) {
        result |= FF_MM_ALTIVEC;
    }
#endif /* result */
    return result;
}

#if CONFIG_POWERPC_PERF
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* list below must match enum in dsputil_ppc.h */
static unsigned char* perfname[] = {
    "ff_fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "hadamard8_diff8x8_altivec",
    "hadamard8_diff16_altivec",
    "avg_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc",
    "put_h264_chroma_mc8_altivec",
    "avg_h264_chroma_mc8_altivec",
    "put_h264_qpel16_h_lowpass_altivec",
    "avg_h264_qpel16_h_lowpass_altivec",
    "put_h264_qpel16_v_lowpass_altivec",
    "avg_h264_qpel16_v_lowpass_altivec",
    "put_h264_qpel16_hv_lowpass_altivec",
    "avg_h264_qpel16_hv_lowpass_altivec",
    ""
};
#include <stdio.h>
#endif

#if CONFIG_POWERPC_PERF
void powerpc_display_perf_report(void)
{
    int i, j;
    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
    for(i = 0 ; i < powerpc_perf_total ; i++) {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
            if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0)
                av_log(NULL, AV_LOG_INFO,
                       " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n",
                       perfname[i],
                       j+1,
                       perfdata[j][i][powerpc_data_min],
                       perfdata[j][i][powerpc_data_max],
                       (double)perfdata[j][i][powerpc_data_sum] /
                       (double)perfdata[j][i][powerpc_data_num],
                       perfdata[j][i][powerpc_data_num]);
        }
    }
}
#endif /* CONFIG_POWERPC_PERF */

/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
cache line size not equal to 32 bytes.
Fortunately all processor used by Apple up to at least the 7450 (aka second
generation G4) use 32 bytes cache line.
This is due to the use of the 'dcbz' instruction. It simply clear to zero a
single cache line, so you need to know the cache line size to use it !
It's absurd, but it's fast...

update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line
size: 128 bytes. Oups.
The semantic of dcbz was changed, it always clear 32 bytes. so the function
below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
which is defined to clear a cache line (as dcbz before). So we still can
distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.

see <http://developer.apple.com/technotes/tn/tn2087.html>
and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
    POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
    POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
#if 1
    if (misal) {
        ((unsigned long*)blocks)[0] = 0L;
        ((unsigned long*)blocks)[1] = 0L;
        ((unsigned long*)blocks)[2] = 0L;
        ((unsigned long*)blocks)[3] = 0L;
        i += 16;
    }
    for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
        __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }
    if (misal) {
        ((unsigned long*)blocks)[188] = 0L;
        ((unsigned long*)blocks)[189] = 0L;
        ((unsigned long*)blocks)[190] = 0L;
        ((unsigned long*)blocks)[191] = 0L;
        i += 16;
    }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
    POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}

/* same as above, when dcbzl clear a whole 128B cache line
   i.e. the PPC970 aka G5 */
#if HAVE_DCBZL
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
    POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
    register int misal = ((unsigned long)blocks & 0x0000007f);
    register int i = 0;
    POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
#if 1
    if (misal) {
        // we could probably also optimize this case,
        // but there's not much point as the machines
        // aren't available yet (2003-06-26)
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
    }
    else
        for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
            __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
        }
#else
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
#endif
    POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
}
#else
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
#endif
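
As a quick sanity check on the loop bounds above (plain arithmetic, not part of the diff; assumes DCTELEM is a 16-bit type, as in this tree): 6 x 64 elements x 2 bytes = 768 bytes, i.e. 24 dcbz stores on 32-byte lines or 6 dcbzl stores on 128-byte lines. A standalone sketch:

/* Standalone sketch verifying the sizes used by the clear_blocks loops.
 * Assumption: DCTELEM == int16_t. */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void)
{
    size_t total = sizeof(int16_t) * 6 * 64;       /* 768 bytes of DCT blocks */
    assert(total % 32 == 0 && total / 32 == 24);   /* dcbz,  32-byte lines  */
    assert(total % 128 == 0 && total / 128 == 6);  /* dcbzl, 128-byte lines */
    return 0;
}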

#if HAVE_DCBZL
/* check dcbz report how many bytes are set to 0 by dcbz */
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
long check_dcbzl_effect(void)
{
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0;
    register long i = 0;
    long count = 0;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = (fakedata + 512);

    memset(fakedata, 0xFF, 1024);

    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));

    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);

    return count;
}
#else
long check_dcbzl_effect(void)
{
    return 0;
}
#endif

static void prefetch_ppc(void *mem, int stride, int h)
{
    register const uint8_t *p = mem;
    do {
        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
        p+= stride;
    } while(--h);
}

void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    // Common optimizations whether AltiVec is available or not
    c->prefetch = prefetch_ppc;
    switch (check_dcbzl_effect()) {
    case 32:
        c->clear_blocks = clear_blocks_dcbz32_ppc;
        break;
    case 128:
        c->clear_blocks = clear_blocks_dcbz128_ppc;
        break;
    default:
        break;
    }

#if HAVE_ALTIVEC
    if(CONFIG_H264_DECODER) dsputil_h264_init_ppc(c, avctx);

    if (has_altivec()) {
        mm_flags |= FF_MM_ALTIVEC;

        dsputil_init_altivec(c, avctx);
        if(CONFIG_SNOW_DECODER) snow_init_altivec(c, avctx);
        if(CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER)
            vc1dsp_init_altivec(c, avctx);
        float_init_altivec(c, avctx);
        int_init_altivec(c, avctx);
        c->gmc1 = gmc1_altivec;

#if CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC) {
            c->fdct = fdct_altivec;
        }
#endif //CONFIG_ENCODERS

        if (avctx->lowres==0) {
            if ((avctx->idct_algo == FF_IDCT_AUTO) ||
                (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
                c->idct_put = idct_put_altivec;
                c->idct_add = idct_add_altivec;
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
            }
        }

#if CONFIG_POWERPC_PERF
        {
            int i, j;
            for (i = 0 ; i < powerpc_perf_total ; i++) {
                for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
                    perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
                    perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
                    perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
                }
            }
        }
#endif /* CONFIG_POWERPC_PERF */
    }
#endif /* HAVE_ALTIVEC */
}
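
The mm_flags/mm_support() pair mirrors the x86 dsputil convention; a caller-side sketch of how the exported flag is consumed (the wrapper function is hypothetical, the flag and functions are real):

/* Hypothetical init-time check using the flags exported above. */
#include "libavcodec/dsputil.h"

void report_simd(void)
{
    if (mm_support() & FF_MM_ALTIVEC)
        av_log(NULL, AV_LOG_INFO, "AltiVec available\n");
}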

src/add-ons/media/plugins/avcodec/libavcodec/ppc/dsputil_ppc.h
@@ -0,0 +1,154 @@
/*
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_PPC_DSPUTIL_PPC_H
#define AVCODEC_PPC_DSPUTIL_PPC_H

#include "config.h"

#if CONFIG_POWERPC_PERF
void powerpc_display_perf_report(void);
/* the 604* have 2, the G3* have 4, the G4s have 6,
   and the G5 are completely different (they MUST use
   HAVE_PPC64, and let's hope all future 64 bis PPC
   will use the same PMCs... */
#define POWERPC_NUM_PMC_ENABLED 6
/* if you add to the enum below, also add to the perfname array
   in dsputil_ppc.c */
enum powerpc_perf_index {
    altivec_fft_num = 0,
    altivec_gmc1_num,
    altivec_dct_unquantize_h263_num,
    altivec_fdct,
    altivec_idct_add_num,
    altivec_idct_put_num,
    altivec_put_pixels16_num,
    altivec_avg_pixels16_num,
    altivec_avg_pixels8_num,
    altivec_put_pixels8_xy2_num,
    altivec_put_no_rnd_pixels8_xy2_num,
    altivec_put_pixels16_xy2_num,
    altivec_put_no_rnd_pixels16_xy2_num,
    altivec_hadamard8_diff8x8_num,
    altivec_hadamard8_diff16_num,
    altivec_avg_pixels8_xy2_num,
    powerpc_clear_blocks_dcbz32,
    powerpc_clear_blocks_dcbz128,
    altivec_put_h264_chroma_mc8_num,
    altivec_avg_h264_chroma_mc8_num,
    altivec_put_h264_qpel16_h_lowpass_num,
    altivec_avg_h264_qpel16_h_lowpass_num,
    altivec_put_h264_qpel16_v_lowpass_num,
    altivec_avg_h264_qpel16_v_lowpass_num,
    altivec_put_h264_qpel16_hv_lowpass_num,
    altivec_avg_h264_qpel16_hv_lowpass_num,
    powerpc_perf_total
};
enum powerpc_data_index {
    powerpc_data_min = 0,
    powerpc_data_max,
    powerpc_data_sum,
    powerpc_data_num,
    powerpc_data_total
};
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];

#if !HAVE_PPC64
#define POWERP_PMC_DATATYPE unsigned long
#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a))
#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a))
#if (POWERPC_NUM_PMC_ENABLED > 2)
#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a))
#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a))
#else
#define POWERPC_GET_PMC3(a) do {} while (0)
#define POWERPC_GET_PMC4(a) do {} while (0)
#endif
#if (POWERPC_NUM_PMC_ENABLED > 4)
#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a))
#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a))
#else
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
#else /* HAVE_PPC64 */
#define POWERP_PMC_DATATYPE unsigned long long
#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a))
#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a))
#if (POWERPC_NUM_PMC_ENABLED > 2)
#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a))
#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a))
#else
#define POWERPC_GET_PMC3(a) do {} while (0)
#define POWERPC_GET_PMC4(a) do {} while (0)
#endif
#if (POWERPC_NUM_PMC_ENABLED > 4)
#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a))
#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a))
#else
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
#endif /* HAVE_PPC64 */
#define POWERPC_PERF_DECLARE(a, cond)           \
    POWERP_PMC_DATATYPE                         \
        pmc_start[POWERPC_NUM_PMC_ENABLED],     \
        pmc_stop[POWERPC_NUM_PMC_ENABLED],      \
        pmc_loop_index;
#define POWERPC_PERF_START_COUNT(a, cond) do {  \
    POWERPC_GET_PMC6(pmc_start[5]);             \
    POWERPC_GET_PMC5(pmc_start[4]);             \
    POWERPC_GET_PMC4(pmc_start[3]);             \
    POWERPC_GET_PMC3(pmc_start[2]);             \
    POWERPC_GET_PMC2(pmc_start[1]);             \
    POWERPC_GET_PMC1(pmc_start[0]);             \
    } while (0)
#define POWERPC_PERF_STOP_COUNT(a, cond) do {   \
    POWERPC_GET_PMC1(pmc_stop[0]);              \
    POWERPC_GET_PMC2(pmc_stop[1]);              \
    POWERPC_GET_PMC3(pmc_stop[2]);              \
    POWERPC_GET_PMC4(pmc_stop[3]);              \
    POWERPC_GET_PMC5(pmc_stop[4]);              \
    POWERPC_GET_PMC6(pmc_stop[5]);              \
    if (cond) {                                 \
        for(pmc_loop_index = 0;                 \
            pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
            pmc_loop_index++) {                 \
            if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) {  \
                POWERP_PMC_DATATYPE diff =                                \
                    pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
                if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
                    perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
                if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
                    perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
                perfdata[pmc_loop_index][a][powerpc_data_sum] += diff;    \
                perfdata[pmc_loop_index][a][powerpc_data_num] ++;         \
            }                                   \
        }                                       \
    }                                           \
    } while (0)
#else /* CONFIG_POWERPC_PERF */
// those are needed to avoid empty statements.
#define POWERPC_PERF_DECLARE(a, cond)        int altivec_placeholder __attribute__ ((unused))
#define POWERPC_PERF_START_COUNT(a, cond)    do {} while (0)
#define POWERPC_PERF_STOP_COUNT(a, cond)     do {} while (0)
#endif /* CONFIG_POWERPC_PERF */

#endif /* AVCODEC_PPC_DSPUTIL_PPC_H */
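
A minimal sketch of how these macros wrap a measured function; the pattern matches their real use in dsputil_ppc.c and fdct_altivec.c, but my_dsp_routine is hypothetical and the enum slot is reused purely as an example:

/* Hypothetical routine instrumented with the PMC macros above. */
static void my_dsp_routine(void)
{
    POWERPC_PERF_DECLARE(altivec_fft_num, 1);   /* per-function counters */
    POWERPC_PERF_START_COUNT(altivec_fft_num, 1);
    /* ... work being measured ... */
    POWERPC_PERF_STOP_COUNT(altivec_fft_num, 1); /* accumulates min/max/sum */
}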

src/add-ons/media/plugins/avcodec/libavcodec/ppc/fdct_altivec.c
@@ -0,0 +1,493 @@
/* ffmpeg/libavcodec/ppc/fdct_altivec.c, this file is part of the
 * AltiVec optimized library for the FFMPEG Multimedia System
 * Copyright (C) 2003  James Klicman <james@klicman.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


#include "libavutil/common.h"
#include "libavcodec/dsputil.h"
#include "dsputil_ppc.h"
#include "gcc_fixes.h"


#define vs16(v) ((vector signed short)(v))
#define vs32(v) ((vector signed int)(v))
#define  vu8(v) ((vector unsigned char)(v))
#define vu16(v) ((vector unsigned short)(v))
#define vu32(v) ((vector unsigned int)(v))


#define C1     0.98078525066375732421875000 /* cos(1*PI/16) */
#define C2     0.92387950420379638671875000 /* cos(2*PI/16) */
#define C3     0.83146959543228149414062500 /* cos(3*PI/16) */
#define C4     0.70710676908493041992187500 /* cos(4*PI/16) */
#define C5     0.55557024478912353515625000 /* cos(5*PI/16) */
#define C6     0.38268342614173889160156250 /* cos(6*PI/16) */
#define C7     0.19509032368659973144531250 /* cos(7*PI/16) */
#define SQRT_2 1.41421353816986083984375000 /* sqrt(2)      */

#define W0 -(2 * C2)
#define W1  (2 * C6)
#define W2 (SQRT_2 * C6)
#define W3 (SQRT_2 * C3)
#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7))
#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7))
#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7))
#define W8 (SQRT_2 * ( C7 - C3))
#define W9 (SQRT_2 * (-C1 - C3))
#define WA (SQRT_2 * (-C3 - C5))
#define WB (SQRT_2 * ( C5 - C3))


static vector float fdctconsts[3] = {
    { W0, W1, W2, W3 },
    { W4, W5, W6, W7 },
    { W8, W9, WA, WB }
};

#define LD_W0 vec_splat(cnsts0, 0)
#define LD_W1 vec_splat(cnsts0, 1)
#define LD_W2 vec_splat(cnsts0, 2)
#define LD_W3 vec_splat(cnsts0, 3)
#define LD_W4 vec_splat(cnsts1, 0)
#define LD_W5 vec_splat(cnsts1, 1)
#define LD_W6 vec_splat(cnsts1, 2)
#define LD_W7 vec_splat(cnsts1, 3)
#define LD_W8 vec_splat(cnsts2, 0)
#define LD_W9 vec_splat(cnsts2, 1)
#define LD_WA vec_splat(cnsts2, 2)
#define LD_WB vec_splat(cnsts2, 3)


#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
    x0 = vec_add(b0, b7);            /* x0 = b0 + b7; */      \
    x7 = vec_sub(b0, b7);            /* x7 = b0 - b7; */      \
    x1 = vec_add(b1, b6);            /* x1 = b1 + b6; */      \
    x6 = vec_sub(b1, b6);            /* x6 = b1 - b6; */      \
    x2 = vec_add(b2, b5);            /* x2 = b2 + b5; */      \
    x5 = vec_sub(b2, b5);            /* x5 = b2 - b5; */      \
    x3 = vec_add(b3, b4);            /* x3 = b3 + b4; */      \
    x4 = vec_sub(b3, b4);            /* x4 = b3 - b4; */      \
                                                              \
    b7 = vec_add(x0, x3);            /* b7 = x0 + x3; */      \
    b1 = vec_add(x1, x2);            /* b1 = x1 + x2; */      \
    b0 = vec_add(b7, b1);            /* b0 = b7 + b1; */      \
    b4 = vec_sub(b7, b1);            /* b4 = b7 - b1; */      \
                                                              \
    b2 = vec_sub(x0, x3);            /* b2 = x0 - x3; */      \
    b6 = vec_sub(x1, x2);            /* b6 = x1 - x2; */      \
    b5 = vec_add(b6, b2);            /* b5 = b6 + b2; */      \
    cnst = LD_W2;                                             \
    b5 = vec_madd(cnst, b5, mzero);  /* b5 = b5 * W2; */      \
    cnst = LD_W1;                                             \
    b2 = vec_madd(cnst, b2, b5);     /* b2 = b5 + b2 * W1; */ \
    cnst = LD_W0;                                             \
    b6 = vec_madd(cnst, b6, b5);     /* b6 = b5 + b6 * W0; */ \
                                                              \
    x0 = vec_add(x4, x7);            /* x0 = x4 + x7; */      \
    x1 = vec_add(x5, x6);            /* x1 = x5 + x6; */      \
    x2 = vec_add(x4, x6);            /* x2 = x4 + x6; */      \
    x3 = vec_add(x5, x7);            /* x3 = x5 + x7; */      \
    x8 = vec_add(x2, x3);            /* x8 = x2 + x3; */      \
    cnst = LD_W3;                                             \
    x8 = vec_madd(cnst, x8, mzero);  /* x8 = x8 * W3; */      \
                                                              \
    cnst = LD_W8;                                             \
    x0 = vec_madd(cnst, x0, mzero);  /* x0 *= W8; */          \
    cnst = LD_W9;                                             \
    x1 = vec_madd(cnst, x1, mzero);  /* x1 *= W9; */          \
    cnst = LD_WA;                                             \
    x2 = vec_madd(cnst, x2, x8);     /* x2 = x2 * WA + x8; */ \
    cnst = LD_WB;                                             \
    x3 = vec_madd(cnst, x3, x8);     /* x3 = x3 * WB + x8; */ \
                                                              \
    cnst = LD_W4;                                             \
    b7 = vec_madd(cnst, x4, x0);     /* b7 = x4 * W4 + x0; */ \
    cnst = LD_W5;                                             \
    b5 = vec_madd(cnst, x5, x1);     /* b5 = x5 * W5 + x1; */ \
    cnst = LD_W6;                                             \
    b3 = vec_madd(cnst, x6, x1);     /* b3 = x6 * W6 + x1; */ \
    cnst = LD_W7;                                             \
    b1 = vec_madd(cnst, x7, x0);     /* b1 = x7 * W7 + x0; */ \
                                                              \
    b7 = vec_add(b7, x2);            /* b7 = b7 + x2; */      \
    b5 = vec_add(b5, x3);            /* b5 = b5 + x3; */      \
    b3 = vec_add(b3, x2);            /* b3 = b3 + x2; */      \
    b1 = vec_add(b1, x3);            /* b1 = b1 + x3; */      \
    /* }}} */

#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
    x0 = vec_add(b0, b7);            /* x0 = b0 + b7; */      \
    x7 = vec_sub(b0, b7);            /* x7 = b0 - b7; */      \
    x1 = vec_add(b1, b6);            /* x1 = b1 + b6; */      \
    x6 = vec_sub(b1, b6);            /* x6 = b1 - b6; */      \
    x2 = vec_add(b2, b5);            /* x2 = b2 + b5; */      \
    x5 = vec_sub(b2, b5);            /* x5 = b2 - b5; */      \
    x3 = vec_add(b3, b4);            /* x3 = b3 + b4; */      \
    x4 = vec_sub(b3, b4);            /* x4 = b3 - b4; */      \
                                                              \
    b7 = vec_add(x0, x3);            /* b7 = x0 + x3; */      \
    b1 = vec_add(x1, x2);            /* b1 = x1 + x2; */      \
    b0 = vec_add(b7, b1);            /* b0 = b7 + b1; */      \
    b4 = vec_sub(b7, b1);            /* b4 = b7 - b1; */      \
                                                              \
    b2 = vec_sub(x0, x3);            /* b2 = x0 - x3; */      \
    b6 = vec_sub(x1, x2);            /* b6 = x1 - x2; */      \
    b5 = vec_add(b6, b2);            /* b5 = b6 + b2; */      \
    cnst = LD_W2;                                             \
    b5 = vec_madd(cnst, b5, mzero);  /* b5 = b5 * W2; */      \
    cnst = LD_W1;                                             \
    b2 = vec_madd(cnst, b2, b5);     /* b2 = b5 + b2 * W1; */ \
    cnst = LD_W0;                                             \
    b6 = vec_madd(cnst, b6, b5);     /* b6 = b5 + b6 * W0; */ \
                                                              \
    x0 = vec_add(x4, x7);            /* x0 = x4 + x7; */      \
    x1 = vec_add(x5, x6);            /* x1 = x5 + x6; */      \
    x2 = vec_add(x4, x6);            /* x2 = x4 + x6; */      \
    x3 = vec_add(x5, x7);            /* x3 = x5 + x7; */      \
    x8 = vec_add(x2, x3);            /* x8 = x2 + x3; */      \
    cnst = LD_W3;                                             \
    x8 = vec_madd(cnst, x8, mzero);  /* x8 = x8 * W3; */      \
                                                              \
    cnst = LD_W8;                                             \
    x0 = vec_madd(cnst, x0, mzero);  /* x0 *= W8; */          \
    cnst = LD_W9;                                             \
    x1 = vec_madd(cnst, x1, mzero);  /* x1 *= W9; */          \
    cnst = LD_WA;                                             \
    x2 = vec_madd(cnst, x2, x8);     /* x2 = x2 * WA + x8; */ \
    cnst = LD_WB;                                             \
    x3 = vec_madd(cnst, x3, x8);     /* x3 = x3 * WB + x8; */ \
                                                              \
    cnst = LD_W4;                                             \
    b7 = vec_madd(cnst, x4, x0);     /* b7 = x4 * W4 + x0; */ \
    cnst = LD_W5;                                             \
    b5 = vec_madd(cnst, x5, x1);     /* b5 = x5 * W5 + x1; */ \
    cnst = LD_W6;                                             \
    b3 = vec_madd(cnst, x6, x1);     /* b3 = x6 * W6 + x1; */ \
    cnst = LD_W7;                                             \
    b1 = vec_madd(cnst, x7, x0);     /* b1 = x7 * W7 + x0; */ \
                                                              \
    b7 = vec_add(b7, x2);            /* b7 += x2; */          \
    b5 = vec_add(b5, x3);            /* b5 += x3; */          \
    b3 = vec_add(b3, x2);            /* b3 += x2; */          \
    b1 = vec_add(b1, x3);            /* b1 += x3; */          \
    /* }}} */



/* two dimensional discrete cosine transform */

void fdct_altivec(int16_t *block)
{
    POWERPC_PERF_DECLARE(altivec_fdct, 1);
    vector signed short *bp;
    vector float *cp;
    vector float b00, b10, b20, b30, b40, b50, b60, b70;
    vector float b01, b11, b21, b31, b41, b51, b61, b71;
    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;

    POWERPC_PERF_START_COUNT(altivec_fdct, 1);


    /* setup constants {{{ */
    /* mzero = -0.0 */
    mzero = ((vector float)vec_splat_u32(-1));
    mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero)));
    cp = fdctconsts;
    cnsts0 = vec_ld(0, cp); cp++;
    cnsts1 = vec_ld(0, cp); cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */


    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b))

    bp = (vector signed short*)block;
    b00 = ((vector float)vec_ld(0,    bp));
    b40 = ((vector float)vec_ld(16*4, bp));
    b01 = ((vector float)MERGE_S16(h, b00, b40));
    b11 = ((vector float)MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float)vec_ld(0,    bp));
    b50 = ((vector float)vec_ld(16*4, bp));
    b21 = ((vector float)MERGE_S16(h, b10, b50));
    b31 = ((vector float)MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float)vec_ld(0,    bp));
    b60 = ((vector float)vec_ld(16*4, bp));
    b41 = ((vector float)MERGE_S16(h, b20, b60));
    b51 = ((vector float)MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float)vec_ld(0,    bp));
    b70 = ((vector float)vec_ld(16*4, bp));
    b61 = ((vector float)MERGE_S16(h, b30, b70));
    b71 = ((vector float)MERGE_S16(l, b30, b70));

    x0 = ((vector float)MERGE_S16(h, b01, b41));
    x1 = ((vector float)MERGE_S16(l, b01, b41));
    x2 = ((vector float)MERGE_S16(h, b11, b51));
    x3 = ((vector float)MERGE_S16(l, b11, b51));
    x4 = ((vector float)MERGE_S16(h, b21, b61));
    x5 = ((vector float)MERGE_S16(l, b21, b61));
    x6 = ((vector float)MERGE_S16(h, b31, b71));
    x7 = ((vector float)MERGE_S16(l, b31, b71));

    b00 = ((vector float)MERGE_S16(h, x0, x4));
    b10 = ((vector float)MERGE_S16(l, x0, x4));
    b20 = ((vector float)MERGE_S16(h, x1, x5));
    b30 = ((vector float)MERGE_S16(l, x1, x5));
    b40 = ((vector float)MERGE_S16(h, x2, x6));
    b50 = ((vector float)MERGE_S16(l, x2, x6));
    b60 = ((vector float)MERGE_S16(h, x3, x7));
    b70 = ((vector float)MERGE_S16(l, x3, x7));

#undef MERGE_S16
    /* }}} */


    /* Some of the initial calculations can be done as vector short before
     * conversion to vector float.  The following code section takes advantage
     * of this.
     */
#if 1
    /* fdct rows {{{ */
    x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float)vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float)vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float)vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float)vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float)vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float)vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float)vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float)vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float)vec_add(vs16(x1), vs16(x2)));

    b00 = ((vector float)vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float)vec_sub(vs16(b70), vs16(b10)));

#define CTF0(n) \
    b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \
    b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \
    b##n##1 = vec_ctf(vs32(b##n##1), 0); \
    b##n##0 = vec_ctf(vs32(b##n##0), 0);

    CTF0(0);
    CTF0(4);

    b20 = ((vector float)vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float)vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);

#undef CTF0

    x0 = vec_add(b60, b20);
    x1 = vec_add(b61, b21);

    cnst = LD_W2;
    x0 = vec_madd(cnst, x0, mzero);
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20 = vec_madd(cnst, b20, x0);
    b21 = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60 = vec_madd(cnst, b60, x0);
    b61 = vec_madd(cnst, b61, x1);

#define CTFX(x,b) \
    b##0 = ((vector float)vec_unpackh(vs16(x))); \
    b##1 = ((vector float)vec_unpackl(vs16(x))); \
    b##0 = vec_ctf(vs32(b##0), 0); \
    b##1 = vec_ctf(vs32(b##1), 0); \

    CTFX(x4, b7);
    CTFX(x5, b5);
    CTFX(x6, b3);
    CTFX(x7, b1);

#undef CTFX


    x0 = vec_add(b70, b10);
    x1 = vec_add(b50, b30);
    x2 = vec_add(b70, b30);
    x3 = vec_add(b50, b10);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70 = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50 = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30 = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10 = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);


    x0 = vec_add(b71, b11);
    x1 = vec_add(b51, b31);
    x2 = vec_add(b71, b31);
    x3 = vec_add(b51, b11);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71 = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51 = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31 = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11 = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
    b31 = vec_add(b31, x2);
    b11 = vec_add(b11, x3);
    /* }}} */
#else
    /* convert to float {{{ */
#define CTF(n) \
    vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \
    vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \
    b##n##1 = vec_ctf(vs32(b##n##1), 0); \
    b##n##0 = vec_ctf(vs32(b##n##0), 0); \

    CTF(0);
    CTF(1);
    CTF(2);
    CTF(3);
    CTF(4);
    CTF(5);
    CTF(6);
    CTF(7);

#undef CTF
    /* }}} */

    FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71);
#endif


    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
    x0 = vec_mergel(b00, b20);
    x1 = vec_mergeh(b00, b20);
    x2 = vec_mergel(b10, b30);
    x3 = vec_mergeh(b10, b30);

    b00 = vec_mergeh(x1, x3);
    b10 = vec_mergel(x1, x3);
    b20 = vec_mergeh(x0, x2);
    b30 = vec_mergel(x0, x2);

    x4 = vec_mergel(b41, b61);
    x5 = vec_mergeh(b41, b61);
    x6 = vec_mergel(b51, b71);
    x7 = vec_mergeh(b51, b71);

    b41 = vec_mergeh(x5, x7);
    b51 = vec_mergel(x5, x7);
    b61 = vec_mergeh(x4, x6);
    b71 = vec_mergel(x4, x6);

    x0 = vec_mergel(b01, b21);
    x1 = vec_mergeh(b01, b21);
    x2 = vec_mergel(b11, b31);
    x3 = vec_mergeh(b11, b31);

    x4 = vec_mergel(b40, b60);
    x5 = vec_mergeh(b40, b60);
    x6 = vec_mergel(b50, b70);
    x7 = vec_mergeh(b50, b70);

    b40 = vec_mergeh(x1, x3);
    b50 = vec_mergel(x1, x3);
    b60 = vec_mergeh(x0, x2);
    b70 = vec_mergel(x0, x2);

    b01 = vec_mergeh(x5, x7);
    b11 = vec_mergel(x5, x7);
    b21 = vec_mergeh(x4, x6);
    b31 = vec_mergel(x4, x6);
    /* }}} */


    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);


    /* round, convert back to short {{{ */
#define CTS(n) \
    b##n##0 = vec_round(b##n##0); \
    b##n##1 = vec_round(b##n##1); \
    b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \
    b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \
    b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \
    vec_st(vs16(b##n##0), 0, bp);

    bp = (vector signed short*)block;
    CTS(0); bp++;
    CTS(1); bp++;
    CTS(2); bp++;
    CTS(3); bp++;
    CTS(4); bp++;
    CTS(5); bp++;
    CTS(6); bp++;
    CTS(7);

#undef CTS
    /* }}} */

    POWERPC_PERF_STOP_COUNT(altivec_fdct, 1);
}

/* vim:set foldmethod=marker foldlevel=0: */

src/add-ons/media/plugins/avcodec/libavcodec/ppc/fft_altivec.c
@@ -0,0 +1,138 @@
/*
 * FFT/IFFT transforms
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 * Based on code Copyright (c) 2002 Fabrice Bellard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_ppc.h"
#include "util_altivec.h"
/**
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
 * input data must be permuted before with s->revtab table. No
 * 1.0/sqrt(n) normalization is done.
 * AltiVec-enabled
 * This code assumes that the 'z' pointer is 16 bytes-aligned
 * It also assumes all FFTComplex are 8 bytes-aligned pair of float
 * The code is exactly the same as the SSE version, except
 * that successive MUL + ADD/SUB have been merged into
 * fused multiply-add ('vec_madd' in altivec)
 */
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
{
    POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
    register const vector float vczero = (const vector float)vec_splat_u32(0.);

    int ln = s->nbits;
    int j, np, np2;
    int nblocks, nloops;
    register FFTComplex *p, *q;
    FFTComplex *cptr, *cptr1;
    int k;

    POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);

    np = 1 << ln;

    {
        vector float *r, a, b, a1, c1, c2;

        r = (vector float *)&z[0];

        c1 = vcii(p,p,n,n);

        if (s->inverse) {
            c2 = vcii(p,p,n,p);
        } else {
            c2 = vcii(p,p,p,n);
        }

        j = (np >> 2);
        do {
            a = vec_ld(0, r);
            a1 = vec_ld(sizeof(vector float), r);

            b = vec_perm(a,a,vcprmle(1,0,3,2));
            a = vec_madd(a,c1,b);
            /* do the pass 0 butterfly */

            b = vec_perm(a1,a1,vcprmle(1,0,3,2));
            b = vec_madd(a1,c1,b);
            /* do the pass 0 butterfly */

            /* multiply third by -i */
            b = vec_perm(b,b,vcprmle(2,3,1,0));

            /* do the pass 1 butterfly */
            vec_st(vec_madd(b,c2,a), 0, r);
            vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);

            r += 2;
        } while (--j != 0);
    }
    /* pass 2 .. ln-1 */

    nblocks = np >> 3;
    nloops = 1 << 2;
    np2 = np >> 1;

    cptr1 = s->exptab1;
    do {
        p = z;
        q = z + nloops;
        j = nblocks;
        do {
            cptr = cptr1;
            k = nloops >> 1;
            do {
                vector float a,b,c,t1;

                a = vec_ld(0, (float*)p);
                b = vec_ld(0, (float*)q);

                /* complex mul */
                c = vec_ld(0, (float*)cptr);
                /*  cre*re cim*re */
                t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
                c = vec_ld(sizeof(vector float), (float*)cptr);
                /* -cim*im cre*im */
                b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);

                /* butterfly */
                vec_st(vec_add(a,b), 0, (float*)p);
                vec_st(vec_sub(a,b), 0, (float*)q);

                p += 2;
                q += 2;
                cptr += 4;
            } while (--k);

            p += nloops;
            q += nloops;
        } while (--j);
        cptr1 += nloops * 2;
        nblocks = nblocks >> 1;
        nloops = nloops << 1;
    } while (nblocks != 0);

    POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
}
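
A hedged usage sketch showing the revtab permutation the doc comment requires before calling this kernel; ff_fft_init/ff_fft_permute/ff_fft_end come from libavcodec's generic FFT code, not from this diff, so treat the exact signatures as assumptions:

/* Sketch: run a 64-point forward FFT through the AltiVec kernel.
 * Assumes the generic libavcodec FFT API of this era. */
#include "libavcodec/dsputil.h"

static void run_fft64(FFTComplex *z /* 16-byte aligned, 64 entries */)
{
    FFTContext s;
    if (ff_fft_init(&s, 6, 0) < 0)  /* nbits = 6 -> 1 << 6 = 64 points */
        return;
    ff_fft_permute(&s, z);          /* apply s->revtab, as required above */
    ff_fft_calc_altivec(&s, z);     /* in place, no 1/sqrt(n) scaling */
    ff_fft_end(&s);
}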

src/add-ons/media/plugins/avcodec/libavcodec/ppc/float_altivec.c
@@ -0,0 +1,311 @@
/*
 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"
#include "util_altivec.h"

static void vector_fmul_altivec(float *dst, const float *src, int len)
{
    int i;
    vector float d0, d1, s, zero = (vector float)vec_splat_u32(0);
    for(i=0; i<len-7; i+=8) {
        d0 = vec_ld(0, dst+i);
        s = vec_ld(0, src+i);
        d1 = vec_ld(16, dst+i);
        d0 = vec_madd(d0, s, zero);
        d1 = vec_madd(d1, vec_ld(16,src+i), zero);
        vec_st(d0, 0, dst+i);
        vec_st(d1, 16, dst+i);
    }
}

static void vector_fmul_reverse_altivec(float *dst, const float *src0,
                                        const float *src1, int len)
{
    int i;
    vector float d, s0, s1, h0, l0,
                 s2, s3, zero = (vector float)vec_splat_u32(0);
    src1 += len-4;
    for(i=0; i<len-7; i+=8) {
        s1 = vec_ld(0, src1-i);              // [a,b,c,d]
        s0 = vec_ld(0, src0+i);
        l0 = vec_mergel(s1, s1);             // [c,c,d,d]
        s3 = vec_ld(-16, src1-i);
        h0 = vec_mergeh(s1, s1);             // [a,a,b,b]
        s2 = vec_ld(16, src0+i);
        s1 = vec_mergeh(vec_mergel(l0,h0),   // [d,b,d,b]
                        vec_mergeh(l0,h0));  // [c,a,c,a]
                                             // [d,c,b,a]
        l0 = vec_mergel(s3, s3);
        d = vec_madd(s0, s1, zero);
        h0 = vec_mergeh(s3, s3);
        vec_st(d, 0, dst+i);
        s3 = vec_mergeh(vec_mergel(l0,h0),
                        vec_mergeh(l0,h0));
        d = vec_madd(s2, s3, zero);
        vec_st(d, 16, dst+i);
    }
}

static void vector_fmul_add_add_altivec(float *dst, const float *src0,
                                        const float *src1, const float *src2,
                                        int src3, int len, int step)
{
    int i;
    vector float d, s0, s1, s2, t0, t1, edges;
    vector unsigned char align = vec_lvsr(0,dst),
                         mask = vec_lvsl(0, dst);

#if 0 //FIXME: there is still something wrong
    if (step == 2) {
        int y;
        vector float d0, d1, s3, t2;
        vector unsigned int sel =
            vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0));
        t1 = vec_ld(16, dst);
        for (i=0,y=0; i<len-3; i+=4,y+=8) {

            s0 = vec_ld(0,src0+i);
            s1 = vec_ld(0,src1+i);
            s2 = vec_ld(0,src2+i);

//          t0 = vec_ld(0, dst+y);  //[x x x|a]
//          t1 = vec_ld(16, dst+y); //[b c d|e]
            t2 = vec_ld(31, dst+y); //[f g h|x]

            d = vec_madd(s0,s1,s2); // [A B C D]

                                    // [A A B B]

                                    // [C C D D]

            d0 = vec_perm(t0, t1, mask); // [a b c d]

            d0 = vec_sel(vec_mergeh(d, d), d0, sel); // [A b B d]

            edges = vec_perm(t1, t0, mask);

            t0 = vec_perm(edges, d0, align); // [x x x|A]

            t1 = vec_perm(d0, edges, align); // [b B d|e]

            vec_stl(t0, 0, dst+y);

            d1 = vec_perm(t1, t2, mask); // [e f g h]

            d1 = vec_sel(vec_mergel(d, d), d1, sel); // [C f D h]

            edges = vec_perm(t2, t1, mask);

            t1 = vec_perm(edges, d1, align); // [b B d|C]

            t2 = vec_perm(d1, edges, align); // [f D h|x]

            vec_stl(t1, 16, dst+y);

            t0 = t1;

            vec_stl(t2, 31, dst+y);

            t1 = t2;
        }
    } else
#endif
    if (step == 1 && src3 == 0)
        for (i=0; i<len-3; i+=4) {
            t0 = vec_ld(0, dst+i);
            t1 = vec_ld(15, dst+i);
            s0 = vec_ld(0, src0+i);
            s1 = vec_ld(0, src1+i);
            s2 = vec_ld(0, src2+i);
            edges = vec_perm(t1 ,t0, mask);
            d = vec_madd(s0,s1,s2);
            t1 = vec_perm(d, edges, align);
            t0 = vec_perm(edges, d, align);
            vec_st(t1, 15, dst+i);
            vec_st(t0, 0, dst+i);
        }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}

static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
{
    union {
        vector float v;
        float s[4];
    } vadd;
    vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i,j;

    dst += len;
    win += len;
    src0+= len;

    vadd.s[0] = add_bias;
    vadd_bias = vec_splat(vadd.v, 0);
    zero = (vector float)vec_splat_u32(0);

    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        t0 = vec_madd(s0, wj, vadd_bias);
        t0 = vec_nmsub(s1, wi, t0);
        t1 = vec_madd(s0, wi, vadd_bias);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}

static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
{
    union {
        vector float v;
        float s[4];
    } mul_u;
    int i;
    vector float src1, src2, dst1, dst2, mul_v, zero;

    zero = (vector float)vec_splat_u32(0);
    mul_u.s[0] = mul;
    mul_v = vec_splat(mul_u.v, 0);

    for(i=0; i<len; i+=8) {
        src1 = vec_ctf(vec_ld(0,  src+i), 0);
        src2 = vec_ctf(vec_ld(16, src+i), 0);
        dst1 = vec_madd(src1, mul_v, zero);
        dst2 = vec_madd(src2, mul_v, zero);
        vec_st(dst1, 0, dst+i);
        vec_st(dst2, 16, dst+i);
    }
}


static vector signed short
float_to_int16_one_altivec(const float *src)
{
    vector float s0 = vec_ld(0, src);
    vector float s1 = vec_ld(16, src);
    vector signed int t0 = vec_cts(s0, 0);
    vector signed int t1 = vec_cts(s1, 0);
    return vec_packs(t0,t1);
}

static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
    int i;
    vector signed short d0, d1, d;
    vector unsigned char align;
    if(((long)dst)&15) //FIXME
        for(i=0; i<len-7; i+=8) {
            d0 = vec_ld(0, dst+i);
            d = float_to_int16_one_altivec(src+i);
            d1 = vec_ld(15, dst+i);
            d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i));
            align = vec_lvsr(0, dst+i);
            d0 = vec_perm(d1, d, align);
            d1 = vec_perm(d, d1, align);
            vec_st(d0, 0, dst+i);
            vec_st(d1,15, dst+i);
        }
    else
        for(i=0; i<len-7; i+=8) {
            d = float_to_int16_one_altivec(src+i);
            vec_st(d, 0, dst+i);
        }
}

static void
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                  long len, int channels)
{
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;
    if(channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else
        if (channels == 2) {
            if(((long)dst)&15)
                for(i=0; i<len-7; i+=8) {
                    d0 = vec_ld(0, dst + i);
                    t0 = float_to_int16_one_altivec(src[0] + i);
                    d1 = vec_ld(31, dst + i);
                    t1 = float_to_int16_one_altivec(src[1] + i);
                    c0 = vec_mergeh(t0, t1);
                    c1 = vec_mergel(t0, t1);
                    d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
                    align = vec_lvsr(0, dst + i);
                    d0 = vec_perm(d2, c0, align);
                    d1 = vec_perm(c0, c1, align);
                    vec_st(d0, 0, dst + i);
                    d0 = vec_perm(c1, d2, align);
                    vec_st(d1, 15, dst + i);
                    vec_st(d0, 31, dst + i);
                    dst+=8;
                }
            else
                for(i=0; i<len-7; i+=8) {
                    t0 = float_to_int16_one_altivec(src[0] + i);
                    t1 = float_to_int16_one_altivec(src[1] + i);
                    d0 = vec_mergeh(t0, t1);
                    d1 = vec_mergel(t0, t1);
                    vec_st(d0, 0, dst + i);
                    vec_st(d1, 16, dst + i);
                    dst+=8;
                }
        } else {
            DECLARE_ALIGNED(16, int16_t, tmp[len]);
            int c, j;
            for (c = 0; c < channels; c++) {
                float_to_int16_altivec(tmp, src[c], len);
                for (i = 0, j = c; i < len; i++, j+=channels) {
                    dst[j] = tmp[i];
                }
            }
        }
}

void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->vector_fmul = vector_fmul_altivec;
    c->vector_fmul_reverse = vector_fmul_reverse_altivec;
    c->vector_fmul_add_add = vector_fmul_add_add_altivec;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;
    if(!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        c->vector_fmul_window = vector_fmul_window_altivec;
        c->float_to_int16 = float_to_int16_altivec;
        c->float_to_int16_interleave = float_to_int16_interleave_altivec;
    }
}
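
For clarity, scalar restatements of what the first two vector routines above compute (my own reference versions derived from reading the AltiVec code, not code from the diff):

/* Scalar reference semantics of the AltiVec routines. */
static void vector_fmul_ref(float *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] *= src[i];                      /* element-wise product */
}

static void vector_fmul_reverse_ref(float *dst, const float *src0,
                                    const float *src1, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];  /* second operand reversed */
}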

src/add-ons/media/plugins/avcodec/libavcodec/ppc/gcc_fixes.h
@@ -0,0 +1,102 @@
/*
 * gcc fixes for altivec.
 * Used to workaround broken gcc (FSF gcc-3 pre gcc-3.3)
 * and to stay somewhat compatible with Darwin.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_PPC_GCC_FIXES_H
#define AVCODEC_PPC_GCC_FIXES_H

#include "config.h"

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#if (__GNUC__ < 4)
# define REG_v(a)
#else
# define REG_v(a) __asm__ ( #a )
#endif

#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)

/* This code was provided to me by Bartosch Pixa
 * as a separate header file (broken_mergel.h).
 * thanks to lu_zero for the workaround.
 *
 * See this mail for more information:
 * http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html
 */

static inline vector signed char ff_vmrglb (vector signed char const A,
                                            vector signed char const B)
{
    static const vector unsigned char lowbyte = {
        0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b,
        0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f
    };
    return vec_perm (A, B, lowbyte);
}

static inline vector signed short ff_vmrglh (vector signed short const A,
                                             vector signed short const B)
{
    static const vector unsigned char lowhalf = {
        0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b,
        0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f
    };
    return vec_perm (A, B, lowhalf);
}

static inline vector signed int ff_vmrglw (vector signed int const A,
                                           vector signed int const B)
{
    static const vector unsigned char lowword = {
        0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
        0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f
    };
    return vec_perm (A, B, lowword);
}
|
||||
/*#define ff_vmrglb ff_vmrglb
|
||||
#define ff_vmrglh ff_vmrglh
|
||||
#define ff_vmrglw ff_vmrglw
|
||||
*/
|
||||
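/* Re-dispatch vec_mergel through the ff_vmrgl* helpers above: the
 * affected gcc releases get the merge-low builtins wrong (see the mail
 * referenced above), so the same element shuffle is expressed as an
 * explicit vec_perm instead. */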
#undef vec_mergel

#define vec_mergel(a1, a2) \
__ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \
      ((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
__ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \
      ((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
__ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \
      ((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
__ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \
      ((vector unsigned short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
__ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \
      ((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
__ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \
      ((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
__ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \
      ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
    __altivec_link_error_invalid_argument ())))))))

#endif /* (__GNUC__ == 3 && __GNUC_MINOR__ < 3) */

#endif /* AVCODEC_PPC_GCC_FIXES_H */

@@ -0,0 +1,141 @@
/*
 * GMC (Global Motion Compensation)
 * AltiVec-enabled
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_ppc.h"
#include "util_altivec.h"

/*
  AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8
  to preserve proper dst alignment.
*/
#define GMC1_PERF_COND (h==8)
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
{
    POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
    const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) =
        {rounder, rounder, rounder, rounder,
         rounder, rounder, rounder, rounder};
    const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) =
        {
            (16-x16)*(16-y16), /* A */
            (   x16)*(16-y16), /* B */
            (16-x16)*(   y16), /* C */
            (   x16)*(   y16), /* D */
            0, 0, 0, 0         /* padding */
        };
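    /* A, B, C and D are the bilinear weights of the four neighbouring
     * pixels; they always sum to 16*16 = 256, which the final >>8
     * (vec_sr by vcsr8) renormalizes. */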
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
    register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
    int i;
    unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
    unsigned long src_really_odd = (unsigned long)src & 0x0000000F;


    POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);

    tempA = vec_ld(0, (unsigned short*)ABCD);
    Av = vec_splat(tempA, 0);
    Bv = vec_splat(tempA, 1);
    Cv = vec_splat(tempA, 2);
    Dv = vec_splat(tempA, 3);

    rounderV = vec_ld(0, (unsigned short*)rounder_a);

    // we'll be able to pick up our 9 char elements
    // at src from those 32 bytes.
    // we load the first batch here, as inside the loop
    // we can reuse 'src+stride' from one iteration
    // as the 'src' of the next.
    src_0 = vec_ld(0, src);
    src_1 = vec_ld(16, src);
    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));

    if (src_really_odd != 0x0000000F) {
        // if src & 0xF == 0xF, then (src+1) is properly aligned
        // on the second vector.
        srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
    } else {
        srcvB = src_1;
    }
    srcvA = vec_mergeh(vczero, srcvA);
    srcvB = vec_mergeh(vczero, srcvB);

    for(i=0; i<h; i++) {
        dst_odd = (unsigned long)dst & 0x0000000F;
        src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;

        dstv = vec_ld(0, dst);

        // we'll be able to pick up our 9 char elements
        // at src + stride from those 32 bytes,
        // then reuse the resulting 2 vectors srcvC and srcvD
        // as the next srcvA and srcvB.
        src_0 = vec_ld(stride + 0, src);
        src_1 = vec_ld(stride + 16, src);
        srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));

        if (src_really_odd != 0x0000000F) {
            // if src & 0xF == 0xF, then (src+1) is properly aligned
            // on the second vector.
            srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
        } else {
            srcvD = src_1;
        }

        srcvC = vec_mergeh(vczero, srcvC);
        srcvD = vec_mergeh(vczero, srcvD);


        // OK, now we (finally) do the math :-)
        // these four instructions replace 32 int muls & 32 int adds.
        // isn't AltiVec nice?
        tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
        tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
        tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
        tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);

        srcvA = srcvC;
        srcvB = srcvD;

        tempD = vec_sr(tempD, vcsr8);

        dstv2 = vec_pack(tempD, (vector unsigned short)vczero);

        if (dst_odd) {
            dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
        } else {
            dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
        }

        vec_st(dstv2, 0, dst);

        dst += stride;
        src += stride;
    }

    POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
}

File diff suppressed because it is too large

@@ -0,0 +1,694 @@
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

#define CHROMA_MC8_ALTIVEC_CORE \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
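    /* H.264 chroma eighth-pel bilinear weights: A+B+C+D = 64, so the
     * +32 bias (v32ss) and >>6 (v6us) in the core macros give rounded
     * division by 64. */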
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
        {((8 - x) * (8 - y)),
         ((    x) * (8 - y)),
         ((8 - x) * (    y)),
         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

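    /* Per row: apply the H.264 six-tap luma filter
     * (1, -5, 20, 20, -5, 1), i.e. 20*(p0+p1) - 5*(m1+p2) + (m2+p3),
     * with the +16 bias (v16ss) folded in before the >>5 (v5us). */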
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}

/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

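    /* Same six-tap filter, applied vertically: the five most recent
     * rows are kept in registers and shifted down one slot per
     * iteration, so each source row is loaded exactly once. */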
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}

/* this code assumes that stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
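    /* First pass: run the horizontal six-tap filter over the 16+5 = 21
     * rows the vertical pass will need, keeping the unrounded 16-bit
     * intermediate results in tmp. */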
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

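    /* Second pass: vertical six-tap filter over the 16-bit intermediates.
     * Products no longer fit in 16 bits, so the even/odd lanes are
     * widened to 32 bits with vec_mule/vec_mulo (the x1 term via an
     * arithmetic >>16 for even lanes), biased with +512 (v512si) and
     * shifted right by 10 before repacking. */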
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}

@@ -0,0 +1,227 @@
/*
 * Copyright (c) 2001 Michel Lespinasse
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * NOTE: This code is based on GPL code from the libmpeg2 project. The
 * author, Michel Lespinasse, has given explicit permission to release
 * under LGPL as part of FFmpeg.
 */

/*
 * FFmpeg integration by Dieter Shirley
 *
 * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
 * project. I've deleted all of the libmpeg2-specific code, renamed the
 * functions and re-ordered the function parameters. The only change to the
 * IDCT function itself was to factor out the partial transposition, and to
 * perform a full transpose at the end of the function.
 */


#include <stdlib.h>                                      /* malloc(), free() */
#include <string.h>
#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"
#include "types_altivec.h"
#include "dsputil_ppc.h"

#define IDCT_HALF \
    /* 1st stage */ \
    t1 = vec_mradds (a1, vx7, vx1 ); \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
    t7 = vec_mradds (a2, vx5, vx3); \
    t3 = vec_mradds (ma2, vx3, vx5); \
\
    /* 2nd stage */ \
    t5 = vec_adds (vx0, vx4); \
    t0 = vec_subs (vx0, vx4); \
    t2 = vec_mradds (a0, vx6, vx2); \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
    t6 = vec_adds (t8, t3); \
    t3 = vec_subs (t8, t3); \
    t8 = vec_subs (t1, t7); \
    t1 = vec_adds (t1, t7); \
\
    /* 3rd stage */ \
    t7 = vec_adds (t5, t2); \
    t2 = vec_subs (t5, t2); \
    t5 = vec_adds (t0, t4); \
    t0 = vec_subs (t0, t4); \
    t4 = vec_subs (t8, t3); \
    t3 = vec_adds (t8, t3); \
\
    /* 4th stage */ \
    vy0 = vec_adds (t7, t1); \
    vy7 = vec_subs (t7, t1); \
    vy1 = vec_mradds (c4, t3, t5); \
    vy6 = vec_mradds (mc4, t3, t5); \
    vy2 = vec_mradds (c4, t4, t0); \
    vy5 = vec_mradds (mc4, t4, t0); \
    vy3 = vec_adds (t2, t6); \
    vy4 = vec_subs (t2, t6);


#define IDCT \
    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
    vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \
    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
    vec_u16 shift; \
\
    c4 = vec_splat (constants[0], 0); \
    a0 = vec_splat (constants[0], 1); \
    a1 = vec_splat (constants[0], 2); \
    a2 = vec_splat (constants[0], 3); \
    mc4 = vec_splat (constants[0], 4); \
    ma2 = vec_splat (constants[0], 5); \
    bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \
\
    zero = vec_splat_s16 (0); \
    shift = vec_splat_u16 (4); \
\
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
\
    IDCT_HALF \
\
    vx0 = vec_mergeh (vy0, vy4); \
    vx1 = vec_mergel (vy0, vy4); \
    vx2 = vec_mergeh (vy1, vy5); \
    vx3 = vec_mergel (vy1, vy5); \
    vx4 = vec_mergeh (vy2, vy6); \
    vx5 = vec_mergel (vy2, vy6); \
    vx6 = vec_mergeh (vy3, vy7); \
    vx7 = vec_mergel (vy3, vy7); \
\
    vy0 = vec_mergeh (vx0, vx4); \
    vy1 = vec_mergel (vx0, vx4); \
    vy2 = vec_mergeh (vx1, vx5); \
    vy3 = vec_mergel (vx1, vx5); \
    vy4 = vec_mergeh (vx2, vx6); \
    vy5 = vec_mergel (vx2, vx6); \
    vy6 = vec_mergeh (vx3, vx7); \
    vy7 = vec_mergel (vx3, vx7); \
\
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
    vx1 = vec_mergel (vy0, vy4); \
    vx2 = vec_mergeh (vy1, vy5); \
    vx3 = vec_mergel (vy1, vy5); \
    vx4 = vec_mergeh (vy2, vy6); \
    vx5 = vec_mergel (vy2, vy6); \
    vx6 = vec_mergeh (vy3, vy7); \
    vx7 = vec_mergel (vy3, vy7); \
\
    IDCT_HALF \
\
    shift = vec_splat_u16 (6); \
    vx0 = vec_sra (vy0, shift); \
    vx1 = vec_sra (vy1, shift); \
    vx2 = vec_sra (vy2, shift); \
    vx3 = vec_sra (vy3, shift); \
    vx4 = vec_sra (vy4, shift); \
    vx5 = vec_sra (vy5, shift); \
    vx6 = vec_sra (vy6, shift); \
    vx7 = vec_sra (vy7, shift);

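/* 1.15 fixed-point constants: constants[0] packs the cosine rotation
 * factors (and, viewed as int32, the rounding bias picked up by the
 * IDCT macro), while constants[1..4] are the per-row prescale factors.
 * They are applied through vec_mradds, which multiplies, rounds and
 * shifts right by 15 in one instruction. */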
static const vec_s16 constants[5] = {
    {23170, 13573,  6518, 21895, -23170, -21895,    32,    31},
    {16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725},
    {22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521},
    {21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692},
    {19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722}
};

void idct_put_altivec(uint8_t* dest, int stride, vec_s16* block)
{
    POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
    vec_u8 tmp;

#if CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#endif
    IDCT

#define COPY(dest,src) \
    tmp = vec_packsu (src, src); \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    COPY (dest, vx0) dest += stride;
    COPY (dest, vx1) dest += stride;
    COPY (dest, vx2) dest += stride;
    COPY (dest, vx3) dest += stride;
    COPY (dest, vx4) dest += stride;
    COPY (dest, vx5) dest += stride;
    COPY (dest, vx6) dest += stride;
    COPY (dest, vx7)

    POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}

void idct_add_altivec(uint8_t* dest, int stride, vec_s16* block)
{
    POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
    vec_u8 perm1;
    vec_u8 p0, p1, p;

#if CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#endif

    IDCT

    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

#define ADD(dest,src,perm) \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
    tmp = vec_ld (0, dest); \
    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \
    tmp3 = vec_adds (tmp2, src); \
    tmp = vec_packsu (tmp3, tmp3); \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0) dest += stride;
    ADD (dest, vx1, perm1) dest += stride;
    ADD (dest, vx2, perm0) dest += stride;
    ADD (dest, vx3, perm1) dest += stride;
    ADD (dest, vx4, perm0) dest += stride;
    ADD (dest, vx5, perm1) dest += stride;
    ADD (dest, vx6, perm0) dest += stride;
    ADD (dest, vx7, perm1)

    POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
}

@@ -0,0 +1,142 @@
/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/ppc/imgresample_altivec.c
 * High quality image resampling with polyphase filters - AltiVec bits
 */

#include "util_altivec.h"
#define FILTER_BITS 8

typedef union {
    vector signed short v;
    signed short s[8];
} vec_ss;

void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter)
{
    int sum, i;
    const uint8_t *s;
    vector unsigned char *tv, tmp, dstv, zero;
    vec_ss srchv[4], srclv[4], fv[4];
    vector signed short zeros, sumhv, sumlv;
    s = src;

    for(i=0;i<4;i++) {
        /*
           The vec_madds later on does an implicit >>15 on the result.
           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
           a signed short, we have just enough bits to pre-shift our
           filter constants <<7 to compensate for vec_madds.
        */
        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
        fv[i].v = vec_splat(fv[i].v, 0);
    }

    zero = vec_splat_u8(0);
    zeros = vec_splat_s16(0);


    /*
       When we're resampling, we'd ideally like both our input and output
       buffers to be 16-byte aligned, so we can do both aligned reads and
       writes. Sadly we can't always have this at the moment, so we opt
       for aligned writes, as unaligned writes have a huge overhead.
       To do this, do enough scalar resamples to get dst 16-byte aligned.
    */
    i = (-(int)dst) & 0xf;
    while(i>0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum<0) sum = 0; else if (sum>255) sum=255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
        i--;
    }

    /* Do our AltiVec resampling on 16 pixels at once. */
    while(dst_width>=16) {
        /* Read 16 (potentially unaligned) bytes from each of
           4 lines into 4 vectors, and split them into shorts.
           Interleave the multiply/accumulate for the resample
           filter with the loads to hide the 3 cycle latency
           the vec_madds have. */
        tv = (vector unsigned char *) &s[0 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);

        tv = (vector unsigned char *) &s[1 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);

        tv = (vector unsigned char *) &s[2 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);

        tv = (vector unsigned char *) &s[3 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);

        /* Pack the results into our destination vector,
           and do an aligned write of that back to memory. */
        dstv = vec_packsu(sumhv, sumlv);
        vec_st(dstv, 0, (vector unsigned char *) dst);

        dst+=16;
        s+=16;
        dst_width-=16;
    }

    /* If there are any leftover pixels, resample them
       with the slow scalar method. */
    while(dst_width>0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum<0) sum = 0; else if (sum>255) sum=255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
}

@@ -0,0 +1,26 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_PPC_IMGRESAMPLE_ALTIVEC_H
#define AVCODEC_PPC_IMGRESAMPLE_ALTIVEC_H

#include <stdint.h>

void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter);
#endif /* AVCODEC_PPC_IMGRESAMPLE_ALTIVEC_H */

@@ -0,0 +1,143 @@
/*
 * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 ** @file libavcodec/ppc/int_altivec.c
 ** miscellaneous integer operations
 **/

#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#include "types_altivec.h"

static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size) {
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l, vpix1h;
    union { vector signed int vscore;
            int32_t score[4];
    } u;
    u.vscore = vec_splat_s32(0);

    //XXX lazy way, fix it later

#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));

    size16 = size >> 4;
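    // Each iteration handles 16 int8 samples against 16 int16 samples:
    // sign-extend the bytes with vec_unpackh/vec_unpackl, subtract, and
    // let vec_msum accumulate the squared differences into four partial
    // int32 sums, which vec_sums collapses after the loop.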
    while(size16) {
        // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
        // load pix1 and the first batch of pix2

        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // unpack
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load another batch from pix2
        vpix2 = vec_unaligned_load(pix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
        size16--;
    }
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    }
    return u.score[3];
}

static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16 vec, *pv;

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}

static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16 vec, *pv;

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}

static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1, *pv;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    DECLARE_ALIGNED_16(int32_t, ires);

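    // vec_splat_u32 can only materialize 5-bit immediates, so the
    // runtime shift count is assembled bit by bit into a splatted
    // vector (the 0x10 contribution is built as 8 << 1).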
    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v1;
        vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        t = vec_sr(t, shifts);
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}

void int_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
    c->add_int16 = add_int16_altivec;
    c->sub_int16 = sub_int16_altivec;
    c->scalarproduct_int16 = scalarproduct_int16_altivec;
}

@@ -0,0 +1,40 @@
/*
 * simple math operations
 * Copyright (c) 2001, 2002 Fabrice Bellard
 * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_PPC_MATHOPS_H
#define AVCODEC_PPC_MATHOPS_H

#include "config.h"

#if HAVE_PPC4XX
/* signed 16x16 -> 32 multiply add accumulate */
#define MAC16(rt, ra, rb) \
    __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));

/* signed 16x16 -> 32 multiply */
#define MUL16(ra, rb) \
    ({ int __rt; \
       __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
       __rt; })
#endif

#endif /* AVCODEC_PPC_MATHOPS_H */

@@ -0,0 +1,627 @@
/*
|
||||
* Copyright (c) 2002 Dieter Shirley
|
||||
*
|
||||
* dct_unquantize_h263_altivec:
|
||||
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "libavcodec/dsputil.h"
|
||||
#include "libavcodec/mpegvideo.h"
|
||||
|
||||
#include "gcc_fixes.h"
|
||||
|
||||
#include "dsputil_ppc.h"
|
||||
#include "util_altivec.h"
|
||||
// Swaps two variables (used for altivec registers)
|
||||
#define SWAP(a,b) \
|
||||
do { \
|
||||
__typeof__(a) swap_temp=a; \
|
||||
a=b; \
|
||||
b=swap_temp; \
|
||||
} while (0)
|
||||
|
||||
// transposes a matrix consisting of four vectors with four elements each
|
||||
#define TRANSPOSE4(a,b,c,d) \
|
||||
do { \
|
||||
__typeof__(a) _trans_ach = vec_mergeh(a, c); \
|
||||
__typeof__(a) _trans_acl = vec_mergel(a, c); \
|
||||
__typeof__(a) _trans_bdh = vec_mergeh(b, d); \
|
||||
__typeof__(a) _trans_bdl = vec_mergel(b, d); \
|
||||
\
|
||||
a = vec_mergeh(_trans_ach, _trans_bdh); \
|
||||
b = vec_mergel(_trans_ach, _trans_bdh); \
|
||||
c = vec_mergeh(_trans_acl, _trans_bdl); \
|
||||
d = vec_mergel(_trans_acl, _trans_bdl); \
|
||||
} while (0)
|
||||
|
||||
|
||||
// Loads a four-byte value (int or float) from the target address
|
||||
// into every element in the target vector. Only works if the
|
||||
// target address is four-byte aligned (which should be always).
|
||||
#define LOAD4(vec, address) \
|
||||
{ \
|
||||
__typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
|
||||
vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
|
||||
vec = vec_ld(0, _load_addr); \
|
||||
vec = vec_perm(vec, vec, _perm_vec); \
|
||||
vec = vec_splat(vec, 0); \
|
||||
}
|
||||
|
||||
|
||||
#define FOUROF(a) {a,a,a,a}
|
||||
|
||||
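/*
 * Illustrative note (ours, not part of the FFmpeg import): TRANSPOSE4 above
 * is the classic two-stage merge transpose. A scalar model of what it
 * computes, for 4x4 elements m[r][c]:
 *
 *     for (r = 0; r < 4; r++)
 *         for (c = 0; c < 4; c++)
 *             out[c][r] = m[r][c];
 *
 * The first vec_mergeh/vec_mergel pair interleaves rows (a,c) and (b,d); the
 * second pair interleaves those results, which lands element (r,c) in
 * position (c,r) without any scalar stores.
 */
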
int dct_quantize_altivec(MpegEncContext* s,
                         DCTELEM* data, int n,
                         int qscale, int* overflow)
{
    int lastNonZero;
    vector float row0, row1, row2, row3, row4, row5, row6, row7;
    vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
    const vector float zero = (const vector float)FOUROF(0.);
    // used after quantize step
    int oldBaseValue = 0;

    // Load the data into the row/alt vectors
    {
        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;

        data0 = vec_ld(0, data);
        data1 = vec_ld(16, data);
        data2 = vec_ld(32, data);
        data3 = vec_ld(48, data);
        data4 = vec_ld(64, data);
        data5 = vec_ld(80, data);
        data6 = vec_ld(96, data);
        data7 = vec_ld(112, data);

        // Transpose the data before we start
        TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);

        // load the data into floating point vectors.  We load
        // the high half of each row into the main row vectors
        // and the low half into the alt vectors.
        row0 = vec_ctf(vec_unpackh(data0), 0);
        alt0 = vec_ctf(vec_unpackl(data0), 0);
        row1 = vec_ctf(vec_unpackh(data1), 0);
        alt1 = vec_ctf(vec_unpackl(data1), 0);
        row2 = vec_ctf(vec_unpackh(data2), 0);
        alt2 = vec_ctf(vec_unpackl(data2), 0);
        row3 = vec_ctf(vec_unpackh(data3), 0);
        alt3 = vec_ctf(vec_unpackl(data3), 0);
        row4 = vec_ctf(vec_unpackh(data4), 0);
        alt4 = vec_ctf(vec_unpackl(data4), 0);
        row5 = vec_ctf(vec_unpackh(data5), 0);
        alt5 = vec_ctf(vec_unpackl(data5), 0);
        row6 = vec_ctf(vec_unpackh(data6), 0);
        alt6 = vec_ctf(vec_unpackl(data6), 0);
        row7 = vec_ctf(vec_unpackh(data7), 0);
        alt7 = vec_ctf(vec_unpackl(data7), 0);
    }

    // The following block could exist as a separate AltiVec DCT
    // function.  However, if we put it inline, the DCT data can remain
    // in the vector local variables, as floats, which we'll use during the
    // quantize step...
    {
        const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
        const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
        const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f);
        const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f);
        const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f);
        const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f);
        const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f);
        const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f);
        const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f);
        const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f);
        const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f);
        const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f);


        int whichPass, whichHalf;

        for(whichPass = 1; whichPass<=2; whichPass++) {
            for(whichHalf = 1; whichHalf<=2; whichHalf++) {
                vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
                vector float tmp10, tmp11, tmp12, tmp13;
                vector float z1, z2, z3, z4, z5;

                tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
                tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
                tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
                tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
                tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
                tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
                tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
                tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];

                tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
                tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
                tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
                tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;


                // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
                row0 = vec_add(tmp10, tmp11);

                // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
                row4 = vec_sub(tmp10, tmp11);


                // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
                z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);

                // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                //                                CONST_BITS-PASS1_BITS);
                row2 = vec_madd(tmp13, vec_0_765366865, z1);

                // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                //                                CONST_BITS-PASS1_BITS);
                row6 = vec_madd(tmp12, vec_1_847759065, z1);

                z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
                z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
                z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
                z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;

                // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
                z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);

                // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
                z3 = vec_madd(z3, vec_1_961570560, z5);

                // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
                z4 = vec_madd(z4, vec_0_390180644, z5);

                // The following adds are rolled into the multiplies above
                // z3 = vec_add(z3, z5);  // z3 += z5;
                // z4 = vec_add(z4, z5);  // z4 += z5;

                // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
                // Wow!  It's actually more efficient to roll this multiply
                // into the adds below, even though the multiply gets done twice!
                // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);

                // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
                // Same with this one...
                // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);

                // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
                // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
                row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));

                // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
                // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
                row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));

                // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
                // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
                row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));

                // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
                // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
                row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));

                // Swap the row values with the alts.  If this is the first half,
                // this sets up the low values to be acted on in the second half.
                // If this is the second half, it puts the high values back in
                // the row values where they are expected to be when we're done.
                SWAP(row0, alt0);
                SWAP(row1, alt1);
                SWAP(row2, alt2);
                SWAP(row3, alt3);
                SWAP(row4, alt4);
                SWAP(row5, alt5);
                SWAP(row6, alt6);
                SWAP(row7, alt7);
            }

            if (whichPass == 1) {
                // transpose the data for the second pass

                // First, block transpose the upper right with lower left.
                SWAP(row4, alt0);
                SWAP(row5, alt1);
                SWAP(row6, alt2);
                SWAP(row7, alt3);

                // Now, transpose each block of four
                TRANSPOSE4(row0, row1, row2, row3);
                TRANSPOSE4(row4, row5, row6, row7);
                TRANSPOSE4(alt0, alt1, alt2, alt3);
                TRANSPOSE4(alt4, alt5, alt6, alt7);
            }
        }
    }

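    /*
     * Illustrative note (ours, not from the import): the vec_0_* .. vec_3_*
     * constants above are the standard fixed-point factors of the scalar
     * jfdctint forward DCT quoted in the comments, e.g.
     * FIX_0_541196100 = sqrt(2)*cos(6*pi/16). One fused step in scalar form,
     * matching the vec_madd() chains above:
     *
     *     z1   = (tmp12 + tmp13) * 0.541196100;
     *     out2 = z1 + tmp13 * 0.765366865;
     *     out6 = z1 - tmp12 * 1.847759065;  // negating the constant folds the
     *                                       // subtraction into one multiply-add
     */
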
    // perform the quantize step, using the floating point data
    // still in the row/alt registers
    {
        const int* biasAddr;
        const vector signed int* qmat;
        vector float bias, negBias;

        if (s->mb_intra) {
            vector signed int baseVector;

            // We must cache element 0 in the intra case
            // (it needs special handling).
            baseVector = vec_cts(vec_splat(row0, 0), 0);
            vec_ste(baseVector, 0, &oldBaseValue);

            qmat = (vector signed int*)s->q_intra_matrix[qscale];
            biasAddr = &(s->intra_quant_bias);
        } else {
            qmat = (vector signed int*)s->q_inter_matrix[qscale];
            biasAddr = &(s->inter_quant_bias);
        }

        // Load the bias vector (We add 0.5 to the bias so that we're
        // rounding when we convert to int, instead of flooring.)
        {
            vector signed int biasInt;
            const vector float negOneFloat = (vector float)FOUROF(-1.0f);
            LOAD4(biasInt, biasAddr);
            bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT);
            negBias = vec_madd(bias, negOneFloat, zero);
        }

        {
            vector float q0, q1, q2, q3, q4, q5, q6, q7;

            q0 = vec_ctf(qmat[0], QMAT_SHIFT);
            q1 = vec_ctf(qmat[2], QMAT_SHIFT);
            q2 = vec_ctf(qmat[4], QMAT_SHIFT);
            q3 = vec_ctf(qmat[6], QMAT_SHIFT);
            q4 = vec_ctf(qmat[8], QMAT_SHIFT);
            q5 = vec_ctf(qmat[10], QMAT_SHIFT);
            q6 = vec_ctf(qmat[12], QMAT_SHIFT);
            q7 = vec_ctf(qmat[14], QMAT_SHIFT);

            row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias),
                    vec_cmpgt(row0, zero));
            row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias),
                    vec_cmpgt(row1, zero));
            row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias),
                    vec_cmpgt(row2, zero));
            row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias),
                    vec_cmpgt(row3, zero));
            row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias),
                    vec_cmpgt(row4, zero));
            row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias),
                    vec_cmpgt(row5, zero));
            row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias),
                    vec_cmpgt(row6, zero));
            row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias),
                    vec_cmpgt(row7, zero));

            q0 = vec_ctf(qmat[1], QMAT_SHIFT);
            q1 = vec_ctf(qmat[3], QMAT_SHIFT);
            q2 = vec_ctf(qmat[5], QMAT_SHIFT);
            q3 = vec_ctf(qmat[7], QMAT_SHIFT);
            q4 = vec_ctf(qmat[9], QMAT_SHIFT);
            q5 = vec_ctf(qmat[11], QMAT_SHIFT);
            q6 = vec_ctf(qmat[13], QMAT_SHIFT);
            q7 = vec_ctf(qmat[15], QMAT_SHIFT);

            alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
                    vec_cmpgt(alt0, zero));
            alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
                    vec_cmpgt(alt1, zero));
            alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
                    vec_cmpgt(alt2, zero));
            alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
                    vec_cmpgt(alt3, zero));
            alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
                    vec_cmpgt(alt4, zero));
            alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
                    vec_cmpgt(alt5, zero));
            alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
                    vec_cmpgt(alt6, zero));
            alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
                    vec_cmpgt(alt7, zero));
        }


    }

    // Store the data back into the original block
    {
        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;

        data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
        data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
        data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
        data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
        data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
        data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
        data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
        data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));

        {
            // Clamp for overflow
            vector signed int max_q_int, min_q_int;
            vector signed short max_q, min_q;

            LOAD4(max_q_int, &(s->max_qcoeff));
            LOAD4(min_q_int, &(s->min_qcoeff));

            max_q = vec_pack(max_q_int, max_q_int);
            min_q = vec_pack(min_q_int, min_q_int);

            data0 = vec_max(vec_min(data0, max_q), min_q);
            data1 = vec_max(vec_min(data1, max_q), min_q);
            data2 = vec_max(vec_min(data2, max_q), min_q);
            data3 = vec_max(vec_min(data3, max_q), min_q);
            data4 = vec_max(vec_min(data4, max_q), min_q);
            data5 = vec_max(vec_min(data5, max_q), min_q);
            data6 = vec_max(vec_min(data6, max_q), min_q);
            data7 = vec_max(vec_min(data7, max_q), min_q);
        }

        {
            vector bool char zero_01, zero_23, zero_45, zero_67;
            vector signed char scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67;
            vector signed char negOne = vec_splat_s8(-1);
            vector signed char* scanPtr =
                    (vector signed char*)(s->intra_scantable.inverse);
            signed char lastNonZeroChar;

            // Determine the largest non-zero index.
            zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero),
                    vec_cmpeq(data1, (vector signed short)zero));
            zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero),
                    vec_cmpeq(data3, (vector signed short)zero));
            zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero),
                    vec_cmpeq(data5, (vector signed short)zero));
            zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero),
                    vec_cmpeq(data7, (vector signed short)zero));

            // 64 biggest values
            scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01);
            scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23);
            scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45);
            scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67);

            // 32 largest values
            scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23);
            scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67);

            // 16 largest values
            scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45);

            // 8 largest values
            scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                    vec_mergel(scanIndexes_01, negOne));

            // 4 largest values
            scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                    vec_mergel(scanIndexes_01, negOne));

            // 2 largest values
            scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                    vec_mergel(scanIndexes_01, negOne));

            // largest value
            scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
                    vec_mergel(scanIndexes_01, negOne));

            scanIndexes_01 = vec_splat(scanIndexes_01, 0);


            vec_ste(scanIndexes_01, 0, &lastNonZeroChar);

            lastNonZero = lastNonZeroChar;

            // While the data is still in vectors we check for the transpose IDCT permute
            // and handle it using the vector unit if we can.  This is the permute used
            // by the altivec idct, so it is common when using the altivec dct.

            if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) {
                TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
            }

            vec_st(data0, 0, data);
            vec_st(data1, 16, data);
            vec_st(data2, 32, data);
            vec_st(data3, 48, data);
            vec_st(data4, 64, data);
            vec_st(data5, 80, data);
            vec_st(data6, 96, data);
            vec_st(data7, 112, data);
        }
    }

    // special handling of block[0]
    if (s->mb_intra) {
        if (!s->h263_aic) {
            if (n < 4)
                oldBaseValue /= s->y_dc_scale;
            else
                oldBaseValue /= s->c_dc_scale;
        }

        // Divide by 8, rounding the result
        data[0] = (oldBaseValue + 4) >> 3;
    }

    // We handled the transpose permutation above and we don't
    // need to permute the "no" permutation case.
    if ((lastNonZero > 0) &&
            (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
            (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) {
        ff_block_permute(data, s->dsp.idct_permutation,
                s->intra_scantable.scantable, lastNonZero);
    }

    return lastNonZero;
}

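/*
 * Illustrative scalar model (ours, not part of the import) of the quantize
 * step vectorized above: each coefficient is scaled by the per-position
 * quantizer and biased away from zero before truncation, i.e. roughly
 *
 *     float q = qmat[pos] / (float)(1 << QMAT_SHIFT);
 *     float b = bias      / (float)(1 << QUANT_BIAS_SHIFT);
 *     level   = (int)(coef * q + (coef > 0 ? b : -b));
 *
 * vec_sel() picks +bias or -bias per lane, and vec_madd() fuses the multiply
 * with the bias add.
 */
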
/* AltiVec version of dct_unquantize_h263
   this code assumes `block' is 16-byte aligned */
void dct_unquantize_h263_altivec(MpegEncContext *s,
                                 DCTELEM *block, int n, int qscale)
{
    POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1);
    int i, level, qmul, qadd;
    int nCoeffs;

    assert(s->block_last_index[n]>=0);

    POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);

    qadd = (qscale - 1) | 1;
    qmul = qscale << 1;

    if (s->mb_intra) {
        if (!s->h263_aic) {
            if (n < 4)
                block[0] = block[0] * s->y_dc_scale;
            else
                block[0] = block[0] * s->c_dc_scale;
        } else
            qadd = 0;
        i = 1;
        nCoeffs= 63; //does not always use zigzag table
    } else {
        i = 0;
        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
    }

    {
        register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
        DECLARE_ALIGNED_16(short, qmul8[]) =
            {
                qmul, qmul, qmul, qmul,
                qmul, qmul, qmul, qmul
            };
        DECLARE_ALIGNED_16(short, qadd8[]) =
            {
                qadd, qadd, qadd, qadd,
                qadd, qadd, qadd, qadd
            };
        DECLARE_ALIGNED_16(short, nqadd8[]) =
            {
                -qadd, -qadd, -qadd, -qadd,
                -qadd, -qadd, -qadd, -qadd
            };
        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
        register vector bool short blockv_null, blockv_neg;
        register short backup_0 = block[0];
        register int j = 0;

        qmulv = vec_ld(0, qmul8);
        qaddv = vec_ld(0, qadd8);
        nqaddv = vec_ld(0, nqadd8);

#if 0 // block *is* 16-byte aligned, it seems.
        // first make sure block[j] is 16-byte aligned
        for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
            level = block[j];
            if (level) {
                if (level < 0) {
                    level = level * qmul - qadd;
                } else {
                    level = level * qmul + qadd;
                }
                block[j] = level;
            }
        }
#endif

        // vectorize all the 16-byte aligned blocks
        // of 8 elements
        for(; (j + 7) <= nCoeffs ; j+=8) {
            blockv = vec_ld(j << 1, block);
            blockv_neg = vec_cmplt(blockv, vczero);
            blockv_null = vec_cmpeq(blockv, vczero);
            // choose between +qadd or -qadd as the third operand
            temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
            // multiply & add (block[i..i+7] * qmul [+-] qadd)
            temp1 = vec_mladd(blockv, qmulv, temp1);
            // put 0 where block[i..i+7] used to have 0
            blockv = vec_sel(temp1, blockv, blockv_null);
            vec_st(blockv, j << 1, block);
        }

        // if nCoeffs isn't a multiple of 8, finish the job
        // using good old scalar units.
        // (we could do it using a truncated vector,
        // but I'm not sure it's worth the hassle)
        for(; j <= nCoeffs ; j++) {
            level = block[j];
            if (level) {
                if (level < 0) {
                    level = level * qmul - qadd;
                } else {
                    level = level * qmul + qadd;
                }
                block[j] = level;
            }
        }

        if (i == 1) {
            // cheat. this avoids special-casing the first iteration
            block[0] = backup_0;
        }
    }
    POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
}

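/*
 * Illustrative note (ours): the vectorized loop above is the branchless form
 * of the scalar tail loop that follows it; per element it computes
 *
 *     level ? level * qmul + (level < 0 ? -qadd : qadd) : 0
 *
 * vec_sel(qaddv, nqaddv, blockv_neg) picks the sign of qadd per lane,
 * vec_mladd does the multiply-add, and the final vec_sel restores the
 * zero lanes.
 */
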
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

void MPV_common_init_altivec(MpegEncContext *s)
{
    if ((mm_flags & FF_MM_ALTIVEC) == 0) return;

    if (s->avctx->lowres==0) {
        if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
                (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) {
            s->dsp.idct_put = idct_put_altivec;
            s->dsp.idct_add = idct_add_altivec;
            s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
        }
    }

    // Test to make sure that the required DCT alignments are met.
    if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
        (((long)(s->q_inter_matrix) & 0x0f) != 0)) {
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
        return;
    }

    if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) {
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
        return;
    }


    if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
        (s->avctx->dct_algo == FF_DCT_ALTIVEC)) {
#if 0 /* seems to cause trouble under some circumstances */
        s->dct_quantize = dct_quantize_altivec;
#endif
        s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
        s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
    }
}

@ -0,0 +1,788 @@
/*
 * AltiVec-optimized snow DSP utils
 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "libavcodec/snow.h"

#include "gcc_fixes.h"
#include "dsputil_altivec.h"

#undef NDEBUG
#include <assert.h>



//FIXME remove this replication
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))

static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
{
    int offset;
    DWTELEM * buffer;

//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);

    assert(buf->data_stack_top >= 0);
//  assert(!buf->line[line]);
    if (buf->line[line])
        return buf->line[line];

    offset = buf->line_width * line;
    buffer = buf->data_stack[buf->data_stack_top];
    buf->data_stack_top--;
    buf->line[line] = buffer;

//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);

    return buffer;
}
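
/*
 * Illustrative note (ours): slice_buffer_get_line() is the lazy accessor the
 * add_yblock functions below rely on -- it returns the cached line when
 * present, otherwise it pops a spare buffer off the data stack:
 *
 *     DWTELEM *dst = slice_buffer_get_line(sb, src_y + y);
 */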

//altivec code

void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width)
{
#if 0
    const int w2= (width+1)>>1;
    DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    vector signed short t1, t2, x, y, tmp1, tmp2;
    vector signed short *vbuf, *vtmp;
    vector unsigned char align;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;
        IDWTELEM b_0 = b[0];
        vector signed short v7 = vec_splat_s16(7);
        vbuf = (vector signed short *)b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1 = vec_perm(tmp1, tmp2, align);

        for (i=0; i<w_l-15; i+=16) {
#if 0
            /* b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
               b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3);
               b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3);
               b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/
            b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8);
#else

            tmp1 = vec_ld (0, ref+8+i);
            tmp2 = vec_ld (15, ref+8+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1, vec_sld(t1,t2,2));
//          y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+12+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub(*vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+12+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+16+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            vbuf++;

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            vbuf++;

#endif

        }

        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }

        align = vec_lvsl(0, b+i);
        tmp1 = vec_ld(0, b+i);
        vbuf = (vector signed int*) (dst + i);
        tmp2 = vec_ld(15, b+i);

        t1 = vec_perm(tmp1, tmp2, align);

        for (; i<w_r-3; i+=4) {

#if 0
            dst[i]   = dst[i]   - (b[i]   + b[i + 1]);
            dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]);
            dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]);
            dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]);
#else

            tmp1 = vec_ld(0, b+4+i);
            tmp2 = vec_ld(15, b+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1, vec_sld(t1,t2,4));
            *vbuf = vec_sub (*vbuf, y);

            vbuf++;

            t1 = t2;

#endif

        }

        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];
        vbuf= (vector signed int *) b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        i = 0;
        for (; i<w_l-15; i+=16) {
#if 0
            b[i]   = b[i]   - (((8 -(ref[i]   + ref[i+1])) - (b[i]  <<2)) >> 4);
            b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4);
            b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4);
            b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+8+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+8+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+12+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+16+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            t1 = t2;

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            *vbuf = vec_sub( *vbuf, y);

            vbuf++;

#endif
        }

        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }

    { // Lift 3
        DWTELEM * const src = b+w2;

        vbuf = (vector signed int *)b;
        vtmp = (vector signed int *)temp;

        i = 0;
        align = vec_lvsl(0, src);

        for (; i<w_r-3; i+=4) {
#if 0
            temp[i]   = src[i]   - ((-3*(b[i]   + b[i+1]))>>1);
            temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1);
            temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1);
            temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1);
#else
            tmp1 = vec_ld(0,src+i);
            t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4));
            tmp2 = vec_ld(15,src+i);
            t1 = vec_sub(vec_splat_s32(0),t1); //bad!
            t1 = vec_add(t1,vec_add(t1,t1));
            t2 = vec_perm(tmp1 ,tmp2 ,align);
            t1 = vec_sra(t1,vec_splat_u32(1));
            vbuf++;
            *vtmp = vec_sub(t2,t1);
            vtmp++;

#endif

        }

        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1);
    }

    {
        //Interleave
        int a;
        vector signed int *t = (vector signed int *)temp,
                          *v = (vector signed int *)b;

        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            a=i/4;

            v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+1]=vec_mergel(v[a>>1],t[a>>1]);
            v[a]=vec_mergeh(v[a>>1],t[a>>1]);

        }

    }
#endif
}

void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
{
    int i, w4 = width/4;
    vector signed int *v0, *v1,*v2,*v3,*v4,*v5;
    vector signed int t1, t2;

    v0=(vector signed int *)b0;
    v1=(vector signed int *)b1;
    v2=(vector signed int *)b2;
    v3=(vector signed int *)b3;
    v4=(vector signed int *)b4;
    v5=(vector signed int *)b5;

    for (i=0; i< w4;i++) {

#if 0
        b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
        b3[i] -= ((b2[i] + b4[i]));
        b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4;
        b1[i] += (3*(b0[i] + b2[i]))>>1;
#else
        t1 = vec_add(v3[i], v5[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        t1 = vec_add(t2, vec_splat_s32(4));
        v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3)));

        v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i]));

        t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i]));
        t2 = vec_sl(v2[i], vec_splat_u32(2));
        v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4)));
        t1 = vec_add(v0[i], v2[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1)));

#endif
    }

    for(i*=4; i < width; i++)
    {
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
}

#define LOAD_BLOCKS \
    tmp1 = vec_ld(0, &block[3][y*src_stride]);\
    align = vec_lvsl(0, &block[3][y*src_stride]);\
    tmp2 = vec_ld(15, &block[3][y*src_stride]);\
\
    b3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[2][y*src_stride]);\
    align = vec_lvsl(0, &block[2][y*src_stride]);\
    tmp2 = vec_ld(15, &block[2][y*src_stride]);\
\
    b2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[1][y*src_stride]);\
    align = vec_lvsl(0, &block[1][y*src_stride]);\
    tmp2 = vec_ld(15, &block[1][y*src_stride]);\
\
    b1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[0][y*src_stride]);\
    align = vec_lvsl(0, &block[0][y*src_stride]);\
    tmp2 = vec_ld(15, &block[0][y*src_stride]);\
\
    b0 = vec_perm(tmp1,tmp2,align);

#define LOAD_OBMCS \
    tmp1 = vec_ld(0, obmc1);\
    align = vec_lvsl(0, obmc1);\
    tmp2 = vec_ld(15, obmc1);\
\
    ob1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc2);\
    align = vec_lvsl(0, obmc2);\
    tmp2 = vec_ld(15, obmc2);\
\
    ob2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc3);\
    align = vec_lvsl(0, obmc3);\
    tmp2 = vec_ld(15, obmc3);\
\
    ob3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc4);\
    align = vec_lvsl(0, obmc4);\
    tmp2 = vec_ld(15, obmc4);\
\
    ob4 = vec_perm(tmp1,tmp2,align);

/* interleave logic
 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
 * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
 */

#define STEPS_0_1\
    h1 = (vector unsigned short)\
         vec_mergeh(ob1, ob2);\
\
    h2 = (vector unsigned short)\
         vec_mergeh(ob3, ob4);\
\
    ih = (vector unsigned char)\
         vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergeh(b3, b2);\
\
    ih1 = (vector unsigned char) vec_mergel(h1, h2);\
\
    l2 = (vector unsigned short) vec_mergeh(b1, b0);\
\
    il = (vector unsigned char) vec_mergeh(l1, l2);\
\
    v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1, l2);\
\
    v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
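
/*
 * Illustrative note (ours): STEPS_0_1 and STEPS_2_3 implement the 4-tap OBMC
 * blend as an 8-bit dot product. After the merges, each 4-byte group of
 * ih/il holds the four OBMC weights and the four block samples for one
 * destination pixel, so vec_msum(ih, il, 0) produces, per 32-bit lane,
 *
 *     ob1[x]*b3[x] + ob2[x]*b2[x] + ob3[x]*b1[x] + ob4[x]*b0[x]
 *
 * i.e. the weighted sum the scalar code would accumulate one term at a time.
 */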

#define FINAL_STEP_SCALAR\
    for(x=0; x<b_w; x++)\
        if(add){\
            vbuf[x] += dst[x + src_x];\
            vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
            if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
            dst8[x + y*src_stride] = vbuf[x];\
        }else{\
            dst[x + src_x] -= vbuf[x];\
        }

static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                  const int obmc_stride,
                                                  uint8_t * * block, int b_w,
                                                  int b_h, int src_x, int src_y,
                                                  int src_stride, slice_buffer * sb,
                                                  int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        //FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_SCALAR

    }

}

#define STEPS_2_3\
    h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
\
    h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
\
    ih = (vector unsigned char) vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergel(b3, b2);\
\
    l2 = (vector unsigned short) vec_mergel(b1, b0);\
\
    ih1 = (vector unsigned char) vec_mergel(h1,h2);\
\
    il = (vector unsigned char) vec_mergeh(l1,l2);\
\
    v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1,l2);\
\
    v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));


static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                   const int obmc_stride,
                                                   uint8_t * * block, int b_w,
                                                   int b_h, int src_x, int src_y,
                                                   int src_stride, slice_buffer * sb,
                                                   int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_SCALAR

    }
}

#define FINAL_STEP_VEC \
\
    if(add)\
    {\
        for(x=0; x<b_w/4; x++)\
        {\
            v[x] = vec_add(v[x], d[x]);\
            v[x] = vec_sra(vec_add(v[x],\
                                   vec_sl( vec_splat_s32(1),\
                                           vec_splat_u32(7))),\
                           vec_splat_u32(8));\
\
            mask = (vector bool int) vec_sl((vector signed int)\
                    vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
            mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\
\
            mask = (vector bool int)\
                    vec_cmpeq((vector signed int)mask,\
                              (vector signed int)vec_splat_u32(0));\
\
            vs = vec_sra(v[x],vec_splat_u32(8));\
            vs = vec_sra(v[x],vec_splat_u32(15));\
\
            vs = vec_nor(vs,vs);\
\
            v[x]= vec_sel(v[x],vs,mask);\
        }\
\
        for(x=0; x<b_w; x++)\
            dst8[x + y*src_stride] = vbuf[x];\
\
    }\
    else\
        for(x=0; x<b_w/4; x++)\
            d[x] = vec_sub(d[x], v[x]);

static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                    const int obmc_stride,
                                                    uint8_t * * block, int b_w,
                                                    int b_h, int src_x, int src_y,
                                                    int src_stride, slice_buffer * sb,
                                                    int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        //FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_VEC

    }

}

static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                     const int obmc_stride,
                                                     uint8_t * * block, int b_w,
                                                     int b_h, int src_x, int src_y,
                                                     int src_stride, slice_buffer * sb,
                                                     int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_VEC

    }
}


void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                      uint8_t * * block, int b_w, int b_h,
                                      int src_x, int src_y, int src_stride,
                                      slice_buffer * sb, int add,
                                      uint8_t * dst8)
{
    if (src_x&15) {
        if (b_w == 16)
            inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                   b_w, b_h, src_x, src_y,
                                                   src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                  b_w, b_h, src_x, src_y,
                                                  src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    } else {
        if (b_w == 16)
            inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                     b_w, b_h, src_x, src_y,
                                                     src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                    b_w, b_h, src_x, src_y,
                                                    src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    }
}


void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
#if 0
    c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
    c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
    c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
#endif
}

@ -0,0 +1,46 @@
/*
 * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_PPC_TYPES_ALTIVEC_H
#define AVCODEC_PPC_TYPES_ALTIVEC_H

/***********************************************************************
 * Vector types
 **********************************************************************/
#define vec_u8  vector unsigned char
#define vec_s8  vector signed char
#define vec_u16 vector unsigned short
#define vec_s16 vector signed short
#define vec_u32 vector unsigned int
#define vec_s32 vector signed int

/***********************************************************************
 * Null vector
 **********************************************************************/
#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )

#define zero_u8v  (vec_u8)  zerov
#define zero_s8v  (vec_s8)  zerov
#define zero_u16v (vec_u16) zerov
#define zero_s16v (vec_s16) zerov
#define zero_u32v (vec_u32) zerov
#define zero_s32v (vec_s32) zerov

#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */
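For illustration only (these lines are ours, not part of the imported header; `ptr` is a hypothetical 16-byte-aligned source), the shorthands read e.g.:

    LOAD_ZERO;                    /* expands to: const vec_u8 zerov = vec_splat_u8(0); */
    vec_s16 v = vec_ld(0, ptr);   /* vector signed short */
    v = vec_sub(zero_s16v, v);    /* negate using the typed zero vector */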
@ -0,0 +1,105 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/ppc/util_altivec.h
 * Contains misc utility macros and inline functions
 */

#ifndef AVCODEC_PPC_UTIL_ALTIVEC_H
#define AVCODEC_PPC_UTIL_ALTIVEC_H

#include <stdint.h>

#include "config.h"

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

// used to build registers permutation vectors (vcprm)
// the 's' are for words in the _s_econd vector
#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}

// vcprmle is used to keep the same index as in the SSE version.
// it's the same as vcprm, with the index inverted
// ('le' is Little Endian)
#define vcprmle(a,b,c,d) vcprm(d,c,b,a)

// used to build inverse/identity vectors (vcii)
// n is _n_egative, p is _p_ositive
#define FLOAT_n -1.
#define FLOAT_p 1.
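
/*
 * Illustrative note (ours): vcprm() builds a vec_perm control vector from
 * word indices, e.g.
 *
 *     vector float r = vec_perm(a, b, vcprm(0, 1, s0, s1));
 *     // r = { a[0], a[1], b[0], b[1] }
 *
 * and vcii(p,p,n,n) yields the float vector { 1., 1., -1., -1. } for
 * sign-flip multiplies.
 */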

// Transpose 8x8 matrix of 16-bit elements (in-place)
#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
do { \
    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
 \
    A1 = vec_mergeh (a, e); \
    B1 = vec_mergel (a, e); \
    C1 = vec_mergeh (b, f); \
    D1 = vec_mergel (b, f); \
    E1 = vec_mergeh (c, g); \
    F1 = vec_mergel (c, g); \
    G1 = vec_mergeh (d, h); \
    H1 = vec_mergel (d, h); \
 \
    A2 = vec_mergeh (A1, E1); \
    B2 = vec_mergel (A1, E1); \
    C2 = vec_mergeh (B1, F1); \
    D2 = vec_mergel (B1, F1); \
    E2 = vec_mergeh (C1, G1); \
    F2 = vec_mergel (C1, G1); \
    G2 = vec_mergeh (D1, H1); \
    H2 = vec_mergel (D1, H1); \
 \
    a = vec_mergeh (A2, E2); \
    b = vec_mergel (A2, E2); \
    c = vec_mergeh (B2, F2); \
    d = vec_mergel (B2, F2); \
    e = vec_mergeh (C2, G2); \
    f = vec_mergel (C2, G2); \
    g = vec_mergeh (D2, H2); \
    h = vec_mergel (D2, H2); \
} while (0)


/** \brief loads unaligned vector \a *src with offset \a offset
    and returns it */
static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
{
    register vector unsigned char first = vec_ld(offset, src);
    register vector unsigned char second = vec_ld(offset+15, src);
    register vector unsigned char mask = vec_lvsl(offset, src);
    return vec_perm(first, second, mask);
}

#endif /* AVCODEC_PPC_UTIL_ALTIVEC_H */
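A quick usage sketch (ours, not part of the import): unaligned_load() is the
standard lvsl/vec_perm idiom for AltiVec, used throughout the files in this
commit, e.g.

    vector unsigned char pix = unaligned_load(0, src);  /* src need not be 16-byte aligned */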
@ -0,0 +1,330 @@
/*
 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
 * Copyright (c) 2006 Konstantin Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"

#include "gcc_fixes.h"

#include "util_altivec.h"

// main steps of 8x8 transform
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
    t0 = vec_sl(vec_add(s0, s4), vec_2); \
    t0 = vec_add(vec_sl(t0, vec_1), t0); \
    t0 = vec_add(t0, vec_rnd); \
    t1 = vec_sl(vec_sub(s0, s4), vec_2); \
    t1 = vec_add(vec_sl(t1, vec_1), t1); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
    t2 = vec_add(t2, vec_sl(s2, vec_4)); \
    t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
    t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
    t4 = vec_add(t0, t2); \
    t5 = vec_add(t1, t3); \
    t6 = vec_sub(t1, t3); \
    t7 = vec_sub(t0, t2); \
\
    t0 = vec_sl(vec_add(s1, s3), vec_4); \
    t0 = vec_add(t0, vec_sl(s5, vec_3)); \
    t0 = vec_add(t0, vec_sl(s7, vec_2)); \
    t0 = vec_add(t0, vec_sub(s5, s3)); \
\
    t1 = vec_sl(vec_sub(s1, s5), vec_4); \
    t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
    t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
    t1 = vec_sub(t1, vec_add(s1, s7)); \
\
    t2 = vec_sl(vec_sub(s7, s3), vec_4); \
    t2 = vec_add(t2, vec_sl(s1, vec_3)); \
    t2 = vec_add(t2, vec_sl(s5, vec_2)); \
    t2 = vec_add(t2, vec_sub(s1, s7)); \
\
    t3 = vec_sl(vec_sub(s5, s7), vec_4); \
    t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s1, vec_2)); \
    t3 = vec_sub(t3, vec_add(s3, s5)); \
\
    s0 = vec_add(t4, t0); \
    s1 = vec_add(t5, t1); \
    s2 = vec_add(t6, t2); \
    s3 = vec_add(t7, t3); \
    s4 = vec_sub(t7, t3); \
    s5 = vec_sub(t6, t2); \
    s6 = vec_sub(t5, t1); \
    s7 = vec_sub(t4, t0); \
}while(0)
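
/*
 * Illustrative note (ours): AltiVec has no 32-bit vector multiply here, so
 * STEP8 builds the small VC-1 transform constants from shifts and adds,
 * e.g. for the even part:
 *
 *     t0 = (s0 + s4) << 2;      // 4*(s0+s4)
 *     t0 = (t0 << 1) + t0;      // 12*(s0+s4)  -- the vec_sl/vec_add pair above
 *
 * The same trick yields the 16, 15 and 9 factors used in the odd part.
 */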
|
||||
#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
|
||||
do { \
|
||||
s0 = vec_sra(s0, vec_3); \
|
||||
s1 = vec_sra(s1, vec_3); \
|
||||
s2 = vec_sra(s2, vec_3); \
|
||||
s3 = vec_sra(s3, vec_3); \
|
||||
s4 = vec_sra(s4, vec_3); \
|
||||
s5 = vec_sra(s5, vec_3); \
|
||||
s6 = vec_sra(s6, vec_3); \
|
||||
s7 = vec_sra(s7, vec_3); \
|
||||
}while(0)
|
||||
|
||||
#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
|
||||
do { \
|
||||
s0 = vec_sra(s0, vec_7); \
|
||||
s1 = vec_sra(s1, vec_7); \
|
||||
s2 = vec_sra(s2, vec_7); \
|
||||
s3 = vec_sra(s3, vec_7); \
|
||||
s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
|
||||
s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
|
||||
s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
|
||||
s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
|
||||
}while(0)
|
||||
|
||||
/* main steps of 4x4 transform */
|
||||
#define STEP4(s0, s1, s2, s3, vec_rnd) \
|
||||
do { \
|
||||
t1 = vec_add(vec_sl(s0, vec_4), s0); \
|
||||
t1 = vec_add(t1, vec_rnd); \
|
||||
t2 = vec_add(vec_sl(s2, vec_4), s2); \
|
||||
t0 = vec_add(t1, t2); \
|
||||
t1 = vec_sub(t1, t2); \
|
||||
t3 = vec_sl(vec_sub(s3, s1), vec_1); \
|
||||
t3 = vec_add(t3, vec_sl(t3, vec_2)); \
|
||||
t2 = vec_add(t3, vec_sl(s1, vec_5)); \
|
||||
t3 = vec_add(t3, vec_sl(s3, vec_3)); \
|
||||
t3 = vec_add(t3, vec_sl(s3, vec_2)); \
|
||||
s0 = vec_add(t0, t2); \
|
||||
s1 = vec_sub(t1, t3); \
|
||||
s2 = vec_add(t1, t3); \
|
||||
s3 = vec_sub(t0, t2); \
|
||||
}while (0)

#define SHIFT_HOR4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3);

#define SHIFT_VERT4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7);

/** Do inverse transform on 8x8 block
 */
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector signed int vec_1s = vec_splat_s32(1);
    const vector unsigned int vec_1 = vec_splat_u32(1);

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
    SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
    SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);

    vec_st(src0,   0, block);
    vec_st(src1,  16, block);
    vec_st(src2,  32, block);
    vec_st(src3,  48, block);
    vec_st(src4,  64, block);
    vec_st(src5,  80, block);
    vec_st(src6,  96, block);
    vec_st(src7, 112, block);
}

/** Do inverse transform on 8x4 part of block
 */
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector unsigned int vec_1 = vec_splat_u32(1);
    vector unsigned char tmp;
    vector signed short tmp2, tmp3;
    vector unsigned char perm0, perm1, p0, p1, p;

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    s0 = vec_unpackh(src0);
    s1 = vec_unpackh(src1);
    s2 = vec_unpackh(src2);
    s3 = vec_unpackh(src3);
    s8 = vec_unpackl(src0);
    s9 = vec_unpackl(src1);
    sA = vec_unpackl(src2);
    sB = vec_unpackl(src3);
    STEP4(s0, s1, s2, s3, vec_64);
    SHIFT_VERT4(s0, s1, s2, s3);
    STEP4(s8, s9, sA, sB, vec_64);
    SHIFT_VERT4(s8, s9, sA, sB);
    src0 = vec_pack(s0, s8);
    src1 = vec_pack(s1, s9);
    src2 = vec_pack(s2, sA);
    src3 = vec_pack(s3, sB);

    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);
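
    /* perm0/perm1 interleave 0xff bytes with the vec_lvsl() indices; in
     * the ADD macro below, vec_perm then picks the (possibly unaligned)
     * eight dest bytes into the low byte of each 16-bit lane, while the
     * 0xff indices (31 modulo 32) fetch zeros from the vec_splat_u8(0)
     * operand, i.e. the pixels are zero-extended to signed shorts. */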

#define ADD(dest,src,perm)                                              \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
    tmp = vec_ld (0, dest);                                             \
    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm);  \
    tmp3 = vec_adds (tmp2, src);                                        \
    tmp = vec_packsu (tmp3, tmp3);                                      \
    vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest);        \
    vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);

    ADD (dest, src0, perm0) dest += stride;
    ADD (dest, src1, perm1) dest += stride;
    ADD (dest, src2, perm0) dest += stride;
    ADD (dest, src3, perm1)
}


void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) {
    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
}

@@ -0,0 +1,49 @@
SubDir HAIKU_TOP src add-ons media plugins avcodec libavcodec sparc ;

SubDirHdrs [ FDirName $(SUBDIR) .. ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../.. ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../../libavutil ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../../libswscale ] ;

# filter warnings we don't want here
TARGET_WARNING_CCFLAGS = [ FFilter $(TARGET_WARNING_CCFLAGS)
	: -Wall -Wmissing-prototypes -Wsign-compare -Wpointer-arith ] ;

if $(HAIKU_GCC_VERSION[1]) >= 3 {
	SubDirCcFlags -fomit-frame-pointer -fno-pic ;
} else {
	SubDirCcFlags -fomit-frame-pointer -DPIC ;
}

local defines ;
defines = HAVE_AV_CONFIG_H=1 ;

if $(TARGET_ARCH) = x86 {
	defines += ARCH_X86=1 ARCH_X86_32=1 ARCH_PPC=0 ARCH_SPARC=0 ;
	defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
	defines += HAVE_MMX=1 HAVE_MMX2=1 HAVE_SSE=0 HAVE_SSE3=1 ;
	defines += HAVE_ALTIVEC=0 ;
	defines += HAVE_VIS=0 ;
} else if $(TARGET_ARCH) = ppc {
	defines += ARCH_X86=0 ARCH_X86_32=0 ARCH_PPC=1 ARCH_SPARC=0 ;
	defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
	defines += HAVE_MMX=0 HAVE_MMX2=0 HAVE_SSE=0 HAVE_SSE3=0 ;
	defines += HAVE_ALTIVEC=1 ;
	defines += HAVE_VIS=0 ;
} else if $(TARGET_ARCH) = sparc {
	defines += ARCH_X86=0 ARCH_X86_32=0 ARCH_PPC=0 ARCH_SPARC=1 ;
	defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
	defines += HAVE_MMX=0 HAVE_MMX2=0 HAVE_SSE=0 HAVE_SSE3=0 ;
	defines += HAVE_ALTIVEC=0 ;
	defines += HAVE_VIS=1 ;
}

defines = [ FDefines $(defines) ] ;
SubDirCcFlags $(defines) ;
SubDirC++Flags $(defines) ;

StaticLibrary libavcodec_sparc.a :
	dsputil_vis.c
	simple_idct_vis.c
;

File diff suppressed because it is too large
@@ -0,0 +1,528 @@
/*
 * SPARC VIS optimized inverse DCT
 * Copyright (c) 2007 Denes Balatoni < dbalatoni XatX interware XdotX hu >
 *
 * I did consult the following fine web page about dct
 * http://www.geocities.com/ssavekar/dct.htm
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"

static const DECLARE_ALIGNED_8(int16_t, coeffs[28]) = {
    - 1259,- 1259,- 1259,- 1259,
    - 4989,- 4989,- 4989,- 4989,
    -11045,-11045,-11045,-11045,
    -19195,-19195,-19195,-19195,
    -29126,-29126,-29126,-29126,
     25080, 25080, 25080, 25080,
     12785, 12785, 12785, 12785
};
static const DECLARE_ALIGNED_8(uint16_t, scale[4]) = {
    65536>>6, 65536>>6, 65536>>6, 65536>>6
};
static const DECLARE_ALIGNED_8(uint16_t, rounder[4]) = {
    1<<5, 1<<5, 1<<5, 1<<5
};
static const DECLARE_ALIGNED_8(uint16_t, expand[4]) = {
    1<<14, 1<<14, 1<<14, 1<<14
};

#define INIT_IDCT \
    "ldd [%1], %%f32 \n\t"\
    "ldd [%1+8], %%f34 \n\t"\
    "ldd [%1+16], %%f36 \n\t"\
    "ldd [%1+24], %%f38 \n\t"\
    "ldd [%1+32], %%f40 \n\t"\
    "ldd [%1+40], %%f42 \n\t"\
    "ldd [%1+48], %%f44 \n\t"\
    "ldd [%0], %%f46 \n\t"\
    "fzero %%f62 \n\t"\

#define LOADSCALE(in) \
    "ldd [" in "], %%f0 \n\t"\
    "ldd [" in "+16], %%f2 \n\t"\
    "ldd [" in "+32], %%f4 \n\t"\
    "ldd [" in "+48], %%f6 \n\t"\
    "ldd [" in "+64], %%f8 \n\t"\
    "ldd [" in "+80], %%f10 \n\t"\
    "ldd [" in "+96], %%f12 \n\t"\
    "ldd [" in "+112], %%f14 \n\t"\
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"\
    \
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"\
    \
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"\
    \
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"\

#define LOAD(in) \
    "ldd [" in "], %%f16 \n\t"\
    "ldd [" in "+8], %%f18 \n\t"\
    "ldd [" in "+16], %%f20 \n\t"\
    "ldd [" in "+24], %%f22 \n\t"\
    "ldd [" in "+32], %%f24 \n\t"\
    "ldd [" in "+40], %%f26 \n\t"\
    "ldd [" in "+48], %%f28 \n\t"\
    "ldd [" in "+56], %%f30 \n\t"\

#define TRANSPOSE \
    "fpmerge %%f16, %%f24, %%f0 \n\t"\
    "fpmerge %%f20, %%f28, %%f2 \n\t"\
    "fpmerge %%f17, %%f25, %%f4 \n\t"\
    "fpmerge %%f21, %%f29, %%f6 \n\t"\
    "fpmerge %%f18, %%f26, %%f8 \n\t"\
    "fpmerge %%f22, %%f30, %%f10 \n\t"\
    "fpmerge %%f19, %%f27, %%f12 \n\t"\
    "fpmerge %%f23, %%f31, %%f14 \n\t"\
    \
    "fpmerge %%f0, %%f2, %%f16 \n\t"\
    "fpmerge %%f1, %%f3, %%f18 \n\t"\
    "fpmerge %%f4, %%f6, %%f20 \n\t"\
    "fpmerge %%f5, %%f7, %%f22 \n\t"\
    "fpmerge %%f8, %%f10, %%f24 \n\t"\
    "fpmerge %%f9, %%f11, %%f26 \n\t"\
    "fpmerge %%f12, %%f14, %%f28 \n\t"\
    "fpmerge %%f13, %%f15, %%f30 \n\t"\
    \
    "fpmerge %%f16, %%f17, %%f0 \n\t"\
    "fpmerge %%f18, %%f19, %%f2 \n\t"\
    "fpmerge %%f20, %%f21, %%f4 \n\t"\
    "fpmerge %%f22, %%f23, %%f6 \n\t"\
    "fpmerge %%f24, %%f25, %%f8 \n\t"\
    "fpmerge %%f26, %%f27, %%f10 \n\t"\
    "fpmerge %%f28, %%f29, %%f12 \n\t"\
    "fpmerge %%f30, %%f31, %%f14 \n\t"\

#define IDCT4ROWS \
    /* 1. column */\
    "fmul8ulx16 %%f0, %%f38, %%f28 \n\t"\
    "for %%f4, %%f6, %%f60 \n\t"\
    "fmul8ulx16 %%f2, %%f32, %%f18 \n\t"\
    "fmul8ulx16 %%f2, %%f36, %%f22 \n\t"\
    "fmul8ulx16 %%f2, %%f40, %%f26 \n\t"\
    "fmul8ulx16 %%f2, %%f44, %%f30 \n\t"\
    \
    ADDROUNDER\
    \
    "fmul8sux16 %%f0, %%f38, %%f48 \n\t"\
    "fcmpd %%fcc0, %%f62, %%f60 \n\t"\
    "for %%f8, %%f10, %%f60 \n\t"\
    "fmul8sux16 %%f2, %%f32, %%f50 \n\t"\
    "fmul8sux16 %%f2, %%f36, %%f52 \n\t"\
    "fmul8sux16 %%f2, %%f40, %%f54 \n\t"\
    "fmul8sux16 %%f2, %%f44, %%f56 \n\t"\
    \
    "fpadd16 %%f48, %%f28, %%f28 \n\t"\
    "fcmpd %%fcc1, %%f62, %%f60 \n\t"\
    "for %%f12, %%f14, %%f60 \n\t"\
    "fpadd16 %%f50, %%f18, %%f18 \n\t"\
    "fpadd16 %%f52, %%f22, %%f22 \n\t"\
    "fpadd16 %%f54, %%f26, %%f26 \n\t"\
    "fpadd16 %%f56, %%f30, %%f30 \n\t"\
    \
    "fpadd16 %%f28, %%f0, %%f16 \n\t"\
    "fcmpd %%fcc2, %%f62, %%f60 \n\t"\
    "fpadd16 %%f28, %%f0, %%f20 \n\t"\
    "fpadd16 %%f28, %%f0, %%f24 \n\t"\
    "fpadd16 %%f28, %%f0, %%f28 \n\t"\
    "fpadd16 %%f18, %%f2, %%f18 \n\t"\
    "fpadd16 %%f22, %%f2, %%f22 \n\t"\
    /* 2. column */\
    "fbe %%fcc0, 3f \n\t"\
    "fpadd16 %%f26, %%f2, %%f26 \n\t"\
    "fmul8ulx16 %%f4, %%f34, %%f48 \n\t"\
    "fmul8ulx16 %%f4, %%f42, %%f50 \n\t"\
    "fmul8ulx16 %%f6, %%f36, %%f52 \n\t"\
    "fmul8ulx16 %%f6, %%f44, %%f54 \n\t"\
    "fmul8ulx16 %%f6, %%f32, %%f56 \n\t"\
    "fmul8ulx16 %%f6, %%f40, %%f58 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpadd16 %%f20, %%f50, %%f20 \n\t"\
    "fpsub16 %%f24, %%f50, %%f24 \n\t"\
    "fpsub16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f52, %%f18 \n\t"\
    "fpsub16 %%f22, %%f54, %%f22 \n\t"\
    "fpsub16 %%f26, %%f56, %%f26 \n\t"\
    "fpsub16 %%f30, %%f58, %%f30 \n\t"\
    \
    "fmul8sux16 %%f4, %%f34, %%f48 \n\t"\
    "fmul8sux16 %%f4, %%f42, %%f50 \n\t"\
    "fmul8sux16 %%f6, %%f36, %%f52 \n\t"\
    "fmul8sux16 %%f6, %%f44, %%f54 \n\t"\
    "fmul8sux16 %%f6, %%f32, %%f56 \n\t"\
    "fmul8sux16 %%f6, %%f40, %%f58 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpadd16 %%f20, %%f50, %%f20 \n\t"\
    "fpsub16 %%f24, %%f50, %%f24 \n\t"\
    "fpsub16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f52, %%f18 \n\t"\
    "fpsub16 %%f22, %%f54, %%f22 \n\t"\
    "fpsub16 %%f26, %%f56, %%f26 \n\t"\
    "fpsub16 %%f30, %%f58, %%f30 \n\t"\
    \
    "fpadd16 %%f16, %%f4, %%f16 \n\t"\
    "fpsub16 %%f28, %%f4, %%f28 \n\t"\
    "fpadd16 %%f18, %%f6, %%f18 \n\t"\
    "fpsub16 %%f26, %%f6, %%f26 \n\t"\
    /* 3. column */\
    "3: \n\t"\
    "fbe %%fcc1, 4f \n\t"\
    "fpsub16 %%f30, %%f6, %%f30 \n\t"\
    "fmul8ulx16 %%f8, %%f38, %%f48 \n\t"\
    "fmul8ulx16 %%f10, %%f40, %%f50 \n\t"\
    "fmul8ulx16 %%f10, %%f32, %%f52 \n\t"\
    "fmul8ulx16 %%f10, %%f44, %%f54 \n\t"\
    "fmul8ulx16 %%f10, %%f36, %%f56 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpsub16 %%f20, %%f48, %%f20 \n\t"\
    "fpsub16 %%f24, %%f48, %%f24 \n\t"\
    "fpadd16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f50, %%f18 \n\t"\
    "fpsub16 %%f22, %%f52, %%f22 \n\t"\
    "fpadd16 %%f26, %%f54, %%f26 \n\t"\
    "fpadd16 %%f30, %%f56, %%f30 \n\t"\
    \
    "fmul8sux16 %%f8, %%f38, %%f48 \n\t"\
    "fmul8sux16 %%f10, %%f40, %%f50 \n\t"\
    "fmul8sux16 %%f10, %%f32, %%f52 \n\t"\
    "fmul8sux16 %%f10, %%f44, %%f54 \n\t"\
    "fmul8sux16 %%f10, %%f36, %%f56 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpsub16 %%f20, %%f48, %%f20 \n\t"\
    "fpsub16 %%f24, %%f48, %%f24 \n\t"\
    "fpadd16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f50, %%f18 \n\t"\
    "fpsub16 %%f22, %%f52, %%f22 \n\t"\
    "fpadd16 %%f26, %%f54, %%f26 \n\t"\
    "fpadd16 %%f30, %%f56, %%f30 \n\t"\
    \
    "fpadd16 %%f16, %%f8, %%f16 \n\t"\
    "fpsub16 %%f20, %%f8, %%f20 \n\t"\
    "fpsub16 %%f24, %%f8, %%f24 \n\t"\
    "fpadd16 %%f28, %%f8, %%f28 \n\t"\
    "fpadd16 %%f18, %%f10, %%f18 \n\t"\
    "fpsub16 %%f22, %%f10, %%f22 \n\t"\
    /* 4. column */\
    "4: \n\t"\
    "fbe %%fcc2, 5f \n\t"\
    "fpadd16 %%f30, %%f10, %%f30 \n\t"\
    "fmul8ulx16 %%f12, %%f42, %%f48 \n\t"\
    "fmul8ulx16 %%f12, %%f34, %%f50 \n\t"\
    "fmul8ulx16 %%f14, %%f44, %%f52 \n\t"\
    "fmul8ulx16 %%f14, %%f40, %%f54 \n\t"\
    "fmul8ulx16 %%f14, %%f36, %%f56 \n\t"\
    "fmul8ulx16 %%f14, %%f32, %%f58 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpsub16 %%f20, %%f50, %%f20 \n\t"\
    "fpadd16 %%f24, %%f50, %%f24 \n\t"\
    "fpsub16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f52, %%f18 \n\t"\
    "fpsub16 %%f22, %%f54, %%f22 \n\t"\
    "fpadd16 %%f26, %%f56, %%f26 \n\t"\
    "fpsub16 %%f30, %%f58, %%f30 \n\t"\
    \
    "fmul8sux16 %%f12, %%f42, %%f48 \n\t"\
    "fmul8sux16 %%f12, %%f34, %%f50 \n\t"\
    "fmul8sux16 %%f14, %%f44, %%f52 \n\t"\
    "fmul8sux16 %%f14, %%f40, %%f54 \n\t"\
    "fmul8sux16 %%f14, %%f36, %%f56 \n\t"\
    "fmul8sux16 %%f14, %%f32, %%f58 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpsub16 %%f20, %%f50, %%f20 \n\t"\
    "fpadd16 %%f24, %%f50, %%f24 \n\t"\
    "fpsub16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f52, %%f18 \n\t"\
    "fpsub16 %%f22, %%f54, %%f22 \n\t"\
    "fpadd16 %%f26, %%f56, %%f26 \n\t"\
    "fpsub16 %%f30, %%f58, %%f30 \n\t"\
    \
    "fpsub16 %%f20, %%f12, %%f20 \n\t"\
    "fpadd16 %%f24, %%f12, %%f24 \n\t"\
    "fpsub16 %%f22, %%f14, %%f22 \n\t"\
    "fpadd16 %%f26, %%f14, %%f26 \n\t"\
    "fpsub16 %%f30, %%f14, %%f30 \n\t"\
    /* final butterfly */\
    "5: \n\t"\
    "fpsub16 %%f16, %%f18, %%f48 \n\t"\
    "fpsub16 %%f20, %%f22, %%f50 \n\t"\
    "fpsub16 %%f24, %%f26, %%f52 \n\t"\
    "fpsub16 %%f28, %%f30, %%f54 \n\t"\
    "fpadd16 %%f16, %%f18, %%f16 \n\t"\
    "fpadd16 %%f20, %%f22, %%f20 \n\t"\
    "fpadd16 %%f24, %%f26, %%f24 \n\t"\
    "fpadd16 %%f28, %%f30, %%f28 \n\t"\

#define STOREROWS(out) \
    "std %%f48, [" out "+112] \n\t"\
    "std %%f50, [" out "+96] \n\t"\
    "std %%f52, [" out "+80] \n\t"\
    "std %%f54, [" out "+64] \n\t"\
    "std %%f16, [" out "] \n\t"\
    "std %%f20, [" out "+16] \n\t"\
    "std %%f24, [" out "+32] \n\t"\
    "std %%f28, [" out "+48] \n\t"\

#define SCALEROWS \
    "fmul8sux16 %%f46, %%f48, %%f48 \n\t"\
    "fmul8sux16 %%f46, %%f50, %%f50 \n\t"\
    "fmul8sux16 %%f46, %%f52, %%f52 \n\t"\
    "fmul8sux16 %%f46, %%f54, %%f54 \n\t"\
    "fmul8sux16 %%f46, %%f16, %%f16 \n\t"\
    "fmul8sux16 %%f46, %%f20, %%f20 \n\t"\
    "fmul8sux16 %%f46, %%f24, %%f24 \n\t"\
    "fmul8sux16 %%f46, %%f28, %%f28 \n\t"\

#define PUTPIXELSCLAMPED(dest) \
    "fpack16 %%f48, %%f14 \n\t"\
    "fpack16 %%f50, %%f12 \n\t"\
    "fpack16 %%f16, %%f0 \n\t"\
    "fpack16 %%f20, %%f2 \n\t"\
    "fpack16 %%f24, %%f4 \n\t"\
    "fpack16 %%f28, %%f6 \n\t"\
    "fpack16 %%f54, %%f8 \n\t"\
    "fpack16 %%f52, %%f10 \n\t"\
    "st %%f0, [%3+" dest "] \n\t"\
    "st %%f2, [%5+" dest "] \n\t"\
    "st %%f4, [%6+" dest "] \n\t"\
    "st %%f6, [%7+" dest "] \n\t"\
    "st %%f8, [%8+" dest "] \n\t"\
    "st %%f10, [%9+" dest "] \n\t"\
    "st %%f12, [%10+" dest "] \n\t"\
    "st %%f14, [%11+" dest "] \n\t"\

#define ADDPIXELSCLAMPED(dest) \
    "ldd [%5], %%f18 \n\t"\
    "ld [%3+" dest"], %%f0 \n\t"\
    "ld [%6+" dest"], %%f2 \n\t"\
    "ld [%7+" dest"], %%f4 \n\t"\
    "ld [%8+" dest"], %%f6 \n\t"\
    "ld [%9+" dest"], %%f8 \n\t"\
    "ld [%10+" dest"], %%f10 \n\t"\
    "ld [%11+" dest"], %%f12 \n\t"\
    "ld [%12+" dest"], %%f14 \n\t"\
    "fmul8x16 %%f0, %%f18, %%f0 \n\t"\
    "fmul8x16 %%f2, %%f18, %%f2 \n\t"\
    "fmul8x16 %%f4, %%f18, %%f4 \n\t"\
    "fmul8x16 %%f6, %%f18, %%f6 \n\t"\
    "fmul8x16 %%f8, %%f18, %%f8 \n\t"\
    "fmul8x16 %%f10, %%f18, %%f10 \n\t"\
    "fmul8x16 %%f12, %%f18, %%f12 \n\t"\
    "fmul8x16 %%f14, %%f18, %%f14 \n\t"\
    "fpadd16 %%f0, %%f16, %%f0 \n\t"\
    "fpadd16 %%f2, %%f20, %%f2 \n\t"\
    "fpadd16 %%f4, %%f24, %%f4 \n\t"\
    "fpadd16 %%f6, %%f28, %%f6 \n\t"\
    "fpadd16 %%f8, %%f54, %%f8 \n\t"\
    "fpadd16 %%f10, %%f52, %%f10 \n\t"\
    "fpadd16 %%f12, %%f50, %%f12 \n\t"\
    "fpadd16 %%f14, %%f48, %%f14 \n\t"\
    "fpack16 %%f0, %%f0 \n\t"\
    "fpack16 %%f2, %%f2 \n\t"\
    "fpack16 %%f4, %%f4 \n\t"\
    "fpack16 %%f6, %%f6 \n\t"\
    "fpack16 %%f8, %%f8 \n\t"\
    "fpack16 %%f10, %%f10 \n\t"\
    "fpack16 %%f12, %%f12 \n\t"\
    "fpack16 %%f14, %%f14 \n\t"\
    "st %%f0, [%3+" dest "] \n\t"\
    "st %%f2, [%6+" dest "] \n\t"\
    "st %%f4, [%7+" dest "] \n\t"\
    "st %%f6, [%8+" dest "] \n\t"\
    "st %%f8, [%9+" dest "] \n\t"\
    "st %%f10, [%10+" dest "] \n\t"\
    "st %%f12, [%11+" dest "] \n\t"\
    "st %%f14, [%12+" dest "] \n\t"\


void ff_simple_idct_vis(DCTELEM *data) {
    int out1, out2, out3, out4;
    DECLARE_ALIGNED_8(int16_t, temp[8*8]);

    __asm__ volatile(
        INIT_IDCT

#define ADDROUNDER
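
/* ADDROUNDER is referenced inside IDCT4ROWS and is expanded with whatever
 * definition is active at the point of use: it stays empty throughout this
 * function, while ff_simple_idct_put_vis()/ff_simple_idct_add_vis() below
 * redefine it to an fpadd16 of the rounder constant before their second
 * (column) pass. */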

// shift right 16-4=12
        LOADSCALE("%2+8")
        IDCT4ROWS
        STOREROWS("%3+8")
        LOADSCALE("%2+0")
        IDCT4ROWS
        "std %%f48, [%3+112] \n\t"
        "std %%f50, [%3+96] \n\t"
        "std %%f52, [%3+80] \n\t"
        "std %%f54, [%3+64] \n\t"

// shift right 16+4
        "ldd [%3+8], %%f18 \n\t"
        "ldd [%3+24], %%f22 \n\t"
        "ldd [%3+40], %%f26 \n\t"
        "ldd [%3+56], %%f30 \n\t"
        TRANSPOSE
        IDCT4ROWS
        SCALEROWS
        STOREROWS("%2+0")
        LOAD("%3+64")
        TRANSPOSE
        IDCT4ROWS
        SCALEROWS
        STOREROWS("%2+8")

        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4)
        : "0" (scale), "1" (coeffs), "2" (data), "3" (temp)
    );
}

void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data) {
    int out1, out2, out3, out4, out5;
    int r1, r2, r3, r4, r5, r6, r7;

    __asm__ volatile(
        "wr %%g0, 0x8, %%gsr \n\t"

        INIT_IDCT

        "add %3, %4, %5 \n\t"
        "add %5, %4, %6 \n\t"
        "add %6, %4, %7 \n\t"
        "add %7, %4, %8 \n\t"
        "add %8, %4, %9 \n\t"
        "add %9, %4, %10 \n\t"
        "add %10, %4, %11 \n\t"

// shift right 16-4=12
        LOADSCALE("%2+8")
        IDCT4ROWS
        STOREROWS("%2+8")
        LOADSCALE("%2+0")
        IDCT4ROWS
        "std %%f48, [%2+112] \n\t"
        "std %%f50, [%2+96] \n\t"
        "std %%f52, [%2+80] \n\t"
        "std %%f54, [%2+64] \n\t"

#undef ADDROUNDER
#define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"

// shift right 16+4
        "ldd [%2+8], %%f18 \n\t"
        "ldd [%2+24], %%f22 \n\t"
        "ldd [%2+40], %%f26 \n\t"
        "ldd [%2+56], %%f30 \n\t"
        TRANSPOSE
        IDCT4ROWS
        PUTPIXELSCLAMPED("0")
        LOAD("%2+64")
        TRANSPOSE
        IDCT4ROWS
        PUTPIXELSCLAMPED("4")

        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5),
          "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
        : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size)
    );
}

void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data) {
    int out1, out2, out3, out4, out5, out6;
    int r1, r2, r3, r4, r5, r6, r7;

    __asm__ volatile(
        "wr %%g0, 0x8, %%gsr \n\t"

        INIT_IDCT

        "add %3, %4, %6 \n\t"
        "add %6, %4, %7 \n\t"
        "add %7, %4, %8 \n\t"
        "add %8, %4, %9 \n\t"
        "add %9, %4, %10 \n\t"
        "add %10, %4, %11 \n\t"
        "add %11, %4, %12 \n\t"

#undef ADDROUNDER
#define ADDROUNDER

// shift right 16-4=12
        LOADSCALE("%2+8")
        IDCT4ROWS
        STOREROWS("%2+8")
        LOADSCALE("%2+0")
        IDCT4ROWS
        "std %%f48, [%2+112] \n\t"
        "std %%f50, [%2+96] \n\t"
        "std %%f52, [%2+80] \n\t"
        "std %%f54, [%2+64] \n\t"

#undef ADDROUNDER
#define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"

// shift right 16+4
        "ldd [%2+8], %%f18 \n\t"
        "ldd [%2+24], %%f22 \n\t"
        "ldd [%2+40], %%f26 \n\t"
        "ldd [%2+56], %%f30 \n\t"
        TRANSPOSE
        IDCT4ROWS
        ADDPIXELSCLAMPED("0")
        LOAD("%2+64")
        TRANSPOSE
        IDCT4ROWS
        ADDPIXELSCLAMPED("4")

        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6),
          "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
        : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size), "5" (expand)
    );
}

@@ -0,0 +1,331 @@
/*
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* You may be asking why I hard-code the instruction opcodes and don't
 * use the normal VIS assembler mnemonics for the VIS instructions.
 *
 * The reason is that Sun, in their infinite wisdom, decided that a binary
 * using a VIS instruction will cause it to be marked (in the ELF headers)
 * as doing so, and this prevents the OS from loading such binaries if the
 * current cpu doesn't have VIS. There is no way to easily override this
 * behavior of the assembler that I am aware of.
 *
 * This totally defeats what libmpeg2 is trying to do which is allow a
 * single binary to be created, and then detect the availability of VIS
 * at runtime.
 *
 * I'm not saying that tainting the binary by default is bad, rather I'm
 * saying that not providing a way to override this easily unnecessarily
 * ties people's hands.
 *
 * Thus, we do the opcode encoding by hand and output 32-bit words in
 * the assembler to keep the binary from becoming tainted.
 */

#ifndef AVCODEC_SPARC_VIS_H
#define AVCODEC_SPARC_VIS_H

#define vis_opc_base ((0x1 << 31) | (0x36 << 19))
#define vis_opf(X) ((X) << 5)
#define vis_sreg(X) (X)
#define vis_dreg(X) (((X)&0x1f)|((X)>>5))
#define vis_rs1_s(X) (vis_sreg(X) << 14)
#define vis_rs1_d(X) (vis_dreg(X) << 14)
#define vis_rs2_s(X) (vis_sreg(X) << 0)
#define vis_rs2_d(X) (vis_dreg(X) << 0)
#define vis_rd_s(X) (vis_sreg(X) << 25)
#define vis_rd_d(X) (vis_dreg(X) << 25)
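
/* Worked example of the encoding: vis_fzero(0) (defined below) emits
 *     vis_opc_base | vis_opf(0x60) | vis_rd_d(0) == 0x81b00c00
 * i.e. op=2, op3=0x36 (IMPDEP1) and opf=0x060 in bits 13:5, which is
 * FZERO %f0. vis_dreg() folds bit 5 of a double-register number into
 * bit 0 of the 5-bit field, as SPARC V9 requires for %f32..%f62
 * (vis_dreg(32) == 1). */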

#define vis_ss2s(opf,rs1,rs2,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs1_s(rs1) | \
                                   vis_rs2_s(rs2) | \
                                   vis_rd_s(rd)))

#define vis_dd2d(opf,rs1,rs2,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs1_d(rs1) | \
                                   vis_rs2_d(rs2) | \
                                   vis_rd_d(rd)))

#define vis_ss2d(opf,rs1,rs2,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs1_s(rs1) | \
                                   vis_rs2_s(rs2) | \
                                   vis_rd_d(rd)))

#define vis_sd2d(opf,rs1,rs2,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs1_s(rs1) | \
                                   vis_rs2_d(rs2) | \
                                   vis_rd_d(rd)))

#define vis_d2s(opf,rs2,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs2_d(rs2) | \
                                   vis_rd_s(rd)))

#define vis_s2d(opf,rs2,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs2_s(rs2) | \
                                   vis_rd_d(rd)))

#define vis_d12d(opf,rs1,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs1_d(rs1) | \
                                   vis_rd_d(rd)))

#define vis_d22d(opf,rs2,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs2_d(rs2) | \
                                   vis_rd_d(rd)))

#define vis_s12s(opf,rs1,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs1_s(rs1) | \
                                   vis_rd_s(rd)))

#define vis_s22s(opf,rs2,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rs2_s(rs2) | \
                                   vis_rd_s(rd)))

#define vis_s(opf,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rd_s(rd)))

#define vis_d(opf,rd) \
        __asm__ volatile (".word %0" \
                          : : "i" (vis_opc_base | vis_opf(opf) | \
                                   vis_rd_d(rd)))

#define vis_r2m(op,rd,mem) \
        __asm__ volatile (#op "\t%%f" #rd ", [%0]" : : "r" (&(mem)) )

#define vis_r2m_2(op,rd,mem1,mem2) \
        __asm__ volatile (#op "\t%%f" #rd ", [%0 + %1]" : : "r" (mem1), "r" (mem2) )

#define vis_m2r(op,mem,rd) \
        __asm__ volatile (#op "\t[%0], %%f" #rd : : "r" (&(mem)) )

#define vis_m2r_2(op,mem1,mem2,rd) \
        __asm__ volatile (#op "\t[%0 + %1], %%f" #rd : : "r" (mem1), "r" (mem2) )

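/* vis_set_gsr() writes the Graphics Status Register: 0xa7804000 is the
 * hand-encoded  wr %g1, %g0, %gsr  (%gsr being %asr19), with the value
 * pinned to %g1 by the register-asm binding. */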
static inline void vis_set_gsr(unsigned int _val)
{
    register unsigned int val __asm__("g1");

    val = _val;
    __asm__ volatile(".word 0xa7804000"
                     : : "r" (val));
}

#define VIS_GSR_ALIGNADDR_MASK 0x0000007
#define VIS_GSR_ALIGNADDR_SHIFT 0
#define VIS_GSR_SCALEFACT_MASK 0x0000078
#define VIS_GSR_SCALEFACT_SHIFT 3

#define vis_ld32(mem,rs1) vis_m2r(ld, mem, rs1)
#define vis_ld32_2(mem1,mem2,rs1) vis_m2r_2(ld, mem1, mem2, rs1)
#define vis_st32(rs1,mem) vis_r2m(st, rs1, mem)
#define vis_st32_2(rs1,mem1,mem2) vis_r2m_2(st, rs1, mem1, mem2)
#define vis_ld64(mem,rs1) vis_m2r(ldd, mem, rs1)
#define vis_ld64_2(mem1,mem2,rs1) vis_m2r_2(ldd, mem1, mem2, rs1)
#define vis_st64(rs1,mem) vis_r2m(std, rs1, mem)
#define vis_st64_2(rs1,mem1,mem2) vis_r2m_2(std, rs1, mem1, mem2)
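
/* The hard-coded words in vis_ldblk/vis_stblk below are the FP block
 * transfers LDDFA/STDFA with ASI_BLK_P (0xf0) and %g1 as the address
 * register, i.e.  ldda [%g1] 0xf0, %fN  and  stda %fN, [%g1] 0xf0,
 * with the data register ORed in via vis_rd_d(). */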

#define vis_ldblk(mem, rd) \
do {    register void *__mem __asm__("g1"); \
        __mem = &(mem); \
        __asm__ volatile(".word 0xc1985e00 | %1" \
                         : \
                         : "r" (__mem), \
                           "i" (vis_rd_d(rd)) \
                         : "memory"); \
} while (0)

#define vis_stblk(rd, mem) \
do {    register void *__mem __asm__("g1"); \
        __mem = &(mem); \
        __asm__ volatile(".word 0xc1b85e00 | %1" \
                         : \
                         : "r" (__mem), \
                           "i" (vis_rd_d(rd)) \
                         : "memory"); \
} while (0)

#define vis_membar_storestore() \
        __asm__ volatile(".word 0x8143e008" : : : "memory")

#define vis_membar_sync() \
        __asm__ volatile(".word 0x8143e040" : : : "memory")

/* 16 and 32 bit partitioned addition and subtraction. The normal
 * versions perform 4 16-bit or 2 32-bit additions or subtractions.
 * The 's' versions perform 2 16-bit or 1 32-bit additions or
 * subtractions.
 */
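
/* For instance, vis_padd16(0, 2, 4) assembles  fpadd16 %f0, %f2, %f4
 * (opf 0x050) as a raw .word, so no VIS mnemonic ever reaches the
 * assembler. */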

#define vis_padd16(rs1,rs2,rd) vis_dd2d(0x50, rs1, rs2, rd)
#define vis_padd16s(rs1,rs2,rd) vis_ss2s(0x51, rs1, rs2, rd)
#define vis_padd32(rs1,rs2,rd) vis_dd2d(0x52, rs1, rs2, rd)
#define vis_padd32s(rs1,rs2,rd) vis_ss2s(0x53, rs1, rs2, rd)
#define vis_psub16(rs1,rs2,rd) vis_dd2d(0x54, rs1, rs2, rd)
#define vis_psub16s(rs1,rs2,rd) vis_ss2s(0x55, rs1, rs2, rd)
#define vis_psub32(rs1,rs2,rd) vis_dd2d(0x56, rs1, rs2, rd)
#define vis_psub32s(rs1,rs2,rd) vis_ss2s(0x57, rs1, rs2, rd)

/* Pixel formatting instructions. */

#define vis_pack16(rs2,rd) vis_d2s( 0x3b, rs2, rd)
#define vis_pack32(rs1,rs2,rd) vis_dd2d(0x3a, rs1, rs2, rd)
#define vis_packfix(rs2,rd) vis_d2s( 0x3d, rs2, rd)
#define vis_expand(rs2,rd) vis_s2d( 0x4d, rs2, rd)
#define vis_pmerge(rs1,rs2,rd) vis_ss2d(0x4b, rs1, rs2, rd)

/* Partitioned multiply instructions. */

#define vis_mul8x16(rs1,rs2,rd) vis_sd2d(0x31, rs1, rs2, rd)
#define vis_mul8x16au(rs1,rs2,rd) vis_ss2d(0x33, rs1, rs2, rd)
#define vis_mul8x16al(rs1,rs2,rd) vis_ss2d(0x35, rs1, rs2, rd)
#define vis_mul8sux16(rs1,rs2,rd) vis_dd2d(0x36, rs1, rs2, rd)
#define vis_mul8ulx16(rs1,rs2,rd) vis_dd2d(0x37, rs1, rs2, rd)
#define vis_muld8sux16(rs1,rs2,rd) vis_ss2d(0x38, rs1, rs2, rd)
#define vis_muld8ulx16(rs1,rs2,rd) vis_ss2d(0x39, rs1, rs2, rd)

/* Alignment instructions. */

static inline void *vis_alignaddr(void *_ptr)
{
    register void *ptr __asm__("g1");

    ptr = _ptr;

    __asm__ volatile(".word %2"
                     : "=&r" (ptr)
                     : "0" (ptr),
                       "i" (vis_opc_base | vis_opf(0x18) |
                            vis_rs1_s(1) |
                            vis_rs2_s(0) |
                            vis_rd_s(1)));

    return ptr;
}

static inline void vis_alignaddr_g0(void *_ptr)
{
    register void *ptr __asm__("g1");

    ptr = _ptr;

    __asm__ volatile(".word %2"
                     : "=&r" (ptr)
                     : "0" (ptr),
                       "i" (vis_opc_base | vis_opf(0x18) |
                            vis_rs1_s(1) |
                            vis_rs2_s(0) |
                            vis_rd_s(0)));
}

static inline void *vis_alignaddrl(void *_ptr)
{
    register void *ptr __asm__("g1");

    ptr = _ptr;

    __asm__ volatile(".word %2"
                     : "=&r" (ptr)
                     : "0" (ptr),
                       "i" (vis_opc_base | vis_opf(0x19) |
                            vis_rs1_s(1) |
                            vis_rs2_s(0) |
                            vis_rd_s(1)));

    return ptr;
}

static inline void vis_alignaddrl_g0(void *_ptr)
{
    register void *ptr __asm__("g1");

    ptr = _ptr;

    __asm__ volatile(".word %2"
                     : "=&r" (ptr)
                     : "0" (ptr),
                       "i" (vis_opc_base | vis_opf(0x19) |
                            vis_rs1_s(1) |
                            vis_rs2_s(0) |
                            vis_rd_s(0)));
}

#define vis_faligndata(rs1,rs2,rd) vis_dd2d(0x48, rs1, rs2, rd)

/* Logical operate instructions. */

#define vis_fzero(rd) vis_d( 0x60, rd)
#define vis_fzeros(rd) vis_s( 0x61, rd)
#define vis_fone(rd) vis_d( 0x7e, rd)
#define vis_fones(rd) vis_s( 0x7f, rd)
#define vis_src1(rs1,rd) vis_d12d(0x74, rs1, rd)
#define vis_src1s(rs1,rd) vis_s12s(0x75, rs1, rd)
#define vis_src2(rs2,rd) vis_d22d(0x78, rs2, rd)
#define vis_src2s(rs2,rd) vis_s22s(0x79, rs2, rd)
#define vis_not1(rs1,rd) vis_d12d(0x6a, rs1, rd)
#define vis_not1s(rs1,rd) vis_s12s(0x6b, rs1, rd)
#define vis_not2(rs2,rd) vis_d22d(0x66, rs2, rd)
#define vis_not2s(rs2,rd) vis_s22s(0x67, rs2, rd)
#define vis_or(rs1,rs2,rd) vis_dd2d(0x7c, rs1, rs2, rd)
#define vis_ors(rs1,rs2,rd) vis_ss2s(0x7d, rs1, rs2, rd)
#define vis_nor(rs1,rs2,rd) vis_dd2d(0x62, rs1, rs2, rd)
#define vis_nors(rs1,rs2,rd) vis_ss2s(0x63, rs1, rs2, rd)
#define vis_and(rs1,rs2,rd) vis_dd2d(0x70, rs1, rs2, rd)
#define vis_ands(rs1,rs2,rd) vis_ss2s(0x71, rs1, rs2, rd)
#define vis_nand(rs1,rs2,rd) vis_dd2d(0x6e, rs1, rs2, rd)
#define vis_nands(rs1,rs2,rd) vis_ss2s(0x6f, rs1, rs2, rd)
#define vis_xor(rs1,rs2,rd) vis_dd2d(0x6c, rs1, rs2, rd)
#define vis_xors(rs1,rs2,rd) vis_ss2s(0x6d, rs1, rs2, rd)
#define vis_xnor(rs1,rs2,rd) vis_dd2d(0x72, rs1, rs2, rd)
#define vis_xnors(rs1,rs2,rd) vis_ss2s(0x73, rs1, rs2, rd)
#define vis_ornot1(rs1,rs2,rd) vis_dd2d(0x7a, rs1, rs2, rd)
#define vis_ornot1s(rs1,rs2,rd) vis_ss2s(0x7b, rs1, rs2, rd)
#define vis_ornot2(rs1,rs2,rd) vis_dd2d(0x76, rs1, rs2, rd)
#define vis_ornot2s(rs1,rs2,rd) vis_ss2s(0x77, rs1, rs2, rd)
#define vis_andnot1(rs1,rs2,rd) vis_dd2d(0x68, rs1, rs2, rd)
#define vis_andnot1s(rs1,rs2,rd) vis_ss2s(0x69, rs1, rs2, rd)
#define vis_andnot2(rs1,rs2,rd) vis_dd2d(0x64, rs1, rs2, rd)
#define vis_andnot2s(rs1,rs2,rd) vis_ss2s(0x65, rs1, rs2, rd)

/* Pixel component distance. */

#define vis_pdist(rs1,rs2,rd) vis_dd2d(0x3e, rs1, rs2, rd)

#endif /* AVCODEC_SPARC_VIS_H */