* Imported ppc and sparc architectures optimized codes from ffmpeg 0.5.

* Added Jamfiles to build libavcodec_ppc.a and libavcodec_sparc.a. 
  UNTESTED.



git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@30187 a95241bf-73f2-0310-859d-f6bbb57e9c96
This commit is contained in:
Philippe Houdoin 2009-04-15 23:38:05 +00:00
parent 464e95c43f
commit 8a3c8a66b3
27 changed files with 12549 additions and 0 deletions

View File

@ -0,0 +1,62 @@
# Jamfile for the PowerPC specific part of the bundled libavcodec.
# It sets up header search paths and compiler flags, derives the FFmpeg
# ARCH_*/HAVE_* configuration defines from TARGET_ARCH and builds the
# static library libavcodec_ppc.a from the sources in this directory.
SubDir HAIKU_TOP src add-ons media plugins avcodec libavcodec ppc ;
# Header search paths: the parent directory, the directory two levels up,
# and the libavutil / libswscale directories found there.
SubDirHdrs [ FDirName $(SUBDIR) .. ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../.. ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../../libavutil ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../../libswscale ] ;
# filter warnings we don't want here
TARGET_WARNING_CCFLAGS = [ FFilter $(TARGET_WARNING_CCFLAGS)
: -Wall -Wmissing-prototypes -Wsign-compare -Wpointer-arith ] ;
# GCC 3 and later get -fno-pic; older compilers get -DPIC instead.
if $(HAIKU_GCC_VERSION[1]) >= 3 {
SubDirCcFlags -fomit-frame-pointer -fno-pic ;
} else {
SubDirCcFlags -fomit-frame-pointer -DPIC ;
}
# Configuration macros normally provided by FFmpeg's configure step.
local defines ;
defines = HAVE_AV_CONFIG_H=1 ;
# Exactly one ARCH_* macro is set to 1 per supported target architecture.
# No defines are added for any other TARGET_ARCH value.
if $(TARGET_ARCH) = x86 {
defines += ARCH_X86=1 ARCH_X86_32=1 ARCH_PPC=0 ARCH_SPARC=0 ;
defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
# NOTE(review): HAVE_SSE=0 together with HAVE_SSE3=1 looks inconsistent - confirm
defines += HAVE_MMX=1 HAVE_MMX2=1 HAVE_SSE=0 HAVE_SSE3=1 ;
defines += HAVE_ALTIVEC=0 ;
defines += HAVE_VIS=0 ;
} else if $(TARGET_ARCH) = ppc {
defines += ARCH_X86=0 ARCH_X86_32=0 ARCH_PPC=1 ARCH_SPARC=0 ;
defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
defines += HAVE_MMX=0 HAVE_MMX2=0 HAVE_SSE=0 HAVE_SSE3=0 ;
defines += HAVE_ALTIVEC=1 ;
defines += HAVE_VIS=0 ;
} else if $(TARGET_ARCH) = sparc {
defines += ARCH_X86=0 ARCH_X86_32=0 ARCH_PPC=0 ARCH_SPARC=1 ;
defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
defines += HAVE_MMX=0 HAVE_MMX2=0 HAVE_SSE=0 HAVE_SSE3=0 ;
defines += HAVE_ALTIVEC=0 ;
defines += HAVE_VIS=1 ;
}
# Turn the NAME=VALUE list into compiler options and apply them to both
# the C and the C++ compilations of this directory.
defines = [ FDefines $(defines) ] ;
SubDirCcFlags $(defines) ;
SubDirC++Flags $(defines) ;
# Sources making up the PowerPC/AltiVec static library.
StaticLibrary libavcodec_ppc.a :
check_altivec.c
float_altivec.c
int_altivec.c
dsputil_altivec.c
dsputil_ppc.c
fdct_altivec.c
fft_altivec.c
idct_altivec.c
gmc_altivec.c
imgresample_altivec.c
h264_altivec.c
# Not compiled on its own; presumably #included by h264_altivec.c - TODO confirm
# h264_template_altivec.c
mpegvideo_altivec.c
vc1dsp_altivec.c
snow_altivec.c
;

View File

@ -0,0 +1,84 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file libavcodec/ppc/check_altivec.c
* Checks for AltiVec presence.
*/
#ifdef __APPLE__
#undef _POSIX_C_SOURCE
#include <sys/sysctl.h>
#elif defined(__OpenBSD__)
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#elif defined(__AMIGAOS4__)
#include <exec/exec.h>
#include <interfaces/exec.h>
#include <proto/exec.h>
#endif /* __APPLE__ */
/**
 * Reports whether an AltiVec vector unit is available.
 *
 * The detection method is selected at compile time:
 *  - AmigaOS 4: asks exec for the vector unit type;
 *  - Mac OS X / OpenBSD: queries the vector unit through sysctl();
 *  - RUNTIME_CPUDETECT: reads the processor version register and compares
 *    it against a list of accepted versions;
 *  - otherwise: the build was made for AltiVec, so it is assumed present.
 *
 * @return 1 when AltiVec is (assumed to be) available, 0 otherwise
 */
int has_altivec(void)
{
#if defined(__AMIGAOS4__)
    extern struct ExecIFace *IExec;
    ULONG vector_unit = 0;

    IExec->GetCPUInfoTags(GCIT_VectorUnit, &vector_unit, TAG_DONE);
    return vector_unit == VECTORTYPE_ALTIVEC;
#elif defined(__APPLE__) || defined(__OpenBSD__)
#if defined(__OpenBSD__)
    int mib[2] = {CTL_MACHDEP, CPU_ALTIVEC};
#else
    int mib[2] = {CTL_HW, HW_VECTORUNIT};
#endif
    int vector_unit = 0;
    size_t size = sizeof(vector_unit);

    /* a failing sysctl() is treated as "no AltiVec" */
    if (sysctl(mib, 2, &vector_unit, &size, NULL, 0) != 0)
        return 0;
    return vector_unit != 0;
#elif defined(RUNTIME_CPUDETECT)
    int pvr;

    // Support of mfspr PVR emulation added in Linux 2.6.17.
    __asm__ volatile("mfspr %0, 287" : "=r" (pvr));
    /* only the upper half of the register is compared below */
    pvr >>= 16;
    return (pvr & 0x8000) != 0 ||
           pvr == 0x000c ||
           pvr == 0x0039 || pvr == 0x003c ||
           pvr == 0x0044 || pvr == 0x0045 ||
           pvr == 0x0070;
#else
    // Compiled for AltiVec, so simply assume it is there until a proper
    // detection method (not involving signal hacks) is available.
    return 1;
#endif /* __AMIGAOS4__ */
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,34 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H
#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H
#include <stdint.h>
/* Returns non-zero when an AltiVec unit is detected; defined in check_altivec.c. */
int has_altivec(void);
/* 16-pixel-wide block routines taking destination, source, line size and
 * height. NOTE(review): judging by the names, "put" stores and "avg" averages
 * into the destination; the definitions are not visible here - confirm. */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h);
#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */

View File

@ -0,0 +1,306 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/dsputil.h"
#include "dsputil_ppc.h"
#include "dsputil_altivec.h"
/* Prototypes of the AltiVec routines and init functions that
 * dsputil_init_ppc() installs or calls; they are defined in other source
 * files of this directory (fdct_altivec() lives in fdct_altivec.c). */
void fdct_altivec(int16_t *block);
void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
int x16, int y16, int rounder);
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx);
void float_init_altivec(DSPContext* c, AVCodecContext *avctx);
void int_init_altivec(DSPContext* c, AVCodecContext *avctx);
/* Global CPU capability flags; dsputil_init_ppc() ORs FF_MM_ALTIVEC into it
 * when AltiVec is detected. */
int mm_flags = 0;
/**
 * Returns the CPU capability flags supported at runtime.
 *
 * @return FF_MM_ALTIVEC when the build has HAVE_ALTIVEC enabled and
 *         has_altivec() reports a vector unit, 0 otherwise
 */
int mm_support(void)
{
#if HAVE_ALTIVEC
    if (has_altivec())
        return FF_MM_ALTIVEC;
#endif /* HAVE_ALTIVEC */
    return 0;
}
#if CONFIG_POWERPC_PERF
/* Accumulated performance counter statistics, indexed as
 * [PMC number][enum powerpc_perf_index][enum powerpc_data_index]. */
unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* Human readable names of the measured functions, printed with "%s" by
 * powerpc_display_perf_report().
 * The list below must match enum powerpc_perf_index in dsputil_ppc.h.
 * The element type is plain (const) char: string literals are arrays of
 * char, so the former "unsigned char*" only produced pointer-signedness
 * mismatches. */
static const char *const perfname[] = {
    "ff_fft_calc_altivec",
    "gmc1_altivec",
    "dct_unquantize_h263_altivec",
    "fdct_altivec",
    "idct_add_altivec",
    "idct_put_altivec",
    "put_pixels16_altivec",
    "avg_pixels16_altivec",
    "avg_pixels8_altivec",
    "put_pixels8_xy2_altivec",
    "put_no_rnd_pixels8_xy2_altivec",
    "put_pixels16_xy2_altivec",
    "put_no_rnd_pixels16_xy2_altivec",
    "hadamard8_diff8x8_altivec",
    "hadamard8_diff16_altivec",
    "avg_pixels8_xy2_altivec",
    "clear_blocks_dcbz32_ppc",
    "clear_blocks_dcbz128_ppc",
    "put_h264_chroma_mc8_altivec",
    "avg_h264_chroma_mc8_altivec",
    "put_h264_qpel16_h_lowpass_altivec",
    "avg_h264_qpel16_h_lowpass_altivec",
    "put_h264_qpel16_v_lowpass_altivec",
    "avg_h264_qpel16_v_lowpass_altivec",
    "put_h264_qpel16_hv_lowpass_altivec",
    "avg_h264_qpel16_hv_lowpass_altivec",
    ""
};
#include <stdio.h>
#endif
#if CONFIG_POWERPC_PERF
/**
 * Logs the collected performance counter statistics through av_log().
 *
 * For every measured function and every enabled PMC that recorded at least
 * one sample, the minimum, maximum and average value and the number of
 * samples are printed.
 */
void powerpc_display_perf_report(void)
{
    int i, j;
    av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
    for (i = 0; i < powerpc_perf_total; i++) {
        for (j = 0; j < POWERPC_NUM_PMC_ENABLED; j++) {
            /* skip counters without samples; this also avoids dividing by 0 */
            if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0)
                /* perfdata holds unsigned long long, so the matching
                 * conversion is %llu; PRIu64 is only correct for uint64_t,
                 * which is a different type on some 64-bit ABIs */
                av_log(NULL, AV_LOG_INFO,
                       " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
                       perfname[i],
                       j+1,
                       perfdata[j][i][powerpc_data_min],
                       perfdata[j][i][powerpc_data_max],
                       (double)perfdata[j][i][powerpc_data_sum] /
                       (double)perfdata[j][i][powerpc_data_num],
                       perfdata[j][i][powerpc_data_num]);
        }
    }
}
#endif /* CONFIG_POWERPC_PERF */
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
cache line size not equal to 32 bytes.
Fortunately all processors used by Apple up to at least the 7450 (aka second
generation G4) use 32-byte cache lines.
This is due to the use of the 'dcbz' instruction. It simply clears a
single cache line to zero, so you need to know the cache line size to use it!
It's absurd, but it's fast...
update 24/06/2003: Apple released the G5 yesterday, with a PPC970. Cache line
size: 128 bytes. Oops.
The semantics of dcbz were changed: it always clears 32 bytes, so the function
below will work, but will be slow. So I fixed check_dcbzl_effect to use dcbzl,
which is defined to clear a cache line (as dcbz did before). So we can still
distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
see <http://developer.apple.com/technotes/tn/tn2087.html>
and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/**
 * Zeroes sizeof(DCTELEM)*6*64 bytes at blocks, using the dcbz instruction
 * on 32-byte units wherever possible.
 *
 * When blocks sits 16 bytes past a 32-byte boundary, the first and the last
 * 16 bytes are cleared with ordinary stores and dcbz handles the 32-byte
 * aligned middle part.
 *
 * The head and tail used to be cleared through four "unsigned long" stores
 * at fixed indices (0..3 and 188..191). That is only correct when
 * unsigned long is 4 bytes wide; with an 8-byte long the tail indices point
 * far beyond the end of the buffer. They are now cleared by byte count.
 *
 * @param blocks coefficient buffer to clear; assumed to be at least 16-byte
 *               aligned, as only address bit 4 is examined -- TODO confirm
 *               against callers
 */
void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
    POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
    /* non-zero when blocks is 16 bytes off a 32-byte boundary */
    register int misal = ((unsigned long)blocks & 0x00000010);
    register int i = 0;
    POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
    if (misal) {
        /* leading 16 bytes that dcbz cannot cover */
        memset(blocks, 0, 16);
        i += 16;
    }
    /* "-31" stops the loop before a unit that would cross the buffer end */
    for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
        __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }
    if (misal) {
        /* trailing 16 bytes left over after the last full 32-byte unit */
        memset((char *)blocks + sizeof(DCTELEM)*6*64 - 16, 0, 16);
    }
    POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
}
/* Variant of clear_blocks_dcbz32_ppc for CPUs on which dcbzl clears a whole
   128-byte cache line, i.e. the PPC970 aka G5.
   Without HAVE_DCBZL this is a plain memset(). */
#if HAVE_DCBZL
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
    POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
    /* non-zero when blocks does not start on a 128-byte boundary */
    register int unaligned = ((unsigned long)blocks & 0x0000007f);
    register int offset = 0;
    POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
    if (!unaligned) {
        /* one dcbzl per 128-byte unit of the buffer */
        while (offset < sizeof(DCTELEM)*6*64) {
            __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (offset) : "memory");
            offset += 128;
        }
    } else {
        // the unaligned case could probably be optimized as well, but it
        // was not considered worthwhile when this was written (2003-06-26)
        memset(blocks, 0, sizeof(DCTELEM)*6*64);
    }
    POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
}
#else
void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
#endif
#if HAVE_DCBZL
/* check dcbz report how many bytes are set to 0 by dcbz */
/* update 24/06/2003 : replace dcbz by dcbzl to get
   the intended effect (Apple "fixed" dcbz)
   unfortunately this cannot be used unless the assembler
   knows about dcbzl ... */
/**
 * Measures how many bytes a single dcbzl instruction clears.
 *
 * A 1024-byte buffer is filled with 0xFF, one dcbzl is issued on its middle
 * and the zero bytes are counted afterwards.
 *
 * @return the number of bytes found zeroed, or 0 when the buffer could not
 *         be allocated (and always 0 when built without HAVE_DCBZL)
 */
long check_dcbzl_effect(void)
{
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0;
    register long i = 0;
    long count = 0;
    if (!fakedata) {
        return 0L;
    }
    fakedata_middle = (fakedata + 512);
    memset(fakedata, 0xFF, 1024);
    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so....
       The "memory" clobber tells the compiler that the instruction modifies
       memory; without it the compiler may assume the buffer still contains
       the 0xFF pattern written by memset() and optimize the count away. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");
    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }
    av_free(fakedata);
    return count;
}
#else
long check_dcbzl_effect(void)
{
    return 0;
}
#endif
/**
 * Issues one dcbt (data cache block touch) per row.
 *
 * @param mem    address of the first row
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows; the loop body always runs at least once
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    register const uint8_t *row = mem;
    do {
        __asm__ volatile ("dcbt 0,%0" : : "r" (row));
        row += stride;
        h--;
    } while (h);
}
/**
 * Installs the PowerPC specific routines into a DSPContext.
 *
 * The prefetch and clear_blocks routines are set up regardless of AltiVec.
 * When built with HAVE_ALTIVEC, the H.264 init function is called and, if
 * has_altivec() reports a vector unit, the AltiVec init functions are run
 * and the gmc1, fdct and idct pointers are replaced according to the
 * settings found in avctx.
 *
 * @param c     DSP context whose function pointers are filled in
 * @param avctx codec context; dct_algo, idct_algo and lowres are read here
 */
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
// Common optimizations whether AltiVec is available or not
c->prefetch = prefetch_ppc;
/* Pick the clear_blocks variant matching the number of bytes dcbzl clears;
   for any other result c->clear_blocks is left untouched. */
switch (check_dcbzl_effect()) {
case 32:
c->clear_blocks = clear_blocks_dcbz32_ppc;
break;
case 128:
c->clear_blocks = clear_blocks_dcbz128_ppc;
break;
default:
break;
}
#if HAVE_ALTIVEC
/* called before the has_altivec() check below */
if(CONFIG_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
if (has_altivec()) {
/* record the capability in the global flags */
mm_flags |= FF_MM_ALTIVEC;
dsputil_init_altivec(c, avctx);
if(CONFIG_SNOW_DECODER) snow_init_altivec(c, avctx);
if(CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER)
vc1dsp_init_altivec(c, avctx);
float_init_altivec(c, avctx);
int_init_altivec(c, avctx);
c->gmc1 = gmc1_altivec;
#if CONFIG_ENCODERS
/* the forward DCT is only replaced when the user did not ask for another one */
if (avctx->dct_algo == FF_DCT_AUTO ||
avctx->dct_algo == FF_DCT_ALTIVEC) {
c->fdct = fdct_altivec;
}
#endif //CONFIG_ENCODERS
/* the AltiVec IDCT is only installed when lowres is 0 */
if (avctx->lowres==0) {
if ((avctx->idct_algo == FF_IDCT_AUTO) ||
(avctx->idct_algo == FF_IDCT_ALTIVEC)) {
c->idct_put = idct_put_altivec;
c->idct_add = idct_add_altivec;
/* the permutation type is switched together with the IDCT */
c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
}
}
#if CONFIG_POWERPC_PERF
{
/* Reset the statistics: min starts at the largest value so that the first
   sample always replaces it; max, sum and num start at zero. */
int i, j;
for (i = 0 ; i < powerpc_perf_total ; i++) {
for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) {
perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL;
perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL;
perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL;
perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL;
}
}
}
#endif /* CONFIG_POWERPC_PERF */
}
#endif /* HAVE_ALTIVEC */
}

View File

@ -0,0 +1,154 @@
/*
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_DSPUTIL_PPC_H
#define AVCODEC_PPC_DSPUTIL_PPC_H
#include "config.h"
#if CONFIG_POWERPC_PERF
void powerpc_display_perf_report(void);
/* the 604* have 2, the G3* have 4, the G4s have 6,
and the G5 are completely different (they MUST use
HAVE_PPC64, and let's hope all future 64-bit PPCs
will use the same PMCs...) */
#define POWERPC_NUM_PMC_ENABLED 6
/* if you add to the enum below, also add to the perfname array
in dsputil_ppc.c */
/* Identifies a measured function; used as the second index of perfdata and
   as the index into the perfname array. powerpc_perf_total is the number of
   entries and must stay last. */
enum powerpc_perf_index {
altivec_fft_num = 0,
altivec_gmc1_num,
altivec_dct_unquantize_h263_num,
altivec_fdct,
altivec_idct_add_num,
altivec_idct_put_num,
altivec_put_pixels16_num,
altivec_avg_pixels16_num,
altivec_avg_pixels8_num,
altivec_put_pixels8_xy2_num,
altivec_put_no_rnd_pixels8_xy2_num,
altivec_put_pixels16_xy2_num,
altivec_put_no_rnd_pixels16_xy2_num,
altivec_hadamard8_diff8x8_num,
altivec_hadamard8_diff16_num,
altivec_avg_pixels8_xy2_num,
powerpc_clear_blocks_dcbz32,
powerpc_clear_blocks_dcbz128,
altivec_put_h264_chroma_mc8_num,
altivec_avg_h264_chroma_mc8_num,
altivec_put_h264_qpel16_h_lowpass_num,
altivec_avg_h264_qpel16_h_lowpass_num,
altivec_put_h264_qpel16_v_lowpass_num,
altivec_avg_h264_qpel16_v_lowpass_num,
altivec_put_h264_qpel16_hv_lowpass_num,
altivec_avg_h264_qpel16_hv_lowpass_num,
powerpc_perf_total
};
/* Statistic kept per function and PMC; used as the third index of perfdata.
   powerpc_data_total is the number of entries and must stay last. */
enum powerpc_data_index {
powerpc_data_min = 0,
powerpc_data_max,
powerpc_data_sum,
powerpc_data_num,
powerpc_data_total
};
/* Statistics storage, indexed [PMC][powerpc_perf_index][powerpc_data_index];
   updated by POWERPC_PERF_STOP_COUNT. */
extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
/* Accessors for the performance monitor counters.
   POWERPC_GET_PMCn(a) reads counter n into a with mfspr; the register numbers
   differ between the !HAVE_PPC64 and the HAVE_PPC64 build. Counters above
   POWERPC_NUM_PMC_ENABLED expand to an empty statement, leaving a untouched.
   POWERP_PMC_DATATYPE is the integer type used to hold one counter value. */
#if !HAVE_PPC64
#define POWERP_PMC_DATATYPE unsigned long
#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 937" : "=r" (a))
#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 938" : "=r" (a))
#if (POWERPC_NUM_PMC_ENABLED > 2)
#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 941" : "=r" (a))
#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 942" : "=r" (a))
#else
#define POWERPC_GET_PMC3(a) do {} while (0)
#define POWERPC_GET_PMC4(a) do {} while (0)
#endif
#if (POWERPC_NUM_PMC_ENABLED > 4)
#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 929" : "=r" (a))
#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 930" : "=r" (a))
#else
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
#else /* HAVE_PPC64 */
/* the HAVE_PPC64 build uses a wider counter type and register numbers 771 to 776 */
#define POWERP_PMC_DATATYPE unsigned long long
#define POWERPC_GET_PMC1(a) __asm__ volatile("mfspr %0, 771" : "=r" (a))
#define POWERPC_GET_PMC2(a) __asm__ volatile("mfspr %0, 772" : "=r" (a))
#if (POWERPC_NUM_PMC_ENABLED > 2)
#define POWERPC_GET_PMC3(a) __asm__ volatile("mfspr %0, 773" : "=r" (a))
#define POWERPC_GET_PMC4(a) __asm__ volatile("mfspr %0, 774" : "=r" (a))
#else
#define POWERPC_GET_PMC3(a) do {} while (0)
#define POWERPC_GET_PMC4(a) do {} while (0)
#endif
#if (POWERPC_NUM_PMC_ENABLED > 4)
#define POWERPC_GET_PMC5(a) __asm__ volatile("mfspr %0, 775" : "=r" (a))
#define POWERPC_GET_PMC6(a) __asm__ volatile("mfspr %0, 776" : "=r" (a))
#else
#define POWERPC_GET_PMC5(a) do {} while (0)
#define POWERPC_GET_PMC6(a) do {} while (0)
#endif
#endif /* HAVE_PPC64 */
/* Declares the locals used by POWERPC_PERF_START_COUNT and
   POWERPC_PERF_STOP_COUNT: the start/stop snapshot of every enabled PMC and
   the loop index. Both arguments are unused here.
   The expansion deliberately carries no trailing semicolon: every user
   writes "POWERPC_PERF_DECLARE(x, y);", which the non-perf variant of this
   macro already requires, so a semicolon in here would leave an empty
   statement in the middle of the caller's declarations. */
#define POWERPC_PERF_DECLARE(a, cond) \
POWERP_PMC_DATATYPE \
pmc_start[POWERPC_NUM_PMC_ENABLED], \
pmc_stop[POWERPC_NUM_PMC_ENABLED], \
pmc_loop_index
/* Takes the start snapshot of all PMCs into pmc_start[]. The counters are
   read from PMC6 down to PMC1, the reverse of the order used when stopping.
   Both arguments are unused here. */
#define POWERPC_PERF_START_COUNT(a, cond) do { \
POWERPC_GET_PMC6(pmc_start[5]); \
POWERPC_GET_PMC5(pmc_start[4]); \
POWERPC_GET_PMC4(pmc_start[3]); \
POWERPC_GET_PMC3(pmc_start[2]); \
POWERPC_GET_PMC2(pmc_start[1]); \
POWERPC_GET_PMC1(pmc_start[0]); \
} while (0)
/* Takes the stop snapshot of all PMCs into pmc_stop[] and, when cond is true,
   folds the elapsed count of every enabled PMC into perfdata for function a
   (an enum powerpc_perf_index value): min and max are updated, the count is
   added to sum and num is incremented. Samples whose stop value is smaller
   than the start value are skipped. */
#define POWERPC_PERF_STOP_COUNT(a, cond) do { \
POWERPC_GET_PMC1(pmc_stop[0]); \
POWERPC_GET_PMC2(pmc_stop[1]); \
POWERPC_GET_PMC3(pmc_stop[2]); \
POWERPC_GET_PMC4(pmc_stop[3]); \
POWERPC_GET_PMC5(pmc_stop[4]); \
POWERPC_GET_PMC6(pmc_stop[5]); \
if (cond) { \
for(pmc_loop_index = 0; \
pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \
pmc_loop_index++) { \
if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) { \
POWERP_PMC_DATATYPE diff = \
pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \
if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \
perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \
if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \
perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \
perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \
perfdata[pmc_loop_index][a][powerpc_data_num] ++; \
} \
} \
} \
} while (0)
#else /* CONFIG_POWERPC_PERF */
// those are needed to avoid empty statements.
// Without CONFIG_POWERPC_PERF the DECLARE macro expands to an unused dummy
// variable declaration (the caller supplies the semicolon) and the
// START/STOP macros expand to statements that do nothing.
#define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused))
#define POWERPC_PERF_START_COUNT(a, cond) do {} while (0)
#define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0)
#endif /* CONFIG_POWERPC_PERF */
#endif /* AVCODEC_PPC_DSPUTIL_PPC_H */

View File

@ -0,0 +1,493 @@
/* ffmpeg/libavcodec/ppc/fdct_altivec.c, this file is part of the
* AltiVec optimized library for the FFMPEG Multimedia System
* Copyright (C) 2003 James Klicman <james@klicman.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/common.h"
#include "libavcodec/dsputil.h"
#include "dsputil_ppc.h"
#include "gcc_fixes.h"
#define vs16(v) ((vector signed short)(v))
#define vs32(v) ((vector signed int)(v))
#define vu8(v) ((vector unsigned char)(v))
#define vu16(v) ((vector unsigned short)(v))
#define vu32(v) ((vector unsigned int)(v))
/* Cosine values C1..C7 and sqrt(2) used to build the FDCT weights below. */
#define C1 0.98078525066375732421875000 /* cos(1*PI/16) */
#define C2 0.92387950420379638671875000 /* cos(2*PI/16) */
#define C3 0.83146959543228149414062500 /* cos(3*PI/16) */
#define C4 0.70710676908493041992187500 /* cos(4*PI/16) */
#define C5 0.55557024478912353515625000 /* cos(5*PI/16) */
#define C6 0.38268342614173889160156250 /* cos(6*PI/16) */
#define C7 0.19509032368659973144531250 /* cos(7*PI/16) */
#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */
/* Weights W0..WB: the multipliers applied by the FDCT code, expressed as
   combinations of the constants above. */
#define W0 -(2 * C2)
#define W1 (2 * C6)
#define W2 (SQRT_2 * C6)
#define W3 (SQRT_2 * C3)
#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7))
#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7))
#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7))
#define W8 (SQRT_2 * ( C7 - C3))
#define W9 (SQRT_2 * (-C1 - C3))
#define WA (SQRT_2 * (-C3 - C5))
#define WB (SQRT_2 * ( C5 - C3))
/* The twelve weights packed four per vector; fdct_altivec() loads the three
   vectors into cnsts0..cnsts2 and the LD_Wn macros splat single elements. */
static vector float fdctconsts[3] = {
{ W0, W1, W2, W3 },
{ W4, W5, W6, W7 },
{ W8, W9, WA, WB }
};
#define LD_W0 vec_splat(cnsts0, 0)
#define LD_W1 vec_splat(cnsts0, 1)
#define LD_W2 vec_splat(cnsts0, 2)
#define LD_W3 vec_splat(cnsts0, 3)
#define LD_W4 vec_splat(cnsts1, 0)
#define LD_W5 vec_splat(cnsts1, 1)
#define LD_W6 vec_splat(cnsts1, 2)
#define LD_W7 vec_splat(cnsts1, 3)
#define LD_W8 vec_splat(cnsts2, 0)
#define LD_W9 vec_splat(cnsts2, 1)
#define LD_WA vec_splat(cnsts2, 2)
#define LD_WB vec_splat(cnsts2, 3)
/* One-dimensional 8-point FDCT pass on eight float vectors b0..b7, computed
   in place. It relies on the locals x0..x8, cnst, mzero and cnsts0..cnsts2
   of the expanding function. In fdct_altivec() it is only referenced from
   the disabled "#else" row path. */
#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
\
b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
\
b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
cnst = LD_W2; \
b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
cnst = LD_W1; \
b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
cnst = LD_W0; \
b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
\
x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
cnst = LD_W3; \
x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
\
cnst = LD_W8; \
x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
cnst = LD_W9; \
x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
cnst = LD_WA; \
x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
cnst = LD_WB; \
x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
\
cnst = LD_W4; \
b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
cnst = LD_W5; \
b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
cnst = LD_W6; \
b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
cnst = LD_W7; \
b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
\
b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \
b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \
b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \
b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \
/* }}} */
/* One-dimensional 8-point FDCT pass on eight float vectors b0..b7, computed
   in place; the statements are the same as in FDCTROW. It relies on the
   locals x0..x8, cnst, mzero and cnsts0..cnsts2 of the expanding function.
   fdct_altivec() applies it twice, once per half of the block, after the
   float transpose. */
#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
\
b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
\
b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
cnst = LD_W2; \
b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
cnst = LD_W1; \
b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
cnst = LD_W0; \
b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
\
x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
cnst = LD_W3; \
x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
\
cnst = LD_W8; \
x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
cnst = LD_W9; \
x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
cnst = LD_WA; \
x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
cnst = LD_WB; \
x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
\
cnst = LD_W4; \
b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
cnst = LD_W5; \
b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
cnst = LD_W6; \
b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
cnst = LD_W7; \
b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
\
b7 = vec_add(b7, x2); /* b7 += x2; */ \
b5 = vec_add(b5, x3); /* b5 += x3; */ \
b3 = vec_add(b3, x2); /* b3 += x2; */ \
b1 = vec_add(b1, x3); /* b1 += x3; */ \
/* }}} */
/**
 * Two dimensional forward discrete cosine transform of one 8x8 block,
 * computed in place.
 *
 * Steps: transpose the 16-bit input, run the row pass (its first butterflies
 * are done on 16-bit integers before converting to float), transpose the
 * float data, run the column pass with FDCTCOL, then round, convert back to
 * 16-bit integers and store.
 *
 * @param block 64 coefficients, overwritten with the result; accessed with
 *              vec_ld/vec_st, so presumably required to be 16-byte aligned
 *              -- TODO confirm against callers
 */
void fdct_altivec(int16_t *block)
{
    POWERPC_PERF_DECLARE(altivec_fdct, 1);
    vector signed short *bp;
    vector float *cp;
    vector float b00, b10, b20, b30, b40, b50, b60, b70;
    vector float b01, b11, b21, b31, b41, b51, b61, b71;
    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;

    POWERPC_PERF_START_COUNT(altivec_fdct, 1);

    /* setup constants {{{ */
    /* mzero = -0.0 */
    mzero = ((vector float)vec_splat_u32(-1));
    mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero)));
    cp = fdctconsts;
    cnsts0 = vec_ld(0, cp); cp++;
    cnsts1 = vec_ld(0, cp); cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */

    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b))

    bp = (vector signed short*)block;
    b00 = ((vector float)vec_ld(0, bp));
    b40 = ((vector float)vec_ld(16*4, bp));
    b01 = ((vector float)MERGE_S16(h, b00, b40));
    b11 = ((vector float)MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float)vec_ld(0, bp));
    b50 = ((vector float)vec_ld(16*4, bp));
    b21 = ((vector float)MERGE_S16(h, b10, b50));
    b31 = ((vector float)MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float)vec_ld(0, bp));
    b60 = ((vector float)vec_ld(16*4, bp));
    b41 = ((vector float)MERGE_S16(h, b20, b60));
    b51 = ((vector float)MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float)vec_ld(0, bp));
    b70 = ((vector float)vec_ld(16*4, bp));
    b61 = ((vector float)MERGE_S16(h, b30, b70));
    b71 = ((vector float)MERGE_S16(l, b30, b70));

    x0 = ((vector float)MERGE_S16(h, b01, b41));
    x1 = ((vector float)MERGE_S16(l, b01, b41));
    x2 = ((vector float)MERGE_S16(h, b11, b51));
    x3 = ((vector float)MERGE_S16(l, b11, b51));
    x4 = ((vector float)MERGE_S16(h, b21, b61));
    x5 = ((vector float)MERGE_S16(l, b21, b61));
    x6 = ((vector float)MERGE_S16(h, b31, b71));
    x7 = ((vector float)MERGE_S16(l, b31, b71));

    b00 = ((vector float)MERGE_S16(h, x0, x4));
    b10 = ((vector float)MERGE_S16(l, x0, x4));
    b20 = ((vector float)MERGE_S16(h, x1, x5));
    b30 = ((vector float)MERGE_S16(l, x1, x5));
    b40 = ((vector float)MERGE_S16(h, x2, x6));
    b50 = ((vector float)MERGE_S16(l, x2, x6));
    b60 = ((vector float)MERGE_S16(h, x3, x7));
    b70 = ((vector float)MERGE_S16(l, x3, x7));

#undef MERGE_S16
    /* }}} */

    /* Some of the initial calculations can be done as vector short before
     * conversion to vector float. The following code section takes advantage
     * of this.
     */
#if 1
    /* fdct rows {{{ */
    x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float)vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float)vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float)vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float)vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float)vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float)vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float)vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float)vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float)vec_add(vs16(x1), vs16(x2)));

    b00 = ((vector float)vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float)vec_sub(vs16(b70), vs16(b10)));

    /* widen the eight shorts held in b<n>0 into two float vectors:
     * b<n>0 receives the high half, b<n>1 the low half */
#define CTF0(n) \
    b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \
    b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \
    b##n##1 = vec_ctf(vs32(b##n##1), 0); \
    b##n##0 = vec_ctf(vs32(b##n##0), 0);

    CTF0(0);
    CTF0(4);

    b20 = ((vector float)vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float)vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);

#undef CTF0

    x0 = vec_add(b60, b20);
    x1 = vec_add(b61, b21);

    cnst = LD_W2;
    x0 = vec_madd(cnst, x0, mzero);
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20 = vec_madd(cnst, b20, x0);
    b21 = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60 = vec_madd(cnst, b60, x0);
    b61 = vec_madd(cnst, b61, x1);

    /* widen the eight shorts held in x into the float vector pair b0/b1.
     * The last line of the definition must not end with a line
     * continuation, otherwise the following line becomes part of the
     * macro unless a blank line happens to separate them. */
#define CTFX(x,b) \
    b##0 = ((vector float)vec_unpackh(vs16(x))); \
    b##1 = ((vector float)vec_unpackl(vs16(x))); \
    b##0 = vec_ctf(vs32(b##0), 0); \
    b##1 = vec_ctf(vs32(b##1), 0);

    CTFX(x4, b7);
    CTFX(x5, b5);
    CTFX(x6, b3);
    CTFX(x7, b1);

#undef CTFX

    /* odd part of the row pass, first half of the block (b?0) */
    x0 = vec_add(b70, b10);
    x1 = vec_add(b50, b30);
    x2 = vec_add(b70, b30);
    x3 = vec_add(b50, b10);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70 = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50 = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30 = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10 = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);

    /* odd part of the row pass, second half of the block (b?1) */
    x0 = vec_add(b71, b11);
    x1 = vec_add(b51, b31);
    x2 = vec_add(b71, b31);
    x3 = vec_add(b51, b11);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71 = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51 = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31 = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11 = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
    b31 = vec_add(b31, x2);
    b11 = vec_add(b11, x3);
    /* }}} */
#else
    /* convert to float {{{ */
    /* (disabled alternative; the same "no trailing line continuation" rule
     * as for CTFX applies to this definition) */
#define CTF(n) \
    vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \
    vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \
    b##n##1 = vec_ctf(vs32(b##n##1), 0); \
    b##n##0 = vec_ctf(vs32(b##n##0), 0);

    CTF(0);
    CTF(1);
    CTF(2);
    CTF(3);
    CTF(4);
    CTF(5);
    CTF(6);
    CTF(7);

#undef CTF
    /* }}} */

    FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71);
#endif

    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
    x0 = vec_mergel(b00, b20);
    x1 = vec_mergeh(b00, b20);
    x2 = vec_mergel(b10, b30);
    x3 = vec_mergeh(b10, b30);

    b00 = vec_mergeh(x1, x3);
    b10 = vec_mergel(x1, x3);
    b20 = vec_mergeh(x0, x2);
    b30 = vec_mergel(x0, x2);

    x4 = vec_mergel(b41, b61);
    x5 = vec_mergeh(b41, b61);
    x6 = vec_mergel(b51, b71);
    x7 = vec_mergeh(b51, b71);

    b41 = vec_mergeh(x5, x7);
    b51 = vec_mergel(x5, x7);
    b61 = vec_mergeh(x4, x6);
    b71 = vec_mergel(x4, x6);

    x0 = vec_mergel(b01, b21);
    x1 = vec_mergeh(b01, b21);
    x2 = vec_mergel(b11, b31);
    x3 = vec_mergeh(b11, b31);

    x4 = vec_mergel(b40, b60);
    x5 = vec_mergeh(b40, b60);
    x6 = vec_mergel(b50, b70);
    x7 = vec_mergeh(b50, b70);

    b40 = vec_mergeh(x1, x3);
    b50 = vec_mergel(x1, x3);
    b60 = vec_mergeh(x0, x2);
    b70 = vec_mergel(x0, x2);

    b01 = vec_mergeh(x5, x7);
    b11 = vec_mergel(x5, x7);
    b21 = vec_mergeh(x4, x6);
    b31 = vec_mergel(x4, x6);
    /* }}} */

    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);

    /* round, convert back to short {{{ */
    /* packs the float pair b<n>0/b<n>1 into eight shorts and stores them at bp */
#define CTS(n) \
    b##n##0 = vec_round(b##n##0); \
    b##n##1 = vec_round(b##n##1); \
    b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \
    b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \
    b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \
    vec_st(vs16(b##n##0), 0, bp);

    bp = (vector signed short*)block;
    CTS(0); bp++;
    CTS(1); bp++;
    CTS(2); bp++;
    CTS(3); bp++;
    CTS(4); bp++;
    CTS(5); bp++;
    CTS(6); bp++;
    CTS(7);

#undef CTS
    /* }}} */

    POWERPC_PERF_STOP_COUNT(altivec_fdct, 1);
}
/* vim:set foldmethod=marker foldlevel=0: */

View File

@ -0,0 +1,138 @@
/*
* FFT/IFFT transforms
* AltiVec-enabled
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
* Based on code Copyright (c) 2002 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
/**
* Do a complex FFT with the parameters defined in ff_fft_init(). The
* input data must be permuted before with s->revtab table. No
* 1.0/sqrt(n) normalization is done.
* AltiVec-enabled
* This code assumes that the 'z' pointer is 16 bytes-aligned
* It also assumes all FFTComplex are 8 bytes-aligned pair of float
* The code is exactly the same as the SSE version, except
* that successive MUL + ADD/SUB have been merged into
* fused multiply-add ('vec_madd' in altivec)
*/
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
{
POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
/* All-zero vector, used as the addend when vec_madd should only multiply.
 * NOTE(review): the splat argument is the floating literal `0.`; the other
 * AltiVec files pass the integer literal 0 -- confirm this is intended. */
register const vector float vczero = (const vector float)vec_splat_u32(0.);
int ln = s->nbits;
int j, np, np2;
int nblocks, nloops;
register FFTComplex *p, *q;
FFTComplex *cptr, *cptr1;
int k;
POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
/* np = 2^nbits entries of z are transformed in place */
np = 1 << ln;
/* Passes 0 and 1, fused: each iteration loads two vectors (r[0] and r[1]),
 * combines them and stores both back, then advances r by two vectors.
 * NOTE(review): the do/while bodies run at least once before the counter
 * is tested, so this code appears to require np >= 4 here and np >= 8 in
 * the later passes -- confirm against the callers. */
{
vector float *r, a, b, a1, c1, c2;
r = (vector float *)&z[0];
/* c1/c2 are multiplier vectors built by vcii() (util_altivec.h);
 * presumably p/n select +1.0/-1.0 per element -- TODO confirm.
 * c2 differs between forward and inverse transforms. */
c1 = vcii(p,p,n,n);
if (s->inverse) {
c2 = vcii(p,p,n,p);
} else {
c2 = vcii(p,p,p,n);
}
/* one iteration per four FFTComplex entries (two vectors) */
j = (np >> 2);
do {
a = vec_ld(0, r);
a1 = vec_ld(sizeof(vector float), r);
/* pass 0 butterfly on the first vector: a = a*c1 + permuted(a) */
b = vec_perm(a,a,vcprmle(1,0,3,2));
a = vec_madd(a,c1,b);
/* pass 0 butterfly on the second vector: b = a1*c1 + permuted(a1) */
b = vec_perm(a1,a1,vcprmle(1,0,3,2));
b = vec_madd(a1,c1,b);
/* reorder b so that, together with the signs in c2, the third
 * value ends up multiplied by -i (per the original comment) */
/* multiply third by -i */
b = vec_perm(b,b,vcprmle(2,3,1,0));
/* pass 1 butterfly: store a + b*c2 and a - b*c2 */
vec_st(vec_madd(b,c2,a), 0, r);
vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
r += 2;
} while (--j != 0);
}
/* pass 2 .. ln-1 */
/* nblocks halves and nloops doubles after every pass; the outer loop ends
 * once nblocks reaches zero. */
nblocks = np >> 3;
nloops = 1 << 2;
/* NOTE(review): np2 is assigned here but not read again in this function. */
np2 = np >> 1;
/* twiddle-factor table; advanced by nloops*2 entries after each pass */
cptr1 = s->exptab1;
do {
p = z;
q = z + nloops;
j = nblocks;
do {
/* every block of a pass restarts from the same twiddle factors */
cptr = cptr1;
/* each inner iteration handles two entries of p and two of q */
k = nloops >> 1;
do {
vector float a,b,c,t1;
a = vec_ld(0, (float*)p);
b = vec_ld(0, (float*)q);
/* complex mul */
/* two twiddle vectors are consumed per iteration: the first is
 * multiplied by the elements of b selected by vcprmle(2,2,0,0),
 * the second by those selected by vcprmle(3,3,1,1), and the two
 * products are accumulated with vec_madd. */
c = vec_ld(0, (float*)cptr);
/* cre*re cim*re */
t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
c = vec_ld(sizeof(vector float), (float*)cptr);
/* -cim*im cre*im */
b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
/* butterfly */
vec_st(vec_add(a,b), 0, (float*)p);
vec_st(vec_sub(a,b), 0, (float*)q);
p += 2;
q += 2;
cptr += 4;
} while (--k);
/* skip over the half that was just written through the other pointer */
p += nloops;
q += nloops;
} while (--j);
cptr1 += nloops * 2;
nblocks = nblocks >> 1;
nloops = nloops << 1;
} while (nblocks != 0);
POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
}

View File

@ -0,0 +1,311 @@
/*
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_altivec.h"
#include "util_altivec.h"
static void vector_fmul_altivec(float *dst, const float *src, int len)
{
int i;
vector float d0, d1, s, zero = (vector float)vec_splat_u32(0);
for(i=0; i<len-7; i+=8) {
d0 = vec_ld(0, dst+i);
s = vec_ld(0, src+i);
d1 = vec_ld(16, dst+i);
d0 = vec_madd(d0, s, zero);
d1 = vec_madd(d1, vec_ld(16,src+i), zero);
vec_st(d0, 0, dst+i);
vec_st(d1, 16, dst+i);
}
}
static void vector_fmul_reverse_altivec(float *dst, const float *src0,
const float *src1, int len)
{
int i;
vector float d, s0, s1, h0, l0,
s2, s3, zero = (vector float)vec_splat_u32(0);
src1 += len-4;
for(i=0; i<len-7; i+=8) {
s1 = vec_ld(0, src1-i); // [a,b,c,d]
s0 = vec_ld(0, src0+i);
l0 = vec_mergel(s1, s1); // [c,c,d,d]
s3 = vec_ld(-16, src1-i);
h0 = vec_mergeh(s1, s1); // [a,a,b,b]
s2 = vec_ld(16, src0+i);
s1 = vec_mergeh(vec_mergel(l0,h0), // [d,b,d,b]
vec_mergeh(l0,h0)); // [c,a,c,a]
// [d,c,b,a]
l0 = vec_mergel(s3, s3);
d = vec_madd(s0, s1, zero);
h0 = vec_mergeh(s3, s3);
vec_st(d, 0, dst+i);
s3 = vec_mergeh(vec_mergel(l0,h0),
vec_mergeh(l0,h0));
d = vec_madd(s2, s3, zero);
vec_st(d, 16, dst+i);
}
}
/**
 * dst[k] = src0[k] * src1[k] + src2[k] for the case step == 1 and src3 == 0;
 * every other combination is forwarded to ff_vector_fmul_add_add_c().
 *
 * dst may be unaligned: the result is merged with the existing contents of
 * the two 16-byte vectors it straddles before being stored. src0, src1 and
 * src2 are read with plain vec_ld. Four floats are produced per iteration;
 * elements past the last full group of four are not written.
 */
static void vector_fmul_add_add_altivec(float *dst, const float *src0,
const float *src1, const float *src2,
int src3, int len, int step)
{
int i;
vector float d, s0, s1, s2, t0, t1, edges;
/* permute masks derived once from the alignment of dst; they are reused for
 * every iteration because dst+i advances by exactly one vector each time */
vector unsigned char align = vec_lvsr(0,dst),
mask = vec_lvsl(0, dst);
/* Disabled interleaved (step == 2) path, kept for reference only. */
#if 0 //FIXME: there is still something wrong
if (step == 2) {
int y;
vector float d0, d1, s3, t2;
vector unsigned int sel =
vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0));
t1 = vec_ld(16, dst);
for (i=0,y=0; i<len-3; i+=4,y+=8) {
s0 = vec_ld(0,src0+i);
s1 = vec_ld(0,src1+i);
s2 = vec_ld(0,src2+i);
// t0 = vec_ld(0, dst+y); //[x x x|a]
// t1 = vec_ld(16, dst+y); //[b c d|e]
t2 = vec_ld(31, dst+y); //[f g h|x]
d = vec_madd(s0,s1,s2); // [A B C D]
// [A A B B]
// [C C D D]
d0 = vec_perm(t0, t1, mask); // [a b c d]
d0 = vec_sel(vec_mergeh(d, d), d0, sel); // [A b B d]
edges = vec_perm(t1, t0, mask);
t0 = vec_perm(edges, d0, align); // [x x x|A]
t1 = vec_perm(d0, edges, align); // [b B d|e]
vec_stl(t0, 0, dst+y);
d1 = vec_perm(t1, t2, mask); // [e f g h]
d1 = vec_sel(vec_mergel(d, d), d1, sel); // [C f D h]
edges = vec_perm(t2, t1, mask);
t1 = vec_perm(edges, d1, align); // [b B d|C]
t2 = vec_perm(d1, edges, align); // [f D h|x]
vec_stl(t1, 16, dst+y);
t0 = t1;
vec_stl(t2, 31, dst+y);
t1 = t2;
}
} else
#endif
if (step == 1 && src3 == 0)
for (i=0; i<len-3; i+=4) {
/* the two aligned vectors that dst+i .. dst+i+3 may straddle */
t0 = vec_ld(0, dst+i);
t1 = vec_ld(15, dst+i);
s0 = vec_ld(0, src0+i);
s1 = vec_ld(0, src1+i);
s2 = vec_ld(0, src2+i);
/* existing bytes surrounding the four floats about to be replaced */
edges = vec_perm(t1 ,t0, mask);
d = vec_madd(s0,s1,s2);
/* shift the result into place and recombine it with the edges */
t1 = vec_perm(d, edges, align);
t0 = vec_perm(edges, d, align);
/* store the upper vector first; when dst is aligned both stores hit
 * the same vector and the second one (t0) must be the one that wins */
vec_st(t1, 15, dst+i);
vec_st(t0, 0, dst+i);
}
else
ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
/**
 * Windowed overlap of src0 and src1 with add_bias added to every output.
 *
 * After the pointer adjustments below, i is a negative byte offset running
 * from -len*4 up to 0 and j a positive byte offset running from len*4-16
 * down to 0, 16 bytes (four floats) per step. For each step:
 *
 *   dst[i] = src0[i]*rev(win[j]) - rev(src1[j])*win[i] + add_bias
 *   dst[j] = rev(src0[i]*win[i] + rev(src1[j])*rev(win[j]) + add_bias)
 *
 * where rev() reverses the four floats of a vector. dst and win are therefore
 * accessed over 2*len floats, src0 and src1 over len floats each. All
 * accesses use vec_ld/vec_st, i.e. whole 16-byte vectors.
 */
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
{
    /* scalar -> vector transfer for add_bias; only s[0] is written because
     * only element 0 is splatted below */
    union {
        vector float v;
        float s[4];
    } vadd;
    vector float vadd_bias, t0, t1, s0, s1, wi, wj;
    const vector unsigned char reverse = vcprm(3,2,1,0);
    int i,j;

    /* re-base so that the first half is addressed with negative offsets */
    dst += len;
    win += len;
    src0+= len;

    vadd.s[0] = add_bias;
    vadd_bias = vec_splat(vadd.v, 0);

    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
        s0 = vec_ld(i, src0);
        s1 = vec_ld(j, src1);
        wi = vec_ld(i, win);
        wj = vec_ld(j, win);

        /* the j side is traversed backwards, so mirror it within the vector */
        s1 = vec_perm(s1, s1, reverse);
        wj = vec_perm(wj, wj, reverse);

        /* t0 = s0*wj + bias - s1*wi */
        t0 = vec_madd(s0, wj, vadd_bias);
        t0 = vec_nmsub(s1, wi, t0);
        /* t1 = s0*wi + bias + s1*wj */
        t1 = vec_madd(s0, wi, vadd_bias);
        t1 = vec_madd(s1, wj, t1);
        t1 = vec_perm(t1, t1, reverse);

        vec_st(t0, i, dst);
        vec_st(t1, j, dst);
    }
}
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src, float mul, int len)
{
union {
vector float v;
float s[4];
} mul_u;
int i;
vector float src1, src2, dst1, dst2, mul_v, zero;
zero = (vector float)vec_splat_u32(0);
mul_u.s[0] = mul;
mul_v = vec_splat(mul_u.v, 0);
for(i=0; i<len; i+=8) {
src1 = vec_ctf(vec_ld(0, src+i), 0);
src2 = vec_ctf(vec_ld(16, src+i), 0);
dst1 = vec_madd(src1, mul_v, zero);
dst2 = vec_madd(src2, mul_v, zero);
vec_st(dst1, 0, dst+i);
vec_st(dst2, 16, dst+i);
}
}
/**
 * Convert the eight floats at src (loaded as two vectors at byte offsets 0
 * and 16) to signed 32-bit integers and pack them, with saturation, into one
 * vector of eight signed 16-bit values.
 */
static vector signed short
float_to_int16_one_altivec(const float *src)
{
    return vec_packs(vec_cts(vec_ld(0,  src), 0),
                     vec_cts(vec_ld(16, src), 0));
}
/**
 * Convert len floats from src to int16_t in dst, eight at a time.
 *
 * Elements past the last full group of eight are not written. dst may be
 * unaligned; in that case each result is merged with the existing contents of
 * the two vectors it straddles before being stored.
 */
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
    int pos;

    if (!(((long)dst) & 15)) {
        /* 16-byte aligned destination: store the results directly */
        for (pos = 0; pos < len - 7; pos += 8)
            vec_st(float_to_int16_one_altivec(src + pos), 0, dst + pos);
        return;
    }

    //FIXME
    for (pos = 0; pos < len - 7; pos += 8) {
        int16_t *out = dst + pos;
        vector signed short head, tail, fresh, edges;
        vector unsigned char shift;

        head  = vec_ld(0, out);
        fresh = float_to_int16_one_altivec(src + pos);
        tail  = vec_ld(15, out);
        /* bytes around the eight samples that must be preserved */
        edges = vec_perm(tail, head, vec_lvsl(0, out));
        shift = vec_lvsr(0, out);

        vec_st(vec_perm(edges, fresh, shift), 0,  out);
        vec_st(vec_perm(fresh, edges, shift), 15, out);
    }
}
/**
 * Convert planar float channels to interleaved int16_t samples.
 *
 * src[c] points to the len samples of channel c; dst receives
 * len*channels samples. One channel is forwarded to
 * float_to_int16_altivec(), two channels have dedicated vector paths
 * (aligned and unaligned dst), and any other count is converted one channel
 * at a time through a temporary buffer and interleaved with scalar stores.
 */
static void
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
long len, int channels)
{
int i;
vector signed short d0, d1, d2, c0, c1, t0, t1;
vector unsigned char align;
if(channels == 1)
float_to_int16_altivec(dst, src[0], len);
else
if (channels == 2) {
/* Both stereo loops advance i by 8 (input samples per channel) AND dst by
 * 8, so dst + i moves forward by 16 output samples per iteration. */
if(((long)dst)&15)
for(i=0; i<len-7; i+=8) {
/* unaligned dst: 32 output bytes straddle three aligned vectors */
d0 = vec_ld(0, dst + i);
t0 = float_to_int16_one_altivec(src[0] + i);
d1 = vec_ld(31, dst + i);
t1 = float_to_int16_one_altivec(src[1] + i);
/* interleave the two channels sample by sample */
c0 = vec_mergeh(t0, t1);
c1 = vec_mergel(t0, t1);
/* existing bytes before and after the region being written */
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
align = vec_lvsr(0, dst + i);
d0 = vec_perm(d2, c0, align);
d1 = vec_perm(c0, c1, align);
vec_st(d0, 0, dst + i);
d0 = vec_perm(c1, d2, align);
vec_st(d1, 15, dst + i);
vec_st(d0, 31, dst + i);
dst+=8;
}
else
for(i=0; i<len-7; i+=8) {
t0 = float_to_int16_one_altivec(src[0] + i);
t1 = float_to_int16_one_altivec(src[1] + i);
d0 = vec_mergeh(t0, t1);
d1 = vec_mergel(t0, t1);
vec_st(d0, 0, dst + i);
vec_st(d1, 16, dst + i);
dst+=8;
}
} else {
/* variable-length, 16-byte aligned scratch buffer holding one converted
 * channel; its size on the stack grows with len */
DECLARE_ALIGNED(16, int16_t, tmp[len]);
int c, j;
for (c = 0; c < channels; c++) {
float_to_int16_altivec(tmp, src[c], len);
/* NOTE(review): the conversion above only fills full groups of eight,
 * while this copy reads all len entries of tmp -- presumably len is
 * always a multiple of 8; confirm against the callers. */
for (i = 0, j = c; i < len; i++, j+=channels) {
dst[j] = tmp[i];
}
}
}
}
/**
 * Install the AltiVec float routines of this file into the DSP context.
 *
 * The multiply helpers and the int32 -> float conversion are always
 * installed. The window function and the float -> int16 conversions are left
 * untouched when the codec context requests bit-exact output.
 */
void float_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->vector_fmul                = vector_fmul_altivec;
    c->vector_fmul_reverse        = vector_fmul_reverse_altivec;
    c->vector_fmul_add_add        = vector_fmul_add_add_altivec;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;

    if (avctx->flags & CODEC_FLAG_BITEXACT)
        return;

    c->vector_fmul_window         = vector_fmul_window_altivec;
    c->float_to_int16             = float_to_int16_altivec;
    c->float_to_int16_interleave  = float_to_int16_interleave_altivec;
}

View File

@ -0,0 +1,102 @@
/*
* gcc fixes for altivec.
* Used to workaround broken gcc (FSF gcc-3 pre gcc-3.3)
* and to stay somewhat compatible with Darwin.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_GCC_FIXES_H
#define AVCODEC_PPC_GCC_FIXES_H
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
/* REG_v(a): optional register-binding suffix for a variable declaration.
 * With GCC 4 or newer it expands to an __asm__ clause naming register `a`
 * (the argument is stringified); with older compilers it expands to
 * nothing, so the declaration is left to normal register allocation. */
#if (__GNUC__ < 4)
# define REG_v(a)
#else
# define REG_v(a) __asm__ ( #a )
#endif
#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3)
/* This code was provided to me by Bartosch Pixa
* as a separate header file (broken_mergel.h).
* thanks to lu_zero for the workaround.
*
* See this mail for more information:
* http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html
*/
/* Byte-wise "merge low" built on vec_perm: the permute table alternates
 * bytes 0x08..0x0f of A with bytes 0x18..0x1f (i.e. bytes 8..15 of B),
 * giving A8,B8,A9,B9,...,A15,B15. */
static inline vector signed char ff_vmrglb (vector signed char const A,
vector signed char const B)
{
static const vector unsigned char lowbyte = {
0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b,
0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f
};
return vec_perm (A, B, lowbyte);
}
/* Halfword-wise "merge low" built on vec_perm: the permute table alternates
 * the two-byte groups at bytes 8..15 of A with the matching groups of B
 * (0x18..0x1f), giving A4,B4,A5,B5,A6,B6,A7,B7 in halfword units. */
static inline vector signed short ff_vmrglh (vector signed short const A,
vector signed short const B)
{
static const vector unsigned char lowhalf = {
0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b,
0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f
};
return vec_perm (A, B, lowhalf);
}
/* Word-wise "merge low" built on vec_perm: the permute table alternates the
 * four-byte groups at bytes 8..15 of A with the matching groups of B
 * (0x18..0x1f), giving A2,B2,A3,B3 in word units. */
static inline vector signed int ff_vmrglw (vector signed int const A,
vector signed int const B)
{
static const vector unsigned char lowword = {
0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b,
0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f
};
return vec_perm (A, B, lowword);
}
/*#define ff_vmrglb ff_vmrglb
#define ff_vmrglh ff_vmrglh
#define ff_vmrglw ff_vmrglw
*/
/* Replace the compiler's vec_mergel with a type-dispatching macro that routes
 * every supported argument type to one of the vec_perm based helpers above
 * (ff_vmrglb for 8-bit, ff_vmrglh for 16-bit, ff_vmrglw for 32-bit and float
 * elements) and casts the result back to the argument type. Only pairs of
 * identical vector types are accepted; anything else falls through to
 * __altivec_link_error_invalid_argument().
 * NOTE(review): __ch and __bin_args_eq are not defined in this header;
 * presumably they come from the <altivec.h> of the affected GCC releases. */
#undef vec_mergel
#define vec_mergel(a1, a2) \
__ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \
((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
__ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \
((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
__ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \
((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
__ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \
((vector unsigned short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
__ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \
((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
__ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \
((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
__ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \
((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
__altivec_link_error_invalid_argument ())))))))
#endif /* (__GNUC__ == 3 && __GNUC_MINOR__ < 3) */
#endif /* AVCODEC_PPC_GCC_FIXES_H */

View File

@ -0,0 +1,141 @@
/*
* GMC (Global Motion Compensation)
* AltiVec-enabled
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
/*
AltiVec-enhanced gmc1. At the moment this code assumes that stride is a
multiple of 8, to preserve proper dst alignment.
*/
#define GMC1_PERF_COND (h==8)
/**
 * One-point global motion compensation on an 8-pixel-wide block of h rows.
 *
 * Every output pixel is the weighted sum of a 2x2 source neighbourhood:
 *   (A*src[x] + B*src[x+1] + C*src[x+stride] + D*src[x+stride+1] + rounder) >> 8
 * with A..D derived from the 1/16-pel offsets x16 and y16 (see ABCD below).
 * dst is read and rewritten as a whole 16-byte vector per row; only the 8-byte
 * half addressed by dst is replaced.
 */
void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
{
POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
/* rounder replicated into all eight 16-bit lanes */
const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) =
{rounder, rounder, rounder, rounder,
rounder, rounder, rounder, rounder};
/* bilinear weights of the four neighbouring source pixels */
const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) =
{
(16-x16)*(16-y16), /* A */
( x16)*(16-y16), /* B */
(16-x16)*( y16), /* C */
( x16)*( y16), /* D */
0, 0, 0, 0 /* padding */
};
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
int i;
/* byte offsets of dst and src inside their 16-byte vectors; both are
 * recomputed for every row inside the loop */
unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
unsigned long src_really_odd = (unsigned long)src & 0x0000000F;
POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
/* broadcast each weight to a full vector */
tempA = vec_ld(0, (unsigned short*)ABCD);
Av = vec_splat(tempA, 0);
Bv = vec_splat(tempA, 1);
Cv = vec_splat(tempA, 2);
Dv = vec_splat(tempA, 3);
rounderV = vec_ld(0, (unsigned short*)rounder_a);
// we'll be able to pick up our 9 char elements
// at src from those 32 bytes
// we load the first batch here, as inside the loop
// we can re-use 'src+stride' from one iteration
// as the 'src' of the next.
src_0 = vec_ld(0, src);
src_1 = vec_ld(16, src);
srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
if (src_really_odd != 0x0000000F) {
// if src & 0xF == 0xF, then (src+1) is properly aligned
// on the second vector.
srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
} else {
srcvB = src_1;
}
// widen the first 8 bytes of each row to 16-bit lanes
srcvA = vec_mergeh(vczero, srcvA);
srcvB = vec_mergeh(vczero, srcvB);
for(i=0; i<h; i++) {
dst_odd = (unsigned long)dst & 0x0000000F;
src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;
dstv = vec_ld(0, dst);
// we'll be able to pick up our 9 char elements
// at src + stride from those 32 bytes
// then reuse the resulting 2 vectors srcvC and srcvD
// as the next srcvA and srcvB
src_0 = vec_ld(stride + 0, src);
src_1 = vec_ld(stride + 16, src);
srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
if (src_really_odd != 0x0000000F) {
// if (src + stride) & 0xF == 0xF, then (src+stride+1) is properly
// aligned on the second vector.
srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
} else {
srcvD = src_1;
}
srcvC = vec_mergeh(vczero, srcvC);
srcvD = vec_mergeh(vczero, srcvD);
// OK, now we (finally) do the math :-)
// those four instructions replaces 32 int muls & 32 int adds.
// isn't AltiVec nice ?
tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);
// the lower row of this iteration is the upper row of the next one
srcvA = srcvC;
srcvB = srcvD;
tempD = vec_sr(tempD, vcsr8);
// pack back to bytes; the 8 results land in the first half of dstv2
dstv2 = vec_pack(tempD, (vector unsigned short)vczero);
// merge the 8 new pixels into the half of the dst vector that dst
// points at and keep the other half unchanged
if (dst_odd) {
dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
} else {
dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
}
vec_st(dstv2, 0, dst);
dst += stride;
src += stride;
}
POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,694 @@
/*
* Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Uncomment to enable run-time checks of pointer alignment. */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#include <assert.h>
/* Abort unless ptr is 16-byte aligned, i.e. unless its low four address bits
 * are all zero. The trailing ';' is kept so that existing call sites written
 * as "ASSERT_ALIGNED(x);" expand exactly as before. */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
#else
/* Alignment checks disabled: expands to an empty statement. */
#define ASSERT_ALIGNED(ptr) ;
#endif
/* this code assumes that stride % 16 == 0 */
/* One output row of the full (two-dimensional) chroma interpolation.
 *
 * Expands inside a scope that provides vsrc0ssH/vsrc1ssH (upper row, already
 * widened to 16 bit), vsrc2uc/vsrc3uc (lower row, still bytes), the weights
 * vA..vD, v32ss, v6us, fperm, zero_u8v, dst, src and stride, plus the
 * temporaries assigned below.
 *
 *  - widens the lower row to 16 bit
 *  - psum = (vA*row0 + vB*row0' + vC*row1 + vD*row1' + v32ss) >> v6us
 *  - packs psum to bytes and merges it into the loaded dst vector via fperm
 *  - combines with the previous dst contents through OP_U8_ALTIVEC, which is
 *    supplied by the file that includes this template, and stores the result
 *  - the lower row becomes the upper row of the next expansion, and dst/src
 *    advance by stride
 */
#define CHROMA_MC8_ALTIVEC_CORE \
vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vB, vsrc1ssH, psum);\
psum = vec_mladd(vC, vsrc2ssH, psum);\
psum = vec_mladd(vD, vsrc3ssH, psum);\
psum = vec_sr(psum, v6us);\
\
vdst = vec_ld(0, dst);\
ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\
\
OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
vec_st(fsum, 0, dst);\
\
vsrc0ssH = vsrc2ssH;\
vsrc1ssH = vsrc3ssH;\
\
dst += stride;\
src += stride;
/* One output row of the one-dimensional chroma interpolation (only two taps).
 *
 * Expands inside a scope that provides vsrc0uc/vsrc1uc (the two source
 * vectors, as bytes), the weights vA and vE, v32ss, v6us, fperm, zero_u8v,
 * dst, src and stride, plus the temporaries assigned below.
 *
 *  - widens both source vectors to 16 bit
 *  - psum = (vA*src0 + vE*src1 + v32ss) >> v6us
 *  - packs, merges into the loaded dst vector via fperm, applies
 *    OP_U8_ALTIVEC (supplied by the including file) and stores
 *  - advances dst and src by stride; unlike the full core it does not carry
 *    any source row over, the caller does that where needed
 */
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vE, vsrc1ssH, psum);\
psum = vec_sr(psum, v6us);\
\
vdst = vec_ld(0, dst);\
ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\
\
OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
vec_st(fsum, 0, dst);\
\
dst += stride;\
src += stride;
/**
 * 8-pixel-wide chroma motion compensation over h rows.
 *
 * x and y are the fractional offsets; the four weights are
 * A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy and every output pixel is
 * (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + 32) >> 6, combined with
 * the old dst contents through OP_U8_ALTIVEC (defined by the including file).
 * When D is zero a cheaper two-tap path is used with E = B + C.
 *
 * dst is read and rewritten as a whole 16-byte vector per row; fperm selects
 * which 8-byte half receives the new pixels.
 */
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
int stride, int h, int x, int y) {
POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
DECLARE_ALIGNED_16(signed int, ABCD[4]) =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
((8 - x) * ( y)),
(( x) * ( y))};
register int i;
vec_u8 fperm;
const vec_s32 vABCD = vec_ld(0, ABCD);
/* view the four 32-bit weights as eight 16-bit lanes and splat the odd
 * lanes -- presumably the low halfword of each weight on big-endian
 * PowerPC */
const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
LOAD_ZERO;
/* rounding constant 1 << 5 = 32 and final shift of 6 */
const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
const vec_u16 v6us = vec_splat_u16(6);
/* loadSecond: src sits more than 7 bytes into its vector, so the source
 * bytes of a row spill into the following vector.
 * reallyBadAlign: src+1 is exactly the start of that following vector. */
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vec_u8 vsrc0uc, vsrc1uc;
vec_s16 vsrc0ssH, vsrc1ssH;
vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
vec_s16 vsrc2ssH, vsrc3ssH, psum;
vec_u8 vdst, ppsum, vfdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
/* fperm merges the packed result (second vec_perm operand, indices 0x10+)
 * with the old dst vector (first operand): for a 16-byte aligned dst the
 * new pixels go to the first half and the old second half is kept;
 * otherwise the old first half is kept and the new pixels go to the second
 * half -- this assumes dst is then 8 bytes into the vector. */
if (((unsigned long)dst) % 16 == 0) {
fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F};
} else {
fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F};
}
/* Prime the first source row (src and src+1).
 * NOTE(review): vsrcBuc is only loaded when loadSecond is set but is passed
 * to vec_perm unconditionally; presumably none of the 8 bytes that are
 * used afterwards come from it when loadSecond is 0 -- confirm. */
vsrcAuc = vec_ld(0, src);
if (loadSecond)
vsrcBuc = vec_ld(16, src);
vsrcperm0 = vec_lvsl(0, src);
vsrcperm1 = vec_lvsl(1, src);
vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
if (reallyBadAlign)
vsrc1uc = vsrcBuc;
else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
if (ABCD[3]) {
/* D != 0: full four-tap interpolation; each iteration fetches the row
 * below and the core macro rolls it into the upper row afterwards */
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE
}
} else {
vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src);
vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
if (reallyBadAlign)
vsrc3uc = vsrcDuc;
else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE
}
}
} else {
/* D == 0, so x == 0 or y == 0: one of B and C is zero as well and
 * E = B + C is the weight of the single remaining neighbour */
const vec_s16 vE = vec_add(vB, vC);
if (ABCD[2]) { // x == 0 B == 0
/* vertical only: vsrc0uc is the current row, vsrc1uc the row below,
 * which is carried over as the next current row */
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
vsrc0uc = vsrc1uc;
}
} else {
vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 15, src);
vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
vsrc0uc = vsrc1uc;
}
}
} else { // y == 0 C == 0
/* horizontal only: both taps come from the current row, which is
 * reloaded on every iteration */
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(0, src);
vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
}
} else {
vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(0, src);
vsrcDuc = vec_ld(15, src);
vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
if (reallyBadAlign)
vsrc1uc = vsrcDuc;
else
vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
}
}
}
}
POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}
#undef CHROMA_MC8_ALTIVEC_CORE
/* this code assumes that stride % 16 == 0 */
/**
 * Horizontal six-tap low-pass filter over a block of 16 rows, 16 pixels each.
 *
 * For every output pixel:
 *   (20*(s[0]+s[1]) - 5*(s[-1]+s[2]) + (s[-2]+s[3]) + 16) >> 5,
 * saturated to 8 bit and combined with the old dst contents through
 * OP_U8_ALTIVEC (defined by the including file). dst is loaded and stored as
 * whole vectors with vec_ld/vec_st.
 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
register int i;
LOAD_ZERO;
/* permute masks that extract the 16 bytes starting at src-2 .. src+3;
 * computed once, i.e. every row is assumed to share the alignment of the
 * first one */
const vec_u8 permM2 = vec_lvsl(-2, src);
const vec_u8 permM1 = vec_lvsl(-1, src);
const vec_u8 permP0 = vec_lvsl(+0, src);
const vec_u8 permP1 = vec_lvsl(+1, src);
const vec_u8 permP2 = vec_lvsl(+2, src);
const vec_u8 permP3 = vec_lvsl(+3, src);
/* filter constants: 5, shift by 5, 20 = 5 << 2, rounding 16 = 1 << 4 */
const vec_s16 v5ss = vec_splat_s16(5);
const vec_u16 v5us = vec_splat_u16(5);
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
/* byte offset of the leftmost tap (src - 2) inside its 16-byte vector */
register int align = ((((unsigned long)src) - 2) % 16);
vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB;
vec_u8 sum, vdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
for (i = 0 ; i < 16 ; i ++) {
/* the two aligned vectors covering src-2 .. src+13 and beyond */
vec_u8 srcR1 = vec_ld(-2, src);
vec_u8 srcR2 = vec_ld(14, src);
/* Build the six shifted source vectors. For align <= 10 all of them lie
 * within srcR1/srcR2. From 11 upwards one shifted vector coincides
 * exactly with srcR2 (and is taken directly instead of through its
 * permute mask), and the ones after it need a third vector, srcR3. */
switch (align) {
default: {
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = vec_perm(srcR1, srcR2, permP2);
srcP3 = vec_perm(srcR1, srcR2, permP3);
} break;
case 11: {
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = vec_perm(srcR1, srcR2, permP2);
srcP3 = srcR2;
} break;
case 12: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = srcR2;
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 13: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = srcR2;
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 14: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2;
srcP1 = vec_perm(srcR2, srcR3, permP1);
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 15: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0);
srcP1 = vec_perm(srcR2, srcR3, permP1);
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
}
/* widen every tap to 16 bit: suffix A = first 8 pixels, B = last 8 */
srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
/* symmetric tap pairs: inner (x20), middle (x5), outer (x1) */
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
sum2A = vec_adds(srcM1A, srcP2A);
sum2B = vec_adds(srcM1B, srcP2B);
sum3A = vec_adds(srcM2A, srcP3A);
sum3B = vec_adds(srcM2B, srcP3B);
/* pp1 = 20*sum1 + 16, pp2 = 5*sum2, psum = sum3 + pp1 - pp2 */
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
psumA = vec_sub(pp3A, pp2A);
psumB = vec_sub(pp3B, pp2B);
/* arithmetic shift by 5, then pack to bytes with unsigned saturation */
sumA = vec_sra(psumA, v5us);
sumB = vec_sra(psumB, v5us);
sum = vec_packsu(sumA, sumB);
ASSERT_ALIGNED(dst);
vdst = vec_ld(0, dst);
OP_U8_ALTIVEC(fsum, sum, vdst);
vec_st(fsum, 0, dst);
src += srcStride;
dst += dstStride;
}
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
/*
 * Vertical 6-tap lowpass over a 16x16 block.
 *
 * For each of the 16 output rows, every one of the 16 columns is computed as
 *   (20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16) >> 5
 * where M2..P3 are the source pixels in rows -2..+3 relative to the output
 * row. The result is packed to unsigned 8 bit with saturation and combined
 * with the current contents of dst through OP_U8_ALTIVEC (defined by the file
 * that includes this template -- presumably a "put" or "avg" operation;
 * confirm against the includer).
 *
 * dst: destination, loaded/stored with aligned vec_ld/vec_st.
 * src: top-left source pixel of the block; rows src-2*srcStride through
 *      src+18*srcStride are read. It may be unaligned.
 * dstStride, srcStride: distance in bytes between consecutive rows.
 */
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
register int i;
LOAD_ZERO;
/* One permute vector serves every row: with stride % 16 == 0 all rows share
   the misalignment of src. */
const vec_u8 perm = vec_lvsl(0, src);
/* Filter constants built from 5-bit splat immediates: 5<<2 = 20, 1<<4 = 16. */
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u16 v5us = vec_splat_u16(5);
const vec_s16 v5ss = vec_splat_s16(5);
const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
/* Prime the window with rows -2 .. +2. Note that srcbis is advanced inside
   the vec_ld() argument of the first load of each row. */
uint8_t *srcbis = src - (srcStride * 2);
const vec_u8 srcM2a = vec_ld(0, srcbis);
const vec_u8 srcM2b = vec_ld(16, srcbis);
const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
//srcbis += srcStride;
const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
const vec_u8 srcM1b = vec_ld(16, srcbis);
const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
//srcbis += srcStride;
const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
const vec_u8 srcP0b = vec_ld(16, srcbis);
const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
//srcbis += srcStride;
const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
const vec_u8 srcP1b = vec_ld(16, srcbis);
const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
//srcbis += srcStride;
const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
const vec_u8 srcP2b = vec_ld(16, srcbis);
const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
//srcbis += srcStride;
/* Zero-extend each row to 16 bit: "A" holds the first 8 pixels of the row,
   "B" the last 8. */
vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
for (i = 0 ; i < 16 ; i++) {
/* Fetch the one new row (+3) needed for this output row. */
srcP3a = vec_ld(0, srcbis += srcStride);
srcP3b = vec_ld(16, srcbis);
srcP3 = vec_perm(srcP3a, srcP3b, perm);
srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
//srcbis += srcStride;
/* Pair up the symmetric taps: centre (x20), inner (x-5), outer (x1). */
sum1A = vec_adds(srcP0ssA, srcP1ssA);
sum1B = vec_adds(srcP0ssB, srcP1ssB);
sum2A = vec_adds(srcM1ssA, srcP2ssA);
sum2B = vec_adds(srcM1ssB, srcP2ssB);
sum3A = vec_adds(srcM2ssA, srcP3ssA);
sum3B = vec_adds(srcM2ssB, srcP3ssB);
/* Slide the six-row window down by one row for the next iteration; the
   sums above already captured everything this iteration needs. */
srcM2ssA = srcM1ssA;
srcM2ssB = srcM1ssB;
srcM1ssA = srcP0ssA;
srcM1ssB = srcP0ssB;
srcP0ssA = srcP1ssA;
srcP0ssB = srcP1ssB;
srcP1ssA = srcP2ssA;
srcP1ssB = srcP2ssB;
srcP2ssA = srcP3ssA;
srcP2ssB = srcP3ssB;
/* pp1 = 20*sum1 + 16 (rounding term), pp2 = 5*sum2 */
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
psumA = vec_sub(pp3A, pp2A);
psumB = vec_sub(pp3B, pp2B);
/* Arithmetic shift by 5, then pack both halves to 16 unsigned bytes with
   saturation. */
sumA = vec_sra(psumA, v5us);
sumB = vec_sra(psumB, v5us);
sum = vec_packsu(sumA, sumB);
ASSERT_ALIGNED(dst);
vdst = vec_ld(0, dst);
OP_U8_ALTIVEC(fsum, sum, vdst);
vec_st(fsum, 0, dst);
dst += dstStride;
}
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
/*
 * Combined horizontal + vertical 6-tap lowpass over a 16x16 block.
 *
 * Pass 1 filters 21 source rows (rows -2 .. +18 relative to src)
 * horizontally with the (1, -5, 20, 20, -5, 1) kernel and stores the
 * unrounded, unshifted 16-bit results into tmp (two vectors per row).
 * Pass 2 runs the same kernel vertically over tmp in 32-bit precision,
 * adds 512, shifts right by 10, saturates to unsigned 8 bit and combines
 * the result with dst through OP_U8_ALTIVEC (defined by the file that
 * includes this template -- presumably "put" or "avg"; confirm there).
 *
 * dst: destination, accessed with aligned vec_ld/vec_st.
 * tmp: scratch buffer for 21 rows of 16 int16_t, written with aligned vec_st.
 * src: top-left source pixel of the block; may be unaligned.
 * dstStride, srcStride: row distance in bytes.
 * tmpStride: row distance of tmp in int16_t elements (it is added to an
 *            int16_t pointer).
 */
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
register int i;
LOAD_ZERO;
/* Permute vectors for the six horizontal tap positions src-2 .. src+3.
   They are computed once: with srcStride % 16 == 0 every row has the same
   misalignment. */
const vec_u8 permM2 = vec_lvsl(-2, src);
const vec_u8 permM1 = vec_lvsl(-1, src);
const vec_u8 permP0 = vec_lvsl(+0, src);
const vec_u8 permP1 = vec_lvsl(+1, src);
const vec_u8 permP2 = vec_lvsl(+2, src);
const vec_u8 permP3 = vec_lvsl(+3, src);
/* Constants built from 5-bit splat immediates:
   5<<2 = 20, 1<<9 = 512, 1<<4 = 16. */
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u32 v10ui = vec_splat_u32(10);
const vec_s16 v5ss = vec_splat_s16(5);
const vec_s16 v1ss = vec_splat_s16(1);
const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
/* Byte offset of the leftmost tap (src - 2) inside its 16-byte block;
   selects the load strategy in the switch below. */
register int align = ((((unsigned long)src) - 2) % 16);
vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB;
/* Interleaves bytes 0..7 with bytes 8..15; used at the end of pass 2 to put
   the separately computed even/odd results back into pixel order. */
const vec_u8 mperm = (const vec_u8)
{0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
/* Remember the start of tmp; pass 1 advances tmp itself. */
int16_t *tmpbis = tmp;
vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB;
vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo;
vec_u8 fsum, sumv, sum, vdst;
vec_s16 ssume, ssumo;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
/* Pass 1: horizontal filter over 16 + 5 rows, starting two rows above the
   block. */
src -= (2 * srcStride);
for (i = 0 ; i < 21 ; i ++) {
vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vec_u8 srcR1 = vec_ld(-2, src);
vec_u8 srcR2 = vec_ld(14, src);
/* For align <= 10 all six shifted vectors lie inside srcR1:srcR2. From 11
   upwards one tap position coincides exactly with srcR2 (a permute with an
   lvsl vector for an aligned address would select the first operand
   instead), and the taps after it need a third block, srcR3. */
switch (align) {
default: {
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = vec_perm(srcR1, srcR2, permP2);
srcP3 = vec_perm(srcR1, srcR2, permP3);
} break;
case 11: {
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = vec_perm(srcR1, srcR2, permP2);
srcP3 = srcR2;
} break;
case 12: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = srcR2;
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 13: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = srcR2;
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 14: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2;
srcP1 = vec_perm(srcR2, srcR3, permP1);
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 15: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0);
srcP1 = vec_perm(srcR2, srcR3, permP1);
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
}
/* Zero-extend to 16 bit: "A" = first 8 pixels, "B" = last 8 pixels. */
srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
sum2A = vec_adds(srcM1A, srcP2A);
sum2B = vec_adds(srcM1B, srcP2B);
sum3A = vec_adds(srcM2A, srcP3A);
sum3B = vec_adds(srcM2B, srcP3B);
/* psum = 20*sum1 + sum3 - 5*sum2; no rounding or shift here, that happens
   once at the end of pass 2. */
pp1A = vec_mladd(sum1A, v20ss, sum3A);
pp1B = vec_mladd(sum1B, v20ss, sum3B);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
psumA = vec_sub(pp1A, pp2A);
psumB = vec_sub(pp1B, pp2B);
vec_st(psumA, 0, tmp);
vec_st(psumB, 16, tmp);
src += srcStride;
tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
}
/* Pass 2: vertical filter over tmp. Prime the window with rows -2 .. +2. */
tmpM2ssA = vec_ld(0, tmpbis);
tmpM2ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpM1ssA = vec_ld(0, tmpbis);
tmpM1ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP0ssA = vec_ld(0, tmpbis);
tmpP0ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP1ssA = vec_ld(0, tmpbis);
tmpP1ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP2ssA = vec_ld(0, tmpbis);
tmpP2ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
for (i = 0 ; i < 16 ; i++) {
/* These block-local sum1A..sum3B shadow the function-scope variables of
   the same names used in pass 1. */
const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
tmpbis += tmpStride;
/* Slide the six-row window down by one row for the next iteration. */
tmpM2ssA = tmpM1ssA;
tmpM2ssB = tmpM1ssB;
tmpM1ssA = tmpP0ssA;
tmpM1ssB = tmpP0ssB;
tmpP0ssA = tmpP1ssA;
tmpP0ssB = tmpP1ssB;
tmpP1ssA = tmpP2ssA;
tmpP1ssB = tmpP2ssB;
tmpP2ssA = tmpP3ssA;
tmpP2ssB = tmpP3ssB;
/* Widen to 32 bit; even ("e") and odd ("o") 16-bit elements are handled
   in separate vectors. */
pp1Ae = vec_mule(sum1A, v20ss);
pp1Ao = vec_mulo(sum1A, v20ss);
pp1Be = vec_mule(sum1B, v20ss);
pp1Bo = vec_mulo(sum1B, v20ss);
pp2Ae = vec_mule(sum2A, v5ss);
pp2Ao = vec_mulo(sum2A, v5ss);
pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss);
/* sum3 has weight 1: the even elements are the high halfword of each
   32-bit word, so an arithmetic shift by 16 sign-extends them in place;
   the odd elements are widened by multiplying with 1. */
pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
pp3Ao = vec_mulo(sum3A, v1ss);
pp3Be = vec_sra((vec_s32)sum3B, v16ui);
pp3Bo = vec_mulo(sum3B, v1ss);
/* (20*sum1 + 512) + (sum3 - 5*sum2), then >> 10 */
pp1cAe = vec_add(pp1Ae, v512si);
pp1cAo = vec_add(pp1Ao, v512si);
pp1cBe = vec_add(pp1Be, v512si);
pp1cBo = vec_add(pp1Bo, v512si);
pp32Ae = vec_sub(pp3Ae, pp2Ae);
pp32Ao = vec_sub(pp3Ao, pp2Ao);
pp32Be = vec_sub(pp3Be, pp2Be);
pp32Bo = vec_sub(pp3Bo, pp2Bo);
sumAe = vec_add(pp1cAe, pp32Ae);
sumAo = vec_add(pp1cAo, pp32Ao);
sumBe = vec_add(pp1cBe, pp32Be);
sumBo = vec_add(pp1cBo, pp32Bo);
ssumAe = vec_sra(sumAe, v10ui);
ssumAo = vec_sra(sumAo, v10ui);
ssumBe = vec_sra(sumBe, v10ui);
ssumBo = vec_sra(sumBo, v10ui);
/* Saturating packs: 32 -> 16 bit signed, then 16 -> 8 bit unsigned. sumv
   holds the 8 even pixels followed by the 8 odd pixels; mperm interleaves
   them back into pixel order. */
ssume = vec_packs(ssumAe, ssumBe);
ssumo = vec_packs(ssumAo, ssumBo);
sumv = vec_packsu(ssume, ssumo);
sum = vec_perm(sumv, sumv, mperm);
ASSERT_ALIGNED(dst);
vdst = vec_ld(0, dst);
OP_U8_ALTIVEC(fsum, sum, vdst);
vec_st(fsum, 0, dst);
dst += dstStride;
}
POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}

View File

@ -0,0 +1,227 @@
/*
* Copyright (c) 2001 Michel Lespinasse
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* NOTE: This code is based on GPL code from the libmpeg2 project. The
* author, Michel Lespinasse, has given explicit permission to release
* under LGPL as part of ffmpeg.
*/
/*
* FFMpeg integration by Dieter Shirley
*
* This file is a direct copy of the altivec idct module from the libmpeg2
* project. I've deleted all of the libmpeg2 specific code, renamed the functions and
* re-ordered the function parameters. The only change to the IDCT function
* itself was to factor out the partial transposition, and to perform a full
* transpose at the end of the function.
*/
#include <stdlib.h> /* malloc(), free() */
#include <string.h>
#include "libavcodec/dsputil.h"
#include "gcc_fixes.h"
#include "types_altivec.h"
#include "dsputil_ppc.h"
/*
 * One 1-D pass of the 8-point IDCT, applied to all eight columns of the
 * vectors at once.
 *
 * Inputs:  vx0..vx7, plus the splatted constants a0, a1, a2, ma2, c4, mc4
 *          and the vector "zero".
 * Outputs: vy0..vy7.
 * Scratch: t0..t8 (several are deliberately reused between stages, so the
 *          statement order matters).
 *
 * All names must be declared in the expanding scope; the IDCT macro below
 * does that. All additions/subtractions saturate (vec_adds/vec_subs) and all
 * multiplies are vec_mradds (rounded multiply-high, then saturating add).
 */
#define IDCT_HALF \
/* 1st stage */ \
t1 = vec_mradds (a1, vx7, vx1 ); \
t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
t7 = vec_mradds (a2, vx5, vx3); \
t3 = vec_mradds (ma2, vx3, vx5); \
\
/* 2nd stage */ \
t5 = vec_adds (vx0, vx4); \
t0 = vec_subs (vx0, vx4); \
t2 = vec_mradds (a0, vx6, vx2); \
t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
t6 = vec_adds (t8, t3); \
t3 = vec_subs (t8, t3); \
t8 = vec_subs (t1, t7); \
t1 = vec_adds (t1, t7); \
\
/* 3rd stage */ \
t7 = vec_adds (t5, t2); \
t2 = vec_subs (t5, t2); \
t5 = vec_adds (t0, t4); \
t0 = vec_subs (t0, t4); \
t4 = vec_subs (t8, t3); \
t3 = vec_adds (t8, t3); \
\
/* 4th stage */ \
vy0 = vec_adds (t7, t1); \
vy7 = vec_subs (t7, t1); \
vy1 = vec_mradds (c4, t3, t5); \
vy6 = vec_mradds (mc4, t3, t5); \
vy2 = vec_mradds (c4, t4, t0); \
vy5 = vec_mradds (mc4, t4, t0); \
vy3 = vec_adds (t2, t6); \
vy4 = vec_subs (t2, t6);
/*
 * Full 8x8 inverse DCT, expanded inline into the calling function.
 *
 * Expects in the expanding scope:
 *   block     - pointer to eight vec_s16 rows of coefficients
 *   constants - the vec_s16 table defined below
 * Declares all of its working variables itself (so it must be expanded where
 * declarations are allowed) and leaves the result rows in vx0..vx7; "zero"
 * also stays available to the caller.
 *
 * Steps:
 *   1. splat the scalar constants out of constants[0]; "bias" is the fourth
 *      32-bit word of constants[0] (its last two halfwords) replicated;
 *   2. prescale: shift every input row left by 4 and multiply it by its row
 *      of constants[1..4] with vec_mradds;
 *   3. first 1-D pass (IDCT_HALF);
 *   4. transpose the 8x8 matrix with three rounds of mergeh/mergel, adding
 *      "bias" to row 0 in the last round;
 *   5. second 1-D pass (IDCT_HALF);
 *   6. arithmetic shift right by 6 into vx0..vx7.
 */
#define IDCT \
vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \
vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
vec_u16 shift; \
\
c4 = vec_splat (constants[0], 0); \
a0 = vec_splat (constants[0], 1); \
a1 = vec_splat (constants[0], 2); \
a2 = vec_splat (constants[0], 3); \
mc4 = vec_splat (constants[0], 4); \
ma2 = vec_splat (constants[0], 5); \
bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \
\
zero = vec_splat_s16 (0); \
shift = vec_splat_u16 (4); \
\
vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
\
IDCT_HALF \
\
vx0 = vec_mergeh (vy0, vy4); \
vx1 = vec_mergel (vy0, vy4); \
vx2 = vec_mergeh (vy1, vy5); \
vx3 = vec_mergel (vy1, vy5); \
vx4 = vec_mergeh (vy2, vy6); \
vx5 = vec_mergel (vy2, vy6); \
vx6 = vec_mergeh (vy3, vy7); \
vx7 = vec_mergel (vy3, vy7); \
\
vy0 = vec_mergeh (vx0, vx4); \
vy1 = vec_mergel (vx0, vx4); \
vy2 = vec_mergeh (vx1, vx5); \
vy3 = vec_mergel (vx1, vx5); \
vy4 = vec_mergeh (vx2, vx6); \
vy5 = vec_mergel (vx2, vx6); \
vy6 = vec_mergeh (vx3, vx7); \
vy7 = vec_mergel (vx3, vx7); \
\
vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
vx1 = vec_mergel (vy0, vy4); \
vx2 = vec_mergeh (vy1, vy5); \
vx3 = vec_mergel (vy1, vy5); \
vx4 = vec_mergeh (vy2, vy6); \
vx5 = vec_mergel (vy2, vy6); \
vx6 = vec_mergeh (vy3, vy7); \
vx7 = vec_mergel (vy3, vy7); \
\
IDCT_HALF \
\
shift = vec_splat_u16 (6); \
vx0 = vec_sra (vy0, shift); \
vx1 = vec_sra (vy1, shift); \
vx2 = vec_sra (vy2, shift); \
vx3 = vec_sra (vy3, shift); \
vx4 = vec_sra (vy4, shift); \
vx5 = vec_sra (vy5, shift); \
vx6 = vec_sra (vy6, shift); \
vx7 = vec_sra (vy7, shift);
/*
 * Constant table read by the IDCT macro.
 *
 * Row 0: scalar constants, splatted individually:
 *          [0] c4, [1] a0, [2] a1, [3] a2, [4] mc4 (= -c4), [5] ma2 (= -a2);
 *          [6],[7] = 32, 31 are read together as one 32-bit word and
 *          replicated to form the "bias" vector added after the transpose.
 * Rows 1-4: per-column prescale factors. Input rows 0 and 4 use row 1,
 *          rows 1 and 7 use row 2, rows 2 and 6 use row 3, rows 3 and 5
 *          use row 4.
 */
static const vec_s16 constants[5] = {
{23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
{16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
{22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
{21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692},
{19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
};
/**
 * Inverse-transform an 8x8 coefficient block and write the result to dest.
 *
 * The IDCT macro leaves the eight result rows in vx0..vx7. Each row is
 * saturated to unsigned 8 bit and stored as two 32-bit element stores, so
 * the eight destination bytes of a row must lie inside a single aligned
 * 16-byte block.
 *
 * @param dest   top-left destination pixel
 * @param stride distance in bytes between destination rows
 * @param block  eight vectors of 16-bit coefficients
 */
void idct_put_altivec(uint8_t* dest, int stride, vec_s16* block)
{
    POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
    vec_u8 packed;

#if CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#endif

    IDCT

/* Pack one result row to bytes and store its first 8 bytes at "out". */
#define COPY(out,row)                                       \
    packed = vec_packsu (row, row);                         \
    vec_ste ((vec_u32)packed, 0, (unsigned int *)out);      \
    vec_ste ((vec_u32)packed, 4, (unsigned int *)out);

    COPY (dest, vx0)
    dest += stride;
    COPY (dest, vx1)
    dest += stride;
    COPY (dest, vx2)
    dest += stride;
    COPY (dest, vx3)
    dest += stride;
    COPY (dest, vx4)
    dest += stride;
    COPY (dest, vx5)
    dest += stride;
    COPY (dest, vx6)
    dest += stride;
    COPY (dest, vx7)

    POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
}
/**
 * Inverse-transform an 8x8 coefficient block and add it to the pixels
 * already present in dest, saturating to 0..255.
 *
 * Two permute vectors are prepared, one for even rows (alignment of dest)
 * and one for odd rows (alignment of dest + stride). Each one zero-extends
 * the eight destination bytes of a row to 16 bit: the 0xFF index bytes pick
 * a byte out of the all-zero second permute operand.
 *
 * @param dest   top-left destination pixel
 * @param stride distance in bytes between destination rows
 * @param block  eight vectors of 16-bit coefficients
 */
void idct_add_altivec(uint8_t* dest, int stride, vec_s16* block)
{
    POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
    vec_u8 pix;
    vec_s16 wide, total;
    vec_u8 perm_even;
    vec_u8 perm_odd;
    vec_u8 ones;

#if CONFIG_POWERPC_PERF
    POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#endif

    IDCT

    ones      = vec_splat_u8 (-1);
    perm_even = vec_mergeh (ones, vec_lvsl (0, dest));
    perm_odd  = vec_mergeh (ones, vec_lvsl (stride, dest));

/* Load one destination row, widen it, add the IDCT row, pack and store. */
#define ADD(out,row,sel)                                        \
    pix   = vec_ld (0, out);                                    \
    wide  = (vec_s16)vec_perm (pix, (vec_u8)zero, sel);         \
    total = vec_adds (wide, row);                               \
    pix   = vec_packsu (total, total);                          \
    vec_ste ((vec_u32)pix, 0, (unsigned int *)out);             \
    vec_ste ((vec_u32)pix, 4, (unsigned int *)out);

    ADD (dest, vx0, perm_even)
    dest += stride;
    ADD (dest, vx1, perm_odd)
    dest += stride;
    ADD (dest, vx2, perm_even)
    dest += stride;
    ADD (dest, vx3, perm_odd)
    dest += stride;
    ADD (dest, vx4, perm_even)
    dest += stride;
    ADD (dest, vx5, perm_odd)
    dest += stride;
    ADD (dest, vx6, perm_even)
    dest += stride;
    ADD (dest, vx7, perm_odd)

    POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
}

View File

@ -0,0 +1,142 @@
/*
* High quality image resampling with polyphase filters
* Copyright (c) 2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file libavcodec/ppc/imgresample_altivec.c
* High quality image resampling with polyphase filters - AltiVec bits
*/
#include "util_altivec.h"
/* Number of fractional bits in the resampling filter coefficients; the
   scalar paths shift the filtered sum right by this amount. */
#define FILTER_BITS 8
/* Overlay of a vector of eight signed shorts with a plain array, so that
   individual elements can be written from scalar code. */
typedef union {
vector signed short v;
signed short s[8];
} vec_ss;
/**
 * Vertically resample one row of pixels with a 4-tap filter.
 *
 * Every output pixel x in [0, dst_width) is computed from the four source
 * pixels src[x], src[x + wrap], src[x + 2*wrap] and src[x + 3*wrap],
 * weighted by filter[0..3] and clipped to 0..255. The scalar paths compute
 * (sum of products) >> FILTER_BITS; the AltiVec path uses vec_madds, which
 * shifts every product individually before accumulating.
 *
 * Layout of the work:
 *   1. scalar pixels until dst is 16-byte aligned (or the row is finished),
 *   2. 16 pixels per iteration with AltiVec and aligned stores,
 *   3. scalar pixels for what is left.
 *
 * @param dst       destination row, any alignment
 * @param dst_width number of pixels to produce
 * @param src       first of the four source rows, any alignment
 * @param wrap      distance in bytes between source rows
 * @param filter    four filter coefficients with FILTER_BITS fractional bits
 */
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter)
{
    int sum, i;
    const uint8_t *s;
    const uint8_t *row;
    vector unsigned char tmp, dstv, zero;
    vec_ss srchv[4], srclv[4], fv[4];
    vector signed short zeros, sumhv, sumlv;

    s = src;

    for (i = 0; i < 4; i++) {
        /*
           The vec_madds later on does an implicit >>15 on the result.
           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
           a signed short, we have just enough bits to pre-shift our
           filter constants <<7 to compensate for vec_madds.
        */
        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
        fv[i].v = vec_splat(fv[i].v, 0);
    }

    zero = vec_splat_u8(0);
    zeros = vec_splat_s16(0);

    /*
       When we're resampling, we'd ideally like both our input buffers,
       and output buffers to be 16-byte aligned, so we can do both aligned
       reads and writes. Sadly we can't always have this at the moment, so
       we opt for aligned writes, as unaligned writes have a huge overhead.
       To do this, do enough scalar resamples to get dst 16-byte aligned.
    */
    /* Only the low four address bits matter; go through unsigned long so the
       pointer is not truncated to int first. */
    i = (int)((-(unsigned long)dst) & 0xf);
    /* Never produce more than dst_width pixels: a row narrower than the
       distance to the next 16-byte boundary must not be overrun. */
    while (i > 0 && dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum<0) sum = 0; else if (sum>255) sum=255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
        i--;
    }

    /* Do our altivec resampling on 16 pixels at once. */
    while (dst_width >= 16) {
        /* Read 16 (potentially unaligned) bytes from each of
           4 lines into 4 vectors, and split them into shorts.
           Interleave the multiply/accumulate for the resample
           filter with the loads to hide the 3 cycle latency
           the vec_madds have.
           The second load uses offset 15 rather than 16, so that an
           aligned source row does not touch the following 16-byte block. */
        row = &s[0 * wrap];
        tmp = vec_perm(vec_ld(0, row), vec_ld(15, row), vec_lvsl(0, row));
        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);

        row = &s[1 * wrap];
        tmp = vec_perm(vec_ld(0, row), vec_ld(15, row), vec_lvsl(0, row));
        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);

        row = &s[2 * wrap];
        tmp = vec_perm(vec_ld(0, row), vec_ld(15, row), vec_lvsl(0, row));
        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);

        row = &s[3 * wrap];
        tmp = vec_perm(vec_ld(0, row), vec_ld(15, row), vec_lvsl(0, row));
        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);

        /* Pack the results into our destination vector,
           and do an aligned write of that back to memory. */
        dstv = vec_packsu(sumhv, sumlv) ;
        vec_st(dstv, 0, (vector unsigned char *) dst);

        dst+=16;
        s+=16;
        dst_width-=16;
    }

    /* If there are any leftover pixels, resample them
       with the slow scalar method. */
    while (dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum<0) sum = 0; else if (sum>255) sum=255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
}

View File

@ -0,0 +1,26 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_IMGRESAMPLE_ALTIVEC_H
#define AVCODEC_PPC_IMGRESAMPLE_ALTIVEC_H
#include <stdint.h>
/**
 * AltiVec vertical resampler: produces dst_width pixels in dst from four
 * source rows starting at src and spaced wrap bytes apart, weighted by
 * filter[0..3].
 */
void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
int wrap, int16_t *filter);
#endif /* AVCODEC_PPC_IMGRESAMPLE_ALTIVEC_H */

View File

@ -0,0 +1,143 @@
/*
* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
** @file libavcodec/ppc/int_altivec.c
** integer misc ops.
**/
#include "libavcodec/dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_altivec.h"
#include "types_altivec.h"
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
int size) {
int i, size16;
vector signed char vpix1;
vector signed short vpix2, vdiff, vpix1l,vpix1h;
union { vector signed int vscore;
int32_t score[4];
} u;
u.vscore = vec_splat_s32(0);
//
//XXX lazy way, fix it later
#define vec_unaligned_load(b) \
vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b));
size16 = size >> 4;
while(size16) {
// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
//load pix1 and the first batch of pix2
vpix1 = vec_unaligned_load(pix1);
vpix2 = vec_unaligned_load(pix2);
pix2 += 8;
//unpack
vpix1h = vec_unpackh(vpix1);
vdiff = vec_sub(vpix1h, vpix2);
vpix1l = vec_unpackl(vpix1);
// load another batch from pix2
vpix2 = vec_unaligned_load(pix2);
u.vscore = vec_msum(vdiff, vdiff, u.vscore);
vdiff = vec_sub(vpix1l, vpix2);
u.vscore = vec_msum(vdiff, vdiff, u.vscore);
pix1 += 16;
pix2 += 8;
size16--;
}
u.vscore = vec_sums(u.vscore, vec_splat_s32(0));
size %= 16;
for (i = 0; i < size; i++) {
u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
}
return u.score[3];
}
/**
 * In-place element-wise addition: v1[i] += v2[i] for i in [0, order).
 *
 * Works on 8 elements per iteration, so a whole number of 8-element groups
 * is processed (order is rounded up to a multiple of 8).
 *
 * @param v1    destination/first operand; accessed with aligned vec_ld/vec_st
 * @param v2    second operand; may be unaligned
 * @param order number of elements
 */
static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16 vec;

    for (i = 0; i < order; i += 8) {
        /* Unaligned load of v2. The second load uses offset 15, not 16: for
           an aligned v2 it then re-reads the same block instead of touching
           the 16 bytes that follow the data. */
        vec = vec_perm(vec_ld(0, v2), vec_ld(15, v2), vec_lvsl(0, v2));
        vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}
/**
 * In-place element-wise subtraction: v1[i] -= v2[i] for i in [0, order).
 *
 * Works on 8 elements per iteration, so a whole number of 8-element groups
 * is processed (order is rounded up to a multiple of 8).
 *
 * @param v1    destination/minuend; accessed with aligned vec_ld/vec_st
 * @param v2    subtrahend; may be unaligned
 * @param order number of elements
 */
static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16 vec;

    for (i = 0; i < order; i += 8) {
        /* Unaligned load of v2. The second load uses offset 15, not 16: for
           an aligned v2 it then re-reads the same block instead of touching
           the 16 bytes that follow the data. */
        vec = vec_perm(vec_ld(0, v2), vec_ld(15, v2), vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}
/**
 * Scalar product of two int16 vectors with a right shift applied to the
 * partial sums.
 *
 * Per group of 8 elements, vec_msum produces four 32-bit sums of two
 * products each; every such sum is shifted right by "shift" and added to
 * the running total. Note that the shift is applied to pairs of products,
 * not to each product on its own.
 *
 * @param v1    first operand; may be unaligned
 * @param v2    second operand; read with a plain vec_ld, so it is only read
 *              correctly when 16-byte aligned
 * @param order number of elements, processed in groups of 8
 * @param shift right shift, 0..31
 * @return the accumulated (saturating) sum
 */
static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    register vec_u32 shifts;
    DECLARE_ALIGNED_16(int32_t, ires);

    /* Build a vector holding "shift" in every element. Splat immediates are
       only 5 bits wide, so the value is assembled bit by bit (16 as 8 << 1). */
    shifts = zero_u32v;
    if(shift & 0x10) shifts = vec_add(shifts, vec_sl(vec_splat_u32(0x08), vec_splat_u32(0x1)));
    if(shift & 0x08) shifts = vec_add(shifts, vec_splat_u32(0x08));
    if(shift & 0x04) shifts = vec_add(shifts, vec_splat_u32(0x04));
    if(shift & 0x02) shifts = vec_add(shifts, vec_splat_u32(0x02));
    if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));

    for (i = 0; i < order; i += 8) {
        /* Unaligned load of v1; offset 15 keeps an aligned v1 from reading
           the 16 bytes that follow the data. */
        vec1 = vec_perm(vec_ld(0, v1), vec_ld(15, v1), vec_lvsl(0, v1));
        t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        /* The sums are signed, so the shift has to be arithmetic; vec_sr
           would shift in zeros and turn negative sums into large positive
           values. */
        t = vec_sra(t, shifts);
        /* vec_sums adds all four elements of t to element 3 of res. */
        res = vec_sums(t, res);
        v1 += 8;
        v2 += 8;
    }
    /* Replicate the total so that vec_ste stores it whichever element the
       address of ires selects. */
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
/**
 * Install the AltiVec integer helpers of this file into a DSPContext.
 * avctx is not used.
 */
void int_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    /* element-wise int16 arithmetic */
    c->add_int16 = add_int16_altivec;
    c->sub_int16 = sub_int16_altivec;

    /* reductions */
    c->scalarproduct_int16 = scalarproduct_int16_altivec;
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
}

View File

@ -0,0 +1,40 @@
/*
* simple math operations
* Copyright (c) 2001, 2002 Fabrice Bellard
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_MATHOPS_H
#define AVCODEC_PPC_MATHOPS_H
#include "config.h"
#if HAVE_PPC4XX
/* signed 16x16 -> 32 multiply add accumulate: rt += ra * rb, using the
 * PPC4xx maclhw instruction. rt must be an lvalue; it is both read and
 * written (the "0" constraint ties the input to output operand %0).
 * The expansion deliberately carries no trailing semicolon, so that
 * "MAC16(a, b, c);" is exactly one statement and remains safe in an
 * unbraced if/else. */
#define MAC16(rt, ra, rb) \
    __asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb))
/* signed 16x16 -> 32 multiply: evaluates to ra * rb, using the PPC4xx
 * mullhw instruction (GNU statement expression). */
#define MUL16(ra, rb) \
    ({ int __rt; \
       __asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
       __rt; })
#endif
#endif /* AVCODEC_PPC_MATHOPS_H */

View File

@ -0,0 +1,627 @@
/*
* Copyright (c) 2002 Dieter Shirley
*
* dct_unquantize_h263_altivec:
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdlib.h>
#include <stdio.h>
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "gcc_fixes.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
// Swaps two variables (used for altivec registers)
// Both arguments must be lvalues of the same type; each one is evaluated
// twice, so they must not have side effects.
#define SWAP(a,b) \
do { \
__typeof__(a) swap_temp=a; \
a=b; \
b=swap_temp; \
} while (0)
// transposes a matrix consisting of four vectors with four elements each
// The transposition is done in place with two rounds of mergeh/mergel; all
// four arguments must be lvalues of the same vector type (the temporaries
// take their type from a).
#define TRANSPOSE4(a,b,c,d) \
do { \
__typeof__(a) _trans_ach = vec_mergeh(a, c); \
__typeof__(a) _trans_acl = vec_mergel(a, c); \
__typeof__(a) _trans_bdh = vec_mergeh(b, d); \
__typeof__(a) _trans_bdl = vec_mergel(b, d); \
\
a = vec_mergeh(_trans_ach, _trans_bdh); \
b = vec_mergel(_trans_ach, _trans_bdh); \
c = vec_mergeh(_trans_acl, _trans_bdl); \
d = vec_mergel(_trans_acl, _trans_bdl); \
} while (0)
// Loads a four-byte value (int or float) from the target address
// into every element in the target vector. Only works if the
// target address is four-byte aligned (which should be always).
// How it works: vec_ld fetches the aligned 16-byte block containing the
// address, the lvsl permute rotates the addressed element to position 0,
// and vec_splat replicates it.
// Note that the expansion is a bare { } block rather than do { } while (0),
// so "LOAD4(v, p);" expands to a block followed by an empty statement.
#define LOAD4(vec, address) \
{ \
__typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
vec = vec_ld(0, _load_addr); \
vec = vec_perm(vec, vec, _perm_vec); \
vec = vec_splat(vec, 0); \
}
// Brace initializer that repeats one value four times (used with a vector
// cast to build a four-element vector constant).
#define FOUROF(a) {a,a,a,a}
int dct_quantize_altivec(MpegEncContext* s,
DCTELEM* data, int n,
int qscale, int* overflow)
{
int lastNonZero;
vector float row0, row1, row2, row3, row4, row5, row6, row7;
vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
const vector float zero = (const vector float)FOUROF(0.);
// used after quantize step
int oldBaseValue = 0;
// Load the data into the row/alt vectors
{
vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
data0 = vec_ld(0, data);
data1 = vec_ld(16, data);
data2 = vec_ld(32, data);
data3 = vec_ld(48, data);
data4 = vec_ld(64, data);
data5 = vec_ld(80, data);
data6 = vec_ld(96, data);
data7 = vec_ld(112, data);
// Transpose the data before we start
TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
// load the data into floating point vectors. We load
// the high half of each row into the main row vectors
// and the low half into the alt vectors.
row0 = vec_ctf(vec_unpackh(data0), 0);
alt0 = vec_ctf(vec_unpackl(data0), 0);
row1 = vec_ctf(vec_unpackh(data1), 0);
alt1 = vec_ctf(vec_unpackl(data1), 0);
row2 = vec_ctf(vec_unpackh(data2), 0);
alt2 = vec_ctf(vec_unpackl(data2), 0);
row3 = vec_ctf(vec_unpackh(data3), 0);
alt3 = vec_ctf(vec_unpackl(data3), 0);
row4 = vec_ctf(vec_unpackh(data4), 0);
alt4 = vec_ctf(vec_unpackl(data4), 0);
row5 = vec_ctf(vec_unpackh(data5), 0);
alt5 = vec_ctf(vec_unpackl(data5), 0);
row6 = vec_ctf(vec_unpackh(data6), 0);
alt6 = vec_ctf(vec_unpackl(data6), 0);
row7 = vec_ctf(vec_unpackh(data7), 0);
alt7 = vec_ctf(vec_unpackl(data7), 0);
}
// The following block could exist as a separate altivec dct
// function. However, if we put it inline, the DCT data can remain
// in the vector local variables, as floats, which we'll use during the
// quantize step...
{
const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f);
const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f);
const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f);
const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f);
const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f);
const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f);
const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f);
const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f);
const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f);
const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f);
int whichPass, whichHalf;
for(whichPass = 1; whichPass<=2; whichPass++) {
for(whichHalf = 1; whichHalf<=2; whichHalf++) {
vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
vector float tmp10, tmp11, tmp12, tmp13;
vector float z1, z2, z3, z4, z5;
tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];
tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;
// dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
row0 = vec_add(tmp10, tmp11);
// dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
row4 = vec_sub(tmp10, tmp11);
// z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);
// dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
// CONST_BITS-PASS1_BITS);
row2 = vec_madd(tmp13, vec_0_765366865, z1);
// dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
// CONST_BITS-PASS1_BITS);
row6 = vec_madd(tmp12, vec_1_847759065, z1);
z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;
// z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);
// z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
z3 = vec_madd(z3, vec_1_961570560, z5);
// z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
z4 = vec_madd(z4, vec_0_390180644, z5);
// The following adds are rolled into the multiplies above
// z3 = vec_add(z3, z5); // z3 += z5;
// z4 = vec_add(z4, z5); // z4 += z5;
// z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
// Wow! It's actually more efficient to roll this multiply
// into the adds below, even though the multiply gets done twice!
// z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);
// z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
// Same with this one...
// z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);
// tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
// dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));
// tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
// dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));
// tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
// dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));
// tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
// dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));
// Swap the row values with the alts. If this is the first half,
// this sets up the low values to be acted on in the second half.
// If this is the second half, it puts the high values back in
// the row values where they are expected to be when we're done.
SWAP(row0, alt0);
SWAP(row1, alt1);
SWAP(row2, alt2);
SWAP(row3, alt3);
SWAP(row4, alt4);
SWAP(row5, alt5);
SWAP(row6, alt6);
SWAP(row7, alt7);
}
if (whichPass == 1) {
// transpose the data for the second pass
// First, block transpose the upper right with lower left.
SWAP(row4, alt0);
SWAP(row5, alt1);
SWAP(row6, alt2);
SWAP(row7, alt3);
// Now, transpose each block of four
TRANSPOSE4(row0, row1, row2, row3);
TRANSPOSE4(row4, row5, row6, row7);
TRANSPOSE4(alt0, alt1, alt2, alt3);
TRANSPOSE4(alt4, alt5, alt6, alt7);
}
}
}
// perform the quantize step, using the floating point data
// still in the row/alt registers
{
const int* biasAddr;
const vector signed int* qmat;
vector float bias, negBias;
if (s->mb_intra) {
vector signed int baseVector;
// We must cache element 0 in the intra case
// (it needs special handling).
baseVector = vec_cts(vec_splat(row0, 0), 0);
vec_ste(baseVector, 0, &oldBaseValue);
qmat = (vector signed int*)s->q_intra_matrix[qscale];
biasAddr = &(s->intra_quant_bias);
} else {
qmat = (vector signed int*)s->q_inter_matrix[qscale];
biasAddr = &(s->inter_quant_bias);
}
// Load the bias vector (We add 0.5 to the bias so that we're
// rounding when we convert to int, instead of flooring.)
{
vector signed int biasInt;
const vector float negOneFloat = (vector float)FOUROF(-1.0f);
LOAD4(biasInt, biasAddr);
bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT);
negBias = vec_madd(bias, negOneFloat, zero);
}
{
vector float q0, q1, q2, q3, q4, q5, q6, q7;
q0 = vec_ctf(qmat[0], QMAT_SHIFT);
q1 = vec_ctf(qmat[2], QMAT_SHIFT);
q2 = vec_ctf(qmat[4], QMAT_SHIFT);
q3 = vec_ctf(qmat[6], QMAT_SHIFT);
q4 = vec_ctf(qmat[8], QMAT_SHIFT);
q5 = vec_ctf(qmat[10], QMAT_SHIFT);
q6 = vec_ctf(qmat[12], QMAT_SHIFT);
q7 = vec_ctf(qmat[14], QMAT_SHIFT);
row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias),
vec_cmpgt(row0, zero));
row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias),
vec_cmpgt(row1, zero));
row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias),
vec_cmpgt(row2, zero));
row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias),
vec_cmpgt(row3, zero));
row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias),
vec_cmpgt(row4, zero));
row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias),
vec_cmpgt(row5, zero));
row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias),
vec_cmpgt(row6, zero));
row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias),
vec_cmpgt(row7, zero));
q0 = vec_ctf(qmat[1], QMAT_SHIFT);
q1 = vec_ctf(qmat[3], QMAT_SHIFT);
q2 = vec_ctf(qmat[5], QMAT_SHIFT);
q3 = vec_ctf(qmat[7], QMAT_SHIFT);
q4 = vec_ctf(qmat[9], QMAT_SHIFT);
q5 = vec_ctf(qmat[11], QMAT_SHIFT);
q6 = vec_ctf(qmat[13], QMAT_SHIFT);
q7 = vec_ctf(qmat[15], QMAT_SHIFT);
alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
vec_cmpgt(alt0, zero));
alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
vec_cmpgt(alt1, zero));
alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
vec_cmpgt(alt2, zero));
alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
vec_cmpgt(alt3, zero));
alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
vec_cmpgt(alt4, zero));
alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
vec_cmpgt(alt5, zero));
alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
vec_cmpgt(alt6, zero));
alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
vec_cmpgt(alt7, zero));
}
}
// Store the data back into the original block
{
vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));
{
// Clamp for overflow
vector signed int max_q_int, min_q_int;
vector signed short max_q, min_q;
LOAD4(max_q_int, &(s->max_qcoeff));
LOAD4(min_q_int, &(s->min_qcoeff));
max_q = vec_pack(max_q_int, max_q_int);
min_q = vec_pack(min_q_int, min_q_int);
data0 = vec_max(vec_min(data0, max_q), min_q);
data1 = vec_max(vec_min(data1, max_q), min_q);
data2 = vec_max(vec_min(data2, max_q), min_q);
data4 = vec_max(vec_min(data4, max_q), min_q);
data5 = vec_max(vec_min(data5, max_q), min_q);
data6 = vec_max(vec_min(data6, max_q), min_q);
data7 = vec_max(vec_min(data7, max_q), min_q);
}
{
vector bool char zero_01, zero_23, zero_45, zero_67;
vector signed char scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67;
vector signed char negOne = vec_splat_s8(-1);
vector signed char* scanPtr =
(vector signed char*)(s->intra_scantable.inverse);
signed char lastNonZeroChar;
// Determine the largest non-zero index.
zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero),
vec_cmpeq(data1, (vector signed short)zero));
zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero),
vec_cmpeq(data3, (vector signed short)zero));
zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero),
vec_cmpeq(data5, (vector signed short)zero));
zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero),
vec_cmpeq(data7, (vector signed short)zero));
// 64 biggest values
scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01);
scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23);
scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45);
scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67);
// 32 largest values
scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23);
scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67);
// 16 largest values
scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45);
// 8 largest values
scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
vec_mergel(scanIndexes_01, negOne));
// 4 largest values
scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
vec_mergel(scanIndexes_01, negOne));
// 2 largest values
scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
vec_mergel(scanIndexes_01, negOne));
// largest value
scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
vec_mergel(scanIndexes_01, negOne));
scanIndexes_01 = vec_splat(scanIndexes_01, 0);
vec_ste(scanIndexes_01, 0, &lastNonZeroChar);
lastNonZero = lastNonZeroChar;
// While the data is still in vectors we check for the transpose IDCT permute
// and handle it using the vector unit if we can. This is the permute used
// by the altivec idct, so it is common when using the altivec dct.
if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) {
TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
}
vec_st(data0, 0, data);
vec_st(data1, 16, data);
vec_st(data2, 32, data);
vec_st(data3, 48, data);
vec_st(data4, 64, data);
vec_st(data5, 80, data);
vec_st(data6, 96, data);
vec_st(data7, 112, data);
}
}
// special handling of block[0]
if (s->mb_intra) {
if (!s->h263_aic) {
if (n < 4)
oldBaseValue /= s->y_dc_scale;
else
oldBaseValue /= s->c_dc_scale;
}
// Divide by 8, rounding the result
data[0] = (oldBaseValue + 4) >> 3;
}
// We handled the transpose permutation above and we don't
// need to permute the "no" permutation case.
if ((lastNonZero > 0) &&
(s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
(s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) {
ff_block_permute(data, s->dsp.idct_permutation,
s->intra_scantable.scantable, lastNonZero);
}
return lastNonZero;
}
/* AltiVec version of dct_unquantize_h263
this code assumes `block' is 16 bytes-aligned

Dequantizes `block' in place. With qmul = 2 * qscale and
qadd = (qscale - 1) | 1, every non-zero coefficient c at positions
0..nCoeffs becomes c * qmul + qadd (c > 0) or c * qmul - qadd (c < 0);
zero coefficients stay zero.

Intra blocks (s->mb_intra): block[0] is kept out of the qmul/qadd step.
Unless s->h263_aic is set it is first multiplied by s->y_dc_scale (n < 4)
or s->c_dc_scale (otherwise); with h263_aic set, qadd is forced to 0
instead. All 64 coefficients are processed (nCoeffs = 63).

Inter blocks: nCoeffs comes from
s->intra_scantable.raster_end[s->block_last_index[n]]. */
void dct_unquantize_h263_altivec(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1);
int i, level, qmul, qadd;
int nCoeffs;
assert(s->block_last_index[n]>=0);
POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
qadd = (qscale - 1) | 1;
qmul = qscale << 1;
if (s->mb_intra) {
if (!s->h263_aic) {
if (n < 4)
block[0] = block[0] * s->y_dc_scale;
else
block[0] = block[0] * s->c_dc_scale;
}else
qadd = 0;
// i == 1 marks "block[0] must be preserved"; it is only tested at the end
i = 1;
nCoeffs= 63; //does not always use zigzag table
} else {
i = 0;
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
}
{
register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
// qmul, +qadd and -qadd replicated into 8 lanes so they can be
// loaded as whole vectors with vec_ld below
DECLARE_ALIGNED_16(short, qmul8[]) =
{
qmul, qmul, qmul, qmul,
qmul, qmul, qmul, qmul
};
DECLARE_ALIGNED_16(short, qadd8[]) =
{
qadd, qadd, qadd, qadd,
qadd, qadd, qadd, qadd
};
DECLARE_ALIGNED_16(short, nqadd8[]) =
{
-qadd, -qadd, -qadd, -qadd,
-qadd, -qadd, -qadd, -qadd
};
register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
register vector bool short blockv_null, blockv_neg;
// taken after the DC scaling above, so this is the final block[0]
// value for intra blocks; the loops below are free to overwrite it
register short backup_0 = block[0];
register int j = 0;
qmulv = vec_ld(0, qmul8);
qaddv = vec_ld(0, qadd8);
nqaddv = vec_ld(0, nqadd8);
#if 0 // block *is* 16 bytes-aligned, it seems.
// first make sure block[j] is 16 bytes-aligned
for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
level = block[j];
if (level) {
if (level < 0) {
level = level * qmul - qadd;
} else {
level = level * qmul + qadd;
}
block[j] = level;
}
}
#endif
// vectorize all the 16 bytes-aligned blocks
// of 8 elements
for(; (j + 7) <= nCoeffs ; j+=8) {
// j << 1 is the byte offset of block[j] (2-byte elements)
blockv = vec_ld(j << 1, block);
blockv_neg = vec_cmplt(blockv, vczero);
blockv_null = vec_cmpeq(blockv, vczero);
// choose between +qadd or -qadd as the third operand
temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
// multiply & add (block[j..j+7] * qmul [+-] qadd)
temp1 = vec_mladd(blockv, qmulv, temp1);
// put 0 where block[j..j+7] used to have 0
blockv = vec_sel(temp1, blockv, blockv_null);
vec_st(blockv, j << 1, block);
}
// if nCoeffs isn't a multiple of 8, finish the job
// using good old scalar units.
// (we could do it using a truncated vector,
// but I'm not sure it's worth the hassle)
for(; j <= nCoeffs ; j++) {
level = block[j];
if (level) {
if (level < 0) {
level = level * qmul - qadd;
} else {
level = level * qmul + qadd;
}
block[j] = level;
}
}
if (i == 1) {
// cheat. this avoids special-casing the first iteration:
// the loops above started at j = 0 and dequantized block[0] too,
// so restore the value saved in backup_0
block[0] = backup_0;
}
}
POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
}
void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
/**
 * Install the AltiVec implementations into an MpegEncContext.
 *
 * Does nothing when mm_flags lacks FF_MM_ALTIVEC. Otherwise:
 *  - at full resolution (lowres == 0) and with idct_algo set to AUTO or
 *    ALTIVEC, selects the AltiVec IDCT and the transposed IDCT permutation;
 *  - after checking that the quantization matrices and the inverse scan
 *    table are 16-byte aligned, and with dct_algo set to AUTO or ALTIVEC,
 *    selects the AltiVec H.263 dequantizer for intra and inter blocks.
 *
 * A failed alignment check is logged and ends the function early; the IDCT
 * selection made before the check is left in place.
 */
void MPV_common_init_altivec(MpegEncContext *s)
{
    int want_idct, want_dct;

    if (!(mm_flags & FF_MM_ALTIVEC))
        return;

    want_idct = s->avctx->idct_algo == FF_IDCT_AUTO ||
                s->avctx->idct_algo == FF_IDCT_ALTIVEC;
    if (!s->avctx->lowres && want_idct) {
        s->dsp.idct_put              = idct_put_altivec;
        s->dsp.idct_add              = idct_add_altivec;
        s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
    }

    /* The DCT/quantizer code loads these tables with aligned vector loads. */
    if (((long)(s->q_intra_matrix) & 0x0f) ||
        ((long)(s->q_inter_matrix) & 0x0f)) {
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
        return;
    }

    if ((long)(s->intra_scantable.inverse) & 0x0f) {
        av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
        return;
    }

    want_dct = s->avctx->dct_algo == FF_DCT_AUTO ||
               s->avctx->dct_algo == FF_DCT_ALTIVEC;
    if (!want_dct)
        return;

#if 0 /* seems to cause trouble under some circumstances */
    s->dct_quantize = dct_quantize_altivec;
#endif
    s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
    s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
}

View File

@ -0,0 +1,788 @@
/*
* AltiVec-optimized snow DSP utils
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/dsputil.h"
#include "libavcodec/snow.h"
#include "gcc_fixes.h"
#include "dsputil_altivec.h"
#undef NDEBUG
#include <assert.h>
//FIXME remove this replication
// Yields the buffer bound to line `line_num' of `slice_buf': the cached
// (slice_buf)->line[line_num] pointer when it is non-NULL, otherwise the
// result of slice_buffer_load_line(). Both arguments are expanded more than
// once, so they must not have side effects.
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
/**
 * Bind a buffer to line number `line' of the slice buffer `buf'.
 *
 * If the line already has a buffer, that buffer is returned and nothing
 * changes. Otherwise the buffer on top of buf->data_stack is popped,
 * stored in buf->line[line] and returned.
 *
 * @param buf   slice buffer; its data stack must not be empty (asserted)
 * @param line  index into buf->line[]
 * @return the buffer now associated with `line'
 */
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
{
    DWTELEM * buffer;

    assert(buf->data_stack_top >= 0);

    /* Already loaded: hand back the existing buffer instead of popping
     * another one. */
    if (buf->line[line])
        return buf->line[line];

    buffer = buf->data_stack[buf->data_stack_top];
    buf->data_stack_top--;
    buf->line[line] = buffer;

    return buffer;
}
//altivec code
// AltiVec horizontal 9/7 inverse lifting for one line of `width' coefficients.
//
// NOTE: the entire body is wrapped in `#if 0', so as compiled this function
// does nothing and leaves `b' untouched.
//
// The disabled code is organised as four lifting steps ("Lift 0".."Lift 3")
// followed by an interleave of the two half-lines. The inner `#if 0'
// branches hold scalar formulas next to each vector loop.
//
// NOTE(review): the disabled code mixes element types -- Lift 0 uses
// IDWTELEM with `vector signed short' variables, while Lift 1-3 and the
// interleave use DWTELEM and assign `vector signed int *' values to
// vbuf/vtmp, which are declared as `vector signed short *'. It needs to be
// reconciled before the outer `#if 0' can be removed.
void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width)
{
#if 0
const int w2= (width+1)>>1;
DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]);
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
vector signed short t1, t2, x, y, tmp1, tmp2;
vector signed short *vbuf, *vtmp;
vector unsigned char align;
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
IDWTELEM b_0 = b[0];
vector signed short v7 = vec_splat_s16(7);
vbuf = (vector signed short *)b;
tmp1 = vec_ld (0, ref);
align = vec_lvsl (0, ref);
tmp2 = vec_ld (15, ref);
t1 = vec_perm(tmp1, tmp2, align);
for (i=0; i<w_l-15; i+=16) {
#if 0
/* b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3);
b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3);
b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/
b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8);
#else
tmp1 = vec_ld (0, ref+8+i);
tmp2 = vec_ld (15, ref+8+i);
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1, vec_sld(t1,t2,2));
// y = vec_add(vec_add(y,y),y);
tmp1 = vec_ld (0, ref+12+i);
y = vec_add(y, vec_splat_s32(4));
y = vec_sra(y, vec_splat_u32(3));
tmp2 = vec_ld (15, ref+12+i);
*vbuf = vec_sub(*vbuf, y);
t1 = t2;
vbuf++;
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1,vec_sld(t1,t2,4));
y = vec_add(vec_add(y,y),y);
tmp1 = vec_ld (0, ref+12+i);
y = vec_add(y, vec_splat_s32(4));
y = vec_sra(y, vec_splat_u32(3));
tmp2 = vec_ld (15, ref+12+i);
*vbuf = vec_sub(*vbuf, y);
t1=t2;
vbuf++;
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1,vec_sld(t1,t2,4));
y = vec_add(vec_add(y,y),y);
tmp1 = vec_ld (0, ref+16+i);
y = vec_add(y, vec_splat_s32(4));
y = vec_sra(y, vec_splat_u32(3));
tmp2 = vec_ld (15, ref+16+i);
*vbuf = vec_sub(*vbuf, y);
t1=t2;
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1,vec_sld(t1,t2,4));
y = vec_add(vec_add(y,y),y);
vbuf++;
y = vec_add(y, vec_splat_s32(4));
y = vec_sra(y, vec_splat_u32(3));
*vbuf = vec_sub(*vbuf, y);
t1=t2;
vbuf++;
#endif
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
}
{ // Lift 1
DWTELEM * const dst = b+w2;
i = 0;
for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
dst[i] = dst[i] - (b[i] + b[i + 1]);
}
align = vec_lvsl(0, b+i);
tmp1 = vec_ld(0, b+i);
vbuf = (vector signed int*) (dst + i);
tmp2 = vec_ld(15, b+i);
t1 = vec_perm(tmp1, tmp2, align);
for (; i<w_r-3; i+=4) {
#if 0
dst[i] = dst[i] - (b[i] + b[i + 1]);
dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]);
dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]);
dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]);
#else
tmp1 = vec_ld(0, b+4+i);
tmp2 = vec_ld(15, b+4+i);
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1, vec_sld(t1,t2,4));
*vbuf = vec_sub (*vbuf, y);
vbuf++;
t1 = t2;
#endif
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
{ // Lift 2
DWTELEM * const ref = b+w2 - 1;
DWTELEM b_0 = b[0];
vbuf= (vector signed int *) b;
tmp1 = vec_ld (0, ref);
align = vec_lvsl (0, ref);
tmp2 = vec_ld (15, ref);
t1= vec_perm(tmp1, tmp2, align);
i = 0;
for (; i<w_l-15; i+=16) {
#if 0
b[i] = b[i] - (((8 -(ref[i] + ref[i+1])) - (b[i] <<2)) >> 4);
b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4);
b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4);
b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4);
#else
tmp1 = vec_ld (0, ref+4+i);
tmp2 = vec_ld (15, ref+4+i);
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1,vec_sld(t1,t2,4));
y = vec_sub(vec_splat_s32(8),y);
tmp1 = vec_ld (0, ref+8+i);
x = vec_sl(*vbuf,vec_splat_u32(2));
y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
tmp2 = vec_ld (15, ref+8+i);
*vbuf = vec_sub( *vbuf, y);
t1 = t2;
vbuf++;
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1,vec_sld(t1,t2,4));
y = vec_sub(vec_splat_s32(8),y);
tmp1 = vec_ld (0, ref+12+i);
x = vec_sl(*vbuf,vec_splat_u32(2));
y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
tmp2 = vec_ld (15, ref+12+i);
*vbuf = vec_sub( *vbuf, y);
t1 = t2;
vbuf++;
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1,vec_sld(t1,t2,4));
y = vec_sub(vec_splat_s32(8),y);
tmp1 = vec_ld (0, ref+16+i);
x = vec_sl(*vbuf,vec_splat_u32(2));
y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
tmp2 = vec_ld (15, ref+16+i);
*vbuf = vec_sub( *vbuf, y);
t1 = t2;
vbuf++;
t2 = vec_perm(tmp1, tmp2, align);
y = vec_add(t1,vec_sld(t1,t2,4));
y = vec_sub(vec_splat_s32(8),y);
t1 = t2;
x = vec_sl(*vbuf,vec_splat_u32(2));
y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
*vbuf = vec_sub( *vbuf, y);
vbuf++;
#endif
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
}
{ // Lift 3
DWTELEM * const src = b+w2;
vbuf = (vector signed int *)b;
vtmp = (vector signed int *)temp;
i = 0;
align = vec_lvsl(0, src);
for (; i<w_r-3; i+=4) {
#if 0
temp[i] = src[i] - ((-3*(b[i] + b[i+1]))>>1);
temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1);
temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1);
temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1);
#else
tmp1 = vec_ld(0,src+i);
t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4));
tmp2 = vec_ld(15,src+i);
t1 = vec_sub(vec_splat_s32(0),t1); //bad!
t1 = vec_add(t1,vec_add(t1,t1));
t2 = vec_perm(tmp1 ,tmp2 ,align);
t1 = vec_sra(t1,vec_splat_u32(1));
vbuf++;
*vtmp = vec_sub(t2,t1);
vtmp++;
#endif
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1);
}
{
//Interleave
int a;
vector signed int *t = (vector signed int *)temp,
*v = (vector signed int *)b;
snow_interleave_line_header(&i, width, b, temp);
for (; (i & 0xE) != 0xE; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=14; i>=0; i-=16){
a=i/4;
v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]);
v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]);
v[a+1]=vec_mergel(v[a>>1],t[a>>1]);
v[a]=vec_mergeh(v[a>>1],t[a>>1]);
}
}
#endif
}
// AltiVec vertical 9/7 inverse lifting across six lines b0..b5, in place.
//
// For every column i < width the four lifting steps update b4, b3, b2 and
// b1, in that order; each step reads the value the previous step just wrote,
// so the statement order must be kept. b0 and b5 are only read.
//
// The first width/4 groups of four columns are handled with vector
// operations; the remaining width%4 columns use the scalar formulas at the
// bottom, written with the W_* lifting constants.
//
// NOTE(review): b0..b5 are dereferenced through `vector signed int *', so
// they presumably have to be 16-byte aligned -- confirm with the callers.
// NOTE(review): the vector path hard-codes its multipliers, offsets and
// shifts (x3 +4 >>3, x1, +8 >>4, x3 >>1); these presumably equal the
// W_DM/W_DO/W_DS ... W_AM/W_AO/W_AS constants of the scalar tail -- confirm
// against their definitions.
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
{
int i, w4 = width/4;
vector signed int *v0, *v1,*v2,*v3,*v4,*v5;
vector signed int t1, t2;
v0=(vector signed int *)b0;
v1=(vector signed int *)b1;
v2=(vector signed int *)b2;
v3=(vector signed int *)b3;
v4=(vector signed int *)b4;
v5=(vector signed int *)b5;
for (i=0; i< w4;i++) {
#if 0
b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
b3[i] -= ((b2[i] + b4[i]));
b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4;
b1[i] += (3*(b0[i] + b2[i]))>>1;
#else
// v4 -= (3*(v3 + v5) + 4) >> 3
t1 = vec_add(v3[i], v5[i]);
t2 = vec_add(t1, vec_add(t1,t1));
t1 = vec_add(t2, vec_splat_s32(4));
v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3)));
// v3 -= v2 + v4 (uses the v4 just written)
v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i]));
// v2 += ((v1 + v3) + 4*v2 + 8) >> 4
t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i]));
t2 = vec_sl(v2[i], vec_splat_u32(2));
v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4)));
// v1 += (3*(v0 + v2)) >> 1
t1 = vec_add(v0[i], v2[i]);
t2 = vec_add(t1, vec_add(t1,t1));
v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1)));
#endif
}
// scalar tail: i*4 is the first column not covered by the vector loop
for(i*=4; i < width; i++)
{
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
}
/* Loads 16 bytes starting at block[k][y*src_stride] into bk, for k = 3..0.
 * Each load uses the vec_ld(0) / vec_lvsl / vec_ld(15) / vec_perm sequence,
 * so the source addresses do not have to be 16-byte aligned.
 * Expects these names in the expanding scope: block, y, src_stride,
 * tmp1, tmp2, align, b0, b1, b2, b3. */
#define LOAD_BLOCKS \
tmp1 = vec_ld(0, &block[3][y*src_stride]);\
align = vec_lvsl(0, &block[3][y*src_stride]);\
tmp2 = vec_ld(15, &block[3][y*src_stride]);\
\
b3 = vec_perm(tmp1,tmp2,align);\
\
tmp1 = vec_ld(0, &block[2][y*src_stride]);\
align = vec_lvsl(0, &block[2][y*src_stride]);\
tmp2 = vec_ld(15, &block[2][y*src_stride]);\
\
b2 = vec_perm(tmp1,tmp2,align);\
\
tmp1 = vec_ld(0, &block[1][y*src_stride]);\
align = vec_lvsl(0, &block[1][y*src_stride]);\
tmp2 = vec_ld(15, &block[1][y*src_stride]);\
\
b1 = vec_perm(tmp1,tmp2,align);\
\
tmp1 = vec_ld(0, &block[0][y*src_stride]);\
align = vec_lvsl(0, &block[0][y*src_stride]);\
tmp2 = vec_ld(15, &block[0][y*src_stride]);\
\
b0 = vec_perm(tmp1,tmp2,align);
/* Loads 16 bytes of OBMC weights from each of obmc1..obmc4 into ob1..ob4,
 * using the same unaligned-load sequence as LOAD_BLOCKS
 * (vec_ld(0) / vec_lvsl / vec_ld(15) / vec_perm).
 * Expects these names in the expanding scope: obmc1..obmc4, tmp1, tmp2,
 * align, ob1..ob4. */
#define LOAD_OBMCS \
tmp1 = vec_ld(0, obmc1);\
align = vec_lvsl(0, obmc1);\
tmp2 = vec_ld(15, obmc1);\
\
ob1 = vec_perm(tmp1,tmp2,align);\
\
tmp1 = vec_ld(0, obmc2);\
align = vec_lvsl(0, obmc2);\
tmp2 = vec_ld(15, obmc2);\
\
ob2 = vec_perm(tmp1,tmp2,align);\
\
tmp1 = vec_ld(0, obmc3);\
align = vec_lvsl(0, obmc3);\
tmp2 = vec_ld(15, obmc3);\
\
ob3 = vec_perm(tmp1,tmp2,align);\
\
tmp1 = vec_ld(0, obmc4);\
align = vec_lvsl(0, obmc4);\
tmp2 = vec_ld(15, obmc4);\
\
ob4 = vec_perm(tmp1,tmp2,align);
/* interleave logic
 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
 * h <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
 *
 * STEPS_0_1 produces the weighted sums for pixels 0..7 of the current row.
 * The weights are interleaved as (ob1,ob2,ob3,ob4) per pixel and the
 * predictions as (b3,b2,b1,b0) per pixel, so each 32-bit lane of vec_msum
 * receives ob1*b3 + ob2*b2 + ob3*b1 + ob4*b0 for one pixel.
 * v[0] holds pixels 0..3 and v[1] holds pixels 4..7.
 * Expects ob1..ob4, b0..b3, h1, h2, l1, l2, ih, ih1, il, il1 and v in scope.
 */
#define STEPS_0_1\
h1 = (vector unsigned short)\
vec_mergeh(ob1, ob2);\
\
h2 = (vector unsigned short)\
vec_mergeh(ob3, ob4);\
\
ih = (vector unsigned char)\
vec_mergeh(h1,h2);\
\
l1 = (vector unsigned short) vec_mergeh(b3, b2);\
\
ih1 = (vector unsigned char) vec_mergel(h1, h2);\
\
l2 = (vector unsigned short) vec_mergeh(b1, b0);\
\
il = (vector unsigned char) vec_mergeh(l1, l2);\
\
v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
il1 = (vector unsigned char) vec_mergel(l1, l2);\
\
v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
/* Scalar finishing step for one row, applied to the b_w sums in vbuf[].
 * add != 0: adds dst[x + src_x], rounds and shifts right by FRAC_BITS,
 *           saturates to 0..255 (negative -> 0, above 255 -> all ones, which
 *           stores as 255) and writes the byte to dst8[x + y*src_stride].
 * add == 0: subtracts the sum from dst[x + src_x].
 * Expects x, b_w, add, vbuf, dst, dst8, src_x, src_stride and y in scope.
 * The expansion is a complete for statement, so no trailing ';' is needed. */
#define FINAL_STEP_SCALAR\
for(x=0; x<b_w; x++)\
if(add){\
vbuf[x] += dst[x + src_x];\
vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
dst8[x + y*src_stride] = vbuf[x];\
}else{\
dst[x + src_x] -= vbuf[x];\
}
/**
 * OBMC blend of an 8-pixel-wide block; variant with a scalar finishing step.
 * The dispatcher ff_snow_inner_add_yblock_altivec() selects it when
 * (src_x & 15) != 0.
 *
 * For each of the b_h rows the four predictions block[0..3] are weighted
 * with the four OBMC weight rows derived from `obmc' (STEPS_0_1 leaves the
 * eight sums in vbuf[0..7]); FINAL_STEP_SCALAR then either adds the slice
 * buffer line, rounds, saturates and stores bytes to dst8 (add != 0), or
 * subtracts the sums from the slice buffer line (add == 0).
 *
 * The local names below are referenced by the LOAD_BLOCKS, LOAD_OBMCS,
 * STEPS_0_1 and FINAL_STEP_SCALAR macros and must not be renamed.
 */
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                  const int obmc_stride,
                                                  uint8_t * * block, int b_w,
                                                  int b_h, int src_x, int src_y,
                                                  int src_stride, slice_buffer * sb,
                                                  int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    /* vector view of vbuf, written by STEPS_0_1 */
    vector signed int *v = (vector signed int *)vbuf;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);

        //FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_SCALAR
    }
}
/* Counterpart of STEPS_0_1 for pixels 8..15 of the current row: it merges
 * the low halves (vec_mergel) of ob1..ob4 and b0..b3 instead of the high
 * halves, and stores the weighted sums ob1*b3 + ob2*b2 + ob3*b1 + ob4*b0
 * into v[2] (pixels 8..11) and v[3] (pixels 12..15).
 * Expects the same names in scope as STEPS_0_1; v must have room for four
 * vectors. */
#define STEPS_2_3\
h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
\
h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
\
ih = (vector unsigned char) vec_mergeh(h1,h2);\
\
l1 = (vector unsigned short) vec_mergel(b3, b2);\
\
l2 = (vector unsigned short) vec_mergel(b1, b0);\
\
ih1 = (vector unsigned char) vec_mergel(h1,h2);\
\
il = (vector unsigned char) vec_mergeh(l1,l2);\
\
v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
il1 = (vector unsigned char) vec_mergel(l1,l2);\
\
v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
/**
 * OBMC blend of a 16-pixel-wide block; variant with a scalar finishing step.
 * The dispatcher ff_snow_inner_add_yblock_altivec() selects it when
 * b_w == 16 and (src_x & 15) != 0.
 *
 * For each of the b_h rows the four predictions block[0..3] are weighted
 * with the four OBMC weight rows derived from `obmc' (STEPS_0_1 and
 * STEPS_2_3 leave the sixteen sums in vbuf[0..15]); FINAL_STEP_SCALAR then
 * either adds the slice buffer line, rounds, saturates and stores bytes to
 * dst8 (add != 0), or subtracts the sums from the slice buffer line
 * (add == 0).
 *
 * The local names below are referenced by the LOAD_BLOCKS, LOAD_OBMCS,
 * STEPS_0_1, STEPS_2_3 and FINAL_STEP_SCALAR macros and must not be renamed.
 */
static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                   const int obmc_stride,
                                                   uint8_t * * block, int b_w,
                                                   int b_h, int src_x, int src_y,
                                                   int src_stride, slice_buffer * sb,
                                                   int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    /* STEPS_0_1 + STEPS_2_3 always store four vectors (16 ints) through v,
     * whatever b_w is, so the scratch buffer has a fixed size of 16. */
    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_SCALAR
    }
}
/* Vector finishing step for one row; d must point at the 16-byte aligned
 * slice buffer data for this row (dst + src_x).
 *
 * add != 0: v[x] = (v[x] + d[x] + 128) >> 8, then each lane is saturated
 *           like the scalar code does it,
 *               if (val & ~255) val = ~(val >> 31);
 *           i.e. negative lanes become 0 and lanes above 255 become all
 *           ones, which stores as 255. The low byte of every lane is then
 *           written to dst8[x + y*src_stride].
 * add == 0: d[x] -= v[x].
 *
 * Expects x, b_w, add, v, d, vbuf, mask, vs, dst8, src_stride and y in scope.
 *
 * NOTE(review): the rounding constant (1 << 7) and the shift (8) are
 * hard-coded here, while FINAL_STEP_SCALAR uses FRAC_BITS -- confirm that
 * FRAC_BITS is 8 before enabling this path.
 */
#define FINAL_STEP_VEC \
\
    if(add)\
    {\
        for(x=0; x<b_w/4; x++)\
        {\
            v[x] = vec_add(v[x], d[x]);\
            v[x] = vec_sra(vec_add(v[x],\
                                   vec_sl( vec_splat_s32(1),\
                                           vec_splat_u32(7))),\
                           vec_splat_u32(8));\
\
            /* mask = ~255 in every lane (all ones shifted left by 8) */\
            mask = (vector bool int) vec_sl((vector signed int)\
                    vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
            /* mask = val & ~255: non-zero exactly when val is outside 0..255 */\
            mask = (vector bool int) vec_and(v[x],mask);\
\
            /* mask = all ones in the lanes that are already in range */\
            mask = (vector bool int)\
                vec_cmpeq((vector signed int)mask,\
                          (vector signed int)vec_splat_u32(0));\
\
            /* vs = val >> 31, chained as 8 + 8 + 15 because the splat */\
            /* literal cannot encode a shift count of 31 */\
            vs = vec_sra(v[x],vec_splat_u32(8));\
            vs = vec_sra(vs,vec_splat_u32(8));\
            vs = vec_sra(vs,vec_splat_u32(15));\
\
            /* vs = ~(val >> 31): 0 for negative lanes, all ones otherwise */\
            vs = vec_nor(vs,vs);\
\
            /* keep in-range lanes, take the saturated value elsewhere */\
            v[x]= vec_sel(vs,v[x],mask);\
        }\
\
        for(x=0; x<b_w; x++)\
            dst8[x + y*src_stride] = vbuf[x];\
\
    }\
    else\
        for(x=0; x<b_w/4; x++)\
            d[x] = vec_sub(d[x], v[x]);
// OBMC blend of an 8-pixel-wide block; variant with a vector finishing step.
// The dispatcher ff_snow_inner_add_yblock_altivec() selects it when
// b_w == 8 and (src_x & 15) == 0.
//
// For each of the b_h rows the four predictions block[0..3] are weighted
// with the four OBMC weight rows derived from `obmc' (STEPS_0_1 fills
// v[0..1], i.e. vbuf[0..7]); FINAL_STEP_VEC then combines the sums with the
// slice buffer line through the vector pointer d, which is dereferenced
// directly and therefore has to be 16-byte aligned.
//
// The local names below (including mask and vs) are referenced by the
// LOAD_BLOCKS, LOAD_OBMCS, STEPS_0_1 and FINAL_STEP_VEC macros and must not
// be renamed.
static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
const int obmc_stride,
uint8_t * * block, int b_w,
int b_h, int src_x, int src_y,
int src_stride, slice_buffer * sb,
int add, uint8_t * dst8)
{
int y, x;
DWTELEM * dst;
vector bool int mask;
vector signed int vs;
vector unsigned short h1, h2, l1, l2;
vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
vector unsigned char b0,b1,b2,b3;
vector unsigned char ob1,ob2,ob3,ob4;
DECLARE_ALIGNED_16(int, vbuf[16]);
vector signed int *v = (vector signed int *)vbuf, *d;
for(y=0; y<b_h; y++){
//FIXME ugly misuse of obmc_stride
uint8_t *obmc1= obmc + y*obmc_stride;
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
dst = slice_buffer_get_line(sb, src_y + y);
// vector view of this row's slice buffer data, used by FINAL_STEP_VEC
d = (vector signed int *)(dst + src_x);
//FIXME i could avoid some loads!
// load blocks
LOAD_BLOCKS
// load obmcs
LOAD_OBMCS
// steps 0 1
STEPS_0_1
FINAL_STEP_VEC
}
}
/**
 * OBMC blend of a 16-pixel-wide block; variant with a vector finishing step.
 * The dispatcher ff_snow_inner_add_yblock_altivec() selects it when
 * b_w == 16 and (src_x & 15) == 0.
 *
 * For each of the b_h rows the four predictions block[0..3] are weighted
 * with the four OBMC weight rows derived from `obmc' (STEPS_0_1 and
 * STEPS_2_3 fill v[0..3], i.e. vbuf[0..15]); FINAL_STEP_VEC then combines
 * the sums with the slice buffer line through the vector pointer d, which is
 * dereferenced directly and therefore has to be 16-byte aligned.
 *
 * The local names below (including mask and vs) are referenced by the
 * LOAD_BLOCKS, LOAD_OBMCS, STEPS_0_1, STEPS_2_3 and FINAL_STEP_VEC macros
 * and must not be renamed.
 */
static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                     const int obmc_stride,
                                                     uint8_t * * block, int b_w,
                                                     int b_h, int src_x, int src_y,
                                                     int src_stride, slice_buffer * sb,
                                                     int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    /* STEPS_0_1 + STEPS_2_3 always store four vectors (16 ints) through v,
     * whatever b_w is, so the scratch buffer has a fixed size of 16. */
    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        /* vector view of this row's slice buffer data, used by FINAL_STEP_VEC */
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_VEC
    }
}
/**
 * Dispatch an OBMC block add to the matching AltiVec kernel.
 *
 * Block widths of 16 and 8 are handled by the AltiVec kernels: the "_a_"
 * kernels when (src_x & 15) == 0, the other ones when it is not. Every other
 * width is forwarded unchanged to the generic ff_snow_inner_add_yblock().
 * All arguments are passed through as received.
 */
void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                      uint8_t * * block, int b_w, int b_h,
                                      int src_x, int src_y, int src_stride,
                                      slice_buffer * sb, int add,
                                      uint8_t * dst8)
{
    const int misaligned = src_x & 15;

    switch (b_w) {
    case 16:
        if (misaligned)
            inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                   b_w, b_h, src_x, src_y,
                                                   src_stride, sb, add, dst8);
        else
            inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                     b_w, b_h, src_x, src_y,
                                                     src_stride, sb, add, dst8);
        break;
    case 8:
        if (misaligned)
            inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                  b_w, b_h, src_x, src_y,
                                                  src_stride, sb, add, dst8);
        else
            inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                    b_w, b_h, src_x, src_y,
                                                    src_stride, sb, add, dst8);
        break;
    default:
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                 src_y, src_stride, sb, add, dst8);
        break;
    }
}
// Hook for installing the AltiVec snow routines into the DSPContext.
// All three assignments are compiled out by `#if 0', so this function
// currently leaves `c' untouched; `avctx' is not used.
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
#if 0
c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
#endif
}

View File

@ -0,0 +1,46 @@
/*
* Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_TYPES_ALTIVEC_H
#define AVCODEC_PPC_TYPES_ALTIVEC_H
/***********************************************************************
* Vector types
*
* Short aliases for the AltiVec vector types; the suffix gives the
* signedness (u/s) and the element width in bits (8/16/32).
**********************************************************************/
#define vec_u8 vector unsigned char
#define vec_s8 vector signed char
#define vec_u16 vector unsigned short
#define vec_s16 vector signed short
#define vec_u32 vector unsigned int
#define vec_s32 vector signed int
/***********************************************************************
* Null vector
*
* LOAD_ZERO declares a local constant named "zerov" holding an all-zero
* vec_u8. The zero_* macros are casts of that same variable to the other
* vector types, so they can only be used in a scope where LOAD_ZERO has
* been expanded.
**********************************************************************/
#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )
#define zero_u8v (vec_u8) zerov
#define zero_s8v (vec_s8) zerov
#define zero_u16v (vec_u16) zerov
#define zero_s16v (vec_s16) zerov
#define zero_u32v (vec_u32) zerov
#define zero_s32v (vec_s32) zerov
#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */

View File

@ -0,0 +1,105 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file libavcodec/ppc/util_altivec.h
* Contains misc utility macros and inline functions
*/
#ifndef AVCODEC_PPC_UTIL_ALTIVEC_H
#define AVCODEC_PPC_UTIL_ALTIVEC_H
#include <stdint.h>
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
// Byte-index groups used to build register permutation vectors (vcprm).
// WORD_n lists the four byte indices of 32-bit word n (0x00-0x0f);
// WORD_sn (the 's' stands for the _s_econd vector) lists the four byte
// indices of word n offset by 0x10 (0x10-0x1f).
#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f
// vcprm(a,b,c,d): constant byte vector made of WORD_a, WORD_b, WORD_c, WORD_d,
// where each argument is one of 0..3 or s0..s3 (token-pasted onto WORD_).
#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
// vcii(a,b,c,d): constant float vector whose elements are -1. or 1.;
// each argument is 'n' or 'p' (token-pasted onto FLOAT_, defined below).
#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
// vcprmle is used to keep the same indices as in the SSE version.
// It is the same as vcprm with the argument order reversed
// ('le' is Little Endian).
#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
// used to build inverse/identity vectors (vcii)
// n is _n_egative, p is _p_ositive
#define FLOAT_n -1.
#define FLOAT_p 1.
// Transpose 8x8 matrix of 16-bit elements (in-place).
//
// a..h must be modifiable "vector signed short" variables, one per matrix
// row; they are read in the first merge round and overwritten in the last.
// The transpose is done with three rounds of vec_mergeh/vec_mergel, using
// the block-local temporaries A1..H1 and A2..H2.
// The do { } while (0) wrapper makes the macro usable as a single statement.
#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
do { \
vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
\
A1 = vec_mergeh (a, e); \
B1 = vec_mergel (a, e); \
C1 = vec_mergeh (b, f); \
D1 = vec_mergel (b, f); \
E1 = vec_mergeh (c, g); \
F1 = vec_mergel (c, g); \
G1 = vec_mergeh (d, h); \
H1 = vec_mergel (d, h); \
\
A2 = vec_mergeh (A1, E1); \
B2 = vec_mergel (A1, E1); \
C2 = vec_mergeh (B1, F1); \
D2 = vec_mergel (B1, F1); \
E2 = vec_mergeh (C1, G1); \
F2 = vec_mergel (C1, G1); \
G2 = vec_mergeh (D1, H1); \
H2 = vec_mergel (D1, H1); \
\
a = vec_mergeh (A2, E2); \
b = vec_mergel (A2, E2); \
c = vec_mergeh (B2, F2); \
d = vec_mergel (B2, F2); \
e = vec_mergeh (C2, G2); \
f = vec_mergel (C2, G2); \
g = vec_mergeh (D2, H2); \
h = vec_mergel (D2, H2); \
} while (0)
/**
 * \brief Load one 16-byte vector from the address \a src + \a offset,
 *        which does not have to be 16-byte aligned.
 *
 * Two vec_ld loads (at \a offset and \a offset + 15) fetch the vectors
 * surrounding the requested bytes; the vec_lvsl mask for the same address
 * then lets vec_perm pick the wanted 16 bytes out of that pair.
 *
 * \param offset byte offset added to \a src
 * \param src    base address
 * \return the 16 bytes selected by vec_perm
 */
static inline vector unsigned char unaligned_load(int offset, uint8_t *src)
{
    const vector unsigned char lo_part = vec_ld(offset, src);
    const vector unsigned char hi_part = vec_ld(offset + 15, src);
    const vector unsigned char select  = vec_lvsl(offset, src);

    return vec_perm(lo_part, hi_part, select);
}
#endif /* AVCODEC_PPC_UTIL_ALTIVEC_H */

View File

@ -0,0 +1,330 @@
/*
* VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
* Copyright (c) 2006 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/dsputil.h"
#include "gcc_fixes.h"
#include "util_altivec.h"
/*
 * Main steps of the 8x8 transform: one 8-point pass over s0..s7, in place.
 *
 * The expanding scope must provide the temporaries t0..t7 and the shift
 * counts vec_1, vec_2, vec_3 and vec_4 (the callers in this file define
 * vec_N as a splat of N). vec_rnd is added to the two terms built from
 * s0 and s4 only.
 *
 * With those shift counts the first group computes
 *   t0 = 12*(s0+s4) + vec_rnd,  t1 = 12*(s0-s4) + vec_rnd,
 *   t2 = 6*s6 + 16*s2,          t3 = 6*s2 - 16*s6,
 * and combines them into t4..t7. The second group rebuilds t0..t3 from the
 * odd inputs s1, s3, s5, s7 using shifts by 4, 3 and 2 plus one unshifted
 * correction term each. The final group writes the sums t4+t0, t5+t1,
 * t6+t2, t7+t3 to s0..s3 and the matching differences to s7..s4.
 */
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
t0 = vec_sl(vec_add(s0, s4), vec_2); \
t0 = vec_add(vec_sl(t0, vec_1), t0); \
t0 = vec_add(t0, vec_rnd); \
t1 = vec_sl(vec_sub(s0, s4), vec_2); \
t1 = vec_add(vec_sl(t1, vec_1), t1); \
t1 = vec_add(t1, vec_rnd); \
t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
t2 = vec_add(t2, vec_sl(s2, vec_4)); \
t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
t4 = vec_add(t0, t2); \
t5 = vec_add(t1, t3); \
t6 = vec_sub(t1, t3); \
t7 = vec_sub(t0, t2); \
\
t0 = vec_sl(vec_add(s1, s3), vec_4); \
t0 = vec_add(t0, vec_sl(s5, vec_3)); \
t0 = vec_add(t0, vec_sl(s7, vec_2)); \
t0 = vec_add(t0, vec_sub(s5, s3)); \
\
t1 = vec_sl(vec_sub(s1, s5), vec_4); \
t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
t1 = vec_sub(t1, vec_add(s1, s7)); \
\
t2 = vec_sl(vec_sub(s7, s3), vec_4); \
t2 = vec_add(t2, vec_sl(s1, vec_3)); \
t2 = vec_add(t2, vec_sl(s5, vec_2)); \
t2 = vec_add(t2, vec_sub(s1, s7)); \
\
t3 = vec_sl(vec_sub(s5, s7), vec_4); \
t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
t3 = vec_add(t3, vec_sl(s1, vec_2)); \
t3 = vec_sub(t3, vec_add(s3, s5)); \
\
s0 = vec_add(t4, t0); \
s1 = vec_add(t5, t1); \
s2 = vec_add(t6, t2); \
s3 = vec_add(t7, t3); \
s4 = vec_sub(t7, t3); \
s5 = vec_sub(t6, t2); \
s6 = vec_sub(t5, t1); \
s7 = vec_sub(t4, t0); \
}while(0)
/*
 * Scaling after the first 8-point pass: arithmetic right shift of s0..s7,
 * in place, by the count held in vec_3 (which must be in scope).
 */
#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
s0 = vec_sra(s0, vec_3); \
s1 = vec_sra(s1, vec_3); \
s2 = vec_sra(s2, vec_3); \
s3 = vec_sra(s3, vec_3); \
s4 = vec_sra(s4, vec_3); \
s5 = vec_sra(s5, vec_3); \
s6 = vec_sra(s6, vec_3); \
s7 = vec_sra(s7, vec_3); \
}while(0)
/*
 * Scaling after the second 8-point pass: arithmetic right shift of s0..s7,
 * in place, by the count held in vec_7. The last four values (s4..s7)
 * additionally get vec_1s added before the shift; the first four do not.
 * vec_7 and vec_1s must be in scope.
 */
#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
s0 = vec_sra(s0, vec_7); \
s1 = vec_sra(s1, vec_7); \
s2 = vec_sra(s2, vec_7); \
s3 = vec_sra(s3, vec_7); \
s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
}while(0)
/*
 * Main steps of the 4x4 transform: one 4-point pass over s0..s3, in place.
 *
 * The expanding scope must provide the temporaries t0..t3 and the shift
 * counts vec_1..vec_5 (the caller in this file defines vec_N as a splat
 * of N). With those counts the macro computes
 *   t0 = 17*s0 + vec_rnd + 17*s2,   t1 = 17*s0 + vec_rnd - 17*s2,
 *   t2 = 22*s1 + 10*s3,             t3 = 22*s3 - 10*s1,
 * and stores t0+t2, t1-t3, t1+t3, t0-t2 into s0, s1, s2, s3.
 */
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
t1 = vec_add(vec_sl(s0, vec_4), s0); \
t1 = vec_add(t1, vec_rnd); \
t2 = vec_add(vec_sl(s2, vec_4), s2); \
t0 = vec_add(t1, t2); \
t1 = vec_sub(t1, t2); \
t3 = vec_sl(vec_sub(s3, s1), vec_1); \
t3 = vec_add(t3, vec_sl(t3, vec_2)); \
t2 = vec_add(t3, vec_sl(s1, vec_5)); \
t3 = vec_add(t3, vec_sl(s3, vec_3)); \
t3 = vec_add(t3, vec_sl(s3, vec_2)); \
s0 = vec_add(t0, t2); \
s1 = vec_sub(t1, t3); \
s2 = vec_add(t1, t3); \
s3 = vec_sub(t0, t2); \
}while (0)
/*
 * Scaling after a first 4-point pass: arithmetic right shift of s0..s3,
 * in place, by the count held in vec_3 (which must be in scope).
 *
 * Wrapped in do { } while (0), like SHIFT_HOR8, so that the four
 * assignments behave as one statement (e.g. under an unbraced if/else).
 * Invoke it with a trailing semicolon.
 */
#define SHIFT_HOR4(s0, s1, s2, s3) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
} while (0)
/*
 * Scaling after the second 4-point pass: arithmetic right shift of s0..s3,
 * in place, by the count held in vec_7 (which must be in scope).
 *
 * Wrapped in do { } while (0), like SHIFT_VERT8, so that the four
 * assignments behave as one statement (e.g. under an unbraced if/else).
 * Invoke it with a trailing semicolon.
 */
#define SHIFT_VERT4(s0, s1, s2, s3) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
} while (0)
/** Do inverse transform on 8x8 block
*
* The 64 coefficients are read from \a block with vec_ld and the result is
* written back to the same place with vec_st, one 8-element row (16 bytes)
* per vector. Two passes are run, each preceded by a TRANSPOSE8:
*  - pass 1: STEP8 with rounding constant 4, then SHIFT_HOR8 (>> 3)
*  - pass 2: STEP8 with rounding constant 64, then SHIFT_VERT8 (>> 7, with
*            an extra +1 on the last four outputs)
* The arithmetic is done on 32-bit lanes; every 16-bit row is therefore
* split into two halves that are processed separately and re-packed.
*
* @param block 64 coefficients, transformed in place
*              (NOTE(review): vec_ld/vec_st presumably need \a block to be
*              16-byte aligned -- confirm against the callers)
*/
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
// 64, built as 4 << 4 (second-pass rounding constant)
const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
// shift counts used by STEP8 / SHIFT_HOR8 / SHIFT_VERT8
const vector unsigned int vec_7 = vec_splat_u32(7);
const vector unsigned int vec_4 = vec_splat_u32(4);
// first-pass rounding constant
const vector signed int vec_4s = vec_splat_s32(4);
const vector unsigned int vec_3 = vec_splat_u32(3);
const vector unsigned int vec_2 = vec_splat_u32(2);
// extra +1 applied by SHIFT_VERT8 to its last four values
const vector signed int vec_1s = vec_splat_s32(1);
const vector unsigned int vec_1 = vec_splat_u32(1);
// load the eight rows
src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
src2 = vec_ld( 32, block);
src3 = vec_ld( 48, block);
src4 = vec_ld( 64, block);
src5 = vec_ld( 80, block);
src6 = vec_ld( 96, block);
src7 = vec_ld(112, block);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
// widen to 32 bits: vec_unpackl halves go to s0..s7,
// vec_unpackh halves go to s8..sF
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
// first pass on both halves
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
// narrow back to 16 bits; the unpackh half is passed first, mirroring
// the split above
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
// second pass: same split / transform / pack sequence, with the
// second-pass rounding constant and shift
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
// write the eight result rows back over the input
vec_st(src0, 0, block);
vec_st(src1, 16, block);
vec_st(src2, 32, block);
vec_st(src3, 48, block);
vec_st(src4, 64, block);
vec_st(src5, 80, block);
vec_st(src6, 96, block);
vec_st(src7,112, block);
}
/** Do inverse transform on 8x4 part of block
*
* Eight rows are loaded from \a block and run through the 8-point pass
* (STEP8 with rounding constant 4, SHIFT_HOR8). After the second
* transpose only src0..src3 are used: they go through the 4-point pass
* (STEP4 with rounding constant 64, SHIFT_VERT4). The four resulting rows
* are then added, with saturation, to four 8-pixel rows of \a dest.
* \a block itself is only read.
*
* @param dest   first destination row; 8 bytes are updated in each of 4 rows
* @param stride byte distance between consecutive destination rows
* @param block  input coefficients (loaded with vec_ld at offsets 0..112)
*/
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
// 64, built as 4 << 4 (second-pass rounding constant)
const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
// shift counts used by STEP8 / STEP4 / SHIFT_HOR8 / SHIFT_VERT4
const vector unsigned int vec_7 = vec_splat_u32(7);
const vector unsigned int vec_5 = vec_splat_u32(5);
const vector unsigned int vec_4 = vec_splat_u32(4);
// first-pass rounding constant
const vector signed int vec_4s = vec_splat_s32(4);
const vector unsigned int vec_3 = vec_splat_u32(3);
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector unsigned int vec_1 = vec_splat_u32(1);
// scratch variables used by the ADD macro below
vector unsigned char tmp;
vector signed short tmp2, tmp3;
vector unsigned char perm0, perm1, p0, p1, p;
// load the eight rows
src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
src2 = vec_ld( 32, block);
src3 = vec_ld( 48, block);
src4 = vec_ld( 64, block);
src5 = vec_ld( 80, block);
src6 = vec_ld( 96, block);
src7 = vec_ld(112, block);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
// widen to 32 bits: vec_unpackl halves go to s0..s7,
// vec_unpackh halves go to s8..sF
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
// 8-point pass on both halves
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
// 4-point pass: only the first four transposed rows are used. Note the
// halves are assigned the other way round than above (unpackh -> s0..s3,
// unpackl -> s8..sB), matching the vec_pack(s0, s8) order further down.
s0 = vec_unpackh(src0);
s1 = vec_unpackh(src1);
s2 = vec_unpackh(src2);
s3 = vec_unpackh(src3);
s8 = vec_unpackl(src0);
s9 = vec_unpackl(src1);
sA = vec_unpackl(src2);
sB = vec_unpackl(src3);
STEP4(s0, s1, s2, s3, vec_64);
SHIFT_VERT4(s0, s1, s2, s3);
STEP4(s8, s9, sA, sB, vec_64);
SHIFT_VERT4(s8, s9, sA, sB);
src0 = vec_pack(s0, s8);
src1 = vec_pack(s1, s9);
src2 = vec_pack(s2, sA);
src3 = vec_pack(s3, sB);
// Permute masks for ADD: the first eight alignment indices of
// dest (p0) and dest + stride (p1), each interleaved with an 0xFF byte.
// ADD uses them in vec_perm against a zero vector, presumably so that
// every destination byte ends up in its own 16-bit lane.
// NOTE(review): perm0 is reused for row 2 and perm1 for row 3, which
// assumes dest + 2*stride has the same alignment as dest -- confirm.
p0 = vec_lvsl (0, dest);
p1 = vec_lvsl (stride, dest);
p = vec_splat_u8 (-1);
perm0 = vec_mergeh (p, p0);
perm1 = vec_mergeh (p, p1);
// ADD(dest, src, perm): load the vector containing dest, spread its
// pixels into 16-bit lanes via perm, add src with saturation (vec_adds),
// pack back to unsigned bytes with saturation (vec_packsu) and store two
// 32-bit words at dest and dest + 4.
#define ADD(dest,src,perm) \
/* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
tmp = vec_ld (0, dest); \
tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \
tmp3 = vec_adds (tmp2, src); \
tmp = vec_packsu (tmp3, tmp3); \
vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);
ADD (dest, src0, perm0) dest += stride;
ADD (dest, src1, perm1) dest += stride;
ADD (dest, src2, perm0) dest += stride;
ADD (dest, src3, perm1)
}
/**
 * Install the AltiVec VC-1 inverse-transform routines into \a dsp.
 *
 * Only the 8x8 and 8x4 function pointers are set; \a avctx is not
 * consulted.
 */
void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx)
{
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
    dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
}

View File

@ -0,0 +1,49 @@
# Builds libavcodec_sparc.a, the SPARC VIS optimized routines of libavcodec.
SubDir HAIKU_TOP src add-ons media plugins avcodec libavcodec sparc ;
# Header directories handed to SubDirHdrs: the parent libavcodec directory,
# the avcodec plugin directory above it, and its libavutil and libswscale
# subdirectories.
SubDirHdrs [ FDirName $(SUBDIR) .. ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../.. ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../../libavutil ] ;
SubDirHdrs [ FDirName $(SUBDIR) ../../libswscale ] ;
# filter warnings we don't want here
TARGET_WARNING_CCFLAGS = [ FFilter $(TARGET_WARNING_CCFLAGS)
: -Wall -Wmissing-prototypes -Wsign-compare -Wpointer-arith ] ;
# Compiler flags depend on the GCC major version: -fno-pic for GCC >= 3,
# -DPIC otherwise.
if $(HAIKU_GCC_VERSION[1]) >= 3 {
SubDirCcFlags -fomit-frame-pointer -fno-pic ;
} else {
SubDirCcFlags -fomit-frame-pointer -DPIC ;
}
# Architecture/feature macros passed on the command line. The block covers
# x86, ppc and sparc and is the same as the one in the ppc Jamfile; for any
# other TARGET_ARCH only HAVE_AV_CONFIG_H is defined.
local defines ;
defines = HAVE_AV_CONFIG_H=1 ;
if $(TARGET_ARCH) = x86 {
defines += ARCH_X86=1 ARCH_X86_32=1 ARCH_PPC=0 ARCH_SPARC=0 ;
defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
# NOTE(review): HAVE_SSE=0 together with HAVE_SSE3=1 looks inconsistent --
# confirm against the main libavcodec Jamfile.
defines += HAVE_MMX=1 HAVE_MMX2=1 HAVE_SSE=0 HAVE_SSE3=1 ;
defines += HAVE_ALTIVEC=0 ;
defines += HAVE_VIS=0 ;
} else if $(TARGET_ARCH) = ppc {
defines += ARCH_X86=0 ARCH_X86_32=0 ARCH_PPC=1 ARCH_SPARC=0 ;
defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
defines += HAVE_MMX=0 HAVE_MMX2=0 HAVE_SSE=0 HAVE_SSE3=0 ;
defines += HAVE_ALTIVEC=1 ;
defines += HAVE_VIS=0 ;
} else if $(TARGET_ARCH) = sparc {
defines += ARCH_X86=0 ARCH_X86_32=0 ARCH_PPC=0 ARCH_SPARC=1 ;
defines += HAVE_AMD3DNOW=0 HAVE_AMD3DNOWEXT=0 ;
defines += HAVE_MMX=0 HAVE_MMX2=0 HAVE_SSE=0 HAVE_SSE3=0 ;
defines += HAVE_ALTIVEC=0 ;
defines += HAVE_VIS=1 ;
}
# Turn the NAME=VALUE list into compiler options and apply it to both the
# C and the C++ flags of this directory.
defines = [ FDefines $(defines) ] ;
SubDirCcFlags $(defines) ;
SubDirC++Flags $(defines) ;
# Sources that make up the static library.
StaticLibrary libavcodec_sparc.a :
dsputil_vis.c
simple_idct_vis.c
;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,528 @@
/*
* SPARC VIS optimized inverse DCT
* Copyright (c) 2007 Denes Balatoni < dbalatoni XatX interware XdotX hu >
*
* I did consult the following fine web page about dct
* http://www.geocities.com/ssavekar/dct.htm
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/dsputil.h"
/*
 * Constant tables for the VIS IDCT below. Every table repeats one 16-bit
 * value four times per 8-byte row, so that a single "ldd" fills all four
 * 16-bit lanes of a double register with it.
 */
/* Seven rows of multiplier constants; INIT_IDCT loads row i (byte offset
 * 8*i from its operand %1, which the callers bind to this table) into
 * %f32 + 2*i, i.e. %f32..%f44. */
static const DECLARE_ALIGNED_8(int16_t, coeffs[28]) = {
- 1259,- 1259,- 1259,- 1259,
- 4989,- 4989,- 4989,- 4989,
-11045,-11045,-11045,-11045,
-19195,-19195,-19195,-19195,
-29126,-29126,-29126,-29126,
25080, 25080, 25080, 25080,
12785, 12785, 12785, 12785
};
/* 65536>>6 = 1024 in every lane; passed as operand %0 by
 * ff_simple_idct_vis, where it ends up in %f46 and is used by SCALEROWS. */
static const DECLARE_ALIGNED_8(uint16_t, scale[4]) = {
65536>>6, 65536>>6, 65536>>6, 65536>>6
};
/* 1<<5 = 32 in every lane; passed as operand %0 by the put/add variants,
 * where it ends up in %f46 and is added by their ADDROUNDER definition. */
static const DECLARE_ALIGNED_8(uint16_t, rounder[4]) = {
1<<5, 1<<5, 1<<5, 1<<5
};
/* 1<<14 in every lane; passed as operand %5 by ff_simple_idct_add_vis and
 * loaded into %f18 by ADDPIXELSCLAMPED as the fmul8x16 multiplier. */
static const DECLARE_ALIGNED_8(uint16_t, expand[4]) = {
1<<14, 1<<14, 1<<14, 1<<14
};
/*
 * Asm template fragment: common set-up for all three IDCT entry points.
 *
 * Loads seven doublewords from operand %1 (offsets 0..48) into %f32..%f44,
 * one doubleword from operand %0 into %f46, and zeroes %f62.
 *
 * The last line deliberately has no trailing backslash, so the macro ends
 * here regardless of what follows it in the file.
 */
#define INIT_IDCT \
    "ldd [%1], %%f32 \n\t"\
    "ldd [%1+8], %%f34 \n\t"\
    "ldd [%1+16], %%f36 \n\t"\
    "ldd [%1+24], %%f38 \n\t"\
    "ldd [%1+32], %%f40 \n\t"\
    "ldd [%1+40], %%f42 \n\t"\
    "ldd [%1+48], %%f44 \n\t"\
    "ldd [%0], %%f46 \n\t"\
    "fzero %%f62 \n\t"
/*
 * Asm template fragment: load and pre-scale one 4-wide half of the input.
 *
 * Loads eight doublewords from address expression `in` at a 16-byte
 * stride (offsets 0, 16, ..., 112) into %f0..%f14, then adds every register
 * to itself with fpadd16 four times in a row, i.e. multiplies each 16-bit
 * lane by 16.
 *
 * `in` must be a string literal usable inside "[...]", e.g. "%2+8".
 * The last line deliberately has no trailing backslash, so the macro ends
 * here regardless of what follows it in the file.
 */
#define LOADSCALE(in) \
    "ldd [" in "], %%f0 \n\t"\
    "ldd [" in "+16], %%f2 \n\t"\
    "ldd [" in "+32], %%f4 \n\t"\
    "ldd [" in "+48], %%f6 \n\t"\
    "ldd [" in "+64], %%f8 \n\t"\
    "ldd [" in "+80], %%f10 \n\t"\
    "ldd [" in "+96], %%f12 \n\t"\
    "ldd [" in "+112], %%f14 \n\t"\
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"\
    \
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"\
    \
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"\
    \
    "fpadd16 %%f0, %%f0, %%f0 \n\t"\
    "fpadd16 %%f2, %%f2, %%f2 \n\t"\
    "fpadd16 %%f4, %%f4, %%f4 \n\t"\
    "fpadd16 %%f6, %%f6, %%f6 \n\t"\
    "fpadd16 %%f8, %%f8, %%f8 \n\t"\
    "fpadd16 %%f10, %%f10, %%f10 \n\t"\
    "fpadd16 %%f12, %%f12, %%f12 \n\t"\
    "fpadd16 %%f14, %%f14, %%f14 \n\t"
/*
 * Asm template fragment: load eight consecutive doublewords from address
 * expression `in` (offsets 0, 8, ..., 56) into %f16..%f30.
 *
 * `in` must be a string literal usable inside "[...]", e.g. "%2+64".
 * The last line deliberately has no trailing backslash, so the macro ends
 * here regardless of what follows it in the file.
 */
#define LOAD(in) \
    "ldd [" in "], %%f16 \n\t"\
    "ldd [" in "+8], %%f18 \n\t"\
    "ldd [" in "+16], %%f20 \n\t"\
    "ldd [" in "+24], %%f22 \n\t"\
    "ldd [" in "+32], %%f24 \n\t"\
    "ldd [" in "+40], %%f26 \n\t"\
    "ldd [" in "+48], %%f28 \n\t"\
    "ldd [" in "+56], %%f30 \n\t"
/*
 * Asm template fragment: rearrange the data held in %f16..%f31 with three
 * rounds of fpmerge. The first round writes %f0..%f14, the second writes
 * back to %f16..%f30, and the third leaves the final result in %f0..%f14
 * (which is where IDCT4ROWS expects its input).
 *
 * The last line deliberately has no trailing backslash, so the macro ends
 * here regardless of what follows it in the file.
 */
#define TRANSPOSE \
    "fpmerge %%f16, %%f24, %%f0 \n\t"\
    "fpmerge %%f20, %%f28, %%f2 \n\t"\
    "fpmerge %%f17, %%f25, %%f4 \n\t"\
    "fpmerge %%f21, %%f29, %%f6 \n\t"\
    "fpmerge %%f18, %%f26, %%f8 \n\t"\
    "fpmerge %%f22, %%f30, %%f10 \n\t"\
    "fpmerge %%f19, %%f27, %%f12 \n\t"\
    "fpmerge %%f23, %%f31, %%f14 \n\t"\
    \
    "fpmerge %%f0, %%f2, %%f16 \n\t"\
    "fpmerge %%f1, %%f3, %%f18 \n\t"\
    "fpmerge %%f4, %%f6, %%f20 \n\t"\
    "fpmerge %%f5, %%f7, %%f22 \n\t"\
    "fpmerge %%f8, %%f10, %%f24 \n\t"\
    "fpmerge %%f9, %%f11, %%f26 \n\t"\
    "fpmerge %%f12, %%f14, %%f28 \n\t"\
    "fpmerge %%f13, %%f15, %%f30 \n\t"\
    \
    "fpmerge %%f16, %%f17, %%f0 \n\t"\
    "fpmerge %%f18, %%f19, %%f2 \n\t"\
    "fpmerge %%f20, %%f21, %%f4 \n\t"\
    "fpmerge %%f22, %%f23, %%f6 \n\t"\
    "fpmerge %%f24, %%f25, %%f8 \n\t"\
    "fpmerge %%f26, %%f27, %%f10 \n\t"\
    "fpmerge %%f28, %%f29, %%f12 \n\t"\
    "fpmerge %%f30, %%f31, %%f14 \n\t"
/*
 * Asm template fragment: the IDCT butterfly for four rows at once.
 *
 * Input:  %f0..%f14 (as left by LOADSCALE or TRANSPOSE), the constants in
 *         %f32..%f44 and the zero in %f62 set up by INIT_IDCT.
 * Output: sums in %f16, %f20, %f24, %f28 and differences in %f48, %f50,
 *         %f52, %f54 (see "final butterfly").
 *
 * ADDROUNDER must be defined (possibly empty) wherever this macro is
 * expanded; it is inserted after the first group of multiplies.
 *
 * Each product is assembled from an fmul8ulx16 / fmul8sux16 pair whose
 * results are both accumulated. For the "2.", "3." and "4. column"
 * sections the OR of the two input registers ("for") is compared against
 * the zero register %f62 and the section is branched over (fbe to local
 * label 3, 4 or 5) when they compare equal. The instruction that follows
 * each fbe belongs to the previous section and sits in the branch delay
 * slot, so the statement order must not be changed.
 *
 * The last line deliberately has no trailing backslash, so the macro ends
 * here regardless of what follows it in the file.
 */
#define IDCT4ROWS \
    /* 1. column */\
    "fmul8ulx16 %%f0, %%f38, %%f28 \n\t"\
    "for %%f4, %%f6, %%f60 \n\t"\
    "fmul8ulx16 %%f2, %%f32, %%f18 \n\t"\
    "fmul8ulx16 %%f2, %%f36, %%f22 \n\t"\
    "fmul8ulx16 %%f2, %%f40, %%f26 \n\t"\
    "fmul8ulx16 %%f2, %%f44, %%f30 \n\t"\
    \
    ADDROUNDER\
    \
    "fmul8sux16 %%f0, %%f38, %%f48 \n\t"\
    "fcmpd %%fcc0, %%f62, %%f60 \n\t"\
    "for %%f8, %%f10, %%f60 \n\t"\
    "fmul8sux16 %%f2, %%f32, %%f50 \n\t"\
    "fmul8sux16 %%f2, %%f36, %%f52 \n\t"\
    "fmul8sux16 %%f2, %%f40, %%f54 \n\t"\
    "fmul8sux16 %%f2, %%f44, %%f56 \n\t"\
    \
    "fpadd16 %%f48, %%f28, %%f28 \n\t"\
    "fcmpd %%fcc1, %%f62, %%f60 \n\t"\
    "for %%f12, %%f14, %%f60 \n\t"\
    "fpadd16 %%f50, %%f18, %%f18 \n\t"\
    "fpadd16 %%f52, %%f22, %%f22 \n\t"\
    "fpadd16 %%f54, %%f26, %%f26 \n\t"\
    "fpadd16 %%f56, %%f30, %%f30 \n\t"\
    \
    "fpadd16 %%f28, %%f0, %%f16 \n\t"\
    "fcmpd %%fcc2, %%f62, %%f60 \n\t"\
    "fpadd16 %%f28, %%f0, %%f20 \n\t"\
    "fpadd16 %%f28, %%f0, %%f24 \n\t"\
    "fpadd16 %%f28, %%f0, %%f28 \n\t"\
    "fpadd16 %%f18, %%f2, %%f18 \n\t"\
    "fpadd16 %%f22, %%f2, %%f22 \n\t"\
    /* 2. column */\
    "fbe %%fcc0, 3f \n\t"\
    "fpadd16 %%f26, %%f2, %%f26 \n\t"\
    "fmul8ulx16 %%f4, %%f34, %%f48 \n\t"\
    "fmul8ulx16 %%f4, %%f42, %%f50 \n\t"\
    "fmul8ulx16 %%f6, %%f36, %%f52 \n\t"\
    "fmul8ulx16 %%f6, %%f44, %%f54 \n\t"\
    "fmul8ulx16 %%f6, %%f32, %%f56 \n\t"\
    "fmul8ulx16 %%f6, %%f40, %%f58 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpadd16 %%f20, %%f50, %%f20 \n\t"\
    "fpsub16 %%f24, %%f50, %%f24 \n\t"\
    "fpsub16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f52, %%f18 \n\t"\
    "fpsub16 %%f22, %%f54, %%f22 \n\t"\
    "fpsub16 %%f26, %%f56, %%f26 \n\t"\
    "fpsub16 %%f30, %%f58, %%f30 \n\t"\
    \
    "fmul8sux16 %%f4, %%f34, %%f48 \n\t"\
    "fmul8sux16 %%f4, %%f42, %%f50 \n\t"\
    "fmul8sux16 %%f6, %%f36, %%f52 \n\t"\
    "fmul8sux16 %%f6, %%f44, %%f54 \n\t"\
    "fmul8sux16 %%f6, %%f32, %%f56 \n\t"\
    "fmul8sux16 %%f6, %%f40, %%f58 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpadd16 %%f20, %%f50, %%f20 \n\t"\
    "fpsub16 %%f24, %%f50, %%f24 \n\t"\
    "fpsub16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f52, %%f18 \n\t"\
    "fpsub16 %%f22, %%f54, %%f22 \n\t"\
    "fpsub16 %%f26, %%f56, %%f26 \n\t"\
    "fpsub16 %%f30, %%f58, %%f30 \n\t"\
    \
    "fpadd16 %%f16, %%f4, %%f16 \n\t"\
    "fpsub16 %%f28, %%f4, %%f28 \n\t"\
    "fpadd16 %%f18, %%f6, %%f18 \n\t"\
    "fpsub16 %%f26, %%f6, %%f26 \n\t"\
    /* 3. column */\
    "3: \n\t"\
    "fbe %%fcc1, 4f \n\t"\
    "fpsub16 %%f30, %%f6, %%f30 \n\t"\
    "fmul8ulx16 %%f8, %%f38, %%f48 \n\t"\
    "fmul8ulx16 %%f10, %%f40, %%f50 \n\t"\
    "fmul8ulx16 %%f10, %%f32, %%f52 \n\t"\
    "fmul8ulx16 %%f10, %%f44, %%f54 \n\t"\
    "fmul8ulx16 %%f10, %%f36, %%f56 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpsub16 %%f20, %%f48, %%f20 \n\t"\
    "fpsub16 %%f24, %%f48, %%f24 \n\t"\
    "fpadd16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f50, %%f18 \n\t"\
    "fpsub16 %%f22, %%f52, %%f22 \n\t"\
    "fpadd16 %%f26, %%f54, %%f26 \n\t"\
    "fpadd16 %%f30, %%f56, %%f30 \n\t"\
    \
    "fmul8sux16 %%f8, %%f38, %%f48 \n\t"\
    "fmul8sux16 %%f10, %%f40, %%f50 \n\t"\
    "fmul8sux16 %%f10, %%f32, %%f52 \n\t"\
    "fmul8sux16 %%f10, %%f44, %%f54 \n\t"\
    "fmul8sux16 %%f10, %%f36, %%f56 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpsub16 %%f20, %%f48, %%f20 \n\t"\
    "fpsub16 %%f24, %%f48, %%f24 \n\t"\
    "fpadd16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f50, %%f18 \n\t"\
    "fpsub16 %%f22, %%f52, %%f22 \n\t"\
    "fpadd16 %%f26, %%f54, %%f26 \n\t"\
    "fpadd16 %%f30, %%f56, %%f30 \n\t"\
    \
    "fpadd16 %%f16, %%f8, %%f16 \n\t"\
    "fpsub16 %%f20, %%f8, %%f20 \n\t"\
    "fpsub16 %%f24, %%f8, %%f24 \n\t"\
    "fpadd16 %%f28, %%f8, %%f28 \n\t"\
    "fpadd16 %%f18, %%f10, %%f18 \n\t"\
    "fpsub16 %%f22, %%f10, %%f22 \n\t"\
    /* 4. column */\
    "4: \n\t"\
    "fbe %%fcc2, 5f \n\t"\
    "fpadd16 %%f30, %%f10, %%f30 \n\t"\
    "fmul8ulx16 %%f12, %%f42, %%f48 \n\t"\
    "fmul8ulx16 %%f12, %%f34, %%f50 \n\t"\
    "fmul8ulx16 %%f14, %%f44, %%f52 \n\t"\
    "fmul8ulx16 %%f14, %%f40, %%f54 \n\t"\
    "fmul8ulx16 %%f14, %%f36, %%f56 \n\t"\
    "fmul8ulx16 %%f14, %%f32, %%f58 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpsub16 %%f20, %%f50, %%f20 \n\t"\
    "fpadd16 %%f24, %%f50, %%f24 \n\t"\
    "fpsub16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f52, %%f18 \n\t"\
    "fpsub16 %%f22, %%f54, %%f22 \n\t"\
    "fpadd16 %%f26, %%f56, %%f26 \n\t"\
    "fpsub16 %%f30, %%f58, %%f30 \n\t"\
    \
    "fmul8sux16 %%f12, %%f42, %%f48 \n\t"\
    "fmul8sux16 %%f12, %%f34, %%f50 \n\t"\
    "fmul8sux16 %%f14, %%f44, %%f52 \n\t"\
    "fmul8sux16 %%f14, %%f40, %%f54 \n\t"\
    "fmul8sux16 %%f14, %%f36, %%f56 \n\t"\
    "fmul8sux16 %%f14, %%f32, %%f58 \n\t"\
    \
    "fpadd16 %%f16, %%f48, %%f16 \n\t"\
    "fpsub16 %%f20, %%f50, %%f20 \n\t"\
    "fpadd16 %%f24, %%f50, %%f24 \n\t"\
    "fpsub16 %%f28, %%f48, %%f28 \n\t"\
    "fpadd16 %%f18, %%f52, %%f18 \n\t"\
    "fpsub16 %%f22, %%f54, %%f22 \n\t"\
    "fpadd16 %%f26, %%f56, %%f26 \n\t"\
    "fpsub16 %%f30, %%f58, %%f30 \n\t"\
    \
    "fpsub16 %%f20, %%f12, %%f20 \n\t"\
    "fpadd16 %%f24, %%f12, %%f24 \n\t"\
    "fpsub16 %%f22, %%f14, %%f22 \n\t"\
    "fpadd16 %%f26, %%f14, %%f26 \n\t"\
    "fpsub16 %%f30, %%f14, %%f30 \n\t"\
    /* final butterfly */\
    "5: \n\t"\
    "fpsub16 %%f16, %%f18, %%f48 \n\t"\
    "fpsub16 %%f20, %%f22, %%f50 \n\t"\
    "fpsub16 %%f24, %%f26, %%f52 \n\t"\
    "fpsub16 %%f28, %%f30, %%f54 \n\t"\
    "fpadd16 %%f16, %%f18, %%f16 \n\t"\
    "fpadd16 %%f20, %%f22, %%f20 \n\t"\
    "fpadd16 %%f24, %%f26, %%f24 \n\t"\
    "fpadd16 %%f28, %%f30, %%f28 \n\t"
/*
 * Asm template fragment: store the eight IDCT4ROWS result registers to
 * address expression `out` at a 16-byte stride. The sums %f16, %f20, %f24,
 * %f28 go to offsets 0, 16, 32, 48; the differences %f54, %f52, %f50, %f48
 * go to offsets 64, 80, 96, 112.
 *
 * `out` must be a string literal usable inside "[...]", e.g. "%2+8".
 * The last line deliberately has no trailing backslash, so the macro ends
 * here regardless of what follows it in the file.
 */
#define STOREROWS(out) \
    "std %%f48, [" out "+112] \n\t"\
    "std %%f50, [" out "+96] \n\t"\
    "std %%f52, [" out "+80] \n\t"\
    "std %%f54, [" out "+64] \n\t"\
    "std %%f16, [" out "] \n\t"\
    "std %%f20, [" out "+16] \n\t"\
    "std %%f24, [" out "+32] \n\t"\
    "std %%f28, [" out "+48] \n\t"
/*
 * Asm template fragment: multiply each of the eight IDCT4ROWS result
 * registers (%f48, %f50, %f52, %f54, %f16, %f20, %f24, %f28) in place by
 * %f46 using fmul8sux16. Only ff_simple_idct_vis uses it, with %f46
 * holding the `scale` table loaded by INIT_IDCT.
 *
 * The last line deliberately has no trailing backslash, so the macro ends
 * here regardless of what follows it in the file.
 */
#define SCALEROWS \
    "fmul8sux16 %%f46, %%f48, %%f48 \n\t"\
    "fmul8sux16 %%f46, %%f50, %%f50 \n\t"\
    "fmul8sux16 %%f46, %%f52, %%f52 \n\t"\
    "fmul8sux16 %%f46, %%f54, %%f54 \n\t"\
    "fmul8sux16 %%f46, %%f16, %%f16 \n\t"\
    "fmul8sux16 %%f46, %%f20, %%f20 \n\t"\
    "fmul8sux16 %%f46, %%f24, %%f24 \n\t"\
    "fmul8sux16 %%f46, %%f28, %%f28 \n\t"
/*
 * Asm template fragment: convert the eight IDCT4ROWS result registers to
 * pixels with fpack16 and store 4 bytes to each of eight output rows.
 *
 * The row base addresses are operands %3 and %5..%11 (in
 * ff_simple_idct_put_vis: dest and dest + 1..7 * line_size); `dest` is a
 * string literal with the byte offset inside each row ("0" or "4").
 * Register-to-row mapping: %f16, %f20, %f24, %f28 -> rows 0..3 and
 * %f54, %f52, %f50, %f48 -> rows 4..7.
 *
 * NOTE(review): the clamping implied by the name comes from fpack16 and
 * the %gsr value set by the caller -- confirm against the VIS manual.
 *
 * The last line deliberately has no trailing backslash, so the macro ends
 * here regardless of what follows it in the file.
 */
#define PUTPIXELSCLAMPED(dest) \
    "fpack16 %%f48, %%f14 \n\t"\
    "fpack16 %%f50, %%f12 \n\t"\
    "fpack16 %%f16, %%f0 \n\t"\
    "fpack16 %%f20, %%f2 \n\t"\
    "fpack16 %%f24, %%f4 \n\t"\
    "fpack16 %%f28, %%f6 \n\t"\
    "fpack16 %%f54, %%f8 \n\t"\
    "fpack16 %%f52, %%f10 \n\t"\
    "st %%f0, [%3+" dest "] \n\t"\
    "st %%f2, [%5+" dest "] \n\t"\
    "st %%f4, [%6+" dest "] \n\t"\
    "st %%f6, [%7+" dest "] \n\t"\
    "st %%f8, [%8+" dest "] \n\t"\
    "st %%f10, [%9+" dest "] \n\t"\
    "st %%f12, [%10+" dest "] \n\t"\
    "st %%f14, [%11+" dest "] \n\t"
/*
 * Asm template fragment: add the eight IDCT4ROWS result registers to the
 * existing pixels of eight output rows and store them back, 4 bytes per row.
 *
 * Steps, per row: load 4 bytes ("ld"), multiply them by the doubleword
 * loaded from operand %5 with fmul8x16 (in ff_simple_idct_add_vis that
 * operand is the `expand` table), add the IDCT result with fpadd16,
 * convert with fpack16 and store ("st").
 *
 * The row base addresses are operands %3 and %6..%12 (in
 * ff_simple_idct_add_vis: dest and dest + 1..7 * line_size); `dest` is a
 * string literal with the byte offset inside each row ("0" or "4").
 * Register-to-row mapping: %f16, %f20, %f24, %f28 -> rows 0..3 and
 * %f54, %f52, %f50, %f48 -> rows 4..7.
 *
 * NOTE(review): the clamping implied by the name comes from fpack16 and
 * the %gsr value set by the caller -- confirm against the VIS manual.
 *
 * The last line deliberately has no trailing backslash, so the macro ends
 * here and does not swallow the function definition that follows it.
 */
#define ADDPIXELSCLAMPED(dest) \
    "ldd [%5], %%f18 \n\t"\
    "ld [%3+" dest"], %%f0 \n\t"\
    "ld [%6+" dest"], %%f2 \n\t"\
    "ld [%7+" dest"], %%f4 \n\t"\
    "ld [%8+" dest"], %%f6 \n\t"\
    "ld [%9+" dest"], %%f8 \n\t"\
    "ld [%10+" dest"], %%f10 \n\t"\
    "ld [%11+" dest"], %%f12 \n\t"\
    "ld [%12+" dest"], %%f14 \n\t"\
    "fmul8x16 %%f0, %%f18, %%f0 \n\t"\
    "fmul8x16 %%f2, %%f18, %%f2 \n\t"\
    "fmul8x16 %%f4, %%f18, %%f4 \n\t"\
    "fmul8x16 %%f6, %%f18, %%f6 \n\t"\
    "fmul8x16 %%f8, %%f18, %%f8 \n\t"\
    "fmul8x16 %%f10, %%f18, %%f10 \n\t"\
    "fmul8x16 %%f12, %%f18, %%f12 \n\t"\
    "fmul8x16 %%f14, %%f18, %%f14 \n\t"\
    "fpadd16 %%f0, %%f16, %%f0 \n\t"\
    "fpadd16 %%f2, %%f20, %%f2 \n\t"\
    "fpadd16 %%f4, %%f24, %%f4 \n\t"\
    "fpadd16 %%f6, %%f28, %%f6 \n\t"\
    "fpadd16 %%f8, %%f54, %%f8 \n\t"\
    "fpadd16 %%f10, %%f52, %%f10 \n\t"\
    "fpadd16 %%f12, %%f50, %%f12 \n\t"\
    "fpadd16 %%f14, %%f48, %%f14 \n\t"\
    "fpack16 %%f0, %%f0 \n\t"\
    "fpack16 %%f2, %%f2 \n\t"\
    "fpack16 %%f4, %%f4 \n\t"\
    "fpack16 %%f6, %%f6 \n\t"\
    "fpack16 %%f8, %%f8 \n\t"\
    "fpack16 %%f10, %%f10 \n\t"\
    "fpack16 %%f12, %%f12 \n\t"\
    "fpack16 %%f14, %%f14 \n\t"\
    "st %%f0, [%3+" dest "] \n\t"\
    "st %%f2, [%6+" dest "] \n\t"\
    "st %%f4, [%7+" dest "] \n\t"\
    "st %%f6, [%8+" dest "] \n\t"\
    "st %%f8, [%9+" dest "] \n\t"\
    "st %%f10, [%10+" dest "] \n\t"\
    "st %%f12, [%11+" dest "] \n\t"\
    "st %%f14, [%12+" dest "] \n\t"
/**
 * In-place 8x8 inverse DCT using SPARC VIS instructions.
 *
 * Pass 1 transforms the two 4-wide halves of \a data (LOADSCALE +
 * IDCT4ROWS) and stores the intermediate result in the local buffer
 * `temp`. Pass 2 transposes that buffer, runs IDCT4ROWS again, multiplies
 * the result by the `scale` table (SCALEROWS) and writes it back to
 * \a data.
 *
 * ADDROUNDER is defined empty here, so no rounding constant is added in
 * either pass of this variant.
 *
 * The asm statement carries a "memory" clobber because it reads and
 * writes \a data and `temp` through pointer operands, which the operand
 * list alone does not tell the compiler.
 * NOTE(review): the floating-point registers and condition codes used by
 * the template are still not declared as clobbered.
 *
 * @param data 64 coefficients, transformed in place
 */
void ff_simple_idct_vis(DCTELEM *data) {
    int out1, out2, out3, out4;
    DECLARE_ALIGNED_8(int16_t, temp[8*8]);

    __asm__ volatile(
        INIT_IDCT

#define ADDROUNDER

        // shift right 16-4=12
        LOADSCALE("%2+8")
        IDCT4ROWS
        STOREROWS("%3+8")
        LOADSCALE("%2+0")
        IDCT4ROWS
        // first four rows of this half stay in %f16..%f28; only the
        // difference rows are spilled to temp
        "std %%f48, [%3+112] \n\t"
        "std %%f50, [%3+96] \n\t"
        "std %%f52, [%3+80] \n\t"
        "std %%f54, [%3+64] \n\t"

        // shift right 16+4
        "ldd [%3+8], %%f18 \n\t"
        "ldd [%3+24], %%f22 \n\t"
        "ldd [%3+40], %%f26 \n\t"
        "ldd [%3+56], %%f30 \n\t"
        TRANSPOSE
        IDCT4ROWS
        SCALEROWS
        STOREROWS("%2+0")
        LOAD("%3+64")
        TRANSPOSE
        IDCT4ROWS
        SCALEROWS
        STOREROWS("%2+8")

        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4)
        : "0" (scale), "1" (coeffs), "2" (data), "3" (temp)
        : "memory"
    );
}
/**
 * 8x8 inverse DCT using SPARC VIS instructions, storing the result as
 * pixels.
 *
 * The seven "add" instructions compute the addresses of rows 1..7
 * (dest + k * line_size) into operands %5..%11. Pass 1 transforms both
 * halves of \a data and writes the intermediate result back into \a data,
 * which is therefore used as scratch space and modified. Pass 2
 * transposes, runs IDCT4ROWS with the `rounder` table added (ADDROUNDER)
 * and stores 8 pixels per row via PUTPIXELSCLAMPED.
 *
 * The first pass is expanded with whatever ADDROUNDER definition is in
 * effect when this function is reached (the empty one from
 * ff_simple_idct_vis in this file).
 *
 * The asm statement carries a "memory" clobber because it reads and
 * writes \a data and writes \a dest through pointer operands, which the
 * operand list alone does not tell the compiler.
 * NOTE(review): the floating-point registers, condition codes and %gsr
 * used by the template are still not declared as clobbered.
 *
 * @param dest      first output row
 * @param line_size byte distance between consecutive output rows
 * @param data      64 input coefficients (overwritten with intermediates)
 */
void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data) {
    int out1, out2, out3, out4, out5;
    int r1, r2, r3, r4, r5, r6, r7;

    __asm__ volatile(
        "wr %%g0, 0x8, %%gsr \n\t"

        INIT_IDCT

        // row pointers: %5..%11 = dest + 1..7 * line_size
        "add %3, %4, %5 \n\t"
        "add %5, %4, %6 \n\t"
        "add %6, %4, %7 \n\t"
        "add %7, %4, %8 \n\t"
        "add %8, %4, %9 \n\t"
        "add %9, %4, %10 \n\t"
        "add %10, %4, %11 \n\t"

        // shift right 16-4=12
        LOADSCALE("%2+8")
        IDCT4ROWS
        STOREROWS("%2+8")
        LOADSCALE("%2+0")
        IDCT4ROWS
        "std %%f48, [%2+112] \n\t"
        "std %%f50, [%2+96] \n\t"
        "std %%f52, [%2+80] \n\t"
        "std %%f54, [%2+64] \n\t"

#undef ADDROUNDER
#define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"

        // shift right 16+4
        "ldd [%2+8], %%f18 \n\t"
        "ldd [%2+24], %%f22 \n\t"
        "ldd [%2+40], %%f26 \n\t"
        "ldd [%2+56], %%f30 \n\t"
        TRANSPOSE
        IDCT4ROWS
        PUTPIXELSCLAMPED("0")
        LOAD("%2+64")
        TRANSPOSE
        IDCT4ROWS
        PUTPIXELSCLAMPED("4")

        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5),
          "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
        : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size)
        : "memory"
    );
}
/**
 * 8x8 inverse DCT using SPARC VIS instructions, adding the result to the
 * existing pixels.
 *
 * The seven "add" instructions compute the addresses of rows 1..7
 * (dest + k * line_size) into operands %6..%12; operand %5 is the
 * `expand` table used by ADDPIXELSCLAMPED. Pass 1 (ADDROUNDER empty)
 * transforms both halves of \a data and writes the intermediate result
 * back into \a data, which is therefore used as scratch space and
 * modified. Pass 2 transposes, runs IDCT4ROWS with the `rounder` table
 * added (ADDROUNDER) and updates 8 pixels per row via ADDPIXELSCLAMPED.
 *
 * The asm statement carries a "memory" clobber because it reads and
 * writes \a data and \a dest through pointer operands, which the operand
 * list alone does not tell the compiler.
 * NOTE(review): the floating-point registers, condition codes and %gsr
 * used by the template are still not declared as clobbered.
 *
 * @param dest      first row of the pixels to update
 * @param line_size byte distance between consecutive rows
 * @param data      64 input coefficients (overwritten with intermediates)
 */
void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data) {
    int out1, out2, out3, out4, out5, out6;
    int r1, r2, r3, r4, r5, r6, r7;

    __asm__ volatile(
        "wr %%g0, 0x8, %%gsr \n\t"

        INIT_IDCT

        // row pointers: %6..%12 = dest + 1..7 * line_size
        "add %3, %4, %6 \n\t"
        "add %6, %4, %7 \n\t"
        "add %7, %4, %8 \n\t"
        "add %8, %4, %9 \n\t"
        "add %9, %4, %10 \n\t"
        "add %10, %4, %11 \n\t"
        "add %11, %4, %12 \n\t"

#undef ADDROUNDER
#define ADDROUNDER

        // shift right 16-4=12
        LOADSCALE("%2+8")
        IDCT4ROWS
        STOREROWS("%2+8")
        LOADSCALE("%2+0")
        IDCT4ROWS
        "std %%f48, [%2+112] \n\t"
        "std %%f50, [%2+96] \n\t"
        "std %%f52, [%2+80] \n\t"
        "std %%f54, [%2+64] \n\t"

#undef ADDROUNDER
#define ADDROUNDER "fpadd16 %%f28, %%f46, %%f28 \n\t"

        // shift right 16+4
        "ldd [%2+8], %%f18 \n\t"
        "ldd [%2+24], %%f22 \n\t"
        "ldd [%2+40], %%f26 \n\t"
        "ldd [%2+56], %%f30 \n\t"
        TRANSPOSE
        IDCT4ROWS
        ADDPIXELSCLAMPED("0")
        LOAD("%2+64")
        TRANSPOSE
        IDCT4ROWS
        ADDPIXELSCLAMPED("4")

        : "=r" (out1), "=r" (out2), "=r" (out3), "=r" (out4), "=r" (out5), "=r" (out6),
          "=r" (r1), "=r" (r2), "=r" (r3), "=r" (r4), "=r" (r5), "=r" (r6), "=r" (r7)
        : "0" (rounder), "1" (coeffs), "2" (data), "3" (dest), "4" (line_size), "5" (expand)
        : "memory"
    );
}

View File

@ -0,0 +1,331 @@
/*
* Copyright (C) 2003 David S. Miller <davem@redhat.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* You may be asking why I hard-code the instruction opcodes and don't
* use the normal VIS assembler mnemonics for the VIS instructions.
*
* The reason is that Sun, in their infinite wisdom, decided that a binary
* using a VIS instruction will cause it to be marked (in the ELF headers)
* as doing so, and this prevents the OS from loading such binaries if the
* current cpu doesn't have VIS. There is no way to easily override this
* behavior of the assembler that I am aware of.
*
* This totally defeats what libmpeg2 is trying to do which is allow a
* single binary to be created, and then detect the availability of VIS
* at runtime.
*
* I'm not saying that tainting the binary by default is bad, rather I'm
* saying that not providing a way to override this easily unnecessarily
* ties people's hands.
*
* Thus, we do the opcode encoding by hand and output 32-bit words in
* the assembler to keep the binary from becoming tainted.
*/
#ifndef AVCODEC_SPARC_VIS_H
#define AVCODEC_SPARC_VIS_H
/*
 * Bit-field helpers used to hand-assemble 32-bit VIS instruction words.
 *
 * vis_opc_base  constant part of every word: bit 31 set plus 0x36 in the
 *               field starting at bit 19 (presumably op=2 / op3=0x36 in
 *               SPARC V9 terms -- confirm against the architecture manual).
 *               The 1 is unsigned so the shift into bit 31 is well defined;
 *               a signed 0x1 << 31 overflows int.
 * vis_opf(X)    places the operation selector X at bit 5.
 * vis_sreg(X)   register field for a single-precision register: X as is.
 * vis_dreg(X)   register field for a double-precision register: the low
 *               five bits of X with bit 5 of X folded into bit 0, so
 *               register numbers 32 and above still fit in five bits.
 * vis_rs1_*     register field shifted to bit 14 (first source).
 * vis_rs2_*     register field at bit 0 (second source).
 * vis_rd_*      register field shifted to bit 25 (destination).
 * The _s / _d suffix selects vis_sreg or vis_dreg encoding.
 */
#define vis_opc_base    ((0x1U << 31) | (0x36 << 19))
#define vis_opf(X)      ((X) << 5)
#define vis_sreg(X)     (X)
#define vis_dreg(X)     (((X)&0x1f)|((X)>>5))
#define vis_rs1_s(X)    (vis_sreg(X) << 14)
#define vis_rs1_d(X)    (vis_dreg(X) << 14)
#define vis_rs2_s(X)    (vis_sreg(X) << 0)
#define vis_rs2_d(X)    (vis_dreg(X) << 0)
#define vis_rd_s(X)     (vis_sreg(X) << 25)
#define vis_rd_d(X)     (vis_dreg(X) << 25)
/*
 * Instruction-word emitters.
 *
 * Each macro ORs vis_opc_base, the operation selector and the register
 * fields it takes, then emits the result as a raw ".word" so no VIS
 * mnemonic ever reaches the assembler. The macro name spells the operand
 * kinds: 's' = single-precision register field, 'd' = double-precision
 * register field, with the destination after the '2'. In the one-source
 * forms the digit before the '2' (d12d, d22d, s12s, s22s) says whether the
 * source goes in the rs1 or the rs2 field. vis_s / vis_d take a destination
 * only.
 */
#define vis_emit_insn(word) \
    __asm__ volatile (".word %0" : : "i" (word))

/* Two sources, one destination. */
#define vis_ss2s(opfield, src1, src2, dst)                        \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs1_s(src1) | vis_rs2_s(src2) | vis_rd_s(dst))
#define vis_dd2d(opfield, src1, src2, dst)                        \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs1_d(src1) | vis_rs2_d(src2) | vis_rd_d(dst))
#define vis_ss2d(opfield, src1, src2, dst)                        \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs1_s(src1) | vis_rs2_s(src2) | vis_rd_d(dst))
#define vis_sd2d(opfield, src1, src2, dst)                        \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs1_s(src1) | vis_rs2_d(src2) | vis_rd_d(dst))

/* One source in the rs2 field, destination of the other width. */
#define vis_d2s(opfield, src2, dst)                               \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs2_d(src2) | vis_rd_s(dst))
#define vis_s2d(opfield, src2, dst)                               \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs2_s(src2) | vis_rd_d(dst))

/* One source, same width as the destination. */
#define vis_d12d(opfield, src1, dst)                              \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs1_d(src1) | vis_rd_d(dst))
#define vis_d22d(opfield, src2, dst)                              \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs2_d(src2) | vis_rd_d(dst))
#define vis_s12s(opfield, src1, dst)                              \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs1_s(src1) | vis_rd_s(dst))
#define vis_s22s(opfield, src2, dst)                              \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) |               \
                  vis_rs2_s(src2) | vis_rd_s(dst))

/* Destination only. */
#define vis_s(opfield, dst)                                       \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) | vis_rd_s(dst))
#define vis_d(opfield, dst)                                       \
    vis_emit_insn(vis_opc_base | vis_opf(opfield) | vis_rd_d(dst))
/*
 * Register <-> memory transfers written with ordinary load/store mnemonics.
 *
 * op   is pasted verbatim as the mnemonic (the aliases in this header pass
 *      ld, ldd, st or std).
 * rd   is stringified after "%f", so it must be a bare register number.
 * mem  (single-address forms) is an lvalue whose address is taken;
 * mem1 / mem2 (two-address forms) are passed in registers and summed by the
 *      addressing mode [%0 + %1].
 *
 * The asm reaches memory only through a pointer held in a register, which
 * the compiler cannot see through. The "memory" clobber tells it that
 * memory is read or written here, so pending stores are completed before a
 * load and cached values are not reused across a store. vis_ldblk and
 * vis_stblk in this header declare the same clobber.
 */
#define vis_r2m(op,rd,mem) \
        __asm__ volatile (#op "\t%%f" #rd ", [%0]" \
                          : : "r" (&(mem)) : "memory")
#define vis_r2m_2(op,rd,mem1,mem2) \
        __asm__ volatile (#op "\t%%f" #rd ", [%0 + %1]" \
                          : : "r" (mem1), "r" (mem2) : "memory")
#define vis_m2r(op,mem,rd) \
        __asm__ volatile (#op "\t[%0], %%f" #rd \
                          : : "r" (&(mem)) : "memory")
#define vis_m2r_2(op,mem1,mem2,rd) \
        __asm__ volatile (#op "\t[%0 + %1], %%f" #rd \
                          : : "r" (mem1), "r" (mem2) : "memory")
/*
 * Emit the hand-encoded word 0xa7804000 with _val placed in register g1.
 *
 * The value is pinned to g1 because the instruction word is a constant and
 * cannot name a compiler-chosen register. Going by the function name, the
 * word writes g1 into the graphics status register (presumably
 * "wr %g1, %g0, %gsr" -- confirm against the SPARC V9 encoding).
 */
static inline void vis_set_gsr(unsigned int _val)
{
    register unsigned int gsr_value __asm__("g1") = _val;

    __asm__ volatile(".word 0xa7804000" : : "r" (gsr_value));
}
/*
 * Bit layout constants for the value passed to vis_set_gsr(): each field
 * has a shift (position of its lowest bit) and a mask (already shifted
 * into place). The align-address field is 3 bits wide at bit 0; the
 * scale-factor field is 4 bits wide at bit 3.
 */
#define VIS_GSR_ALIGNADDR_SHIFT 0
#define VIS_GSR_ALIGNADDR_MASK  (0x7 << VIS_GSR_ALIGNADDR_SHIFT)
#define VIS_GSR_SCALEFACT_SHIFT 3
#define VIS_GSR_SCALEFACT_MASK  (0xf << VIS_GSR_SCALEFACT_SHIFT)
/*
 * Load/store aliases over vis_m2r / vis_r2m.
 *
 * The 32-bit forms pass the ld / st mnemonics, the 64-bit forms ldd / std.
 * The "_2" variants take two address operands that the underlying macro
 * combines as [a + b]; the plain variants take an lvalue.
 */
/* 32-bit */
#define vis_ld32(src, freg)            vis_m2r(ld, src, freg)
#define vis_ld32_2(base, off, freg)    vis_m2r_2(ld, base, off, freg)
#define vis_st32(freg, dst)            vis_r2m(st, freg, dst)
#define vis_st32_2(freg, base, off)    vis_r2m_2(st, freg, base, off)
/* 64-bit */
#define vis_ld64(src, freg)            vis_m2r(ldd, src, freg)
#define vis_ld64_2(base, off, freg)    vis_m2r_2(ldd, base, off, freg)
#define vis_st64(freg, dst)            vis_r2m(std, freg, dst)
#define vis_st64_2(freg, base, off)    vis_r2m_2(std, freg, base, off)
/*
 * Block transfer helpers, emitted as raw instruction words.
 *
 * The address of mem is pinned to register g1 because the constant word
 * cannot name a compiler-chosen register; operand %0 is never printed and
 * exists only to force that register to be loaded. The destination/source
 * register field for rd is ORed into the word by the assembler. "memory"
 * is clobbered because the transfer goes through the pointer.
 * Judging by the names, 0xc1985e00 is a block load and 0xc1b85e00 a block
 * store (presumably ldda/stda with a block ASI -- confirm against the
 * UltraSPARC manual).
 */
#define vis_ldblk(mem, rd)                                        \
do {                                                              \
    register void *__vis_blk_ptr __asm__("g1") = &(mem);          \
    __asm__ volatile(".word 0xc1985e00 | %1"                      \
                     : /* no outputs */                           \
                     : "r" (__vis_blk_ptr), "i" (vis_rd_d(rd))    \
                     : "memory");                                 \
} while (0)
#define vis_stblk(rd, mem)                                        \
do {                                                              \
    register void *__vis_blk_ptr __asm__("g1") = &(mem);          \
    __asm__ volatile(".word 0xc1b85e00 | %1"                      \
                     : /* no outputs */                           \
                     : "r" (__vis_blk_ptr), "i" (vis_rd_d(rd))    \
                     : "memory");                                 \
} while (0)
/*
 * Memory barriers emitted as raw instruction words. Both also clobber
 * "memory", so the compiler does not move memory accesses across them.
 * Going by the names, 0x8143e008 is a store-store barrier and 0x8143e040 a
 * full sync barrier (presumably "membar #StoreStore" / "membar #Sync" --
 * confirm against the SPARC V9 encoding).
 */
#define vis_membar_storestore() __asm__ volatile(".word 0x8143e008" : : : "memory")
#define vis_membar_sync()       __asm__ volatile(".word 0x8143e040" : : : "memory")
/* Partitioned add and subtract on 16-bit and 32-bit lanes.
 * The plain forms work on double registers (4 x 16-bit or 2 x 32-bit
 * lanes); the 's'-suffixed forms work on single registers (2 x 16-bit or
 * 1 x 32-bit lane).
 */
/* addition */
#define vis_padd16(a, b, d)     vis_dd2d(0x50, a, b, d)
#define vis_padd16s(a, b, d)    vis_ss2s(0x51, a, b, d)
#define vis_padd32(a, b, d)     vis_dd2d(0x52, a, b, d)
#define vis_padd32s(a, b, d)    vis_ss2s(0x53, a, b, d)
/* subtraction */
#define vis_psub16(a, b, d)     vis_dd2d(0x54, a, b, d)
#define vis_psub16s(a, b, d)    vis_ss2s(0x55, a, b, d)
#define vis_psub32(a, b, d)     vis_dd2d(0x56, a, b, d)
#define vis_psub32s(a, b, d)    vis_ss2s(0x57, a, b, d)
/* Pixel formatting: pack, expand and merge.
 * Operand widths follow the emitter used: the d2s forms read a double and
 * write a single register, s2d the reverse, ss2d takes two singles and
 * writes a double.
 */
#define vis_pack16(src, d)      vis_d2s( 0x3b, src, d)
#define vis_pack32(a, b, d)     vis_dd2d(0x3a, a, b, d)
#define vis_packfix(src, d)     vis_d2s( 0x3d, src, d)
#define vis_expand(src, d)      vis_s2d( 0x4d, src, d)
#define vis_pmerge(a, b, d)     vis_ss2d(0x4b, a, b, d)
/* Partitioned multiplies.
 * vis_mul8x16 takes a single first source and a double second source; the
 * au/al and muld forms take two singles; the sux/ulx forms take two
 * doubles. Every form writes a double register.
 */
#define vis_mul8x16(a, b, d)        vis_sd2d(0x31, a, b, d)
#define vis_mul8x16au(a, b, d)      vis_ss2d(0x33, a, b, d)
#define vis_mul8x16al(a, b, d)      vis_ss2d(0x35, a, b, d)
#define vis_mul8sux16(a, b, d)      vis_dd2d(0x36, a, b, d)
#define vis_mul8ulx16(a, b, d)      vis_dd2d(0x37, a, b, d)
#define vis_muld8sux16(a, b, d)     vis_ss2d(0x38, a, b, d)
#define vis_muld8ulx16(a, b, d)     vis_ss2d(0x39, a, b, d)
/* Alignment instructions. */
/*
 * Emit the opf 0x18 instruction word with register field 1 as first source,
 * 0 as second source and 1 as destination, while _ptr lives in g1 (the
 * register the field value 1 is meant to name). The updated content of g1
 * is returned.
 *
 * Going by the name this is the VIS "alignaddr" instruction; its exact
 * effect on the pointer and on GSR is defined by the hardware manual, not
 * by anything in this header.
 */
static inline void *vis_alignaddr(void *_ptr)
{
    register void *addr __asm__("g1") = _ptr;

    __asm__ volatile(".word %2"
                     : "=&r" (addr)
                     : "0" (addr),
                       "i" (vis_opc_base | vis_opf(0x18) |
                            vis_rs1_s(1) | vis_rs2_s(0) | vis_rd_s(1)));
    return addr;
}
/*
 * Same instruction word as vis_alignaddr() (opf 0x18, first source field 1
 * with _ptr held in g1, second source field 0) but with destination field
 * 0, and nothing is returned. Presumably the destination is %g0 so that
 * only the side effect on GSR remains -- confirm against the VIS manual.
 */
static inline void vis_alignaddr_g0(void *_ptr)
{
    register void *addr __asm__("g1") = _ptr;

    __asm__ volatile(".word %2"
                     : "=&r" (addr)
                     : "0" (addr),
                       "i" (vis_opc_base | vis_opf(0x18) |
                            vis_rs1_s(1) | vis_rs2_s(0) | vis_rd_s(0)));
}
/*
 * Little-endian counterpart of vis_alignaddr(): emits the instruction word
 * with first source field 1 (_ptr is held in g1), second source field 0
 * and destination field 1, then returns the updated content of g1.
 *
 * The operation selector is 0x1a, the ALIGNADDRESS_LITTLE encoding. The
 * value 0x19 that used to be here is not alignaddrl: VIS 2 assigns it to
 * BMASK and VIS 1 leaves it undefined.
 */
static inline void *vis_alignaddrl(void *_ptr)
{
    register void *ptr __asm__("g1");

    ptr = _ptr;
    __asm__ volatile(".word %2"
                     : "=&r" (ptr)
                     : "0" (ptr),
                       "i" (vis_opc_base | vis_opf(0x1a) |
                            vis_rs1_s(1) |
                            vis_rs2_s(0) |
                            vis_rd_s(1)));
    return ptr;
}
/*
 * Little-endian counterpart of vis_alignaddr_g0(): first source field 1
 * (_ptr is held in g1), second source field 0, destination field 0, and
 * nothing is returned. Presumably the destination is %g0 so that only the
 * side effect on GSR remains -- confirm against the VIS manual.
 *
 * The operation selector is 0x1a, the ALIGNADDRESS_LITTLE encoding. The
 * value 0x19 that used to be here is not alignaddrl: VIS 2 assigns it to
 * BMASK and VIS 1 leaves it undefined.
 */
static inline void vis_alignaddrl_g0(void *_ptr)
{
    register void *ptr __asm__("g1");

    ptr = _ptr;
    __asm__ volatile(".word %2"
                     : "=&r" (ptr)
                     : "0" (ptr),
                       "i" (vis_opc_base | vis_opf(0x1a) |
                            vis_rs1_s(1) |
                            vis_rs2_s(0) |
                            vis_rd_s(0)));
}
/* Data alignment across two double registers. */
#define vis_faligndata(a, b, d)     vis_dd2d(0x48, a, b, d)

/* Logical operations.
 * Unsuffixed names operate on double registers, 's'-suffixed names on
 * single registers. A trailing 1 or 2 in the name says which source field
 * (rs1 or rs2) the single-source forms use, or which operand is inverted
 * in the ornot / andnot forms (per the VIS mnemonics -- confirm details in
 * the manual).
 */
/* constants */
#define vis_fzero(d)                vis_d(   0x60, d)
#define vis_fzeros(d)               vis_s(   0x61, d)
#define vis_fone(d)                 vis_d(   0x7e, d)
#define vis_fones(d)                vis_s(   0x7f, d)
/* copy */
#define vis_src1(a, d)              vis_d12d(0x74, a, d)
#define vis_src1s(a, d)             vis_s12s(0x75, a, d)
#define vis_src2(b, d)              vis_d22d(0x78, b, d)
#define vis_src2s(b, d)             vis_s22s(0x79, b, d)
/* complement */
#define vis_not1(a, d)              vis_d12d(0x6a, a, d)
#define vis_not1s(a, d)             vis_s12s(0x6b, a, d)
#define vis_not2(b, d)              vis_d22d(0x66, b, d)
#define vis_not2s(b, d)             vis_s22s(0x67, b, d)
/* two-operand */
#define vis_or(a, b, d)             vis_dd2d(0x7c, a, b, d)
#define vis_ors(a, b, d)            vis_ss2s(0x7d, a, b, d)
#define vis_nor(a, b, d)            vis_dd2d(0x62, a, b, d)
#define vis_nors(a, b, d)           vis_ss2s(0x63, a, b, d)
#define vis_and(a, b, d)            vis_dd2d(0x70, a, b, d)
#define vis_ands(a, b, d)           vis_ss2s(0x71, a, b, d)
#define vis_nand(a, b, d)           vis_dd2d(0x6e, a, b, d)
#define vis_nands(a, b, d)          vis_ss2s(0x6f, a, b, d)
#define vis_xor(a, b, d)            vis_dd2d(0x6c, a, b, d)
#define vis_xors(a, b, d)           vis_ss2s(0x6d, a, b, d)
#define vis_xnor(a, b, d)           vis_dd2d(0x72, a, b, d)
#define vis_xnors(a, b, d)          vis_ss2s(0x73, a, b, d)
/* two-operand with one input inverted */
#define vis_ornot1(a, b, d)         vis_dd2d(0x7a, a, b, d)
#define vis_ornot1s(a, b, d)        vis_ss2s(0x7b, a, b, d)
#define vis_ornot2(a, b, d)         vis_dd2d(0x76, a, b, d)
#define vis_ornot2s(a, b, d)        vis_ss2s(0x77, a, b, d)
#define vis_andnot1(a, b, d)        vis_dd2d(0x68, a, b, d)
#define vis_andnot1s(a, b, d)       vis_ss2s(0x69, a, b, d)
#define vis_andnot2(a, b, d)        vis_dd2d(0x64, a, b, d)
#define vis_andnot2s(a, b, d)       vis_ss2s(0x65, a, b, d)

/* Pixel component distance. */
#define vis_pdist(a, b, d)          vis_dd2d(0x3e, a, b, d)
#endif /* AVCODEC_SPARC_VIS_H */