Updated FFMPEG to version 1.1.2, using this project: http://sourceforge.net/projects/ffmpeg4android/

This commit is contained in:
Sergii Pylypenko
2013-02-21 18:29:51 +02:00
parent 758a9658d2
commit fff7a99a41
3492 changed files with 886704 additions and 5414 deletions

View File

@@ -0,0 +1,21 @@
# Objects added to OBJS unconditionally by this directory.
# NOTE: a list must not end in a line-continuation backslash when another
# assignment follows directly, otherwise that assignment is absorbed into
# the list value.
OBJS += ppc/dsputil_ppc.o \
ppc/videodsp_ppc.o
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o
# The FFT assembly object is only listed when HAVE_GNU_AS expands to "yes";
# it is pulled into the AltiVec FFT list through $(FFT-OBJS-yes).
FFT-OBJS-$(HAVE_GNU_AS) += ppc/fft_altivec_s.o
ALTIVEC-OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o \
$(FFT-OBJS-yes)
ALTIVEC-OBJS-$(CONFIG_H264DSP) += ppc/h264_altivec.o
ALTIVEC-OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodec_altivec.o
ALTIVEC-OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o
ALTIVEC-OBJS-$(CONFIG_VC1_DECODER) += ppc/vc1dsp_altivec.o
ALTIVEC-OBJS-$(CONFIG_VP8_DECODER) += ppc/vp8dsp_altivec.o
# ALTIVEC-OBJS* lists are consumed by the including build system
# (not visible in this fragment).
ALTIVEC-OBJS += ppc/dsputil_altivec.o \
ppc/fdct_altivec.o \
ppc/float_altivec.o \
ppc/fmtconvert_altivec.o \
ppc/gmc_altivec.o \
ppc/idct_altivec.o \
ppc/int_altivec.o

View File

@@ -0,0 +1,133 @@
/*
* Copyright (c) 2009 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
/* Two-level token pasting: JOIN expands its arguments first, GLUE then
 * pastes the expanded tokens together. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
/* X(s): the symbol s with EXTERN_ASM pasted in front. EXTERN_ASM is not
 * defined in this file (presumably it comes from config.h -- TODO confirm). */
#define X(s) JOIN(EXTERN_ASM, s)
/*
 * Word-size abstraction for 32-bit and 64-bit PowerPC.
 *
 *   PTR            data directive for one pointer-sized value (.quad / .int)
 *   lp, lpx        load / indexed load mnemonic          (ld, ldx / lwz, lwzx)
 *   stp, stpu      store / store-with-update mnemonic    (std, stdu / stw, stwu)
 *   PS             8 on ARCH_PPC64, 4 otherwise
 *   L(s)           name of the code label for s: ".s" on ARCH_PPC64, "s" otherwise
 *
 * Macros defined in both variants:
 *   extfunc name        export X(name) as a function symbol and open its body
 *   movrel rd, sym, gp  load the address of sym into rd
 *   get_got rd          set up rd for use as the gp argument of movrel
 */
#if ARCH_PPC64
#define PTR .quad
#define lp ld
#define lpx ldx
#define stp std
#define stpu stdu
#define PS 8
#define L(s) JOIN(., s)
/* extfunc: X(name) is placed in the .opd section as three quads (code label,
 * .TOC.@tocbase, 0); the code itself starts at the label L(name). */
.macro extfunc name
.global X(\name)
.section .opd, "aw"
X(\name):
.quad L(\name), .TOC.@tocbase, 0
.previous
.type X(\name), STT_FUNC
L(\name):
.endm
/* movrel: the address is always fetched from the GOT relative to r2;
 * the gp argument is accepted but not used. */
.macro movrel rd, sym, gp
ld \rd, \sym@got(r2)
.endm
/* get_got: expands to nothing in this variant. */
.macro get_got rd
.endm
#else /* ARCH_PPC64 */
#define PTR .int
#define lp lwz
#define lpx lwzx
#define stp stw
#define stpu stwu
#define PS 4
#define L(s) s
/* extfunc: X(name) and the plain label name both mark the first instruction. */
.macro extfunc name
.global X(\name)
.type X(\name), STT_FUNC
X(\name):
\name:
.endm
/* movrel: with CONFIG_PIC the address is loaded from the GOT through gp,
 * otherwise it is built from the @ha / @l halves of the symbol address. */
.macro movrel rd, sym, gp
#if CONFIG_PIC
lwz \rd, \sym@got(\gp)
#else
lis \rd, \sym@ha
la \rd, \sym@l(\rd)
#endif
.endm
/* get_got: with CONFIG_PIC, branch-and-link to the next instruction, read the
 * link register into rd and add the distance from that local label to
 * _GLOBAL_OFFSET_TABLE_. Note that this overwrites the link register.
 * Without CONFIG_PIC the macro expands to nothing. */
.macro get_got rd
#if CONFIG_PIC
bcl 20, 31, .Lgot\@
.Lgot\@:
mflr \rd
addis \rd, \rd, _GLOBAL_OFFSET_TABLE_ - .Lgot\@@ha
addi \rd, \rd, _GLOBAL_OFFSET_TABLE_ - .Lgot\@@l
#endif
.endm
#endif /* ARCH_PPC64 */
/*
 * When HAVE_IBM_ASM is set, define the symbols rN, fN and vN as the plain
 * number N (via .equiv) for N = 0..31, so that code written with symbolic
 * register names still assembles.
 * NOTE(review): presumably needed because that assembler only accepts numeric
 * register operands -- confirm.
 */
#if HAVE_IBM_ASM
.macro DEFINE_REG n
.equiv r\n, \n
.equiv f\n, \n
.equiv v\n, \n
.endm
DEFINE_REG 0
DEFINE_REG 1
DEFINE_REG 2
DEFINE_REG 3
DEFINE_REG 4
DEFINE_REG 5
DEFINE_REG 6
DEFINE_REG 7
DEFINE_REG 8
DEFINE_REG 9
DEFINE_REG 10
DEFINE_REG 11
DEFINE_REG 12
DEFINE_REG 13
DEFINE_REG 14
DEFINE_REG 15
DEFINE_REG 16
DEFINE_REG 17
DEFINE_REG 18
DEFINE_REG 19
DEFINE_REG 20
DEFINE_REG 21
DEFINE_REG 22
DEFINE_REG 23
DEFINE_REG 24
DEFINE_REG 25
DEFINE_REG 26
DEFINE_REG 27
DEFINE_REG 28
DEFINE_REG 29
DEFINE_REG 30
DEFINE_REG 31
#endif /* HAVE_IBM_ASM */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_DSPUTIL_ALTIVEC_H
#define AVCODEC_PPC_DSPUTIL_ALTIVEC_H
#include <stdint.h>
#include "libavcodec/dsputil.h"
/* 16-pixel-wide block routines. Judging by the names, "put" stores and "avg"
 * averages into block -- TODO confirm against the implementations, which are
 * not part of this header. */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h);
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h);

/* Transforms operating on a block of int16_t coefficients. */
void ff_fdct_altivec(int16_t *block);
void ff_idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
void ff_idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);

/* Motion compensation helper assigned to DSPContext.gmc1 by
 * ff_dsputil_init_ppc(). */
void ff_gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h,
                     int x16, int y16, int rounder);

/* Per-module initialisers called from ff_dsputil_init_ppc(). */
void ff_dsputil_h264_init_ppc(DSPContext *c, AVCodecContext *avctx);
void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx);
void ff_float_init_altivec(DSPContext *c, AVCodecContext *avctx);
void ff_int_init_altivec(DSPContext *c, AVCodecContext *avctx);
#endif /* AVCODEC_PPC_DSPUTIL_ALTIVEC_H */

View File

@@ -0,0 +1,194 @@
/*
* Copyright (c) 2002 Brian Foley
* Copyright (c) 2002 Dieter Shirley
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <string.h>
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
/* ***** WARNING ***** WARNING ***** WARNING ***** */
/*
clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
cache line size not equal to 32 bytes.
Fortunately, all processors used by Apple up to at least the 7450 (aka second
generation G4) use a 32-byte cache line.
This is due to the use of the 'dcbz' instruction. It simply clears a single
cache line to zero, so you need to know the cache line size to use it!
It's absurd, but it's fast...
Update 24/06/2003: Apple released the G5 yesterday, with a PPC970. Cache line
size: 128 bytes. Oops.
The semantics of dcbz were changed: it always clears 32 bytes. So the function
below will work, but will be slow. So I fixed check_dcbzl_effect to use dcbzl,
which is defined to clear a cache line (as dcbz did before). So we can still
distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
See <http://developer.apple.com/technotes/tn/tn2087.html>
and <http://developer.apple.com/technotes/tn/tn2086.html>
*/
/**
 * Zero the 6 * 64 DCTELEM coefficients at blocks with dcbz, on the assumption
 * that one dcbz clears 32 bytes.
 *
 * When bit 4 of the address is set, the first and the last 16 bytes of the
 * buffer are cleared with memset and dcbz only handles the bytes in between.
 * The head and tail are addressed in bytes, so their position does not
 * depend on sizeof(unsigned long). The previous unsigned long stores used
 * fixed indices 188..191 for the tail, which lie beyond the buffer when
 * unsigned long is 8 bytes wide.
 *
 * NOTE(review): only bit 4 of the address is inspected, so blocks is assumed
 * to be at least 16-byte aligned -- confirm against the callers.
 *
 * @param blocks buffer of 6 * 64 DCTELEM values to clear
 */
static void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
{
    const size_t total = sizeof(DCTELEM) * 6 * 64;
    const int misal    = ((unsigned long)blocks & 0x00000010) != 0;
    size_t i = 0;

    if (misal) {
        /* head: the 16 bytes in front of the first 32-byte boundary */
        memset(blocks, 0, 16);
        i = 16;
    }
    /* one dcbz per complete 32-byte span that still fits into the buffer */
    for ( ; i + 31 < total; i += 32) {
        __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
    }
    if (misal) {
        /* tail: the final 16 bytes, which the loop above cannot cover */
        memset((char *)blocks + total - 16, 0, 16);
    }
}
/*
 * Counterpart of clear_blocks_dcbz32_ppc for CPUs on which dcbzl clears a
 * whole 128-byte cache line, i.e. the PPC970 aka G5.
 *
 * dcbzl is only used when the build provides it (HAVE_DCBZL) and blocks is
 * 128-byte aligned; in every other case the buffer is cleared with memset.
 * The misaligned case could probably be optimized as well, but there was not
 * much point while the machines were not available yet (2003-06-26).
 */
static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
{
#if HAVE_DCBZL
    if (!((unsigned long)blocks & 0x0000007f)) {
        int offset;

        for (offset = 0; offset < sizeof(DCTELEM)*6*64; offset += 128) {
            __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (offset) : "memory");
        }
        return;
    }
#endif
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
/**
 * Report how many bytes a single dcbzl instruction sets to zero.
 *
 * A 1024-byte buffer is filled with 0xFF, dcbzl is issued on the middle of
 * the buffer and the zero bytes are counted afterwards.
 * dcbzl is used rather than dcbz to get the intended "clear one cache line"
 * effect (Apple "fixed" dcbz, update 24/06/2003); unfortunately this cannot
 * be used unless the assembler knows about dcbzl, hence the HAVE_DCBZL guard.
 *
 * @return number of zero bytes found in the buffer; 0 when the allocation
 *         fails or when the build has no dcbzl support
 */
#if HAVE_DCBZL
static long check_dcbzl_effect(void)
{
    char *fakedata = av_malloc(1024);
    char *fakedata_middle;
    long zero  = 0;
    long count = 0;
    long i;

    if (!fakedata) {
        return 0L;
    }
    fakedata_middle = (fakedata + 512);
    memset(fakedata, 0xFF, 1024);
    /* below the constraint "b" seems to mean "Address base register"
       in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so....
       The "memory" clobber tells the compiler that the instruction writes
       to the buffer; without it the loads below may be assumed to still
       see the 0xFF pattern stored by memset. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");
    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }
    av_free(fakedata);
    return count;
}
#else
static long check_dcbzl_effect(void)
{
    return 0;
}
#endif
/**
 * Install the PowerPC-specific routines into a DSPContext.
 *
 * The CPU flags returned by av_get_cpu_flags() are adjusted with
 * avctx->dsp_mask: when AV_CPU_FLAG_FORCE is set in the mask its low 16 bits
 * are added to the flags, otherwise they are removed from them.
 *
 * For bits_per_raw_sample <= 8, clear_blocks is replaced by the dcbz variant
 * that matches the size reported by check_dcbzl_effect() (32 or 128); any
 * other result leaves clear_blocks untouched.
 *
 * With HAVE_ALTIVEC, the H.264 initialiser is invoked if CONFIG_H264_DECODER
 * is non-zero, and the AltiVec routines are installed when the adjusted flags
 * contain AV_CPU_FLAG_ALTIVEC.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context supplying bit depth, dsp_mask, lowres and the
 *              requested DCT/IDCT algorithms
 */
void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    int mm_flags = av_get_cpu_flags();

    if (avctx->dsp_mask) {
        mm_flags = (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
                 ? mm_flags |  (avctx->dsp_mask & 0xffff)
                 : mm_flags & ~(avctx->dsp_mask & 0xffff);
    }

    /* Common optimizations whether AltiVec is available or not */
    if (!high_bit_depth) {
        const long cleared = check_dcbzl_effect();

        if (cleared == 32)
            c->clear_blocks = clear_blocks_dcbz32_ppc;
        else if (cleared == 128)
            c->clear_blocks = clear_blocks_dcbz128_ppc;
    }

#if HAVE_ALTIVEC
    if (CONFIG_H264_DECODER)
        ff_dsputil_h264_init_ppc(c, avctx);

    if (!(mm_flags & AV_CPU_FLAG_ALTIVEC))
        return;

    ff_dsputil_init_altivec(c, avctx);
    ff_float_init_altivec(c, avctx);
    ff_int_init_altivec(c, avctx);
    c->gmc1 = ff_gmc1_altivec;

#if CONFIG_ENCODERS
    if (avctx->bits_per_raw_sample <= 8 &&
        (avctx->dct_algo == FF_DCT_AUTO ||
         avctx->dct_algo == FF_DCT_ALTIVEC)) {
        c->fdct = ff_fdct_altivec;
    }
#endif //CONFIG_ENCODERS

    /* the AltiVec IDCT is only selected at full resolution and <= 8 bits */
    if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8 &&
        (avctx->idct_algo == FF_IDCT_AUTO ||
         avctx->idct_algo == FF_IDCT_ALTIVEC)) {
        c->idct_put = ff_idct_put_altivec;
        c->idct_add = ff_idct_add_altivec;
        c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
    }
#endif /* HAVE_ALTIVEC */
}

View File

@@ -0,0 +1,463 @@
/*
* Copyright (C) 2003 James Klicman <james@klicman.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/common.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
/* Shorthand casts that reinterpret a vector value as another AltiVec vector
 * type: signed/unsigned 16-bit, signed/unsigned 32-bit and unsigned 8-bit
 * elements. */
#define vs16(v) ((vector signed short)(v))
#define vs32(v) ((vector signed int)(v))
#define vu8(v) ((vector unsigned char)(v))
#define vu16(v) ((vector unsigned short)(v))
#define vu32(v) ((vector unsigned int)(v))
/* Cosine table: Ck = cos(k*PI/16), plus sqrt(2). */
#define C1 0.98078525066375732421875000 /* cos(1*PI/16) */
#define C2 0.92387950420379638671875000 /* cos(2*PI/16) */
#define C3 0.83146959543228149414062500 /* cos(3*PI/16) */
#define C4 0.70710676908493041992187500 /* cos(4*PI/16) */
#define C5 0.55557024478912353515625000 /* cos(5*PI/16) */
#define C6 0.38268342614173889160156250 /* cos(6*PI/16) */
#define C7 0.19509032368659973144531250 /* cos(7*PI/16) */
#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */
/* W0..WB: the twelve multipliers derived from the cosines above. They are
 * stored in fdctconsts and reach the transform code through the LD_W*
 * accessors. */
#define W0 -(2 * C2)
#define W1 (2 * C6)
#define W2 (SQRT_2 * C6)
#define W3 (SQRT_2 * C3)
#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7))
#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7))
#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7))
#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7))
#define W8 (SQRT_2 * ( C7 - C3))
#define W9 (SQRT_2 * (-C1 - C3))
#define WA (SQRT_2 * (-C3 - C5))
#define WB (SQRT_2 * ( C5 - C3))
/* The twelve W constants packed four per vector; ff_fdct_altivec() loads the
 * three vectors into its locals cnsts0, cnsts1 and cnsts2. */
static vector float fdctconsts[3] = {
{ W0, W1, W2, W3 },
{ W4, W5, W6, W7 },
{ W8, W9, WA, WB }
};
/* LD_Wn: a vector with Wn replicated into every element, produced by
 * splatting the matching element of cnsts0/cnsts1/cnsts2. These macros
 * therefore require those three variables to be in scope at the point of
 * use. */
#define LD_W0 vec_splat(cnsts0, 0)
#define LD_W1 vec_splat(cnsts0, 1)
#define LD_W2 vec_splat(cnsts0, 2)
#define LD_W3 vec_splat(cnsts0, 3)
#define LD_W4 vec_splat(cnsts1, 0)
#define LD_W5 vec_splat(cnsts1, 1)
#define LD_W6 vec_splat(cnsts1, 2)
#define LD_W7 vec_splat(cnsts1, 3)
#define LD_W8 vec_splat(cnsts2, 0)
#define LD_W9 vec_splat(cnsts2, 1)
#define LD_WA vec_splat(cnsts2, 2)
#define LD_WB vec_splat(cnsts2, 3)
/*
 * FDCTROW(b0..b7): one 8-point transform pass over eight vector float
 * lvalues; all eight arguments are overwritten with the result.
 *
 * The expansion uses the variables x0..x8, cnst and mzero from the
 * surrounding scope, plus cnsts0..cnsts2 through the LD_W* macros. mzero is
 * the addend of the vec_madd calls that act as plain multiplications.
 *
 * The statements are the same as in FDCTCOL; only the trailing comments
 * differ. No use of FDCTROW appears in ff_fdct_altivec(), which performs its
 * row pass with hand-expanded code instead.
 */
#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
\
b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
\
b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
cnst = LD_W2; \
b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
cnst = LD_W1; \
b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
cnst = LD_W0; \
b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
\
x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
cnst = LD_W3; \
x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
\
cnst = LD_W8; \
x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
cnst = LD_W9; \
x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
cnst = LD_WA; \
x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
cnst = LD_WB; \
x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
\
cnst = LD_W4; \
b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
cnst = LD_W5; \
b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
cnst = LD_W6; \
b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
cnst = LD_W7; \
b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
\
b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \
b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \
b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \
b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \
/* }}} */
/*
 * FDCTCOL(b0..b7): one 8-point transform pass over eight vector float
 * lvalues; all eight arguments are overwritten with the result.
 * ff_fdct_altivec() invokes it twice, once for each half of the 8x8 block,
 * as its column pass.
 *
 * The expansion uses the variables x0..x8, cnst and mzero from the
 * surrounding scope, plus cnsts0..cnsts2 through the LD_W* macros. mzero is
 * the addend of the vec_madd calls that act as plain multiplications.
 */
#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \
x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \
x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \
x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \
x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \
x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \
x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \
x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \
x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \
\
b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \
b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \
b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \
b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \
\
b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \
b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \
b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \
cnst = LD_W2; \
b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \
cnst = LD_W1; \
b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \
cnst = LD_W0; \
b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \
\
x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \
x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \
x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \
x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \
x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \
cnst = LD_W3; \
x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \
\
cnst = LD_W8; \
x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \
cnst = LD_W9; \
x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \
cnst = LD_WA; \
x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \
cnst = LD_WB; \
x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \
\
cnst = LD_W4; \
b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \
cnst = LD_W5; \
b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \
cnst = LD_W6; \
b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \
cnst = LD_W7; \
b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \
\
b7 = vec_add(b7, x2); /* b7 += x2; */ \
b5 = vec_add(b5, x3); /* b5 += x3; */ \
b3 = vec_add(b3, x2); /* b3 += x2; */ \
b1 = vec_add(b1, x3); /* b1 += x3; */ \
/* }}} */
/**
 * Two dimensional discrete cosine transform of one 8x8 block, in place.
 *
 * Stages:
 *  1. load the W constants and build a vector of -0.0 (mzero)
 *  2. transpose the 8x8 int16_t matrix
 *  3. row pass: the first additions/subtractions are done on 16-bit integers,
 *     the values are then widened to 32 bits, converted to float and the
 *     remaining arithmetic is done in float
 *  4. transpose the float matrix (eight rows of two vectors each)
 *  5. column pass through FDCTCOL, once per half
 *  6. round, convert back to 32-bit integers, pack to int16_t with vec_pack
 *     and store
 *
 * Variable naming: bN0 and bN1 hold the two float vectors of row N (during
 * the integer part only bN0 is used, holding eight 16-bit values).
 *
 * @param block 64 coefficients, read and overwritten. The data is accessed
 *              with vec_ld/vec_st, so it is assumed to be 16-byte aligned
 *              -- TODO confirm against the callers.
 */
void ff_fdct_altivec(int16_t *block)
{
    vector signed short *bp;
    vector float *cp;
    vector float b00, b10, b20, b30, b40, b50, b60, b70;
    vector float b01, b11, b21, b31, b41, b51, b61, b71;
    vector float mzero, cnst, cnsts0, cnsts1, cnsts2;
    vector float x0, x1, x2, x3, x4, x5, x6, x7, x8;

    /* setup constants {{{ */
    /* mzero = -0.0 */
    mzero = ((vector float)vec_splat_u32(-1));
    mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero)));
    cp = fdctconsts;
    cnsts0 = vec_ld(0, cp); cp++;
    cnsts1 = vec_ld(0, cp); cp++;
    cnsts2 = vec_ld(0, cp);
    /* }}} */

    /* 8x8 matrix transpose (vector short[8]) {{{ */
#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b))
    bp = (vector signed short*)block;
    b00 = ((vector float)vec_ld(0, bp));
    b40 = ((vector float)vec_ld(16*4, bp));
    b01 = ((vector float)MERGE_S16(h, b00, b40));
    b11 = ((vector float)MERGE_S16(l, b00, b40));
    bp++;
    b10 = ((vector float)vec_ld(0, bp));
    b50 = ((vector float)vec_ld(16*4, bp));
    b21 = ((vector float)MERGE_S16(h, b10, b50));
    b31 = ((vector float)MERGE_S16(l, b10, b50));
    bp++;
    b20 = ((vector float)vec_ld(0, bp));
    b60 = ((vector float)vec_ld(16*4, bp));
    b41 = ((vector float)MERGE_S16(h, b20, b60));
    b51 = ((vector float)MERGE_S16(l, b20, b60));
    bp++;
    b30 = ((vector float)vec_ld(0, bp));
    b70 = ((vector float)vec_ld(16*4, bp));
    b61 = ((vector float)MERGE_S16(h, b30, b70));
    b71 = ((vector float)MERGE_S16(l, b30, b70));

    x0 = ((vector float)MERGE_S16(h, b01, b41));
    x1 = ((vector float)MERGE_S16(l, b01, b41));
    x2 = ((vector float)MERGE_S16(h, b11, b51));
    x3 = ((vector float)MERGE_S16(l, b11, b51));
    x4 = ((vector float)MERGE_S16(h, b21, b61));
    x5 = ((vector float)MERGE_S16(l, b21, b61));
    x6 = ((vector float)MERGE_S16(h, b31, b71));
    x7 = ((vector float)MERGE_S16(l, b31, b71));

    b00 = ((vector float)MERGE_S16(h, x0, x4));
    b10 = ((vector float)MERGE_S16(l, x0, x4));
    b20 = ((vector float)MERGE_S16(h, x1, x5));
    b30 = ((vector float)MERGE_S16(l, x1, x5));
    b40 = ((vector float)MERGE_S16(h, x2, x6));
    b50 = ((vector float)MERGE_S16(l, x2, x6));
    b60 = ((vector float)MERGE_S16(h, x3, x7));
    b70 = ((vector float)MERGE_S16(l, x3, x7));
#undef MERGE_S16
    /* }}} */

    /* Some of the initial calculations can be done as vector short before
     * conversion to vector float. The following code section takes advantage
     * of this.
     */
    /* fdct rows {{{ */
    x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));
    x7 = ((vector float)vec_sub(vs16(b00), vs16(b70)));
    x1 = ((vector float)vec_add(vs16(b10), vs16(b60)));
    x6 = ((vector float)vec_sub(vs16(b10), vs16(b60)));
    x2 = ((vector float)vec_add(vs16(b20), vs16(b50)));
    x5 = ((vector float)vec_sub(vs16(b20), vs16(b50)));
    x3 = ((vector float)vec_add(vs16(b30), vs16(b40)));
    x4 = ((vector float)vec_sub(vs16(b30), vs16(b40)));

    b70 = ((vector float)vec_add(vs16(x0), vs16(x3)));
    b10 = ((vector float)vec_add(vs16(x1), vs16(x2)));

    b00 = ((vector float)vec_add(vs16(b70), vs16(b10)));
    b40 = ((vector float)vec_sub(vs16(b70), vs16(b10)));

    /* CTF0(n): widen the eight 16-bit values held in bn0 to 32 bits and
     * convert them to float; the high half ends up in bn0, the low half in
     * bn1. */
#define CTF0(n) \
    b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \
    b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \
    b##n##1 = vec_ctf(vs32(b##n##1), 0); \
    b##n##0 = vec_ctf(vs32(b##n##0), 0);

    CTF0(0);
    CTF0(4);

    b20 = ((vector float)vec_sub(vs16(x0), vs16(x3)));
    b60 = ((vector float)vec_sub(vs16(x1), vs16(x2)));

    CTF0(2);
    CTF0(6);
#undef CTF0

    x0 = vec_add(b60, b20);
    x1 = vec_add(b61, b21);

    cnst = LD_W2;
    x0 = vec_madd(cnst, x0, mzero);
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_W1;
    b20 = vec_madd(cnst, b20, x0);
    b21 = vec_madd(cnst, b21, x1);
    cnst = LD_W0;
    b60 = vec_madd(cnst, b60, x0);
    b61 = vec_madd(cnst, b61, x1);

    /* CTFX(x, b): widen the eight 16-bit values held in x and convert them
     * to float; the high half goes to b0, the low half to b1.
     * The last line of the definition must not end in a backslash, otherwise
     * the first CTFX() use below is absorbed into the macro body instead of
     * being executed. */
#define CTFX(x,b) \
    b##0 = ((vector float)vec_unpackh(vs16(x))); \
    b##1 = ((vector float)vec_unpackl(vs16(x))); \
    b##0 = vec_ctf(vs32(b##0), 0); \
    b##1 = vec_ctf(vs32(b##1), 0);

    CTFX(x4, b7);
    CTFX(x5, b5);
    CTFX(x6, b3);
    CTFX(x7, b1);
#undef CTFX

    /* odd part of the row pass, first half (the bN0 vectors) */
    x0 = vec_add(b70, b10);
    x1 = vec_add(b50, b30);
    x2 = vec_add(b70, b30);
    x3 = vec_add(b50, b10);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b70 = vec_madd(cnst, b70, x0);
    cnst = LD_W5;
    b50 = vec_madd(cnst, b50, x1);
    cnst = LD_W6;
    b30 = vec_madd(cnst, b30, x1);
    cnst = LD_W7;
    b10 = vec_madd(cnst, b10, x0);

    b70 = vec_add(b70, x2);
    b50 = vec_add(b50, x3);
    b30 = vec_add(b30, x2);
    b10 = vec_add(b10, x3);

    /* odd part of the row pass, second half (the bN1 vectors) */
    x0 = vec_add(b71, b11);
    x1 = vec_add(b51, b31);
    x2 = vec_add(b71, b31);
    x3 = vec_add(b51, b11);
    x8 = vec_add(x2, x3);
    cnst = LD_W3;
    x8 = vec_madd(cnst, x8, mzero);

    cnst = LD_W8;
    x0 = vec_madd(cnst, x0, mzero);
    cnst = LD_W9;
    x1 = vec_madd(cnst, x1, mzero);
    cnst = LD_WA;
    x2 = vec_madd(cnst, x2, x8);
    cnst = LD_WB;
    x3 = vec_madd(cnst, x3, x8);

    cnst = LD_W4;
    b71 = vec_madd(cnst, b71, x0);
    cnst = LD_W5;
    b51 = vec_madd(cnst, b51, x1);
    cnst = LD_W6;
    b31 = vec_madd(cnst, b31, x1);
    cnst = LD_W7;
    b11 = vec_madd(cnst, b11, x0);

    b71 = vec_add(b71, x2);
    b51 = vec_add(b51, x3);
    b31 = vec_add(b31, x2);
    b11 = vec_add(b11, x3);
    /* }}} */

    /* 8x8 matrix transpose (vector float[8][2]) {{{ */
    x0 = vec_mergel(b00, b20);
    x1 = vec_mergeh(b00, b20);
    x2 = vec_mergel(b10, b30);
    x3 = vec_mergeh(b10, b30);

    b00 = vec_mergeh(x1, x3);
    b10 = vec_mergel(x1, x3);
    b20 = vec_mergeh(x0, x2);
    b30 = vec_mergel(x0, x2);

    x4 = vec_mergel(b41, b61);
    x5 = vec_mergeh(b41, b61);
    x6 = vec_mergel(b51, b71);
    x7 = vec_mergeh(b51, b71);

    b41 = vec_mergeh(x5, x7);
    b51 = vec_mergel(x5, x7);
    b61 = vec_mergeh(x4, x6);
    b71 = vec_mergel(x4, x6);

    x0 = vec_mergel(b01, b21);
    x1 = vec_mergeh(b01, b21);
    x2 = vec_mergel(b11, b31);
    x3 = vec_mergeh(b11, b31);

    x4 = vec_mergel(b40, b60);
    x5 = vec_mergeh(b40, b60);
    x6 = vec_mergel(b50, b70);
    x7 = vec_mergeh(b50, b70);

    b40 = vec_mergeh(x1, x3);
    b50 = vec_mergel(x1, x3);
    b60 = vec_mergeh(x0, x2);
    b70 = vec_mergel(x0, x2);

    b01 = vec_mergeh(x5, x7);
    b11 = vec_mergel(x5, x7);
    b21 = vec_mergeh(x4, x6);
    b31 = vec_mergel(x4, x6);
    /* }}} */

    /* column pass, one invocation per half of the block */
    FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70);
    FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71);

    /* round, convert back to short {{{ */
    /* CTS(n): round both halves of row n, convert them to 32-bit integers,
     * pack the eight results into one vector of shorts and store it at bp. */
#define CTS(n) \
    b##n##0 = vec_round(b##n##0); \
    b##n##1 = vec_round(b##n##1); \
    b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \
    b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \
    b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \
    vec_st(vs16(b##n##0), 0, bp);

    bp = (vector signed short*)block;
    CTS(0); bp++;
    CTS(1); bp++;
    CTS(2); bp++;
    CTS(3); bp++;
    CTS(4); bp++;
    CTS(5); bp++;
    CTS(6); bp++;
    CTS(7);
#undef CTS
    /* }}} */
}
/* vim:set foldmethod=marker foldlevel=0: */

View File

@@ -0,0 +1,150 @@
/*
* FFT/IFFT transforms
* AltiVec-enabled
* Copyright (c) 2009 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
/**
 * Do a complex FFT with the parameters defined in ff_fft_init(). The
 * input data must first be permuted with the s->revtab table. No
 * 1.0/sqrt(n) normalization is done.
 * AltiVec-enabled.
 * This code assumes that the 'z' pointer is 16-byte aligned.
 * It also assumes that every FFTComplex is an 8-byte aligned pair of floats.
 *
 * The function is only declared here; its definition is not in this file
 * (presumably it lives in the assembly source built when HAVE_GNU_AS is set
 * -- TODO confirm).
 */
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
/* Variant installed as s->fft_calc by ff_fft_init_altivec(); also defined
 * outside this file. */
void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z);
#if HAVE_GNU_AS
/**
 * AltiVec half IMDCT: pre-rotation, FFT, then post-rotation with reordering.
 *
 * With n = 1 << s->mdct_bits, the pre-rotation reads input around
 * input + n/4, multiplies by the twiddles from s->tcos / s->tsin and
 * scatters the products into output at the positions given by s->revtab.
 * ff_fft_calc_altivec() is then run on output, and the post-rotation
 * rewrites output in place around output + n/4.
 *
 * input, output and the twiddle tables are accessed as vec_f, so they are
 * assumed to be 16-byte aligned -- TODO confirm against the callers.
 * ff_fft_init_altivec() only installs this function when
 * s->mdct_bits >= 5, i.e. when n32 below is at least 1.
 *
 * @param s      FFT context providing mdct_bits, revtab, tcos and tsin
 * @param output destination buffer, also used as the FFT work buffer
 * @param input  source coefficients
 */
static void ff_imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
int j, k;
int n = 1 << s->mdct_bits;
int n4 = n >> 2;
int n8 = n >> 3;
int n32 = n >> 5;
const uint16_t *revtabj = s->revtab;
const uint16_t *revtabk = s->revtab+n4;
/* all vector pointers are centred so that they can be indexed both with
   non-negative and with negative offsets */
const vec_f *tcos = (const vec_f*)(s->tcos+n8);
const vec_f *tsin = (const vec_f*)(s->tsin+n8);
const vec_f *pin = (const vec_f*)(input+n4);
vec_f *pout = (vec_f*)(output+n4);
/* pre rotation */
/* k runs from n32-1 down to 0; each iteration handles two input vectors on
   either side of the centre (CMULA with p = 0 and p = 1) */
k = n32-1;
do {
vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d;
/* CMULA(p,o0..o3): load one vector from each side of the centre, gather the
   real and imaginary parts, select the matching twiddles from
   cos0/cos1 and sin0/sin1, and leave the rotated values in r<p> / i<p> */
#define CMULA(p,o0,o1,o2,o3)\
a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\
b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\
re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\
im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\
cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\
sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\
r##p = im*cos - re*sin;\
i##p = re*cos + im*sin;
/* STORE2(v,dst): write two consecutive floats of v to output[dst*2] and
   output[dst*2+1] with two vec_ste calls. v is built as {r,i,r,i} by STORE8;
   NOTE(review): presumably so that the element picked by vec_ste is correct
   for either 8-byte position inside a 16-byte line -- confirm */
#define STORE2(v,dst)\
j = dst;\
vec_ste(v, 0, output+j*2);\
vec_ste(v, 4, output+j*2);
/* STORE8(p): scatter the four (r<p>, i<p>) pairs to the output slots named
   by revtabk / revtabj */
#define STORE8(p)\
a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\
b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\
c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\
d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\
STORE2(a, revtabk[ p*2-4]);\
STORE2(b, revtabk[ p*2-3]);\
STORE2(c, revtabj[-p*2+2]);\
STORE2(d, revtabj[-p*2+3]);
cos0 = tcos[k];
sin0 = tsin[k];
cos1 = tcos[-k-1];
sin1 = tsin[-k-1];
CMULA(0, 0,1,2,3);
CMULA(1, 2,3,0,1);
STORE8(0);
STORE8(1);
/* the two revtab cursors move towards each other, four entries per step */
revtabj += 4;
revtabk -= 4;
k--;
} while(k >= 0);
ff_fft_calc_altivec(s, (FFTComplex*)output);
/* post rotation + reordering */
/* j walks upwards from -n32 while k walks downwards from n32-1 */
j = -n32;
k = n32-1;
do {
vec_f cos,sin,re,im,a,b,c,d;
/* CMULB(d0,d1,o): rotate the vector pair stored at pout[o*2], pout[o*2+1]
   by the twiddles at index o and leave the two results in d0 and d1 */
#define CMULB(d0,d1,o)\
re = pout[o*2];\
im = pout[o*2+1];\
cos = tcos[o];\
sin = tsin[o];\
d0 = im*sin - re*cos;\
d1 = re*sin + im*cos;
CMULB(a,b,j);
CMULB(c,d,k);
/* results of the j side and of the k side are interleaved crosswise */
pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2));
pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0));
pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2));
pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0));
j++;
k--;
} while(k >= 0);
}
/**
 * Full IMDCT built on ff_imdct_half_altivec().
 *
 * The half transform is written to output + n/4 (n = 1 << s->mdct_bits).
 * The remaining parts of output are then filled, one vector at a time, with
 * element-reversed copies of that data: the copy written in front of
 * output + n/4 has its sign bits flipped, the one written from
 * output + 3*n/4 onwards keeps its signs.
 *
 * @param s      FFT context providing mdct_bits
 * @param output destination buffer, accessed as vec_u32
 * @param input  source coefficients, passed on to the half transform
 */
static void ff_imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    int size    = 1 << s->mdct_bits;
    int quarter = size >> 2;
    int count   = size >> 4;
    vec_u32 sign_mask = {1U<<31,1U<<31,1U<<31,1U<<31};
    vec_u32 *lower = (vec_u32*)(output + quarter);
    vec_u32 *upper = (vec_u32*)(output + quarter*3);
    int idx = 0;

    ff_imdct_half_altivec(s, output + quarter, input);

    while (idx < count) {
        vec_u32 negated = lower[idx] ^ sign_mask;
        vec_u32 mirror  = upper[-idx-1];

        lower[-idx-1] = vec_perm(negated, negated, vcprm(3,2,1,0));
        upper[idx]    = vec_perm(mirror, mirror, vcprm(3,2,1,0));
        idx++;
    }
}
#endif /* HAVE_GNU_AS */
/**
 * Install the AltiVec FFT routines into an FFTContext.
 *
 * Nothing is changed unless HAVE_GNU_AS is set. With it, fft_calc is always
 * replaced; the IMDCT entry points are replaced only when
 * s->mdct_bits is at least 5.
 *
 * @param s context whose function pointers are updated
 */
av_cold void ff_fft_init_altivec(FFTContext *s)
{
#if HAVE_GNU_AS
    s->fft_calc = ff_fft_calc_interleave_altivec;

    if (s->mdct_bits < 5)
        return;

    s->imdct_half = ff_imdct_half_altivec;
    s->imdct_calc = ff_imdct_calc_altivec;
#endif
}

View File

@@ -0,0 +1,449 @@
/*
* FFT transform with Altivec optimizations
* Copyright (c) 2009 Loren Merritt
*
* This algorithm (though not any of the implementation details) is
* based on libdjbfft by D. J. Bernstein.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* These functions are not individually interchangeable with the C versions.
* While C takes arrays of FFTComplex, Altivec leaves intermediate results
* in blocks as convenient to the vector size.
* i.e. {4x real, 4x imaginary, 4x real, ...}
*
* I ignore standard calling convention.
* Instead, the following registers are treated as global constants:
* v14: zero
* v15..v18: cosines
* v19..v29: permutations
* r9: 16
* r12: ff_cos_tabs
* and the rest are free for local use.
*/
#include "config.h"
#include "asm.S"
.text
// addi2 ra, imm: add the constant imm to register ra in place.
// The low 16 bits are added with addi (imm@l) and the adjusted high half
// with addis (imm@ha). Each instruction is only emitted when its half of
// the constant is non-zero, so a small constant costs one instruction.
.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
addis \ra, \ra, \imm@ha
.endif
.endm
// FFT4 a0, a1, a2, a3: 4-point FFT on the two input vectors a0 and a1.
// On exit a2 holds the four real parts and a3 the four imaginary parts;
// a0 and a1 are overwritten with intermediate values.
// Relies on the global permutation vectors v20..v24 (see fft_data).
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm
// FFT4x2: two independent 4-point FFTs with their instructions interleaved.
// The a-set (a0,a1 -> a2,a3) and the b-set (b0,b1 -> b2,b3) each perform
// exactly the FFT4 sequence; results land in a2/a3 and b2/b3 as
// {4x real} / {4x imaginary}, and a0,a1,b0,b1 are overwritten.
// Relies on the global permutation vectors v20..v24.
.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
vperm \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
vperm \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
vperm \b2,\b0,\b1,v20
vperm \b3,\b0,\b1,v21
vaddfp \a0,\a2,\a3 // {t1,t2,t6,t5}
vsubfp \a1,\a2,\a3 // {t3,t4,t8,t7}
vaddfp \b0,\b2,\b3
vsubfp \b1,\b2,\b3
vmrghw \a2,\a0,\a1 // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
vperm \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
vmrghw \b2,\b0,\b1
vperm \b3,\b0,\b1,v22
vaddfp \a0,\a2,\a3 // {r0,r1,i0,i1}
vsubfp \a1,\a2,\a3 // {r2,r3,i2,i3}
vaddfp \b0,\b2,\b3
vsubfp \b1,\b2,\b3
vperm \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
vperm \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
vperm \b2,\b0,\b1,v23
vperm \b3,\b0,\b1,v24
.endm
// FFT8: 8-point FFT. Inputs are a0,a1 (first four points) and b0,b1 (last
// four points); the results are written back to the same four registers as
// a0 = {r0..r3}, a1 = {i0..i3}, b0 = {r4..r7}, b1 = {i4..i7}.
// a2, a3, b2, b3 and b4 are scratch registers.
// The a-half runs the FFT4 sequence, interleaved with the b-half work.
// Relies on the global registers v14 (zero), v17/v18 (+-1/sqrt(2) factors)
// and the permutation vectors v20..v29.
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
vmrghw \b2,\b0,\b1 // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
vmrglw \b3,\b0,\b1 // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
vperm \a2,\a0,\a1,v20 // FFT4 ...
vperm \a3,\a0,\a1,v21
vaddfp \b0,\b2,\b3 // {t1,t3,t2,t4}
vsubfp \b1,\b2,\b3 // {r5,r7,i5,i7}
vperm \b4,\b1,\b1,v25 // vcprm(2,3,0,1) // {i5,i7,r5,r7}
vaddfp \a0,\a2,\a3
vsubfp \a1,\a2,\a3
vmaddfp \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
vmaddfp \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
vmrghw \a2,\a0,\a1
vperm \a3,\a0,\a1,v22
vperm \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
vperm \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
vaddfp \a0,\a2,\a3
vsubfp \a1,\a2,\a3
vaddfp \b0,\b2,\b3 // {t1,t2,t9,ta}
vsubfp \b1,\b2,\b3 // {t6,t5,tc,tb}
vperm \a2,\a0,\a1,v23
vperm \a3,\a0,\a1,v24
vperm \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
vperm \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
vsubfp \b0,\a2,\b2 // {r4,r5,r6,r7}
vsubfp \b1,\a3,\b3 // {i4,i5,i6,i7}
vaddfp \a0,\a2,\b2 // {r0,r1,r2,r3}
vaddfp \a1,\a3,\b3 // {i0,i1,i2,i3}
.endm
// BF d0, d1, s0, s1: butterfly, d0 = s0 + s1 and d1 = s0 - s1.
// The difference is written first, so d0 may name the same register as
// s0 or s1, but d1 must be distinct from both sources.
.macro BF d0,d1,s0,s1
vsubfp \d1,\s0,\s1
vaddfp \d0,\s0,\s1
.endm
// zip d0, d1, s0, s1: interleave the 32-bit words of s0 and s1.
// d0 receives the merged high words, d1 the merged low words.
// d0 is written before s0/s1 are read again, so it must not alias them.
.macro zip d0,d1,s0,s1
vmrghw \d0,\s0,\s1
vmrglw \d1,\s0,\s1
.endm
// def_fft4 interleave: emit the routine fft4<interleave>_altivec.
// The routine transforms the 32 bytes at r3 in place (r9 must hold 16).
// When the macro argument is non-blank the real/imaginary result vectors
// are zipped back into interleaved order before being stored; when it is
// blank they are stored as one vector of reals followed by one vector of
// imaginaries.
.macro def_fft4 interleave
fft4\interleave\()_altivec:
lvx v0, 0,r3
lvx v1,r9,r3
FFT4 v0,v1,v2,v3
.ifnb \interleave
zip v0,v1,v2,v3
stvx v0, 0,r3
stvx v1,r9,r3
.else
stvx v2, 0,r3
stvx v3,r9,r3
.endif
blr
.endm
// def_fft8 interleave: emit the routine fft8<interleave>_altivec.
// The routine transforms the 64 bytes at r3 in place; r4 is set to r3+32
// and r9 must hold 16. A non-blank macro argument selects the store path
// that zips reals and imaginaries back together; a blank argument stores
// the FFT8 output registers unchanged.
.macro def_fft8 interleave
fft8\interleave\()_altivec:
addi r4,r3,32
lvx v0, 0,r3
lvx v1,r9,r3
lvx v2, 0,r4
lvx v3,r9,r4
FFT8 v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
zip v4,v5,v0,v1
zip v6,v7,v2,v3
stvx v4, 0,r3
stvx v5,r9,r3
stvx v6, 0,r4
stvx v7,r9,r4
.else
stvx v0, 0,r3
stvx v1,r9,r3
stvx v2, 0,r4
stvx v3,r9,r4
.endif
blr
.endm
// def_fft16 interleave: emit the routine fft16<interleave>_altivec.
// The routine transforms the 128 bytes at r3 in place. r4, r5 and r6 are
// set to r3+32, r3+64 and r3+96; r9 must hold 16.
// The upper 64 bytes go through two 4-point FFTs (FFT4x2), the lower
// 64 bytes through one 8-point FFT, and the halves are then combined using
// the twiddle vectors v15 (wre) and v16 (wim) and the zero vector v14.
// NOTE: the interleaved store path uses v14 and v15 as zip destinations,
// so those two global constants no longer hold their values after an
// interleaved fft16 returns.
.macro def_fft16 interleave
fft16\interleave\()_altivec:
addi r5,r3,64
addi r6,r3,96
addi r4,r3,32
lvx v0, 0,r5
lvx v1,r9,r5
lvx v2, 0,r6
lvx v3,r9,r6
FFT4x2 v0,v1,v2,v3,v4,v5,v6,v7
lvx v0, 0,r3
lvx v1,r9,r3
lvx v2, 0,r4
lvx v3,r9,r4
FFT8 v0,v1,v2,v3,v8,v9,v10,v11,v12
vmaddfp v8,v4,v15,v14 // r2*wre
vmaddfp v9,v5,v15,v14 // i2*wre
vmaddfp v10,v6,v15,v14 // r3*wre
vmaddfp v11,v7,v15,v14 // i3*wre
vmaddfp v8,v5,v16,v8 // i2*wim
vnmsubfp v9,v4,v16,v9 // r2*wim
vnmsubfp v10,v7,v16,v10 // i3*wim
vmaddfp v11,v6,v16,v11 // r3*wim
BF v10,v12,v10,v8
BF v11,v13,v9,v11
BF v0,v4,v0,v10
BF v3,v7,v3,v12
BF v1,v5,v1,v11
BF v2,v6,v2,v13
.ifnb \interleave
zip v8, v9,v0,v1
zip v10,v11,v2,v3
zip v12,v13,v4,v5
zip v14,v15,v6,v7
stvx v8, 0,r3
stvx v9,r9,r3
stvx v10, 0,r4
stvx v11,r9,r4
stvx v12, 0,r5
stvx v13,r9,r5
stvx v14, 0,r6
stvx v15,r9,r6
.else
stvx v0, 0,r3
stvx v4, 0,r5
stvx v3,r9,r4
stvx v7,r9,r6
stvx v1,r9,r3
stvx v5,r9,r5
stvx v2, 0,r4
stvx v6, 0,r6
.endif
blr
.endm
// void pass(float *z, float *wre, int n)
// PASS interleave, suffix: emit the routine fft_pass<suffix>_altivec.
// Register interface, as used by the code below:
//   r3 = z, r4 = wre, r5 = n (iteration count, moved to CTR), r9 = 16.
// Byte offsets derived from n: o1 = n*32, o2 = n*64, o3 = o1 + o2, and the
// wim pointer r0 starts at wre + n*16 and walks backwards.
// Each iteration combines one pair of vectors (offset 0 and 16) from each of
// the four blocks at z, z+o1, z+o2 and z+o3, then advances z by 32 bytes and
// wre by 16 bytes.
// With interleave == 0 results are stored in block form; otherwise reals and
// imaginaries are merged (vmrghw/vmrglw) before each pair of stores.
// On exit r3 is moved back by o1, i.e. to its value on entry.
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
mtctr r5
slwi r0,r5,4
slwi r7,r5,6 // o2
slwi r5,r5,5 // o1
add r10,r5,r7 // o3
add r0,r4,r0 // wim
addi r6,r5,16 // o1+16
addi r8,r7,16 // o2+16
addi r11,r10,16 // o3+16
1:
lvx v8, 0,r4 // wre
lvx v10, 0,r0 // wim
sub r0,r0,r9
lvx v9, 0,r0
vperm v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
lvx v4,r3,r7 // r2 = z[o2]
lvx v5,r3,r8 // i2 = z[o2+16]
lvx v6,r3,r10 // r3 = z[o3]
lvx v7,r3,r11 // i3 = z[o3+16]
vmaddfp v10,v4,v8,v14 // r2*wre
vmaddfp v11,v5,v8,v14 // i2*wre
vmaddfp v12,v6,v8,v14 // r3*wre
vmaddfp v13,v7,v8,v14 // i3*wre
lvx v0, 0,r3 // r0 = z[0]
lvx v3,r3,r6 // i1 = z[o1+16]
vmaddfp v10,v5,v9,v10 // i2*wim
vnmsubfp v11,v4,v9,v11 // r2*wim
vnmsubfp v12,v7,v9,v12 // i3*wim
vmaddfp v13,v6,v9,v13 // r3*wim
lvx v1,r3,r9 // i0 = z[16]
lvx v2,r3,r5 // r1 = z[o1]
BF v12,v8,v12,v10
BF v13,v9,v11,v13
BF v0,v4,v0,v12
BF v3,v7,v3,v8
.if !\interleave
stvx v0, 0,r3
stvx v4,r3,r7
stvx v3,r3,r6
stvx v7,r3,r11
.endif
BF v1,v5,v1,v13
BF v2,v6,v2,v9
.if !\interleave
stvx v1,r3,r9
stvx v2,r3,r5
stvx v5,r3,r8
stvx v6,r3,r10
.else
vmrghw v8,v0,v1
vmrglw v9,v0,v1
stvx v8, 0,r3
stvx v9,r3,r9
vmrghw v8,v2,v3
vmrglw v9,v2,v3
stvx v8,r3,r5
stvx v9,r3,r6
vmrghw v8,v4,v5
vmrglw v9,v4,v5
stvx v8,r3,r7
stvx v9,r3,r8
vmrghw v8,v6,v7
vmrglw v9,v6,v7
stvx v8,r3,r10
stvx v9,r3,r11
.endif
addi r3,r3,32
addi r4,r4,16
bdnz 1b
sub r3,r3,r5
blr
.endm
#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
/*
 * Byte-index groups for building vperm control vectors: WORD_n selects
 * 32-bit word n of the first source operand, WORD_sn selects word n of the
 * second source operand. vcprm(a,b,c,d) emits the 16 control bytes that
 * pick words a, b, c and d in that order.
 */
#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f
#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
/*
 * Constant table read by fft_calc with two lvm sequences, one 16-byte row
 * per vector register in order: the five .float rows become v14..v18
 * (v14 = zero, v15/v16 = twiddle cosines/sines used by fft16,
 * v17/v18 = the +-1/sqrt(2) factors used by FFT8) and the eleven vcprm
 * rows become the permutation vectors v19..v29.
 * The row order must stay in sync with the register lists in fft_calc.
 */
.rodata
.align 4
fft_data:
.float 0, 0, 0, 0
.float 1, 0.92387953, M_SQRT1_2, 0.38268343
.float 0, 0.38268343, M_SQRT1_2, 0.92387953
.float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
.float M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
vcprm(s0,3,2,1)
vcprm(0,1,s2,s1)
vcprm(2,3,s0,s3)
vcprm(2,s3,3,s2)
vcprm(0,1,s0,s1)
vcprm(2,3,s2,s3)
vcprm(2,3,0,1)
vcprm(1,2,s3,s0)
vcprm(0,3,s2,s1)
vcprm(0,2,s1,s3)
vcprm(1,3,s0,s2)
// lvm b, r [, more regs]: load a list of vector registers from consecutive
// 16-byte slots starting at the address in b. The macro recurses over the
// register list and leaves b advanced by 16 bytes per register loaded.
.macro lvm b, r, regs:vararg
lvx \r, 0, \b
addi \b, \b, 16
.ifnb \regs
lvm \b, \regs
.endif
.endm
// stvm b, r [, more regs]: store a list of vector registers to consecutive
// 16-byte slots starting at the address in b. The macro recurses over the
// register list and leaves b advanced by 16 bytes per register stored.
.macro stvm b, r, regs:vararg
stvx \r, 0, \b
addi \b, \b, 16
.ifnb \regs
stvm \b, \regs
.endif
.endm
// fft_calc interleave: emit the externally visible entry point
// ff_fft_calc<interleave>_altivec. This is the only routine in the file that
// is entered with the normal calling convention; it sets up the private
// register convention described at the top of the file and dispatches to
// the size-specific routine.
// On entry r3 points at a structure whose first 32-bit word is used as the
// table index + 2 (presumably FFTContext.nbits -- confirm against the C
// declaration) and r4 is the data pointer handed to the FFT routine in r3.
// extfunc, get_got and movrel come from asm.S, which is not shown here.
.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
// Prologue: save LR, open a frame with room for ten vector registers.
mflr r0
stp r0, 2*PS(r1)
stpu r1, -(160+16*PS)(r1)
get_got r11
// Save v20..v29 at r1+16*PS before they are overwritten with constants.
addi r6, r1, 16*PS
stvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
// Save VRSAVE and mark v0..v29 as in use.
mfvrsave r0
stw r0, 15*PS(r1)
li r6, 0xfffffffc
mtvrsave r6
// Load the 16 constant vectors of fft_data into v14..v29.
movrel r6, fft_data, r11
lvm r6, v14, v15, v16, v17, v18, v19, v20, v21
lvm r6, v22, v23, v24, v25, v26, v27, v28, v29
li r9, 16
movrel r12, X(ff_cos_tabs), r11
// Index the dispatch table with (first word of *r3) - 2, scaled by the
// pointer size, and call the selected routine with r3 = data pointer.
movrel r6, fft_dispatch_tab\interleave\()_altivec, r11
lwz r3, 0(r3)
subi r3, r3, 2
slwi r3, r3, 2+ARCH_PPC64
lpx r3, r3, r6
mtctr r3
mr r3, r4
bctrl
// Epilogue: restore v20..v29, VRSAVE, the stack pointer and LR.
addi r6, r1, 16*PS
lvm r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lwz r6, 15*PS(r1)
mtvrsave r6
lp r1, 0(r1)
lp r0, 2*PS(r1)
mtlr r0
blr
.endm
// DECL_FFT suffix, bits, n, n2, n4: emit the routine fft<n><suffix>_altivec
// for a transform of n = 1 << bits points, built from smaller routines:
//   1. fft<n2> on the data at r3,
//   2. fft<n4> on the data n*4 bytes further on,
//   3. fft<n4> on the data another n*2 bytes further on,
//   4. r3 is moved back by n*6 bytes and the routine tail-branches to
//      fft_pass<suffix> with r4 = pointer loaded from slot bits of the
//      table at r12 (ff_cos_tabs) and r5 = n/16.
// The sub-transforms always use the non-suffixed (block layout) routines;
// only the final pass honours the suffix.
// LR is parked at PS*(bits-3)(r1), so every recursion depth uses its own
// slot in the frame opened by fft_calc.
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
mflr r0
stp r0,PS*(\bits-3)(r1)
bl fft\n2\()_altivec
addi2 r3,\n*4
bl fft\n4\()_altivec
addi2 r3,\n*2
bl fft\n4\()_altivec
addi2 r3,\n*-6
lp r0,PS*(\bits-3)(r1)
lp r4,\bits*PS(r12)
mtlr r0
li r5,\n/16
b fft_pass\suffix\()_altivec
.endm
// DECL_FFTS interleave, suffix: instantiate one complete family of routines.
// Emits fft4/fft8/fft16, the pass routine, fft32 .. fft65536, the public
// entry point ff_fft_calc<suffix>_altivec, and a read-only dispatch table
// of pointers to fft4 .. fft65536 (15 entries, indexed by fft_calc).
// interleave (0 or 1) is forwarded to PASS; suffix is appended to every
// generated symbol name and, when non-blank, also selects the interleaved
// store path in def_fft4/8/16.
.macro DECL_FFTS interleave, suffix
.text
def_fft4 \suffix
def_fft8 \suffix
def_fft16 \suffix
PASS \interleave, \suffix
DECL_FFT \suffix, 5, 32, 16, 8
DECL_FFT \suffix, 6, 64, 32, 16
DECL_FFT \suffix, 7, 128, 64, 32
DECL_FFT \suffix, 8, 256, 128, 64
DECL_FFT \suffix, 9, 512, 256, 128
DECL_FFT \suffix,10, 1024, 512, 256
DECL_FFT \suffix,11, 2048, 1024, 512
DECL_FFT \suffix,12, 4096, 2048, 1024
DECL_FFT \suffix,13, 8192, 4096, 2048
DECL_FFT \suffix,14,16384, 8192, 4096
DECL_FFT \suffix,15,32768,16384, 8192
DECL_FFT \suffix,16,65536,32768,16384
fft_calc \suffix
.rodata
.align 3
fft_dispatch_tab\suffix\()_altivec:
PTR fft4\suffix\()_altivec
PTR fft8\suffix\()_altivec
PTR fft16\suffix\()_altivec
PTR fft32\suffix\()_altivec
PTR fft64\suffix\()_altivec
PTR fft128\suffix\()_altivec
PTR fft256\suffix\()_altivec
PTR fft512\suffix\()_altivec
PTR fft1024\suffix\()_altivec
PTR fft2048\suffix\()_altivec
PTR fft4096\suffix\()_altivec
PTR fft8192\suffix\()_altivec
PTR fft16384\suffix\()_altivec
PTR fft32768\suffix\()_altivec
PTR fft65536\suffix\()_altivec
.endm
// Two families are generated. The first (blank suffix) keeps results in
// block layout and also provides the sub-transforms called by every
// DECL_FFT routine; the second (_interleave) merges reals and imaginaries
// in its final stores and provides ff_fft_calc_interleave_altivec.
DECL_FFTS 0
DECL_FFTS 1, _interleave

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
static void vector_fmul_reverse_altivec(float *dst, const float *src0,
const float *src1, int len)
{
int i;
vector float d, s0, s1, h0, l0,
s2, s3, zero = (vector float)vec_splat_u32(0);
src1 += len-4;
for(i=0; i<len-7; i+=8) {
s1 = vec_ld(0, src1-i); // [a,b,c,d]
s0 = vec_ld(0, src0+i);
l0 = vec_mergel(s1, s1); // [c,c,d,d]
s3 = vec_ld(-16, src1-i);
h0 = vec_mergeh(s1, s1); // [a,a,b,b]
s2 = vec_ld(16, src0+i);
s1 = vec_mergeh(vec_mergel(l0,h0), // [d,b,d,b]
vec_mergeh(l0,h0)); // [c,a,c,a]
// [d,c,b,a]
l0 = vec_mergel(s3, s3);
d = vec_madd(s0, s1, zero);
h0 = vec_mergeh(s3, s3);
vec_st(d, 0, dst+i);
s3 = vec_mergeh(vec_mergel(l0,h0),
vec_mergeh(l0,h0));
d = vec_madd(s2, s3, zero);
vec_st(d, 16, dst+i);
}
}
/**
 * Fused multiply-add over float arrays: dst[i] = src0[i] * src1[i] + src2[i].
 *
 * Four elements are processed per iteration; a remainder of fewer than four
 * elements is not written. The three sources are read with plain vec_ld
 * (assumes they are 16-byte aligned -- TODO confirm with callers), while dst
 * is handled as possibly unaligned: the two 16-byte blocks covering the
 * destination are loaded, the new values are permuted into place, and both
 * blocks are written back.
 *
 * @param dst  output buffer, may be unaligned
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 addend
 * @param len  number of elements
 */
static void vector_fmul_add_altivec(float *dst, const float *src0,
const float *src1, const float *src2,
int len)
{
int i;
vector float d, s0, s1, s2, t0, t1, edges;
/* Permute controls derived from the alignment of dst; i advances by
 * 16 bytes per iteration, so they stay valid for every dst+i. */
vector unsigned char align = vec_lvsr(0,dst),
mask = vec_lvsl(0, dst);
for (i=0; i<len-3; i+=4) {
/* The two aligned blocks that the 16 destination bytes straddle. */
t0 = vec_ld(0, dst+i);
t1 = vec_ld(15, dst+i);
s0 = vec_ld(0, src0+i);
s1 = vec_ld(0, src1+i);
s2 = vec_ld(0, src2+i);
/* Existing bytes around the destination window, to be preserved. */
edges = vec_perm(t1 ,t0, mask);
d = vec_madd(s0,s1,s2);
/* Rotate the result into position and merge it with the edges. */
t1 = vec_perm(d, edges, align);
t0 = vec_perm(edges, d, align);
/* Upper block first; for an aligned dst both stores hit the same
 * block and the second one leaves the result in place. */
vec_st(t1, 15, dst+i);
vec_st(t0, 0, dst+i);
}
}
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len)
{
vector float zero, t0, t1, s0, s1, wi, wj;
const vector unsigned char reverse = vcprm(3,2,1,0);
int i,j;
dst += len;
win += len;
src0+= len;
zero = (vector float)vec_splat_u32(0);
for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
s0 = vec_ld(i, src0);
s1 = vec_ld(j, src1);
wi = vec_ld(i, win);
wj = vec_ld(j, win);
s1 = vec_perm(s1, s1, reverse);
wj = vec_perm(wj, wj, reverse);
t0 = vec_madd(s0, wj, zero);
t0 = vec_nmsub(s1, wi, t0);
t1 = vec_madd(s0, wi, zero);
t1 = vec_madd(s1, wj, t1);
t1 = vec_perm(t1, t1, reverse);
vec_st(t0, i, dst);
vec_st(t1, j, dst);
}
}
/**
 * Install the AltiVec float vector helpers into a DSP context.
 *
 * vector_fmul_reverse and vector_fmul_add are always replaced.
 * vector_fmul_window is left untouched when the codec context requests
 * bit-exact output (CODEC_FLAG_BITEXACT).
 *
 * @param c     DSP context to update
 * @param avctx codec context supplying the flags
 */
void ff_float_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->vector_fmul_reverse = vector_fmul_reverse_altivec;
    c->vector_fmul_add     = vector_fmul_add_altivec;

    if (avctx->flags & CODEC_FLAG_BITEXACT)
        return;

    c->vector_fmul_window = vector_fmul_window_altivec;
}

View File

@@ -0,0 +1,165 @@
/*
* Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/fmtconvert.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavutil/mem.h"
#include "dsputil_altivec.h"
static void int32_to_float_fmul_scalar_altivec(float *dst, const int *src,
float mul, int len)
{
union {
vector float v;
float s[4];
} mul_u;
int i;
vector float src1, src2, dst1, dst2, mul_v, zero;
zero = (vector float)vec_splat_u32(0);
mul_u.s[0] = mul;
mul_v = vec_splat(mul_u.v, 0);
for (i = 0; i < len; i += 8) {
src1 = vec_ctf(vec_ld(0, src+i), 0);
src2 = vec_ctf(vec_ld(16, src+i), 0);
dst1 = vec_madd(src1, mul_v, zero);
dst2 = vec_madd(src2, mul_v, zero);
vec_st(dst1, 0, dst+i);
vec_st(dst2, 16, dst+i);
}
}
static vector signed short float_to_int16_one_altivec(const float *src)
{
vector float s0 = vec_ld(0, src);
vector float s1 = vec_ld(16, src);
vector signed int t0 = vec_cts(s0, 0);
vector signed int t1 = vec_cts(s1, 0);
return vec_packs(t0,t1);
}
static void float_to_int16_altivec(int16_t *dst, const float *src, long len)
{
int i;
vector signed short d0, d1, d;
vector unsigned char align;
if (((long)dst) & 15) { //FIXME
for (i = 0; i < len - 7; i += 8) {
d0 = vec_ld(0, dst+i);
d = float_to_int16_one_altivec(src + i);
d1 = vec_ld(15, dst+i);
d1 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
align = vec_lvsr(0, dst + i);
d0 = vec_perm(d1, d, align);
d1 = vec_perm(d, d1, align);
vec_st(d0, 0, dst + i);
vec_st(d1, 15, dst + i);
}
} else {
for (i = 0; i < len - 7; i += 8) {
d = float_to_int16_one_altivec(src + i);
vec_st(d, 0, dst + i);
}
}
}
/*
 * Store element `elem` of vector `v` to *dst, then advance dst by `inc`
 * elements. The element is splatted across a temporary first, so vec_ste
 * writes the wanted value whichever lane the address selects.
 *
 * The temporary has a macro-private name so that a caller may pass a vector
 * called `s` (or declare one nearby) without it being captured, and `dst`
 * and `inc` are parenthesised so expression arguments expand as intended.
 */
#define VSTE_INC(dst, v, elem, inc) do { \
        vector signed short vste_splat_ = vec_splat(v, elem); \
        vec_ste(vste_splat_, 0, (dst)); \
        (dst) += (inc); \
    } while (0)
/**
 * Convert floats to signed 16-bit samples and scatter them with a stride.
 *
 * Eight samples are converted per iteration and written one element at a
 * time, advancing dst by `stride` elements after each store. A remainder of
 * fewer than eight samples is not converted.
 *
 * @param dst    first output sample; later samples go to dst + k*stride
 * @param src    input floats, read with vec_ld
 * @param len    number of input samples
 * @param stride distance between consecutive output samples, in elements
 */
static void float_to_int16_stride_altivec(int16_t *dst, const float *src,
                                          long len, int stride)
{
    int i;
    vector signed short d;

    for (i = 0; i < len - 7; i += 8) {
        d = float_to_int16_one_altivec(src + i);
        /* vec_splat needs a literal element index, hence the unrolling. */
        VSTE_INC(dst, d, 0, stride);
        VSTE_INC(dst, d, 1, stride);
        VSTE_INC(dst, d, 2, stride);
        VSTE_INC(dst, d, 3, stride);
        VSTE_INC(dst, d, 4, stride);
        VSTE_INC(dst, d, 5, stride);
        VSTE_INC(dst, d, 6, stride);
        VSTE_INC(dst, d, 7, stride);
    }
}
/**
 * Convert planar float channels to interleaved signed 16-bit samples.
 *
 * - 1 channel: forwarded to float_to_int16_altivec().
 * - 2 channels: dedicated paths for aligned and unaligned dst that convert
 *   eight samples per channel and merge them into 16 interleaved samples.
 * - otherwise: one strided conversion per channel.
 *
 * @param dst      interleaved output
 * @param src      array of `channels` pointers to planar float input
 * @param len      samples per channel
 * @param channels number of channels
 */
static void float_to_int16_interleave_altivec(int16_t *dst, const float **src,
long len, int channels)
{
int i;
vector signed short d0, d1, d2, c0, c1, t0, t1;
vector unsigned char align;
if (channels == 1)
float_to_int16_altivec(dst, src[0], len);
else {
if (channels == 2) {
if (((long)dst) & 15) {
/* Unaligned stereo output: the 32 output bytes straddle three aligned
 * blocks, so the outer two are loaded to preserve the bytes around the
 * destination window. */
for (i = 0; i < len - 7; i += 8) {
d0 = vec_ld(0, dst + i);
t0 = float_to_int16_one_altivec(src[0] + i);
d1 = vec_ld(31, dst + i);
t1 = float_to_int16_one_altivec(src[1] + i);
/* Interleave the two channels: c0 = first 8 output samples, c1 = next 8. */
c0 = vec_mergeh(t0, t1);
c1 = vec_mergel(t0, t1);
/* d2 = existing bytes bordering the window. */
d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
align = vec_lvsr(0, dst + i);
d0 = vec_perm(d2, c0, align);
d1 = vec_perm(c0, c1, align);
vec_st(d0, 0, dst + i);
d0 = vec_perm(c1, d2, align);
vec_st(d1, 15, dst + i);
vec_st(d0, 31, dst + i);
/* dst + i must advance by 16 samples per iteration while i only
 * advances by 8, so dst itself is bumped by the other 8. */
dst += 8;
}
} else {
/* Aligned stereo output: two plain stores per iteration. */
for (i = 0; i < len - 7; i += 8) {
t0 = float_to_int16_one_altivec(src[0] + i);
t1 = float_to_int16_one_altivec(src[1] + i);
d0 = vec_mergeh(t0, t1);
d1 = vec_mergel(t0, t1);
vec_st(d0, 0, dst + i);
vec_st(d1, 16, dst + i);
/* Same pointer adjustment as in the unaligned path. */
dst += 8;
}
}
} else {
/* Generic channel count: channel i is written to dst+i with a stride of
 * `channels` samples. */
for (i = 0; i < channels; i++)
float_to_int16_stride_altivec(dst + i, src[i], len, channels);
}
}
}
/**
 * Install the AltiVec sample-format converters into a FmtConvertContext.
 *
 * int32_to_float_fmul_scalar is always replaced. The two float-to-int16
 * converters are left untouched when the codec context requests bit-exact
 * output (CODEC_FLAG_BITEXACT).
 *
 * @param c     conversion context to update
 * @param avctx codec context supplying the flags
 */
void ff_fmt_convert_init_altivec(FmtConvertContext *c, AVCodecContext *avctx)
{
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_altivec;

    if (avctx->flags & CODEC_FLAG_BITEXACT)
        return;

    c->float_to_int16            = float_to_int16_altivec;
    c->float_to_int16_interleave = float_to_int16_interleave_altivec;
}

View File

@@ -0,0 +1,131 @@
/*
* GMC (Global Motion Compensation)
* AltiVec-enabled
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
/*
AltiVec-enhanced gmc1. At the moment this code assumes that stride is a
multiple of 8, to preserve proper dst alignment.
*/
/**
 * One-point global motion compensation for a block 8 pixels wide.
 *
 * Each output pixel is the bilinear blend
 *   (A*s[0] + B*s[1] + C*s[stride] + D*s[stride+1] + rounder) >> 8
 * with A=(16-x16)(16-y16), B=x16(16-y16), C=(16-x16)y16, D=x16*y16,
 * computed in 16-bit lanes.
 *
 * @param dst     output, 8 bytes written per row (8-byte aligned)
 * @param src     input, any alignment; h+1 rows of 9 bytes are read
 * @param stride  distance between rows for both dst and src
 * @param h       number of rows
 * @param x16     horizontal fractional offset in 1/16 pixel units
 * @param y16     vertical fractional offset in 1/16 pixel units
 * @param rounder rounding constant added before the shift
 */
void ff_gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
{
const DECLARE_ALIGNED(16, unsigned short, rounder_a) = rounder;
const DECLARE_ALIGNED(16, unsigned short, ABCD)[8] =
{
(16-x16)*(16-y16), /* A */
( x16)*(16-y16), /* B */
(16-x16)*( y16), /* C */
( x16)*( y16), /* D */
0, 0, 0, 0 /* padding */
};
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
int i;
unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
unsigned long src_really_odd = (unsigned long)src & 0x0000000F;
/* Broadcast each of the four weights and the rounder to all lanes. */
tempA = vec_ld(0, (const unsigned short*)ABCD);
Av = vec_splat(tempA, 0);
Bv = vec_splat(tempA, 1);
Cv = vec_splat(tempA, 2);
Dv = vec_splat(tempA, 3);
rounderV = vec_splat((vec_u16)vec_lde(0, &rounder_a), 0);
// we'll be able to pick up our 9 char elements
// at src from those 32 bytes
// we load the first batch here, as inside the loop
// we can re-use 'src+stride' from one iteration
// as the 'src' of the next.
src_0 = vec_ld(0, src);
src_1 = vec_ld(16, src);
srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
if (src_really_odd != 0x0000000F) {
// if src & 0xF == 0xF, then (src+1) is properly aligned
// on the second vector.
srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
} else {
srcvB = src_1;
}
/* Widen the first 8 bytes of each row vector to 16-bit lanes. */
srcvA = vec_mergeh(vczero, srcvA);
srcvB = vec_mergeh(vczero, srcvB);
for(i=0; i<h; i++) {
/* Recomputed per row: tells which half of the aligned 16-byte block
 * the 8 output bytes occupy. */
dst_odd = (unsigned long)dst & 0x0000000F;
src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;
dstv = vec_ld(0, dst);
// we'll be able to pick up our 9 char elements
// at src + stride from those 32 bytes
// then reuse the resulting 2 vectors srcvC and srcvD
// as the next srcvA and srcvB
src_0 = vec_ld(stride + 0, src);
src_1 = vec_ld(stride + 16, src);
srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
if (src_really_odd != 0x0000000F) {
// if src & 0xF == 0xF, then (src+1) is properly aligned
// on the second vector.
srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
} else {
srcvD = src_1;
}
srcvC = vec_mergeh(vczero, srcvC);
srcvD = vec_mergeh(vczero, srcvD);
// OK, now we (finally) do the math :-)
// those four instructions replace 32 int muls & 32 int adds.
// isn't AltiVec nice ?
tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);
/* The lower row of this iteration is the upper row of the next. */
srcvA = srcvC;
srcvB = srcvD;
tempD = vec_sr(tempD, vcsr8);
/* Narrow to bytes: the 8 results end up in the first half of dstv2. */
dstv2 = vec_pack(tempD, (vector unsigned short)vczero);
/* Merge the 8 results into the right half of the existing 16-byte
 * block so the other half of dst is preserved. */
if (dst_odd) {
dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
} else {
dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
}
vec_st(dstv2, 0, dst);
dst += stride;
src += stride;
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,775 @@
/*
* Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/mem.h"
/*
 * Debug-only check that a pointer is 16-byte aligned, as required by the
 * vec_ld/vec_st accesses that follow it. Expands to an empty statement in
 * non-DEBUG builds. The trailing semicolon is part of the expansion, so
 * call sites work with or without their own.
 *
 * The condition is negated: the assertion must hold when the low four
 * address bits are all zero, not when any of them is set.
 */
#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
/* this code assumes that stride % 16 == 0 */
/*
 * One row of the general (two-dimensional) 8-pixel chroma interpolation.
 *
 * Expects, in the enclosing function: vsrc2uc/vsrc3uc holding the next
 * source row at offsets 0 and +1, vsrc0ssH/vsrc1ssH holding the current row
 * already widened to 16 bits, the weights vA..vD, the shift count v6us, the
 * store permute fperm, and the pointers dst/src with their stride.
 * Computes psum = A*s0 + B*s1 + C*s2 + D*s3, seeded with BIAS1 and passed
 * through BIAS2() before the right shift by 6, packs it to bytes, merges it
 * into the existing 16-byte dst block via fperm, applies OP_U8_ALTIVEC
 * (supplied by the including file), and stores the block.
 * Afterwards the widened next row becomes the current row and dst/src
 * advance by one stride.
 */
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
psum = vec_mladd(vB, vsrc1ssH, psum);\
psum = vec_mladd(vC, vsrc2ssH, psum);\
psum = vec_mladd(vD, vsrc3ssH, psum);\
psum = BIAS2(psum);\
psum = vec_sr(psum, v6us);\
\
vdst = vec_ld(0, dst);\
ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\
\
OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
vec_st(fsum, 0, dst);\
\
vsrc0ssH = vsrc2ssH;\
vsrc1ssH = vsrc3ssH;\
\
dst += stride;\
src += stride;
/*
 * One row of the one-dimensional 8-pixel chroma interpolation, used when
 * the D weight is zero so only two taps contribute.
 *
 * Expects vsrc0uc/vsrc1uc to hold the two tap vectors as bytes, plus vA, vE,
 * v32ss, v6us, fperm, dst, src and stride in the enclosing function.
 * Computes (A*s0 + E*s1 + 32) >> 6, merges the packed result into the
 * existing 16-byte dst block via fperm, applies OP_U8_ALTIVEC (supplied by
 * the including file), stores the block, and advances dst/src by one stride.
 */
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vE, vsrc1ssH, psum);\
psum = vec_sr(psum, v6us);\
\
vdst = vec_ld(0, dst);\
ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\
\
OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
vec_st(fsum, 0, dst);\
\
dst += stride;\
src += stride;
/* BIAS2 arguments for CHROMA_MC8_ALTIVEC_CORE: noop leaves the sum as it is,
 * add28 adds the v28ss vector declared in the calling function. */
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
#ifdef PREFIX_h264_chroma_mc8_altivec
/**
 * H.264 chroma motion compensation for a block 8 pixels wide.
 *
 * Each output pixel is the bilinear blend of four neighbours with weights
 * A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy (which sum to 64), rounded with +32
 * and shifted right by 6. When D is zero a cheaper two-tap path is used.
 * How the result is combined with dst is decided by OP_U8_ALTIVEC, which the
 * including file defines.
 *
 * @param dst    output; 8 bytes per row are replaced inside an aligned
 *               16-byte block (presumably dst is 8-byte aligned -- confirm)
 * @param src    input, any alignment
 * @param stride row distance for dst and src
 * @param h      number of rows
 * @param x      horizontal fractional position, 0..7
 * @param y      vertical fractional position, 0..7
 */
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
int stride, int h, int x, int y) {
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
((8 - x) * ( y)),
(( x) * ( y))};
register int i;
vec_u8 fperm;
const vec_s32 vABCD = vec_ld(0, ABCD);
/* The weights are 32-bit; viewed as 16-bit lanes the odd lanes are the
 * ones splatted here (the low halves on a big-endian layout). */
const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
LOAD_ZERO;
const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
const vec_u16 v6us = vec_splat_u16(6);
/* loadSecond: the 9 source bytes of a row cross into the next aligned
 * 16-byte block. reallyBadAlign: src+1 is itself 16-byte aligned, so the
 * +1 tap is exactly the second block. */
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
vec_u8 vsrc0uc, vsrc1uc;
vec_s16 vsrc0ssH, vsrc1ssH;
vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
vec_s16 vsrc2ssH, vsrc3ssH, psum;
vec_u8 vdst, ppsum, vfdst, fsum;
/* fperm places the 8 new bytes in the first or second half of the
 * aligned dst block and keeps the other half from the old contents. */
if (((unsigned long)dst) % 16 == 0) {
fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F};
} else {
fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F};
}
/* Prime the pipeline with the first source row (taps at +0 and +1). */
vsrcAuc = vec_ld(0, src);
if (loadSecond)
vsrcBuc = vec_ld(16, src);
vsrcperm0 = vec_lvsl(0, src);
vsrcperm1 = vec_lvsl(1, src);
vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
if (reallyBadAlign)
vsrc1uc = vsrcBuc;
else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
/* D != 0: full four-tap blend, one extra source row loaded per output row. */
if (ABCD[3]) {
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
}
} else {
vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src);
vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
if (reallyBadAlign)
vsrc3uc = vsrcDuc;
else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
}
}
} else {
/* D == 0: at most one of B and C is non-zero, so E = B + C is the
 * single second weight of a two-tap filter. */
const vec_s16 vE = vec_add(vB, vC);
/* Vertical-only filter: taps are the current row and the next row. */
if (ABCD[2]) { // x == 0 B == 0
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
vsrc0uc = vsrc1uc;
}
} else {
vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 15, src);
vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
vsrc0uc = vsrc1uc;
}
}
/* Horizontal-only filter: both taps come from the current row, which is
 * reloaded on every iteration. */
} else { // y == 0 C == 0
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(0, src);
vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
}
} else {
vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(0, src);
vsrcDuc = vec_ld(15, src);
vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
if (reallyBadAlign)
vsrc1uc = vsrcDuc;
else
vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE_SIMPLE
}
}
}
}
}
#endif
/* this code assumes that stride % 16 == 0 */
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
/**
 * VC-1 "no rounding" chroma motion compensation for a block 8 pixels wide.
 *
 * Same four-tap bilinear blend as the H.264 variant (weights A=(8-x)(8-y),
 * B=x(8-y), C=(8-x)y, D=xy), but the bias is 28 instead of 32 and is added
 * after the products have been accumulated. The four-tap path is always
 * taken, with no special case for zero weights.
 * How the result is combined with dst is decided by OP_U8_ALTIVEC, which the
 * including file defines.
 *
 * @param dst    output; 8 bytes per row are replaced inside an aligned
 *               16-byte block (presumably dst is 8-byte aligned -- confirm)
 * @param src    input, any alignment
 * @param stride row distance for dst and src
 * @param h      number of rows
 * @param x      horizontal fractional position, 0..7
 * @param y      vertical fractional position, 0..7
 */
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
DECLARE_ALIGNED(16, signed int, ABCD)[4] =
{((8 - x) * (8 - y)),
(( x) * (8 - y)),
((8 - x) * ( y)),
(( x) * ( y))};
register int i;
vec_u8 fperm;
const vec_s32 vABCD = vec_ld(0, ABCD);
/* The weights are 32-bit; viewed as 16-bit lanes the odd lanes are the
 * ones splatted here (the low halves on a big-endian layout). */
const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
LOAD_ZERO;
/* 28 = (1 << 5) - 4, consumed by the add28() bias step. */
const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
const vec_u16 v6us = vec_splat_u16(6);
/* loadSecond: the 9 source bytes of a row cross into the next aligned
 * 16-byte block. reallyBadAlign: src+1 is itself 16-byte aligned. */
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
vec_u8 vsrc0uc, vsrc1uc;
vec_s16 vsrc0ssH, vsrc1ssH;
vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
vec_s16 vsrc2ssH, vsrc3ssH, psum;
vec_u8 vdst, ppsum, vfdst, fsum;
/* fperm places the 8 new bytes in the first or second half of the
 * aligned dst block and keeps the other half from the old contents. */
if (((unsigned long)dst) % 16 == 0) {
fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F};
} else {
fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F};
}
/* Prime the pipeline with the first source row (taps at +0 and +1). */
vsrcAuc = vec_ld(0, src);
if (loadSecond)
vsrcBuc = vec_ld(16, src);
vsrcperm0 = vec_lvsl(0, src);
vsrcperm1 = vec_lvsl(1, src);
vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
if (reallyBadAlign)
vsrc1uc = vsrcBuc;
else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
}
} else {
vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src);
vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
if (reallyBadAlign)
vsrc3uc = vsrcDuc;
else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
}
}
}
#endif
/* Drop the chroma helper macros so they do not leak into the rest of the
 * including file. CHROMA_MC8_ALTIVEC_CORE_SIMPLE is left defined here. */
#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
/**
 * Horizontal six-tap low-pass filter over a 16x16 block.
 *
 * For every pixel the filter computes
 *   (20*(p[0]+p[1]) - 5*(p[-1]+p[2]) + (p[-2]+p[3]) + 16) >> 5
 * in 16-bit lanes and packs the result to bytes with unsigned saturation.
 * How the result is combined with dst is decided by OP_U8_ALTIVEC, which the
 * including file defines.
 *
 * @param dst       output, one 16-byte vector stored per row with vec_st
 * @param src       input, any alignment; bytes src-2 .. src+18 of each row
 *                  are used
 * @param dstStride row distance for dst
 * @param srcStride row distance for src
 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
register int i;
LOAD_ZERO;
/* One realignment permute per tap offset (-2 .. +3). */
const vec_u8 permM2 = vec_lvsl(-2, src);
const vec_u8 permM1 = vec_lvsl(-1, src);
const vec_u8 permP0 = vec_lvsl(+0, src);
const vec_u8 permP1 = vec_lvsl(+1, src);
const vec_u8 permP2 = vec_lvsl(+2, src);
const vec_u8 permP3 = vec_lvsl(+3, src);
const vec_s16 v5ss = vec_splat_s16(5);
const vec_u16 v5us = vec_splat_u16(5);
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
/* Position of src-2 inside its aligned 16-byte block; computed once, so
 * it is only valid for later rows because srcStride % 16 == 0. */
register int align = ((((unsigned long)src) - 2) % 16);
vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB;
vec_u8 sum, fsum;
for (i = 0 ; i < 16 ; i ++) {
vec_u8 srcR1 = vec_ld(-2, src);
vec_u8 srcR2 = vec_ld(14, src);
/* Build the six shifted copies of the row. For align <= 10 all of them
 * fit in srcR1/srcR2. From 11 upwards one tap coincides exactly with
 * srcR2, and from 12 upwards the later taps need a third block. */
switch (align) {
default: {
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = vec_perm(srcR1, srcR2, permP2);
srcP3 = vec_perm(srcR1, srcR2, permP3);
} break;
case 11: {
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = vec_perm(srcR1, srcR2, permP2);
srcP3 = srcR2;
} break;
case 12: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = srcR2;
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 13: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = srcR2;
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 14: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2;
srcP1 = vec_perm(srcR2, srcR3, permP1);
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 15: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0);
srcP1 = vec_perm(srcR2, srcR3, permP1);
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
}
/* Widen every tap to 16 bits: the A vectors take the first 8 pixels,
 * the B vectors the last 8. */
srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
/* Pair up the symmetric taps. */
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
sum2A = vec_adds(srcM1A, srcP2A);
sum2B = vec_adds(srcM1B, srcP2B);
sum3A = vec_adds(srcM2A, srcP3A);
sum3B = vec_adds(srcM2B, srcP3B);
/* pp1 = 20*sum1 + 16, pp2 = 5*sum2, psum = sum3 + pp1 - pp2. */
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
psumA = vec_sub(pp3A, pp2A);
psumB = vec_sub(pp3B, pp2B);
/* Arithmetic shift keeps negative intermediates negative so the
 * saturating pack clamps them to 0. */
sumA = vec_sra(psumA, v5us);
sumB = vec_sra(psumB, v5us);
sum = vec_packsu(sumA, sumB);
ASSERT_ALIGNED(dst);
OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
vec_st(fsum, 0, dst);
src += srcStride;
dst += dstStride;
}
}
#endif
/*
 * Vertical 6-tap lowpass over a 16-pixel-wide, 16-row block.
 *
 * For every output row the 16 pixels are computed from six vertically
 * adjacent source rows (M2, M1, P0, P1, P2, P3 = rows -2 .. +3) as
 *     ((P0 + P1) * 20 - (M1 + P2) * 5 + (M2 + P3) + 16) >> 5
 * packed to unsigned bytes with saturation, combined with the current
 * contents of dst through OP_U8_ALTIVEC (supplied by the including file)
 * and written back with an aligned vec_st.
 *
 * This code assumes stride % 16 == 0: the permute vector is derived from
 * src once and reused for every row, and dst is accessed with aligned
 * vector loads/stores (see ASSERT_ALIGNED below).
 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
register int i;
LOAD_ZERO;
/* realignment permute for unaligned 16-byte loads starting at src */
const vec_u8 perm = vec_lvsl(0, src);
/* filter constants: 20 = 5 << 2, 16 = 1 << 4 (rounding term), shift by 5 */
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u16 v5us = vec_splat_u16(5);
const vec_s16 v5ss = vec_splat_s16(5);
const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
/* start two rows above the first output row */
uint8_t *srcbis = src - (srcStride * 2);
/* Preload rows -2 .. +2. Note that the advance to the next row is folded
 * into the first vec_ld argument of each group (srcbis += srcStride). */
const vec_u8 srcM2a = vec_ld(0, srcbis);
const vec_u8 srcM2b = vec_ld(16, srcbis);
const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
const vec_u8 srcM1b = vec_ld(16, srcbis);
const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
const vec_u8 srcP0b = vec_ld(16, srcbis);
const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
const vec_u8 srcP1b = vec_ld(16, srcbis);
const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
const vec_u8 srcP2b = vec_ld(16, srcbis);
const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
/* widen each row to 16 bit: A = first 8 pixels, B = last 8 pixels */
vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
vec_u8 sum, fsum, srcP3a, srcP3b, srcP3;
for (i = 0 ; i < 16 ; i++) {
/* only the newest row (+3) is loaded per iteration */
srcP3a = vec_ld(0, srcbis += srcStride);
srcP3b = vec_ld(16, srcbis);
srcP3 = vec_perm(srcP3a, srcP3b, perm);
srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
/* symmetric tap pairs */
sum1A = vec_adds(srcP0ssA, srcP1ssA);
sum1B = vec_adds(srcP0ssB, srcP1ssB);
sum2A = vec_adds(srcM1ssA, srcP2ssA);
sum2B = vec_adds(srcM1ssB, srcP2ssB);
sum3A = vec_adds(srcM2ssA, srcP3ssA);
sum3B = vec_adds(srcM2ssB, srcP3ssB);
/* slide the six-row window down by one row for the next iteration */
srcM2ssA = srcM1ssA;
srcM2ssB = srcM1ssB;
srcM1ssA = srcP0ssA;
srcM1ssB = srcP0ssB;
srcP0ssA = srcP1ssA;
srcP0ssB = srcP1ssB;
srcP1ssA = srcP2ssA;
srcP1ssB = srcP2ssB;
srcP2ssA = srcP3ssA;
srcP2ssB = srcP3ssB;
/* pp1 = sum1 * 20 + 16, pp2 = sum2 * 5, psum = pp1 + sum3 - pp2 */
pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B);
psumA = vec_sub(pp3A, pp2A);
psumB = vec_sub(pp3B, pp2B);
/* >> 5, then saturate both halves back to 16 unsigned bytes */
sumA = vec_sra(psumA, v5us);
sumB = vec_sra(psumB, v5us);
sum = vec_packsu(sumA, sumB);
ASSERT_ALIGNED(dst);
/* merge with the existing destination row as defined by OP_U8_ALTIVEC */
OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
vec_st(fsum, 0, dst);
dst += dstStride;
}
}
#endif
/*
 * Combined horizontal + vertical 6-tap lowpass over a 16x16 block.
 *
 * Pass 1 runs the horizontal filter on 21 source rows (starting two rows
 * above src) and stores the unrounded, unshifted 16-bit result
 *     (P0 + P1) * 20 - (M1 + P2) * 5 + (M2 + P3)
 * into tmp, 16 values per row.
 * Pass 2 runs the same 6-tap filter vertically over tmp in 32-bit precision
 * and produces ((...) + 512) >> 10, saturated to unsigned bytes, merged with
 * dst through OP_U8_ALTIVEC (supplied by the including file).
 *
 * This code assumes stride % 16 == 0 *and* tmp is properly aligned
 * (tmp is accessed with aligned vec_ld/vec_st, and the source alignment is
 * evaluated once for all rows).
 */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
register int i;
LOAD_ZERO;
/* one realignment permute per horizontal tap position (src-2 .. src+3) */
const vec_u8 permM2 = vec_lvsl(-2, src);
const vec_u8 permM1 = vec_lvsl(-1, src);
const vec_u8 permP0 = vec_lvsl(+0, src);
const vec_u8 permP1 = vec_lvsl(+1, src);
const vec_u8 permP2 = vec_lvsl(+2, src);
const vec_u8 permP3 = vec_lvsl(+3, src);
/* constants: 20 = 5 << 2, 512 = 1 << 9 (rounding), 16 = 1 << 4 (shift count) */
const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u32 v10ui = vec_splat_u32(10);
const vec_s16 v5ss = vec_splat_s16(5);
const vec_s16 v1ss = vec_splat_s16(1);
const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
/* byte offset of (src - 2) inside its 16-byte line; selects the load
 * strategy in the switch below */
register int align = ((((unsigned long)src) - 2) % 16);
vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB;
/* interleaves bytes 0..7 with bytes 8..15; used to put the even/odd
 * results of pass 2 back into pixel order */
const vec_u8 mperm = (const vec_u8)
{0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
/* read cursor for pass 2; tmp itself is the write cursor of pass 1 */
int16_t *tmpbis = tmp;
vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB;
vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo;
vec_u8 fsum, sumv, sum;
vec_s16 ssume, ssumo;
/* the vertical filter needs two rows above and three below the block */
src -= (2 * srcStride);
/* ---- pass 1: horizontal filter, 16 + 5 rows, result kept in tmp ---- */
for (i = 0 ; i < 21 ; i ++) {
vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vec_u8 srcR1 = vec_ld(-2, src);
vec_u8 srcR2 = vec_ld(14, src);
/* For align <= 10 all six shifted vectors come from srcR1/srcR2.
 * For 11..15 one tap starts exactly at srcR2 (used directly) and the
 * taps after it spill into a third aligned line, srcR3. */
switch (align) {
default: {
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = vec_perm(srcR1, srcR2, permP2);
srcP3 = vec_perm(srcR1, srcR2, permP3);
} break;
case 11: {
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = vec_perm(srcR1, srcR2, permP2);
srcP3 = srcR2;
} break;
case 12: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = vec_perm(srcR1, srcR2, permP1);
srcP2 = srcR2;
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 13: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0);
srcP1 = srcR2;
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 14: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2;
srcP1 = vec_perm(srcR2, srcR3, permP1);
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
case 15: {
vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0);
srcP1 = vec_perm(srcR2, srcR3, permP1);
srcP2 = vec_perm(srcR2, srcR3, permP2);
srcP3 = vec_perm(srcR2, srcR3, permP3);
} break;
}
/* widen to 16 bit: A = first 8 pixels, B = last 8 pixels */
srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
/* symmetric tap pairs */
sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B);
sum2A = vec_adds(srcM1A, srcP2A);
sum2B = vec_adds(srcM1B, srcP2B);
sum3A = vec_adds(srcM2A, srcP3A);
sum3B = vec_adds(srcM2B, srcP3B);
/* psum = sum1 * 20 + sum3 - sum2 * 5; no rounding or shift in this pass */
pp1A = vec_mladd(sum1A, v20ss, sum3A);
pp1B = vec_mladd(sum1B, v20ss, sum3B);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
psumA = vec_sub(pp1A, pp2A);
psumB = vec_sub(pp1B, pp2B);
vec_st(psumA, 0, tmp);
vec_st(psumB, 16, tmp);
src += srcStride;
tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
}
/* ---- pass 2: vertical filter over tmp ---- */
/* preload intermediate rows -2 .. +2 */
tmpM2ssA = vec_ld(0, tmpbis);
tmpM2ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpM1ssA = vec_ld(0, tmpbis);
tmpM1ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP0ssA = vec_ld(0, tmpbis);
tmpP0ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP1ssA = vec_ld(0, tmpbis);
tmpP1ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
tmpP2ssA = vec_ld(0, tmpbis);
tmpP2ssB = vec_ld(16, tmpbis);
tmpbis += tmpStride;
for (i = 0 ; i < 16 ; i++) {
/* newest intermediate row (+3); these locals shadow the outer sumNx */
const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
tmpbis += tmpStride;
/* slide the six-row window down by one row */
tmpM2ssA = tmpM1ssA;
tmpM2ssB = tmpM1ssB;
tmpM1ssA = tmpP0ssA;
tmpM1ssB = tmpP0ssB;
tmpP0ssA = tmpP1ssA;
tmpP0ssB = tmpP1ssB;
tmpP1ssA = tmpP2ssA;
tmpP1ssB = tmpP2ssB;
tmpP2ssA = tmpP3ssA;
tmpP2ssB = tmpP3ssB;
/* 16x16 -> 32 bit products, even (e) and odd (o) lanes separately */
pp1Ae = vec_mule(sum1A, v20ss);
pp1Ao = vec_mulo(sum1A, v20ss);
pp1Be = vec_mule(sum1B, v20ss);
pp1Bo = vec_mulo(sum1B, v20ss);
pp2Ae = vec_mule(sum2A, v5ss);
pp2Ao = vec_mulo(sum2A, v5ss);
pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss);
/* sign-extend sum3 to 32 bit: an arithmetic shift by 16 of each 32-bit
 * element keeps its upper 16-bit half (the even lanes), a multiply by 1
 * extends the odd lanes */
pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
pp3Ao = vec_mulo(sum3A, v1ss);
pp3Be = vec_sra((vec_s32)sum3B, v16ui);
pp3Bo = vec_mulo(sum3B, v1ss);
/* sum = sum1 * 20 + 512 + (sum3 - sum2 * 5) */
pp1cAe = vec_add(pp1Ae, v512si);
pp1cAo = vec_add(pp1Ao, v512si);
pp1cBe = vec_add(pp1Be, v512si);
pp1cBo = vec_add(pp1Bo, v512si);
pp32Ae = vec_sub(pp3Ae, pp2Ae);
pp32Ao = vec_sub(pp3Ao, pp2Ao);
pp32Be = vec_sub(pp3Be, pp2Be);
pp32Bo = vec_sub(pp3Bo, pp2Bo);
sumAe = vec_add(pp1cAe, pp32Ae);
sumAo = vec_add(pp1cAo, pp32Ao);
sumBe = vec_add(pp1cBe, pp32Be);
sumBo = vec_add(pp1cBo, pp32Bo);
/* >> 10 */
ssumAe = vec_sra(sumAe, v10ui);
ssumAo = vec_sra(sumAo, v10ui);
ssumBe = vec_sra(sumBe, v10ui);
ssumBo = vec_sra(sumBo, v10ui);
/* pack: 8 even results, 8 odd results, then 16 bytes [even | odd] */
ssume = vec_packs(ssumAe, ssumBe);
ssumo = vec_packs(ssumAo, ssumBo);
sumv = vec_packsu(ssume, ssumo);
/* re-interleave even/odd bytes into pixel order */
sum = vec_perm(sumv, sumv, mperm);
ASSERT_ALIGNED(dst);
/* merge with the existing destination row as defined by OP_U8_ALTIVEC */
OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));
vec_st(fsum, 0, dst);
dst += dstStride;
}
}
#endif

View File

@@ -0,0 +1,217 @@
/*
* Copyright (c) 2001 Michel Lespinasse
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
* NOTE: This code is based on GPL code from the libmpeg2 project. The
* author, Michel Lespinasse, has given explicit permission to release
* under LGPL as part of FFmpeg.
*/
/*
* FFmpeg integration by Dieter Shirley
*
* This file is a direct copy of the AltiVec IDCT module from the libmpeg2
* project. I've deleted all of the libmpeg2-specific code, renamed the
* functions and reordered the function parameters. The only change to the
* IDCT function itself was to factor out the partial transposition, and to
* perform a full transpose at the end of the function.
*/
#include <stdlib.h> /* malloc(), free() */
#include <string.h>
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
/*
 * IDCT_HALF: one pass of the 8-point butterfly network, applied to all
 * lanes of the eight input vectors at once. IDCT below expands it twice.
 *
 * Reads:    vx0..vx7 (inputs), a0, a1, a2, ma2, c4, mc4, zero (constants)
 * Writes:   vy0..vy7 (outputs)
 * Clobbers: t0..t8
 *
 * All of these names must already be declared in the expanding scope.
 * Every add/subtract is the saturating variant (vec_adds / vec_subs), and
 * every multiply is vec_mradds (rounded high-half multiply plus saturating
 * add).
 */
#define IDCT_HALF \
/* 1st stage */ \
t1 = vec_mradds (a1, vx7, vx1 ); \
t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
t7 = vec_mradds (a2, vx5, vx3); \
t3 = vec_mradds (ma2, vx3, vx5); \
\
/* 2nd stage */ \
t5 = vec_adds (vx0, vx4); \
t0 = vec_subs (vx0, vx4); \
t2 = vec_mradds (a0, vx6, vx2); \
t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
t6 = vec_adds (t8, t3); \
t3 = vec_subs (t8, t3); \
t8 = vec_subs (t1, t7); \
t1 = vec_adds (t1, t7); \
\
/* 3rd stage */ \
t7 = vec_adds (t5, t2); \
t2 = vec_subs (t5, t2); \
t5 = vec_adds (t0, t4); \
t0 = vec_subs (t0, t4); \
t4 = vec_subs (t8, t3); \
t3 = vec_adds (t8, t3); \
\
/* 4th stage */ \
vy0 = vec_adds (t7, t1); \
vy7 = vec_subs (t7, t1); \
vy1 = vec_mradds (c4, t3, t5); \
vy6 = vec_mradds (mc4, t3, t5); \
vy2 = vec_mradds (c4, t4, t0); \
vy5 = vec_mradds (mc4, t4, t0); \
vy3 = vec_adds (t2, t6); \
vy4 = vec_subs (t2, t6);
#define IDCT \
vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \
vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
vec_u16 shift; \
\
c4 = vec_splat (constants[0], 0); \
a0 = vec_splat (constants[0], 1); \
a1 = vec_splat (constants[0], 2); \
a2 = vec_splat (constants[0], 3); \
mc4 = vec_splat (constants[0], 4); \
ma2 = vec_splat (constants[0], 5); \
bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \
\
zero = vec_splat_s16 (0); \
shift = vec_splat_u16 (4); \
\
vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
\
IDCT_HALF \
\
vx0 = vec_mergeh (vy0, vy4); \
vx1 = vec_mergel (vy0, vy4); \
vx2 = vec_mergeh (vy1, vy5); \
vx3 = vec_mergel (vy1, vy5); \
vx4 = vec_mergeh (vy2, vy6); \
vx5 = vec_mergel (vy2, vy6); \
vx6 = vec_mergeh (vy3, vy7); \
vx7 = vec_mergel (vy3, vy7); \
\
vy0 = vec_mergeh (vx0, vx4); \
vy1 = vec_mergel (vx0, vx4); \
vy2 = vec_mergeh (vx1, vx5); \
vy3 = vec_mergel (vx1, vx5); \
vy4 = vec_mergeh (vx2, vx6); \
vy5 = vec_mergel (vx2, vx6); \
vy6 = vec_mergeh (vx3, vx7); \
vy7 = vec_mergel (vx3, vx7); \
\
vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
vx1 = vec_mergel (vy0, vy4); \
vx2 = vec_mergeh (vy1, vy5); \
vx3 = vec_mergel (vy1, vy5); \
vx4 = vec_mergeh (vy2, vy6); \
vx5 = vec_mergel (vy2, vy6); \
vx6 = vec_mergeh (vy3, vy7); \
vx7 = vec_mergel (vy3, vy7); \
\
IDCT_HALF \
\
shift = vec_splat_u16 (6); \
vx0 = vec_sra (vy0, shift); \
vx1 = vec_sra (vy1, shift); \
vx2 = vec_sra (vy2, shift); \
vx3 = vec_sra (vy3, shift); \
vx4 = vec_sra (vy4, shift); \
vx5 = vec_sra (vy5, shift); \
vx6 = vec_sra (vy6, shift); \
vx7 = vec_sra (vy7, shift);
/*
 * Multiplier table consumed by the IDCT macro.
 *
 * constants[0]: lanes 0..5 are broadcast into c4, a0, a1, a2, mc4 and ma2
 *               (note lanes 4 and 5 are the negations of lanes 0 and 3);
 *               lanes 6 and 7 (32, 31) are read together as one 32-bit
 *               element and broadcast to form the `bias` vector.
 * constants[1..4]: per-row prescale factors; IDCT applies [1] to input
 *               rows 0 and 4, [2] to rows 1 and 7, [3] to rows 2 and 6 and
 *               [4] to rows 3 and 5.
 */
static const vec_s16 constants[5] = {
{23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
{16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
{22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
{21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692},
{19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
};
/**
 * Inverse-transform an 8x8 coefficient block and overwrite an 8x8 pixel
 * area with the result.
 *
 * @param dest   top-left destination pixel
 * @param stride distance in bytes between destination rows
 * @param blk    64 coefficients, read as eight vec_s16 rows
 *
 * NOTE(review): blk is accessed through a vec_s16 pointer, and each row is
 * written with two 4-byte vec_ste stores from a vector holding the 8 result
 * bytes twice; this looks like it expects blk to be 16-byte aligned and dest
 * to be 8-byte aligned -- confirm against callers.
 */
void ff_idct_put_altivec(uint8_t* dest, int stride, int16_t *blk)
{
vec_s16 *block = (vec_s16*)blk;
vec_u8 tmp;
/* declares vx0..vx7 and leaves the eight output rows in them */
IDCT
/* saturate one row to unsigned bytes (duplicated in both vector halves)
 * and store it as two 32-bit elements */
#define COPY(dest,src) \
tmp = vec_packsu (src, src); \
vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
COPY (dest, vx0) dest += stride;
COPY (dest, vx1) dest += stride;
COPY (dest, vx2) dest += stride;
COPY (dest, vx3) dest += stride;
COPY (dest, vx4) dest += stride;
COPY (dest, vx5) dest += stride;
COPY (dest, vx6) dest += stride;
COPY (dest, vx7)
}
/**
 * Inverse-transform an 8x8 coefficient block and add the result to an 8x8
 * pixel area, saturating to the unsigned byte range.
 *
 * @param dest   top-left destination pixel
 * @param stride distance in bytes between destination rows
 * @param blk    64 coefficients, read as eight vec_s16 rows
 *
 * Only two permute vectors are built (for dest and dest + stride) and they
 * are used alternately for even and odd rows.
 * NOTE(review): this presumably relies on dest + 2 * stride having the same
 * 16-byte alignment as dest, and on dest being 8-byte aligned for the
 * vec_ste stores -- confirm against callers.
 */
void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
{
vec_s16 *block = (vec_s16*)blk;
vec_u8 tmp;
vec_s16 tmp2, tmp3;
vec_u8 perm0;
vec_u8 perm1;
vec_u8 p0, p1, p;
/* declares vx0..vx7 (and `zero`) and leaves the output rows in vx0..vx7 */
IDCT
/* Build byte -> 16-bit widening permutes: every even index is 0xFF, which
 * selects a byte of the `zero` operand in vec_perm below, every odd index
 * is the position of the next destination pixel inside the loaded line. */
p0 = vec_lvsl (0, dest);
p1 = vec_lvsl (stride, dest);
p = vec_splat_u8 (-1);
perm0 = vec_mergeh (p, p0);
perm1 = vec_mergeh (p, p1);
/* load the line containing dest, widen 8 pixels, add the residual with
 * saturation, pack back to bytes and store as two 32-bit elements */
#define ADD(dest,src,perm) \
/* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
tmp = vec_ld (0, dest); \
tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \
tmp3 = vec_adds (tmp2, src); \
tmp = vec_packsu (tmp3, tmp3); \
vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
ADD (dest, vx0, perm0) dest += stride;
ADD (dest, vx1, perm1) dest += stride;
ADD (dest, vx2, perm0) dest += stride;
ADD (dest, vx3, perm1) dest += stride;
ADD (dest, vx4, perm0) dest += stride;
ADD (dest, vx5, perm1) dest += stride;
ADD (dest, vx6, perm0) dest += stride;
ADD (dest, vx7, perm1)
}

View File

@@ -0,0 +1,144 @@
/*
* Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
** @file
** integer misc ops.
**/
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/ppc/types_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
/**
 * Sum of squared differences between an int8 array and an int16 array.
 *
 * Computes sum((pix1[i] - pix2[i])^2) for i in [0, size). Full groups of 16
 * elements are processed with AltiVec using unaligned loads; the remaining
 * size % 16 elements are handled by a scalar loop.
 *
 * @param pix1 first operand, `size` signed bytes (any alignment)
 * @param pix2 second operand, `size` signed 16-bit values (any alignment)
 * @param size number of elements; values <= 0 yield 0
 * @return the accumulated squared difference
 */
static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
                                     int size) {
    int i, size16;
    vector signed char vpix1;
    vector signed short vpix2, vdiff, vpix1l,vpix1h;
    union { vector signed int vscore;
            int32_t score[4];
          } u;
    u.vscore = vec_splat_s32(0);

/* Unaligned 16-byte load: fetch the aligned line holding the first byte and
 * the one holding the last byte (offset 15, so no line beyond the data is
 * touched) and realign. Written as a plain expression, without a trailing
 * ';', so it can be used anywhere an expression is valid. */
#define vec_unaligned_load(b) \
    vec_perm(vec_ld(0, (b)), vec_ld(15, (b)), vec_lvsl(0, (b)))

    /* "> 0" keeps a negative size from spinning until the counter wraps */
    for (size16 = size >> 4; size16 > 0; size16--) {
        // score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
        // load 16 bytes of pix1 and the first 8 values of pix2
        vpix1 = vec_unaligned_load(pix1);
        vpix2 = vec_unaligned_load(pix2);
        pix2 += 8;
        // sign-extend the first 8 bytes and diff them
        vpix1h = vec_unpackh(vpix1);
        vdiff  = vec_sub(vpix1h, vpix2);
        vpix1l = vec_unpackl(vpix1);
        // load the next 8 values of pix2
        vpix2 = vec_unaligned_load(pix2);
        // multiply-sum accumulates diff*diff into four 32-bit partial sums
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        vdiff    = vec_sub(vpix1l, vpix2);
        u.vscore = vec_msum(vdiff, vdiff, u.vscore);
        pix1 += 16;
        pix2 += 8;
    }
#undef vec_unaligned_load

    /* fold the four partial sums into element 3 */
    u.vscore = vec_sums(u.vscore, vec_splat_s32(0));

    /* scalar tail for the last size % 16 elements */
    size %= 16;
    for (i = 0; i < size; i++) {
        u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    }
    return u.score[3];
}
/**
 * Dot product of two int16 vectors.
 *
 * @param v1    first operand; may be unaligned
 * @param v2    second operand; loaded with a plain vec_ld, so it has to be
 *              16-byte aligned
 * @param order number of elements, consumed in groups of 8
 * @return sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_altivec(const int16_t *v1, const int16_t *v2,
                                           int order)
{
    int i;
    LOAD_ZERO;
    register vec_s16 vec1;
    register vec_s32 res = vec_splat_s32(0), t;
    int32_t ires;

    for (i = 0; i < order; i += 8) {
        /* Unaligned load of v1[0..7]. The second line is fetched at byte
         * offset 15 rather than 16: when v1 happens to be aligned this is
         * the same line again, so nothing past the last element is read,
         * and no misaligned vector pointer is ever dereferenced. */
        vec1 = vec_perm(vec_ld(0, v1), vec_ld(15, v1), vec_lvsl(0, v1));
        t    = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
        /* vec_sums folds the four partial sums plus the running total
         * (element 3 of res) into element 3 */
        res  = vec_sums(t, res);
        v1  += 8;
        v2  += 8;
    }
    /* broadcast the total so vec_ste stores it whatever the alignment of
     * &ires */
    res = vec_splat(res, 3);
    vec_ste(res, 0, &ires);
    return ires;
}
/**
 * Fused dot product and multiply-add on int16 vectors.
 *
 * Returns sum(v1[i] * v2[i]) computed from the ORIGINAL contents of v1 and,
 * in the same sweep, updates v1[i] += mul * v3[i].
 *
 * @param v1    in/out operand; accessed through a vec_s16 pointer, so it has
 *              to be 16-byte aligned
 * @param v2    dot-product operand; may be unaligned
 * @param v3    multiply-add operand; may be unaligned
 * @param order number of elements; consumed in groups of 16, at least one
 *              group is always processed (do/while)
 * @param mul   scalar factor applied to v3
 * @return the dot product of the original v1 with v2
 */
static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1, i4;
    /* i2 / i3 carry the last aligned line loaded from v2 / v3 over to the
     * next iteration, where it becomes the first line of the new group */
    register vec_s16 i2 = vec_ld(0, v2), i3 = vec_ld(0, v3);
    register vec_s32 res = zero_s32v;
    /* each stream gets its own realignment vector, so v2 and v3 need not
     * share the same misalignment */
    register vec_u8 align2 = vec_lvsl(0, v2);
    register vec_u8 align3 = vec_lvsl(0, v3);
    int32_t ires;

    order >>= 4;
    do {
        /* v2[0..15], realigned */
        i1 = vec_ld(16, v2);
        t0 = vec_perm(i2, i1, align2);
        i2 = vec_ld(32, v2);
        t1 = vec_perm(i1, i2, align2);
        /* dot product against the not yet updated v1 */
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        /* v3[0..15], realigned */
        i4 = vec_ld(16, v3);
        t0 = vec_perm(i3, i4, align3);
        i3 = vec_ld(32, v3);
        t1 = vec_perm(i4, i3, align3);
        /* v1 += mul * v3 */
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        /* One iteration consumes 16 elements (two vectors) of every stream.
         * v2/v3 must therefore advance by 16 elements, matching pv1 += 2 and
         * the line carried over in i2/i3 (loaded from byte offset 32). */
        pv1 += 2;
        v2  += 16;
        v3  += 16;
    } while(--order);
    /* fold the partial sums into element 3 and broadcast it so vec_ste
     * stores the total whatever the alignment of &ires */
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}
/**
 * Install the AltiVec integer helpers of this file into a DSPContext.
 *
 * @param c     context whose function pointers are overwritten
 * @param avctx not used by this function
 */
void ff_int_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    /* dot-product helpers */
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
    c->scalarproduct_int16          = scalarproduct_int16_altivec;

    /* squared-difference metric */
    c->ssd_int8_vs_int16            = ssd_int8_vs_int16_altivec;
}

View File

@@ -0,0 +1,79 @@
/*
* simple math operations
* Copyright (c) 2001, 2002 Fabrice Bellard
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVCODEC_PPC_MATHOPS_H
#define AVCODEC_PPC_MATHOPS_H
#include <stdint.h>
#include "config.h"
#include "libavutil/common.h"
#if HAVE_PPC4XX
/* signed 16x16 -> 32 multiply add accumulate:
 * rt += ra * rb, using the low 16 bits of ra and rb (maclhw).
 * rt is both read and written (tied operand "0"), so it must be an lvalue.
 * Note the expansion already ends in ';'. */
#define MAC16(rt, ra, rb) \
__asm__ ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb));
/* signed 16x16 -> 32 multiply:
 * evaluates to the product of the low 16 bits of ra and rb (mullhw).
 * Implemented as a GNU statement expression. */
#define MUL16(ra, rb) \
({ int __rt; \
__asm__ ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \
__rt; })
#endif
/* Defining MULH as itself presumably lets other headers detect, with
 * #ifdef/#ifndef, that an architecture-specific MULH exists -- confirm
 * against the generic mathops header. */
#define MULH MULH
/**
 * High word of a signed 32x32 -> 64 bit multiplication.
 *
 * @return the upper 32 bits of the product a * b (single mulhw instruction)
 */
static inline av_const int MULH(int a, int b){
int r;
__asm__ ("mulhw %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
return r;
}
#if !ARCH_PPC64
/**
 * 64-bit multiply-accumulate for 32-bit PowerPC: returns d + a * b.
 *
 * The 64-bit accumulator is split into two 32-bit words through a union;
 * hl[0] is treated as the high word and hl[1] as the low word (this matches
 * a big-endian memory layout).
 * mullw/mulhw produce the low/high halves of the signed product, addc adds
 * the low halves and records the carry, adde adds the high halves plus that
 * carry.
 */
static inline av_const int64_t MAC64(int64_t d, int a, int b)
{
union { uint64_t x; unsigned hl[2]; } x = { d };
int h, l;
/* %0 = high acc, %1 = low acc, %2 = h (scratch), %3 = l (scratch);
 * h and l are early-clobber so they never share a register with a or b */
__asm__ ("mullw %3, %4, %5 \n\t"
"mulhw %2, %4, %5 \n\t"
"addc %1, %1, %3 \n\t"
"adde %0, %0, %2 \n\t"
: "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
: "r"(a), "r"(b));
return x.x;
}
/* statement-style wrapper: MAC64(d, a, b) updates d in place */
#define MAC64(d, a, b) ((d) = MAC64(d, a, b))
/**
 * 64-bit multiply-subtract for 32-bit PowerPC: returns d - a * b.
 *
 * Same word split as MAC64: hl[0] is treated as the high word and hl[1] as
 * the low word (this matches a big-endian memory layout).
 * mullw/mulhw produce the low/high halves of the signed product, subfc
 * subtracts the low halves and records the carry, subfe subtracts the high
 * halves using that carry.
 */
static inline av_const int64_t MLS64(int64_t d, int a, int b)
{
union { uint64_t x; unsigned hl[2]; } x = { d };
int h, l;
/* subfc/subfe compute "third operand minus second operand" */
__asm__ ("mullw %3, %4, %5 \n\t"
"mulhw %2, %4, %5 \n\t"
"subfc %1, %3, %1 \n\t"
"subfe %0, %2, %0 \n\t"
: "+r"(x.hl[0]), "+r"(x.hl[1]), "=&r"(h), "=&r"(l)
: "r"(a), "r"(b));
return x.x;
}
/* statement-style wrapper: MLS64(d, a, b) updates d in place */
#define MLS64(d, a, b) ((d) = MLS64(d, a, b))
#endif
#endif /* AVCODEC_PPC_MATHOPS_H */

View File

@@ -0,0 +1,129 @@
/*
* Altivec optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "dsputil_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegaudiodsp.h"
/* multiply-accumulate / multiply-subtract on a scalar accumulator;
 * rt is used unparenthesized, so pass a plain lvalue */
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
/* Apply `op` (MACS or MLSS) to `sum` for the eight element pairs
 * w[k * 64], p[k * 64] with k = 0..7, i.e. eight taps spaced 64 elements
 * apart. `w` and `p` are evaluated eight times each. */
#define SUM8(op, sum, w, p) \
{ \
op(sum, (w)[0 * 64], (p)[0 * 64]); \
op(sum, (w)[1 * 64], (p)[1 * 64]); \
op(sum, (w)[2 * 64], (p)[2 * 64]); \
op(sum, (w)[3 * 64], (p)[3 * 64]); \
op(sum, (w)[4 * 64], (p)[4 * 64]); \
op(sum, (w)[5 * 64], (p)[5 * 64]); \
op(sum, (w)[6 * 64], (p)[6 * 64]); \
op(sum, (w)[7 * 64], (p)[7 * 64]); \
}
/**
 * Vectorized 8-tap windowing, four outputs per iteration.
 *
 * For every output index i in [0, len):
 *     sum1[i] = sum over k = 0..7 of buf[i + 64 * k] * win1[i + 64 * k]
 *     sum2[i] = sum over k = 0..7 of buf[i + 64 * k] * win2[i + 16 * k]
 * (MULT offsets are in bytes: 256 bytes = 64 floats, 64 bytes = 16 floats.)
 *
 * All five pointers are accessed with aligned vector loads/stores, so they
 * have to be 16-byte aligned; len is consumed in groups of 4.
 */
static void apply_window(const float *buf, const float *win1,
const float *win2, float *sum1, float *sum2, int len)
{
const vector float *win1a = (const vector float *) win1;
const vector float *win2a = (const vector float *) win2;
const vector float *bufa = (const vector float *) buf;
vector float *sum1a = (vector float *) sum1;
vector float *sum2a = (vector float *) sum2;
/* deliberately left uninitialized: both accumulators are cleared with
 * x ^ x at the top of every iteration */
vector float av_uninit(v0), av_uninit(v4);
vector float v1, v2, v3;
len = len >> 2;
/* one tap: a = byte offset into win1a and bufa, b = byte offset into win2a;
 * v0 += buf * win1, v4 += win2 * buf */
#define MULT(a, b) \
{ \
v1 = vec_ld(a, win1a); \
v2 = vec_ld(b, win2a); \
v3 = vec_ld(a, bufa); \
v0 = vec_madd(v3, v1, v0); \
v4 = vec_madd(v2, v3, v4); \
}
while (len--) {
v0 = vec_xor(v0, v0);
v4 = vec_xor(v4, v4);
MULT( 0, 0);
MULT( 256, 64);
MULT( 512, 128);
MULT( 768, 192);
MULT(1024, 256);
MULT(1280, 320);
MULT(1536, 384);
MULT(1792, 448);
vec_st(v0, 0, sum1a);
vec_st(v4, 0, sum2a);
/* advance every stream by one vector (4 floats) */
sum1a++;
sum2a++;
win1a++;
win2a++;
bufa++;
}
}
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
int incr)
{
LOCAL_ALIGNED_16(float, suma, [17]);
LOCAL_ALIGNED_16(float, sumb, [17]);
LOCAL_ALIGNED_16(float, sumc, [17]);
LOCAL_ALIGNED_16(float, sumd, [17]);
float sum;
int j;
float *out2 = out + 32 * incr;
/* copy to avoid wrap */
memcpy(in + 512, in, 32 * sizeof(*in));
apply_window(in + 16, win , win + 512, suma, sumc, 16);
apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
SUM8(MLSS, suma[0], win + 32, in + 48);
sumc[ 0] = 0;
sumb[16] = 0;
sumd[16] = 0;
out[0 ] = suma[ 0];
out += incr;
out2 -= incr;
for(j=1;j<16;j++) {
*out = suma[ j] - sumd[16-j];
*out2 = -sumb[16-j] - sumc[ j];
out += incr;
out2 -= incr;
}
sum = 0;
SUM8(MLSS, sum, win + 16 + 32, in + 32);
*out = sum;
}
/**
 * Install the AltiVec float windowing routine into an MPEG audio DSP
 * context.
 *
 * @param s context whose apply_window_float pointer is overwritten
 */
void ff_mpadsp_init_altivec(MPADSPContext *s)
{
s->apply_window_float = apply_window_mp3;
}

View File

@@ -0,0 +1,124 @@
/*
* Copyright (c) 2002 Dieter Shirley
*
* dct_unquantize_h263_altivec:
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdlib.h>
#include <stdio.h>
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_altivec.h"
/*
 * AltiVec version of dct_unquantize_h263.
 *
 * Every non-zero coefficient c becomes c * qmul + qadd (c > 0) or
 * c * qmul - qadd (c < 0), with qmul = 2 * qscale and
 * qadd = (qscale - 1) | 1 (0 for intra blocks with h263_aic); zero
 * coefficients stay zero. For intra blocks the DC coefficient is not
 * requantized this way: it is scaled by the luma/chroma DC scale (unless
 * h263_aic is set) and then kept as is.
 *
 * This code assumes `block' is 16-byte aligned (vec_ld / vec_st).
 */
static void dct_unquantize_h263_altivec(MpegEncContext *s,
                                        DCTELEM *block, int n, int qscale)
{
    int qmul, qadd;
    int keep_dc;   /* non-zero when block[0] must survive the vector pass */
    int last;      /* index of the last coefficient to process */
    int coef;

    assert(s->block_last_index[n]>=0);

    qmul = qscale << 1;
    qadd = (qscale - 1) | 1;

    if (!s->mb_intra) {
        keep_dc = 0;
        last    = s->intra_scantable.raster_end[ s->block_last_index[n] ];
    } else {
        if (s->h263_aic)
            qadd = 0;
        else
            block[0] = block[0] * (n < 4 ? s->y_dc_scale : s->c_dc_scale);
        keep_dc = 1;
        last    = 63; //does not always use zigzag table
    }

    {
        register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
        DECLARE_ALIGNED(16, short, qmul8) = qmul;
        DECLARE_ALIGNED(16, short, qadd8) = qadd;
        register vector signed short coeffs, vqmul, vqadd, vnqadd, scaled;
        register vector bool short is_zero, is_neg;
        /* the vector pass starts at index 0 regardless, so remember the
         * (already scaled) DC value and put it back afterwards */
        register short saved_dc = block[0];
        register int pos = 0;

        /* broadcast qmul, +qadd and -qadd to all eight lanes */
        vqmul  = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
        vqadd  = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
        vnqadd = vec_sub(vczero, vqadd);

        /* full groups of 8 coefficients */
        while (pos + 7 <= last) {
            coeffs  = vec_ld(pos << 1, block);
            is_zero = vec_cmpeq(coeffs, vczero);
            is_neg  = vec_cmplt(coeffs, vczero);
            /* coeff * qmul +/- qadd, the sign following the coefficient */
            scaled  = vec_mladd(coeffs, vqmul, vec_sel(vqadd, vnqadd, is_neg));
            /* lanes that were zero keep their original (zero) value */
            vec_st(vec_sel(scaled, coeffs, is_zero), pos << 1, block);
            pos += 8;
        }

        /* scalar tail when the count is not a multiple of 8 */
        while (pos <= last) {
            coef = block[pos];
            if (coef)
                block[pos] = coef * qmul + (coef < 0 ? -qadd : qadd);
            pos++;
        }

        if (keep_dc)
            block[0] = saved_dc;
    }
}
/**
 * Select the AltiVec H.263 dequantizer for an MpegEncContext.
 *
 * Nothing is changed unless the CPU reports AltiVec support and the
 * requested DCT algorithm is either "auto" or explicitly AltiVec.
 */
void ff_MPV_common_init_altivec(MpegEncContext *s)
{
    if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
        switch (s->avctx->dct_algo) {
        case FF_DCT_AUTO:
        case FF_DCT_ALTIVEC:
            /* the same routine serves intra and inter blocks */
            s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
            s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
            break;
        default:
            break;
        }
    }
}

View File

@@ -0,0 +1,347 @@
/*
* VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
* Copyright (c) 2006 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/vc1dsp.h"
// main steps of 8x8 transform
//
// One 8-point pass, done in place on s0..s7 using only shifts, adds and
// subtracts. Expects t0..t7 (temporaries) and the shift-count vectors
// vec_1, vec_2, vec_3, vec_4 in the expanding scope.
//
// Even part:  t4/t7 = 12*(s0+s4) + rnd +/- (16*s2 + 6*s6)
//             t5/t6 = 12*(s0-s4) + rnd +/- ( 6*s2 - 16*s6)
// Odd part:   t0 = 16*s1 + 15*s3 +  9*s5 +  4*s7
//             t1 = 15*s1 -  4*s3 - 16*s5 -  9*s7
//             t2 =  9*s1 - 16*s3 +  4*s5 + 15*s7
//             t3 =  4*s1 -  9*s3 + 15*s5 - 16*s7
// Output:     s0..s3 = even + odd, s7..s4 = even - odd
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
t0 = vec_sl(vec_add(s0, s4), vec_2); \
t0 = vec_add(vec_sl(t0, vec_1), t0); \
t0 = vec_add(t0, vec_rnd); \
t1 = vec_sl(vec_sub(s0, s4), vec_2); \
t1 = vec_add(vec_sl(t1, vec_1), t1); \
t1 = vec_add(t1, vec_rnd); \
t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
t2 = vec_add(t2, vec_sl(s2, vec_4)); \
t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
t4 = vec_add(t0, t2); \
t5 = vec_add(t1, t3); \
t6 = vec_sub(t1, t3); \
t7 = vec_sub(t0, t2); \
\
t0 = vec_sl(vec_add(s1, s3), vec_4); \
t0 = vec_add(t0, vec_sl(s5, vec_3)); \
t0 = vec_add(t0, vec_sl(s7, vec_2)); \
t0 = vec_add(t0, vec_sub(s5, s3)); \
\
t1 = vec_sl(vec_sub(s1, s5), vec_4); \
t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
t1 = vec_sub(t1, vec_add(s1, s7)); \
\
t2 = vec_sl(vec_sub(s7, s3), vec_4); \
t2 = vec_add(t2, vec_sl(s1, vec_3)); \
t2 = vec_add(t2, vec_sl(s5, vec_2)); \
t2 = vec_add(t2, vec_sub(s1, s7)); \
\
t3 = vec_sl(vec_sub(s5, s7), vec_4); \
t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
t3 = vec_add(t3, vec_sl(s1, vec_2)); \
t3 = vec_sub(t3, vec_add(s3, s5)); \
\
s0 = vec_add(t4, t0); \
s1 = vec_add(t5, t1); \
s2 = vec_add(t6, t2); \
s3 = vec_add(t7, t3); \
s4 = vec_sub(t7, t3); \
s5 = vec_sub(t6, t2); \
s6 = vec_sub(t5, t1); \
s7 = vec_sub(t4, t0); \
}while(0)
/* Arithmetic right shift by vec_3 of all eight vectors, in place.
 * vec_3 must be defined in the expanding scope. */
#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
s0 = vec_sra(s0, vec_3); \
s1 = vec_sra(s1, vec_3); \
s2 = vec_sra(s2, vec_3); \
s3 = vec_sra(s3, vec_3); \
s4 = vec_sra(s4, vec_3); \
s5 = vec_sra(s5, vec_3); \
s6 = vec_sra(s6, vec_3); \
s7 = vec_sra(s7, vec_3); \
}while(0)
/*
 * Scale the eight vectors produced by the second STEP8 pass: arithmetic shift
 * right by 7 of every lane, in place. The last four outputs (s4..s7) get an
 * extra +1 (vec_1s) added before the shift; the first four do not.
 * Requires vec_7 and vec_1s in the calling scope.
 */
#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
s0 = vec_sra(s0, vec_7); \
s1 = vec_sra(s1, vec_7); \
s2 = vec_sra(s2, vec_7); \
s3 = vec_sra(s3, vec_7); \
s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
}while(0)
/*
 * Main butterfly of the 4-point inverse transform, applied lane-wise to four
 * vectors of 32-bit values. s0..s3 are both the inputs and the outputs.
 *
 * Built from shifts and adds, the expansion computes:
 *   t0 = 17*(s0 + s2) + vec_rnd      t1 = 17*(s0 - s2) + vec_rnd
 *   t2 = 22*s1 + 10*s3               t3 = 22*s3 - 10*s1
 *   s0 = t0 + t2, s1 = t1 - t3, s2 = t1 + t3, s3 = t0 - t2
 *
 * The result is not scaled; the caller shifts afterwards.
 * Requires t0..t3 and vec_1, vec_2, vec_3, vec_4, vec_5 in the calling scope.
 */
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
t1 = vec_add(vec_sl(s0, vec_4), s0); \
t1 = vec_add(t1, vec_rnd); \
t2 = vec_add(vec_sl(s2, vec_4), s2); \
t0 = vec_add(t1, t2); \
t1 = vec_sub(t1, t2); \
t3 = vec_sl(vec_sub(s3, s1), vec_1); \
t3 = vec_add(t3, vec_sl(t3, vec_2)); \
t2 = vec_add(t3, vec_sl(s1, vec_5)); \
t3 = vec_add(t3, vec_sl(s3, vec_3)); \
t3 = vec_add(t3, vec_sl(s3, vec_2)); \
s0 = vec_add(t0, t2); \
s1 = vec_sub(t1, t3); \
s2 = vec_add(t1, t3); \
s3 = vec_sub(t0, t2); \
}while (0)
/*
 * Scale four vectors after a first-pass 4-point transform: arithmetic shift
 * right by 3 of every lane, in place. Requires vec_3 in the calling scope.
 *
 * Wrapped in do { } while (0), like SHIFT_HOR8, so that the four statements
 * behave as a single statement (e.g. under an unbraced if) and the invocation
 * takes a trailing semicolon.
 */
#define SHIFT_HOR4(s0, s1, s2, s3) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
}while(0)
/*
 * Scale four vectors after the second-pass 4-point transform: arithmetic
 * shift right by 7 of every lane, in place. Requires vec_7 in the calling
 * scope.
 *
 * Wrapped in do { } while (0), like SHIFT_VERT8, so that the four statements
 * behave as a single statement (e.g. under an unbraced if) and the invocation
 * takes a trailing semicolon.
 */
#define SHIFT_VERT4(s0, s1, s2, s3) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
}while(0)
/**
 * Do inverse transform on 8x8 block, in place.
 *
 * The 64 coefficients are loaded as eight vectors of eight 16-bit values,
 * widened to 32 bits, run through STEP8 twice with a transpose in between,
 * narrowed back to 16 bits and stored over the input.
 *
 * First pass:  rounding constant 4,  shift right by 3 (SHIFT_HOR8).
 * Second pass: rounding constant 64, shift right by 7 (SHIFT_VERT8, which
 *              additionally adds 1 to the last four outputs before shifting).
 *
 * @param block 64 coefficients, overwritten with the result. It is accessed
 *              with vec_ld/vec_st at 16-byte steps, so it is presumably
 *              required to be 16-byte aligned -- TODO confirm with callers.
 */
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
/* 4 << 4 == 64: rounding constant of the second pass */
const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
const vector unsigned int vec_7 = vec_splat_u32(7);
const vector unsigned int vec_4 = vec_splat_u32(4);
/* rounding constant of the first pass */
const vector signed int vec_4s = vec_splat_s32(4);
const vector unsigned int vec_3 = vec_splat_u32(3);
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector signed int vec_1s = vec_splat_s32(1);
const vector unsigned int vec_1 = vec_splat_u32(1);
/* load the block, one 16-byte vector (eight coefficients) at a time */
src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
src2 = vec_ld( 32, block);
src3 = vec_ld( 48, block);
src4 = vec_ld( 64, block);
src5 = vec_ld( 80, block);
src6 = vec_ld( 96, block);
src7 = vec_ld(112, block);
/* widen to 32 bits: s0..s7 hold one half of each vector, s8..sF the other */
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
/* first 1-D pass on both halves */
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
/* narrow back to 16 bits, restoring the original element order */
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
/* transpose so the second pass runs along the other dimension */
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
/* second 1-D pass on both halves */
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
/* write the result back over the input; no transpose is applied here */
vec_st(src0, 0, block);
vec_st(src1, 16, block);
vec_st(src2, 32, block);
vec_st(src3, 48, block);
vec_st(src4, 64, block);
vec_st(src5, 80, block);
vec_st(src6, 96, block);
vec_st(src7,112, block);
}
/**
 * Do inverse transform on 8x4 part of block and add the result to dest.
 *
 * The block is loaded as eight vectors, transposed, run through the 8-point
 * transform (STEP8, rounding 4, shift 3), transposed back, and then only the
 * first four vectors are run through the 4-point transform (STEP4,
 * rounding 64, shift 7). The four resulting rows of eight 16-bit residuals
 * are added with saturation to four rows of eight pixels at dest.
 *
 * @param dest   first of four destination rows, read and rewritten.
 *               NOTE(review): each row is stored as two 32-bit vec_ste
 *               elements taken from a vector holding the 8 result bytes
 *               twice, which appears to rely on dest being 8-byte aligned --
 *               confirm with callers.
 * @param stride byte distance between destination rows.
 *               NOTE(review): only two permute vectors are built (for dest
 *               and dest + stride) and reused for rows 2 and 3, so the
 *               alignment of rows is assumed to repeat every two rows.
 * @param block  coefficients, read with vec_ld at 16-byte steps; not modified.
 */
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
{
vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
/* 4 << 4 == 64: rounding constant of the 4-point pass */
const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
const vector unsigned int vec_7 = vec_splat_u32(7);
const vector unsigned int vec_5 = vec_splat_u32(5);
const vector unsigned int vec_4 = vec_splat_u32(4);
/* rounding constant of the 8-point pass */
const vector signed int vec_4s = vec_splat_s32(4);
const vector unsigned int vec_3 = vec_splat_u32(3);
const vector unsigned int vec_2 = vec_splat_u32(2);
const vector unsigned int vec_1 = vec_splat_u32(1);
vector unsigned char tmp;
vector signed short tmp2, tmp3;
vector unsigned char perm0, perm1, p0, p1, p;
src0 = vec_ld( 0, block);
src1 = vec_ld( 16, block);
src2 = vec_ld( 32, block);
src3 = vec_ld( 48, block);
src4 = vec_ld( 64, block);
src5 = vec_ld( 80, block);
src6 = vec_ld( 96, block);
src7 = vec_ld(112, block);
/* transpose first so the lane-wise STEP8 works along the 8-wide dimension */
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
s0 = vec_unpackl(src0);
s1 = vec_unpackl(src1);
s2 = vec_unpackl(src2);
s3 = vec_unpackl(src3);
s4 = vec_unpackl(src4);
s5 = vec_unpackl(src5);
s6 = vec_unpackl(src6);
s7 = vec_unpackl(src7);
s8 = vec_unpackh(src0);
s9 = vec_unpackh(src1);
sA = vec_unpackh(src2);
sB = vec_unpackh(src3);
sC = vec_unpackh(src4);
sD = vec_unpackh(src5);
sE = vec_unpackh(src6);
sF = vec_unpackh(src7);
/* 8-point pass on both halves */
STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
src0 = vec_pack(s8, s0);
src1 = vec_pack(s9, s1);
src2 = vec_pack(sA, s2);
src3 = vec_pack(sB, s3);
src4 = vec_pack(sC, s4);
src5 = vec_pack(sD, s5);
src6 = vec_pack(sE, s6);
src7 = vec_pack(sF, s7);
/* transpose back; from here on only the first four vectors are used */
TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
s0 = vec_unpackh(src0);
s1 = vec_unpackh(src1);
s2 = vec_unpackh(src2);
s3 = vec_unpackh(src3);
s8 = vec_unpackl(src0);
s9 = vec_unpackl(src1);
sA = vec_unpackl(src2);
sB = vec_unpackl(src3);
/* 4-point pass on both halves */
STEP4(s0, s1, s2, s3, vec_64);
SHIFT_VERT4(s0, s1, s2, s3);
STEP4(s8, s9, sA, sB, vec_64);
SHIFT_VERT4(s8, s9, sA, sB);
src0 = vec_pack(s0, s8);
src1 = vec_pack(s1, s9);
src2 = vec_pack(s2, sA);
src3 = vec_pack(s3, sB);
/*
 * Build permute vectors that zero-extend eight destination pixels to 16 bits:
 * every other index is 0xFF, which selects a byte of the all-zero second
 * vec_perm operand, interleaved with the alignment indices from vec_lvsl.
 */
p0 = vec_lvsl (0, dest);
p1 = vec_lvsl (stride, dest);
p = vec_splat_u8 (-1);
perm0 = vec_mergeh (p, p0);
perm1 = vec_mergeh (p, p1);
/*
 * ADD: load the aligned 16 bytes containing dest, widen eight pixels to
 * 16 bits, add the residual row with saturation, pack back to unsigned bytes
 * with saturation and store the row as two 32-bit elements.
 * The macro stays defined after this function (there is no #undef).
 */
#define ADD(dest,src,perm) \
/* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
tmp = vec_ld (0, dest); \
tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \
tmp3 = vec_adds (tmp2, src); \
tmp = vec_packsu (tmp3, tmp3); \
vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);
ADD (dest, src0, perm0) dest += stride;
ADD (dest, src1, perm1) dest += stride;
ADD (dest, src2, perm0) dest += stride;
ADD (dest, src3, perm1)
}
/*
 * Instantiate the chroma motion-compensation template twice.
 *
 * h264_altivec_template.c is included once with OP_U8_ALTIVEC set to the
 * "put" store (d = s) and once with the "avg" store (d = vec_avg(dst, s)).
 * PREFIX_no_rnd_vc1_chroma_mc8_altivec selects the name of the generated
 * function; the two names chosen here are the ones installed by
 * ff_vc1dsp_init_altivec() below. Presumably the template defines other
 * PREFIX_* functions only when their macro is defined -- see the template.
 */
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
/* "put" variant */
#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
#define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec
#include "h264_altivec_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
/* "avg" variant */
#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
#define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec
#include "h264_altivec_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec
/**
 * Install the AltiVec VC-1 routines into a DSP context.
 *
 * The context is left untouched when the running CPU does not report
 * AV_CPU_FLAG_ALTIVEC.
 *
 * @param dsp context whose function pointers are overridden
 */
void ff_vc1dsp_init_altivec(VC1DSPContext* dsp)
{
    const int have_altivec = av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC;

    if (have_altivec) {
        /* inverse transforms */
        dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec;
        dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;

        /* 8-pixel-wide no-rounding chroma motion compensation */
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
    }
}

View File

@@ -0,0 +1,35 @@
/*
* Copyright (c) 2003-2004 Romain Dolbeau
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/videodsp.h"
/**
 * Issue a data-cache-touch hint (dcbt) for the start of each of h rows.
 *
 * @param mem    address of the first row
 * @param stride byte distance between consecutive rows
 * @param h      number of rows to touch; values <= 0 touch nothing
 */
static void prefetch_ppc(uint8_t *mem, ptrdiff_t stride, int h)
{
    const uint8_t *p = mem;

    /*
     * Test the count before the first iteration: a do/while(--h) loop would
     * run once for h == 0 and then keep decrementing through negative values.
     * For h > 0 exactly h hints are issued, as before.
     */
    for (; h > 0; h--) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (p));
        p += stride;
    }
}
/**
 * Install the PowerPC prefetch routine into a VideoDSP context.
 *
 * The assignment is unconditional: no CPU feature flag is checked.
 *
 * @param ctx context whose prefetch pointer is set
 * @param bpc not used by this function
 */
void ff_videodsp_init_ppc(VideoDSPContext *ctx, int bpc)
{
ctx->prefetch = prefetch_ppc;
}

View File

@@ -0,0 +1,187 @@
/*
* Copyright (C) 2009 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavcodec/vp3dsp.h"
#if HAVE_ALTIVEC
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"
/*
 * IDCT multipliers C1..C7 (element 0 is unused padding); IDCT_START splats
 * elements 1..7 into separate vectors. The values look like
 * 65536 * cos(k * pi / 16) -- TODO confirm against the C reference IDCT.
 * Several of them exceed the signed 16-bit range of vec_s16; M16() below is
 * the multiply helper meant for those "16 bits unsigned" constants.
 */
static const vec_s16 constants =
{0, 64277, 60547, 54491, 46341, 36410, 25080, 12785};
/*
 * vec_perm selector used by M15(): for each 32-bit product it picks the first
 * two bytes, alternating between the first operand (indices 0..15) and the
 * second operand (indices 16..31).
 */
static const vec_u8 interleave_high =
{0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
/*
 * Declarations shared by both VP3 IDCT entry points:
 *  - the temporaries used by IDCT_1D,
 *  - eight (rounding bias, splat of 8) and four (shift count, splat of 4),
 *  - C1..C7, each a splat of one element of the constants table,
 *  - b0..b7, the eight 16-byte vectors of the coefficient block, loaded with
 *    vec_ld from a variable named block in the calling scope.
 * This expands to declarations, so it must appear where declarations are
 * allowed.
 */
#define IDCT_START \
vec_s16 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;\
vec_s16 Ed, Gd, Add, Bdd, Fd, Hd;\
vec_s16 eight = vec_splat_s16(8);\
vec_u16 four = vec_splat_u16(4);\
\
vec_s16 C1 = vec_splat(constants, 1);\
vec_s16 C2 = vec_splat(constants, 2);\
vec_s16 C3 = vec_splat(constants, 3);\
vec_s16 C4 = vec_splat(constants, 4);\
vec_s16 C5 = vec_splat(constants, 5);\
vec_s16 C6 = vec_splat(constants, 6);\
vec_s16 C7 = vec_splat(constants, 7);\
\
vec_s16 b0 = vec_ld(0x00, block);\
vec_s16 b1 = vec_ld(0x10, block);\
vec_s16 b2 = vec_ld(0x20, block);\
vec_s16 b3 = vec_ld(0x30, block);\
vec_s16 b4 = vec_ld(0x40, block);\
vec_s16 b5 = vec_ld(0x50, block);\
vec_s16 b6 = vec_ld(0x60, block);\
vec_s16 b7 = vec_ld(0x70, block);
// these functions do (a*C)>>16
// things are tricky because a is signed, but C unsigned.
// M15 is used if C fits in 15 bit unsigned (C6,C7)
// M16 is used if C requires 16 bits unsigned
/*
 * M15: multiply each 16-bit lane of a by the matching lane of C and keep the
 * upper 16 bits of every 32-bit product. vec_mule/vec_mulo produce the
 * products of the even and odd lanes; vec_perm with interleave_high puts the
 * upper halves back in lane order.
 */
static inline vec_s16 M15(vec_s16 a, vec_s16 C)
{
return (vec_s16)vec_perm(vec_mule(a,C), vec_mulo(a,C), interleave_high);
}
/*
 * M16: (a*C)>>16 for constants that need the full 16 unsigned bits.
 * Such a constant reads as C - 65536 in a signed lane, so the signed product
 * from M15() comes out short by exactly a; adding a back corrects it.
 */
static inline vec_s16 M16(vec_s16 a, vec_s16 C)
{
    const vec_s16 signed_product = M15(a, C);

    return vec_add(a, signed_product);
}
/*
 * One 1-D pass of the VP3 IDCT over b0..b7 (lane-wise, in place).
 *
 * ADD is applied to the two terms derived from b0 and b4 (E and F) and is
 * used to inject the rounding bias; SHIFT is applied to every output.
 * The callers in this file use (NOP, NOP) for the first pass and
 * (ADD8, SHIFT4) for the second.
 *
 * Uses the temporaries and constants declared by IDCT_START. Inside this
 * macro, ADD and SHIFT are the macro parameters, not any other macro of the
 * same name.
 */
#define IDCT_1D(ADD, SHIFT)\
A = vec_add(M16(b1, C1), M15(b7, C7));\
B = vec_sub(M15(b1, C7), M16(b7, C1));\
C = vec_add(M16(b3, C3), M16(b5, C5));\
D = vec_sub(M16(b5, C3), M16(b3, C5));\
\
Ad = M16(vec_sub(A, C), C4);\
Bd = M16(vec_sub(B, D), C4);\
\
Cd = vec_add(A, C);\
Dd = vec_add(B, D);\
\
E = ADD(M16(vec_add(b0, b4), C4));\
F = ADD(M16(vec_sub(b0, b4), C4));\
\
G = vec_add(M16(b2, C2), M15(b6, C6));\
H = vec_sub(M15(b2, C6), M16(b6, C2));\
\
Ed = vec_sub(E, G);\
Gd = vec_add(E, G);\
\
Add = vec_add(F, Ad);\
Bdd = vec_sub(Bd, H);\
\
Fd = vec_sub(F, Ad);\
Hd = vec_add(Bd, H);\
\
b0 = SHIFT(vec_add(Gd, Cd));\
b7 = SHIFT(vec_sub(Gd, Cd));\
\
b1 = SHIFT(vec_add(Add, Hd));\
b2 = SHIFT(vec_sub(Add, Hd));\
\
b3 = SHIFT(vec_add(Ed, Dd));\
b4 = SHIFT(vec_sub(Ed, Dd));\
\
b5 = SHIFT(vec_add(Fd, Bdd));\
b6 = SHIFT(vec_sub(Fd, Bdd));
/* Operators plugged into IDCT_1D(ADD, SHIFT). */
/* identity: no bias / no shift (used for the first pass) */
#define NOP(a) a
/* add the rounding bias held in the local variable eight */
#define ADD8(a) vec_add(a, eight)
/* arithmetic shift right by the count held in the local variable four */
#define SHIFT4(a) vec_sra(a, four)
/**
 * VP3 inverse DCT of an 8x8 block, storing the result as pixels.
 *
 * Runs IDCT_1D twice with a TRANSPOSE8 in between, then packs each of the
 * eight result vectors to unsigned bytes with saturation and writes eight
 * bytes per row.
 *
 * @param dst    first destination row. NOTE(review): each row is written as
 *               two 32-bit vec_ste elements from a vector holding the eight
 *               result bytes twice, which appears to rely on dst being 8-byte
 *               aligned -- confirm with callers.
 * @param stride byte distance between destination rows
 * @param block  64 coefficients, read with vec_ld at 16-byte steps; this
 *               function does not write to it
 */
static void vp3_idct_put_altivec(uint8_t *dst, int stride, DCTELEM block[64])
{
vec_u8 t;
IDCT_START
// pixels are signed; so add 128*16 in addition to the normal 8
// (1 << 11 == 2048 == 128*16; after the final >>4 this becomes +128)
vec_s16 v2048 = vec_sl(vec_splat_s16(1), vec_splat_u16(11));
eight = vec_add(eight, v2048);
// first pass: no bias, no shift
IDCT_1D(NOP, NOP)
TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
// second pass: add the bias and shift right by 4
IDCT_1D(ADD8, SHIFT4)
// PUT: saturate one row to unsigned bytes and store its eight pixels
#define PUT(a)\
t = vec_packsu(a, a);\
vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
vec_ste((vec_u32)t, 4, (unsigned int *)dst);
PUT(b0) dst += stride;
PUT(b1) dst += stride;
PUT(b2) dst += stride;
PUT(b3) dst += stride;
PUT(b4) dst += stride;
PUT(b5) dst += stride;
PUT(b6) dst += stride;
PUT(b7)
}
/**
 * VP3 inverse DCT of an 8x8 block, adding the result to existing pixels.
 *
 * Runs IDCT_1D twice with a TRANSPOSE8 in between; each result row is added
 * with saturation to eight destination pixels (zero-extended to 16 bits),
 * packed back to unsigned bytes with saturation and stored.
 *
 * @param dst    first destination row, read and rewritten.
 *               NOTE(review): vdst_mask is computed once from the alignment
 *               of the initial dst and reused for all eight rows, so every
 *               row is assumed to have the same offset within a 16-byte
 *               line; the two vec_ste stores per row also appear to rely on
 *               8-byte alignment -- confirm with callers.
 * @param stride byte distance between destination rows
 * @param block  64 coefficients, read with vec_ld at 16-byte steps; this
 *               function does not write to it
 */
static void vp3_idct_add_altivec(uint8_t *dst, int stride, DCTELEM block[64])
{
// LOAD_ZERO comes from an included header; presumably it declares the
// zero_u8v vector used by ADD below -- TODO confirm
LOAD_ZERO;
vec_u8 t, vdst;
vec_s16 vdst_16;
// permute selector interleaving 0xFF with the alignment indices of dst;
// used with a zero second operand to widen eight pixels to 16 bits
vec_u8 vdst_mask = vec_mergeh(vec_splat_u8(-1), vec_lvsl(0, dst));
IDCT_START
// first pass: no bias, no shift
IDCT_1D(NOP, NOP)
TRANSPOSE8(b0, b1, b2, b3, b4, b5, b6, b7);
// second pass: add 8 and shift right by 4
IDCT_1D(ADD8, SHIFT4)
// ADD: load, widen, saturating add of one residual row, pack and store
#define ADD(a)\
vdst = vec_ld(0, dst);\
vdst_16 = (vec_s16)vec_perm(vdst, zero_u8v, vdst_mask);\
vdst_16 = vec_adds(a, vdst_16);\
t = vec_packsu(vdst_16, vdst_16);\
vec_ste((vec_u32)t, 0, (unsigned int *)dst);\
vec_ste((vec_u32)t, 4, (unsigned int *)dst);
ADD(b0) dst += stride;
ADD(b1) dst += stride;
ADD(b2) dst += stride;
ADD(b3) dst += stride;
ADD(b4) dst += stride;
ADD(b5) dst += stride;
ADD(b6) dst += stride;
ADD(b7)
}
#endif /* HAVE_ALTIVEC */
/**
 * Install the PowerPC VP3 routines into a DSP context.
 *
 * When built with HAVE_ALTIVEC and the running CPU reports
 * AV_CPU_FLAG_ALTIVEC, the put/add IDCT pointers are replaced and the
 * coefficient permutation is set to FF_TRANSPOSE_IDCT_PERM. Otherwise the
 * context is left untouched.
 *
 * @param c     context to update
 * @param flags not used by this function
 */
av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
{
#if HAVE_ALTIVEC
if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
c->idct_put = vp3_idct_put_altivec;
c->idct_add = vp3_idct_add_altivec;
c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
}
#endif
}

View File

@@ -0,0 +1,304 @@
/*
* VP8 compatible video decoder
*
* Copyright (C) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/vp8dsp.h"
#include "dsputil_altivec.h"
/* Brace initializer repeating the argument list four times (4 x 4 = 16 bytes). */
#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }
// h subpel filter uses msum to multiply+add 4 pixel taps at once
// One entry per subpel position mx = 1..7 (indexed with mx-1): the four inner
// taps, signed, repeated for each of the four 32-bit lanes.
static const vec_s8 h_subpel_filters_inner[7] =
{
REPT4( -6, 123, 12, -1),
REPT4(-11, 108, 36, -8),
REPT4( -9, 93, 50, -6),
REPT4(-16, 77, 77, -16),
REPT4( -6, 50, 93, -9),
REPT4( -8, 36, 108, -11),
REPT4( -1, 12, 123, -6),
};
// for 6tap filters, these are the outer two taps
// The zeros mask off pixels 4-7 when filtering 0-3
// and vice-versa
// Indexed with (mx-1)>>1 by LOAD_H_SUBPEL_FILTER.
static const vec_s8 h_subpel_filters_outer[3] =
{
REPT4(0, 0, 2, 1),
REPT4(0, 0, 3, 3),
REPT4(0, 0, 1, 2),
};
/*
 * Declare the horizontal filter vectors for table index i (= mx - 1):
 *   filter_inner  - the four inner taps,
 *   filter_outerh - outer taps laid out as { 0, 0, x, y } per lane,
 *   filter_outerl - the same bytes rotated left by two, i.e. { x, y, 0, 0 }.
 * Expands to declarations without a trailing semicolon.
 */
#define LOAD_H_SUBPEL_FILTER(i) \
vec_s8 filter_inner = h_subpel_filters_inner[i]; \
vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)
/*
 * Horizontally filter eight pixels starting at src + off into dstv (vec_s16).
 *
 * Two aligned loads (a, b) cover the unaligned source span; permh/perml
 * gather the 4-pixel windows for output pixels 0-3 and 4-7. vec_msum forms
 * the dot product with the inner taps on top of the rounding constant c64.
 * With is6tap set, the outer two taps are accumulated by a second vec_msum.
 * For w == 4 the upper half is duplicated instead of using pixels 4-7.
 * The sums are packed to 16 bits with saturation and shifted right by 7.
 *
 * off must be the literal 0 or 8: it is token-pasted onto permh, perml and
 * perm_6tap. Uses many locals of put_vp8_epel_h_altivec_core.
 */
#define FILTER_H(dstv, off) \
a = vec_ld((off)-is6tap-1, src); \
b = vec_ld((off)-is6tap-1+15, src); \
\
pixh = vec_perm(a, b, permh##off); \
pixl = vec_perm(a, b, perml##off); \
filth = vec_msum(filter_inner, pixh, c64); \
filtl = vec_msum(filter_inner, pixl, c64); \
\
if (is6tap) { \
outer = vec_perm(a, b, perm_6tap##off); \
filth = vec_msum(filter_outerh, outer, filth); \
filtl = vec_msum(filter_outerl, outer, filtl); \
} \
if (w == 4) \
filtl = filth; /* discard pixels 4-7 */ \
dstv = vec_packs(filth, filtl); \
dstv = vec_sra(dstv, c7)
/**
 * Horizontal VP8 subpel filter, shared by all widths and tap counts.
 *
 * @param dst        destination; written with vec_st (w == 16) or with one
 *                   or two 32-bit vec_ste stores (w == 4 / w == 8).
 *                   NOTE(review): these stores presumably require dst to be
 *                   aligned to the block width -- confirm with callers.
 * @param dst_stride byte distance between destination rows
 * @param src        source pixels; pixels to the left of src are read
 *                   (the loads start at src - is6tap - 1)
 * @param src_stride byte distance between source rows
 * @param h          number of rows to filter
 * @param mx         horizontal subpel position; mx-1 indexes the filter
 *                   tables, so it must be in 1..7
 * @param w          block width: 4, 8 or 16
 * @param is6tap     1 for the 6-tap filter, 0 for the 4-tap filter
 */
static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
uint8_t *src, ptrdiff_t src_stride,
int h, int mx, int w, int is6tap)
{
LOAD_H_SUBPEL_FILTER(mx-1);
vec_u8 align_vec0, align_vec8, permh0, permh8, filt;
vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
vec_u8 a, b, pixh, pixl, outer;
vec_s16 f16h, f16l;
vec_s32 filth, filtl;
// sliding 4-pixel windows for output pixels 0-3; the 6-tap variant starts
// one byte later because its load begins one pixel further left
vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
vec_u8 perm_inner = is6tap ? perm_inner6 : perm_inner4;
// byte selection feeding the outer-tap vec_msum
vec_u8 perm_outer = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
// 1 << 6 == 64: rounding constant for the final >> 7
vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6));
vec_u16 c7 = vec_splat_u16(7);
// alignment permutes for the loads at offsets 0 and 8
align_vec0 = vec_lvsl( -is6tap-1, src);
align_vec8 = vec_lvsl(8-is6tap-1, src);
// fold the window selection into the alignment permutes
permh0 = vec_perm(align_vec0, align_vec0, perm_inner);
permh8 = vec_perm(align_vec8, align_vec8, perm_inner);
// same windows shifted by four pixels, for output pixels 4-7
perm_inner = vec_add(perm_inner, vec_splat_u8(4));
perml0 = vec_perm(align_vec0, align_vec0, perm_inner);
perml8 = vec_perm(align_vec8, align_vec8, perm_inner);
perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
// NOTE(review): the permutes are computed once from the initial src, so
// every row is assumed to share the same alignment within a 16-byte line
while (h --> 0) {
FILTER_H(f16h, 0);
if (w == 16) {
FILTER_H(f16l, 8);
filt = vec_packsu(f16h, f16l);
vec_st(filt, 0, dst);
} else {
// eight results duplicated into both halves; store 4 or 8 of them
filt = vec_packsu(f16h, f16h);
vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
if (w == 8)
vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
}
src += src_stride;
dst += dst_stride;
}
}
// v subpel filter does a simple vertical multiply + add
// One entry per subpel position my = 1..7 (indexed with my-1). Only the
// first six bytes are listed (taps 0..5); the remaining bytes of each vector
// are zero-initialized. The taps are stored as magnitudes: FILTER_V
// subtracts the products of taps 1 and 4 and adds the others.
static const vec_u8 v_subpel_filters[7] =
{
{ 0, 6, 123, 12, 1, 0 },
{ 2, 11, 108, 36, 8, 1 },
{ 0, 9, 93, 50, 6, 0 },
{ 3, 16, 77, 77, 16, 3 },
{ 0, 6, 50, 93, 9, 0 },
{ 1, 8, 36, 108, 11, 2 },
{ 0, 1, 12, 123, 6, 0 },
};
/*
 * Declare f0..f5, each a splat of one tap of v_subpel_filters[i]
 * (i = my - 1). Expands to declarations without a trailing semicolon.
 */
#define LOAD_V_SUBPEL_FILTER(i) \
vec_u8 subpel_filter = v_subpel_filters[i]; \
vec_u8 f0 = vec_splat(subpel_filter, 0); \
vec_u8 f1 = vec_splat(subpel_filter, 1); \
vec_u8 f2 = vec_splat(subpel_filter, 2); \
vec_u8 f3 = vec_splat(subpel_filter, 3); \
vec_u8 f4 = vec_splat(subpel_filter, 4); \
vec_u8 f5 = vec_splat(subpel_filter, 5)
/*
 * Vertically filter one row into dstv (vec_s16) from the source rows s0..s5.
 *
 * vec_mul is vec_mule or vec_mulo and selects the even or odd bytes of the
 * row vectors. The result is
 *   (s2*f2 - s1*f1) + (s3*f3 - s4*f4) [+ s0*f0 + s5*f5 when is6tap] + 64,
 * accumulated with saturating adds/subtracts and shifted right by 7.
 * s0 and s5 are only read when is6tap is set.
 */
#define FILTER_V(dstv, vec_mul) \
s1f = (vec_s16)vec_mul(s1, f1); \
s2f = (vec_s16)vec_mul(s2, f2); \
s3f = (vec_s16)vec_mul(s3, f3); \
s4f = (vec_s16)vec_mul(s4, f4); \
s2f = vec_subs(s2f, s1f); \
s3f = vec_subs(s3f, s4f); \
if (is6tap) { \
s0f = (vec_s16)vec_mul(s0, f0); \
s5f = (vec_s16)vec_mul(s5, f5); \
s2f = vec_adds(s2f, s0f); \
s3f = vec_adds(s3f, s5f); \
} \
dstv = vec_adds(s2f, s3f); \
dstv = vec_adds(dstv, c64); \
dstv = vec_sra(dstv, c7)
/**
 * Vertical VP8 subpel filter, shared by all widths and tap counts.
 *
 * Keeps a sliding window of source rows in registers (s0..s5 for 6 taps,
 * s1..s4 for 4 taps), loading one new row per output row.
 *
 * @param dst        destination; written with vec_st (w == 16) or with one
 *                   or two 32-bit vec_ste stores (w == 4 / w == 8).
 *                   NOTE(review): these stores presumably require dst to be
 *                   aligned to the block width -- confirm with callers.
 * @param dst_stride byte distance between destination rows
 * @param src        source pixels; rows above src are read
 *                   (from -2*src_stride with 6 taps, -1*src_stride with 4)
 * @param src_stride byte distance between source rows
 * @param h          number of rows to produce
 * @param my         vertical subpel position; my-1 indexes v_subpel_filters,
 *                   so it must be in 1..7
 * @param w          block width: 4, 8 or 16
 * @param is6tap     1 for the 6-tap filter, 0 for the 4-tap filter
 */
static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
uint8_t *src, ptrdiff_t src_stride,
int h, int my, int w, int is6tap)
{
LOAD_V_SUBPEL_FILTER(my-1);
vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
// 1 << 6 == 64: rounding constant for the final >> 7
vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6));
vec_u16 c7 = vec_splat_u16(7);
// we want pixels 0-7 to be in the even positions and 8-15 in the odd,
// so combine this permute with the alignment permute vector
align_vech = vec_lvsl(0, src);
align_vecl = vec_sld(align_vech, align_vech, 8);
if (w ==16)
perm_vec = vec_mergeh(align_vech, align_vecl);
else
// narrower blocks: pixels 0-7 are duplicated into both positions
perm_vec = vec_mergeh(align_vech, align_vech);
// prime the window; s0 is loaded only for 6 taps (the if covers one line)
if (is6tap)
s0 = load_with_perm_vec(-2*src_stride, src, perm_vec);
s1 = load_with_perm_vec(-1*src_stride, src, perm_vec);
s2 = load_with_perm_vec( 0*src_stride, src, perm_vec);
s3 = load_with_perm_vec( 1*src_stride, src, perm_vec);
if (is6tap)
s4 = load_with_perm_vec( 2*src_stride, src, perm_vec);
// advance to the first row that is not yet in the window
src += (2+is6tap)*src_stride;
while (h --> 0) {
// load the newest row into the last slot of the window
if (is6tap)
s5 = load_with_perm_vec(0, src, perm_vec);
else
s4 = load_with_perm_vec(0, src, perm_vec);
FILTER_V(f16h, vec_mule);
if (w == 16) {
FILTER_V(f16l, vec_mulo);
filt = vec_packsu(f16h, f16l);
vec_st(filt, 0, dst);
} else {
filt = vec_packsu(f16h, f16h);
// w == 4: replicate the first four bytes into every element;
// w == 8: store the second group of four bytes at dst + 4
if (w == 4)
filt = (vec_u8)vec_splat((vec_u32)filt, 0);
else
vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
// executed for both w == 4 and w == 8 (it is outside the if/else)
vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
}
// slide the window down one row; only s0 = s1 and s4 = s5 are conditional
if (is6tap)
s0 = s1;
s1 = s2;
s2 = s3;
s3 = s4;
if (is6tap)
s4 = s5;
dst += dst_stride;
src += src_stride;
}
}
/*
 * Define the horizontal-only and vertical-only entry points for one
 * (WIDTH, TAPS) combination:
 *   put_vp8_epel<WIDTH>_h<TAPS>_altivec and put_vp8_epel<WIDTH>_v<TAPS>_altivec.
 * Each forwards to the matching always-inline core with WIDTH and
 * TAPS == 6 as constants; the h variant ignores my and the v variant
 * ignores mx. They are marked av_noinline.
 */
#define EPEL_FUNCS(WIDTH, TAPS) \
static av_noinline \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
static av_noinline \
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}
/*
 * Define the combined horizontal+vertical entry point
 * put_vp8_epel<WIDTH>_h<HTAPS>v<VTAPS>_altivec.
 *
 * The horizontal pass filters the rows needed by the vertical filter
 * (h+5 rows starting two rows above src for 6 vertical taps, h+4 rows
 * starting one row above for 4 taps) into a 16-byte-aligned scratch buffer
 * with a fixed stride of 16; the vertical pass then filters from that buffer
 * into dst.
 *
 * stride is the destination stride and s is the source stride. The source is
 * walked with s, so the result is also correct when the two strides differ
 * (the previous expansion used the destination stride for the source).
 */
#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my) \
{ \
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
    if (VTAPS == 6) { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-2*s, s, h+5, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+2*16, 16, h, mx, my); \
    } else { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16, src-s, s, h+4, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, stride, tmp+16, 16, h, mx, my); \
    } \
}
/* 1-D filters: (width, taps). There is no 16-wide 4-tap variant. */
EPEL_FUNCS(16,6)
EPEL_FUNCS(8, 6)
EPEL_FUNCS(8, 4)
EPEL_FUNCS(4, 6)
EPEL_FUNCS(4, 4)
/* 2-D filters: (width, horizontal taps, vertical taps); each one calls the
 * 1-D functions generated above, so those must be instantiated first. */
EPEL_HV(16, 6,6)
EPEL_HV(8, 6,6)
EPEL_HV(8, 4,6)
EPEL_HV(8, 6,4)
EPEL_HV(8, 4,4)
EPEL_HV(4, 6,6)
EPEL_HV(4, 4,6)
EPEL_HV(4, 6,4)
EPEL_HV(4, 4,4)
/**
 * Full-pel 16-pixel-wide copy: adapts ff_put_pixels16_altivec() to the VP8
 * motion-compensation function signature.
 *
 * NOTE(review): only the destination stride is forwarded; the source stride
 * s, as well as mx and my, are ignored, so this presumably assumes
 * s == stride -- confirm with callers.
 */
static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t stride, uint8_t *src, ptrdiff_t s, int h, int mx, int my)
{
ff_put_pixels16_altivec(dst, src, stride, h);
}
/**
 * Install the AltiVec VP8 motion-compensation routines into a DSP context.
 *
 * Nothing is changed when the running CPU does not report
 * AV_CPU_FLAG_ALTIVEC. Table entries not assigned here keep whatever the
 * caller installed before.
 *
 * Going by the function names stored, the table is indexed as
 * [width: 0 = 16, 1 = 8, 2 = 4][vertical filter][horizontal filter],
 * with filter index 0 = none, 1 = 4-tap, 2 = 6-tap.
 *
 * @param c context whose put_vp8_epel_pixels_tab is updated
 */
av_cold void ff_vp8dsp_init_altivec(VP8DSPContext *c)
{
    const int have_altivec = av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC;

    if (have_altivec) {
        /* 16 pixels wide: full-pel copy and 6-tap filters only */
        c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
        c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
        c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
        c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;

        /* 8 pixels wide */
        c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
        c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
        c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;
        c->put_vp8_epel_pixels_tab[1][1][1] = put_vp8_epel8_h4v4_altivec;
        c->put_vp8_epel_pixels_tab[1][1][2] = put_vp8_epel8_h6v4_altivec;
        c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
        c->put_vp8_epel_pixels_tab[1][2][1] = put_vp8_epel8_h4v6_altivec;
        c->put_vp8_epel_pixels_tab[1][2][2] = put_vp8_epel8_h6v6_altivec;

        /* 4 pixels wide */
        c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
        c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
        c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;
        c->put_vp8_epel_pixels_tab[2][1][1] = put_vp8_epel4_h4v4_altivec;
        c->put_vp8_epel_pixels_tab[2][1][2] = put_vp8_epel4_h6v4_altivec;
        c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
        c->put_vp8_epel_pixels_tab[2][2][1] = put_vp8_epel4_h4v6_altivec;
        c->put_vp8_epel_pixels_tab[2][2][2] = put_vp8_epel4_h6v6_altivec;
    }
}