Updated FFmpeg to version 1.1.2, using this project: http://sourceforge.net/projects/ffmpeg4android/

Sergii Pylypenko
2013-02-21 18:29:51 +02:00
parent 758a9658d2
commit fff7a99a41
3492 changed files with 886704 additions and 5414 deletions


@@ -0,0 +1,5 @@
OBJS += x86/cpu.o \
        x86/float_dsp_init.o \

YASM-OBJS += x86/cpuid.o \
             x86/float_dsp.o \


@@ -0,0 +1,110 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_ASM_H
#define AVUTIL_X86_ASM_H
#include <stdint.h>
#include "config.h"
#if ARCH_X86_64
# define OPSIZE "q"
# define REG_a "rax"
# define REG_b "rbx"
# define REG_c "rcx"
# define REG_d "rdx"
# define REG_D "rdi"
# define REG_S "rsi"
# define PTR_SIZE "8"
typedef int64_t x86_reg;
# define REG_SP "rsp"
# define REG_BP "rbp"
# define REGBP rbp
# define REGa rax
# define REGb rbx
# define REGc rcx
# define REGd rdx
# define REGSP rsp
#elif ARCH_X86_32
# define OPSIZE "l"
# define REG_a "eax"
# define REG_b "ebx"
# define REG_c "ecx"
# define REG_d "edx"
# define REG_D "edi"
# define REG_S "esi"
# define PTR_SIZE "4"
typedef int32_t x86_reg;
# define REG_SP "esp"
# define REG_BP "ebp"
# define REGBP ebp
# define REGa eax
# define REGb ebx
# define REGc ecx
# define REGd edx
# define REGSP esp
#else
typedef int x86_reg;
#endif
#define HAVE_7REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE && HAVE_EBP_AVAILABLE))
#define HAVE_6REGS (ARCH_X86_64 || (HAVE_EBX_AVAILABLE || HAVE_EBP_AVAILABLE))
#if ARCH_X86_64 && defined(PIC)
# define BROKEN_RELOCATIONS 1
#endif
/*
* If gcc is not set to support SSE (-msse) it will not accept xmm registers
* in the clobber list for inline asm. XMM_CLOBBERS takes a list of xmm
* registers to be marked as clobbered and evaluates to nothing if they are
* not supported, or to the list itself if they are supported. Since a clobber
* list may not be empty, XMM_CLOBBERS_ONLY should be used if the xmm
* registers are the only ones in the clobber list.
* For example, a list with "eax" and "xmm0" as clobbers should become:
* : XMM_CLOBBERS("xmm0",) "eax"
* and a list with only "xmm0" should become:
* XMM_CLOBBERS_ONLY("xmm0")
*/
#if HAVE_XMM_CLOBBERS
# define XMM_CLOBBERS(...) __VA_ARGS__
# define XMM_CLOBBERS_ONLY(...) : __VA_ARGS__
#else
# define XMM_CLOBBERS(...)
# define XMM_CLOBBERS_ONLY(...)
#endif
/* Use to export labels from asm. */
#define LABEL_MANGLE(a) EXTERN_PREFIX #a
// Use rip-relative addressing if compiling PIC code on x86-64.
#if ARCH_X86_64 && defined(PIC)
# define LOCAL_MANGLE(a) #a "(%%rip)"
#else
# define LOCAL_MANGLE(a) #a
#endif
#define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
#endif /* AVUTIL_X86_ASM_H */
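
For illustration, a minimal sketch of how XMM_CLOBBERS is meant to appear in an inline-asm clobber list; only the macro comes from the header above, while the function itself and its alignment assumption are hypothetical:

#include "libavutil/x86/asm.h"

static void zero_four_floats_example(float *dst)
{
    /* The explicit "memory" clobber keeps the list non-empty, so XMM_CLOBBERS
     * (rather than XMM_CLOBBERS_ONLY) is the right form here; dst is assumed
     * to be 16-byte aligned for movaps. */
    __asm__ volatile ("xorps  %%xmm0, %%xmm0 \n\t"
                      "movaps %%xmm0, (%0)   \n\t"
                      :: "r"(dst)
                      : XMM_CLOBBERS("xmm0",) "memory");
}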


@@ -0,0 +1,61 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* @file
* byte swapping routines
*/
#ifndef AVUTIL_X86_BSWAP_H
#define AVUTIL_X86_BSWAP_H
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#if HAVE_INLINE_ASM
#if !AV_GCC_VERSION_AT_LEAST(4,1)
#define av_bswap16 av_bswap16
static av_always_inline av_const unsigned av_bswap16(unsigned x)
{
__asm__("rorw $8, %w0" : "+r"(x));
return x;
}
#endif /* !AV_GCC_VERSION_AT_LEAST(4,1) */
#if !AV_GCC_VERSION_AT_LEAST(4,5)
#define av_bswap32 av_bswap32
static av_always_inline av_const uint32_t av_bswap32(uint32_t x)
{
__asm__("bswap %0" : "+r" (x));
return x;
}
#if ARCH_X86_64
#define av_bswap64 av_bswap64
static inline uint64_t av_const av_bswap64(uint64_t x)
{
__asm__("bswap %0": "=r" (x) : "0" (x));
return x;
}
#endif
#endif /* !AV_GCC_VERSION_AT_LEAST(4,5) */
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_BSWAP_H */
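
A small usage sketch of the byte-swap helpers (the function is hypothetical; av_bswap16/32/64 fall back to the generic versions in libavutil/bswap.h when the inline-asm ones above are not compiled in):

#include <stdint.h>
#include "libavutil/bswap.h"

static uint32_t swap_tag_example(void)
{
    uint32_t tag = 0x11223344u;
    return av_bswap32(tag);   /* yields 0x44332211 regardless of host endianness */
}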


@@ -0,0 +1,201 @@
/*
* CPU detection code, extracted from mmx.h
* (c)1997-99 by H. Dietz and R. Fisher
* Converted to C and improved by Fabrice Bellard.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <stdlib.h>
#include <string.h>
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#if HAVE_YASM
#define cpuid(index, eax, ebx, ecx, edx) \
ff_cpu_cpuid(index, &eax, &ebx, &ecx, &edx)
#define xgetbv(index, eax, edx) \
ff_cpu_xgetbv(index, &eax, &edx)
#elif HAVE_INLINE_ASM
/* Saving ebx is necessary for PIC; gcc seems unable to handle it on its own. */
#define cpuid(index, eax, ebx, ecx, edx) \
__asm__ volatile ( \
"mov %%"REG_b", %%"REG_S" \n\t" \
"cpuid \n\t" \
"xchg %%"REG_b", %%"REG_S \
: "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \
: "0" (index))
#define xgetbv(index, eax, edx) \
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))
#define get_eflags(x) \
__asm__ volatile ("pushfl \n" \
"pop %0 \n" \
: "=r"(x))
#define set_eflags(x) \
__asm__ volatile ("push %0 \n" \
"popfl \n" \
:: "r"(x))
#endif /* HAVE_INLINE_ASM */
#if ARCH_X86_64
#define cpuid_test() 1
#elif HAVE_YASM
#define cpuid_test ff_cpu_cpuid_test
#elif HAVE_INLINE_ASM
static int cpuid_test(void)
{
x86_reg a, c;
/* Check if CPUID is supported by attempting to toggle the ID bit in
* the EFLAGS register. */
get_eflags(a);
set_eflags(a ^ 0x200000);
get_eflags(c);
return a != c;
}
#endif
/* Function to test if multimedia instructions are supported... */
int ff_get_cpu_flags_x86(void)
{
int rval = 0;
#ifdef cpuid
int eax, ebx, ecx, edx;
int max_std_level, max_ext_level, std_caps = 0, ext_caps = 0;
int family = 0, model = 0;
union { int i[3]; char c[12]; } vendor;
if (!cpuid_test())
return 0; /* CPUID not supported */
cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);
if (max_std_level >= 1) {
cpuid(1, eax, ebx, ecx, std_caps);
family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
if (std_caps & (1 << 15))
rval |= AV_CPU_FLAG_CMOV;
if (std_caps & (1 << 23))
rval |= AV_CPU_FLAG_MMX;
if (std_caps & (1 << 25))
rval |= AV_CPU_FLAG_MMXEXT;
#if HAVE_SSE
if (std_caps & (1 << 25))
rval |= AV_CPU_FLAG_SSE;
if (std_caps & (1 << 26))
rval |= AV_CPU_FLAG_SSE2;
if (ecx & 1)
rval |= AV_CPU_FLAG_SSE3;
if (ecx & 0x00000200 )
rval |= AV_CPU_FLAG_SSSE3;
if (ecx & 0x00080000 )
rval |= AV_CPU_FLAG_SSE4;
if (ecx & 0x00100000 )
rval |= AV_CPU_FLAG_SSE42;
#if HAVE_AVX
/* Check OSXSAVE and AVX bits */
if ((ecx & 0x18000000) == 0x18000000) {
/* Check for OS support */
xgetbv(0, eax, edx);
if ((eax & 0x6) == 0x6)
rval |= AV_CPU_FLAG_AVX;
}
#endif /* HAVE_AVX */
#endif /* HAVE_SSE */
}
cpuid(0x80000000, max_ext_level, ebx, ecx, edx);
if (max_ext_level >= 0x80000001) {
cpuid(0x80000001, eax, ebx, ecx, ext_caps);
if (ext_caps & (1U << 31))
rval |= AV_CPU_FLAG_3DNOW;
if (ext_caps & (1 << 30))
rval |= AV_CPU_FLAG_3DNOWEXT;
if (ext_caps & (1 << 23))
rval |= AV_CPU_FLAG_MMX;
if (ext_caps & (1 << 22))
rval |= AV_CPU_FLAG_MMXEXT;
/* Allow for selectively disabling SSE2 functions on AMD processors
with SSE2 support but not SSE4a. This includes Athlon64, some
Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
than SSE2 often enough to utilize this special-case flag.
AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
so that SSE2 is used unless explicitly disabled by checking
AV_CPU_FLAG_SSE2SLOW. */
if (!strncmp(vendor.c, "AuthenticAMD", 12) &&
rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040)) {
rval |= AV_CPU_FLAG_SSE2SLOW;
}
/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
* used unless the OS has AVX support. */
if (rval & AV_CPU_FLAG_AVX) {
if (ecx & 0x00000800)
rval |= AV_CPU_FLAG_XOP;
if (ecx & 0x00010000)
rval |= AV_CPU_FLAG_FMA4;
}
}
if (!strncmp(vendor.c, "GenuineIntel", 12)) {
if (family == 6 && (model == 9 || model == 13 || model == 14)) {
/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and
* 6/14 (core1 "yonah") theoretically support sse2, but it's
* usually slower than mmx, so let's just pretend they don't.
* AV_CPU_FLAG_SSE2 is disabled and AV_CPU_FLAG_SSE2SLOW is
* enabled so that SSE2 is not used unless explicitly enabled
* by checking AV_CPU_FLAG_SSE2SLOW. The same situation
* applies for AV_CPU_FLAG_SSE3 and AV_CPU_FLAG_SSE3SLOW. */
if (rval & AV_CPU_FLAG_SSE2)
rval ^= AV_CPU_FLAG_SSE2SLOW | AV_CPU_FLAG_SSE2;
if (rval & AV_CPU_FLAG_SSE3)
rval ^= AV_CPU_FLAG_SSE3SLOW | AV_CPU_FLAG_SSE3;
}
/* The Atom processor has SSSE3 support, which is useful in many cases,
* but the SSSE3 version is sometimes slower than the SSE2 equivalent on
* the Atom, while being generally faster on other processors that support
* SSSE3. This flag allows for selectively disabling certain SSSE3
* functions on the Atom. */
if (family == 6 && model == 28)
rval |= AV_CPU_FLAG_ATOM;
}
#endif /* cpuid */
return rval;
}
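
A sketch of how the flag word assembled above is normally consumed; av_get_cpu_flags() from libavutil/cpu.h is the public entry point that ends up calling ff_get_cpu_flags_x86() on x86, and the checking function here is hypothetical:

#include "libavutil/cpu.h"

static int use_fast_sse2_example(void)
{
    int flags = av_get_cpu_flags();
    /* Honour the SSE2SLOW special case documented above: prefer SSE2 only
     * when it is present and not flagged as slow. */
    return (flags & AV_CPU_FLAG_SSE2) && !(flags & AV_CPU_FLAG_SSE2SLOW);
}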


@@ -0,0 +1,61 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_CPU_H
#define AVUTIL_X86_CPU_H
#include "config.h"
#include "libavutil/cpu.h"
#define CPUEXT(flags, suffix, cpuext) \
(HAVE_ ## cpuext ## suffix && ((flags) & AV_CPU_FLAG_ ## cpuext))
#define AV_CPU_FLAG_AMD3DNOW AV_CPU_FLAG_3DNOW
#define AV_CPU_FLAG_AMD3DNOWEXT AV_CPU_FLAG_3DNOWEXT
#define EXTERNAL_AMD3DNOW(flags) CPUEXT(flags, _EXTERNAL, AMD3DNOW)
#define EXTERNAL_AMD3DNOWEXT(flags) CPUEXT(flags, _EXTERNAL, AMD3DNOWEXT)
#define EXTERNAL_MMX(flags) CPUEXT(flags, _EXTERNAL, MMX)
#define EXTERNAL_MMXEXT(flags) CPUEXT(flags, _EXTERNAL, MMXEXT)
#define EXTERNAL_SSE(flags) CPUEXT(flags, _EXTERNAL, SSE)
#define EXTERNAL_SSE2(flags) CPUEXT(flags, _EXTERNAL, SSE2)
#define EXTERNAL_SSE3(flags) CPUEXT(flags, _EXTERNAL, SSE3)
#define EXTERNAL_SSSE3(flags) CPUEXT(flags, _EXTERNAL, SSSE3)
#define EXTERNAL_SSE4(flags) CPUEXT(flags, _EXTERNAL, SSE4)
#define EXTERNAL_SSE42(flags) CPUEXT(flags, _EXTERNAL, SSE42)
#define EXTERNAL_AVX(flags) CPUEXT(flags, _EXTERNAL, AVX)
#define EXTERNAL_FMA4(flags) CPUEXT(flags, _EXTERNAL, FMA4)
#define INLINE_AMD3DNOW(flags) CPUEXT(flags, _INLINE, AMD3DNOW)
#define INLINE_AMD3DNOWEXT(flags) CPUEXT(flags, _INLINE, AMD3DNOWEXT)
#define INLINE_MMX(flags) CPUEXT(flags, _INLINE, MMX)
#define INLINE_MMXEXT(flags) CPUEXT(flags, _INLINE, MMXEXT)
#define INLINE_SSE(flags) CPUEXT(flags, _INLINE, SSE)
#define INLINE_SSE2(flags) CPUEXT(flags, _INLINE, SSE2)
#define INLINE_SSE3(flags) CPUEXT(flags, _INLINE, SSE3)
#define INLINE_SSSE3(flags) CPUEXT(flags, _INLINE, SSSE3)
#define INLINE_SSE4(flags) CPUEXT(flags, _INLINE, SSE4)
#define INLINE_SSE42(flags) CPUEXT(flags, _INLINE, SSE42)
#define INLINE_AVX(flags) CPUEXT(flags, _INLINE, AVX)
#define INLINE_FMA4(flags) CPUEXT(flags, _INLINE, FMA4)
void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx);
void ff_cpu_xgetbv(int op, int *eax, int *edx);
int ff_cpu_cpuid_test(void);
#endif /* AVUTIL_X86_CPU_H */
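
A sketch of how these helpers gate optimized code paths: each test is true only when the build has the corresponding support (the HAVE_*_EXTERNAL / HAVE_*_INLINE side of CPUEXT) and the run-time flags report the feature. The context type and the SSE2 symbol below are hypothetical:

#include "libavutil/x86/cpu.h"

typedef struct ExampleDSPContext {
    void (*scale)(float *dst, const float *src, float mul, int len);
} ExampleDSPContext;

/* hypothetical yasm symbol, only linked in when assembled */
void ff_example_scale_sse2(float *dst, const float *src, float mul, int len);

static void example_dsp_init_x86(ExampleDSPContext *c, int cpu_flags)
{
    if (EXTERNAL_SSE2(cpu_flags))
        c->scale = ff_example_scale_sse2;
}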


@@ -0,0 +1,91 @@
;*****************************************************************************
;* Copyright (C) 2005-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void ff_cpu_cpuid(int index, int *eax, int *ebx, int *ecx, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_cpuid, 5,7
push rbx
push r4
push r3
push r2
push r1
mov eax, r0d
xor ecx, ecx
cpuid
pop r4
mov [r4], eax
pop r4
mov [r4], ebx
pop r4
mov [r4], ecx
pop r4
mov [r4], edx
pop rbx
RET
;-----------------------------------------------------------------------------
; void ff_cpu_xgetbv(int op, int *eax, int *edx)
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
push r2
push r1
mov ecx, r0d
xgetbv
pop r4
mov [r4], eax
pop r4
mov [r4], edx
RET
%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; int ff_cpu_cpuid_test(void)
; return 0 if unsupported
;-----------------------------------------------------------------------------
cglobal cpu_cpuid_test
pushfd
push ebx
push ebp
push esi
push edi
pushfd
pop eax
mov ebx, eax
xor eax, 0x200000
push eax
popfd
pushfd
pop eax
xor eax, ebx
pop edi
pop esi
pop ebp
pop ebx
popfd
ret
%endif
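
A sketch of calling the wrapper above from C (prototype from libavutil/x86/cpu.h; the caller is hypothetical); note the EBX/EDX/ECX ordering that reassembles the vendor string, mirroring ff_get_cpu_flags_x86():

#include <stdio.h>
#include "libavutil/x86/cpu.h"

static void print_cpu_vendor_example(void)
{
    int eax, vendor[3];
    /* leaf 0: EAX = highest standard leaf, EBX:EDX:ECX = vendor string */
    ff_cpu_cpuid(0, &eax, &vendor[0], &vendor[2], &vendor[1]);
    printf("%.12s\n", (const char *)vendor);   /* e.g. "GenuineIntel" */
}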


@@ -0,0 +1,170 @@
;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
mova m0, [src0q + lenq]
mova m1, [src0q + lenq + mmsize]
mulps m0, m0, [src1q + lenq]
mulps m1, m1, [src1q + lenq + mmsize]
mova [dstq + lenq], m0
mova [dstq + lenq + mmsize], m1
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif
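; For reference (comment only, not assembled): in scalar terms the loop above
; computes
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];
; with dst/src0/src1 assumed suitably aligned and len a multiple of the
; per-iteration element count (2*mmsize/4 floats).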
;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,3, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSS m0, mulm
%else
%if WIN64
mova xmm0, xmm2
%endif
shufps xmm0, xmm0, 0
%if cpuflag(avx)
vinsertf128 m0, m0, xmm0, 1
%endif
%endif
lea lenq, [lend*4-2*mmsize]
.loop:
mulps m1, m0, [srcq+lenq ]
mulps m2, m0, [srcq+lenq+mmsize]
addps m1, m1, [dstq+lenq ]
addps m2, m2, [dstq+lenq+mmsize]
mova [dstq+lenq ], m1
mova [dstq+lenq+mmsize], m2
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
movss m0, mulm
%elif WIN64
SWAP 0, 2
%endif
shufps m0, m0, 0
lea lenq, [lend*4-mmsize]
.loop:
mova m1, [srcq+lenq]
mulps m1, m0
mova [dstq+lenq], m1
sub lenq, mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse
VECTOR_FMUL_SCALAR
;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
; int len)
;------------------------------------------------------------------------------
%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
mov lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
VBROADCASTSD m0, mulm
%else
%if WIN64
movlhps xmm2, xmm2
%if cpuflag(avx)
vinsertf128 ymm2, ymm2, xmm2, 1
%endif
SWAP 0, 2
%else
movlhps xmm0, xmm0
%if cpuflag(avx)
vinsertf128 ymm0, ymm0, xmm0, 1
%endif
%endif
%endif
lea lenq, [lend*8-2*mmsize]
.loop:
mulpd m1, m0, [srcq+lenq ]
mulpd m2, m0, [srcq+lenq+mmsize]
mova [dstq+lenq ], m1
mova [dstq+lenq+mmsize], m2
sub lenq, 2*mmsize
jge .loop
REP_RET
%endmacro
INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif
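
For reference, scalar sketches of the three scalar-multiply kernels above; these mirror the generic C fallbacks in libavutil/float_dsp.c, and the alignment and len-multiple constraints from libavutil/float_dsp.h still apply to the SIMD versions:

static void vector_fmac_scalar_ref(float *dst, const float *src, float mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] += src[i] * mul;
}

static void vector_fmul_scalar_ref(float *dst, const float *src, float mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

static void vector_dmul_scalar_ref(double *dst, const double *src, double mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}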


@@ -0,0 +1,60 @@
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/float_dsp.h"
#include "cpu.h"
extern void ff_vector_fmul_sse(float *dst, const float *src0, const float *src1,
int len);
extern void ff_vector_fmul_avx(float *dst, const float *src0, const float *src1,
int len);
extern void ff_vector_fmac_scalar_sse(float *dst, const float *src, float mul,
int len);
extern void ff_vector_fmac_scalar_avx(float *dst, const float *src, float mul,
int len);
extern void ff_vector_fmul_scalar_sse(float *dst, const float *src, float mul,
int len);
extern void ff_vector_dmul_scalar_sse2(double *dst, const double *src,
double mul, int len);
extern void ff_vector_dmul_scalar_avx(double *dst, const double *src,
double mul, int len);
void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
{
int mm_flags = av_get_cpu_flags();
if (EXTERNAL_SSE(mm_flags)) {
fdsp->vector_fmul = ff_vector_fmul_sse;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_sse;
fdsp->vector_fmul_scalar = ff_vector_fmul_scalar_sse;
}
if (EXTERNAL_SSE2(mm_flags)) {
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
}
if (EXTERNAL_AVX(mm_flags)) {
fdsp->vector_fmul = ff_vector_fmul_avx;
fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
}
}
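
A usage sketch with a hypothetical caller: avpriv_float_dsp_init(), as declared in libavutil/float_dsp.h in this FFmpeg version, fills in the generic C implementations and then calls ff_float_dsp_init_x86() above to override them when the CPU allows; the buffer constraints documented in float_dsp.h (alignment, len a multiple of the SIMD width) are assumed to hold:

#include "libavutil/float_dsp.h"

static void halve_buffer_example(float *buf, int len)
{
    AVFloatDSPContext fdsp;
    avpriv_float_dsp_init(&fdsp, 0);                 /* 0: bit-exact mode not requested */
    fdsp.vector_fmul_scalar(buf, buf, 0.5f, len);    /* buf assumed aligned, len a multiple of 4 */
}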


@@ -0,0 +1,97 @@
/*
* Copyright (c) 2010 Alexander Strange <astrange@ithinksw.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_INTREADWRITE_H
#define AVUTIL_X86_INTREADWRITE_H
#include <stdint.h>
#include "config.h"
#include "libavutil/attributes.h"
#if HAVE_MMX
#if !HAVE_FAST_64BIT && defined(__MMX__)
#define AV_COPY64 AV_COPY64
static av_always_inline void AV_COPY64(void *d, const void *s)
{
__asm__("movq %1, %%mm0 \n\t"
"movq %%mm0, %0 \n\t"
: "=m"(*(uint64_t*)d)
: "m" (*(const uint64_t*)s)
: "mm0");
}
#define AV_SWAP64 AV_SWAP64
static av_always_inline void AV_SWAP64(void *a, void *b)
{
__asm__("movq %1, %%mm0 \n\t"
"movq %0, %%mm1 \n\t"
"movq %%mm0, %0 \n\t"
"movq %%mm1, %1 \n\t"
: "+m"(*(uint64_t*)a), "+m"(*(uint64_t*)b)
::"mm0", "mm1");
}
#define AV_ZERO64 AV_ZERO64
static av_always_inline void AV_ZERO64(void *d)
{
__asm__("pxor %%mm0, %%mm0 \n\t"
"movq %%mm0, %0 \n\t"
: "=m"(*(uint64_t*)d)
:: "mm0");
}
#endif /* !HAVE_FAST_64BIT && defined(__MMX__) */
#ifdef __SSE__
#define AV_COPY128 AV_COPY128
static av_always_inline void AV_COPY128(void *d, const void *s)
{
struct v {uint64_t v[2];};
__asm__("movaps %1, %%xmm0 \n\t"
"movaps %%xmm0, %0 \n\t"
: "=m"(*(struct v*)d)
: "m" (*(const struct v*)s)
: "xmm0");
}
#endif /* __SSE__ */
#ifdef __SSE2__
#define AV_ZERO128 AV_ZERO128
static av_always_inline void AV_ZERO128(void *d)
{
struct v {uint64_t v[2];};
__asm__("pxor %%xmm0, %%xmm0 \n\t"
"movdqa %%xmm0, %0 \n\t"
: "=m"(*(struct v*)d)
:: "xmm0");
}
#endif /* __SSE2__ */
#endif /* HAVE_MMX */
#endif /* AVUTIL_X86_INTREADWRITE_H */
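
A small usage sketch (hypothetical function): the AV_* copy/zero macros fall back to generic definitions in libavutil/intreadwrite.h when the MMX/SSE versions above are unavailable, and the SSE AV_COPY128/AV_ZERO128 variants expect 16-byte-aligned operands (movaps/movdqa):

#include <stdint.h>
#include "libavutil/intreadwrite.h"

static void reset_and_copy_example(uint8_t *dst, const uint8_t *src)
{
    AV_ZERO64(dst);              /* dst[0..7]   = 0          */
    AV_COPY128(dst + 16, src);   /* dst[16..31] = src[0..15] */
}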


@@ -0,0 +1,43 @@
/*
* copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef AVUTIL_X86_TIMER_H
#define AVUTIL_X86_TIMER_H
#include <stdint.h>
#if HAVE_INLINE_ASM
#define AV_READ_TIME read_time
static inline uint64_t read_time(void)
{
uint32_t a, d;
__asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
return ((uint64_t)d << 32) + a;
}
#elif HAVE_RDTSC
#define AV_READ_TIME __rdtsc
#endif /* HAVE_INLINE_ASM */
#endif /* AVUTIL_X86_TIMER_H */
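
A micro-benchmark sketch using the reader above, assuming AV_READ_TIME is available on the build in question (the helper is hypothetical); raw TSC values are only meaningful as differences taken on the same core:

#include <stdint.h>

static uint64_t cycles_spent_example(void (*fn)(void))
{
    uint64_t t0 = AV_READ_TIME();
    fn();
    return AV_READ_TIME() - t0;
}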


@@ -0,0 +1,73 @@
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2008 Ramiro Polla <ramiro.polla@gmail.com>
*
* This file is part of Libav.
*
* Libav is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* Libav is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with Libav; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <inttypes.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include "libavutil/bswap.h"
#define storexmmregs(mem) \
__asm__ volatile( \
"movups %%xmm6 , 0x00(%0)\n\t" \
"movups %%xmm7 , 0x10(%0)\n\t" \
"movups %%xmm8 , 0x20(%0)\n\t" \
"movups %%xmm9 , 0x30(%0)\n\t" \
"movups %%xmm10, 0x40(%0)\n\t" \
"movups %%xmm11, 0x50(%0)\n\t" \
"movups %%xmm12, 0x60(%0)\n\t" \
"movups %%xmm13, 0x70(%0)\n\t" \
"movups %%xmm14, 0x80(%0)\n\t" \
"movups %%xmm15, 0x90(%0)\n\t" \
:: "r"(mem) : "memory")
#define testxmmclobbers(func, ctx, ...) \
uint64_t xmm[2][10][2]; \
int ret; \
storexmmregs(xmm[0]); \
ret = __real_ ## func(ctx, __VA_ARGS__); \
storexmmregs(xmm[1]); \
if (memcmp(xmm[0], xmm[1], sizeof(xmm[0]))) { \
int i; \
av_log(ctx, AV_LOG_ERROR, \
"XMM REGS CLOBBERED IN %s!\n", #func); \
for (i = 0; i < 10; i ++) \
if (xmm[0][i][0] != xmm[1][i][0] || \
xmm[0][i][1] != xmm[1][i][1]) { \
av_log(ctx, AV_LOG_ERROR, \
"xmm%-2d = %016"PRIx64"%016"PRIx64"\n", \
6 + i, av_bswap64(xmm[0][i][0]), \
av_bswap64(xmm[0][i][1])); \
av_log(ctx, AV_LOG_ERROR, \
" -> %016"PRIx64"%016"PRIx64"\n", \
av_bswap64(xmm[1][i][0]), \
av_bswap64(xmm[1][i][1])); \
} \
abort(); \
} \
return ret
#define wrap(func) \
int __real_ ## func; \
int __wrap_ ## func; \
int __wrap_ ## func
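
A sketch of how these macros are meant to be wired up; the wrapped function below is hypothetical, and the redirection is assumed to use the GNU linker's --wrap option: calls to some_func go to __wrap_some_func, which snapshots xmm6-xmm15 around __real_some_func and aborts on any clobber, since the Win64 ABI requires those registers to be preserved:

#include "libavutil/log.h"              /* for av_log() used by testxmmclobbers */
#include "libavutil/x86/w64xmmtest.h"   /* assumed location of the header above */

wrap(some_func(void *log_ctx, int arg))
{
    testxmmclobbers(some_func, log_ctx, arg);
}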

File diff suppressed because it is too large.


@@ -0,0 +1,667 @@
;*****************************************************************************
;* x86util.asm
;*****************************************************************************
;* Copyright (C) 2008-2010 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%define program_name ff
%define cpuflags_mmxext cpuflags_mmx2
%include "libavutil/x86/x86inc.asm"
%macro SBUTTERFLY 4
%if avx_enabled == 0
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
%else
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
%endif
SWAP %3, %4
%endmacro
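; For orientation (comment only): SBUTTERFLY t, a, b, tmp interleaves the low
; and high lanes of registers a and b at granularity t; e.g. for words
; a = {a0,a1,a2,a3}, b = {b0,b1,b2,b3} it leaves {a0,b0,a1,b1} in a and
; {a2,b2,a3,b3} in b. Chaining bw/wd/dq/qdq butterflies is what builds the
; TRANSPOSE* macros below.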
%macro SBUTTERFLY2 4
punpckl%1 m%4, m%2, m%3
punpckh%1 m%2, m%2, m%3
SWAP %2, %4, %3
%endmacro
%macro SBUTTERFLYPS 3
unpcklps m%3, m%1, m%2
unpckhps m%1, m%1, m%2
SWAP %1, %3, %2
%endmacro
%macro TRANSPOSE4x4B 5
SBUTTERFLY bw, %1, %2, %5
SBUTTERFLY bw, %3, %4, %5
SBUTTERFLY wd, %1, %3, %5
SBUTTERFLY wd, %2, %4, %5
SWAP %2, %3
%endmacro
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SWAP %2, %3
%endmacro
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SBUTTERFLY qdq, %1, %2, %5
SBUTTERFLY qdq, %3, %4, %5
%endmacro
%macro TRANSPOSE4x4D 5
SBUTTERFLY dq, %1, %2, %5
SBUTTERFLY dq, %3, %4, %5
SBUTTERFLY qdq, %1, %3, %5
SBUTTERFLY qdq, %2, %4, %5
SWAP %2, %3
%endmacro
; identical behavior to TRANSPOSE4x4D, but using SSE1 float ops
%macro TRANSPOSE4x4PS 5
SBUTTERFLYPS %1, %2, %5
SBUTTERFLYPS %3, %4, %5
movlhps m%5, m%1, m%3
movhlps m%3, m%1
SWAP %5, %1
movlhps m%5, m%2, m%4
movhlps m%4, m%2
SWAP %5, %2, %3
%endmacro
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9
SBUTTERFLY wd, %3, %4, %9
SBUTTERFLY wd, %5, %6, %9
SBUTTERFLY wd, %7, %8, %9
SBUTTERFLY dq, %1, %3, %9
SBUTTERFLY dq, %2, %4, %9
SBUTTERFLY dq, %5, %7, %9
SBUTTERFLY dq, %6, %8, %9
SBUTTERFLY qdq, %1, %5, %9
SBUTTERFLY qdq, %2, %6, %9
SBUTTERFLY qdq, %3, %7, %9
SBUTTERFLY qdq, %4, %8, %9
SWAP %2, %5
SWAP %4, %7
%else
; in: m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
movdqa %9, m%7
%endif
SBUTTERFLY wd, %1, %2, %7
movdqa %10, m%2
movdqa m%7, %9
SBUTTERFLY wd, %3, %4, %2
SBUTTERFLY wd, %5, %6, %2
SBUTTERFLY wd, %7, %8, %2
SBUTTERFLY dq, %1, %3, %2
movdqa %9, m%3
movdqa m%2, %10
SBUTTERFLY dq, %2, %4, %3
SBUTTERFLY dq, %5, %7, %3
SBUTTERFLY dq, %6, %8, %3
SBUTTERFLY qdq, %1, %5, %3
SBUTTERFLY qdq, %2, %6, %3
movdqa %10, m%2
movdqa m%3, %9
SBUTTERFLY qdq, %3, %7, %2
SBUTTERFLY qdq, %4, %8, %2
SWAP %2, %5
SWAP %4, %7
%if %0<11
movdqa m%5, %10
%endif
%endif
%endmacro
; PABSW macro assumes %1 != %2, while ABS1/2 macros work in-place
%macro PABSW 2
%if cpuflag(ssse3)
pabsw %1, %2
%elif cpuflag(mmxext)
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
%else
pxor %1, %1
pcmpgtw %1, %2
pxor %2, %1
psubw %2, %1
SWAP %1, %2
%endif
%endmacro
%macro PSIGNW_MMX 2
pxor %1, %2
psubw %1, %2
%endmacro
%macro PSIGNW_SSSE3 2
psignw %1, %2
%endmacro
%macro ABS1_MMX 2 ; a, tmp
pxor %2, %2
pcmpgtw %2, %1
pxor %1, %2
psubw %1, %2
%endmacro
%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
pxor %3, %3
pxor %4, %4
pcmpgtw %3, %1
pcmpgtw %4, %2
pxor %1, %3
pxor %2, %4
psubw %1, %3
psubw %2, %4
%endmacro
%macro ABS1_MMXEXT 2 ; a, tmp
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
%endmacro
%macro ABS2_MMXEXT 4 ; a, b, tmp0, tmp1
pxor %3, %3
pxor %4, %4
psubw %3, %1
psubw %4, %2
pmaxsw %1, %3
pmaxsw %2, %4
%endmacro
%macro ABS1_SSSE3 2
pabsw %1, %1
%endmacro
%macro ABS2_SSSE3 4
pabsw %1, %1
pabsw %2, %2
%endmacro
%macro ABSB_MMX 2
pxor %2, %2
psubb %2, %1
pminub %1, %2
%endmacro
%macro ABSB2_MMX 4
pxor %3, %3
pxor %4, %4
psubb %3, %1
psubb %4, %2
pminub %1, %3
pminub %2, %4
%endmacro
%macro ABSD2_MMX 4
pxor %3, %3
pxor %4, %4
pcmpgtd %3, %1
pcmpgtd %4, %2
pxor %1, %3
pxor %2, %4
psubd %1, %3
psubd %2, %4
%endmacro
%macro ABSB_SSSE3 2
pabsb %1, %1
%endmacro
%macro ABSB2_SSSE3 4
pabsb %1, %1
pabsb %2, %2
%endmacro
%macro ABS4 6
ABS2 %1, %2, %5, %6
ABS2 %3, %4, %5, %6
%endmacro
%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX
%define ABSB ABSB_MMX
%define ABSB2 ABSB2_MMX
%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
movd %1, [%2-3]
pshufb %1, %3
%else
movd %1, [%2-3] ;to avoid crossing a cacheline
punpcklbw %1, %1
SPLATW %1, %1, 3
%endif
%endmacro
%macro SPLATB_REG 3
%if cpuflag(ssse3)
movd %1, %2d
pshufb %1, %3
%else
movd %1, %2d
punpcklbw %1, %1
SPLATW %1, %1, 0
%endif
%endmacro
%macro PALIGNR 4-5
%if cpuflag(ssse3)
%if %0==5
palignr %1, %2, %3, %4
%else
palignr %1, %2, %3
%endif
%elif cpuflag(mmx) ; [dst,] src1, src2, imm, tmp
%define %%dst %1
%if %0==5
%ifnidn %1, %2
mova %%dst, %2
%endif
%rotate 1
%endif
%ifnidn %4, %2
mova %4, %2
%endif
%if mmsize==8
psllq %%dst, (8-%3)*8
psrlq %4, %3*8
%else
pslldq %%dst, 16-%3
psrldq %4, %3
%endif
por %%dst, %4
%endif
%endmacro
%macro PSHUFLW 1+
%if mmsize == 8
pshufw %1
%else
pshuflw %1
%endif
%endmacro
%macro PSWAPD 2
%if cpuflag(mmxext)
pshufw %1, %2, q1032
%elif cpuflag(3dnowext)
pswapd %1, %2
%elif cpuflag(3dnow)
movq %1, %2
psrlq %1, 32
punpckldq %1, %2
%endif
%endmacro
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
pand m%3, m%5, m%4 ; src .. y6 .. y4
pand m%1, m%5, m%2 ; dst .. y6 .. y4
%else
mova m%1, %5
pand m%3, m%1, m%4 ; src .. y6 .. y4
pand m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
psrlw m%2, 8 ; dst .. y7 .. y5
psrlw m%4, 8 ; src .. y7 .. y5
%endmacro
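; For orientation (comment only): SUMSUB_BA t, a, b[, tmp] is the add/sub
; butterfly used by the DCT/IDCT helpers below; it leaves a+b in the first
; register and b-a in the second.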
%macro SUMSUB_BA 3-4
%if %0==3
padd%1 m%2, m%3
padd%1 m%3, m%3
psub%1 m%3, m%2
%else
%if avx_enabled == 0
mova m%4, m%2
padd%1 m%2, m%3
psub%1 m%3, m%4
%else
padd%1 m%4, m%2, m%3
psub%1 m%3, m%2
SWAP %2, %4
%endif
%endif
%endmacro
%macro SUMSUB_BADC 5-6
%if %0==6
SUMSUB_BA %1, %2, %3, %6
SUMSUB_BA %1, %4, %5, %6
%else
padd%1 m%2, m%3
padd%1 m%4, m%5
padd%1 m%3, m%3
padd%1 m%5, m%5
psub%1 m%3, m%2
psub%1 m%5, m%4
%endif
%endmacro
%macro SUMSUB2_AB 4
%ifnum %3
psub%1 m%4, m%2, m%3
psub%1 m%4, m%3
padd%1 m%2, m%2
padd%1 m%2, m%3
%else
mova m%4, m%2
padd%1 m%2, m%2
padd%1 m%2, %3
psub%1 m%4, %3
psub%1 m%4, %3
%endif
%endmacro
%macro SUMSUB2_BA 4
%if avx_enabled == 0
mova m%4, m%2
padd%1 m%2, m%3
padd%1 m%2, m%3
psub%1 m%3, m%4
psub%1 m%3, m%4
%else
padd%1 m%4, m%2, m%3
padd%1 m%4, m%3
psub%1 m%3, m%2
psub%1 m%3, m%2
SWAP %2, %4
%endif
%endmacro
%macro SUMSUBD2_AB 5
%ifnum %4
psra%1 m%5, m%2, 1 ; %3: %3>>1
psra%1 m%4, m%3, 1 ; %2: %2>>1
padd%1 m%4, m%2 ; %3: %3>>1+%2
psub%1 m%5, m%3 ; %2: %2>>1-%3
SWAP %2, %5
SWAP %3, %4
%else
mova %5, m%2
mova %4, m%3
psra%1 m%3, 1 ; %3: %3>>1
psra%1 m%2, 1 ; %2: %2>>1
padd%1 m%3, %5 ; %3: %3>>1+%2
psub%1 m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro
%macro DCT4_1D 5
%ifnum %5
SUMSUB_BADC w, %4, %1, %3, %2, %5
SUMSUB_BA w, %3, %4, %5
SUMSUB2_AB w, %1, %2, %5
SWAP %1, %3, %4, %5, %2
%else
SUMSUB_BADC w, %4, %1, %3, %2
SUMSUB_BA w, %3, %4
mova [%5], m%2
SUMSUB2_AB w, %1, [%5], %2
SWAP %1, %3, %4, %2
%endif
%endmacro
%macro IDCT4_1D 6-7
%ifnum %6
SUMSUBD2_AB %1, %3, %5, %7, %6
; %3: %3>>1-%5 %5: %3+%5>>1
SUMSUB_BA %1, %4, %2, %7
; %4: %2+%4 %2: %2-%4
SUMSUB_BADC %1, %5, %4, %3, %2, %7
; %5: %2+%4 + (%3+%5>>1)
; %4: %2+%4 - (%3+%5>>1)
; %3: %2-%4 + (%3>>1-%5)
; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
SUMSUB_BA %1, %4, %2
SUMSUB_BADC %1, %5, %4, %3, %2
%endif
SWAP %2, %5, %4
; %2: %2+%4 + (%3+%5>>1) row0
; %3: %2-%4 + (%3>>1-%5) row1
; %4: %2-%4 - (%3>>1-%5) row2
; %5: %2+%4 - (%3+%5>>1) row3
%endmacro
%macro LOAD_DIFF 5
%ifidn %3, none
movh %1, %4
movh %2, %5
punpcklbw %1, %2
punpcklbw %2, %2
psubw %1, %2
%else
movh %1, %4
punpcklbw %1, %3
movh %2, %5
punpcklbw %2, %3
psubw %1, %2
%endif
%endmacro
%macro STORE_DCT 6
movq [%5+%6+ 0], m%1
movq [%5+%6+ 8], m%2
movq [%5+%6+16], m%3
movq [%5+%6+24], m%4
movhps [%5+%6+32], m%1
movhps [%5+%6+40], m%2
movhps [%5+%6+48], m%3
movhps [%5+%6+56], m%4
%endmacro
%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
LOAD_DIFF m%1, m%5, m%7, [%8], [%9]
LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3]
LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5]
%if %10
lea %8, [%8+4*r1]
lea %9, [%9+4*r3]
%endif
%endmacro
%macro DIFFx2 6-7
movh %3, %5
punpcklbw %3, %4
psraw %1, 6
paddsw %1, %3
movh %3, %6
punpcklbw %3, %4
psraw %2, 6
paddsw %2, %3
packuswb %2, %1
%endmacro
%macro STORE_DIFF 4
movh %2, %4
punpcklbw %2, %3
psraw %1, 6
paddsw %1, %2
packuswb %1, %1
movh %4, %1
%endmacro
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
movh %3, [%7]
movh %4, [%7+%8]
psraw %1, %6
psraw %2, %6
punpcklbw %3, %5
punpcklbw %4, %5
paddw %3, %1
paddw %4, %2
packuswb %3, %5
packuswb %4, %5
movh [%7], %3
movh [%7+%8], %4
%endmacro
%macro PMINUB 3 ; dst, src, ignored
%if cpuflag(mmxext)
pminub %1, %2
%else ; dst, src, tmp
mova %3, %1
psubusb %3, %2
psubb %1, %3
%endif
%endmacro
%macro SPLATW 2-3 0
%if mmsize == 16
pshuflw %1, %2, (%3)*0x55
punpcklqdq %1, %1
%elif cpuflag(mmxext)
pshufw %1, %2, (%3)*0x55
%else
%ifnidn %1, %2
mova %1, %2
%endif
%if %3 & 2
punpckhwd %1, %1
%else
punpcklwd %1, %1
%endif
%if %3 & 1
punpckhwd %1, %1
%else
punpcklwd %1, %1
%endif
%endif
%endmacro
%macro SPLATD 1
%if mmsize == 8
punpckldq %1, %1
%elif cpuflag(sse2)
pshufd %1, %1, 0
%elif cpuflag(sse)
shufps %1, %1, 0
%endif
%endmacro
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
pminsw %1, %3
%endmacro
%macro PMINSD_MMX 3 ; dst, src, tmp
mova %3, %2
pcmpgtd %3, %1
pxor %1, %2
pand %1, %3
pxor %1, %2
%endmacro
%macro PMAXSD_MMX 3 ; dst, src, tmp
mova %3, %1
pcmpgtd %3, %2
pand %1, %3
pandn %3, %2
por %1, %3
%endmacro
%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp
PMINSD_MMX %1, %3, %4
PMAXSD_MMX %1, %2, %4
%endmacro
%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused
cvtdq2ps %1, %1
minps %1, %3
maxps %1, %2
cvtps2dq %1, %1
%endmacro
%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused
pminsd %1, %3
pmaxsd %1, %2
%endmacro
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
%if cpuflag(avx)
vbroadcastss %1, %2
%else ; sse
movss %1, %2
shufps %1, %1, 0
%endif
%endmacro
%macro VBROADCASTSD 2 ; dst xmm/ymm, src m64
%if cpuflag(avx) && mmsize == 32
vbroadcastsd %1, %2
%elif cpuflag(sse3)
movddup %1, %2
%else ; sse2
movsd %1, %2
movlhps %1, %1
%endif
%endmacro
%macro SHUFFLE_MASK_W 8
%rep 8
%if %1>=0x80
db %1, %1
%else
db %1*2
db %1*2+1
%endif
%rotate 1
%endrep
%endmacro
%macro PMOVSXWD 2; dst, src
%if cpuflag(sse4)
pmovsxwd %1, %2
%else
%ifnidn %1, %2
mova %1, %2
%endif
punpcklwd %1, %1
psrad %1, 16
%endif
%endmacro