Updated FFmpeg to version 1.1.2, using this project: http://sourceforge.net/projects/ffmpeg4android/
project/jni/ffmpeg/libavcodec/x86/Makefile (new file, 79 lines)
@@ -0,0 +1,79 @@
OBJS                                   += x86/fmtconvert_init.o

OBJS-$(CONFIG_AAC_DECODER)             += x86/sbrdsp_init.o
OBJS-$(CONFIG_AC3DSP)                  += x86/ac3dsp_init.o
OBJS-$(CONFIG_CAVS_DECODER)            += x86/cavsdsp.o
OBJS-$(CONFIG_DNXHD_ENCODER)           += x86/dnxhdenc.o
OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
OBJS-$(CONFIG_GPL)                     += x86/idct_mmx.o
OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED)                += x86/h264_intrapred_init.o
OBJS-$(CONFIG_LPC)                     += x86/lpc.o
OBJS-$(CONFIG_MLP_DECODER)             += x86/mlpdsp.o
OBJS-$(CONFIG_MPEGAUDIODSP)            += x86/mpegaudiodec.o
OBJS-$(CONFIG_MPEGVIDEO)               += x86/mpegvideo.o
OBJS-$(CONFIG_MPEGVIDEOENC)            += x86/mpegvideoenc.o
OBJS-$(CONFIG_PNG_DECODER)             += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER)          += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER)     += x86/proresdsp_init.o
OBJS-$(CONFIG_RV30_DECODER)            += x86/rv34dsp_init.o
OBJS-$(CONFIG_RV40_DECODER)            += x86/rv34dsp_init.o         \
                                          x86/rv40dsp_init.o
OBJS-$(CONFIG_V210_DECODER)            += x86/v210-init.o
OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP5_DECODER)             += x86/vp56dsp_init.o
OBJS-$(CONFIG_VP6_DECODER)             += x86/vp56dsp_init.o
OBJS-$(CONFIG_VP8_DECODER)             += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST)        += x86/w64xmmtest.o

MMX-OBJS                               += x86/dsputil_mmx.o          \
                                          x86/fdct.o                 \
                                          x86/idct_mmx_xvid.o        \
                                          x86/idct_sse2_xvid.o       \
                                          x86/simple_idct.o          \

MMX-OBJS-$(CONFIG_DWT)                 += x86/snowdsp.o              \
                                          x86/dwt.o
MMX-OBJS-$(CONFIG_ENCODERS)            += x86/dsputilenc_mmx.o       \
                                          x86/motion_est.o
MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o

YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_AC3DSP)             += x86/ac3dsp.o
YASM-OBJS-$(CONFIG_DCT)                += x86/dct32.o
YASM-OBJS-$(CONFIG_DIRAC_DECODER)      += x86/diracdsp_mmx.o x86/diracdsp_yasm.o
YASM-OBJS-$(CONFIG_DWT)                += x86/dwt_yasm.o
YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc.o
YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o        \
                                          x86/h264_chromamc_10bit.o
YASM-OBJS-$(CONFIG_H264DSP)            += x86/h264_deblock.o         \
                                          x86/h264_deblock_10bit.o   \
                                          x86/h264_idct.o            \
                                          x86/h264_idct_10bit.o      \
                                          x86/h264_weight.o          \
                                          x86/h264_weight_10bit.o
YASM-OBJS-$(CONFIG_H264PRED)           += x86/h264_intrapred.o       \
                                          x86/h264_intrapred_10bit.o
YASM-OBJS-$(CONFIG_H264QPEL)           += x86/h264_qpel_8bit.o       \
                                          x86/h264_qpel_10bit.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP)       += x86/imdct36.o
YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o              \
                                          x86/rv40dsp.o
YASM-OBJS-$(CONFIG_V210_DECODER)       += x86/v210.o
YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp56dsp.o
YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o

YASM-OBJS                              += x86/dsputil.o              \
                                          x86/deinterlace.o          \
                                          x86/fmtconvert.o           \
project/jni/ffmpeg/libavcodec/x86/ac3dsp.asm (new file, 460 lines)
@@ -0,0 +1,460 @@
;*****************************************************************************
;* x86-optimized AC-3 DSP utils
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000

; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7

; used in ff_ac3_extract_exponents()
pd_1:   times 4 dd 1
pd_151: times 4 dd 151

SECTION .text

;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;-----------------------------------------------------------------------------
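;
; Roughly equivalent C, as an illustrative sketch only (it assumes the
; exponent sets for consecutive blocks are spaced 256 bytes apart, and uses
; FFMIN for brevity):
;
;     for (i = 0; i < nb_coefs; i++)
;         for (blk = 1; blk <= num_reuse_blocks; blk++)
;             exp[i] = FFMIN(exp[i], exp[256 * blk + i]);
;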

%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
    shl reuse_blksq, 8
    jz .end
    LOOP_ALIGN
.nextexp:
    mov offsetq, reuse_blksq
    mova m0, [expq+offsetq]
    sub offsetq, 256
    LOOP_ALIGN
.nextblk:
    PMINUB m0, [expq+offsetq], m1
    sub offsetq, 256
    jae .nextblk
    mova [expq], m0
    add expq, mmsize
    sub expnq, mmsize
    jg .nextexp
.end:
    REP_RET
%endmacro

%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN

;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
;        This is used for ssse3 because of the pabsw instruction.
;        It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
;        This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;-----------------------------------------------------------------------------
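;
; Illustrative C sketches of the two methods (added for clarity, not code
; from this commit; FFMIN/FFMAX are assumed):
;
;     /* 1) or_abs */
;     v = 0;
;     for (i = 0; i < len; i++)
;         v |= abs(src[i]);
;
;     /* 2) min_max */
;     min = max = 0;
;     for (i = 0; i < len; i++) {
;         min = FFMIN(min, src[i]);
;         max = FFMAX(max, src[i]);
;     }
;     v = abs(min) | abs(max);
;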

; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
    movhlps %2, %1
    por %1, %2
    pshuflw %2, %1, q0032
    por %1, %2
    pshuflw %2, %1, q0001
    por %1, %2
%elif cpuflag(mmxext)
    pshufw %2, %1, q0032
    por %1, %2
    pshufw %2, %1, q0001
    por %1, %2
%else ; mmx
    movq %2, %1
    psrlq %2, 32
    por %1, %2
    movq %2, %1
    psrlq %2, 16
    por %1, %2
%endif
%endmacro

%macro AC3_MAX_MSB_ABS_INT16 1
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
    pxor m2, m2
    pxor m3, m3
.loop:
%ifidn %1, min_max
    mova m0, [srcq]
    mova m1, [srcq+mmsize]
    pminsw m2, m0
    pminsw m2, m1
    pmaxsw m3, m0
    pmaxsw m3, m1
%else ; or_abs
%if notcpuflag(ssse3)
    mova m0, [srcq]
    mova m1, [srcq+mmsize]
    ABS2 m0, m1, m3, m4
%else ; ssse3
    ; using memory args is faster for ssse3
    pabsw m0, [srcq]
    pabsw m1, [srcq+mmsize]
%endif
    por m2, m0
    por m2, m1
%endif
    add srcq, mmsize*2
    sub lend, mmsize
    ja .loop
%ifidn %1, min_max
    ABS2 m2, m3, m0, m1
    por m2, m3
%endif
    OR_WORDS_HORIZ m2, m0
    movd eax, m2
    and eax, 0xFFFF
    RET
%endmacro

INIT_MMX mmx
%define ABS2 ABS2_MMX
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
%define ABS2 ABS2_MMXEXT
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 or_abs

;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;-----------------------------------------------------------------------------

%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
    movd m0, shiftd
.loop:
    mova m1, [srcq]
    mova m2, [srcq+mmsize]
    mova m3, [srcq+mmsize*2]
    mova m4, [srcq+mmsize*3]
    %3 m1, m0
    %3 m2, m0
    %3 m3, m0
    %3 m4, m0
    mova [srcq], m1
    mova [srcq+mmsize], m2
    mova [srcq+mmsize*2], m3
    mova [srcq+mmsize*3], m4
    add srcq, mmsize*4
    sub lend, mmsize*32/%2
    ja .loop
.end:
    REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw

;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------

INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad

;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------

; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
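; (For example, an intermediate product of 1.75 becomes 1 under truncation
; but 2 under round-to-nearest.)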
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
    movq m0, [pf_1_24]
.loop:
    movq m1, [srcq]
    movq m2, [srcq+8]
    movq m3, [srcq+16]
    movq m4, [srcq+24]
    pfmul m1, m0
    pfmul m2, m0
    pfmul m3, m0
    pfmul m4, m0
    pf2id m1, m1
    pf2id m2, m2
    pf2id m3, m3
    pf2id m4, m4
    movq [dstq], m1
    movq [dstq+8], m2
    movq [dstq+16], m3
    movq [dstq+24], m4
    add srcq, 32
    add dstq, 32
    sub lend, 8
    ja .loop
    femms
    RET

INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
    movaps m0, [pf_1_24]
.loop:
    movaps m1, [srcq]
    movaps m2, [srcq+16]
    mulps m1, m0
    mulps m2, m0
    cvtps2pi mm0, m1
    movhlps m1, m1
    cvtps2pi mm1, m1
    cvtps2pi mm2, m2
    movhlps m2, m2
    cvtps2pi mm3, m2
    movq [dstq], mm0
    movq [dstq+ 8], mm1
    movq [dstq+16], mm2
    movq [dstq+24], mm3
    add srcq, 32
    add dstq, 32
    sub lend, 8
    ja .loop
    emms
    RET

INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
    movaps m0, [pf_1_24]
.loop:
    movaps m1, [srcq]
    movaps m2, [srcq+16]
    movaps m3, [srcq+32]
    movaps m4, [srcq+48]
%ifdef m8
    movaps m5, [srcq+64]
    movaps m6, [srcq+80]
    movaps m7, [srcq+96]
    movaps m8, [srcq+112]
%endif
    mulps m1, m0
    mulps m2, m0
    mulps m3, m0
    mulps m4, m0
%ifdef m8
    mulps m5, m0
    mulps m6, m0
    mulps m7, m0
    mulps m8, m0
%endif
    cvtps2dq m1, m1
    cvtps2dq m2, m2
    cvtps2dq m3, m3
    cvtps2dq m4, m4
%ifdef m8
    cvtps2dq m5, m5
    cvtps2dq m6, m6
    cvtps2dq m7, m7
    cvtps2dq m8, m8
%endif
    movdqa [dstq], m1
    movdqa [dstq+16], m2
    movdqa [dstq+32], m3
    movdqa [dstq+48], m4
%ifdef m8
    movdqa [dstq+64], m5
    movdqa [dstq+80], m6
    movdqa [dstq+96], m7
    movdqa [dstq+112], m8
    add srcq, 128
    add dstq, 128
    sub lenq, 32
%else
    add srcq, 64
    add dstq, 64
    sub lenq, 16
%endif
    ja .loop
    REP_RET

;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------

%macro PHADDD4 2 ; xmm src, xmm tmp
    movhlps %2, %1
    paddd %1, %2
    pshufd %2, %1, 0x1
    paddd %1, %2
%endmacro

INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
    movdqa m0, [mant_cntq]
    movdqa m1, [mant_cntq+ 1*16]
    paddw m0, [mant_cntq+ 2*16]
    paddw m1, [mant_cntq+ 3*16]
    paddw m0, [mant_cntq+ 4*16]
    paddw m1, [mant_cntq+ 5*16]
    paddw m0, [mant_cntq+ 6*16]
    paddw m1, [mant_cntq+ 7*16]
    paddw m0, [mant_cntq+ 8*16]
    paddw m1, [mant_cntq+ 9*16]
    paddw m0, [mant_cntq+10*16]
    paddw m1, [mant_cntq+11*16]
    pmaddwd m0, [ac3_bap_bits]
    pmaddwd m1, [ac3_bap_bits+16]
    paddd m0, m1
    PHADDD4 m0, m1
    movd sumd, m0
    movdqa m3, [pw_bap_mul1]
    movhpd m0, [mant_cntq+2]
    movlpd m0, [mant_cntq+1*32+2]
    movhpd m1, [mant_cntq+2*32+2]
    movlpd m1, [mant_cntq+3*32+2]
    movhpd m2, [mant_cntq+4*32+2]
    movlpd m2, [mant_cntq+5*32+2]
    pmulhuw m0, m3
    pmulhuw m1, m3
    pmulhuw m2, m3
    paddusw m0, m1
    paddusw m0, m2
    pmaddwd m0, [pw_bap_mul2]
    PHADDD4 m0, m1
    movd eax, m0
    add eax, sumd
    RET

;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------

%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
    pabsd %1, %1
%else ; src/dst, tmp
    pxor %2, %2
    pcmpgtd %2, %1
    pxor %1, %2
    psubd %1, %2
%endif
%endmacro

%if HAVE_AMD3DNOW_EXTERNAL
INIT_MMX 3dnow
cglobal ac3_extract_exponents, 3, 3, 0, exp, coef, len
    add expq, lenq
    lea coefq, [coefq+4*lenq]
    neg lenq
    movq m3, [pd_1]
    movq m4, [pd_151]
.loop:
    movq m0, [coefq+4*lenq]
    movq m1, [coefq+4*lenq+8]
    PABSD m0, m2
    PABSD m1, m2
    pslld m0, 1
    por m0, m3
    pi2fd m2, m0
    psrld m2, 23
    movq m0, m4
    psubd m0, m2
    pslld m1, 1
    por m1, m3
    pi2fd m2, m1
    psrld m2, 23
    movq m1, m4
    psubd m1, m2
    packssdw m0, m0
    packuswb m0, m0
    packssdw m1, m1
    packuswb m1, m1
    punpcklwd m0, m1
    movd [expq+lenq], m0
    add lenq, 4
    jl .loop
    REP_RET
%endif

%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
    add expq, lenq
    lea coefq, [coefq+4*lenq]
    neg lenq
    mova m2, [pd_1]
    mova m3, [pd_151]
.loop:
    ; move 4 32-bit coefs to xmm0
    mova m0, [coefq+4*lenq]
    ; absolute value
    PABSD m0, m1
    ; convert to float and extract exponents
    pslld m0, 1
    por m0, m2
    cvtdq2ps m1, m0
    psrld m1, 23
    mova m0, m3
    psubd m0, m1
    ; move the lowest byte in each of 4 dwords to the low dword
    ; NOTE: We cannot just extract the low bytes with pshufb because the dword
    ;       result for 16777215 is -1 due to float inaccuracy. Using packuswb
    ;       clips this to 0, which is the correct exponent.
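    ;       (Worked detail, added for clarity: 16777215 is shifted and or'd
    ;       to 33554431, which is not exactly representable in single
    ;       precision and rounds up to 2^25, so 151 minus the biased
    ;       exponent 152 yields -1.)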
    packssdw m0, m0
    packuswb m0, m0
    movd [expq+lenq], m0

    add lenq, 4
    jl .loop
    REP_RET
%endmacro

%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
project/jni/ffmpeg/libavcodec/x86/ac3dsp_init.c (new file, 232 lines)
@@ -0,0 +1,232 @@
/*
 * x86-optimized AC-3 DSP utils
 * Copyright (c) 2011 Justin Ruggles
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_mmx.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"

extern void ff_ac3_exponent_min_mmx   (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
extern void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
extern void ff_ac3_exponent_min_sse2  (uint8_t *exp, int num_reuse_blocks, int nb_coefs);

extern int ff_ac3_max_msb_abs_int16_mmx   (const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_sse2  (const int16_t *src, int len);
extern int ff_ac3_max_msb_abs_int16_ssse3 (const int16_t *src, int len);

extern void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);

extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);

extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
extern void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);

extern int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);

extern void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
extern void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
extern void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);

#if ARCH_X86_32 && defined(__INTEL_COMPILER)
#       undef HAVE_7REGS
#       define HAVE_7REGS 0
#endif

#if HAVE_SSE_INLINE && HAVE_7REGS

#define IF1(x) x
#define IF0(x)

#define MIX5(mono, stereo) \
    __asm__ volatile ( \
        "movss 0(%1), %%xmm5 \n" \
        "movss 8(%1), %%xmm6 \n" \
        "movss 24(%1), %%xmm7 \n" \
        "shufps $0, %%xmm5, %%xmm5 \n" \
        "shufps $0, %%xmm6, %%xmm6 \n" \
        "shufps $0, %%xmm7, %%xmm7 \n" \
        "1: \n" \
        "movaps (%0, %2), %%xmm0 \n" \
        "movaps (%0, %3), %%xmm1 \n" \
        "movaps (%0, %4), %%xmm2 \n" \
        "movaps (%0, %5), %%xmm3 \n" \
        "movaps (%0, %6), %%xmm4 \n" \
        "mulps %%xmm5, %%xmm0 \n" \
        "mulps %%xmm6, %%xmm1 \n" \
        "mulps %%xmm5, %%xmm2 \n" \
        "mulps %%xmm7, %%xmm3 \n" \
        "mulps %%xmm7, %%xmm4 \n" \
 stereo("addps %%xmm1, %%xmm0 \n") \
        "addps %%xmm1, %%xmm2 \n" \
        "addps %%xmm3, %%xmm0 \n" \
        "addps %%xmm4, %%xmm2 \n" \
   mono("addps %%xmm2, %%xmm0 \n") \
        "movaps %%xmm0, (%0, %2) \n" \
 stereo("movaps %%xmm2, (%0, %3) \n") \
        "add $16, %0 \n" \
        "jl 1b \n" \
        : "+&r"(i) \
        : "r"(matrix), \
          "r"(samples[0] + len), \
          "r"(samples[1] + len), \
          "r"(samples[2] + len), \
          "r"(samples[3] + len), \
          "r"(samples[4] + len) \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
          "memory" \
    );

#define MIX_MISC(stereo) \
    __asm__ volatile ( \
        "mov %5, %2 \n" \
        "1: \n" \
        "mov -%c7(%6, %2, %c8), %3 \n" \
        "movaps (%3, %0), %%xmm0 \n" \
 stereo("movaps %%xmm0, %%xmm1 \n") \
        "mulps %%xmm4, %%xmm0 \n" \
 stereo("mulps %%xmm5, %%xmm1 \n") \
        "2: \n" \
        "mov (%6, %2, %c8), %1 \n" \
        "movaps (%1, %0), %%xmm2 \n" \
 stereo("movaps %%xmm2, %%xmm3 \n") \
        "mulps (%4, %2, 8), %%xmm2 \n" \
 stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \
        "addps %%xmm2, %%xmm0 \n" \
 stereo("addps %%xmm3, %%xmm1 \n") \
        "add $4, %2 \n" \
        "jl 2b \n" \
        "mov %5, %2 \n" \
 stereo("mov (%6, %2, %c8), %1 \n") \
        "movaps %%xmm0, (%3, %0) \n" \
 stereo("movaps %%xmm1, (%1, %0) \n") \
        "add $16, %0 \n" \
        "jl 1b \n" \
        : "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \
        : "r"(matrix_simd + in_ch), \
          "g"((intptr_t) - 4 * (in_ch - 1)), \
          "r"(samp + in_ch), \
          "i"(sizeof(float *)), "i"(sizeof(float *)/4) \
        : "memory" \
    );

static void ac3_downmix_sse(float **samples, float (*matrix)[2],
                            int out_ch, int in_ch, int len)
{
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i, j, k, m;

    i = -len * sizeof(float);
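
    /* Note (added commentary, not in the upstream source): matrix_cmp aliases
     * the float matrix as integers, so the bitwise tests below check that the
     * unused coefficients are bit-for-bit +0.0f and that paired coefficients
     * are bit-identical, which is what the fast MIX5 paths require. */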
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0] |
          matrix_cmp[3][1] | matrix_cmp[4][0] |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
        MIX5(IF0, IF1);
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        MIX5(IF1, IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        float *samp[AC3_MAX_CHANNELS];

        for (j = 0; j < in_ch; j++)
            samp[j] = samples[j] + len;

        j = 2 * in_ch * sizeof(float);
        __asm__ volatile (
            "1: \n"
            "sub $8, %0 \n"
            "movss (%2, %0), %%xmm4 \n"
            "movss 4(%2, %0), %%xmm5 \n"
            "shufps $0, %%xmm4, %%xmm4 \n"
            "shufps $0, %%xmm5, %%xmm5 \n"
            "movaps %%xmm4, (%1, %0, 4) \n"
            "movaps %%xmm5, 16(%1, %0, 4) \n"
            "jg 1b \n"
            : "+&r"(j)
            : "r"(matrix_simd), "r"(matrix)
            : "memory"
        );
        if (out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}

#endif /* HAVE_SSE_INLINE && HAVE_7REGS */

av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
    int mm_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(mm_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
        c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
        c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
    }
    if (EXTERNAL_AMD3DNOW(mm_flags)) {
        c->extract_exponents = ff_ac3_extract_exponents_3dnow;
        if (!bit_exact) {
            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
        }
    }
    if (EXTERNAL_MMXEXT(mm_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
    }
    if (EXTERNAL_SSE(mm_flags)) {
        c->float_to_fixed24 = ff_float_to_fixed24_sse;
    }
    if (EXTERNAL_SSE2(mm_flags)) {
        c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
        c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
        c->extract_exponents = ff_ac3_extract_exponents_sse2;
        if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
            c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
            c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
        }
    }
    if (EXTERNAL_SSSE3(mm_flags)) {
        c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
        if (!(mm_flags & AV_CPU_FLAG_ATOM)) {
            c->extract_exponents = ff_ac3_extract_exponents_ssse3;
        }
    }

#if HAVE_SSE_INLINE && HAVE_7REGS
    if (INLINE_SSE(mm_flags)) {
        c->downmix = ac3_downmix_sse;
    }
#endif
}
project/jni/ffmpeg/libavcodec/x86/cabac.h (new file, 229 lines)
@@ -0,0 +1,229 @@
/*
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_CABAC_H
#define AVCODEC_X86_CABAC_H

#include "libavcodec/cabac.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/internal.h"
#include "config.h"

#if HAVE_INLINE_ASM

#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)

#if HAVE_FAST_CMOV
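/* Note (added commentary, not in the upstream source): this variant avoids
 * branches entirely; cmova keeps the MPS range when "low" is below the scaled
 * LPS boundary, and sbb builds a 0/-1 mask that conditionally adjusts "low"
 * and flips the decoded bit for the LPS case. */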
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
        "cmp "low" , "tmp" \n\t"\
        "cmova %%ecx , "range" \n\t"\
        "sbb %%rcx , %%rcx \n\t"\
        "and %%ecx , "tmp" \n\t"\
        "xor %%rcx , "retq" \n\t"\
        "sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
        /* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \
        "sub "low" , "tmp" \n\t"\
        "sar $31 , "tmp" \n\t"\
        "sub %%ecx , "range" \n\t"\
        "and "tmp" , "range" \n\t"\
        "add %%ecx , "range" \n\t"\
        "shl $17 , %%ecx \n\t"\
        "and "tmp" , %%ecx \n\t"\
        "sub %%ecx , "low" \n\t"\
        "xor "tmp" , "ret" \n\t"\
        "movslq "ret" , "retq" \n\t"
#endif /* HAVE_FAST_CMOV */

#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
        "movzbl "statep" , "ret" \n\t"\
        "mov "range" , "tmp" \n\t"\
        "and $0xC0 , "range" \n\t"\
        "lea ("ret", "range", 2), %%ecx \n\t"\
        "movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
        "sub "range" , "tmp" \n\t"\
        "mov "tmp" , %%ecx \n\t"\
        "shl $17 , "tmp" \n\t"\
        BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
        "movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
        "shl %%cl , "range" \n\t"\
        "movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
        "shl %%cl , "low" \n\t"\
        "mov "tmpbyte" , "statep" \n\t"\
        "test "lowword" , "lowword" \n\t"\
        "jnz 2f \n\t"\
        "mov "byte" , %%"REG_c" \n\t"\
        "add"OPSIZE" $2 , "byte" \n\t"\
        "movzwl (%%"REG_c") , "tmp" \n\t"\
        "lea -1("low") , %%ecx \n\t"\
        "xor "low" , %%ecx \n\t"\
        "shr $15 , %%ecx \n\t"\
        "bswap "tmp" \n\t"\
        "shr $15 , "tmp" \n\t"\
        "movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
        "sub $0xFFFF , "tmp" \n\t"\
        "neg %%ecx \n\t"\
        "add $7 , %%ecx \n\t"\
        "shl %%cl , "tmp" \n\t"\
        "add "tmp" , "low" \n\t"\
        "2: \n\t"

#else /* BROKEN_RELOCATIONS */
#define TABLES_ARG
#define RIP_ARG

#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
        "mov "tmp" , %%ecx \n\t"\
        "shl $17 , "tmp" \n\t"\
        "cmp "low" , "tmp" \n\t"\
        "cmova %%ecx , "range" \n\t"\
        "sbb %%ecx , %%ecx \n\t"\
        "and %%ecx , "tmp" \n\t"\
        "xor %%ecx , "ret" \n\t"\
        "sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
        "mov "tmp" , %%ecx \n\t"\
        "shl $17 , "tmp" \n\t"\
        "sub "low" , "tmp" \n\t"\
        "sar $31 , "tmp" \n\t" /*lps_mask*/\
        "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
        "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
        "add %%ecx , "range" \n\t" /*new range*/\
        "shl $17 , %%ecx \n\t"\
        "and "tmp" , %%ecx \n\t"\
        "sub %%ecx , "low" \n\t"\
        "xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */

#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
        "movzbl "statep" , "ret" \n\t"\
        "mov "range" , "tmp" \n\t"\
        "and $0xC0 , "range" \n\t"\
        "movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
        "sub "range" , "tmp" \n\t"\
        BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
        "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
        "shl %%cl , "range" \n\t"\
        "movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
        "shl %%cl , "low" \n\t"\
        "mov "tmpbyte" , "statep" \n\t"\
        "test "lowword" , "lowword" \n\t"\
        " jnz 2f \n\t"\
        "mov "byte" , %%"REG_c" \n\t"\
        "add"OPSIZE" $2 , "byte" \n\t"\
        "movzwl (%%"REG_c") , "tmp" \n\t"\
        "lea -1("low") , %%ecx \n\t"\
        "xor "low" , %%ecx \n\t"\
        "shr $15 , %%ecx \n\t"\
        "bswap "tmp" \n\t"\
        "shr $15 , "tmp" \n\t"\
        "movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
        "sub $0xFFFF , "tmp" \n\t"\
        "neg %%ecx \n\t"\
        "add $7 , %%ecx \n\t"\
        "shl %%cl , "tmp" \n\t"\
        "add "tmp" , "low" \n\t"\
        "2: \n\t"

#endif /* BROKEN_RELOCATIONS */


#if HAVE_7REGS && !(defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
               && !(defined(__i386) && !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
                                                 uint8_t *const state)
{
    int bit, tmp;
#ifdef BROKEN_RELOCATIONS
    void *tables;

    __asm__ volatile(
        "lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
        : "=&r"(tables)
    );
#endif

    __asm__ volatile(
        BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
                             "%2", "%q2", "%3", "%b3",
                             "%c6(%5)", "%c7(%5)",
                             AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
                             AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
                             AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
                             "%8")
        : "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
        : "r"(state), "r"(c),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end))
          TABLES_ARG
          ,"1"(c->low), "2"(c->range)
        : "%"REG_c, "memory"
    );
    return bit & 1;
}
#endif /* HAVE_7REGS */

#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
    x86_reg tmp;
    __asm__ volatile(
        "movl %c6(%2), %k1 \n\t"
        "movl %c3(%2), %%eax \n\t"
        "shl $17, %k1 \n\t"
        "add %%eax, %%eax \n\t"
        "sub %k1, %%eax \n\t"
        "cltd \n\t"
        "and %%edx, %k1 \n\t"
        "add %k1, %%eax \n\t"
        "xor %%edx, %%ecx \n\t"
        "sub %%edx, %%ecx \n\t"
        "test %%ax, %%ax \n\t"
        "jnz 1f \n\t"
        "mov %c4(%2), %1 \n\t"
        "subl $0xFFFF, %%eax \n\t"
        "movzwl (%1), %%edx \n\t"
        "bswap %%edx \n\t"
        "shrl $15, %%edx \n\t"
        "add $2, %1 \n\t"
        "addl %%edx, %%eax \n\t"
        "mov %1, %c4(%2) \n\t"
        "1: \n\t"
        "movl %%eax, %c3(%2) \n\t"

        : "+c"(val), "=&r"(tmp)
        : "r"(c),
          "i"(offsetof(CABACContext, low)),
          "i"(offsetof(CABACContext, bytestream)),
          "i"(offsetof(CABACContext, bytestream_end)),
          "i"(offsetof(CABACContext, range))
        : "%eax", "%edx", "memory"
    );
    return val;
}

#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */
project/jni/ffmpeg/libavcodec/x86/cavsdsp.c (new file, 505 lines)
@@ -0,0 +1,505 @@
/*
 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
 * Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
 *
 * MMX-optimized DSP functions, based on H.264 optimizations by
 * Michael Niedermayer and Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/cavsdsp.h"
#include "dsputil_mmx.h"
#include "config.h"

#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)

/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "psubw "#a", "#b" \n\t"

/*****************************************************************************
 *
 * inverse transform
 *
 ****************************************************************************/

static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
    __asm__ volatile(
        "movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */
        "movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */
        "movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */
        "movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */
        "movq %%mm4, %%mm0 \n\t"
        "movq %%mm5, %%mm3 \n\t"
        "movq %%mm2, %%mm6 \n\t"
        "movq %%mm7, %%mm1 \n\t"

        "paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */
        "paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */
        "paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */
        "paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */
        "paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */
        "paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */
        "paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */
        "paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */
        "psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
        "paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
        "psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
        "paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */

        "movq %%mm5, %%mm4 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "movq %%mm3, %%mm0 \n\t"
        "movq %%mm1, %%mm2 \n\t"
        SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */
        "paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */
        "paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */
        "paddw %%mm7, %%mm7 \n\t"
        "paddw %%mm5, %%mm5 \n\t"
        "paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */
        "paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */

        SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */
        "psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */
        "movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */
        "psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */
        "paddw %%mm1, %%mm1 \n\t"
        "paddw %%mm3, %%mm3 \n\t"
        "psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */
        "paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */

        "movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */
        "movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */
        "movq %%mm2, %%mm4 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */
        "psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */
        "paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */
        "paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */
        "paddw %%mm2, %%mm2 \n\t"
        "paddw %%mm0, %%mm0 \n\t"
        "psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
        "paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */

        "movq (%0), %%mm2 \n\t" /* mm2 = src0 */
        "movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */
        SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */
        "psllw $3, %%mm0 \n\t"
        "psllw $3, %%mm2 \n\t"
        "paddw %1, %%mm0 \n\t" /* add rounding bias */
        "paddw %1, %%mm2 \n\t" /* add rounding bias */

        SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */
        SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */
        SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */
        SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */
        SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */
        SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */
        :: "r"(block), "m"(bias)
    );
}

static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    DECLARE_ALIGNED(8, int16_t, b2)[64];

    for(i=0; i<2; i++){
        DECLARE_ALIGNED(8, uint64_t, tmp);

        cavs_idct8_1d(block+4*i, ff_pw_4.a);

        __asm__ volatile(
            "psraw $3, %%mm7 \n\t"
            "psraw $3, %%mm6 \n\t"
            "psraw $3, %%mm5 \n\t"
            "psraw $3, %%mm4 \n\t"
            "psraw $3, %%mm3 \n\t"
            "psraw $3, %%mm2 \n\t"
            "psraw $3, %%mm1 \n\t"
            "psraw $3, %%mm0 \n\t"
            "movq %%mm7, %0 \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq %%mm0, 8(%1) \n\t"
            "movq %%mm6, 24(%1) \n\t"
            "movq %%mm7, 40(%1) \n\t"
            "movq %%mm4, 56(%1) \n\t"
            "movq %0, %%mm7 \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq %%mm7, (%1) \n\t"
            "movq %%mm1, 16(%1) \n\t"
            "movq %%mm0, 32(%1) \n\t"
            "movq %%mm3, 48(%1) \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    for(i=0; i<2; i++){
        cavs_idct8_1d(b2+4*i, ff_pw_64.a);

        __asm__ volatile(
            "psraw $7, %%mm7 \n\t"
            "psraw $7, %%mm6 \n\t"
            "psraw $7, %%mm5 \n\t"
            "psraw $7, %%mm4 \n\t"
            "psraw $7, %%mm3 \n\t"
            "psraw $7, %%mm2 \n\t"
            "psraw $7, %%mm1 \n\t"
            "psraw $7, %%mm0 \n\t"
            "movq %%mm7, (%0) \n\t"
            "movq %%mm5, 16(%0) \n\t"
            "movq %%mm3, 32(%0) \n\t"
            "movq %%mm1, 48(%0) \n\t"
            "movq %%mm0, 64(%0) \n\t"
            "movq %%mm2, 80(%0) \n\t"
            "movq %%mm4, 96(%0) \n\t"
            "movq %%mm6, 112(%0) \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    ff_add_pixels_clamped_mmx(b2, dst, stride);
}

/*****************************************************************************
 *
 * motion compensation
 *
 ****************************************************************************/

/* vertical filter [-1 -2 96 42 -7 0] */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F" \n\t"\
        "movq "#C", %%mm6 \n\t"\
        "pmullw %5, %%mm6 \n\t"\
        "movq "#D", %%mm7 \n\t"\
        "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
        "psllw $3, "#E" \n\t"\
        "psubw "#E", %%mm6 \n\t"\
        "psraw $3, "#E" \n\t"\
        "paddw %%mm7, %%mm6 \n\t"\
        "paddw "#E", %%mm6 \n\t"\
        "paddw "#B", "#B" \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm7, "#F" \n\t"\
        "psubw "#B", %%mm6 \n\t"\
        "psraw $1, "#B" \n\t"\
        "psubw "#A", %%mm6 \n\t"\
        "paddw %4, %%mm6 \n\t"\
        "psraw $7, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm6 \n\t"\
        OP(%%mm6, (%1), A, d) \
        "add %3, %1 \n\t"

/* vertical filter [ 0 -1 5 5 -1 0] */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F" \n\t"\
        "movq "#C", %%mm6 \n\t"\
        "paddw "#D", %%mm6 \n\t"\
        "pmullw %5, %%mm6 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm7, "#F" \n\t"\
        "psubw "#B", %%mm6 \n\t"\
        "psubw "#E", %%mm6 \n\t"\
        "paddw %4, %%mm6 \n\t"\
        "psraw $3, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm6 \n\t"\
        OP(%%mm6, (%1), A, d) \
        "add %3, %1 \n\t"

/* vertical filter [ 0 -7 42 96 -2 -1] */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F" \n\t"\
        "movq "#C", %%mm6 \n\t"\
        "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
        "movq "#D", %%mm7 \n\t"\
        "pmullw %5, %%mm7 \n\t"\
        "psllw $3, "#B" \n\t"\
        "psubw "#B", %%mm6 \n\t"\
        "psraw $3, "#B" \n\t"\
        "paddw %%mm7, %%mm6 \n\t"\
        "paddw "#B", %%mm6 \n\t"\
        "paddw "#E", "#E" \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm7, "#F" \n\t"\
        "psubw "#E", %%mm6 \n\t"\
        "psraw $1, "#E" \n\t"\
        "psubw "#F", %%mm6 \n\t"\
        "paddw %4, %%mm6 \n\t"\
        "psraw $7, %%mm6 \n\t"\
        "packuswb %%mm6, %%mm6 \n\t"\
        OP(%%mm6, (%1), A, d) \
        "add %3, %1 \n\t"


#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7 \n\t"\
            "movd (%0), %%mm0 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm1 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm2 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm3 \n\t"\
            "add %2, %0 \n\t"\
            "movd (%0), %%mm4 \n\t"\
            "add %2, %0 \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
            : "memory"\
        );\
        if(h==16){\
            __asm__ volatile(\
                VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
                VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
                VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
                VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
                VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
                VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
                VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
                VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
                \
                : "+a"(src), "+c"(dst)\
                : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
                : "memory"\
            );\
        }\
        src += 4-(h+5)*srcStride;\
        dst += 4-h*dstStride;\
    }

#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "movq %5, %%mm6 \n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 1(%0), %%mm2 \n\t"\
        "movq %%mm0, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "paddw %%mm2, %%mm0 \n\t"\
        "paddw %%mm3, %%mm1 \n\t"\
        "pmullw %%mm6, %%mm0 \n\t"\
        "pmullw %%mm6, %%mm1 \n\t"\
        "movq -1(%0), %%mm2 \n\t"\
        "movq 2(%0), %%mm4 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "movq %%mm4, %%mm5 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm7, %%mm5 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm3, %%mm5 \n\t"\
        "psubw %%mm2, %%mm0 \n\t"\
        "psubw %%mm5, %%mm1 \n\t"\
        "movq %6, %%mm5 \n\t"\
        "paddw %%mm5, %%mm0 \n\t"\
        "paddw %%mm5, %%mm1 \n\t"\
        "psraw $3, %%mm0 \n\t"\
        "psraw $3, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        OP(%%mm0, (%1),%%mm5, q) \
        "add %3, %0 \n\t"\
        "add %4, %1 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
        : "memory"\
    );\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define CAVS_MC(OPNAME, SIZE, MMX) \
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgusb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"
#define AVG_MMXEXT_OP(a, b, temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"

#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */

#if HAVE_MMXEXT_INLINE
QPEL_CAVS(put_, PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)

CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)

static void ff_cavsdsp_init_mmxext(CAVSDSPContext *c, AVCodecContext *avctx)
{
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmxext; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmxext; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmxext; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmxext; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
    c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
}
#endif /* HAVE_MMXEXT_INLINE */

#if HAVE_AMD3DNOW_INLINE
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)

CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)

static void ff_cavsdsp_init_3dnow(CAVSDSPContext* c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmxext; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
    c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
}
#endif /* HAVE_AMD3DNOW_INLINE */

av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_MMXEXT_INLINE
    if (mm_flags & AV_CPU_FLAG_MMXEXT) ff_cavsdsp_init_mmxext(c, avctx);
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
    if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
}
project/jni/ffmpeg/libavcodec/x86/dct32.asm (new file, 490 lines)
@@ -0,0 +1,490 @@
|
||||
;******************************************************************************
|
||||
;* 32 point SSE-optimized DCT transform
|
||||
;* Copyright (c) 2010 Vitor Sessak
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA 32
align 32
ps_cos_vec: dd   0.500603,  0.505471,  0.515447,  0.531043
            dd   0.553104,  0.582935,  0.622504,  0.674808
            dd -10.190008, -3.407609, -2.057781, -1.484165
            dd  -1.169440, -0.972568, -0.839350, -0.744536
            dd   0.502419,  0.522499,  0.566944,  0.646822
            dd   0.788155,  1.060678,  1.722447,  5.101149
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   0.509796,  0.601345,  0.899976,  2.562916
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  1.000000,  1.306563,  0.541196
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   1.000000,  0.707107,  1.000000, -0.707107
            dd   0.707107,  0.707107,  0.707107,  0.707107

align 32
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
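
; BUTTERFLY a, b, scale, tmp: computes b += a and a = (a - old_b) * scale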
%macro BUTTERFLY 4
    subps  %4, %1, %2
    addps  %2, %2, %1
    mulps  %1, %4, %3
%endmacro

%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
    pshufd %4, %1, %5
    xorps  %1, %2
    addps  %1, %4
    mulps  %1, %3
%else
    shufps %4, %1, %1, %5
    xorps  %1, %1, %2
    addps  %4, %4, %1
    mulps  %1, %4, %3
%endif
%endmacro

%macro BUTTERFLY2 4
    BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro

%macro BUTTERFLY3 4
    BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro

%macro BUTTERFLY3V 5
    movaps m%5, m%1
    addps  m%1, m%2
    subps  m%5, m%2
    SWAP %2, %5
    mulps  m%2, [ps_cos_vec+192]
    movaps m%5, m%3
    addps  m%3, m%4
    subps  m%4, m%5
    mulps  m%4, [ps_cos_vec+192]
%endmacro
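
; PASS6_AND_PERMUTE: final pass done in scalar code; it finishes the
; remaining sums with movss/addss and stores the 32 outputs permuted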
%macro PASS6_AND_PERMUTE 0
    mov    tmpd, [outq+4]
    movss  m7, [outq+72]
    addss  m7, [outq+76]
    movss  m3, [outq+56]
    addss  m3, [outq+60]
    addss  m4, m3
    movss  m2, [outq+52]
    addss  m2, m3
    movss  m3, [outq+104]
    addss  m3, [outq+108]
    addss  m1, m3
    addss  m5, m4
    movss  [outq+ 16], m1
    movss  m1, [outq+100]
    addss  m1, m3
    movss  m3, [outq+40]
    movss  [outq+ 48], m1
    addss  m3, [outq+44]
    movss  m1, [outq+100]
    addss  m4, m3
    addss  m3, m2
    addss  m1, [outq+108]
    movss  [outq+ 40], m3
    addss  m2, [outq+36]
    movss  m3, [outq+8]
    movss  [outq+ 56], m2
    addss  m3, [outq+12]
    movss  [outq+ 32], m3
    movss  m3, [outq+80]
    movss  [outq+ 8], m5
    movss  [outq+ 80], m1
    movss  m2, [outq+52]
    movss  m5, [outq+120]
    addss  m5, [outq+124]
    movss  m1, [outq+64]
    addss  m2, [outq+60]
    addss  m0, m5
    addss  m5, [outq+116]
    mov    [outq+64], tmpd
    addss  m6, m0
    addss  m1, m6
    mov    tmpd, [outq+12]
    mov    [outq+ 96], tmpd
    movss  [outq+  4], m1
    movss  m1, [outq+24]
    movss  [outq+ 24], m4
    movss  m4, [outq+88]
    addss  m4, [outq+92]
    addss  m3, m4
    addss  m4, [outq+84]
    mov    tmpd, [outq+108]
    addss  m1, [outq+28]
    addss  m0, m1
    addss  m1, m5
    addss  m6, m3
    addss  m3, m0
    addss  m0, m7
    addss  m5, [outq+20]
    addss  m7, m1
    movss  [outq+ 12], m6
    mov    [outq+112], tmpd
    movss  m6, [outq+28]
    movss  [outq+ 28], m0
    movss  m0, [outq+36]
    movss  [outq+ 36], m7
    addss  m1, m4
    movss  m7, [outq+116]
    addss  m0, m2
    addss  m7, [outq+124]
    movss  [outq+ 72], m0
    movss  m0, [outq+44]
    addss  m2, m0
    movss  [outq+ 44], m1
    movss  [outq+ 88], m2
    addss  m0, [outq+60]
    mov    tmpd, [outq+60]
    mov    [outq+120], tmpd
    movss  [outq+104], m0
    addss  m4, m5
    addss  m5, [outq+68]
    movss  [outq+52], m4
    movss  [outq+60], m5
    movss  m4, [outq+68]
    movss  m5, [outq+20]
    movss  [outq+ 20], m3
    addss  m5, m7
    addss  m7, m6
    addss  m4, m5
    movss  m2, [outq+84]
    addss  m2, [outq+92]
    addss  m5, m2
    movss  [outq+ 68], m4
    addss  m2, m7
    movss  m4, [outq+76]
    movss  [outq+ 84], m2
    movss  [outq+ 76], m5
    addss  m7, m4
    addss  m6, [outq+124]
    addss  m4, m6
    addss  m6, [outq+92]
    movss  [outq+100], m4
    movss  [outq+108], m6
    movss  m6, [outq+92]
    movss  [outq+92], m7
    addss  m6, [outq+124]
    movss  [outq+116], m6
%endmacro

INIT_YMM avx
SECTION_TEXT
%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
    ; pass 1
    vmovaps     m4, [inq+0]
    vinsertf128 m5, m5, [inq+96], 1
    vinsertf128 m5, m5, [inq+112], 0
    vshufps     m5, m5, m5, 0x1b
    BUTTERFLY   m4, m5, [ps_cos_vec], m6

    vmovaps     m2, [inq+64]
    vinsertf128 m6, m6, [inq+32], 1
    vinsertf128 m6, m6, [inq+48], 0
    vshufps     m6, m6, m6, 0x1b
    BUTTERFLY   m2, m6, [ps_cos_vec+32], m0

    ; pass 2

    BUTTERFLY   m5, m6, [ps_cos_vec+64], m0
    BUTTERFLY   m4, m2, [ps_cos_vec+64], m7

    ; pass 3
    vperm2f128  m3, m6, m4, 0x31
    vperm2f128  m1, m6, m4, 0x20
    vshufps     m3, m3, m3, 0x1b

    BUTTERFLY   m1, m3, [ps_cos_vec+96], m6

    vperm2f128  m4, m5, m2, 0x20
    vperm2f128  m5, m5, m2, 0x31
    vshufps     m5, m5, m5, 0x1b

    BUTTERFLY   m4, m5, [ps_cos_vec+96], m6

    ; pass 4
    vmovaps     m6, [ps_p1p1m1m1+0]
    vmovaps     m2, [ps_cos_vec+128]

    BUTTERFLY2  m5, m6, m2, m7
    BUTTERFLY2  m4, m6, m2, m7
    BUTTERFLY2  m1, m6, m2, m7
    BUTTERFLY2  m3, m6, m2, m7

    ; pass 5
    vshufps     m6, m6, m6, 0xcc
    vmovaps     m2, [ps_cos_vec+160]

    BUTTERFLY3  m5, m6, m2, m7
    BUTTERFLY3  m4, m6, m2, m7
    BUTTERFLY3  m1, m6, m2, m7
    BUTTERFLY3  m3, m6, m2, m7

    vperm2f128  m6, m3, m3, 0x31
    vmovaps     [outq], m3

    vextractf128 [outq+64], m5, 1
    vextractf128 [outq+32], m5, 0

    vextractf128 [outq+80], m4, 1
    vextractf128 [outq+48], m4, 0

    vperm2f128  m0, m1, m1, 0x31
    vmovaps     [outq+96], m1

    vzeroupper

    ; pass 6, no SIMD...
INIT_XMM
    PASS6_AND_PERMUTE
    RET
%endif
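
; on x86_64 all 16 XMM registers are available, so SPILL/UNSPILL are plain
; register SWAPs; the 32-bit build below spills to the out[] buffer instead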
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP

%macro PASS5 0
    nop ; FIXME code alignment
    SWAP 5, 8
    SWAP 4, 12
    SWAP 6, 14
    SWAP 7, 13
    SWAP 0, 15
    PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
    TRANSPOSE4x4PS 8, 9, 10, 11, 0
    BUTTERFLY3V    8, 9, 10, 11, 0
    addps   m10, m11
    TRANSPOSE4x4PS 12, 13, 14, 15, 0
    BUTTERFLY3V    12, 13, 14, 15, 0
    addps   m14, m15
    addps   m12, m14
    addps   m14, m13
    addps   m13, m15
%endmacro

%macro PASS6 0
    SWAP 9, 12
    SWAP 11, 14
    movss   [outq+0x00], m8
    pshuflw m0, m8, 0xe
    movss   [outq+0x10], m9
    pshuflw m1, m9, 0xe
    movss   [outq+0x20], m10
    pshuflw m2, m10, 0xe
    movss   [outq+0x30], m11
    pshuflw m3, m11, 0xe
    movss   [outq+0x40], m12
    pshuflw m4, m12, 0xe
    movss   [outq+0x50], m13
    pshuflw m5, m13, 0xe
    movss   [outq+0x60], m14
    pshuflw m6, m14, 0xe
    movaps  [outq+0x70], m15
    pshuflw m7, m15, 0xe
    addss   m0, m1
    addss   m1, m2
    movss   [outq+0x08], m0
    addss   m2, m3
    movss   [outq+0x18], m1
    addss   m3, m4
    movss   [outq+0x28], m2
    addss   m4, m5
    movss   [outq+0x38], m3
    addss   m5, m6
    movss   [outq+0x48], m4
    addss   m6, m7
    movss   [outq+0x58], m5
    movss   [outq+0x68], m6
    movss   [outq+0x78], m7

    PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
    movhlps m0, m1
    pshufd  m1, m1, 3
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
    movhlps m0, m1
    pshufd  m1, m1, 3
    addss   m15, m1
    SWAP 0, 2, 4, 6, 8, 10, 12, 14
    SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
%assign i 4
%rep 15
    addss m0, m1
    movss [outq+i], m0
    SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%assign i i+8
%endrep
%endmacro

%else ; ARCH_X86_32
%macro SPILL 2 ; xmm#, mempos
    movaps [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
    movaps m%1, [outq+(%2-8)*16]
%endmacro

%define PASS6 PASS6_AND_PERMUTE
%macro PASS5 0
    movaps  m2, [ps_cos_vec+160]
    shufps  m3, m3, 0xcc

    BUTTERFLY3 m5, m3, m2, m1
    SPILL 5, 8

    UNSPILL 1, 9
    BUTTERFLY3 m1, m3, m2, m5
    SPILL 1, 14

    BUTTERFLY3 m4, m3, m2, m5
    SPILL 4, 12

    BUTTERFLY3 m7, m3, m2, m5
    SPILL 7, 13

    UNSPILL 5, 10
    BUTTERFLY3 m5, m3, m2, m7
    SPILL 5, 10

    UNSPILL 4, 11
    BUTTERFLY3 m4, m3, m2, m7
    SPILL 4, 11

    BUTTERFLY3 m6, m3, m2, m7
    SPILL 6, 9

    BUTTERFLY3 m0, m3, m2, m7
    SPILL 0, 15
%endmacro
%endif

; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
    ; pass 1

    movaps    m0, [inq+0]
    LOAD_INV  m1, [inq+112]
    BUTTERFLY m0, m1, [ps_cos_vec], m3

    movaps    m7, [inq+64]
    LOAD_INV  m4, [inq+48]
    BUTTERFLY m7, m4, [ps_cos_vec+32], m3

    ; pass 2
    movaps    m2, [ps_cos_vec+64]
    BUTTERFLY m1, m4, m2, m3
    SPILL 1, 11
    SPILL 4, 8

    ; pass 1
    movaps    m1, [inq+16]
    LOAD_INV  m6, [inq+96]
    BUTTERFLY m1, m6, [ps_cos_vec+16], m3

    movaps    m4, [inq+80]
    LOAD_INV  m5, [inq+32]
    BUTTERFLY m4, m5, [ps_cos_vec+48], m3

    ; pass 2
    BUTTERFLY m0, m7, m2, m3

    movaps    m2, [ps_cos_vec+80]
    BUTTERFLY m6, m5, m2, m3

    BUTTERFLY m1, m4, m2, m3

    ; pass 3
    movaps    m2, [ps_cos_vec+96]
    shufps    m1, m1, 0x1b
    BUTTERFLY m0, m1, m2, m3
    SPILL 0, 15
    SPILL 1, 14

    UNSPILL 0, 8
    shufps    m5, m5, 0x1b
    BUTTERFLY m0, m5, m2, m3

    UNSPILL 1, 11
    shufps    m6, m6, 0x1b
    BUTTERFLY m1, m6, m2, m3
    SPILL 1, 11

    shufps    m4, m4, 0x1b
    BUTTERFLY m7, m4, m2, m3

    ; pass 4
    movaps    m3, [ps_p1p1m1m1+0]
    movaps    m2, [ps_cos_vec+128]

    BUTTERFLY2 m5, m3, m2, m1

    BUTTERFLY2 m0, m3, m2, m1
    SPILL 0, 9

    BUTTERFLY2 m6, m3, m2, m1
    SPILL 6, 10

    UNSPILL 0, 11
    BUTTERFLY2 m0, m3, m2, m1
    SPILL 0, 11

    BUTTERFLY2 m4, m3, m2, m1

    BUTTERFLY2 m7, m3, m2, m1

    UNSPILL 6, 14
    BUTTERFLY2 m6, m3, m2, m1

    UNSPILL 0, 15
    BUTTERFLY2 m0, m3, m2, m1

    PASS5
    PASS6
    RET
%endmacro

%macro LOAD_INV 2
%if cpuflag(sse2)
    pshufd %1, %2, 0x1b
%elif cpuflag(sse)
    movaps %1, %2
    shufps %1, %1, 0x1b
%endif
%endmacro

INIT_XMM sse
DCT32_FUNC
INIT_XMM sse2
DCT32_FUNC
82
project/jni/ffmpeg/libavcodec/x86/deinterlace.asm
Normal file
@@ -0,0 +1,82 @@
;******************************************************************************
;* MMX optimized deinterlacing functions
;* Copyright (c) 2010 Vitor Sessak
;* Copyright (c) 2002 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_4

SECTION .text

%macro DEINTERLACE 1
%ifidn %1, inplace
;void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
cglobal deinterlace_line_inplace_mmx, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
%else
;void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
%endif
    pxor  mm7, mm7
    movq  mm6, [pw_4]
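    ; per pixel: out = ((4*lum_m3 + 2*lum_m2 + 4*lum_m1 + 4) - (lum_m4 + lum)) >> 3,
    ; with the subtraction saturating at 0 and the final pack clamping to 255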
.nextrow:
    movd  mm0, [lum_m4q]
    movd  mm1, [lum_m3q]
    movd  mm2, [lum_m2q]
%ifidn %1, inplace
    movd  [lum_m4q], mm2
%endif
    movd  mm3, [lum_m1q]
    movd  mm4, [lumq]
    punpcklbw mm0, mm7
    punpcklbw mm1, mm7
    punpcklbw mm2, mm7
    punpcklbw mm3, mm7
    punpcklbw mm4, mm7
    paddw mm1, mm3
    psllw mm2, 1
    paddw mm0, mm4
    psllw mm1, 2
    paddw mm2, mm6
    paddw mm1, mm2
    psubusw mm1, mm0
    psrlw mm1, 3
    packuswb mm1, mm7
%ifidn %1, inplace
    movd  [lum_m2q], mm1
%else
    movd  [dstq], mm1
    add   dstq, 4
%endif
    add   lum_m4q, 4
    add   lum_m3q, 4
    add   lum_m2q, 4
    add   lum_m1q, 4
    add   lumq, 4
    sub   sized, 4
    jg    .nextrow
    REP_RET
%endmacro

DEINTERLACE ""

DEINTERLACE inplace
101
project/jni/ffmpeg/libavcodec/x86/diracdsp_mmx.c
Normal file
@@ -0,0 +1,101 @@
/*
 * Copyright (C) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil_mmx.h"
#include "diracdsp_mmx.h"

void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);

#define HPEL_FILTER(MMSIZE, EXT) \
    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
    \
    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
                                          const uint8_t *src, int stride, int width, int height) \
    { \
        while( height-- ) \
        { \
            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
            ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
            \
            dsth += stride; \
            dstv += stride; \
            dstc += stride; \
            src  += stride; \
        } \
    }
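
/* the vertical pass starts MMSIZE bytes early and runs width+MMSIZE+5 wide so
 * that the two horizontal passes (which read src[-3]..src[4]) stay inside
 * filtered data; dstc is the horizontal filter applied to the vertical result */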
#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)

#define PIXFUNC(PFX, IDX, EXT) \
    /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
    c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
    c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT

void ff_diracdsp_init_mmx(DiracDSPContext* c)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_YASM
    c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
#if !ARCH_X86_64
    c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
    c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
    c->dirac_hpel_filter = dirac_hpel_filter_mmx;
    c->add_rect_clamped = ff_add_rect_clamped_mmx;
    c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
#endif
#endif

#if HAVE_MMX_INLINE
    PIXFUNC(put, 0, mmx);
    PIXFUNC(avg, 0, mmx);
#endif

#if HAVE_MMXEXT_INLINE
    if (mm_flags & AV_CPU_FLAG_MMX2) {
        PIXFUNC(avg, 0, mmxext);
    }
#endif

    if (mm_flags & AV_CPU_FLAG_SSE2) {
#if HAVE_YASM
        c->dirac_hpel_filter = dirac_hpel_filter_sse2;
        c->add_rect_clamped = ff_add_rect_clamped_sse2;
        c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;

        c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
        c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
#endif
#if HAVE_SSE2_INLINE
        c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
        c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
        c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
        c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
#endif
    }
}
47
project/jni/ffmpeg/libavcodec/x86/diracdsp_mmx.h
Normal file
@@ -0,0 +1,47 @@
/*
 * Copyright (c) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_DIRACDSP_H
#define AVCODEC_X86_DIRACDSP_H

#include "libavcodec/diracdsp.h"

void ff_diracdsp_init_mmx(DiracDSPContext* c);

DECL_DIRAC_PIXOP(put, mmx);
DECL_DIRAC_PIXOP(avg, mmx);
DECL_DIRAC_PIXOP(avg, mmxext);

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);

void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);

void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);

void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);

#endif
260
project/jni/ffmpeg/libavcodec/x86/diracdsp_yasm.asm
Normal file
@@ -0,0 +1,260 @@
;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_3: times 8 dw 3
pw_7: times 8 dw 7
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pb_128: times 16 db 128

section .text
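
; UNPACK_ADD lo, hi, a, b, movA, movB: widen the bytes of a and b to words and
; leave a+b in lo (low eight bytes) and hi (high eight bytes); clobbers m4/m5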
%macro UNPACK_ADD 6
    mov%5     %1, %3
    mov%6     m5, %4
    mova      m4, %1
    mova      %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw     %1, m5
    paddw     %2, m4
%endmacro

%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov   src0q, srcq
    lea   stridex3q, [3*strideq]
    sub   src0q, stridex3q
    pxor  m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw m0, [pw_7]
    pmullw m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw  m0, m2
    paddw  m1, m3
    pmullw m0, [pw_3]
    pmullw m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw m2, [pw_7]
    pmullw m3, [pw_7]
    psubw  m0, m2
    psubw  m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw  m0, m2
    psubw  m1, m3

    paddw  m0, [pw_16]
    paddw  m1, [pw_16]
    psraw  m0, 5
    psraw  m1, 5
    packuswb m0, m1
    mova   [dstq], m0
    add    dstq, mmsize
    add    srcq, mmsize
    add    src0q, mmsize
    sub    widthd, mmsize
    jg     .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec    widthd
    pxor   m7, m7
    and    widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw m0, [pw_7]
    pmullw m1, [pw_7]

    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw  m0, m2
    paddw  m1, m3
    pmullw m0, [pw_3]
    pmullw m1, [pw_3]

    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw m2, [pw_7]
    pmullw m3, [pw_7]
    psubw  m0, m2
    psubw  m1, m3

    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw  m0, m2
    psubw  m1, m3

    paddw  m0, [pw_16]
    paddw  m1, [pw_16]
    psraw  m0, 5
    psraw  m1, 5
    packuswb m0, m1
    mova   [dstq + widthq], m0
    sub    widthd, mmsize
    jge    .loop
    RET
%endmacro

%macro PUT_RECT 1
; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova    m0, [pb_128]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    mov   r7d, r5m
    mov   r8d, wd
    %define wspill r8d
    %define hd r7d
%else
    mov   r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    lea     src2q, [srcq+src_strideq*2]
    lea     dst2q, [dstq+dst_strideq]
.loopx:
    sub     wd, mmsize
    mova    m1, [srcq +2*wq]
    mova    m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb   m1, m0
    paddb   m2, m0
    mova    [dstq +wq], m1
    mova    [dst2q+wq], m2
    jg      .loopx

    lea     srcq, [srcq+src_strideq*4]
    lea     dstq, [dstq+dst_strideq*2]
    sub     hd, 2
    mov     wd, wspill
    jg      .loopy
    RET
%endm

%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova    m0, [pw_32]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    mov   r8d, wd
    %define wspill r8d
%else
    mov   r5m, wd
    %define wspill r5m
%endif

.loop:
    sub     wd, mmsize
    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw   m1, m0
    psraw   m1, 6
    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw   m2, m0
    psraw   m2, 6
    paddw   m1, [idwtq+2*wq]
    paddw   m2, [idwtq+2*wq+mmsize]
    packuswb m1, m2
    mova    [dstq +wq], m1
    jg      .loop

    lea     srcq, [srcq + 2*strideq]
    add     dstq, strideq
    lea     idwtq, [idwtq+ 2*idwt_strideq]
    sub     hd, 1
    mov     wd, wspill
    jg      .loop
    RET
%endm
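
; ADD_OBMC: dst[x] += src[x] * obmc_weight[x] over yblen rows; bytes are
; widened to words, weighted and accumulated into the 16-bit dst plane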
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor        m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova        m0, [srcq+i]
    mova        m1, m0
    punpcklbw   m0, m4
    punpckhbw   m1, m4
    mova        m2, [obmcq+i]
    mova        m3, m2
    punpcklbw   m2, m4
    punpckhbw   m3, m4
    pmullw      m0, m2
    pmullw      m1, m3
    movu        m2, [dstq+2*i]
    movu        m3, [dstq+2*i+mmsize]
    paddw       m0, m2
    paddw       m1, m3
    movu        [dstq+2*i], m0
    movu        [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea         srcq, [srcq+strideq]
    lea         dstq, [dstq+2*strideq]
    add         obmcq, 32
    sub         yblend, 1
    jg          .loop
    RET
%endm

INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx

HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2

HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2
65
project/jni/ffmpeg/libavcodec/x86/dnxhdenc.c
Normal file
@@ -0,0 +1,65 @@
/*
 * VC3/DNxHD SIMD functions
 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 *
 * VC-3 encoder funded by the British Broadcasting Corporation
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/asm.h"
#include "libavcodec/dnxhdenc.h"

#if HAVE_SSE2_INLINE

static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
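    /* load four rows, zero-extend their bytes to 16-bit coefficients and
     * mirror them (rows 4..7 = rows 3..0) to fill the 8x8 block symmetrically */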
    __asm__ volatile(
        "pxor %%xmm5, %%xmm5\n\t"
        "movq (%0), %%xmm0\n\t"
        "add %2, %0\n\t"
        "movq (%0), %%xmm1\n\t"
        "movq (%0, %2), %%xmm2\n\t"
        "movq (%0, %2,2), %%xmm3\n\t"
        "punpcklbw %%xmm5, %%xmm0\n\t"
        "punpcklbw %%xmm5, %%xmm1\n\t"
        "punpcklbw %%xmm5, %%xmm2\n\t"
        "punpcklbw %%xmm5, %%xmm3\n\t"
        "movdqa %%xmm0, (%1)\n\t"
        "movdqa %%xmm1, 16(%1)\n\t"
        "movdqa %%xmm2, 32(%1)\n\t"
        "movdqa %%xmm3, 48(%1)\n\t"
        "movdqa %%xmm3, 64(%1)\n\t"
        "movdqa %%xmm2, 80(%1)\n\t"
        "movdqa %%xmm1, 96(%1)\n\t"
        "movdqa %%xmm0, 112(%1)\n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size)
    );
}

#endif /* HAVE_SSE2_INLINE */

void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
#if HAVE_SSE2_INLINE
    if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2) {
        if (ctx->cid_table->bit_depth == 8)
            ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
    }
#endif /* HAVE_SSE2_INLINE */
}
1022
project/jni/ffmpeg/libavcodec/x86/dsputil.asm
Normal file
File diff suppressed because it is too large
855
project/jni/ffmpeg/libavcodec/x86/dsputil_avg_template.c
Normal file
@@ -0,0 +1,855 @@
/*
 * DSP utils : average functions are compiled twice for 3dnow/mmxext
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
   clobber bug - now it will work with 2.95.2 and also with -fPIC
 */
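
/* put_pixels8_x2: dst = rounded avg(src[x], src[x+1]) on each line, i.e. the
 * horizontal half-pel case; the loop is unrolled to four lines per pass */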
static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "movq (%1, %3), %%mm1\n\t"
        PAVGB" 1(%1), %%mm0\n\t"
        PAVGB" 1(%1, %3), %%mm1\n\t"
        "movq %%mm0, (%2)\n\t"
        "movq %%mm1, (%2, %3)\n\t"
        "add %%"REG_a", %1\n\t"
        "add %%"REG_a", %2\n\t"
        "movq (%1), %%mm0\n\t"
        "movq (%1, %3), %%mm1\n\t"
        PAVGB" 1(%1), %%mm0\n\t"
        PAVGB" 1(%1, %3), %%mm1\n\t"
        "add %%"REG_a", %1\n\t"
        "movq %%mm0, (%2)\n\t"
        "movq %%mm1, (%2, %3)\n\t"
        "add %%"REG_a", %2\n\t"
        "subl $4, %0\n\t"
        "jnz 1b\n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}

#ifndef SKIP_FOR_3DNOW
static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "testl $1, %0\n\t"
        " jz 1f\n\t"
        "movq (%1), %%mm0\n\t"
        "movq (%2), %%mm1\n\t"
        "add %4, %1\n\t"
        "add $8, %2\n\t"
        PAVGB" %%mm1, %%mm0\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        "decl %0\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "add %4, %1\n\t"
        "movq (%1), %%mm1\n\t"
        "add %4, %1\n\t"
        PAVGB" (%2), %%mm0\n\t"
        PAVGB" 8(%2), %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        "movq %%mm1, (%3)\n\t"
        "add %5, %3\n\t"
        "movq (%1), %%mm0\n\t"
        "add %4, %1\n\t"
        "movq (%1), %%mm1\n\t"
        "add %4, %1\n\t"
        PAVGB" 16(%2), %%mm0\n\t"
        PAVGB" 24(%2), %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        "movq %%mm1, (%3)\n\t"
        "add %5, %3\n\t"
        "add $32, %2\n\t"
        "subl $4, %0\n\t"
        "jnz 1b\n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}

static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "pcmpeqb %%mm6, %%mm6\n\t"
        "testl $1, %0\n\t"
        " jz 1f\n\t"
        "movq (%1), %%mm0\n\t"
        "movq (%2), %%mm1\n\t"
        "add %4, %1\n\t"
        "add $8, %2\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        PAVGB" %%mm1, %%mm0\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        "decl %0\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "add %4, %1\n\t"
        "movq (%1), %%mm1\n\t"
        "add %4, %1\n\t"
        "movq (%2), %%mm2\n\t"
        "movq 8(%2), %%mm3\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "pxor %%mm6, %%mm2\n\t"
        "pxor %%mm6, %%mm3\n\t"
        PAVGB" %%mm2, %%mm0\n\t"
        PAVGB" %%mm3, %%mm1\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        "movq %%mm1, (%3)\n\t"
        "add %5, %3\n\t"
        "movq (%1), %%mm0\n\t"
        "add %4, %1\n\t"
        "movq (%1), %%mm1\n\t"
        "add %4, %1\n\t"
        "movq 16(%2), %%mm2\n\t"
        "movq 24(%2), %%mm3\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "pxor %%mm6, %%mm2\n\t"
        "pxor %%mm6, %%mm3\n\t"
        PAVGB" %%mm2, %%mm0\n\t"
        PAVGB" %%mm3, %%mm1\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        "movq %%mm1, (%3)\n\t"
        "add %5, %3\n\t"
        "add $32, %2\n\t"
        "subl $4, %0\n\t"
        "jnz 1b\n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}

static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "testl $1, %0\n\t"
        " jz 1f\n\t"
        "movq (%1), %%mm0\n\t"
        "movq (%2), %%mm1\n\t"
        "add %4, %1\n\t"
        "add $8, %2\n\t"
        PAVGB" %%mm1, %%mm0\n\t"
        PAVGB" (%3), %%mm0\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        "decl %0\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "add %4, %1\n\t"
        "movq (%1), %%mm1\n\t"
        "add %4, %1\n\t"
        PAVGB" (%2), %%mm0\n\t"
        PAVGB" 8(%2), %%mm1\n\t"
        PAVGB" (%3), %%mm0\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        PAVGB" (%3), %%mm1\n\t"
        "movq %%mm1, (%3)\n\t"
        "add %5, %3\n\t"
        "movq (%1), %%mm0\n\t"
        "add %4, %1\n\t"
        "movq (%1), %%mm1\n\t"
        "add %4, %1\n\t"
        PAVGB" 16(%2), %%mm0\n\t"
        PAVGB" 24(%2), %%mm1\n\t"
        PAVGB" (%3), %%mm0\n\t"
        "movq %%mm0, (%3)\n\t"
        "add %5, %3\n\t"
        PAVGB" (%3), %%mm1\n\t"
        "movq %%mm1, (%3)\n\t"
        "add %5, %3\n\t"
        "add $32, %2\n\t"
        "subl $4, %0\n\t"
        "jnz 1b\n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
#endif /* SKIP_FOR_3DNOW */

static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "movq (%1, %3), %%mm1\n\t"
        "movq 8(%1), %%mm2\n\t"
        "movq 8(%1, %3), %%mm3\n\t"
        PAVGB" 1(%1), %%mm0\n\t"
        PAVGB" 1(%1, %3), %%mm1\n\t"
        PAVGB" 9(%1), %%mm2\n\t"
        PAVGB" 9(%1, %3), %%mm3\n\t"
        "movq %%mm0, (%2)\n\t"
        "movq %%mm1, (%2, %3)\n\t"
        "movq %%mm2, 8(%2)\n\t"
        "movq %%mm3, 8(%2, %3)\n\t"
        "add %%"REG_a", %1\n\t"
        "add %%"REG_a", %2\n\t"
        "movq (%1), %%mm0\n\t"
        "movq (%1, %3), %%mm1\n\t"
        "movq 8(%1), %%mm2\n\t"
        "movq 8(%1, %3), %%mm3\n\t"
        PAVGB" 1(%1), %%mm0\n\t"
        PAVGB" 1(%1, %3), %%mm1\n\t"
        PAVGB" 9(%1), %%mm2\n\t"
        PAVGB" 9(%1, %3), %%mm3\n\t"
        "add %%"REG_a", %1\n\t"
        "movq %%mm0, (%2)\n\t"
        "movq %%mm1, (%2, %3)\n\t"
        "movq %%mm2, 8(%2)\n\t"
        "movq %%mm3, 8(%2, %3)\n\t"
        "add %%"REG_a", %2\n\t"
        "subl $4, %0\n\t"
        "jnz 1b\n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}

#ifndef SKIP_FOR_3DNOW
static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "testl $1, %0\n\t"
        " jz 1f\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        PAVGB" (%2), %%mm0\n\t"
        PAVGB" 8(%2), %%mm1\n\t"
        "add %4, %1\n\t"
        "add $16, %2\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "decl %0\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        "add %4, %1\n\t"
        PAVGB" (%2), %%mm0\n\t"
        PAVGB" 8(%2), %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        "add %4, %1\n\t"
        PAVGB" 16(%2), %%mm0\n\t"
        PAVGB" 24(%2), %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "add $32, %2\n\t"
        "subl $2, %0\n\t"
        "jnz 1b\n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}

static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "testl $1, %0\n\t"
        " jz 1f\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        PAVGB" (%2), %%mm0\n\t"
        PAVGB" 8(%2), %%mm1\n\t"
        "add %4, %1\n\t"
        "add $16, %2\n\t"
        PAVGB" (%3), %%mm0\n\t"
        PAVGB" 8(%3), %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "decl %0\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        "add %4, %1\n\t"
        PAVGB" (%2), %%mm0\n\t"
        PAVGB" 8(%2), %%mm1\n\t"
        PAVGB" (%3), %%mm0\n\t"
        PAVGB" 8(%3), %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        "add %4, %1\n\t"
        PAVGB" 16(%2), %%mm0\n\t"
        PAVGB" 24(%2), %%mm1\n\t"
        PAVGB" (%3), %%mm0\n\t"
        PAVGB" 8(%3), %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "add $32, %2\n\t"
        "subl $2, %0\n\t"
        "jnz 1b\n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}

static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
{
    __asm__ volatile(
        "pcmpeqb %%mm6, %%mm6\n\t"
        "testl $1, %0\n\t"
        " jz 1f\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        "movq (%2), %%mm2\n\t"
        "movq 8(%2), %%mm3\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "pxor %%mm6, %%mm2\n\t"
        "pxor %%mm6, %%mm3\n\t"
        PAVGB" %%mm2, %%mm0\n\t"
        PAVGB" %%mm3, %%mm1\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "add %4, %1\n\t"
        "add $16, %2\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "decl %0\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        "add %4, %1\n\t"
        "movq (%2), %%mm2\n\t"
        "movq 8(%2), %%mm3\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "pxor %%mm6, %%mm2\n\t"
        "pxor %%mm6, %%mm3\n\t"
        PAVGB" %%mm2, %%mm0\n\t"
        PAVGB" %%mm3, %%mm1\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 8(%1), %%mm1\n\t"
        "add %4, %1\n\t"
        "movq 16(%2), %%mm2\n\t"
        "movq 24(%2), %%mm3\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "pxor %%mm6, %%mm2\n\t"
        "pxor %%mm6, %%mm3\n\t"
        PAVGB" %%mm2, %%mm0\n\t"
        PAVGB" %%mm3, %%mm1\n\t"
        "pxor %%mm6, %%mm0\n\t"
        "pxor %%mm6, %%mm1\n\t"
        "movq %%mm0, (%3)\n\t"
        "movq %%mm1, 8(%3)\n\t"
        "add %5, %3\n\t"
        "add $32, %2\n\t"
        "subl $2, %0\n\t"
        "jnz 1b\n\t"
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#else
        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
#endif
        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
        :"memory");
//the following should be used, though better not with gcc ...
/*      :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
        :"r"(src1Stride), "r"(dstStride)
        :"memory");*/
}
#endif /* SKIP_FOR_3DNOW */

/* GL: this function does incorrect rounding if overflow */
static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"\n\t"
        "1:\n\t"
        "movq (%1), %%mm0\n\t"
        "movq (%1, %3), %%mm2\n\t"
        "movq 1(%1), %%mm1\n\t"
        "movq 1(%1, %3), %%mm3\n\t"
        "add %%"REG_a", %1\n\t"
        "psubusb %%mm6, %%mm0\n\t"
        "psubusb %%mm6, %%mm2\n\t"
        PAVGB" %%mm1, %%mm0\n\t"
        PAVGB" %%mm3, %%mm2\n\t"
        "movq %%mm0, (%2)\n\t"
        "movq %%mm2, (%2, %3)\n\t"
        "movq (%1), %%mm0\n\t"
        "movq 1(%1), %%mm1\n\t"
        "movq (%1, %3), %%mm2\n\t"
        "movq 1(%1, %3), %%mm3\n\t"
        "add %%"REG_a", %2\n\t"
        "add %%"REG_a", %1\n\t"
        "psubusb %%mm6, %%mm0\n\t"
        "psubusb %%mm6, %%mm2\n\t"
        PAVGB" %%mm1, %%mm0\n\t"
        PAVGB" %%mm3, %%mm2\n\t"
        "movq %%mm0, (%2)\n\t"
        "movq %%mm2, (%2, %3)\n\t"
        "add %%"REG_a", %2\n\t"
        "subl $4, %0\n\t"
        "jnz 1b\n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}

static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
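    /* no-rounding trick: pavgb rounds up, so complementing the inputs and
     * the result with 0xFF (pxor mm6) yields the round-down average instead */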
|
||||
__asm__ volatile (
|
||||
"pcmpeqb %%mm6, %%mm6 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
"movq 1(%1), %%mm1 \n\t"
|
||||
"movq 1(%1, %3), %%mm3 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm3 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
PAVGB" %%mm3, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq %%mm2, (%2, %3) \n\t"
|
||||
"movq (%1, %3,2), %%mm0 \n\t"
|
||||
"movq 1(%1, %3,2), %%mm1 \n\t"
|
||||
"movq (%1, %4), %%mm2 \n\t"
|
||||
"movq 1(%1, %4), %%mm3 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm3 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
PAVGB" %%mm3, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"movq %%mm0, (%2, %3,2) \n\t"
|
||||
"movq %%mm2, (%2, %4) \n\t"
|
||||
"lea (%1, %3,4), %1 \n\t"
|
||||
"lea (%2, %3,4), %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jg 1b \n\t"
|
||||
: "+g"(h), "+r"(pixels), "+r"(block)
|
||||
: "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"sub %3, %2 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm2 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
PAVGB" %%mm2, %%mm1 \n\t"
|
||||
"movq %%mm0, (%2, %3) \n\t"
|
||||
"movq %%mm1, (%2, %%"REG_a") \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
PAVGB" %%mm1, %%mm2 \n\t"
|
||||
PAVGB" %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm2, (%2, %3) \n\t"
|
||||
"movq %%mm1, (%2, %%"REG_a") \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D" (block)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
/* GL: this function does incorrect rounding if overflow */
|
||||
static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BONE(mm6);
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"sub %3, %2 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm2 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"psubusb %%mm6, %%mm1 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
PAVGB" %%mm2, %%mm1 \n\t"
|
||||
"movq %%mm0, (%2, %3) \n\t"
|
||||
"movq %%mm1, (%2, %%"REG_a") \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"psubusb %%mm6, %%mm1 \n\t"
|
||||
PAVGB" %%mm1, %%mm2 \n\t"
|
||||
PAVGB" %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm2, (%2, %3) \n\t"
|
||||
"movq %%mm1, (%2, %%"REG_a") \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D" (block)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
__asm__ volatile (
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"pcmpeqb %%mm6, %%mm6 \n\t"
|
||||
"add %3, %1 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm1 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
PAVGB" %%mm2, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq %%mm1, (%2, %3) \n\t"
|
||||
"movq (%1, %3,2), %%mm1 \n\t"
|
||||
"movq (%1, %4), %%mm0 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm0 \n\t"
|
||||
PAVGB" %%mm1, %%mm2 \n\t"
|
||||
PAVGB" %%mm0, %%mm1 \n\t"
|
||||
"pxor %%mm6, %%mm2 \n\t"
|
||||
"pxor %%mm6, %%mm1 \n\t"
|
||||
"movq %%mm2, (%2, %3,2) \n\t"
|
||||
"movq %%mm1, (%2, %4) \n\t"
|
||||
"lea (%1, %3,4), %1 \n\t"
|
||||
"lea (%2, %3,4), %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jg 1b \n\t"
|
||||
:"+g"(h), "+r"(pixels), "+r" (block)
|
||||
:"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
|
||||
:"memory"
|
||||
);
|
||||
}
|
||||
|
||||
static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%2), %%mm0 \n\t"
|
||||
"movq (%2, %3), %%mm1 \n\t"
|
||||
PAVGB" (%1), %%mm0 \n\t"
|
||||
PAVGB" (%1, %3), %%mm1 \n\t"
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq %%mm1, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"movq (%2), %%mm0 \n\t"
|
||||
"movq (%2, %3), %%mm1 \n\t"
|
||||
PAVGB" (%1), %%mm0 \n\t"
|
||||
PAVGB" (%1, %3), %%mm1 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq %%mm1, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
PAVGB" 1(%1), %%mm0 \n\t"
|
||||
PAVGB" 1(%1, %3), %%mm2 \n\t"
|
||||
PAVGB" (%2), %%mm0 \n\t"
|
||||
PAVGB" (%2, %3), %%mm2 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq %%mm2, (%2, %3) \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
PAVGB" 1(%1), %%mm0 \n\t"
|
||||
PAVGB" 1(%1, %3), %%mm2 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
PAVGB" (%2), %%mm0 \n\t"
|
||||
PAVGB" (%2, %3), %%mm2 \n\t"
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq %%mm2, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"sub %3, %2 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm2 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
PAVGB" %%mm1, %%mm0 \n\t"
|
||||
PAVGB" %%mm2, %%mm1 \n\t"
|
||||
"movq (%2, %3), %%mm3 \n\t"
|
||||
"movq (%2, %%"REG_a"), %%mm4 \n\t"
|
||||
PAVGB" %%mm3, %%mm0 \n\t"
|
||||
PAVGB" %%mm4, %%mm1 \n\t"
|
||||
"movq %%mm0, (%2, %3) \n\t"
|
||||
"movq %%mm1, (%2, %%"REG_a") \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||
PAVGB" %%mm1, %%mm2 \n\t"
|
||||
PAVGB" %%mm0, %%mm1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"movq (%2, %3), %%mm3 \n\t"
|
||||
"movq (%2, %%"REG_a"), %%mm4 \n\t"
|
||||
PAVGB" %%mm3, %%mm2 \n\t"
|
||||
PAVGB" %%mm4, %%mm1 \n\t"
|
||||
"movq %%mm2, (%2, %3) \n\t"
|
||||
"movq %%mm1, (%2, %%"REG_a") \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
/* Note this is not correctly rounded, but this function is only
 * used for B-frames so it does not matter. */
static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    MOVQ_BONE(mm6);
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1), %%mm0 \n\t"
        PAVGB" 1(%1), %%mm0 \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a"), %%mm2 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "psubusb %%mm6, %%mm2 \n\t"
        PAVGB" 1(%1, %3), %%mm1 \n\t"
        PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
        "add %%"REG_a", %1 \n\t"
        PAVGB" %%mm1, %%mm0 \n\t"
        PAVGB" %%mm2, %%mm1 \n\t"
        PAVGB" (%2), %%mm0 \n\t"
        PAVGB" (%2, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t"
        PAVGB" 1(%1, %3), %%mm1 \n\t"
        PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
        "add %%"REG_a", %2 \n\t"
        "add %%"REG_a", %1 \n\t"
        PAVGB" %%mm1, %%mm2 \n\t"
        PAVGB" %%mm0, %%mm1 \n\t"
        PAVGB" (%2), %%mm2 \n\t"
        PAVGB" (%2, %3), %%mm1 \n\t"
        "movq %%mm2, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+g"(h), "+S"(pixels), "+D"(block)
        :"r" ((x86_reg)line_size)
        :"%"REG_a, "memory");
}
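
/* The xy2 average approximates (a + b + c + d + 2) >> 2 with two chained
 * PAVGB passes; MOVQ_BONE puts 0x01 in every byte of mm6 and the psubusb
 * above biases one input down by one to partially cancel the upward
 * rounding of the chained averages, hence "not correctly rounded". */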

//FIXME the following could be optimized too ...
static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
    DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
}
static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_pixels8_y2)(block , pixels , line_size, h);
    DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
    DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8)(block , pixels , line_size, h);
    DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_x2)(block , pixels , line_size, h);
    DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_y2)(block , pixels , line_size, h);
    DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
}
static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
    DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
    DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
}
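
/* Each 16-pixel wrapper above just runs the corresponding 8-pixel kernel
 * twice, once on each half of the block. */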

#define QPEL_2TAP_L3(OPNAME) \
static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
    __asm__ volatile(\
        "1: \n\t"\
        "movq (%1,%2), %%mm0 \n\t"\
        "movq 8(%1,%2), %%mm1 \n\t"\
        PAVGB" (%1,%3), %%mm0 \n\t"\
        PAVGB" 8(%1,%3), %%mm1 \n\t"\
        PAVGB" (%1), %%mm0 \n\t"\
        PAVGB" 8(%1), %%mm1 \n\t"\
        STORE_OP( (%1,%4),%%mm0)\
        STORE_OP(8(%1,%4),%%mm1)\
        "movq %%mm0, (%1,%4) \n\t"\
        "movq %%mm1, 8(%1,%4) \n\t"\
        "add %5, %1 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        :"+g"(h), "+r"(src)\
        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
        :"memory"\
    );\
}\
static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
    __asm__ volatile(\
        "1: \n\t"\
        "movq (%1,%2), %%mm0 \n\t"\
        PAVGB" (%1,%3), %%mm0 \n\t"\
        PAVGB" (%1), %%mm0 \n\t"\
        STORE_OP((%1,%4),%%mm0)\
        "movq %%mm0, (%1,%4) \n\t"\
        "add %5, %1 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        :"+g"(h), "+r"(src)\
        :"r"((x86_reg)off1), "r"((x86_reg)off2),\
         "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
        :"memory"\
    );\
}

#ifndef SKIP_FOR_3DNOW
#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
QPEL_2TAP_L3(avg_)
#undef STORE_OP
#define STORE_OP(a,b)
QPEL_2TAP_L3(put_)
#undef STORE_OP
#undef QPEL_2TAP_L3
#endif /* SKIP_FOR_3DNOW */
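
/* STORE_OP is the only difference between the avg_ and put_ expansions of
 * QPEL_2TAP_L3: for avg_ it inserts an extra PAVGB against the pixels
 * already at the destination, for put_ it expands to nothing, so one macro
 * body yields both variants. */
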
2762
project/jni/ffmpeg/libavcodec/x86/dsputil_mmx.c
Normal file
File diff suppressed because it is too large
121
project/jni/ffmpeg/libavcodec/x86/dsputil_mmx.h
Normal file
@@ -0,0 +1,121 @@
/*
|
||||
* MMX optimized DSP utils
|
||||
* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_X86_DSPUTIL_MMX_H
|
||||
#define AVCODEC_X86_DSPUTIL_MMX_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include "libavcodec/dsputil.h"
|
||||
#include "libavutil/x86/asm.h"
|
||||
|
||||
typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
|
||||
|
||||
extern const uint64_t ff_bone;
|
||||
extern const uint64_t ff_wtwo;
|
||||
|
||||
extern const uint64_t ff_pdw_80000000[2];
|
||||
|
||||
extern const xmm_reg ff_pw_3;
|
||||
extern const xmm_reg ff_pw_4;
|
||||
extern const xmm_reg ff_pw_5;
|
||||
extern const xmm_reg ff_pw_8;
|
||||
extern const uint64_t ff_pw_15;
|
||||
extern const xmm_reg ff_pw_16;
|
||||
extern const xmm_reg ff_pw_18;
|
||||
extern const uint64_t ff_pw_20;
|
||||
extern const xmm_reg ff_pw_27;
|
||||
extern const xmm_reg ff_pw_28;
|
||||
extern const xmm_reg ff_pw_32;
|
||||
extern const uint64_t ff_pw_42;
|
||||
extern const uint64_t ff_pw_53;
|
||||
extern const xmm_reg ff_pw_63;
|
||||
extern const xmm_reg ff_pw_64;
|
||||
extern const uint64_t ff_pw_96;
|
||||
extern const uint64_t ff_pw_128;
|
||||
extern const uint64_t ff_pw_255;
|
||||
|
||||
extern const xmm_reg ff_pb_1;
|
||||
extern const xmm_reg ff_pb_3;
|
||||
extern const uint64_t ff_pb_7;
|
||||
extern const uint64_t ff_pb_1F;
|
||||
extern const uint64_t ff_pb_3F;
|
||||
extern const uint64_t ff_pb_81;
|
||||
extern const xmm_reg ff_pb_A1;
|
||||
extern const xmm_reg ff_pb_F8;
|
||||
extern const uint64_t ff_pb_FC;
|
||||
extern const xmm_reg ff_pb_FE;
|
||||
|
||||
extern const double ff_pd_1[2];
|
||||
extern const double ff_pd_2[2];
|
||||
|
||||
#define SBUTTERFLY(a,b,t,n,m)\
|
||||
"mov" #m " " #a ", " #t " \n\t" /* abcd */\
|
||||
"punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
|
||||
"punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
|
||||
|
||||
#define TRANSPOSE4(a,b,c,d,t)\
|
||||
SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
|
||||
SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
|
||||
SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
|
||||
SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
|
||||
|
||||
#define MOVQ_WONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
|
||||
void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
|
||||
|
||||
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
|
||||
void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
|
||||
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
|
||||
|
||||
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride);
|
||||
void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride);
|
||||
void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride);
|
||||
void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride);
|
||||
|
||||
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd);
|
||||
void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src, int stride, int rnd);
|
||||
|
||||
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t *pixels, int line_size);
|
||||
|
||||
void ff_mmx_idct(DCTELEM *block);
|
||||
void ff_mmxext_idct(DCTELEM *block);
|
||||
|
||||
|
||||
void ff_deinterlace_line_mmx(uint8_t *dst,
|
||||
const uint8_t *lum_m4, const uint8_t *lum_m3,
|
||||
const uint8_t *lum_m2, const uint8_t *lum_m1,
|
||||
const uint8_t *lum,
|
||||
int size);
|
||||
|
||||
void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4,
|
||||
const uint8_t *lum_m3,
|
||||
const uint8_t *lum_m2,
|
||||
const uint8_t *lum_m1,
|
||||
const uint8_t *lum, int size);
|
||||
|
||||
#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
|
||||
101
project/jni/ffmpeg/libavcodec/x86/dsputil_qns_template.c
Normal file
@@ -0,0 +1,101 @@
/*
|
||||
* DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
|
||||
* Copyright (c) 2004 Michael Niedermayer
|
||||
*
|
||||
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
|
||||
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
|
||||
|
||||
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
|
||||
{
|
||||
x86_reg i=0;
|
||||
|
||||
assert(FFABS(scale) < MAX_ABS);
|
||||
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
|
||||
|
||||
SET_RND(mm6);
|
||||
__asm__ volatile(
|
||||
"pxor %%mm7, %%mm7 \n\t"
|
||||
"movd %4, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
".p2align 4 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
"movq 8(%1, %0), %%mm1 \n\t"
|
||||
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
|
||||
"paddw (%2, %0), %%mm0 \n\t"
|
||||
"paddw 8(%2, %0), %%mm1 \n\t"
|
||||
"psraw $6, %%mm0 \n\t"
|
||||
"psraw $6, %%mm1 \n\t"
|
||||
"pmullw (%3, %0), %%mm0 \n\t"
|
||||
"pmullw 8(%3, %0), %%mm1 \n\t"
|
||||
"pmaddwd %%mm0, %%mm0 \n\t"
|
||||
"pmaddwd %%mm1, %%mm1 \n\t"
|
||||
"paddd %%mm1, %%mm0 \n\t"
|
||||
"psrld $4, %%mm0 \n\t"
|
||||
"paddd %%mm0, %%mm7 \n\t"
|
||||
"add $16, %0 \n\t"
|
||||
"cmp $128, %0 \n\t" //FIXME optimize & bench
|
||||
" jb 1b \n\t"
|
||||
PHADDD(%%mm7, %%mm6)
|
||||
"psrld $2, %%mm7 \n\t"
|
||||
"movd %%mm7, %0 \n\t"
|
||||
|
||||
: "+r" (i)
|
||||
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
|
||||
);
|
||||
return i;
|
||||
}
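
/* A scalar sketch of what the loop above computes (assuming PMULHRW acts as
 * a rounded (x*scale)>>16 and using the shifts visible in the asm; for
 * illustration only, not part of the original source):
 *
 *     unsigned sum = 0;
 *     for (i = 0; i < 64; i++) {
 *         int v = ((rem[i] + ((basis[i] * scale) >> 16)) >> 6) * weight[i];
 *         sum += (unsigned)(v * v) >> 4;
 *     }
 *     return sum >> 2;
 */
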
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
|
||||
{
|
||||
x86_reg i=0;
|
||||
|
||||
if(FFABS(scale) < MAX_ABS){
|
||||
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
|
||||
SET_RND(mm6);
|
||||
__asm__ volatile(
|
||||
"movd %3, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
".p2align 4 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
"movq 8(%1, %0), %%mm1 \n\t"
|
||||
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
|
||||
"paddw (%2, %0), %%mm0 \n\t"
|
||||
"paddw 8(%2, %0), %%mm1 \n\t"
|
||||
"movq %%mm0, (%2, %0) \n\t"
|
||||
"movq %%mm1, 8(%2, %0) \n\t"
|
||||
"add $16, %0 \n\t"
|
||||
"cmp $128, %0 \n\t" // FIXME optimize & bench
|
||||
" jb 1b \n\t"
|
||||
|
||||
: "+r" (i)
|
||||
: "r"(basis), "r"(rem), "g"(scale)
|
||||
);
|
||||
    }else{
        for(i=0; i<8*8; i++){
            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
        }
    }
}
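
/* When |scale| would overflow the 16-bit SIMD math, the C fallback above
 * applies the same rounded fixed-point update:
 * rem[i] += round(basis[i] * scale / 2^(BASIS_SHIFT - RECON_SHIFT)). */
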
590
project/jni/ffmpeg/libavcodec/x86/dsputil_rnd_template.c
Normal file
@@ -0,0 +1,590 @@
/*
|
||||
* DSP utils mmx functions are compiled twice for rnd/no_rnd
|
||||
* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||
*
|
||||
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
|
||||
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
|
||||
* and improved by Zdenek Kabelac <kabi@users.sf.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
// put_pixels
|
||||
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm1 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
"movq 1(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm1 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
"movq 1(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"testl $1, %0 \n\t"
|
||||
" jz 1f \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%2), %%mm1 \n\t"
|
||||
"add %4, %1 \n\t"
|
||||
"add $8, %2 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
|
||||
"movq %%mm4, (%3) \n\t"
|
||||
"add %5, %3 \n\t"
|
||||
"decl %0 \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%2), %%mm1 \n\t"
|
||||
"add %4, %1 \n\t"
|
||||
"movq (%1), %%mm2 \n\t"
|
||||
"movq 8(%2), %%mm3 \n\t"
|
||||
"add %4, %1 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%3) \n\t"
|
||||
"add %5, %3 \n\t"
|
||||
"movq %%mm5, (%3) \n\t"
|
||||
"add %5, %3 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 16(%2), %%mm1 \n\t"
|
||||
"add %4, %1 \n\t"
|
||||
"movq (%1), %%mm2 \n\t"
|
||||
"movq 24(%2), %%mm3 \n\t"
|
||||
"add %4, %1 \n\t"
|
||||
"add $32, %2 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%3) \n\t"
|
||||
"add %5, %3 \n\t"
|
||||
"movq %%mm5, (%3) \n\t"
|
||||
"add %5, %3 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
|
||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
}
|
||||
|
||||
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm1 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
"movq 1(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"movq 8(%1), %%mm0 \n\t"
|
||||
"movq 9(%1), %%mm1 \n\t"
|
||||
"movq 8(%1, %3), %%mm2 \n\t"
|
||||
"movq 9(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, 8(%2) \n\t"
|
||||
"movq %%mm5, 8(%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm1 \n\t"
|
||||
"movq (%1, %3), %%mm2 \n\t"
|
||||
"movq 1(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"movq 8(%1), %%mm0 \n\t"
|
||||
"movq 9(%1), %%mm1 \n\t"
|
||||
"movq 8(%1, %3), %%mm2 \n\t"
|
||||
"movq 9(%1, %3), %%mm3 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, 8(%2) \n\t"
|
||||
"movq %%mm5, 8(%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"testl $1, %0 \n\t"
|
||||
" jz 1f \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%2), %%mm1 \n\t"
|
||||
"movq 8(%1), %%mm2 \n\t"
|
||||
"movq 8(%2), %%mm3 \n\t"
|
||||
"add %4, %1 \n\t"
|
||||
"add $16, %2 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%3) \n\t"
|
||||
"movq %%mm5, 8(%3) \n\t"
|
||||
"add %5, %3 \n\t"
|
||||
"decl %0 \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq (%2), %%mm1 \n\t"
|
||||
"movq 8(%1), %%mm2 \n\t"
|
||||
"movq 8(%2), %%mm3 \n\t"
|
||||
"add %4, %1 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%3) \n\t"
|
||||
"movq %%mm5, 8(%3) \n\t"
|
||||
"add %5, %3 \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 16(%2), %%mm1 \n\t"
|
||||
"movq 8(%1), %%mm2 \n\t"
|
||||
"movq 24(%2), %%mm3 \n\t"
|
||||
"add %4, %1 \n\t"
|
||||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
|
||||
"movq %%mm4, (%3) \n\t"
|
||||
"movq %%mm5, 8(%3) \n\t"
|
||||
"add %5, %3 \n\t"
|
||||
"add $32, %2 \n\t"
|
||||
"subl $2, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
|
||||
:"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
}
|
||||
|
||||
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"),%%mm2 \n\t"
|
||||
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"),%%mm0 \n\t"
|
||||
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
||||
"movq %%mm4, (%2) \n\t"
|
||||
"movq %%mm5, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_ZERO(mm7);
|
||||
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
||||
__asm__ volatile(
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm4 \n\t"
|
||||
"movq %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm4, %%mm5 \n\t"
|
||||
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||
"punpcklbw %%mm7, %%mm4 \n\t"
|
||||
"punpckhbw %%mm7, %%mm1 \n\t"
|
||||
"punpckhbw %%mm7, %%mm5 \n\t"
|
||||
"paddusw %%mm0, %%mm4 \n\t"
|
||||
"paddusw %%mm1, %%mm5 \n\t"
|
||||
"xor %%"REG_a", %%"REG_a" \n\t"
|
||||
"add %3, %1 \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
||||
"movq %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm2, %%mm3 \n\t"
|
||||
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||
"punpcklbw %%mm7, %%mm2 \n\t"
|
||||
"punpckhbw %%mm7, %%mm1 \n\t"
|
||||
"punpckhbw %%mm7, %%mm3 \n\t"
|
||||
"paddusw %%mm2, %%mm0 \n\t"
|
||||
"paddusw %%mm3, %%mm1 \n\t"
|
||||
"paddusw %%mm6, %%mm4 \n\t"
|
||||
"paddusw %%mm6, %%mm5 \n\t"
|
||||
"paddusw %%mm0, %%mm4 \n\t"
|
||||
"paddusw %%mm1, %%mm5 \n\t"
|
||||
"psrlw $2, %%mm4 \n\t"
|
||||
"psrlw $2, %%mm5 \n\t"
|
||||
"packuswb %%mm5, %%mm4 \n\t"
|
||||
"movq %%mm4, (%2, %%"REG_a") \n\t"
|
||||
"add %3, %%"REG_a" \n\t"
|
||||
|
||||
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
||||
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
||||
"movq %%mm2, %%mm3 \n\t"
|
||||
"movq %%mm4, %%mm5 \n\t"
|
||||
"punpcklbw %%mm7, %%mm2 \n\t"
|
||||
"punpcklbw %%mm7, %%mm4 \n\t"
|
||||
"punpckhbw %%mm7, %%mm3 \n\t"
|
||||
"punpckhbw %%mm7, %%mm5 \n\t"
|
||||
"paddusw %%mm2, %%mm4 \n\t"
|
||||
"paddusw %%mm3, %%mm5 \n\t"
|
||||
"paddusw %%mm6, %%mm0 \n\t"
|
||||
"paddusw %%mm6, %%mm1 \n\t"
|
||||
"paddusw %%mm4, %%mm0 \n\t"
|
||||
"paddusw %%mm5, %%mm1 \n\t"
|
||||
"psrlw $2, %%mm0 \n\t"
|
||||
"psrlw $2, %%mm1 \n\t"
|
||||
"packuswb %%mm1, %%mm0 \n\t"
|
||||
"movq %%mm0, (%2, %%"REG_a") \n\t"
|
||||
"add %3, %%"REG_a" \n\t"
|
||||
|
||||
"subl $2, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels)
|
||||
:"D"(block), "r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
// avg_pixels
|
||||
static void av_unused DEF(avg, pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
JUMPALIGN();
|
||||
do {
|
||||
__asm__ volatile(
|
||||
"movd %0, %%mm0 \n\t"
|
||||
"movd %1, %%mm1 \n\t"
|
||||
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
"movd %%mm2, %0 \n\t"
|
||||
:"+m"(*block)
|
||||
:"m"(*pixels)
|
||||
:"memory");
|
||||
pixels += line_size;
|
||||
block += line_size;
|
||||
}
|
||||
while (--h);
|
||||
}
|
||||
|
||||
// in case more speed is needed - unrolling would certainly help
static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
JUMPALIGN();
|
||||
do {
|
||||
__asm__ volatile(
|
||||
"movq %0, %%mm0 \n\t"
|
||||
"movq %1, %%mm1 \n\t"
|
||||
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
"movq %%mm2, %0 \n\t"
|
||||
:"+m"(*block)
|
||||
:"m"(*pixels)
|
||||
:"memory");
|
||||
pixels += line_size;
|
||||
block += line_size;
|
||||
}
|
||||
while (--h);
|
||||
}
|
||||
|
||||
static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
JUMPALIGN();
|
||||
do {
|
||||
__asm__ volatile(
|
||||
"movq %0, %%mm0 \n\t"
|
||||
"movq %1, %%mm1 \n\t"
|
||||
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
"movq %%mm2, %0 \n\t"
|
||||
"movq 8%0, %%mm0 \n\t"
|
||||
"movq 8%1, %%mm1 \n\t"
|
||||
OP_AVG(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
"movq %%mm2, 8%0 \n\t"
|
||||
:"+m"(*block)
|
||||
:"m"(*pixels)
|
||||
:"memory");
|
||||
pixels += line_size;
|
||||
block += line_size;
|
||||
}
|
||||
while (--h);
|
||||
}
|
||||
|
||||
static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
JUMPALIGN();
|
||||
do {
|
||||
__asm__ volatile(
|
||||
"movq %1, %%mm0 \n\t"
|
||||
"movq 1%1, %%mm1 \n\t"
|
||||
"movq %0, %%mm3 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||
"movq %%mm0, %0 \n\t"
|
||||
:"+m"(*block)
|
||||
:"m"(*pixels)
|
||||
:"memory");
|
||||
pixels += line_size;
|
||||
block += line_size;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
static av_unused void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
JUMPALIGN();
|
||||
do {
|
||||
__asm__ volatile(
|
||||
"movq %1, %%mm0 \n\t"
|
||||
"movq %2, %%mm1 \n\t"
|
||||
"movq %0, %%mm3 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||
"movq %%mm0, %0 \n\t"
|
||||
:"+m"(*dst)
|
||||
:"m"(*src1), "m"(*src2)
|
||||
:"memory");
|
||||
dst += dstStride;
|
||||
src1 += src1Stride;
|
||||
src2 += 8;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
JUMPALIGN();
|
||||
do {
|
||||
__asm__ volatile(
|
||||
"movq %1, %%mm0 \n\t"
|
||||
"movq 1%1, %%mm1 \n\t"
|
||||
"movq %0, %%mm3 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||
"movq %%mm0, %0 \n\t"
|
||||
"movq 8%1, %%mm0 \n\t"
|
||||
"movq 9%1, %%mm1 \n\t"
|
||||
"movq 8%0, %%mm3 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||
"movq %%mm0, 8%0 \n\t"
|
||||
:"+m"(*block)
|
||||
:"m"(*pixels)
|
||||
:"memory");
|
||||
pixels += line_size;
|
||||
block += line_size;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
static av_unused void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
JUMPALIGN();
|
||||
do {
|
||||
__asm__ volatile(
|
||||
"movq %1, %%mm0 \n\t"
|
||||
"movq %2, %%mm1 \n\t"
|
||||
"movq %0, %%mm3 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||
"movq %%mm0, %0 \n\t"
|
||||
"movq 8%1, %%mm0 \n\t"
|
||||
"movq 8%2, %%mm1 \n\t"
|
||||
"movq 8%0, %%mm3 \n\t"
|
||||
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
|
||||
OP_AVG(%%mm3, %%mm2, %%mm0, %%mm6)
|
||||
"movq %%mm0, 8%0 \n\t"
|
||||
:"+m"(*dst)
|
||||
:"m"(*src1), "m"(*src2)
|
||||
:"memory");
|
||||
dst += dstStride;
|
||||
src1 += src1Stride;
|
||||
src2 += 16;
|
||||
} while (--h);
|
||||
}
|
||||
|
||||
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_BFE(mm6);
|
||||
__asm__ volatile(
|
||||
"lea (%3, %3), %%"REG_a" \n\t"
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm2 \n\t"
|
||||
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
|
||||
"movq (%2), %%mm3 \n\t"
|
||||
OP_AVG(%%mm3, %%mm4, %%mm0, %%mm6)
|
||||
"movq (%2, %3), %%mm3 \n\t"
|
||||
OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
|
||||
"movq %%mm0, (%2) \n\t"
|
||||
"movq %%mm1, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
|
||||
"movq (%1, %3), %%mm1 \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
|
||||
"movq (%2), %%mm3 \n\t"
|
||||
OP_AVG(%%mm3, %%mm4, %%mm2, %%mm6)
|
||||
"movq (%2, %3), %%mm3 \n\t"
|
||||
OP_AVG(%%mm3, %%mm5, %%mm1, %%mm6)
|
||||
"movq %%mm2, (%2) \n\t"
|
||||
"movq %%mm1, (%2, %3) \n\t"
|
||||
"add %%"REG_a", %1 \n\t"
|
||||
"add %%"REG_a", %2 \n\t"
|
||||
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
// this routine is 'slightly' suboptimal but mostly unused
|
||||
static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
{
|
||||
MOVQ_ZERO(mm7);
|
||||
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
|
||||
__asm__ volatile(
|
||||
"movq (%1), %%mm0 \n\t"
|
||||
"movq 1(%1), %%mm4 \n\t"
|
||||
"movq %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm4, %%mm5 \n\t"
|
||||
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||
"punpcklbw %%mm7, %%mm4 \n\t"
|
||||
"punpckhbw %%mm7, %%mm1 \n\t"
|
||||
"punpckhbw %%mm7, %%mm5 \n\t"
|
||||
"paddusw %%mm0, %%mm4 \n\t"
|
||||
"paddusw %%mm1, %%mm5 \n\t"
|
||||
"xor %%"REG_a", %%"REG_a" \n\t"
|
||||
"add %3, %1 \n\t"
|
||||
".p2align 3 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
||||
"movq %%mm0, %%mm1 \n\t"
|
||||
"movq %%mm2, %%mm3 \n\t"
|
||||
"punpcklbw %%mm7, %%mm0 \n\t"
|
||||
"punpcklbw %%mm7, %%mm2 \n\t"
|
||||
"punpckhbw %%mm7, %%mm1 \n\t"
|
||||
"punpckhbw %%mm7, %%mm3 \n\t"
|
||||
"paddusw %%mm2, %%mm0 \n\t"
|
||||
"paddusw %%mm3, %%mm1 \n\t"
|
||||
"paddusw %%mm6, %%mm4 \n\t"
|
||||
"paddusw %%mm6, %%mm5 \n\t"
|
||||
"paddusw %%mm0, %%mm4 \n\t"
|
||||
"paddusw %%mm1, %%mm5 \n\t"
|
||||
"psrlw $2, %%mm4 \n\t"
|
||||
"psrlw $2, %%mm5 \n\t"
|
||||
"movq (%2, %%"REG_a"), %%mm3 \n\t"
|
||||
"packuswb %%mm5, %%mm4 \n\t"
|
||||
"pcmpeqd %%mm2, %%mm2 \n\t"
|
||||
"paddb %%mm2, %%mm2 \n\t"
|
||||
OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2)
|
||||
"movq %%mm5, (%2, %%"REG_a") \n\t"
|
||||
"add %3, %%"REG_a" \n\t"
|
||||
|
||||
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
|
||||
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
|
||||
"movq %%mm2, %%mm3 \n\t"
|
||||
"movq %%mm4, %%mm5 \n\t"
|
||||
"punpcklbw %%mm7, %%mm2 \n\t"
|
||||
"punpcklbw %%mm7, %%mm4 \n\t"
|
||||
"punpckhbw %%mm7, %%mm3 \n\t"
|
||||
"punpckhbw %%mm7, %%mm5 \n\t"
|
||||
"paddusw %%mm2, %%mm4 \n\t"
|
||||
"paddusw %%mm3, %%mm5 \n\t"
|
||||
"paddusw %%mm6, %%mm0 \n\t"
|
||||
"paddusw %%mm6, %%mm1 \n\t"
|
||||
"paddusw %%mm4, %%mm0 \n\t"
|
||||
"paddusw %%mm5, %%mm1 \n\t"
|
||||
"psrlw $2, %%mm0 \n\t"
|
||||
"psrlw $2, %%mm1 \n\t"
|
||||
"movq (%2, %%"REG_a"), %%mm3 \n\t"
|
||||
"packuswb %%mm1, %%mm0 \n\t"
|
||||
"pcmpeqd %%mm2, %%mm2 \n\t"
|
||||
"paddb %%mm2, %%mm2 \n\t"
|
||||
OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2)
|
||||
"movq %%mm1, (%2, %%"REG_a") \n\t"
|
||||
"add %3, %%"REG_a" \n\t"
|
||||
|
||||
"subl $2, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels)
|
||||
:"D"(block), "r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
//FIXME optimize
|
||||
static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
|
||||
DEF(put, pixels8_y2)(block , pixels , line_size, h);
|
||||
DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
|
||||
}
|
||||
|
||||
static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
|
||||
DEF(put, pixels8_xy2)(block , pixels , line_size, h);
|
||||
DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
||||
}
|
||||
|
||||
static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
|
||||
DEF(avg, pixels8_y2)(block , pixels , line_size, h);
|
||||
DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
|
||||
}
|
||||
|
||||
static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
|
||||
DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
|
||||
DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
|
||||
}
|
||||
339
project/jni/ffmpeg/libavcodec/x86/dsputilenc.asm
Normal file
@@ -0,0 +1,339 @@
;*****************************************************************************
|
||||
;* MMX optimized DSP utils
|
||||
;*****************************************************************************
|
||||
;* Copyright (c) 2000, 2001 Fabrice Bellard
|
||||
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;*****************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro DIFF_PIXELS_1 4
|
||||
movh %1, %3
|
||||
movh %2, %4
|
||||
punpcklbw %2, %1
|
||||
punpcklbw %1, %1
|
||||
psubw %1, %2
|
||||
%endmacro
|
||||
|
||||
; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
|
||||
; %6=temporary storage location
|
||||
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
|
||||
%macro DIFF_PIXELS_8 6
|
||||
DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
|
||||
DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
|
||||
DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
|
||||
add %1, %5
|
||||
add %2, %5
|
||||
DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
|
||||
DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
|
||||
DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
|
||||
DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
|
||||
%ifdef m8
|
||||
DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
|
||||
%else
|
||||
mova [%6], m0
|
||||
DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
|
||||
mova m0, [%6]
|
||||
%endif
|
||||
sub %1, %5
|
||||
sub %2, %5
|
||||
%endmacro
|
||||
|
||||
%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro
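
; Three rounds of SUMSUB_BADC butterflies over eight word registers form an
; 8-point (Walsh-)Hadamard transform: log2(8) = 3 stages. It is applied once
; per row here, then again after the transpose below for the 2-D 8x8 case.
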
%macro ABS1_SUM 3
|
||||
ABS1 %1, %2
|
||||
paddusw %3, %1
|
||||
%endmacro
|
||||
|
||||
%macro ABS2_SUM 6
|
||||
ABS2 %1, %2, %3, %4
|
||||
paddusw %5, %1
|
||||
paddusw %6, %2
|
||||
%endmacro
|
||||
|
||||
%macro ABS_SUM_8x8_64 1
|
||||
ABS2 m0, m1, m8, m9
|
||||
ABS2_SUM m2, m3, m8, m9, m0, m1
|
||||
ABS2_SUM m4, m5, m8, m9, m0, m1
|
||||
ABS2_SUM m6, m7, m8, m9, m0, m1
|
||||
paddusw m0, m1
|
||||
%endmacro
|
||||
|
||||
%macro ABS_SUM_8x8_32 1
|
||||
mova [%1], m7
|
||||
ABS1 m0, m7
|
||||
ABS1 m1, m7
|
||||
ABS1_SUM m2, m7, m0
|
||||
ABS1_SUM m3, m7, m1
|
||||
ABS1_SUM m4, m7, m0
|
||||
ABS1_SUM m5, m7, m1
|
||||
ABS1_SUM m6, m7, m0
|
||||
mova m2, [%1]
|
||||
ABS1_SUM m2, m7, m1
|
||||
paddusw m0, m1
|
||||
%endmacro
|
||||
|
||||
; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps     %2, %1
    paddusw     %1, %2
    pshuflw     %2, %1, 0xE
    paddusw     %1, %2
    pshuflw     %2, %1, 0x1
    paddusw     %1, %2
    movd        %3, %1
%elif cpuflag(mmxext)
    pshufw      %2, %1, 0xE
    paddusw     %1, %2
    pshufw      %2, %1, 0x1
    paddusw     %1, %2
    movd        %3, %1
%elif cpuflag(mmx)
    mova        %2, %1
    psrlq       %1, 32
    paddusw     %1, %2
    mova        %2, %1
    psrlq       %1, 16
    paddusw     %1, %2
    movd        %3, %1
%endif
%endmacro
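
; Horizontal sum by successive folding: each step adds the upper half of the
; live data to the lower half (movhlps/pshuflw on SSE2, pshufw on MMXEXT,
; psrlq on plain MMX), halving the number of words until one result remains.
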
%macro STORE4 5
|
||||
mova [%1+mmsize*0], %2
|
||||
mova [%1+mmsize*1], %3
|
||||
mova [%1+mmsize*2], %4
|
||||
mova [%1+mmsize*3], %5
|
||||
%endmacro
|
||||
|
||||
%macro LOAD4 5
|
||||
mova %2, [%1+mmsize*0]
|
||||
mova %3, [%1+mmsize*1]
|
||||
mova %4, [%1+mmsize*2]
|
||||
mova %5, [%1+mmsize*3]
|
||||
%endmacro
|
||||
|
||||
%macro hadamard8_16_wrapper 2
|
||||
cglobal hadamard8_diff, 4, 4, %1
|
||||
%ifndef m8
|
||||
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
|
||||
SUB rsp, pad
|
||||
%endif
|
||||
call hadamard8x8_diff %+ SUFFIX
|
||||
%ifndef m8
|
||||
ADD rsp, pad
|
||||
%endif
|
||||
RET
|
||||
|
||||
cglobal hadamard8_diff16, 5, 6, %1
|
||||
%ifndef m8
|
||||
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
|
||||
SUB rsp, pad
|
||||
%endif
|
||||
|
||||
call hadamard8x8_diff %+ SUFFIX
|
||||
mov r5d, eax
|
||||
|
||||
add r1, 8
|
||||
add r2, 8
|
||||
call hadamard8x8_diff %+ SUFFIX
|
||||
add r5d, eax
|
||||
|
||||
cmp r4d, 16
|
||||
jne .done
|
||||
|
||||
lea r1, [r1+r3*8-8]
|
||||
lea r2, [r2+r3*8-8]
|
||||
call hadamard8x8_diff %+ SUFFIX
|
||||
add r5d, eax
|
||||
|
||||
add r1, 8
|
||||
add r2, 8
|
||||
call hadamard8x8_diff %+ SUFFIX
|
||||
add r5d, eax
|
||||
|
||||
.done:
|
||||
mov eax, r5d
|
||||
%ifndef m8
|
||||
ADD rsp, pad
|
||||
%endif
|
||||
RET
|
||||
%endmacro
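
; The pad expression reserves %2 blocks of mmsize scratch and, via the
; (4+stack_offset) & (mmsize-1) term, adjusts rsp so that the scratch area
; used by hadamard8x8_diff stays mmsize-aligned after the SUB.
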
%macro HADAMARD8_DIFF 0-1
|
||||
%if cpuflag(sse2)
|
||||
hadamard8x8_diff %+ SUFFIX:
|
||||
lea r0, [r3*3]
|
||||
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
|
||||
HADAMARD8
|
||||
%if ARCH_X86_64
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
%else
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
|
||||
%endif
|
||||
HADAMARD8
|
||||
ABS_SUM_8x8 rsp+gprsize
|
||||
HSUM m0, m1, eax
|
||||
and eax, 0xFFFF
|
||||
ret
|
||||
|
||||
hadamard8_16_wrapper %1, 3
|
||||
%elif cpuflag(mmx)
|
||||
ALIGN 16
|
||||
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
|
||||
; int stride, int h)
|
||||
; r0 = void *s = unused, int h = unused (always 8)
|
||||
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling function)
hadamard8x8_diff %+ SUFFIX:
|
||||
lea r0, [r3*3]
|
||||
|
||||
; first 4x8 pixels
|
||||
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
|
||||
HADAMARD8
|
||||
mova [rsp+gprsize+0x60], m7
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 7
|
||||
STORE4 rsp+gprsize, m0, m1, m2, m3
|
||||
mova m7, [rsp+gprsize+0x60]
|
||||
TRANSPOSE4x4W 4, 5, 6, 7, 0
|
||||
STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
|
||||
|
||||
; second 4x8 pixels
|
||||
DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
|
||||
HADAMARD8
|
||||
mova [rsp+gprsize+0x60], m7
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 7
|
||||
STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
|
||||
mova m7, [rsp+gprsize+0x60]
|
||||
TRANSPOSE4x4W 4, 5, 6, 7, 0
|
||||
|
||||
LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
|
||||
HADAMARD8
|
||||
ABS_SUM_8x8_32 rsp+gprsize+0x60
|
||||
mova [rsp+gprsize+0x60], m0
|
||||
|
||||
LOAD4 rsp+gprsize , m0, m1, m2, m3
|
||||
LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
|
||||
HADAMARD8
|
||||
ABS_SUM_8x8_32 rsp+gprsize
|
||||
paddusw m0, [rsp+gprsize+0x60]
|
||||
|
||||
HSUM m0, m1, eax
|
||||
and rax, 0xFFFF
|
||||
ret
|
||||
|
||||
hadamard8_16_wrapper 0, 14
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmx
|
||||
%define ABS1 ABS1_MMX
|
||||
HADAMARD8_DIFF
|
||||
|
||||
INIT_MMX mmxext
|
||||
%define ABS1 ABS1_MMXEXT
|
||||
HADAMARD8_DIFF
|
||||
|
||||
INIT_XMM sse2
|
||||
%define ABS2 ABS2_MMXEXT
|
||||
%if ARCH_X86_64
|
||||
%define ABS_SUM_8x8 ABS_SUM_8x8_64
|
||||
%else
|
||||
%define ABS_SUM_8x8 ABS_SUM_8x8_32
|
||||
%endif
|
||||
HADAMARD8_DIFF 10
|
||||
|
||||
INIT_XMM ssse3
|
||||
%define ABS2 ABS2_SSSE3
|
||||
%define ABS_SUM_8x8 ABS_SUM_8x8_64
|
||||
HADAMARD8_DIFF 9
|
||||
|
||||
INIT_XMM sse2
|
||||
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
|
||||
cglobal sse16, 5, 5, 8
|
||||
shr r4d, 1
|
||||
pxor m0, m0 ; mm0 = 0
|
||||
pxor m7, m7 ; mm7 holds the sum
|
||||
|
||||
.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
|
||||
movu m1, [r1 ] ; mm1 = pix1[0][0-15]
|
||||
movu m2, [r2 ] ; mm2 = pix2[0][0-15]
|
||||
movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
|
||||
movu m4, [r2+r3] ; mm4 = pix2[1][0-15]
|
||||
|
||||
; todo: mm1-mm2, mm3-mm4
|
||||
; algo: subtract mm1 from mm2 with saturation and vice versa
|
||||
; OR the result to get the absolute difference
|
||||
mova m5, m1
|
||||
mova m6, m3
|
||||
psubusb m1, m2
|
||||
psubusb m3, m4
|
||||
psubusb m2, m5
|
||||
psubusb m4, m6
|
||||
|
||||
por m2, m1
|
||||
por m4, m3
|
||||
|
||||
; now convert to 16-bit vectors so we can square them
|
||||
mova m1, m2
|
||||
mova m3, m4
|
||||
|
||||
punpckhbw m2, m0
|
||||
punpckhbw m4, m0
|
||||
punpcklbw m1, m0 ; mm1 not spread over (mm1,mm2)
|
||||
punpcklbw m3, m0 ; mm4 not spread over (mm3,mm4)
|
||||
|
||||
pmaddwd m2, m2
|
||||
pmaddwd m4, m4
|
||||
pmaddwd m1, m1
|
||||
pmaddwd m3, m3
|
||||
|
||||
lea r1, [r1+r3*2] ; pix1 += 2*line_size
|
||||
lea r2, [r2+r3*2] ; pix2 += 2*line_size
|
||||
|
||||
paddd m1, m2
|
||||
paddd m3, m4
|
||||
paddd m7, m1
|
||||
paddd m7, m3
|
||||
|
||||
dec r4
|
||||
jnz .next2lines
|
||||
|
||||
mova m1, m7
|
||||
psrldq m7, 8 ; shift hi qword to lo
|
||||
paddd m7, m1
|
||||
mova m1, m7
|
||||
psrldq m7, 4 ; shift hi dword to lo
|
||||
paddd m7, m1
|
||||
movd eax, m7 ; return value
|
||||
RET
|
||||
1219
project/jni/ffmpeg/libavcodec/x86/dsputilenc_mmx.c
Normal file
File diff suppressed because it is too large
202
project/jni/ffmpeg/libavcodec/x86/dwt.c
Normal file
@@ -0,0 +1,202 @@
/*
|
||||
* MMX optimized discrete wavelet transform
|
||||
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
|
||||
* Copyright (c) 2010 David Conrad
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/x86/asm.h"
|
||||
#include "dsputil_mmx.h"
|
||||
#include "dwt.h"
|
||||
|
||||
#define COMPOSE_VERTICAL(ext, align) \
|
||||
void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
|
||||
void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
|
||||
void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
|
||||
void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
|
||||
void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
|
||||
void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
|
||||
void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
|
||||
\
|
||||
static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
|
||||
{ \
|
||||
int i, width_align = width&~(align-1); \
|
||||
\
|
||||
for(i=width_align; i<width; i++) \
|
||||
b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
|
||||
\
|
||||
ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
|
||||
} \
|
||||
\
|
||||
static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
|
||||
{ \
|
||||
int i, width_align = width&~(align-1); \
|
||||
\
|
||||
for(i=width_align; i<width; i++) \
|
||||
b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
|
||||
\
|
||||
ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
|
||||
} \
|
||||
\
|
||||
static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
|
||||
IDWTELEM *b3, IDWTELEM *b4, int width) \
|
||||
{ \
|
||||
int i, width_align = width&~(align-1); \
|
||||
\
|
||||
for(i=width_align; i<width; i++) \
|
||||
b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
|
||||
\
|
||||
ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
|
||||
} \
|
||||
\
|
||||
static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
|
||||
IDWTELEM *b3, IDWTELEM *b4, int width) \
|
||||
{ \
|
||||
int i, width_align = width&~(align-1); \
|
||||
\
|
||||
for(i=width_align; i<width; i++) \
|
||||
b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
|
||||
\
|
||||
ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
|
||||
} \
|
||||
static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
|
||||
{ \
|
||||
int i, width_align = width&~(align-1); \
|
||||
\
|
||||
for(i=width_align; i<width; i++) { \
|
||||
b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
|
||||
b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
|
||||
} \
|
||||
\
|
||||
ff_vertical_compose_haar##ext(b0, b1, width_align); \
|
||||
} \
|
||||
static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
|
||||
{\
|
||||
int w2= w>>1;\
|
||||
int x= w2 - (w2&(align-1));\
|
||||
ff_horizontal_compose_haar0i##ext(b, tmp, w);\
|
||||
\
|
||||
for (; x < w2; x++) {\
|
||||
b[2*x ] = tmp[x];\
|
||||
b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
|
||||
}\
|
||||
}\
|
||||
static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
|
||||
{\
|
||||
int w2= w>>1;\
|
||||
int x= w2 - (w2&(align-1));\
|
||||
ff_horizontal_compose_haar1i##ext(b, tmp, w);\
|
||||
\
|
||||
for (; x < w2; x++) {\
|
||||
b[2*x ] = (tmp[x] + 1)>>1;\
|
||||
b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
|
||||
}\
|
||||
}\
|
||||
\
|
||||
|
||||
#if HAVE_YASM
|
||||
#if !ARCH_X86_64
|
||||
COMPOSE_VERTICAL(_mmx, 4)
|
||||
#endif
|
||||
COMPOSE_VERTICAL(_sse2, 8)
|
||||
|
||||
|
||||
void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
|
||||
|
||||
static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
{
    int w2= w>>1;
    int x= w2 - (w2&7);
    ff_horizontal_compose_dd97i_ssse3(b, tmp, w);

    for (; x < w2; x++) {
        b[2*x  ] = (tmp[x] + 1)>>1;
        b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
    }
}
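
/* Same pattern as the COMPOSE_VERTICAL wrappers: the yasm routine processes
 * the SIMD-width-aligned prefix of the row and this C tail finishes the last
 * few values (up to 7 with SSE) that do not fill a whole register. */
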
#endif
|
||||
|
||||
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
|
||||
{
|
||||
#if HAVE_YASM
|
||||
int mm_flags = av_get_cpu_flags();
|
||||
|
||||
#if !ARCH_X86_64
|
||||
if (!(mm_flags & AV_CPU_FLAG_MMX))
|
||||
return;
|
||||
|
||||
switch (type) {
|
||||
case DWT_DIRAC_DD9_7:
|
||||
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
|
||||
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
|
||||
break;
|
||||
case DWT_DIRAC_LEGALL5_3:
|
||||
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
|
||||
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
|
||||
break;
|
||||
case DWT_DIRAC_DD13_7:
|
||||
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
|
||||
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
|
||||
break;
|
||||
case DWT_DIRAC_HAAR0:
|
||||
d->vertical_compose = (void*)vertical_compose_haar_mmx;
|
||||
d->horizontal_compose = horizontal_compose_haar0i_mmx;
|
||||
break;
|
||||
case DWT_DIRAC_HAAR1:
|
||||
d->vertical_compose = (void*)vertical_compose_haar_mmx;
|
||||
d->horizontal_compose = horizontal_compose_haar1i_mmx;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!(mm_flags & AV_CPU_FLAG_SSE2))
|
||||
return;
|
||||
|
||||
switch (type) {
|
||||
case DWT_DIRAC_DD9_7:
|
||||
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
|
||||
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
|
||||
break;
|
||||
case DWT_DIRAC_LEGALL5_3:
|
||||
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
|
||||
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
|
||||
break;
|
||||
case DWT_DIRAC_DD13_7:
|
||||
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
|
||||
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
|
||||
break;
|
||||
case DWT_DIRAC_HAAR0:
|
||||
d->vertical_compose = (void*)vertical_compose_haar_sse2;
|
||||
d->horizontal_compose = horizontal_compose_haar0i_sse2;
|
||||
break;
|
||||
case DWT_DIRAC_HAAR1:
|
||||
d->vertical_compose = (void*)vertical_compose_haar_sse2;
|
||||
d->horizontal_compose = horizontal_compose_haar1i_sse2;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!(mm_flags & AV_CPU_FLAG_SSSE3))
|
||||
return;
|
||||
|
||||
switch (type) {
|
||||
case DWT_DIRAC_DD9_7:
|
||||
d->horizontal_compose = horizontal_compose_dd97i_ssse3;
|
||||
break;
|
||||
}
|
||||
#endif // HAVE_YASM
|
||||
}
|
||||
30
project/jni/ffmpeg/libavcodec/x86/dwt.h
Normal file
@@ -0,0 +1,30 @@
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#ifndef AVCODEC_X86_DWT_H
|
||||
#define AVCODEC_X86_DWT_H
|
||||
|
||||
#include "libavcodec/dwt.h"
|
||||
|
||||
void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
|
||||
void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
|
||||
void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
|
||||
|
||||
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
|
||||
|
||||
#endif
|
||||
291
project/jni/ffmpeg/libavcodec/x86/dwt_yasm.asm
Normal file
@@ -0,0 +1,291 @@
;******************************************************************************
|
||||
;* MMX optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
pw_1: times 8 dw 1
|
||||
pw_2: times 8 dw 2
|
||||
pw_8: times 8 dw 8
|
||||
pw_16: times 8 dw 16
|
||||
pw_1991: times 4 dw 9,-1
|
||||
|
||||
section .text
|
||||
|
||||
; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
|
||||
%macro COMPOSE_53iL0 4
|
||||
paddw %2, %3
|
||||
paddw %2, %4
|
||||
psraw %2, 2
|
||||
psubw %1, %2
|
||||
%endm
|
||||
|
||||
; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
|
||||
; if %4 is supplied, %1 is loaded unaligned from there
|
||||
; m2: clobbered m3: pw_8 m4: pw_1991
|
||||
%macro COMPOSE_DD97iH0 3-4
|
||||
paddw m0, %3
|
||||
paddw m1, %2
|
||||
psubw m0, m3
|
||||
mova m2, m1
|
||||
punpcklwd m1, m0
|
||||
punpckhwd m2, m0
|
||||
pmaddwd m1, m4
|
||||
pmaddwd m2, m4
|
||||
%if %0 > 3
|
||||
movu %1, %4
|
||||
%endif
|
||||
psrad m1, 4
|
||||
psrad m2, 4
|
||||
packssdw m1, m2
|
||||
paddw m1, %1
|
||||
%endm
|
||||
|
||||
%macro COMPOSE_VERTICAL 1
|
||||
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
|
||||
; int width)
|
||||
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
|
||||
mova m2, [pw_2]
|
||||
.loop:
|
||||
sub widthq, mmsize/2
|
||||
mova m1, [b0q+2*widthq]
|
||||
mova m0, [b1q+2*widthq]
|
||||
COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
|
||||
mova [b1q+2*widthq], m0
|
||||
jg .loop
|
||||
REP_RET
|
||||
|
||||
; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
|
||||
; int width)
|
||||
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
|
||||
mova m1, [pw_1]
|
||||
.loop:
|
||||
sub widthq, mmsize/2
|
||||
mova m0, [b0q+2*widthq]
|
||||
paddw m0, [b2q+2*widthq]
|
||||
paddw m0, m1
|
||||
psraw m0, 1
|
||||
paddw m0, [b1q+2*widthq]
|
||||
mova [b1q+2*widthq], m0
|
||||
jg .loop
|
||||
REP_RET
|
||||
|
||||
; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
|
||||
; IDWTELEM *b3, IDWTELEM *b4, int width)
|
||||
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
|
||||
mova m3, [pw_8]
|
||||
mova m4, [pw_1991]
|
||||
.loop:
|
||||
sub widthq, mmsize/2
|
||||
mova m0, [b0q+2*widthq]
|
||||
mova m1, [b1q+2*widthq]
|
||||
COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
|
||||
mova [b2q+2*widthq], m1
|
||||
jg .loop
|
||||
REP_RET
|
||||
|
||||
; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
|
||||
; IDWTELEM *b3, IDWTELEM *b4, int width)
|
||||
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
|
||||
mova m3, [pw_16]
|
||||
mova m4, [pw_1991]
|
||||
.loop:
|
||||
sub widthq, mmsize/2
|
||||
mova m0, [b0q+2*widthq]
|
||||
mova m1, [b1q+2*widthq]
|
||||
mova m5, [b2q+2*widthq]
|
||||
paddw m0, [b4q+2*widthq]
|
||||
paddw m1, [b3q+2*widthq]
|
||||
psubw m0, m3
|
||||
mova m2, m1
|
||||
punpcklwd m1, m0
|
||||
punpckhwd m2, m0
|
||||
pmaddwd m1, m4
|
||||
pmaddwd m2, m4
|
||||
psrad m1, 5
|
||||
psrad m2, 5
|
||||
packssdw m1, m2
|
||||
psubw m5, m1
|
||||
mova [b2q+2*widthq], m5
|
||||
jg .loop
|
||||
REP_RET
|
||||
|
||||
; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
|
||||
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
|
||||
mova m3, [pw_1]
|
||||
.loop:
|
||||
sub widthq, mmsize/2
|
||||
mova m1, [b1q+2*widthq]
|
||||
mova m0, [b0q+2*widthq]
|
||||
mova m2, m1
|
||||
paddw m1, m3
|
||||
psraw m1, 1
|
||||
psubw m0, m1
|
||||
mova [b0q+2*widthq], m0
|
||||
paddw m2, m0
|
||||
mova [b1q+2*widthq], m2
|
||||
jg .loop
|
||||
REP_RET
|
||||
%endmacro

; extend the left and right edges of the tmp array by %1 and %2 respectively
%macro EDGE_EXTENSION 3
    mov %3, [tmpq]
%assign %%i 1
%rep %1
    mov [tmpq-2*%%i], %3
    %assign %%i %%i+1
%endrep
    mov %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
    mov [tmpq+2*w2q+2*%%i], %3
    %assign %%i %%i+1
%endrep
%endmacro
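
; Scalar C model of EDGE_EXTENSION (illustrative only; left/right correspond
; to %1/%2, and w2 is half the transform width):
;
;     static void edge_extension_c(int16_t *tmp, int w2, int left, int right)
;     {
;         for (int i = 1; i <= left; i++)
;             tmp[-i] = tmp[0];
;         for (int i = 0; i < right; i++)
;             tmp[w2 + i] = tmp[w2 - 1];
;     }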


%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
    mov w2d, wd
    xor xq, xq
    shr w2d, 1
    lea b_w2q, [bq+wq]
    mova m3, [pw_1]
.lowpass_loop:
    movu m1, [b_w2q + 2*xq]
    mova m0, [bq + 2*xq]
    paddw m1, m3
    psraw m1, 1
    psubw m0, m1
    mova [tmpq + 2*xq], m0
    add xq, mmsize/2
    cmp xq, w2q
    jl .lowpass_loop

    xor xq, xq
    and w2q, ~(mmsize/2 - 1)
    cmp w2q, mmsize/2
    jl .end

.highpass_loop:
    movu m1, [b_w2q + 2*xq]
    mova m0, [tmpq + 2*xq]
    paddw m1, m0

    ; shift and interleave
%if %2 == 1
    paddw m0, m3
    paddw m1, m3
    psraw m0, 1
    psraw m1, 1
%endif
    mova m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    mova [bq+4*xq], m0
    mova [bq+4*xq+mmsize], m2

    add xq, mmsize/2
    cmp xq, w2q
    jl .highpass_loop
.end:
    REP_RET
%endmacro


INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
    mov w2d, wd
    xor xd, xd
    shr w2d, 1
    lea b_w2q, [bq+wq]
    movu m4, [bq+wq]
    mova m7, [pw_2]
    pslldq m4, 14
.lowpass_loop:
    movu m1, [b_w2q + 2*xq]
    mova m0, [bq + 2*xq]
    mova m2, m1
    palignr m1, m4, 14
    mova m4, m2
    COMPOSE_53iL0 m0, m1, m2, m7
    mova [tmpq + 2*xq], m0
    add xd, mmsize/2
    cmp xd, w2d
    jl .lowpass_loop

    EDGE_EXTENSION 1, 2, xw
    ; leave the last up to 7 (sse) or 3 (mmx) values for C
    xor xd, xd
    and w2d, ~(mmsize/2 - 1)
    cmp w2d, mmsize/2
    jl .end

    mova m7, [tmpq-mmsize]
    mova m0, [tmpq]
    mova m5, [pw_1]
    mova m3, [pw_8]
    mova m4, [pw_1991]
.highpass_loop:
    mova m6, m0
    palignr m0, m7, 14
    mova m7, [tmpq + 2*xq + 16]
    mova m1, m7
    mova m2, m7
    palignr m1, m6, 2
    palignr m2, m6, 4
    COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
    mova m0, m7
    mova m7, m6

    ; shift and interleave
    paddw m6, m5
    paddw m1, m5
    psraw m6, 1
    psraw m1, 1
    mova m2, m6
    punpcklwd m6, m1
    punpckhwd m2, m1
    mova [bq+4*xq], m6
    mova [bq+4*xq+mmsize], m2

    add xd, mmsize/2
    cmp xd, w2d
    jl .highpass_loop
.end:
    REP_RET


%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif

;;INIT_XMM
INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1
586
project/jni/ffmpeg/libavcodec/x86/fdct.c
Normal file
586
project/jni/ffmpeg/libavcodec/x86/fdct.c
Normal file
@@ -0,0 +1,586 @@
/*
 * MMX optimized forward DCT
 * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
 * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
 *
 * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
 *
 * Intel Application Note AP-922 - fast, precise implementation of DCT
 * http://developer.intel.com/vtune/cbts/appnotes.htm
 *
 * Also of inspiration:
 * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
 * Skal's fdct at http://skal.planet-d.net/coding/dct.html
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"

#if HAVE_INLINE_ASM

//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries!  Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////
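
/* Illustrative note (not in the original source): DECLARE_ALIGNED(n, t, v)
 * expands to a compiler-specific alignment attribute, roughly the GCC form
 *
 *     static const int16_t tab[8] __attribute__((aligned(16))) = { X8(1) };
 *
 * which is what keeps every constant below on an 8/16-byte boundary. A quick
 * way to verify the guarantee the comment above demands is a runtime check:
 *
 *     #include <assert.h>
 *     #include <stdint.h>
 *     assert(((uintptr_t)&fdct_tg_all_16[0] & 7) == 0);
 */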

#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))

#define X8(x) x,x,x,x,x,x,x,x

//concatenated table, for forward DCT transformation
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
    X8(13036),  // tg * (2<<16) + 0.5
    X8(27146),  // tg * (2<<16) + 0.5
    X8(-21746)  // tg * (2<<16) + 0.5
};

DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
    X8(23170)   //cos * (2<<15) + 0.5
};

DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };

DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };

static const struct
{
    DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
{{
    RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};

DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
    16384, 16384, 22725, 19266,
    16384, 16384, 12873, 4520,
    21407, 8867, 19266, -4520,
    -8867, -21407, -22725, -12873,
    16384, -16384, 12873, -22725,
    -16384, 16384, 4520, 19266,
    8867, -21407, 4520, -12873,
    21407, -8867, 19266, -22725,

    22725, 22725, 31521, 26722,
    22725, 22725, 17855, 6270,
    29692, 12299, 26722, -6270,
    -12299, -29692, -31521, -17855,
    22725, -22725, 17855, -31521,
    -22725, 22725, 6270, 26722,
    12299, -29692, 6270, -17855,
    29692, -12299, 26722, -31521,

    21407, 21407, 29692, 25172,
    21407, 21407, 16819, 5906,
    27969, 11585, 25172, -5906,
    -11585, -27969, -29692, -16819,
    21407, -21407, 16819, -29692,
    -21407, 21407, 5906, 25172,
    11585, -27969, 5906, -16819,
    27969, -11585, 25172, -29692,

    19266, 19266, 26722, 22654,
    19266, 19266, 15137, 5315,
    25172, 10426, 22654, -5315,
    -10426, -25172, -26722, -15137,
    19266, -19266, 15137, -26722,
    -19266, 19266, 5315, 22654,
    10426, -25172, 5315, -15137,
    25172, -10426, 22654, -26722,

    16384, 16384, 22725, 19266,
    16384, 16384, 12873, 4520,
    21407, 8867, 19266, -4520,
    -8867, -21407, -22725, -12873,
    16384, -16384, 12873, -22725,
    -16384, 16384, 4520, 19266,
    8867, -21407, 4520, -12873,
    21407, -8867, 19266, -22725,

    19266, 19266, 26722, 22654,
    19266, 19266, 15137, 5315,
    25172, 10426, 22654, -5315,
    -10426, -25172, -26722, -15137,
    19266, -19266, 15137, -26722,
    -19266, 19266, 5315, 22654,
    10426, -25172, 5315, -15137,
    25172, -10426, 22654, -26722,

    21407, 21407, 29692, 25172,
    21407, 21407, 16819, 5906,
    27969, 11585, 25172, -5906,
    -11585, -27969, -29692, -16819,
    21407, -21407, 16819, -29692,
    -21407, 21407, 5906, 25172,
    11585, -27969, 5906, -16819,
    27969, -11585, 25172, -29692,

    22725, 22725, 31521, 26722,
    22725, 22725, 17855, 6270,
    29692, 12299, 26722, -6270,
    -12299, -29692, -31521, -17855,
    22725, -22725, 17855, -31521,
    -22725, 22725, 6270, 26722,
    12299, -29692, 6270, -17855,
    29692, -12299, 26722, -31521,
};

static const struct
{
    DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
        C4, C4, C5, C7, C2, C6, C3, -C7, \
        -C4, C4, C7, C3, C6, -C2, C7, -C5, \
        C4, -C4, C5, -C1, C2, -C6, C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2

#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};

#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long

#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
    __asm__ volatile (\
        #mov" 16(%0), %%"#mm"0 \n\t" \
        #mov" 96(%0), %%"#mm"1 \n\t" \
        #mov" %%"#mm"0, %%"#mm"2 \n\t" \
        #mov" 32(%0), %%"#mm"3 \n\t" \
        "paddsw %%"#mm"1, %%"#mm"0 \n\t" \
        #mov" 80(%0), %%"#mm"4 \n\t" \
        "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
        #mov" (%0), %%"#mm"5 \n\t" \
        "paddsw %%"#mm"3, %%"#mm"4 \n\t" \
        "paddsw 112(%0), %%"#mm"5 \n\t" \
        "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
        #mov" %%"#mm"0, %%"#mm"6 \n\t" \
        "psubsw %%"#mm"1, %%"#mm"2 \n\t" \
        #mov" 16(%1), %%"#mm"1 \n\t" \
        "psubsw %%"#mm"4, %%"#mm"0 \n\t" \
        #mov" 48(%0), %%"#mm"7 \n\t" \
        "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
        "paddsw 64(%0), %%"#mm"7 \n\t" \
        "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
        "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
        "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
        #mov" %%"#mm"5, %%"#mm"4 \n\t" \
        "psubsw %%"#mm"7, %%"#mm"5 \n\t" \
        "paddsw %%"#mm"5, %%"#mm"1 \n\t" \
        "paddsw %%"#mm"7, %%"#mm"4 \n\t" \
        "por (%2), %%"#mm"1 \n\t" \
        "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
        "pmulhw 16(%1), %%"#mm"5 \n\t" \
        #mov" %%"#mm"4, %%"#mm"7 \n\t" \
        "psubsw 80(%0), %%"#mm"3 \n\t" \
        "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
        #mov" %%"#mm"1, 32(%3) \n\t" \
        "paddsw %%"#mm"6, %%"#mm"7 \n\t" \
        #mov" 48(%0), %%"#mm"1 \n\t" \
        "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
        "psubsw 64(%0), %%"#mm"1 \n\t" \
        #mov" %%"#mm"2, %%"#mm"6 \n\t" \
        #mov" %%"#mm"4, 64(%3) \n\t" \
        "paddsw %%"#mm"3, %%"#mm"2 \n\t" \
        "pmulhw (%4), %%"#mm"2 \n\t" \
        "psubsw %%"#mm"3, %%"#mm"6 \n\t" \
        "pmulhw (%4), %%"#mm"6 \n\t" \
        "psubsw %%"#mm"0, %%"#mm"5 \n\t" \
        "por (%2), %%"#mm"5 \n\t" \
        "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
        "por (%2), %%"#mm"2 \n\t" \
        #mov" %%"#mm"1, %%"#mm"4 \n\t" \
        #mov" (%0), %%"#mm"3 \n\t" \
        "paddsw %%"#mm"6, %%"#mm"1 \n\t" \
        "psubsw 112(%0), %%"#mm"3 \n\t" \
        "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
        #mov" (%1), %%"#mm"0 \n\t" \
        "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
        #mov" 32(%1), %%"#mm"6 \n\t" \
        "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
        #mov" %%"#mm"7, (%3) \n\t" \
        "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
        #mov" %%"#mm"5, 96(%3) \n\t" \
        #mov" %%"#mm"3, %%"#mm"7 \n\t" \
        #mov" 32(%1), %%"#mm"5 \n\t" \
        "psubsw %%"#mm"2, %%"#mm"7 \n\t" \
        "paddsw %%"#mm"2, %%"#mm"3 \n\t" \
        "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
        "paddsw %%"#mm"3, %%"#mm"0 \n\t" \
        "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
        "pmulhw (%1), %%"#mm"3 \n\t" \
        "por (%2), %%"#mm"0 \n\t" \
        "paddsw %%"#mm"7, %%"#mm"5 \n\t" \
        "psubsw %%"#mm"6, %%"#mm"7 \n\t" \
        #mov" %%"#mm"0, 16(%3) \n\t" \
        "paddsw %%"#mm"4, %%"#mm"5 \n\t" \
        #mov" %%"#mm"7, 48(%3) \n\t" \
        "psubsw %%"#mm"1, %%"#mm"3 \n\t" \
        #mov" %%"#mm"5, 80(%3) \n\t" \
        #mov" %%"#mm"3, 112(%3) \n\t" \
        : \
        : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
          "r" (out + offset), "r" (ocos_4_16)); \
}

FDCT_COL(mmx, mm, movq)
FDCT_COL(sse2, xmm, movdqa)

static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
    __asm__ volatile(
#define FDCT_ROW_SSE2_H1(i,t) \
        "movq " #i "(%0), %%xmm2 \n\t" \
        "movq " #i "+8(%0), %%xmm0 \n\t" \
        "movdqa " #t "+32(%1), %%xmm3 \n\t" \
        "movdqa " #t "+48(%1), %%xmm7 \n\t" \
        "movdqa " #t "(%1), %%xmm4 \n\t" \
        "movdqa " #t "+16(%1), %%xmm5 \n\t"

#define FDCT_ROW_SSE2_H2(i,t) \
        "movq " #i "(%0), %%xmm2 \n\t" \
        "movq " #i "+8(%0), %%xmm0 \n\t" \
        "movdqa " #t "+32(%1), %%xmm3 \n\t" \
        "movdqa " #t "+48(%1), %%xmm7 \n\t"

#define FDCT_ROW_SSE2(i) \
        "movq %%xmm2, %%xmm1 \n\t" \
        "pshuflw $27, %%xmm0, %%xmm0 \n\t" \
        "paddsw %%xmm0, %%xmm1 \n\t" \
        "psubsw %%xmm0, %%xmm2 \n\t" \
        "punpckldq %%xmm2, %%xmm1 \n\t" \
        "pshufd $78, %%xmm1, %%xmm2 \n\t" \
        "pmaddwd %%xmm2, %%xmm3 \n\t" \
        "pmaddwd %%xmm1, %%xmm7 \n\t" \
        "pmaddwd %%xmm5, %%xmm2 \n\t" \
        "pmaddwd %%xmm4, %%xmm1 \n\t" \
        "paddd %%xmm7, %%xmm3 \n\t" \
        "paddd %%xmm2, %%xmm1 \n\t" \
        "paddd %%xmm6, %%xmm3 \n\t" \
        "paddd %%xmm6, %%xmm1 \n\t" \
        "psrad %3, %%xmm3 \n\t" \
        "psrad %3, %%xmm1 \n\t" \
        "packssdw %%xmm3, %%xmm1 \n\t" \
        "movdqa %%xmm1, " #i "(%4) \n\t"

        "movdqa (%2), %%xmm6 \n\t"
        FDCT_ROW_SSE2_H1(0,0)
        FDCT_ROW_SSE2(0)
        FDCT_ROW_SSE2_H2(64,0)
        FDCT_ROW_SSE2(64)

        FDCT_ROW_SSE2_H1(16,64)
        FDCT_ROW_SSE2(16)
        FDCT_ROW_SSE2_H2(112,64)
        FDCT_ROW_SSE2(112)

        FDCT_ROW_SSE2_H1(32,128)
        FDCT_ROW_SSE2(32)
        FDCT_ROW_SSE2_H2(96,128)
        FDCT_ROW_SSE2(96)

        FDCT_ROW_SSE2_H1(48,192)
        FDCT_ROW_SSE2(48)
        FDCT_ROW_SSE2_H2(80,192)
        FDCT_ROW_SSE2(80)
        :
        : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
          "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
    );
}

static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
                                             const int16_t *table)
{
    __asm__ volatile (
        "pshufw $0x1B, 8(%0), %%mm5 \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "paddsw %%mm5, %%mm0 \n\t"
        "psubsw %%mm5, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "punpckldq %%mm1, %%mm0 \n\t"
        "punpckhdq %%mm1, %%mm2 \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq 8(%1), %%mm3 \n\t"
        "movq 16(%1), %%mm4 \n\t"
        "movq 24(%1), %%mm5 \n\t"
        "movq 32(%1), %%mm6 \n\t"
        "movq 40(%1), %%mm7 \n\t"
        "pmaddwd %%mm0, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm0, %%mm4 \n\t"
        "pmaddwd %%mm2, %%mm5 \n\t"
        "pmaddwd %%mm0, %%mm6 \n\t"
        "pmaddwd %%mm2, %%mm7 \n\t"
        "pmaddwd 48(%1), %%mm0 \n\t"
        "pmaddwd 56(%1), %%mm2 \n\t"
        "paddd %%mm1, %%mm3 \n\t"
        "paddd %%mm4, %%mm5 \n\t"
        "paddd %%mm6, %%mm7 \n\t"
        "paddd %%mm0, %%mm2 \n\t"
        "movq (%2), %%mm0 \n\t"
        "paddd %%mm0, %%mm3 \n\t"
        "paddd %%mm0, %%mm5 \n\t"
        "paddd %%mm0, %%mm7 \n\t"
        "paddd %%mm0, %%mm2 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
        "packssdw %%mm5, %%mm3 \n\t"
        "packssdw %%mm2, %%mm7 \n\t"
        "movq %%mm3, (%3) \n\t"
        "movq %%mm7, 8(%3) \n\t"
        :
        : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}

static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
    //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
    __asm__ volatile(
        "movd 12(%0), %%mm1 \n\t"
        "punpcklwd 8(%0), %%mm1 \n\t"
        "movq %%mm1, %%mm2 \n\t"
        "psrlq $0x20, %%mm1 \n\t"
        "movq 0(%0), %%mm0 \n\t"
        "punpcklwd %%mm2, %%mm1 \n\t"
        "movq %%mm0, %%mm5 \n\t"
        "paddsw %%mm1, %%mm0 \n\t"
        "psubsw %%mm1, %%mm5 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "punpckldq %%mm5, %%mm0 \n\t"
        "punpckhdq %%mm5, %%mm2 \n\t"
        "movq 0(%1), %%mm1 \n\t"
        "movq 8(%1), %%mm3 \n\t"
        "movq 16(%1), %%mm4 \n\t"
        "movq 24(%1), %%mm5 \n\t"
        "movq 32(%1), %%mm6 \n\t"
        "movq 40(%1), %%mm7 \n\t"
        "pmaddwd %%mm0, %%mm1 \n\t"
        "pmaddwd %%mm2, %%mm3 \n\t"
        "pmaddwd %%mm0, %%mm4 \n\t"
        "pmaddwd %%mm2, %%mm5 \n\t"
        "pmaddwd %%mm0, %%mm6 \n\t"
        "pmaddwd %%mm2, %%mm7 \n\t"
        "pmaddwd 48(%1), %%mm0 \n\t"
        "pmaddwd 56(%1), %%mm2 \n\t"
        "paddd %%mm1, %%mm3 \n\t"
        "paddd %%mm4, %%mm5 \n\t"
        "paddd %%mm6, %%mm7 \n\t"
        "paddd %%mm0, %%mm2 \n\t"
        "movq (%2), %%mm0 \n\t"
        "paddd %%mm0, %%mm3 \n\t"
        "paddd %%mm0, %%mm5 \n\t"
        "paddd %%mm0, %%mm7 \n\t"
        "paddd %%mm0, %%mm2 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
        "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
        "packssdw %%mm5, %%mm3 \n\t"
        "packssdw %%mm2, %%mm7 \n\t"
        "movq %%mm3, 0(%3) \n\t"
        "movq %%mm7, 8(%3) \n\t"
        :
        : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}

void ff_fdct_mmx(int16_t *block)
{
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t * block1= (int16_t*)align_tmp;
    const int16_t *table= tab_frw_01234567;
    int i;

    fdct_col_mmx(block, block1, 0);
    fdct_col_mmx(block, block1, 4);

    for(i=8;i>0;i--) {
        fdct_row_mmx(block1, block, table);
        block1 += 8;
        table += 32;
        block += 8;
    }
}

void ff_fdct_mmxext(int16_t *block)
{
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t *block1= (int16_t*)align_tmp;
    const int16_t *table= tab_frw_01234567;
    int i;

    fdct_col_mmx(block, block1, 0);
    fdct_col_mmx(block, block1, 4);

    for(i=8;i>0;i--) {
        fdct_row_mmxext(block1, block, table);
        block1 += 8;
        table += 32;
        block += 8;
    }
}

void ff_fdct_sse2(int16_t *block)
{
    DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
    int16_t * const block1= (int16_t*)align_tmp;

    fdct_col_sse2(block, block1, 0);
    fdct_row_sse2(block1, block);
}

#endif /* HAVE_INLINE_ASM */
1093
project/jni/ffmpeg/libavcodec/x86/fft.asm
Normal file
1093
project/jni/ffmpeg/libavcodec/x86/fft.asm
Normal file
File diff suppressed because it is too large
41
project/jni/ffmpeg/libavcodec/x86/fft.h
Normal file
41
project/jni/ffmpeg/libavcodec/x86/fft.h
Normal file
@@ -0,0 +1,41 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H

#include "libavcodec/fft.h"

void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);

void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);

#endif /* AVCODEC_X86_FFT_H */
69
project/jni/ffmpeg/libavcodec/x86/fft_init.c
Normal file
69
project/jni/ffmpeg/libavcodec/x86/fft_init.c
Normal file
@@ -0,0 +1,69 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/dct.h"
#include "fft.h"

av_cold void ff_fft_init_x86(FFTContext *s)
{
    int has_vectors = av_get_cpu_flags();
#if ARCH_X86_32
    if (EXTERNAL_AMD3DNOW(has_vectors)) {
        /* 3DNow! for K6-2/3 */
        s->imdct_calc = ff_imdct_calc_3dnow;
        s->imdct_half = ff_imdct_half_3dnow;
        s->fft_calc   = ff_fft_calc_3dnow;
    }
    if (EXTERNAL_AMD3DNOWEXT(has_vectors)) {
        /* 3DNowEx for K7 */
        s->imdct_calc = ff_imdct_calc_3dnowext;
        s->imdct_half = ff_imdct_half_3dnowext;
        s->fft_calc   = ff_fft_calc_3dnowext;
    }
#endif
    if (EXTERNAL_SSE(has_vectors)) {
        /* SSE for P3/P4/K8 */
        s->imdct_calc  = ff_imdct_calc_sse;
        s->imdct_half  = ff_imdct_half_sse;
        s->fft_permute = ff_fft_permute_sse;
        s->fft_calc    = ff_fft_calc_sse;
        s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
    }
    if (EXTERNAL_AVX(has_vectors) && s->nbits >= 5) {
        /* AVX for SB */
        s->imdct_half      = ff_imdct_half_avx;
        s->fft_calc        = ff_fft_calc_avx;
        s->fft_permutation = FF_FFT_PERM_AVX;
    }
}

#if CONFIG_DCT
av_cold void ff_dct_init_x86(DCTContext *s)
{
    int has_vectors = av_get_cpu_flags();
    if (EXTERNAL_SSE(has_vectors))
        s->dct32 = ff_dct32_float_sse;
    if (EXTERNAL_SSE2(has_vectors))
        s->dct32 = ff_dct32_float_sse2;
    if (EXTERNAL_AVX(has_vectors))
        s->dct32 = ff_dct32_float_avx;
}
#endif
429
project/jni/ffmpeg/libavcodec/x86/fmtconvert.asm
Normal file
429
project/jni/ffmpeg/libavcodec/x86/fmtconvert.asm
Normal file
@@ -0,0 +1,429 @@
;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_TEXT

%macro CVTPS2PI 2
%if cpuflag(sse)
    cvtps2pi %1, %2
%elif cpuflag(3dnow)
    pf2id %1, %2
%endif
%endmacro

;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;---------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    SWAP 0, 2
%elif ARCH_X86_32
    movss m0, mulm
%endif
    SPLATD m0
    shl lenq, 2
    add srcq, lenq
    add dstq, lenq
    neg lenq
.loop:
%if cpuflag(sse2)
    cvtdq2ps m1, [srcq+lenq ]
    cvtdq2ps m2, [srcq+lenq+16]
%else
    cvtpi2ps m1, [srcq+lenq ]
    cvtpi2ps m3, [srcq+lenq+ 8]
    cvtpi2ps m2, [srcq+lenq+16]
    cvtpi2ps m4, [srcq+lenq+24]
    movlhps m1, m3
    movlhps m2, m4
%endif
    mulps m1, m0
    mulps m2, m0
    mova [dstq+lenq ], m1
    mova [dstq+lenq+16], m2
    add lenq, 32
    jl .loop
    REP_RET
%endmacro

INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
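
; A scalar C model of the function above (illustrative only; the prototype
; comes from the comment preceding the macro):
;
;     static void int32_to_float_fmul_scalar_c(float *dst, const int *src,
;                                              float mul, int len)
;     {
;         for (int i = 0; i < len; i++)
;             dst[i] = src[i] * mul;
;     }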


;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
    add lenq, lenq
    lea srcq, [srcq+2*lenq]
    add dstq, lenq
    neg lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq m0, [srcq+2*lenq ]
    cvtps2dq m1, [srcq+2*lenq+16]
    packssdw m0, m1
    mova [dstq+lenq], m0
%else
    CVTPS2PI m0, [srcq+2*lenq ]
    CVTPS2PI m1, [srcq+2*lenq+ 8]
    CVTPS2PI m2, [srcq+2*lenq+16]
    CVTPS2PI m3, [srcq+2*lenq+24]
    packssdw m0, m1
    packssdw m2, m3
    mova [dstq+lenq ], m0
    mova [dstq+lenq+8], m2
%endif
    add lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0
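
; Scalar C model (illustrative): cvtps2dq rounds to nearest under the default
; MXCSR mode and packssdw saturates to the int16_t range, so the C equivalent
; is roughly (lrintf is from <math.h>):
;
;     static void float_to_int16_c(int16_t *dst, const float *src, long len)
;     {
;         for (long i = 0; i < len; i++) {
;             long v = lrintf(src[i]);   /* round to nearest */
;             dst[i] = v > 32767 ? 32767 : v < -32768 ? -32768 : v;
;         }
;     }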

;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
    add lenq, lenq
    lea srcq, [srcq+2*lenq]
    lea step3q, [stepq*3]
    neg lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq m0, [srcq+2*lenq ]
    cvtps2dq m1, [srcq+2*lenq+16]
    packssdw m0, m1
    movd v1d, m0
    psrldq m0, 4
    movd v2d, m0
    psrldq m0, 4
    mov [dstq], v1w
    mov [dstq+stepq*4], v2w
    shr v1d, 16
    shr v2d, 16
    mov [dstq+stepq*2], v1w
    mov [dstq+step3q*2], v2w
    lea dstq, [dstq+stepq*8]
    movd v1d, m0
    psrldq m0, 4
    movd v2d, m0
    mov [dstq], v1w
    mov [dstq+stepq*4], v2w
    shr v1d, 16
    shr v2d, 16
    mov [dstq+stepq*2], v1w
    mov [dstq+step3q*2], v2w
    lea dstq, [dstq+stepq*8]
%else
    CVTPS2PI m0, [srcq+2*lenq ]
    CVTPS2PI m1, [srcq+2*lenq+ 8]
    CVTPS2PI m2, [srcq+2*lenq+16]
    CVTPS2PI m3, [srcq+2*lenq+24]
    packssdw m0, m1
    packssdw m2, m3
    movd v1d, m0
    psrlq m0, 32
    movd v2d, m0
    mov [dstq], v1w
    mov [dstq+stepq*4], v2w
    shr v1d, 16
    shr v2d, 16
    mov [dstq+stepq*2], v1w
    mov [dstq+step3q*2], v2w
    lea dstq, [dstq+stepq*8]
    movd v1d, m2
    psrlq m2, 32
    movd v2d, m2
    mov [dstq], v1w
    mov [dstq+stepq*4], v2w
    shr v1d, 16
    shr v2d, 16
    mov [dstq+stepq*2], v1w
    mov [dstq+step3q*2], v2w
    lea dstq, [dstq+stepq*8]
%endif
    add lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0

;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
    lea lenq, [4*r2q]
    mov src1q, [src0q+gprsize]
    mov src0q, [src0q]
    add dstq, lenq
    add src0q, lenq
    add src1q, lenq
    neg lenq
.loop:
%if cpuflag(sse2)
    cvtps2dq m0, [src0q+lenq]
    cvtps2dq m1, [src1q+lenq]
    packssdw m0, m1
    movhlps m1, m0
    punpcklwd m0, m1
    mova [dstq+lenq], m0
%else
    CVTPS2PI m0, [src0q+lenq ]
    CVTPS2PI m1, [src0q+lenq+8]
    CVTPS2PI m2, [src1q+lenq ]
    CVTPS2PI m3, [src1q+lenq+8]
    packssdw m0, m1
    packssdw m2, m3
    mova m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2
    mova [dstq+lenq ], m0
    mova [dstq+lenq+8], m1
%endif
    add lenq, 16
    js .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2

%macro FLOAT_TO_INT16_INTERLEAVE6 0
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov lend, r2d
%else
    %define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq, [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    CVTPS2PI mm0, [srcq]
    CVTPS2PI mm1, [srcq+src1q]
    CVTPS2PI mm2, [srcq+src2q]
    CVTPS2PI mm3, [srcq+src3q]
    CVTPS2PI mm4, [srcq+src4q]
    CVTPS2PI mm5, [srcq+src5q]
    packssdw mm0, mm3
    packssdw mm1, mm4
    packssdw mm2, mm5
    PSWAPD mm3, mm0
    punpcklwd mm0, mm1
    punpckhwd mm1, mm2
    punpcklwd mm2, mm3
    PSWAPD mm3, mm0
    punpckldq mm0, mm2
    punpckhdq mm2, mm1
    punpckldq mm1, mm3
    movq [dstq ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6
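
; Scalar C model of the 6-channel interleave above (illustrative only;
; convert() stands for the round-and-saturate step shown for float_to_int16):
;
;     static void float_to_int16_interleave6_c(int16_t *dst,
;                                              const float **src, int len)
;     {
;         for (int i = 0; i < len; i++)
;             for (int c = 0; c < 6; c++)
;                 dst[6*i + c] = convert(src[c][i]);
;     }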

;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------

%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov lend, r2d
%else
    %define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq, [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
%if cpuflag(sse)
    movaps m0, [srcq]
    movaps m1, [srcq+src1q]
    movaps m2, [srcq+src2q]
    movaps m3, [srcq+src3q]
    movaps m4, [srcq+src4q]
    movaps m5, [srcq+src5q]

    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    movaps m6, m4
    shufps m4, m0, 0xe4
    movlhps m0, m2
    movhlps m6, m2
    movaps [dstq ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    movaps m6, m5
    shufps m5, m1, 0xe4
    movlhps m1, m3
    movhlps m6, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq m0, [srcq]
    movq m1, [srcq+src1q]
    movq m2, [srcq+src2q]
    movq m3, [srcq+src3q]
    movq m4, [srcq+src4q]
    movq m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
    movq [dstq ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif
    add srcq, mmsize
    add dstq, mmsize*6
    sub lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7

;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------

%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
    mov src1q, [srcq+gprsize]
    mov srcq, [srcq ]
    sub src1q, srcq
.loop:
    mova m0, [srcq ]
    mova m1, [srcq+src1q ]
    mova m3, [srcq +mmsize]
    mova m4, [srcq+src1q+mmsize]

    mova m2, m0
    PUNPCKLDQ m0, m1
    PUNPCKHDQ m2, m1

    mova m1, m3
    PUNPCKLDQ m3, m4
    PUNPCKHDQ m1, m4

    mova [dstq ], m0
    mova [dstq+1*mmsize], m2
    mova [dstq+2*mmsize], m3
    mova [dstq+3*mmsize], m1

    add srcq, mmsize*2
    add dstq, mmsize*4
    sub lend, mmsize/2
    jg .loop
%if mmsize == 8
    emms
%endif
    REP_RET
%endmacro

INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5
148
project/jni/ffmpeg/libavcodec/x86/fmtconvert_init.c
Normal file
148
project/jni/ffmpeg/libavcodec/x86/fmtconvert_init.c
Normal file
@@ -0,0 +1,148 @@
/*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/fmtconvert.h"
#include "libavcodec/dsputil.h"

#if HAVE_YASM

void ff_int32_to_float_fmul_scalar_sse (float *dst, const int *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len);

void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse  (int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);

void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_step_sse  (int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);

void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse  (int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);

void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);

#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse

#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    int c;\
    for(c=0; c<channels; c++){\
        ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        ff_float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        ff_float_to_int16_interleave2_##cpu(dst, src, len);\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}

FLOAT_TO_INT16_INTERLEAVE(3dnow)
FLOAT_TO_INT16_INTERLEAVE(sse)
FLOAT_TO_INT16_INTERLEAVE(sse2)

static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
                                               long len, int channels)
{
    if(channels==6)
        ff_float_to_int16_interleave6_3dnowext(dst, src, len);
    else
        float_to_int16_interleave_3dnow(dst, src, len, channels);
}

void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);

void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);

static void float_interleave_mmx(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    if (channels == 2) {
        ff_float_interleave2_mmx(dst, src, len);
    } else if (channels == 6)
        ff_float_interleave6_mmx(dst, src, len);
    else
        ff_float_interleave_c(dst, src, len, channels);
}

static void float_interleave_sse(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    if (channels == 2) {
        ff_float_interleave2_sse(dst, src, len);
    } else if (channels == 6)
        ff_float_interleave6_sse(dst, src, len);
    else
        ff_float_interleave_c(dst, src, len, channels);
}
#endif /* HAVE_YASM */

void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
#if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(mm_flags)) {
        c->float_interleave = float_interleave_mmx;

        if (EXTERNAL_AMD3DNOW(mm_flags)) {
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16 = ff_float_to_int16_3dnow;
                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
            }
        }
        if (EXTERNAL_AMD3DNOWEXT(mm_flags)) {
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
            }
        }
        if (EXTERNAL_SSE(mm_flags)) {
            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
            c->float_to_int16 = ff_float_to_int16_sse;
            c->float_to_int16_interleave = float_to_int16_interleave_sse;
            c->float_interleave = float_interleave_sse;
        }
        if (EXTERNAL_SSE2(mm_flags)) {
            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
            c->float_to_int16 = ff_float_to_int16_sse2;
            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
        }
    }
#endif /* HAVE_YASM */
}
681
project/jni/ffmpeg/libavcodec/x86/h264_chromamc.asm
Normal file
681
project/jni/ffmpeg/libavcodec/x86/h264_chromamc.asm
Normal file
@@ -0,0 +1,681 @@
;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw 0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw 0
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 0
                 times 4 dw 4
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 4
                 times 4 dw 3

cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_28
cextern pw_32
cextern pw_64

SECTION .text

%macro mv0_pixels_mc8 0
    lea r4, [r2*2 ]
.next4rows:
    movq mm0, [r1 ]
    movq mm1, [r1+r2]
    add r1, r4
    CHROMAMC_AVG mm0, [r0 ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq [r0 ], mm0
    movq [r0+r2], mm1
    add r0, r4
    movq mm0, [r1 ]
    movq mm1, [r1+r2]
    add r1, r4
    CHROMAMC_AVG mm0, [r0 ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq [r0 ], mm0
    movq [r0+r2], mm1
    add r0, r4
    sub r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                           int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    mov r6d, r5d
    or r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov r7, r5
    and r7, 6 ; &~1 for mx/my=[0,7]
    lea r7, [r7*4+r4]
    sar r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov r0, r5
    and r0, 6 ; &~1 for mx/my=[0,7]
    lea r0, [r0*4+r4]
    sar r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test r5d, r5d
    mov r6, 1
    je .my_is_zero
    test r4d, r4d
    mov r6, r2 ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or r4d, r5d ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov r5, r0m
%endif
%endif

    movd m5, r4d
    movq m4, [pw_8]
    movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd m5, m5
    punpckldq m5, m5 ; mm5 = B = x
    pxor m7, m7
    psubw m4, m5 ; mm4 = A = 8-x

.next1drow:
    movq m0, [r1 ] ; mm0 = src[0..7]
    movq m2, [r1+r6] ; mm1 = src[1..8]

    movq m1, m0
    movq m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
    pmullw m1, m4
    pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
    pmullw m3, m5

    paddw m0, m6
    paddw m1, m6
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 3
    psrlw m1, 3
    packuswb m0, m1
    CHROMAMC_AVG m0, [dest_reg]
    movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add dest_reg, r2
    add r1, r2
    dec r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd m4, r4d ; x
    movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
    lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov r5, r0m
%endif
%endif
    mov r6, rsp ; backup stack pointer
    and rsp, ~(mmsize-1) ; align stack
    sub rsp, 16 ; AA and DD

    punpcklwd m4, m4
    punpcklwd m6, m6
    punpckldq m4, m4 ; mm4 = x words
    punpckldq m6, m6 ; mm6 = y words
    movq m5, m4
    pmullw m4, m6 ; mm4 = x * y
    psllw m5, 3
    psllw m6, 3
    movq m7, m5
    paddw m7, m6
    movq [rsp+8], m4 ; DD = x * y
    psubw m5, m4 ; mm5 = B = 8x - xy
    psubw m6, m4 ; mm6 = C = 8y - xy
    paddw m4, [pw_64]
    psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
    pxor m7, m7
    movq [rsp ], m4

    movq m0, [r1 ] ; mm0 = src[0..7]
    movq m1, [r1+1] ; mm1 = src[1..8]
.next2drow:
    add r1, r2

    movq m2, m0
    movq m3, m1
    punpckhbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw m0, [rsp]
    pmullw m2, [rsp]
    pmullw m1, m5
    pmullw m3, m5
    paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
    paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]

    movq m0, [r1]
    movq m1, m0
    punpcklbw m0, m7
    punpckhbw m1, m7
    pmullw m0, m6
    pmullw m1, m6
    paddw m2, m0
    paddw m3, m1 ; [mm2,mm3] += C * src[0..7]

    movq m1, [r1+1]
    movq m0, m1
    movq m4, m1
    punpcklbw m0, m7
    punpckhbw m4, m7
    pmullw m0, [rsp+8]
    pmullw m4, [rsp+8]
    paddw m2, m0
    paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
    movq m0, [r1]

    paddw m2, [rnd_2d_%2+rnd_bias*8]
    paddw m3, [rnd_2d_%2+rnd_bias*8]
    psrlw m2, 6
    psrlw m3, 6
    packuswb m2, m3
    CHROMAMC_AVG m2, [dest_reg]
    movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add dest_reg, r2
    dec r3d
    jne .next2drow
    mov rsp, r6 ; restore stack pointer
    RET
%endmacro
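
; Scalar C model of the bilinear case handled by the macro above, using the
; weights named in its comments (A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y,
; D = x*y, rnd = 32 for H.264); illustrative only:
;
;     static void put_chroma_mc8_c(uint8_t *dst, const uint8_t *src,
;                                  int stride, int h, int x, int y)
;     {
;         const int A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y;
;         for (int i = 0; i < h; i++) {
;             for (int j = 0; j < 8; j++)
;                 dst[j] = (A*src[j] + B*src[j+1] + C*src[j+stride]
;                           + D*src[j+stride+1] + 32) >> 6;
;             dst += stride;
;             src += stride;
;         }
;     }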
|
||||
|
||||
%macro chroma_mc4_mmx_func 2
|
||||
%define extra_regs 0
|
||||
%ifidn %2, rv40
|
||||
%ifdef PIC
|
||||
%define extra_regs 1
|
||||
%endif ; PIC
|
||||
%endif ; rv40
|
||||
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
|
||||
%if ARCH_X86_64
|
||||
movsxd r2, r2d
|
||||
%endif
|
||||
pxor m7, m7
|
||||
movd m2, r4d ; x
|
||||
movd m3, r5d ; y
|
||||
movq m4, [pw_8]
|
||||
movq m5, [pw_8]
|
||||
punpcklwd m2, m2
|
||||
punpcklwd m3, m3
|
||||
punpcklwd m2, m2
|
||||
punpcklwd m3, m3
|
||||
psubw m4, m2
|
||||
psubw m5, m3
|
||||
|
||||
%ifidn %2, rv40
|
||||
%ifdef PIC
|
||||
lea r6, [rnd_rv40_2d_tbl]
|
||||
%define rnd_2d_rv40 r6
|
||||
%else
|
||||
%define rnd_2d_rv40 rnd_rv40_2d_tbl
|
||||
%endif
|
||||
and r5, 6 ; &~1 for mx/my=[0,7]
|
||||
lea r5, [r5*4+r4]
|
||||
sar r5d, 1
|
||||
%define rnd_bias r5
|
||||
%else ; vc1, h264
|
||||
%define rnd_bias 0
|
||||
%endif
|
||||
|
||||
movd m0, [r1 ]
|
||||
movd m6, [r1+1]
|
||||
add r1, r2
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m6, m7
|
||||
pmullw m0, m4
|
||||
pmullw m6, m2
|
||||
paddw m6, m0
|
||||
|
||||
.next2rows:
|
||||
movd m0, [r1 ]
|
||||
movd m1, [r1+1]
|
||||
add r1, r2
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
pmullw m0, m4
|
||||
pmullw m1, m2
|
||||
paddw m1, m0
|
||||
movq m0, m1
|
||||
|
||||
pmullw m6, m5
|
||||
pmullw m1, m3
|
||||
paddw m6, [rnd_2d_%2+rnd_bias*8]
|
||||
paddw m1, m6
|
||||
psrlw m1, 6
|
||||
packuswb m1, m1
|
||||
CHROMAMC_AVG4 m1, m6, [r0]
|
||||
movd [r0], m1
|
||||
add r0, r2
|
||||
|
||||
movd m6, [r1 ]
|
||||
movd m1, [r1+1]
|
||||
add r1, r2
|
||||
punpcklbw m6, m7
|
||||
punpcklbw m1, m7
|
||||
pmullw m6, m4
|
||||
pmullw m1, m2
|
||||
paddw m1, m6
|
||||
movq m6, m1
|
||||
pmullw m0, m5
|
||||
pmullw m1, m3
|
||||
paddw m0, [rnd_2d_%2+rnd_bias*8]
|
||||
paddw m1, m0
|
||||
psrlw m1, 6
|
||||
packuswb m1, m1
|
||||
CHROMAMC_AVG4 m1, m0, [r0]
|
||||
movd [r0], m1
|
||||
add r0, r2
|
||||
sub r3d, 2
|
||||
jnz .next2rows
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
%macro chroma_mc2_mmx_func 2
|
||||
cglobal %1_%2_chroma_mc2, 6, 7, 0
|
||||
%if ARCH_X86_64
|
||||
movsxd r2, r2d
|
||||
%endif
|
||||
|
||||
mov r6d, r4d
|
||||
shl r4d, 16
|
||||
sub r4d, r6d
|
||||
add r4d, 8
|
||||
imul r5d, r4d ; x*y<<16 | y*(8-x)
|
||||
shl r4d, 3
|
||||
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
|
||||
|
||||
movd m5, r4d
|
||||
movd m6, r5d
|
||||
punpckldq m5, m5 ; mm5 = {A,B,A,B}
|
||||
punpckldq m6, m6 ; mm6 = {C,D,C,D}
|
||||
pxor m7, m7
|
||||
movd m2, [r1]
|
||||
punpcklbw m2, m7
|
||||
pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2]
|
||||
|
||||
.nextrow:
|
||||
add r1, r2
|
||||
movq m1, m2
|
||||
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
|
||||
movd m0, [r1]
|
||||
punpcklbw m0, m7
|
||||
pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
|
||||
movq m2, m0
|
||||
pmaddwd m0, m6
|
||||
paddw m1, [rnd_2d_%2]
|
||||
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
|
||||
psrlw m1, 6
|
||||
packssdw m1, m7
|
||||
packuswb m1, m7
|
||||
CHROMAMC_AVG4 m1, m3, [r0]
|
||||
movd r5d, m1
|
||||
mov [r0], r5w
|
||||
add r0, r2
|
||||
sub r3d, 1
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
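For reference, every variant above (mc8/mc4/mc2, across h264/vc1/rv40) evaluates the same bilinear filter that the scalar C fallback expresses directly; only the rounding constant (rnd_2d_%2 plus rnd_bias) differs per codec. A minimal C sketch of the h264 put mc8 case (function name is ours, not from this commit):

    #include <stdint.h>
    #include <stddef.h>

    /* Bilinear chroma MC: the weights are derived from the fractional
     * motion vector (mx, my), each in 0..7, and always sum to 64 --
     * hence the "+32, >>6" rounding seen in the asm above. */
    static void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t stride, int h, int mx, int my)
    {
        const int A = (8 - mx) * (8 - my);
        const int B = mx       * (8 - my);
        const int C = (8 - mx) * my;
        const int D = mx       * my;

        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 8; x++)
                dst[x] = (A * src[x]          + B * src[x + 1] +
                          C * src[x + stride] + D * src[x + stride + 1] + 32) >> 6;
            dst += stride;
            src += stride;
        }
    }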

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1 pw_3
%define rnd_2d_vc1 pw_28

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
PAVG %1, %2
%endmacro
%macro COPY_AVG 3
movd %2, %3
PAVG %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1, _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
%define PAVG pavgb
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

%define PAVG pavgusb
INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
mv0_pixels_mc8
REP_RET

.at_least_one_non_zero:
test r5d, r5d
je .my_is_zero
test r4d, r4d
je .mx_is_zero

; general case, bilinear
mov r6d, r4d
shl r4d, 8
sub r4, r6
mov r6, 8
add r4, 8 ; x*255+8 = x<<8 | (8-x)
sub r6d, r5d
imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)

movd m7, r6d
movd m6, r4d
movdqa m5, [rnd_2d_%2]
movq m0, [r1 ]
movq m1, [r1+1]
pshuflw m7, m7, 0
pshuflw m6, m6, 0
punpcklbw m0, m1
movlhps m7, m7
movlhps m6, m6

.next2rows:
movq m1, [r1+r2*1 ]
movq m2, [r1+r2*1+1]
movq m3, [r1+r2*2 ]
movq m4, [r1+r2*2+1]
lea r1, [r1+r2*2]
punpcklbw m1, m2
movdqa m2, m1
punpcklbw m3, m4
movdqa m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
movdqa m0, m4
psrlw m3, 6
%ifidn %1, avg
movq m2, [r0 ]
movhps m2, [r0+r2]
%endif
packuswb m1, m3
CHROMAMC_AVG m1, m2
movq [r0 ], m1
movhps [r0+r2], m1
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2rows
REP_RET

.my_is_zero:
mov r5d, r4d
shl r4d, 8
add r4, 8
sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
movd m7, r4d
movdqa m6, [rnd_1d_%2]
pshuflw m7, m7, 0
movlhps m7, m7

.next2xrows:
movq m0, [r1 ]
movq m1, [r1 +1]
movq m2, [r1+r2 ]
movq m3, [r1+r2+1]
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
jg .next2xrows
REP_RET

.mx_is_zero:
mov r4d, r5d
shl r5d, 8
add r5, 8
sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
movd m7, r5d
movdqa m6, [rnd_1d_%2]
pshuflw m7, m7, 0
movlhps m7, m7

.next2yrows:
movq m0, [r1 ]
movq m1, [r1+r2 ]
movdqa m2, m1
movq m3, [r1+r2*2]
lea r1, [r1+r2*2]
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2yrows
REP_RET
%endmacro
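The SSSE3 path above gets its speed from one trick: punpcklbw interleaves src[x] with src[x+1], so a single pmaddubsw against a replicated (8-x, x) byte pair computes both horizontal taps per output word. A rough C model of one output lane (illustrative only):

    #include <stdint.h>

    /* One 16-bit lane of pmaddubsw: unsigned bytes from the interleaved
     * source times signed bytes from the weight register; the hardware
     * saturates the 16-bit sum. */
    static int16_t pmaddubsw_lane(uint8_t p0, uint8_t p1, int8_t w0, int8_t w1)
    {
        int sum = p0 * w0 + p1 * w1;
        if (sum >  32767) sum =  32767;
        if (sum < -32768) sum = -32768;
        return (int16_t)sum;
    }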

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6, r4
shl r4d, 8
sub r4d, r6d
mov r6, 8
add r4d, 8 ; x*255+8
sub r6d, r5d
imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)

movd m7, r6d
movd m6, r4d
movq m5, [pw_32]
movd m0, [r1 ]
pshufw m7, m7, 0
punpcklbw m0, [r1+1]
pshufw m6, m6, 0

.next2rows:
movd m1, [r1+r2*1 ]
movd m3, [r1+r2*2 ]
punpcklbw m1, [r1+r2*1+1]
punpcklbw m3, [r1+r2*2+1]
lea r1, [r1+r2*2]
movq m2, m1
movq m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
movq m0, m4
psrlw m3, 6
packuswb m1, m1
packuswb m3, m3
CHROMAMC_AVG m1, [r0 ]
CHROMAMC_AVG m3, [r0+r2]
movd [r0 ], m1
movd [r0+r2], m3
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2rows
REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
%define PAVG pavgb
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264
271
project/jni/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
Normal file
@@ -0,0 +1,271 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64

SECTION .text


%macro MV0_PIXELS_MC8 0
lea r4, [r2*3 ]
lea r5, [r2*4 ]
.next4rows:
movu m0, [r1 ]
movu m1, [r1+r2 ]
CHROMAMC_AVG m0, [r0 ]
CHROMAMC_AVG m1, [r0+r2 ]
mova [r0 ], m0
mova [r0+r2 ], m1
movu m0, [r1+r2*2]
movu m1, [r1+r4 ]
CHROMAMC_AVG m0, [r0+r2*2]
CHROMAMC_AVG m1, [r0+r4 ]
mova [r0+r2*2], m0
mova [r0+r4 ], m1
add r1, r5
add r0, r5
sub r3d, 4
jne .next4rows
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
; int stride, int h, int mx, int my)
cglobal %1_h264_chroma_mc8_10, 6,7,8
movsxdifnidn r2, r2d
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
MV0_PIXELS_MC8
REP_RET

.at_least_one_non_zero:
mov r6d, 2
test r5d, r5d
je .x_interpolation
mov r6, r2 ; dxy = x ? 1 : stride
test r4d, r4d
jne .xy_interpolation
.x_interpolation:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y
movd m5, r4d
mova m4, [pw_8]
mova m6, [pw_4] ; mm6 = rnd >> 3
SPLATW m5, m5 ; mm5 = B = x
psubw m4, m5 ; mm4 = A = 8-x

.next1drow:
movu m0, [r1 ] ; mm0 = src[0..7]
movu m2, [r1+r6] ; mm2 = src[1..8]

pmullw m0, m4 ; mm0 = A * src[0..7]
pmullw m2, m5 ; mm2 = B * src[1..8]

paddw m0, m6
paddw m0, m2
psrlw m0, 3
CHROMAMC_AVG m0, [r0]
mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

add r0, r2
add r1, r2
dec r3d
jne .next1drow
REP_RET

.xy_interpolation: ; general case, bilinear
movd m4, r4m ; x
movd m6, r5m ; y

SPLATW m4, m4 ; mm4 = x words
SPLATW m6, m6 ; mm6 = y words
psllw m5, m4, 3 ; mm5 = 8x
pmullw m4, m6 ; mm4 = x * y
psllw m6, 3 ; mm6 = 8y
paddw m1, m5, m6 ; mm1 = 8x+8y
mova m7, m4 ; DD = x * y
psubw m5, m4 ; mm5 = B = 8x - xy
psubw m6, m4 ; mm6 = C = 8y - xy
paddw m4, [pw_64]
psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64

movu m0, [r1 ] ; mm0 = src[0..7]
movu m1, [r1+2] ; mm1 = src[1..8]
.next2drow:
add r1, r2

pmullw m2, m0, m4
pmullw m1, m5
paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8]

movu m0, [r1]
movu m1, [r1+2]
pmullw m3, m0, m6
paddw m2, m3 ; mm2 += C * src[0..7+stride]
pmullw m3, m1, m7
paddw m2, m3 ; mm2 += D * src[1..8+stride]

paddw m2, [pw_32]
psrlw m2, 6
CHROMAMC_AVG m2, [r0]
mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6

add r0, r2
dec r3d
jne .next2drow
REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
movq %1, [r1 ]
movq m1, [r1+2]
add r1, r2
pmullw %1, m4
pmullw m1, m2
paddw m1, %1
mova %1, m1

pmullw %2, m5
pmullw m1, m3
paddw %2, [pw_32]
paddw m1, %2
psrlw m1, 6
CHROMAMC_AVG m1, %2, [r0]
movq [r0], m1
add r0, r2
%endmacro

%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
movsxdifnidn r2, r2d
movd m2, r4m ; x
movd m3, r5m ; y
mova m4, [pw_8]
mova m5, m4
SPLATW m2, m2
SPLATW m3, m3
psubw m4, m2
psubw m5, m3

movq m0, [r1 ]
movq m6, [r1+2]
add r1, r2
pmullw m0, m4
pmullw m6, m2
paddw m6, m0

.next2rows:
MC4_OP m0, m6
MC4_OP m6, m0
sub r3d, 2
jnz .next2rows
REP_RET
%endmacro

;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
movsxdifnidn r2, r2d
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
add r4d, 8
imul r5d, r4d ; x*y<<16 | y*(8-x)
shl r4d, 3
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)

movd m5, r4d
movd m6, r5d
punpckldq m5, m5 ; mm5 = {A,B,A,B}
punpckldq m6, m6 ; mm6 = {C,D,C,D}
pxor m7, m7
pshufw m2, [r1], 0x94 ; mm2 = src[0,1,1,2]

.nextrow:
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
movq m2, m0
pmaddwd m0, m6
paddw m1, [pw_32]
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
psrlw m1, 6
packssdw m1, m7
CHROMAMC_AVG m1, m3, [r0]
movd [r0], m1
add r0, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
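The scalar weight packing at the top of CHROMA_MC2 (shl/sub/imul) is the same sequence used in the 8-bit file and is worth unpacking. A C sketch of what ends up in r4d and r5d (helper name is ours):

    #include <stdint.h>

    /* Pack the four bilinear weights for the 2-pixel case as hi16|lo16
     * dwords, matching the asm comments: cd = {x*y, y*(8-x)} and
     * ab = {x*(8-y), (8-x)*(8-y)}. E.g. mx=3, my=5 gives A=15, B=9,
     * C=25, D=15 -- the weights always sum to 64. */
    static void pack_mc2_weights(unsigned mx, unsigned my,
                                 uint32_t *ab, uint32_t *cd)
    {
        uint32_t t = (mx << 16) | (8 - mx); /* == (mx << 16) - mx + 8 */
        *cd = t * my;
        *ab = (t << 3) - *cd;
    }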

%macro NOTHING 2-3
%endmacro
%macro AVG 2-3
%if %0==3
movq %2, %3
%endif
pavgw %1, %2
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put

%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg
1075
project/jni/ffmpeg/libavcodec/x86/h264_deblock.asm
Normal file
File diff suppressed because it is too large
923
project/jni/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
Normal file
@@ -0,0 +1,923 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;* Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
psubusw %5, %2, %1
psubusw %4, %1, %2
por %4, %5
psubw %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
psubusw %4, %2, %1
psubusw %5, %1, %2
por %5, %4 ; |%1-%2|
pxor %4, %4
psubw %5, %3 ; |%1-%2|-%3
pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro
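ABS_SUB and DIFF_LT above are the branchless SIMD idiom for comparing an absolute difference against a threshold; there is no unsigned word compare, so two saturating subtractions stand in for |a-b|. In scalar C terms (a sketch under that reading):

    #include <stdint.h>

    /* psubusw a,b / psubusw b,a / por: one of the two saturating
     * differences is zero, so their OR is |a-b|. */
    static uint16_t abs_diff_u16(uint16_t a, uint16_t b)
    {
        uint16_t d0 = a > b ? a - b : 0;
        uint16_t d1 = b > a ? b - a : 0;
        return d0 | d1;
    }

    /* psubw + pcmpgtw against zero: |a-b| - t < 0, valid here because
     * 10-bit samples keep the subtraction inside signed 16-bit range. */
    static int diff_lt(uint16_t a, uint16_t b, uint16_t t)
    {
        return (int16_t)(abs_diff_u16(a, b) - t) < 0;
    }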

%macro LOAD_AB 4
movd %1, %3
movd %2, %4
SPLATW %1, %1
SPLATW %2, %2
%endmacro

; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
movd %1, [%2]
punpcklbw %1, %1
%if mmsize == 8
pshufw %1, %1, 0
%else
pshuflw %1, %1, 01010000b
pshufd %1, %1, 01010000b
%endif
psraw %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
; %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
pand %8, %9
ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
pxor %7, %7
pand %8, %9
pcmpgtw %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro DEBLOCK_P0_Q0 7
psubw %3, %4
pxor %7, %7
paddw %3, [pw_4]
psubw %7, %5
psubw %6, %2, %1
psllw %6, 2
paddw %3, %6
psraw %3, 3
mova %6, [pw_pixel_max]
CLIPW %3, %7, %5
pxor %7, %7
paddw %1, %3
psubw %2, %3
CLIPW %1, %7, %6
CLIPW %2, %7, %6
%endmacro
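DEBLOCK_P0_Q0 is the H.264 normal-mode edge update; after the register shuffling, the arithmetic reduces to the familiar tc-clipped delta. A scalar sketch for the 10-bit case:

    static inline int clip3(int v, int lo, int hi)
    {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* delta = clip((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc, tc), applied
     * symmetrically, results clipped to [0, pw_pixel_max]. */
    static void deblock_p0_q0(int *p0, int *q0, int p1, int q1, int tc)
    {
        int delta = clip3((((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
        *p0 = clip3(*p0 + delta, 0, 1023);
        *q0 = clip3(*q0 - delta, 0, 1023);
    }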

; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
pavgw %6, %3, %4 ; (p0+q0+1)>>1
paddw %1, %6
pxor %6, %6
psraw %1, 1
psubw %6, %5
psubw %1, %2
CLIPW %1, %6, %5
paddw %1, %2
%endmacro

%macro LUMA_DEBLOCK_ONE 3
DIFF_LT m5, %1, bm, m4, m6
pxor m6, m6
mova %3, m4
pcmpgtw m6, tcm
pand m4, tcm
pandn m6, m7
pand m4, m6
LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
movq [r0-4], m0
movq [r0+r1-4], m1
movq [r0+r1*2-4], m2
movq [r0+%2-4], m3
%else
movq [r0-4], m0
movhps [r0+r1-4], m0
movq [r0+r1*2-4], m1
movhps [%1-4], m1
movq [%1+r1-4], m2
movhps [%1+r1*2-4], m2
movq [%1+%2-4], m3
movhps [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define am [rsp+mmsize*3]
%define bm [rsp+mmsize*4]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
LOAD_AB m4, m5, r2d, r3d
mov r3, 32/mmsize
mov r2, r0
sub r0, r1
mova am, m4
sub r0, r1
mova bm, m5
sub r0, r1
.loop:
mova m0, [r0+r1]
mova m1, [r0+r1*2]
mova m2, [r2]
mova m3, [r2+r1]

LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
LOAD_TC m6, r4
mova tcm, m6

mova m5, [r0]
LUMA_DEBLOCK_ONE m1, m0, ms1
mova [r0+r1], m5

mova m5, [r2+r1*2]
LUMA_DEBLOCK_ONE m2, m3, ms2
mova [r2+r1], m5

pxor m5, m5
mova m6, tcm
pcmpgtw m5, tcm
psubw m6, ms1
pandn m5, m7
psubw m6, ms2
pand m5, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
mova [r0+r1*2], m1
mova [r2], m2

add r0, mmsize
add r2, mmsize
add r4, mmsize/8
dec r3
jg .loop
ADD rsp, pad
RET

cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define p1m [rsp+mmsize*3]
%define p2m [rsp+mmsize*4]
%define am [rsp+mmsize*5]
%define bm [rsp+mmsize*6]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
LOAD_AB m4, m5, r2d, r3d
mov r3, r1
mova am, m4
add r3, r1
mov r5, 32/mmsize
mova bm, m5
add r3, r1
%if mmsize == 16
mov r2, r0
add r2, r3
%endif
.loop:
%if mmsize == 8
movq m2, [r0-8] ; y q2 q1 q0
movq m7, [r0+0]
movq m5, [r0+r1-8]
movq m3, [r0+r1+0]
movq m0, [r0+r1*2-8]
movq m6, [r0+r1*2+0]
movq m1, [r0+r3-8]
TRANSPOSE4x4W 2, 5, 0, 1, 4
SWAP 2, 7
movq m7, [r0+r3]
TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
movu m0, [r0+r1-8]
movu m2, [r0+r1*2-8]
movu m3, [r2-8]
TRANSPOSE4x4W 5, 0, 2, 3, 6
mova tcm, m3

movu m4, [r2+r1-8]
movu m1, [r2+r1*2-8]
movu m3, [r2+r3-8]
movu m7, [r2+r1*4-8]
TRANSPOSE4x4W 4, 1, 3, 7, 6

mova m6, tcm
punpcklqdq m6, m7
punpckhqdq m5, m4
SBUTTERFLY qdq, 0, 1, 7
SBUTTERFLY qdq, 2, 3, 7
%endif

mova p2m, m6
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
LOAD_TC m6, r4
mova tcm, m6

LUMA_DEBLOCK_ONE m1, m0, ms1
mova p1m, m5

mova m5, p2m
LUMA_DEBLOCK_ONE m2, m3, ms2
mova p2m, m5

pxor m5, m5
mova m6, tcm
pcmpgtw m5, tcm
psubw m6, ms1
pandn m5, m7
psubw m6, ms2
pand m5, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
mova m0, p1m
mova m3, p2m
TRANSPOSE4x4W 0, 1, 2, 3, 4
LUMA_H_STORE r2, r3

add r4, mmsize/8
lea r0, [r0+r1*(mmsize/2)]
lea r2, [r2+r1*(mmsize/2)]
dec r5
jg .loop
ADD rsp, pad
RET
%endmacro

%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
LOAD_TC m6, r4
DIFF_LT m8, m1, m13, m10, m4
DIFF_LT m9, m2, m13, m11, m4
pand m6, m7

mova m14, m6
pxor m4, m4
pcmpgtw m6, m4
pand m6, m14

mova m5, m10
pand m5, m6
LUMA_Q1 m8, m0, m1, m2, m5, m4

mova m5, m11
pand m5, m6
LUMA_Q1 m9, m3, m1, m2, m5, m4

pxor m4, m4
psubw m6, m10
pcmpgtw m4, m14
pandn m4, m7
psubw m6, m11
pand m4, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

SWAP 0, 8
SWAP 3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
%define p2 m8
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define q2 m9
%define mask0 m7
%define mask1 m10
%define mask2 m11
shl r2d, 2
shl r3d, 2
LOAD_AB m12, m13, r2d, r3d
mov r2, r0
sub r0, r1
sub r0, r1
sub r0, r1
mov r3, 2
.loop:
mova p2, [r0]
mova p1, [r0+r1]
mova p0, [r0+r1*2]
mova q0, [r2]
mova q1, [r2+r1]
mova q2, [r2+r1*2]
DEBLOCK_LUMA_INTER_SSE2
mova [r0+r1], p1
mova [r0+r1*2], p0
mova [r2], q0
mova [r2+r1], q1
add r0, mmsize
add r2, mmsize
add r4, 2
dec r3
jg .loop
REP_RET

cglobal deblock_h_luma_10, 5,7,15
shl r2d, 2
shl r3d, 2
LOAD_AB m12, m13, r2d, r3d
mov r2, r1
add r2, r1
add r2, r1
mov r5, r0
add r5, r2
mov r6, 2
.loop:
movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
movu m0, [r0+r1-8]
movu m2, [r0+r1*2-8]
movu m9, [r5-8]
movu m5, [r5+r1-8]
movu m1, [r5+r1*2-8]
movu m3, [r5+r2-8]
movu m7, [r5+r1*4-8]

TRANSPOSE4x4W 8, 0, 2, 9, 10
TRANSPOSE4x4W 5, 1, 3, 7, 10

punpckhqdq m8, m5
SBUTTERFLY qdq, 0, 1, 10
SBUTTERFLY qdq, 2, 3, 10
punpcklqdq m9, m7

DEBLOCK_LUMA_INTER_SSE2

TRANSPOSE4x4W 0, 1, 2, 3, 4
LUMA_H_STORE r5, r2
add r4, 2
lea r0, [r0+r1*8]
lea r5, [r5+r1*8]
dec r6
jg .loop
REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif

%macro SWAPMOVA 2
%ifid %1
SWAP %1, %2
%else
mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
paddw t0, %3, %2
mova t2, %4
paddw t2, %3
%else
mova t0, %3
mova t2, %4
paddw t0, %2
paddw t2, %3
%endif
paddw t0, %1
paddw t2, t2
paddw t0, %5
paddw t2, %9
paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

psrlw t2, 3
psrlw t1, t0, 2
psubw t2, %3
psubw t1, %2
pand t2, %8
pand t1, %8
paddw t2, %3
paddw t1, %2
SWAPMOVA %11, t1

psubw t1, t0, %3
paddw t0, t0
psubw t1, %5
psubw t0, %3
paddw t1, %6
paddw t1, %2
paddw t0, %6
psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

pxor t0, t1
pxor t1, %1
pand t0, %8
pand t1, %7
pxor t0, t1
pxor t0, %1
SWAPMOVA %10, t0
SWAPMOVA %12, t2
%endmacro

%macro LUMA_INTRA_INIT 1
%xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%assign i 4
%rep %1
CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
%assign i i+1
%endrep
SUB rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
LOAD_AB t0, t1, r2d, r3d
mova %1, t0
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
mova %2, t0 ; mask0
psrlw t3, %1, 2
%else
mova t3, %1
mova %2, t0 ; mask0
psrlw t3, 2
%endif
paddw t3, [pw_2] ; alpha/4+2
DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
pand t2, %2
mova t3, %5 ; q2
mova %1, t2 ; mask1
DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
pand t2, %1
mova t3, %4 ; p2
mova %3, t2 ; mask1q
DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
pand t2, %1
mova %1, t2 ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
movu t0, [r0-8]
movu t1, [r0+r1-8]
movu m0, [r0+r1*2-8]
movu m1, [r0+r4-8]
TRANSPOSE4x4W 4, 5, 0, 1, 2
mova t4, t0 ; p3
mova t5, t1 ; p2

movu m2, [r0]
movu m3, [r0+r1]
movu t0, [r0+r1*2]
movu t1, [r0+r4]
TRANSPOSE4x4W 2, 3, 4, 5, 6
mova t6, t0 ; q2
mova t7, t1 ; q3
%else
movu t0, [r0-8]
movu t1, [r0+r1-8]
movu m0, [r0+r1*2-8]
movu m1, [r0+r5-8]
movu m2, [r4-8]
movu m3, [r4+r1-8]
movu t2, [r4+r1*2-8]
movu t3, [r4+r5-8]
TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
mova t4, t0 ; p3
mova t5, t1 ; p2
mova t6, t2 ; q2
mova t7, t3 ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
TRANSPOSE4x4W %1, %2, %3, %4, %9
movq [r0-8], m%1
movq [r0+r1-8], m%2
movq [r0+r1*2-8], m%3
movq [r0+r4-8], m%4
movq m%1, %8
TRANSPOSE4x4W %5, %6, %7, %1, %9
movq [r0], m%5
movq [r0+r1], m%6
movq [r0+r1*2], m%7
movq [r0+r4], m%1
%else
TRANSPOSE2x4x4W %1, %2, %3, %4, %9
movq [r0-8], m%1
movq [r0+r1-8], m%2
movq [r0+r1*2-8], m%3
movq [r0+r5-8], m%4
movhps [r4-8], m%1
movhps [r4+r1-8], m%2
movhps [r4+r1*2-8], m%3
movhps [r4+r5-8], m%4
%ifnum %8
SWAP %1, %8
%else
mova m%1, %8
%endif
TRANSPOSE2x4x4W %5, %6, %7, %1, %9
movq [r0], m%5
movq [r0+r1], m%6
movq [r0+r1*2], m%7
movq [r0+r5], m%1
movhps [r4], m%5
movhps [r4+r1], m%6
movhps [r4+r1*2], m%7
movhps [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
%define t0 m1
%define t1 m2
%define t2 m4
%define p2 m8
%define p1 m9
%define p0 m10
%define q0 m11
%define q1 m12
%define q2 m13
%define aa m5
%define bb m14
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
neg r4
add r4, r0 ; pix-4*stride
mov r6, 2
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
LOAD_AB aa, bb, r2d, r3d
.loop:
mova p2, [r4+r1]
mova p1, [r4+2*r1]
mova p0, [r4+r5]
mova q0, [r0]
mova q1, [r0+r1]
mova q2, [r0+2*r1]

LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
mova t2, aa
psrlw t2, 2
paddw t2, m0 ; alpha/4+2
DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
pand m6, m3
pand m7, m6
pand m6, t1
LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
add r0, mmsize
add r4, mmsize
dec r6
jg .loop
REP_RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
%define q3 m5
%define q2 m8
%define q1 m9
%define q0 m10
%define p0 m11
%define p1 m12
%define p2 m13
%define p3 m4
%define spill [rsp]
%assign pad 24-(stack_offset&15)
SUB rsp, pad
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
add r4, r0 ; pix+4*stride
mov r6, 2
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
.loop:
movu q3, [r0-8]
movu q2, [r0+r1-8]
movu q1, [r0+r1*2-8]
movu q0, [r0+r5-8]
movu p0, [r4-8]
movu p1, [r4+r1-8]
movu p2, [r4+r1*2-8]
movu p3, [r4+r5-8]
TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

LOAD_AB m1, m2, r2d, r3d
LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
psrlw m1, 2
paddw m1, m0 ; alpha/4+2
DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
pand m6, m3
pand m7, m6
pand m6, t1

mova spill, q3
LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
mova m7, spill

LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

lea r0, [r0+r1*8]
lea r4, [r4+r1*8]
dec r6
jg .loop
ADD rsp, pad
RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
neg r4
add r4, r0
mov r6, 32/mmsize
shl r2d, 2
shl r3d, 2
.loop:
mova m0, [r4+r1*2] ; p1
mova m1, [r4+r5] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
mova t3, [r0+r1*2] ; q2
LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
add r0, mmsize
add r4, mmsize
dec r6
jg .loop
ADD rsp, pad
RET

;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
mov r5, 32/mmsize
%else
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
add r4, r0 ; pix+4*stride
mov r6, 32/mmsize
%endif
shl r2d, 2
shl r3d, 2
.loop:
LUMA_H_INTRA_LOAD
LUMA_INTRA_INTER t8, t9, t10, t5, t6

LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
mova t3, t6 ; q2
LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

mova m2, t4
mova m0, t11
mova m1, t5
mova m3, t8
mova m6, t6

LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

lea r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
dec r5
%else
lea r4, [r4+r1*(mmsize/2)]
dec r6
%endif
jg .loop
ADD rsp, pad
RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
mova %6, [pw_2]
paddw %6, %3
paddw %6, %4
paddw %7, %6, %2
paddw %6, %1
paddw %6, %3
paddw %7, %4
psraw %6, 2
psraw %7, 2
psubw %6, %1
psubw %7, %2
pand %6, %5
pand %7, %5
paddw %1, %6
paddw %2, %7
%endmacro
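CHROMA_DEBLOCK_P0_Q0_INTRA computes the spec's strong chroma filter; the psubw/pand/paddw tail is only there to apply the update where the alpha/beta mask is all-ones. The underlying formula, in C:

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2, q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    static void chroma_intra_p0_q0(int *p0, int *q0, int p1, int q1)
    {
        int np0 = (2 * p1 + *p0 + q1 + 2) >> 2;
        int nq0 = (2 * q1 + *q0 + p1 + 2) >> 2;
        *p0 = np0;
        *q0 = nq0;
    }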

%macro CHROMA_V_LOAD 1
mova m0, [r0] ; p1
mova m1, [r0+r1] ; p0
mova m2, [%1] ; q0
mova m3, [%1+r1] ; q1
%endmacro

%macro CHROMA_V_STORE 0
mova [r0+1*r1], m1
mova [r0+2*r1], m2
%endmacro

%macro CHROMA_V_LOAD_TC 2
movd %1, [%2]
punpcklbw %1, %1
punpcklwd %1, %1
psraw %1, 6
%endmacro

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
mov r5, r0
sub r0, r1
sub r0, r1
shl r2d, 2
shl r3d, 2
%if mmsize < 16
mov r6, 16/mmsize
.loop:
%endif
CHROMA_V_LOAD r5
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
CHROMA_V_LOAD_TC m6, r4
psubw m6, [pw_3]
pmaxsw m6, m4
pand m7, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%if mmsize < 16
add r0, mmsize
add r5, mmsize
add r4, mmsize/4
dec r6
jg .loop
REP_RET
%else
RET
%endif

;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
mov r4, r0
sub r0, r1
sub r0, r1
shl r2d, 2
shl r3d, 2
%if mmsize < 16
mov r5, 16/mmsize
.loop:
%endif
CHROMA_V_LOAD r4
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%if mmsize < 16
add r0, mmsize
add r4, mmsize
dec r5
jg .loop
REP_RET
%else
RET
%endif
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif
204
project/jni/ffmpeg/libavcodec/x86/h264_i386.h
Normal file
@@ -0,0 +1,204 @@
/*
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * H.264 / AVC / MPEG4 part10 codec.
 * non-MMX i386-specific optimizations for H.264
 * @author Michael Niedermayer <michaelni@gmx.at>
 */

#ifndef AVCODEC_X86_H264_I386_H
#define AVCODEC_X86_H264_I386_H

#include <stddef.h>

#include "libavcodec/cabac.h"
#include "cabac.h"

#if HAVE_INLINE_ASM

//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
//as that would make optimization work hard)
#if HAVE_7REGS
#define decode_significance decode_significance_x86
static int decode_significance_x86(CABACContext *c, int max_coeff,
uint8_t *significant_coeff_ctx_base,
int *index, x86_reg last_off){
void *end= significant_coeff_ctx_base + max_coeff - 1;
int minusstart= -(intptr_t)significant_coeff_ctx_base;
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;

#ifdef BROKEN_RELOCATIONS
void *tables;

__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
);
#endif

__asm__ volatile(
"3: \n\t"

BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")

"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"

BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")

"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"

"test $1, %4 \n\t"
" jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

"4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
"5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
: "%"REG_c, "memory"
);
return coeff_count;
}

#define decode_significance_8x8 decode_significance_8x8_x86
static int decode_significance_8x8_x86(CABACContext *c,
uint8_t *significant_coeff_ctx_base,
int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
x86_reg last=0;
x86_reg state;

#ifdef BROKEN_RELOCATIONS
void *tables;

__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
);
#endif

__asm__ volatile(
"mov %1, %6 \n\t"
"3: \n\t"

"mov %10, %0 \n\t"
"movzbl (%0, %6), %k6 \n\t"
"add %9, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")

"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
" jz 4f \n\t"

#ifdef BROKEN_RELOCATIONS
"movzbl %c14(%15, %q6), %k6\n\t"
#else
"movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
#endif
"add %11, %6 \n\t"

BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")

"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
"movl %k6, (%0) \n\t"

"test $1, %4 \n\t"
" jnz 5f \n\t"

"add"OPSIZE" $4, %2 \n\t"

"4: \n\t"
"addl $1, %k6 \n\t"
"mov %k6, %1 \n\t"
"cmpl $63, %k6 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %k6, (%0) \n\t"
"5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
"m"(sig_off), "m"(last_coeff_ctx_base),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
: "%"REG_c, "memory"
);
return coeff_count;
}
#endif /* HAVE_7REGS */

#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_H264_I386_H */
1015
project/jni/ffmpeg/libavcodec/x86/h264_idct.asm
Normal file
File diff suppressed because it is too large
546
project/jni/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
Normal file
@@ -0,0 +1,546 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32: times 4 dd 32

SECTION .text

;-----------------------------------------------------------------------------
; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
psrad %1, 6
psrad %2, 6
packssdw %1, %2
movq %3, [%5]
movhps %3, [%5+%6]
paddsw %1, %3
CLIPW %1, %4, [pw_pixel_max]
movq [%5], %1
movhps [%5+%6], %1
%endmacro

%macro STORE_DIFF16 5
psrad %1, 6
psrad %2, 6
packssdw %1, %2
paddsw %1, [%5]
CLIPW %1, %3, %4
mova [%5], %1
%endmacro

;dst, in, stride
%macro IDCT4_ADD_10 3
mova m0, [%2+ 0]
mova m1, [%2+16]
mova m2, [%2+32]
mova m3, [%2+48]
IDCT4_1D d,0,1,2,3,4,5
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
STORE_DIFFx2 m0, m1, m4, m5, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro
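IDCT4_ADD_10 is two passes of IDCT4_1D (from x86util.asm) with a transpose in between; the pd_32 bias before the second pass provides the final (x+32)>>6 rounding that STORE_DIFFx2's psrad 6 completes. One 1-D pass of the 4-point integer transform, in scalar C:

    /* H.264 4x4 inverse transform, one butterfly pass over a row/column. */
    static void idct4_1d(int b[4])
    {
        int z0 = b[0] + b[2];
        int z1 = b[0] - b[2];
        int z2 = (b[1] >> 1) - b[3];
        int z3 = b[1] + (b[3] >> 1);

        b[0] = z0 + z3;
        b[1] = z1 + z2;
        b[2] = z1 - z2;
        b[3] = z0 - z3;
    }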

%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
IDCT4_ADD_10 r0, r1, r2
RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
add r5, r0
mova m0, [r2+ 0]
mova m1, [r2+16]
mova m2, [r2+32]
mova m3, [r2+48]
IDCT4_1D d,0,1,2,3,4,5
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
STORE_DIFFx2 m0, m1, m4, m5, r5, r3
lea r5, [r5+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, r5, r3
ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

%macro ADD16_OP 2
cmp byte [r4+%2], 0
jz .skipblock%1
mov r5d, [r1+%1*4]
call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
add r2, 64
%endif
%endmacro

%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
ADD16_OP 0, 4+1*8
ADD16_OP 1, 5+1*8
ADD16_OP 2, 4+2*8
ADD16_OP 3, 5+2*8
ADD16_OP 4, 6+1*8
ADD16_OP 5, 7+1*8
ADD16_OP 6, 6+2*8
ADD16_OP 7, 7+2*8
ADD16_OP 8, 4+3*8
ADD16_OP 9, 5+3*8
ADD16_OP 10, 4+4*8
ADD16_OP 11, 5+4*8
ADD16_OP 12, 6+3*8
ADD16_OP 13, 7+3*8
ADD16_OP 14, 6+4*8
ADD16_OP 15, 7+4*8
REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT_DC_ADD_OP_10 3
pxor m5, m5
%if avx_enabled
paddw m1, m0, [%1+0 ]
paddw m2, m0, [%1+%2 ]
paddw m3, m0, [%1+%2*2]
paddw m4, m0, [%1+%3 ]
%else
mova m1, [%1+0 ]
mova m2, [%1+%2 ]
mova m3, [%1+%2*2]
mova m4, [%1+%3 ]
paddw m1, m0
paddw m2, m0
paddw m3, m0
paddw m4, m0
%endif
CLIPW m1, m5, m6
CLIPW m2, m5, m6
CLIPW m3, m5, m6
CLIPW m4, m5, m6
mova [%1+0 ], m1
mova [%1+%2 ], m2
mova [%1+%2*2], m3
mova [%1+%3 ], m4
%endmacro

INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
movd m0, [r1]
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3]
pshufw m0, m0, 0
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r0, r2, r1
RET
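h264_idct_dc_add_10 handles the common DC-only block without a full transform: one rounded DC term is added to every pixel, clipped to the 10-bit range. A scalar equivalent (name is ours):

    #include <stdint.h>

    static void idct_dc_add_10_ref(uint16_t *dst, const int *block, int stride)
    {
        int dc = (block[0] + 32) >> 6;
        for (int i = 0; i < 4; i++) {
            for (int j = 0; j < 4; j++) {
                int v = dst[j] + dc;
                dst[j] = v < 0 ? 0 : v > 1023 ? 1023 : v;
            }
            dst += stride;
        }
    }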

;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,3,7
mov r1d, [r1]
add r1, 32
sar r1, 6
movd m0, r1d
lea r1, [r2*3]
SPLATW m0, m0, 0
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r0, r2, r1
lea r0, [r0+r2*4]
IDCT_DC_ADD_OP_10 r0, r2, r1
RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro AC 1
.ac%1:
mov r5d, [r1+(%1+0)*4]
call add4x4_idct %+ SUFFIX
mov r5d, [r1+(%1+1)*4]
add r2, 64
call add4x4_idct %+ SUFFIX
add r2, 64
jmp .skipadd%1
%endmacro

%assign last_block 16
%macro ADD16_OP_INTRA 2
cmp word [r4+%2], 0
jnz .ac%1
mov r5d, [r2+ 0]
or r5d, [r2+64]
jz .skipblock%1
mov r5d, [r1+(%1+0)*4]
call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
add r2, 128
%endif
.skipadd%1:
%endmacro

%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
add r5, r0
movq m0, [r2+ 0]
movhps m0, [r2+64]
paddd m0, [pd_32]
psrad m0, 6
pshufhw m0, m0, 0
pshuflw m0, m0, 0
lea r6, [r3*3]
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r5, r3, r6
ret

cglobal h264_idct_add16intra_10,5,7,8
ADD16_OP_INTRA 0, 4+1*8
ADD16_OP_INTRA 2, 4+2*8
ADD16_OP_INTRA 4, 6+1*8
ADD16_OP_INTRA 6, 6+2*8
ADD16_OP_INTRA 8, 4+3*8
ADD16_OP_INTRA 10, 4+4*8
ADD16_OP_INTRA 12, 6+3*8
ADD16_OP_INTRA 14, 6+4*8
REP_RET
AC 8
AC 10
AC 12
AC 14
AC 0
AC 2
AC 4
AC 6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
mov r7, r0
%endif
add r2, 1024
mov r0, [r0]
ADD16_OP_INTRA 16, 4+ 6*8
ADD16_OP_INTRA 18, 4+ 7*8
add r2, 1024-128*2
%if ARCH_X86_64
mov r0, [r7+gprsize]
%else
mov r0, r0m
mov r0, [r0+gprsize]
%endif
ADD16_OP_INTRA 32, 4+11*8
ADD16_OP_INTRA 34, 4+12*8
REP_RET
AC 16
AC 18
AC 32
AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_1D 2
SWAP 0, 1
psrad m4, m5, 1
psrad m1, m0, 1
paddd m4, m5
paddd m1, m0
paddd m4, m7
paddd m1, m5
psubd m4, m0
paddd m1, m3

psubd m0, m3
psubd m5, m3
paddd m0, m7
psubd m5, m7
psrad m3, 1
psrad m7, 1
psubd m0, m3
psubd m5, m7

SWAP 1, 7
psrad m1, m7, 2
psrad m3, m4, 2
paddd m3, m0
psrad m0, 2
paddd m1, m5
psrad m5, 2
psubd m0, m4
psubd m7, m5

SWAP 5, 6
psrad m4, m2, 1
psrad m6, m5, 1
psubd m4, m5
paddd m6, m2

mova m2, %1
mova m5, %2
SUMSUB_BA d, 5, 2
SUMSUB_BA d, 6, 5
SUMSUB_BA d, 4, 2
SUMSUB_BA d, 7, 6
SUMSUB_BA d, 0, 4
SUMSUB_BA d, 3, 2
SUMSUB_BA d, 1, 5
SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
mova m7, [%1+112*2]
mova m6, [%1+ 96*2]
mova m5, [%1+ 80*2]
mova m3, [%1+ 48*2]
mova m2, [%1+ 32*2]
mova m1, [%1+ 16*2]
IDCT8_1D [%1], [%1+ 64*2]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_SSE_START 2
IDCT8_1D_FULL %1
%if ARCH_X86_64
TRANSPOSE4x4D 0,1,2,3,8
mova [%2 ], m0
TRANSPOSE4x4D 4,5,6,7,8
mova [%2+8*2], m4
%else
mova [%1], m7
TRANSPOSE4x4D 0,1,2,3,7
mova m7, [%1]
mova [%2 ], m0
mova [%2+16*2], m1
mova [%2+32*2], m2
mova [%2+48*2], m3
TRANSPOSE4x4D 4,5,6,7,3
mova [%2+ 8*2], m4
mova [%2+24*2], m5
mova [%2+40*2], m6
mova [%2+56*2], m7
%endif
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
IDCT8_1D_FULL %2
mova [%2 ], m6
mova [%2+16*2], m7

pxor m7, m7
STORE_DIFFx2 m0, m1, m6, m7, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m6, m7, %1, %3
mova m0, [%2 ]
mova m1, [%2+16*2]
lea %1, [%1+%3*2]
STORE_DIFFx2 m4, m5, m6, m7, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
%if UNIX64 == 0
%assign pad 16-gprsize-(stack_offset&15)
sub rsp, pad
call h264_idct8_add1_10 %+ SUFFIX
add rsp, pad
RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
sub rsp, pad
add dword [r1], 32

%if ARCH_X86_64
IDCT8_ADD_SSE_START r1, rsp
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
IDCT8_ADD_SSE_START r1+16, rsp+128
PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
IDCT8_1D [rsp], [rsp+128]
SWAP 0, 8
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 4, 12
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
IDCT8_1D [rsp+16], [rsp+144]
psrad m8, 6
psrad m0, 6
packssdw m8, m0
paddsw m8, [r0]
pxor m0, m0
CLIPW m8, m0, [pw_pixel_max]
mova [r0], m8
mova m8, [pw_pixel_max]
STORE_DIFF16 m9, m1, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m10, m2, m0, m8, r0
STORE_DIFF16 m11, m3, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m12, m4, m0, m8, r0
STORE_DIFF16 m13, m5, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m14, m6, m0, m8, r0
STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
IDCT8_ADD_SSE_START r1, rsp
IDCT8_ADD_SSE_START r1+16, rsp+128
lea r3, [r0+8]
IDCT8_ADD_SSE_END r0, rsp, r2
IDCT8_ADD_SSE_END r3, rsp+16, r2
%endif ; ARCH_X86_64

add rsp, pad
ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
|
||||
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
|
||||
;-----------------------------------------------------------------------------
|
||||
;;;;;;; NO FATE SAMPLES TRIGGER THIS
|
||||
%macro IDCT8_ADD4_OP 2
|
||||
cmp byte [r4+%2], 0
|
||||
jz .skipblock%1
|
||||
mov r0d, [r6+%1*4]
|
||||
add r0, r5
|
||||
call h264_idct8_add1_10 %+ SUFFIX
|
||||
.skipblock%1:
|
||||
%if %1<12
|
||||
add r1, 256
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro IDCT8_ADD4 0
|
||||
cglobal h264_idct8_add4_10, 0,7,16
|
||||
%assign pad 16-gprsize-(stack_offset&15)
|
||||
SUB rsp, pad
|
||||
mov r5, r0mp
|
||||
mov r6, r1mp
|
||||
mov r1, r2mp
|
||||
mov r2d, r3m
|
||||
movifnidn r4, r4mp
|
||||
IDCT8_ADD4_OP 0, 4+1*8
|
||||
IDCT8_ADD4_OP 4, 6+1*8
|
||||
IDCT8_ADD4_OP 8, 4+3*8
|
||||
IDCT8_ADD4_OP 12, 6+3*8
|
||||
ADD rsp, pad
|
||||
RET
|
||||
%endmacro ; IDCT8_ADD4
|
||||
|
||||
INIT_XMM sse2
|
||||
IDCT8_ADD4
|
||||
%if HAVE_AVX_EXTERNAL
|
||||
INIT_XMM avx
|
||||
IDCT8_ADD4
|
||||
%endif
|
||||
2702
project/jni/ffmpeg/libavcodec/x86/h264_intrapred.asm
Normal file
File diff suppressed because it is too large
1199
project/jni/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
Normal file
File diff suppressed because it is too large
398
project/jni/ffmpeg/libavcodec/x86/h264_intrapred_init.c
Normal file
@@ -0,0 +1,398 @@
/*
 * Copyright (c) 2010 Jason Garrett-Glaser
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264pred.h"

#define PRED4x4(TYPE, DEPTH, OPT) \
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                    const uint8_t *topright, \
                                                    ptrdiff_t stride);

PRED4x4(dc, 10, mmxext)
PRED4x4(down_left, 10, sse2)
PRED4x4(down_left, 10, avx)
PRED4x4(down_right, 10, sse2)
PRED4x4(down_right, 10, ssse3)
PRED4x4(down_right, 10, avx)
PRED4x4(vertical_left, 10, sse2)
PRED4x4(vertical_left, 10, avx)
PRED4x4(vertical_right, 10, sse2)
PRED4x4(vertical_right, 10, ssse3)
PRED4x4(vertical_right, 10, avx)
PRED4x4(horizontal_up, 10, mmxext)
PRED4x4(horizontal_down, 10, sse2)
PRED4x4(horizontal_down, 10, ssse3)
PRED4x4(horizontal_down, 10, avx)

#define PRED8x8(TYPE, DEPTH, OPT) \
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                    ptrdiff_t stride);

PRED8x8(dc, 10, mmxext)
PRED8x8(dc, 10, sse2)
PRED8x8(top_dc, 10, sse2)
PRED8x8(plane, 10, sse2)
PRED8x8(vertical, 10, sse2)
PRED8x8(horizontal, 10, sse2)

#define PRED8x8L(TYPE, DEPTH, OPT)\
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                     int has_topleft, \
                                                     int has_topright, \
                                                     ptrdiff_t stride);

PRED8x8L(dc, 10, sse2)
PRED8x8L(dc, 10, avx)
PRED8x8L(128_dc, 10, mmxext)
PRED8x8L(128_dc, 10, sse2)
PRED8x8L(top_dc, 10, sse2)
PRED8x8L(top_dc, 10, avx)
PRED8x8L(vertical, 10, sse2)
PRED8x8L(vertical, 10, avx)
PRED8x8L(horizontal, 10, sse2)
PRED8x8L(horizontal, 10, ssse3)
PRED8x8L(horizontal, 10, avx)
PRED8x8L(down_left, 10, sse2)
PRED8x8L(down_left, 10, ssse3)
PRED8x8L(down_left, 10, avx)
PRED8x8L(down_right, 10, sse2)
PRED8x8L(down_right, 10, ssse3)
PRED8x8L(down_right, 10, avx)
PRED8x8L(vertical_right, 10, sse2)
PRED8x8L(vertical_right, 10, ssse3)
PRED8x8L(vertical_right, 10, avx)
PRED8x8L(horizontal_up, 10, sse2)
PRED8x8L(horizontal_up, 10, ssse3)
PRED8x8L(horizontal_up, 10, avx)

#define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
                                                      ptrdiff_t stride);

PRED16x16(dc, 10, mmxext)
PRED16x16(dc, 10, sse2)
PRED16x16(top_dc, 10, mmxext)
PRED16x16(top_dc, 10, sse2)
PRED16x16(128_dc, 10, mmxext)
PRED16x16(128_dc, 10, sse2)
PRED16x16(left_dc, 10, mmxext)
PRED16x16(left_dc, 10, sse2)
PRED16x16(vertical, 10, mmxext)
PRED16x16(vertical, 10, sse2)
PRED16x16(horizontal, 10, mmxext)
PRED16x16(horizontal, 10, sse2)

/* 8-bit versions */
PRED16x16(vertical, 8, mmx)
PRED16x16(vertical, 8, sse)
PRED16x16(horizontal, 8, mmx)
PRED16x16(horizontal, 8, mmxext)
PRED16x16(horizontal, 8, ssse3)
PRED16x16(dc, 8, mmxext)
PRED16x16(dc, 8, sse2)
PRED16x16(dc, 8, ssse3)
PRED16x16(plane_h264, 8, mmx)
PRED16x16(plane_h264, 8, mmxext)
PRED16x16(plane_h264, 8, sse2)
PRED16x16(plane_h264, 8, ssse3)
PRED16x16(plane_rv40, 8, mmx)
PRED16x16(plane_rv40, 8, mmxext)
PRED16x16(plane_rv40, 8, sse2)
PRED16x16(plane_rv40, 8, ssse3)
PRED16x16(plane_svq3, 8, mmx)
PRED16x16(plane_svq3, 8, mmxext)
PRED16x16(plane_svq3, 8, sse2)
PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)

PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
PRED8x8(dc, 8, mmxext)
PRED8x8(vertical, 8, mmx)
PRED8x8(horizontal, 8, mmx)
PRED8x8(horizontal, 8, mmxext)
PRED8x8(horizontal, 8, ssse3)
PRED8x8(plane, 8, mmx)
PRED8x8(plane, 8, mmxext)
PRED8x8(plane, 8, sse2)
PRED8x8(plane, 8, ssse3)
PRED8x8(tm_vp8, 8, mmx)
PRED8x8(tm_vp8, 8, mmxext)
PRED8x8(tm_vp8, 8, sse2)
PRED8x8(tm_vp8, 8, ssse3)

PRED8x8L(top_dc, 8, mmxext)
PRED8x8L(top_dc, 8, ssse3)
PRED8x8L(dc, 8, mmxext)
PRED8x8L(dc, 8, ssse3)
PRED8x8L(horizontal, 8, mmxext)
PRED8x8L(horizontal, 8, ssse3)
PRED8x8L(vertical, 8, mmxext)
PRED8x8L(vertical, 8, ssse3)
PRED8x8L(down_left, 8, mmxext)
PRED8x8L(down_left, 8, sse2)
PRED8x8L(down_left, 8, ssse3)
PRED8x8L(down_right, 8, mmxext)
PRED8x8L(down_right, 8, sse2)
PRED8x8L(down_right, 8, ssse3)
PRED8x8L(vertical_right, 8, mmxext)
PRED8x8L(vertical_right, 8, sse2)
PRED8x8L(vertical_right, 8, ssse3)
PRED8x8L(vertical_left, 8, sse2)
PRED8x8L(vertical_left, 8, ssse3)
PRED8x8L(horizontal_up, 8, mmxext)
PRED8x8L(horizontal_up, 8, ssse3)
PRED8x8L(horizontal_down, 8, mmxext)
PRED8x8L(horizontal_down, 8, sse2)
PRED8x8L(horizontal_down, 8, ssse3)

PRED4x4(dc, 8, mmxext)
PRED4x4(down_left, 8, mmxext)
PRED4x4(down_right, 8, mmxext)
PRED4x4(vertical_left, 8, mmxext)
PRED4x4(vertical_right, 8, mmxext)
PRED4x4(horizontal_up, 8, mmxext)
PRED4x4(horizontal_down, 8, mmxext)
PRED4x4(tm_vp8, 8, mmx)
PRED4x4(tm_vp8, 8, mmxext)
PRED4x4(tm_vp8, 8, ssse3)
PRED4x4(vertical_vp8, 8, mmxext)

void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, const int bit_depth, const int chroma_format_idc)
{
    int mm_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
        if (EXTERNAL_MMX(mm_flags)) {
            h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
            h->pred16x16[HOR_PRED8x8  ] = ff_pred16x16_horizontal_8_mmx;
            if (chroma_format_idc == 1) {
                h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
                h->pred8x8 [HOR_PRED8x8  ] = ff_pred8x8_horizontal_8_mmx;
            }
            if (codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
                h->pred8x8 [PLANE_PRED8x8  ] = ff_pred8x8_tm_vp8_8_mmx;
                h->pred4x4 [TM_VP8_PRED    ] = ff_pred4x4_tm_vp8_8_mmx;
            } else {
                if (chroma_format_idc == 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    if (mm_flags & AV_CPU_FLAG_CMOV)
                        h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
                }
            }
        }

        if (EXTERNAL_MMXEXT(mm_flags)) {
            h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
            h->pred16x16[DC_PRED8x8  ] = ff_pred16x16_dc_8_mmxext;
            if (chroma_format_idc == 1)
                h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
            h->pred8x8l [TOP_DC_PRED          ] = ff_pred8x8l_top_dc_8_mmxext;
            h->pred8x8l [DC_PRED              ] = ff_pred8x8l_dc_8_mmxext;
            h->pred8x8l [HOR_PRED             ] = ff_pred8x8l_horizontal_8_mmxext;
            h->pred8x8l [VERT_PRED            ] = ff_pred8x8l_vertical_8_mmxext;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
            h->pred8x8l [VERT_RIGHT_PRED      ] = ff_pred8x8l_vertical_right_8_mmxext;
            h->pred8x8l [HOR_UP_PRED          ] = ff_pred8x8l_horizontal_up_8_mmxext;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_mmxext;
            h->pred8x8l [HOR_DOWN_PRED        ] = ff_pred8x8l_horizontal_down_8_mmxext;
            h->pred4x4 [DIAG_DOWN_RIGHT_PRED  ] = ff_pred4x4_down_right_8_mmxext;
            h->pred4x4 [VERT_RIGHT_PRED       ] = ff_pred4x4_vertical_right_8_mmxext;
            h->pred4x4 [HOR_DOWN_PRED         ] = ff_pred4x4_horizontal_down_8_mmxext;
            h->pred4x4 [DC_PRED               ] = ff_pred4x4_dc_8_mmxext;
            if (codec_id == AV_CODEC_ID_VP8 || codec_id == AV_CODEC_ID_H264) {
                h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
            }
            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
                h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
            }
            if (codec_id != AV_CODEC_ID_RV40) {
                h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
            }
            if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
                if (chroma_format_idc == 1) {
                    h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
                    h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_8_mmxext;
                }
            }
            if (codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
                h->pred8x8 [DC_PRED8x8     ] = ff_pred8x8_dc_rv40_8_mmxext;
                h->pred8x8 [PLANE_PRED8x8  ] = ff_pred8x8_tm_vp8_8_mmxext;
                h->pred4x4 [TM_VP8_PRED    ] = ff_pred4x4_tm_vp8_8_mmxext;
                h->pred4x4 [VERT_PRED      ] = ff_pred4x4_vertical_vp8_8_mmxext;
            } else {
                if (chroma_format_idc == 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
                } else {
                    h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
                }
            }
        }

        if (EXTERNAL_SSE(mm_flags)) {
            h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
        }

        if (EXTERNAL_SSE2(mm_flags)) {
            h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_sse2;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
            h->pred8x8l [VERT_RIGHT_PRED      ] = ff_pred8x8l_vertical_right_8_sse2;
            h->pred8x8l [VERT_LEFT_PRED       ] = ff_pred8x8l_vertical_left_8_sse2;
            h->pred8x8l [HOR_DOWN_PRED        ] = ff_pred8x8l_horizontal_down_8_sse2;
            if (codec_id == AV_CODEC_ID_VP8) {
                h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
                h->pred8x8 [PLANE_PRED8x8  ] = ff_pred8x8_tm_vp8_8_sse2;
            } else {
                if (chroma_format_idc == 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
                }
            }
        }

        if (EXTERNAL_SSSE3(mm_flags)) {
            h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
            h->pred16x16[DC_PRED8x8  ] = ff_pred16x16_dc_8_ssse3;
            if (chroma_format_idc == 1)
                h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
            h->pred8x8l [TOP_DC_PRED          ] = ff_pred8x8l_top_dc_8_ssse3;
            h->pred8x8l [DC_PRED              ] = ff_pred8x8l_dc_8_ssse3;
            h->pred8x8l [HOR_PRED             ] = ff_pred8x8l_horizontal_8_ssse3;
            h->pred8x8l [VERT_PRED            ] = ff_pred8x8l_vertical_8_ssse3;
            h->pred8x8l [DIAG_DOWN_LEFT_PRED  ] = ff_pred8x8l_down_left_8_ssse3;
            h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
            h->pred8x8l [VERT_RIGHT_PRED      ] = ff_pred8x8l_vertical_right_8_ssse3;
            h->pred8x8l [VERT_LEFT_PRED       ] = ff_pred8x8l_vertical_left_8_ssse3;
            h->pred8x8l [HOR_UP_PRED          ] = ff_pred8x8l_horizontal_up_8_ssse3;
            h->pred8x8l [HOR_DOWN_PRED        ] = ff_pred8x8l_horizontal_down_8_ssse3;
            if (codec_id == AV_CODEC_ID_VP8) {
                h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
                h->pred4x4 [TM_VP8_PRED   ] = ff_pred4x4_tm_vp8_8_ssse3;
            } else {
                if (chroma_format_idc == 1)
                    h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
                if (codec_id == AV_CODEC_ID_SVQ3) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
                } else if (codec_id == AV_CODEC_ID_RV40) {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
                } else {
                    h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
                }
            }
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMXEXT(mm_flags)) {
            h->pred4x4[DC_PRED     ] = ff_pred4x4_dc_10_mmxext;
            h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;

            if (chroma_format_idc == 1)
                h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;

            h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;

            h->pred16x16[DC_PRED8x8      ] = ff_pred16x16_dc_10_mmxext;
            h->pred16x16[TOP_DC_PRED8x8  ] = ff_pred16x16_top_dc_10_mmxext;
            h->pred16x16[DC_128_PRED8x8  ] = ff_pred16x16_128_dc_10_mmxext;
            h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
            h->pred16x16[VERT_PRED8x8    ] = ff_pred16x16_vertical_10_mmxext;
            h->pred16x16[HOR_PRED8x8     ] = ff_pred16x16_horizontal_10_mmxext;
        }
        if (EXTERNAL_SSE2(mm_flags)) {
            h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_sse2;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_sse2;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_sse2;

            if (chroma_format_idc == 1) {
                h->pred8x8[DC_PRED8x8     ] = ff_pred8x8_dc_10_sse2;
                h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
                h->pred8x8[PLANE_PRED8x8  ] = ff_pred8x8_plane_10_sse2;
                h->pred8x8[VERT_PRED8x8   ] = ff_pred8x8_vertical_10_sse2;
                h->pred8x8[HOR_PRED8x8    ] = ff_pred8x8_horizontal_10_sse2;
            }

            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_sse2;
            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_sse2;
            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_sse2;
            h->pred8x8l[DC_128_PRED         ] = ff_pred8x8l_128_dc_10_sse2;
            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_sse2;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_sse2;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_sse2;

            h->pred16x16[DC_PRED8x8      ] = ff_pred16x16_dc_10_sse2;
            h->pred16x16[TOP_DC_PRED8x8  ] = ff_pred16x16_top_dc_10_sse2;
            h->pred16x16[DC_128_PRED8x8  ] = ff_pred16x16_128_dc_10_sse2;
            h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
            h->pred16x16[VERT_PRED8x8    ] = ff_pred16x16_vertical_10_sse2;
            h->pred16x16[HOR_PRED8x8     ] = ff_pred16x16_horizontal_10_sse2;
        }
        if (EXTERNAL_SSSE3(mm_flags)) {
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_ssse3;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_ssse3;

            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_ssse3;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_ssse3;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_ssse3;
        }
        if (EXTERNAL_AVX(mm_flags)) {
            h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
            h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
            h->pred4x4[VERT_LEFT_PRED      ] = ff_pred4x4_vertical_left_10_avx;
            h->pred4x4[VERT_RIGHT_PRED     ] = ff_pred4x4_vertical_right_10_avx;
            h->pred4x4[HOR_DOWN_PRED       ] = ff_pred4x4_horizontal_down_10_avx;

            h->pred8x8l[VERT_PRED           ] = ff_pred8x8l_vertical_10_avx;
            h->pred8x8l[HOR_PRED            ] = ff_pred8x8l_horizontal_10_avx;
            h->pred8x8l[DC_PRED             ] = ff_pred8x8l_dc_10_avx;
            h->pred8x8l[TOP_DC_PRED         ] = ff_pred8x8l_top_dc_10_avx;
            h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
            h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
            h->pred8x8l[VERT_RIGHT_PRED     ] = ff_pred8x8l_vertical_right_10_avx;
            h->pred8x8l[HOR_UP_PRED         ] = ff_pred8x8l_horizontal_up_10_avx;
        }
    }
}
492
project/jni/ffmpeg/libavcodec/x86/h264_qpel.c
Normal file
@@ -0,0 +1,492 @@
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 * Copyright (c) 2011 Daniel Kang
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"

#if HAVE_YASM
void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   int line_size, int h)
{
    ff_put_pixels8_mmxext(block, pixels, line_size, h);
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}
static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   int line_size, int h)
{
    ff_avg_pixels8_mmxext(block, pixels, line_size, h);
    ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}
void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);
#define ff_put_pixels8_l2_sse2  ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext

#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);

DEF_QPEL(avg)
DEF_QPEL(put)

#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    int w=3;\
    src -= 2*srcStride+2;\
    while(w--){\
        ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
        tmp += 4;\
        src += 4;\
    }\
    tmp -= 3*4;\
    ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
    src += 4;\
    dst += 4;\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
    int w = (size+8)>>2;\
    src -= 2*srcStride+2;\
    while(w--){\
        ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\
        tmp += 4;\
        src += 4;\
    }\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
        ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
        tmp += 8;\
        dst += 8;\
    }while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
    ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\


#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\

void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);

#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64

#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

static av_always_inline void ff_put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
        tmp += 8;
        src += 8;
    }
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    ff_put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

#define ff_put_h264_qpel8_h_lowpass_l2_sse2  ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2  ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext

#define ff_put_h264_qpel8_v_lowpass_ssse3  ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3  ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2

#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext

#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext

#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
    ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\

#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
#undef PAVGB

H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)


//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
     (uint8_t *dst, uint8_t *src, int stride);

#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  4, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

#define LUMA_MC_816(DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg,  8, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
    LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)

LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)

LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)

#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
    src += 8*stride;\
    dst += 8*stride;\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst   , src   , stride);\
    ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}

#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)

#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)

#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif

#endif /* HAVE_YASM */
884
project/jni/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
Normal file
@@ -0,0 +1,884 @@
;*****************************************************************************
|
||||
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2011 x264 project
|
||||
;*
|
||||
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
;*
|
||||
;* Libav is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* Libav is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with Libav; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
cextern pw_16
|
||||
cextern pw_1
|
||||
cextern pb_0
|
||||
|
||||
pw_pixel_max: times 8 dw ((1 << 10)-1)
|
||||
|
||||
pad10: times 8 dw 10*1023
|
||||
pad20: times 8 dw 20*1023
|
||||
pad30: times 8 dw 30*1023
|
||||
depad: times 4 dd 32*20*1023 + 512
|
||||
depad2: times 8 dw 20*1023 + 16*1022 + 16
|
||||
unpad: times 8 dw 16*1022/32 ; needs to be mod 16
|
||||
|
||||
tap1: times 4 dw 1, -5
|
||||
tap2: times 4 dw 20, 20
|
||||
tap3: times 4 dw -5, 1
|
||||
pd_0f: times 4 dd 0xffff
|
||||
|
||||
SECTION .text
|
||||
|
||||
|
||||
%macro AVG_MOV 2
|
||||
pavgw %2, %1
|
||||
mova %1, %2
|
||||
%endmacro
|
||||
|
||||
%macro ADDW 3
|
||||
%if mmsize == 8
|
||||
paddw %1, %2
|
||||
%else
|
||||
movu %3, %2
|
||||
paddw %1, %3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro FILT_H 4
|
||||
paddw %1, %4
|
||||
psubw %1, %2 ; a-b
|
||||
psraw %1, 2 ; (a-b)/4
|
||||
psubw %1, %2 ; (a-b)/4-b
|
||||
paddw %1, %3 ; (a-b)/4-b+c
|
||||
psraw %1, 2 ; ((a-b)/4-b+c)/4
|
||||
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
|
||||
%endmacro
|
||||
|
||||
%macro PRELOAD_V 0
|
||||
lea r3, [r2*3]
|
||||
sub r1, r3
|
||||
movu m0, [r1+r2]
|
||||
movu m1, [r1+r2*2]
|
||||
add r1, r3
|
||||
movu m2, [r1]
|
||||
movu m3, [r1+r2]
|
||||
movu m4, [r1+r2*2]
|
||||
add r1, r3
|
||||
%endmacro
|
||||
|
||||
%macro FILT_V 8
|
||||
movu %6, [r1]
|
||||
paddw %1, %6
|
||||
mova %7, %2
|
||||
paddw %7, %5
|
||||
mova %8, %3
|
||||
paddw %8, %4
|
||||
FILT_H %1, %7, %8, [pw_16]
|
||||
psraw %1, 1
|
||||
CLIPW %1, [pb_0], [pw_pixel_max]
|
||||
%endmacro
|
||||
|
||||
%macro MC 1
|
||||
%define OP_MOV mova
|
||||
INIT_MMX mmxext
|
||||
%1 put, 4
|
||||
INIT_XMM sse2
|
||||
%1 put, 8
|
||||
|
||||
%define OP_MOV AVG_MOV
|
||||
INIT_MMX mmxext
|
||||
%1 avg, 4
|
||||
INIT_XMM sse2
|
||||
%1 avg, 8
|
||||
%endmacro
|
||||
|
||||
%macro MCAxA_OP 7
|
||||
%if ARCH_X86_32
|
||||
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
mov r0, r0m
|
||||
mov r1, r1m
|
||||
add r0, %3*2
|
||||
add r1, %3*2
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
mov r0, r0m
|
||||
mov r1, r1m
|
||||
lea r0, [r0+r2*%3]
|
||||
lea r1, [r1+r2*%3]
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
mov r0, r0m
|
||||
mov r1, r1m
|
||||
lea r0, [r0+r2*%3+%3*2]
|
||||
lea r1, [r1+r2*%3+%3*2]
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
RET
|
||||
%else ; ARCH_X86_64
|
||||
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
|
||||
mov r%6, r0
|
||||
%assign p1 %6+1
|
||||
mov r %+ p1, r1
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
lea r0, [r%6+%3*2]
|
||||
lea r1, [r %+ p1+%3*2]
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
lea r0, [r%6+r2*%3]
|
||||
lea r1, [r %+ p1+r2*%3]
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
lea r0, [r%6+r2*%3+%3*2]
|
||||
lea r1, [r %+ p1+r2*%3+%3*2]
|
||||
%if UNIX64 == 0 ; fall through to function
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
RET
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
;cpu, put/avg, mc, 4/8, ...
|
||||
%macro cglobal_mc 6
|
||||
%assign i %3*2
|
||||
%if ARCH_X86_32 || cpuflag(sse2)
|
||||
MCAxA_OP %1, %2, %3, i, %4,%5,%6
|
||||
%endif
|
||||
|
||||
cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
|
||||
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
|
||||
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
|
||||
RET
|
||||
%endif
|
||||
|
||||
stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
|
||||
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro COPY4 0
|
||||
movu m0, [r1 ]
|
||||
OP_MOV [r0 ], m0
|
||||
movu m0, [r1+r2 ]
|
||||
OP_MOV [r0+r2 ], m0
|
||||
movu m0, [r1+r2*2]
|
||||
OP_MOV [r0+r2*2], m0
|
||||
movu m0, [r1+r3 ]
|
||||
OP_MOV [r0+r3 ], m0
|
||||
%endmacro
|
||||
|
||||
%macro MC00 1
|
||||
INIT_MMX mmxext
|
||||
cglobal_mc %1, mc00, 4, 3,4,0
|
||||
lea r3, [r2*3]
|
||||
COPY4
|
||||
ret
|
||||
|
||||
INIT_XMM sse2
|
||||
cglobal %1_h264_qpel8_mc00_10, 3,4
|
||||
lea r3, [r2*3]
|
||||
COPY4
|
||||
lea r0, [r0+r2*4]
|
||||
lea r1, [r1+r2*4]
|
||||
COPY4
|
||||
RET
|
||||
|
||||
cglobal %1_h264_qpel16_mc00_10, 3,4
|
||||
mov r3d, 8
|
||||
.loop:
|
||||
movu m0, [r1 ]
|
||||
movu m1, [r1 +16]
|
||||
OP_MOV [r0 ], m0
|
||||
OP_MOV [r0 +16], m1
|
||||
movu m0, [r1+r2 ]
|
||||
movu m1, [r1+r2+16]
|
||||
OP_MOV [r0+r2 ], m0
|
||||
OP_MOV [r0+r2+16], m1
|
||||
lea r0, [r0+r2*2]
|
||||
lea r1, [r1+r2*2]
|
||||
dec r3d
|
||||
jg .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
%define OP_MOV mova
|
||||
MC00 put
|
||||
|
||||
%define OP_MOV AVG_MOV
|
||||
MC00 avg

;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8

%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro

%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
    mov       r3d, %2
    mova      m1, [pw_pixel_max]
%if num_mmregs > 8
    mova      m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu      m2, [r1-4]
    movu      m3, [r1-2]
    movu      m4, [r1+0]
    ADDW      m2, [r1+6], m5
    ADDW      m3, [r1+4], m5
    ADDW      m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu      m2, [r1-4]
    movu      m0, [r1+6]
    mova      m6, m0
    psrldq    m0, 6

    paddw     m6, m2
    PALIGNR   m3, m0, m2, 2, m5
    PALIGNR   m7, m0, m2, 8, m5
    paddw     m3, m7
    PALIGNR   m4, m0, m2, 4, m5
    PALIGNR   m7, m0, m2, 6, m5
    paddw     m4, m7
    SWAP      2, 6
%else
    movu      m2, [r1-4]
    movu      m6, [r1+4]
    PALIGNR   m3, m6, m2, 2, m5
    paddw     m3, m6
    PALIGNR   m4, m6, m2, 4, m5
    PALIGNR   m7, m6, m2, 6, m5
    paddw     m4, m7
    paddw     m2, [r1+6]
%endif
%endif

    FILT_H    m2, m3, m4, p16
    psraw     m2, 1
    pxor      m0, m0
    CLIPW     m2, m0, m1
    OP_MOV    [r0], m2
    add       r0, r2
    add       r1, r2
    dec       r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC20
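
; mc20 is the horizontal half-pel position: H.264's 6-tap (1,-5,20,20,-5,1)
; filter applied across each row. A rough scalar sketch of one 10-bit output
; sample (illustrative names, not the actual C fallback):
;
;     int a = src[x-2] + src[x+3];
;     int b = src[x-1] + src[x+2];
;     int c = src[x+0] + src[x+1];
;     dst[x] = av_clip((a - 5*b + 20*c + 16) >> 5, 0, 1023);
;
; FILT_H produces the (a - 5*b + 20*c + 16) >> 4 value from the pairwise sums
; in m2/m3/m4, psraw 1 completes the >> 5, and CLIPW clamps the result.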

;-----------------------------------------------------------------------------
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
    lea       r4, [r1+2]
    jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro

MC_CACHE MC30

;-----------------------------------------------------------------------------
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
    mov       r4, r1
.body:
    mov       r3d, %2
    mova      m1, [pw_pixel_max]
%if num_mmregs > 8
    mova      m8, [pw_16]
    %define p16 m8
%else
    %define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
    movu      m2, [r1-4]
    movu      m3, [r1-2]
    movu      m4, [r1+0]
    ADDW      m2, [r1+6], m5
    ADDW      m3, [r1+4], m5
    ADDW      m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
    movu      m2, [r1-4]
    movu      m0, [r1+6]
    mova      m6, m0
    psrldq    m0, 6

    paddw     m6, m2
    PALIGNR   m3, m0, m2, 2, m5
    PALIGNR   m7, m0, m2, 8, m5
    paddw     m3, m7
    PALIGNR   m4, m0, m2, 4, m5
    PALIGNR   m7, m0, m2, 6, m5
    paddw     m4, m7
    SWAP      2, 6
%else
    movu      m2, [r1-4]
    movu      m6, [r1+4]
    PALIGNR   m3, m6, m2, 2, m5
    paddw     m3, m6
    PALIGNR   m4, m6, m2, 4, m5
    PALIGNR   m7, m6, m2, 6, m5
    paddw     m4, m7
    paddw     m2, [r1+6]
%endif
%endif

    FILT_H    m2, m3, m4, p16
    psraw     m2, 1
    pxor      m0, m0
    CLIPW     m2, m0, m1
    movu      m3, [r4]
    pavgw     m2, m3
    OP_MOV    [r0], m2
    add       r0, r2
    add       r1, r2
    add       r4, r2
    dec       r3d
    jg .nextrow
    rep ret
%endmacro

MC_CACHE MC10
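
; mc10/mc30 are the quarter-pel positions either side of the half-pel column:
; they reuse the mc20 filter body via the shared .body label and pavgw the
; filtered value with the nearest full-pel column (r4 = src for mc10,
; src + one pixel for mc30), i.e. roughly (full + half + 1) >> 1.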

;-----------------------------------------------------------------------------
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 10
v_filt%9_%10_10
    add       r4, r2
.no_addr4:
    FILT_V    m0, m1, m2, m3, m4, m5, m6, m7
    add       r1, r2
    add       r0, r2
    ret
%endmacro

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
    PRELOAD_V

    sub       r0, r2
%assign j 0
%rep %2
%assign i (j % 6)
    call v_filt%2_ %+ i %+ _10.no_addr4
    OP_MOV    [r0], m0
    SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
    ret
%endmacro

MC MC02
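
; mc02 is the vertical half-pel position: the same 6-tap filter applied down
; each column. PRELOAD_V (defined elsewhere in this file) loads the first
; source rows; each v_filt<size>_<i>_10 stub then emits one output row, and
; the SWAP rotation after every call renames the registers so only one fresh
; row has to be loaded per output row.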

;-----------------------------------------------------------------------------
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
    mov       r4, r1
.body:
    PRELOAD_V

    sub       r4, r2
    sub       r0, r2
%assign j 0
%rep %2
%assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    movu      m7, [r4]
    pavgw     m0, m7
    OP_MOV    [r0], m0
    SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
    ret
%endmacro

MC MC01

;-----------------------------------------------------------------------------
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
    lea       r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro

MC MC03

;-----------------------------------------------------------------------------
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
    movu      m5, [r4-4]
    ADDW      m5, [r4+6], m7
    movu      m6, [r4-2]
    ADDW      m6, [r4+4], m7
    paddw     m5, [pw_16]
    psubw     m5, m6 ; a-b
    psraw     m5, 2  ; (a-b)/4
    psubw     m5, m6 ; (a-b)/4-b
    movu      m6, [r4+0]
    ADDW      m6, [r4+2], m7
    paddw     m5, m6 ; (a-b)/4-b+c
    psraw     m5, 2  ; ((a-b)/4-b+c)/4
    paddw     m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    psraw     m5, 1
    CLIPW     m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
    pavgw     m0, m5
%if %0!=4
    movu      m5, [r1+r5]
%endif
    ret
%endmacro
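
; The shift sequence above is a strength-reduced 6-tap filter; with
; t = a + 16 (pw_16 pre-added for rounding):
;     ((t - b) >> 2 - b + c) >> 2 + c
;   = (a - 5*b + 20*c + 16) / 16        (up to shift truncation)
; so the final psraw 1 yields the rounded (a - 5*b + 20*c + 16) >> 5.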

INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0

INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep

%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
    mov       r4, r1
.body:
    PRELOAD_V

    sub       r0, r2
    sub       r4, r2
    mov       r5, r2
    neg       r5
%assign j 0
%rep %2
%assign i (j % 6)
    call v_filt%2_ %+ i %+ _10
    call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
    movu      m5, [r1+r5]
%endif
    OP_MOV    [r0], m0
    SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
    ret
%endmacro

MC MC11

;-----------------------------------------------------------------------------
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
    mov       r4, r1
    add       r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC31

;-----------------------------------------------------------------------------
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
    lea       r4, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC13

;-----------------------------------------------------------------------------
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
    lea       r4, [r1+r2]
    add       r1, 2
    jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro

MC MC33

;-----------------------------------------------------------------------------
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
    psubw     %1, %2 ; a-b
    psubw     %2, %3 ; b-c
    psllw     %2, 2
    psubw     %1, %2 ; a-5*b+4*c
    psllw     %3, 4
    paddw     %1, %3 ; a-5*b+20*c
%endmacro

%macro FILT_VNRD 8
    movu      %6, [r1]
    paddw     %1, %6
    mova      %7, %2
    paddw     %7, %5
    mova      %8, %3
    paddw     %8, %4
    FILT_H2   %1, %7, %8
%endmacro
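
; FILT_H2 computes a - 5*b + 20*c without pmullw:
;   %1 = a-b;  %2 = (b-c) << 2;  %1 - %2 = a - 5*b + 4*c;
;   adding %3 << 4 = 16*c gives a - 5*b + 20*c.
; FILT_VNRD forms the three pairwise sums of one vertical filter step
; (loading only the newest row from [r1]) and feeds them through FILT_H2,
; leaving an unnormalized 16-bit intermediate for the second pass.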

%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
    neg       r2           ; This actually saves instructions
    lea       r1, [r1+r2*2-mmsize+PAD]
    lea       r4, [rsp+PAD+gprsize]
    mov       r3d, COUNT
.v_loop:
    movu      m0, [r1]
    sub       r1, r2
    movu      m1, [r1]
    sub       r1, r2
    movu      m2, [r1]
    sub       r1, r2
    movu      m3, [r1]
    sub       r1, r2
    movu      m4, [r1]
    sub       r1, r2
%assign i 0
%rep %1-1
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw     m0, [pad20]
    movu      [r4+i*mmsize*3], m0
    sub       r1, r2
    SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
    FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
    psubw     m0, [pad20]
    movu      [r4+i*mmsize*3], m0
    add       r4, mmsize
    lea       r1, [r1+r2*8+mmsize]
%if %1==8
    lea       r1, [r1+r2*4]
%endif
    dec       r3d
    jg .v_loop
    neg       r2
    ret
%endmacro

INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8
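
; put_hv<size>_10 runs the vertical 6-tap pass over a strip of the source and
; stores the unnormalized 16-bit intermediates (biased by [pad20], a constant
; defined elsewhere in this file, apparently to keep them in signed 16-bit
; range) into the caller's stack buffer at [rsp+gprsize]. Rows are spaced
; mmsize*3 bytes apart so the horizontal pass can read a full 6-tap window
; per row with plain loads.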

%macro H_LOOP 1
%if num_mmregs > 8
    %define s1 m8
    %define s2 m9
    %define s3 m10
    %define d1 m11
%else
    %define s1 [tap1]
    %define s2 [tap2]
    %define s3 [tap3]
    %define d1 [depad]
%endif
h%1_loop_op:
    movu      m1, [r1+mmsize-4]
    movu      m2, [r1+mmsize-2]
    mova      m3, [r1+mmsize+0]
    movu      m4, [r1+mmsize+2]
    movu      m5, [r1+mmsize+4]
    movu      m6, [r1+mmsize+6]
%if num_mmregs > 8
    pmaddwd   m1, s1
    pmaddwd   m2, s1
    pmaddwd   m3, s2
    pmaddwd   m4, s2
    pmaddwd   m5, s3
    pmaddwd   m6, s3
    paddd     m1, d1
    paddd     m2, d1
%else
    mova      m0, s1
    pmaddwd   m1, m0
    pmaddwd   m2, m0
    mova      m0, s2
    pmaddwd   m3, m0
    pmaddwd   m4, m0
    mova      m0, s3
    pmaddwd   m5, m0
    pmaddwd   m6, m0
    mova      m0, d1
    paddd     m1, m0
    paddd     m2, m0
%endif
    paddd     m3, m5
    paddd     m4, m6
    paddd     m1, m3
    paddd     m2, m4
    psrad     m1, 10
    psrad     m2, 10
    pslld     m2, 16
    pand      m1, [pd_0f]
    por       m1, m2
%if num_mmregs <= 8
    pxor      m0, m0
%endif
    CLIPW     m1, m0, m7
    add       r1, mmsize*3
    ret
%endmacro

INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8

%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
    mov       r6, rsp           ; backup stack pointer
    and       rsp, ~(mmsize-1)  ; align stack
    sub       rsp, PAD

    call put_hv%2_10

    mov       r3d, %2
    mova      m7, [pw_pixel_max]
%if num_mmregs > 8
    pxor      m0, m0
    mova      m8, [tap1]
    mova      m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov       r1, rsp
.h_loop:
    call h%2_loop_op

    OP_MOV    [r0], m1
    add       r0, r2
    dec       r3d
    jg .h_loop

    mov       rsp, r6           ; restore stack pointer
    ret
%endmacro

MC MC22
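
; mc22 chains both passes through a stack temporary: r6 saves rsp,
; "and rsp, ~(mmsize-1)" rounds the stack down to an mmsize boundary so
; aligned accesses to the buffer are safe, and PAD bytes are reserved below
; it. put_hv writes the vertical-pass intermediates there; h<size>_loop_op
; then finishes horizontally with pmaddwd taps, a >> 10 normalization and a
; clip to [0, pw_pixel_max].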

;-----------------------------------------------------------------------------
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2        ; SIZE*16*4*sizeof(pixel)
    mov       r6, rsp           ; backup stack pointer
    and       rsp, ~(mmsize-1)  ; align stack
    sub       rsp, PAD

    call put_hv%2_10

    xor       r4d, r4d
.body:
    mov       r3d, %2
    pxor      m0, m0
    mova      m7, [pw_pixel_max]
%if num_mmregs > 8
    mova      m8, [tap1]
    mova      m9, [tap2]
    mova      m10, [tap3]
    mova      m11, [depad]
%endif
    mov       r1, rsp
.h_loop:
    call h%2_loop_op

    movu      m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
    paddw     m3, [depad2]
    psrlw     m3, 5
    psubw     m3, [unpad]
    CLIPW     m3, m0, m7
    pavgw     m1, m3

    OP_MOV    [r0], m1
    add       r0, r2
    dec       r3d
    jg .h_loop

    mov       rsp, r6           ; restore stack pointer
    ret
%endmacro

MC MC12

;-----------------------------------------------------------------------------
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2        ; SIZE*16*3*sizeof(pixel)
    mov       r6, rsp           ; backup stack pointer
    and       rsp, ~(mmsize-1)  ; align stack
    sub       rsp, PAD

    call put_hv%2_10

    mov       r4d, 2            ; sizeof(pixel)
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC32

;-----------------------------------------------------------------------------
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_NRD 1
put_h%1_10:
    add       rsp, gprsize
    mov       r3d, %1
    xor       r4d, r4d
    mova      m6, [pad20]
.nextrow:
    movu      m2, [r5-4]
    movu      m3, [r5-2]
    movu      m4, [r5+0]
    ADDW      m2, [r5+6], m5
    ADDW      m3, [r5+4], m5
    ADDW      m4, [r5+2], m5

    FILT_H2   m2, m3, m4
    psubw     m2, m6
    mova      [rsp+r4], m2
    add       r4d, mmsize*3
    add       r5, r2
    dec       r3d
    jg .nextrow
    sub       rsp, gprsize
    ret
%endmacro

INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8

%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
    mov       r5, r1
.body:
%define PAD mmsize*8*3*2        ; SIZE*16*3*sizeof(pixel)
    mov       r6, rsp           ; backup stack pointer
    and       rsp, ~(mmsize-1)  ; align stack

    sub       rsp, PAD
    call put_h%2_10

    sub       rsp, PAD
    call put_hv%2_10

    mov       r4d, PAD-mmsize   ; H buffer
    jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro

MC MC21

;-----------------------------------------------------------------------------
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
    lea       r5, [r1+r2]
    jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro

MC MC23
862
project/jni/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
Normal file
@@ -0,0 +1,862 @@
;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text


%macro op_avgh 3
    movh      %3, %2
    pavgb     %1, %3
    movh      %2, %1
%endmacro

%macro op_avg 2-3
    pavgb     %1, %2
    mova      %2, %1
%endmacro

%macro op_puth 2-3
    movh      %2, %1
%endmacro

%macro op_put 2-3
    mova      %2, %1
%endmacro

%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor      m7, m7
    mova      m4, [pw_5]
    mova      m5, [pw_16]
    mov       r4d, 4
.loop:
    movh      m1, [r1-1]
    movh      m2, [r1+0]
    movh      m3, [r1+1]
    movh      m0, [r1+2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m0, m7
    paddw     m1, m0
    paddw     m2, m3
    movh      m0, [r1-2]
    movh      m3, [r1+3]
    punpcklbw m0, m7
    punpcklbw m3, m7
    paddw     m0, m3
    psllw     m2, 2
    psubw     m2, m1
    pmullw    m2, m4
    paddw     m0, m5
    paddw     m0, m2
    psraw     m0, 5
    packuswb  m0, m0
    op_%1h    m0, [r0], m6
    add       r0, r2
    add       r1, r3
    dec       r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
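
; Rough C reference for the 4-wide lowpass above (illustrative names, not
; the actual C fallback):
;
;     int a = src[x-2] + src[x+3];
;     int b = src[x-1] + src[x+2];
;     int c = src[x+0] + src[x+1];
;     dst[x] = av_clip_uint8((a - 5*b + 20*c + 16) >> 5);
;
; In the MMX code the 20*c - 5*b term is formed as (4*c - b) * 5 via
; psllw/psubw/pmullw, and packuswb provides the unsigned saturating clamp.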

%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov       r4d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    mova      m0, [r1]
    mova      m2, [r1+1]
    mova      m1, m0
    mova      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    paddw     m0, m2
    paddw     m1, m3
    psllw     m0, 2
    psllw     m1, 2
    mova      m2, [r1-1]
    mova      m4, [r1+2]
    mova      m3, m2
    mova      m5, m4
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    paddw     m2, m4
    paddw     m5, m3
    psubw     m0, m2
    psubw     m1, m5
    pmullw    m0, m6
    pmullw    m1, m6
    movd      m2, [r1-2]
    movd      m5, [r1+7]
    punpcklbw m2, m7
    punpcklbw m5, m7
    paddw     m2, m3
    paddw     m4, m5
    mova      m5, [pw_16]
    paddw     m2, m5
    paddw     m4, m5
    paddw     m0, m2
    paddw     m1, m4
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m1
    op_%1     m0, [r0], m4
    add       r0, r2
    add       r1, r3
    dec       r4d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg


%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    mov       r4d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    movu      m1, [r1-2]
    mova      m0, m1
    punpckhbw m1, m7
    punpcklbw m0, m7
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m4, m0, 2
    palignr   m3, m0, 4
    palignr   m2, m0, 6
    palignr   m1, m0, 8
    palignr   m5, m0, 10
    paddw     m0, m5
    paddw     m2, m3
    paddw     m1, m4
    psllw     m2, 2
    psubw     m2, m1
    paddw     m0, [pw_16]
    pmullw    m2, m6
    paddw     m2, m0
    psraw     m2, 5
    packuswb  m2, m2
    op_%1h    m2, [r0], m4
    add       r1, r3
    add       r0, r2
    dec       r4d
    jne .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg


%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pxor      m7, m7
    mova      m4, [pw_5]
    mova      m5, [pw_16]
    mov       r5d, 4
.loop:
    movh      m1, [r1-1]
    movh      m2, [r1+0]
    movh      m3, [r1+1]
    movh      m0, [r1+2]
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m0, m7
    paddw     m1, m0
    paddw     m2, m3
    movh      m0, [r1-2]
    movh      m3, [r1+3]
    punpcklbw m0, m7
    punpcklbw m3, m7
    paddw     m0, m3
    psllw     m2, 2
    psubw     m2, m1
    pmullw    m2, m4
    paddw     m0, m5
    paddw     m0, m2
    movh      m3, [r2]
    psraw     m0, 5
    packuswb  m0, m0
    pavgb     m0, m3
    op_%1h    m0, [r0], m6
    add       r0, r3
    add       r1, r3
    add       r2, r4
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg


%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov       r5d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    mova      m0, [r1]
    mova      m2, [r1+1]
    mova      m1, m0
    mova      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    paddw     m0, m2
    paddw     m1, m3
    psllw     m0, 2
    psllw     m1, 2
    mova      m2, [r1-1]
    mova      m4, [r1+2]
    mova      m3, m2
    mova      m5, m4
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    paddw     m2, m4
    paddw     m5, m3
    psubw     m0, m2
    psubw     m1, m5
    pmullw    m0, m6
    pmullw    m1, m6
    movd      m2, [r1-2]
    movd      m5, [r1+7]
    punpcklbw m2, m7
    punpcklbw m5, m7
    paddw     m2, m3
    paddw     m4, m5
    mova      m5, [pw_16]
    paddw     m2, m5
    paddw     m4, m5
    paddw     m0, m2
    paddw     m1, m4
    psraw     m0, 5
    psraw     m1, 5
    mova      m4, [r2]
    packuswb  m0, m1
    pavgb     m0, m4
    op_%1     m0, [r0], m4
    add       r0, r3
    add       r1, r3
    add       r2, r4
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg


%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov       r5d, 8
    pxor      m7, m7
    mova      m6, [pw_5]
.loop:
    lddqu     m1, [r1-2]
    mova      m0, m1
    punpckhbw m1, m7
    punpcklbw m0, m7
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m4, m0, 2
    palignr   m3, m0, 4
    palignr   m2, m0, 6
    palignr   m1, m0, 8
    palignr   m5, m0, 10
    paddw     m0, m5
    paddw     m2, m3
    paddw     m1, m4
    psllw     m2, 2
    movh      m3, [r2]
    psubw     m2, m1
    paddw     m0, [pw_16]
    pmullw    m2, m6
    paddw     m2, m0
    psraw     m2, 5
    packuswb  m2, m2
    pavgb     m2, m3
    op_%1h    m2, [r0], m4
    add       r1, r3
    add       r0, r3
    add       r2, r4
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg


; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
%macro FILT_V 1
    mova      m6, m2
    movh      m5, [r1]
    paddw     m6, m3
    psllw     m6, 2
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, [pw_16]
    add       r1, r3
    paddw     m0, m5
    paddw     m6, m0
    psraw     m6, 5
    packuswb  m6, m6
    op_%1h    m6, [r0], m0 ; 1
    add       r0, r2
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub       r1, r3
    sub       r1, r3
    pxor      m7, m7
    movh      m0, [r1]
    movh      m1, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m2, [r1]
    movh      m3, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m4, [r1]
    add       r1, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg



%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    sub       r1, r3
    sub       r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
%endif
    pxor      m7, m7
    movh      m0, [r1]
    movh      m1, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m2, [r1]
    movh      m3, [r1+r3]
    lea       r1, [r1+2*r3]
    movh      m4, [r1]
    add       r1, r3
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    cmp       r4d, 16
    jne .end
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
    FILT_V    %1
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg


; All functions that use this are required to have args:
; src, tmp, srcSize
%macro FILT_HV 1 ; offset
    mova      m6, m2
    movh      m5, [r0]
    paddw     m6, m3
    psllw     m6, 2
    paddw     m0, [pw_16]
    psubw     m6, m1
    psubw     m6, m4
    punpcklbw m5, m7
    pmullw    m6, [pw_5]
    paddw     m0, m5
    add       r0, r2
    paddw     m6, m0
    mova      [r1+%1], m6
    SWAP 0, 1, 2, 3, 4, 5
%endmacro

%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn r2, r2d
    pxor      m7, m7
    movh      m0, [r0]
    movh      m1, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m2, [r0]
    movh      m3, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m4, [r0]
    add       r0, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_HV   0*24
    FILT_HV   1*24
    FILT_HV   2*24
    FILT_HV   3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn r2, r2d
    mov       r3d, 4
.loop:
    mova      m0, [r0]
    paddw     m0, [r0+10]
    mova      m1, [r0+2]
    paddw     m1, [r0+8]
    mova      m2, [r0+4]
    paddw     m2, [r0+6]
    psubw     m0, m1
    psraw     m0, 2
    psubw     m0, m1
    paddsw    m0, m2
    psraw     m0, 2
    paddw     m0, m2
    psraw     m0, 6
    packuswb  m0, m0
    op_%1h    m0, [r1], m7
    add       r0, 24
    add       r1, r2
    dec       r3d
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg

%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn r2, r2d
    pxor      m7, m7
    movh      m0, [r0]
    movh      m1, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m2, [r0]
    movh      m3, [r0+r2]
    lea       r0, [r0+2*r2]
    movh      m4, [r0]
    add       r0, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    FILT_HV   0*48
    FILT_HV   1*48
    FILT_HV   2*48
    FILT_HV   3*48
    FILT_HV   4*48
    FILT_HV   5*48
    FILT_HV   6*48
    FILT_HV   7*48
    cmp       r3d, 16
    jne .end
    FILT_HV   8*48
    FILT_HV   9*48
    FILT_HV   10*48
    FILT_HV   11*48
    FILT_HV   12*48
    FILT_HV   13*48
    FILT_HV   14*48
    FILT_HV   15*48
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put



%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn r2, r2d
.loop:
    mova      m0, [r1]
    mova      m3, [r1+8]
    mova      m1, [r1+2]
    mova      m4, [r1+10]
    paddw     m0, m4
    paddw     m1, m3
    paddw     m3, [r1+18]
    paddw     m4, [r1+16]
    mova      m2, [r1+4]
    mova      m5, [r1+12]
    paddw     m2, [r1+6]
    paddw     m5, [r1+14]
    psubw     m0, m1
    psubw     m3, m4
    psraw     m0, 2
    psraw     m3, 2
    psubw     m0, m1
    psubw     m3, m4
    paddsw    m0, m2
    paddsw    m3, m5
    psraw     m0, 2
    psraw     m3, 2
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 6
    psraw     m3, 6
    packuswb  m0, m3
    op_%1     m0, [r0], m7
    add       r1, 48
    add       r0, r2
    dec       r4d
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg

%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    cmp       r4d, 16
    je .op16
.loop8:
    mova      m1, [r1+16]
    mova      m0, [r1]
    mova      m2, m1
    mova      m3, m1
    mova      m4, m1
    mova      m5, m1
    palignr   m5, m0, 10
    palignr   m4, m0, 8
    palignr   m3, m0, 6
    palignr   m2, m0, 4
    palignr   m1, m0, 2
    paddw     m0, m5
    paddw     m1, m4
    paddw     m2, m3
    psubw     m0, m1
    psraw     m0, 2
    psubw     m0, m1
    paddw     m0, m2
    psraw     m0, 2
    paddw     m0, m2
    psraw     m0, 6
    packuswb  m0, m0
    op_%1h    m0, [r0], m7
    add       r1, 48
    add       r0, r2
    dec       r4d
    jne .loop8
    jmp .done
.op16:
    mova      m4, [r1+32]
    mova      m5, [r1+16]
    mova      m7, [r1]
    mova      m3, m4
    mova      m2, m4
    mova      m1, m4
    mova      m0, m4
    palignr   m0, m5, 10
    palignr   m1, m5, 8
    palignr   m2, m5, 6
    palignr   m3, m5, 4
    palignr   m4, m5, 2
    paddw     m0, m5
    paddw     m1, m4
    paddw     m2, m3
    mova      m6, m5
    mova      m4, m5
    mova      m3, m5
    palignr   m4, m7, 8
    palignr   m6, m7, 2
    palignr   m3, m7, 10
    paddw     m4, m6
    mova      m6, m5
    palignr   m5, m7, 6
    palignr   m6, m7, 4
    paddw     m3, m7
    paddw     m5, m6
    psubw     m0, m1
    psubw     m3, m4
    psraw     m0, 2
    psraw     m3, 2
    psubw     m0, m1
    psubw     m3, m4
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 2
    psraw     m3, 2
    paddw     m0, m2
    paddw     m3, m5
    psraw     m0, 6
    psraw     m3, 6
    packuswb  m3, m0
    op_%1     m3, [r0], m7
    add       r1, 48
    add       r0, r2
    dec       r4d
    jne .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg


%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mova      m0, [r1]
    mova      m1, [r1+24]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m0
    packuswb  m1, m1
    pavgb     m0, [r2]
    pavgb     m1, [r2+r4]
    op_%1h    m0, [r0], m4
    op_%1h    m1, [r0+r3], m5
    lea       r2, [r2+r4*2]
    lea       r0, [r0+r3*2]
    mova      m0, [r1+48]
    mova      m1, [r1+72]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m0
    packuswb  m1, m1
    pavgb     m0, [r2]
    pavgb     m1, [r2+r4]
    op_%1h    m0, [r0], m4
    op_%1h    m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg


%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
.loop:
    mova      m0, [r1]
    mova      m1, [r1+8]
    mova      m2, [r1+48]
    mova      m3, [r1+48+8]
    psraw     m0, 5
    psraw     m1, 5
    psraw     m2, 5
    psraw     m3, 5
    packuswb  m0, m1
    packuswb  m2, m3
    pavgb     m0, [r2]
    pavgb     m2, [r2+r4]
    op_%1     m0, [r0], m4
    op_%1     m2, [r0+r3], m5
    lea       r2, [r2+2*r4]
    add       r1, 48*2
    lea       r0, [r0+2*r3]
    sub       r5d, 2
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
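
; pixels*_l2_shift5 post-processes the hv temp buffer: the 16-bit
; intermediates there are scaled by 32 relative to pixel range, so psraw 5
; renormalizes, packuswb clamps to 8 bits, and pavgb folds in the second
; prediction from src8.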


%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    mov       r5d, 16
    pxor      m15, m15
    mova      m14, [pw_5]
    mova      m13, [pw_16]
.loop:
    lddqu     m1, [r1+6]
    lddqu     m7, [r1-2]
    mova      m0, m1
    punpckhbw m1, m15
    punpcklbw m0, m15
    punpcklbw m7, m15
    mova      m2, m1
    mova      m6, m0
    mova      m3, m1
    mova      m8, m0
    mova      m4, m1
    mova      m9, m0
    mova      m12, m0
    mova      m11, m1
    palignr   m11, m0, 10
    palignr   m12, m7, 10
    palignr   m4, m0, 2
    palignr   m9, m7, 2
    palignr   m3, m0, 4
    palignr   m8, m7, 4
    palignr   m2, m0, 6
    palignr   m6, m7, 6
    paddw     m11, m0
    palignr   m1, m0, 8
    palignr   m0, m7, 8
    paddw     m7, m12
    paddw     m2, m3
    paddw     m6, m8
    paddw     m1, m4
    paddw     m0, m9
    psllw     m2, 2
    psllw     m6, 2
    psubw     m2, m1
    psubw     m6, m0
    paddw     m11, m13
    paddw     m7, m13
    pmullw    m2, m14
    pmullw    m6, m14
    lddqu     m3, [r2]
    paddw     m2, m11
    paddw     m6, m7
    psraw     m2, 5
    psraw     m6, 5
    packuswb  m6, m2
    pavgb     m6, m3
    op_%1     m6, [r0], m11
    add       r1, r3
    add       r0, r3
    add       r2, r4
    dec       r5d
    jg .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif
317
project/jni/ffmpeg/libavcodec/x86/h264_weight.asm
Normal file
@@ -0,0 +1,317 @@
;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; biweight pred:
;
; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
;                            int height, int log2_denom, int weightd,
;                            int weights, int offset);
; and
; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
;                          int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------

%macro WEIGHT_SETUP 0
    add       r5, r5
    inc       r5
    movd      m3, r4d
    movd      m5, r5d
    movd      m6, r3d
    pslld     m5, m6
    psrld     m5, 1
%if mmsize == 16
    pshuflw   m3, m3, 0
    pshuflw   m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw    m3, m3, 0
    pshufw    m5, m5, 0
%endif
    pxor      m7, m7
%endmacro

%macro WEIGHT_OP 2
    movh      m0, [r0+%1]
    movh      m1, [r0+%2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw    m0, m3
    pmullw    m1, m3
    paddsw    m0, m5
    paddsw    m1, m5
    psraw     m0, m6
    psraw     m1, m6
    packuswb  m0, m1
%endmacro
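
; WEIGHT_SETUP/WEIGHT_OP implement explicit weighted prediction, roughly
; (for log2_denom >= 1, illustrative C, not the actual C fallback):
;
;     dst[x] = av_clip_uint8(((dst[x] * weight +
;                              (1 << (log2_denom - 1))) >> log2_denom) + offset);
;
; The setup folds offset and rounding into one constant:
; m5 = ((2*offset + 1) << log2_denom) >> 1, added before the single psraw.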

INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, 4
    mova      [r0  ], m0
    WEIGHT_OP 8, 12
    mova      [r0+8], m0
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET

%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0, mmsize/2
    mova      [r0], m0
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8

%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
    WEIGHT_SETUP
    sar       r2d, 1
    lea       r3, [r1*2]
.nextrow:
    WEIGHT_OP 0, r1
    movh      [r0], m0
%if mmsize == 16
    movhps    [r0+r1], m0
%else
    psrlq     m0, 32
    movh      [r0+r1], m0
%endif
    add       r0, r3
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8

%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
    mov       off_regd, r7m
    add       off_regd, 1
    or        off_regd, 1
    add       r4, 1
    cmp       r5, 128
    jne .normal
    sar       r5, 1
    sar       r6, 1
    sar       off_regd, 1
    sub       r4, 1
.normal
%if cpuflag(ssse3)
    movd      m4, r5d
    movd      m0, r6d
%else
    movd      m3, r5d
    movd      m4, r6d
%endif
    movd      m5, off_regd
    movd      m6, r4d
    pslld     m5, m6
    psrld     m5, 1
%if cpuflag(ssse3)
    punpcklbw m4, m0
    pshuflw   m4, m4, 0
    pshuflw   m5, m5, 0
    punpcklqdq m4, m4
    punpcklqdq m5, m5

%else
%if mmsize == 16
    pshuflw   m3, m3, 0
    pshuflw   m5, m5, 0
    punpcklqdq m3, m3
    punpcklqdq m5, m5
%else
    pshufw    m3, m3, 0
    pshufw    m5, m5, 0
%endif
    pxor      m7, m7
%endif
%endmacro

%macro BIWEIGHT_STEPA 3
    movh      m%1, [r0+%3]
    movh      m%2, [r1+%3]
    punpcklbw m%1, m7
    punpcklbw m%2, m7
    pmullw    m%1, m3
    pmullw    m%2, m4
    paddsw    m%1, m%2
%endmacro

%macro BIWEIGHT_STEPB 0
    paddsw    m0, m5
    paddsw    m1, m5
    psraw     m0, m6
    psraw     m1, m6
    packuswb  m0, m1
%endmacro
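
; The biweight path computes, roughly,
;     dst[x] = av_clip_uint8((dst[x]*weightd + src[x]*weights + rnd)
;                            >> (log2_denom + 1));
; where BIWEIGHT_SETUP pre-bakes rnd = ((offset + 1) | 1) << log2_denom
; (combined offset plus the rounding bit) and bumps the shift count in r4 by
; one. The r5 == 128 special case halves weights, offset and shift so the
; signed 16-bit pmullw/paddsw intermediates stay in range.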

INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, 4
    BIWEIGHT_STEPB
    mova      [r0], m0
    BIWEIGHT_STEPA 0, 1, 8
    BIWEIGHT_STEPA 1, 2, 12
    BIWEIGHT_STEPB
    mova      [r0+8], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET

%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, mmsize/2
    BIWEIGHT_STEPB
    mova      [r0], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8

%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
    sar       r3, 1
    lea       r4, [r2*2]
.nextrow:
    BIWEIGHT_STEPA 0, 1, 0
    BIWEIGHT_STEPA 1, 2, r2
    BIWEIGHT_STEPB
    movh      [r0], m0
%if mmsize == 16
    movhps    [r0+r2], m0
%else
    psrlq     m0, 32
    movh      [r0+r2], m0
%endif
    add       r0, r4
    add       r1, r4
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8

%macro BIWEIGHT_SSSE3_OP 0
    pmaddubsw m0, m4
    pmaddubsw m2, m4
    paddsw    m0, m5
    paddsw    m2, m5
    psraw     m0, m6
    psraw     m2, m6
    packuswb  m0, m2
%endmacro
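
; The ssse3 path interleaves destination and source bytes and relies on
; pmaddubsw, which multiplies unsigned bytes by signed bytes and sums
; adjacent pairs: one instruction yields dst*weightd + src*weights per
; pixel, replacing the unpack plus two pmullw of the generic version.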

INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m

.nextrow:
    movh      m0, [r0]
    movh      m2, [r0+8]
    movh      m3, [r1+8]
    punpcklbw m0, [r1]
    punpcklbw m2, m3
    BIWEIGHT_SSSE3_OP
    mova      [r0], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET

INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
    BIWEIGHT_SETUP
    movifnidn r3d, r3m
    sar       r3, 1
    lea       r4, [r2*2]

.nextrow:
    movh      m0, [r0]
    movh      m1, [r1]
    movh      m2, [r0+r2]
    movh      m3, [r1+r2]
    punpcklbw m0, m1
    punpcklbw m2, m3
    BIWEIGHT_SSSE3_OP
    movh      [r0], m0
    movhps    [r0+r2], m0
    add       r0, r4
    add       r1, r4
    dec       r3d
    jnz .nextrow
    REP_RET
282
project/jni/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
Normal file
@@ -0,0 +1,282 @@
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pw_pixel_max: times 8 dw ((1 << 10)-1)
sq_1: dq 1
      dq 0

cextern pw_1

SECTION .text

;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
;                  int weight, int offset);
;-----------------------------------------------------------------------------
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn r0, r0mp
    movifnidn r1d, r1m
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
%endmacro

%macro WEIGHT_SETUP 0
    mova      m0, [pw_1]
    movd      m2, r3m
    pslld     m0, m2            ; 1<<log2_denom
    SPLATW    m0, m0
    shl       r5, 19            ; *8, move to upper half of dword
    lea       r5, [r5+r4*2+0x10000]
    movd      m3, r5d           ; weight<<1 | 1+(offset<<(3))
    pshufd    m3, m3, 0
    mova      m4, [pw_pixel_max]
    paddw     m2, [sq_1]        ; log2_denom+1
%if notcpuflag(sse4)
    pxor      m7, m7
%endif
%endmacro
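
; The 10-bit weight fuses the multiply and the offset/rounding add into one
; pmaddwd: each pixel is interleaved with 1<<log2_denom, and m3 pairs
; weight*2 with 8*offset+1, so every 32-bit lane computes
;     2*weight*x + ((8*offset + 1) << log2_denom)
; psrad by log2_denom+1 then gives the rounded (x*weight >> log2_denom)
; plus 4*offset, i.e. the offset scaling used for 10-bit content.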

%macro WEIGHT_OP 1-2
%if %0==1
    mova      m5, [r0+%1]
    punpckhwd m6, m5, m0
    punpcklwd m5, m0
%else
    movq      m5, [r0+%1]
    movq      m6, [r0+%2]
    punpcklwd m5, m0
    punpcklwd m6, m0
%endif
    pmaddwd   m5, m3
    pmaddwd   m6, m3
    psrad     m5, m2
    psrad     m6, m2
%if cpuflag(sse4)
    packusdw  m5, m6
    pminsw    m5, m4
%else
    packssdw  m5, m6
    CLIPW     m5, m7, m4
%endif
%endmacro

%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0
    mova      [r0   ], m5
    WEIGHT_OP 16
    mova      [r0+16], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL


%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP 0
    mova      [r0], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM


%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    sar       r2d, 1
    WEIGHT_SETUP
    lea       r3, [r1*2]
.nextrow:
    WEIGHT_OP 0, r1
    movh      [r0], m5
    movhps    [r0+r1], m5
    add       r0, r3
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM


;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
;                    int log2_denom, int weightd, int weights, int offset);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif

%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn r0, r0mp
    movifnidn r1, r1mp
    movifnidn r2d, r2m
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
%endmacro

%macro BIWEIGHT_SETUP 0
    lea       t0, [t0*4+1]      ; (offset<<2)+1
    or        t0, 1
    shl       r6, 16
    or        r5, r6
    movd      m4, r5d           ; weightd | weights
    movd      m5, t0d           ; (offset+1)|1
    movd      m6, r4m           ; log2_denom
    pslld     m5, m6            ; (((offset<<2)+1)|1)<<log2_denom
    paddd     m6, [sq_1]
    pshufd    m4, m4, 0
    pshufd    m5, m5, 0
    mova      m3, [pw_pixel_max]
    movifnidn r3d, r3m
%if notcpuflag(sse4)
    pxor      m7, m7
%endif
%endmacro

%macro BIWEIGHT 1-2
%if %0==1
    mova      m0, [r0+%1]
    mova      m1, [r1+%1]
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
%else
    movq      m0, [r0+%1]
    movq      m1, [r1+%1]
    punpcklwd m0, m1
    movq      m2, [r0+%2]
    movq      m1, [r1+%2]
    punpcklwd m2, m1
%endif
    pmaddwd   m0, m4
    pmaddwd   m2, m4
    paddd     m0, m5
    paddd     m2, m5
    psrad     m0, m6
    psrad     m2, m6
%if cpuflag(sse4)
    packusdw  m0, m2
    pminsw    m0, m3
%else
    packssdw  m0, m2
    CLIPW     m0, m7, m3
%endif
%endmacro

%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT  0
    mova      [r0   ], m0
    BIWEIGHT  16
    mova      [r0+16], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL

%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT  0
    mova      [r0], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC

%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
    sar       r3d, 1
    lea       r4, [r2*2]
.nextrow:
    BIWEIGHT  0, r2
    movh      [r0   ], m0
    movhps    [r0+r2], m0
    add       r0, r4
    add       r1, r4
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF
374
project/jni/ffmpeg/libavcodec/x86/h264dsp_init.c
Normal file
@@ -0,0 +1,374 @@
/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_mmx.h"

/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
                                                       int16_t *block, \
                                                       int stride);

IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)


#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
    (uint8_t *dst, const int *block_offset, \
     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)


#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
    (uint8_t **dst, const int *block_offset, \
     DCTELEM *block, int stride, const uint8_t nnzc[6 * 8]);

IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)

void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);

/***********************************/
/* deblocking */

void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
                                         int8_t ref[2][40],
                                         int16_t mv[2][40][2],
                                         int bidir, int edges, int step,
                                         int mask_mv0, int mask_mv1, int field);

#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                               int stride, \
                                                               int alpha, \
                                                               int beta, \
                                                               int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                               int stride, \
                                                               int alpha, \
                                                               int beta);

#define LF_FUNCS(type, depth)            \
LF_FUNC(h, chroma, depth, mmxext)        \
LF_IFUNC(h, chroma_intra, depth, mmxext) \
LF_FUNC(v, chroma, depth, mmxext)        \
LF_IFUNC(v, chroma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, mmxext)          \
LF_IFUNC(h, luma_intra, depth, mmxext)   \
LF_FUNC(h, luma, depth, sse2)            \
LF_IFUNC(h, luma_intra, depth, sse2)     \
LF_FUNC(v, luma, depth, sse2)            \
LF_IFUNC(v, luma_intra, depth, sse2)     \
LF_FUNC(h, chroma, depth, sse2)          \
LF_IFUNC(h, chroma_intra, depth, sse2)   \
LF_FUNC(v, chroma, depth, sse2)          \
LF_IFUNC(v, chroma_intra, depth, sse2)   \
LF_FUNC(h, luma, depth, avx)             \
LF_IFUNC(h, luma_intra, depth, avx)      \
LF_FUNC(v, luma, depth, avx)             \
LF_IFUNC(v, luma_intra, depth, avx)      \
LF_FUNC(h, chroma, depth, avx)           \
LF_IFUNC(h, chroma_intra, depth, avx)    \
LF_FUNC(v, chroma, depth, avx)           \
LF_IFUNC(v, chroma_intra, depth, avx)

LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
|
||||
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
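/* The external kernels declared just below filter one 8-pixel half of a
 * luma edge, so each wrapper calls its kernel twice to cover the full 16
 * pixels. A tc0 threshold of -1 means "do not filter": the bitwise AND of
 * two int8_t values is negative only when both are, so the test
 * (tc0[0] & tc0[1]) >= 0 skips a half edge only when both of its
 * thresholds disable filtering. */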
LF_FUNC(v8, luma, 8, mmxext)
static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
                                       int beta, int8_t *tc0)
{
    if ((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
    if ((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
                                             int alpha, int beta)
{
    ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */

LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)

/***********************************/
/* weighted prediction */

#define H264_WEIGHT(W, OPT)                                             \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride,         \
                                      int height, int log2_denom,       \
                                      int weight, int offset);

#define H264_BIWEIGHT(W, OPT)                                           \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src,     \
                                        int stride, int height,         \
                                        int log2_denom, int weightd,    \
                                        int weights, int offset);

#define H264_BIWEIGHT_MMX(W)                    \
    H264_WEIGHT(W, mmxext)                      \
    H264_BIWEIGHT(W, mmxext)

#define H264_BIWEIGHT_MMX_SSE(W)                \
    H264_BIWEIGHT_MMX(W)                        \
    H264_WEIGHT(W, sse2)                        \
    H264_BIWEIGHT(W, sse2)                      \
    H264_BIWEIGHT(W, ssse3)

H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)

#define H264_WEIGHT_10(W, DEPTH, OPT)                                   \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,       \
                                                    int stride,         \
                                                    int height,         \
                                                    int log2_denom,     \
                                                    int weight,         \
                                                    int offset);

#define H264_BIWEIGHT_10(W, DEPTH, OPT)                                 \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst,     \
                                                      uint8_t *src,     \
                                                      int stride,       \
                                                      int height,       \
                                                      int log2_denom,   \
                                                      int weightd,      \
                                                      int weights,      \
                                                      int offset);

#define H264_BIWEIGHT_10_SSE(W, DEPTH)          \
    H264_WEIGHT_10(W, DEPTH, sse2)              \
    H264_WEIGHT_10(W, DEPTH, sse4)              \
    H264_BIWEIGHT_10(W, DEPTH, sse2)            \
    H264_BIWEIGHT_10(W, DEPTH, sse4)

H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8, 10)
H264_BIWEIGHT_10_SSE(4, 10)
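
/* For orientation, a sketch of the scalar operation the SIMD weight
 * routines implement (H.264 explicit weighted prediction, ignoring the
 * log2_denom == 0 corner case; illustration only):
 *
 *     dst[x] = av_clip_uint8(((dst[x] * weight + (1 << (log2_denom - 1)))
 *                             >> log2_denom) + offset);
 *
 * The biweight variants combine two sources with weightd/weights before
 * rounding and adding the offset. */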

void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                         const int chroma_format_idc)
{
#if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

    if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(mm_flags))
        c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;

    if (bit_depth == 8) {
        if (EXTERNAL_MMX(mm_flags)) {
            c->h264_idct_dc_add  =
            c->h264_idct_add     = ff_h264_idct_add_8_mmx;
            c->h264_idct8_dc_add =
            c->h264_idct8_add    = ff_h264_idct8_add_8_mmx;

            c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
            c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
            if (chroma_format_idc == 1)
                c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
            c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
            if (mm_flags & AV_CPU_FLAG_CMOV)
                c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;

            if (EXTERNAL_MMXEXT(mm_flags)) {
                c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
                c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
                c->h264_idct_add16   = ff_h264_idct_add16_8_mmxext;
                c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmxext;
                if (chroma_format_idc == 1)
                    c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
                c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;

                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
                if (chroma_format_idc == 1) {
                    c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
                    c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
                }
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
                c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_mmxext;
                c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
                c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
                c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
                c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;

                c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
                c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
                c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;

                if (EXTERNAL_SSE2(mm_flags)) {
                    c->h264_idct8_add = ff_h264_idct8_add_8_sse2;

                    c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
                    c->h264_idct_add16intra      = ff_h264_idct_add16intra_8_sse2;
                    c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;

                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;

                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;

                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_sse2;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
                }
                if (EXTERNAL_SSSE3(mm_flags)) {
                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
                }
                if (EXTERNAL_AVX(mm_flags)) {
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_8_avx;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_avx;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
                }
            }
        }
    } else if (bit_depth == 10) {
        if (EXTERNAL_MMX(mm_flags)) {
            if (EXTERNAL_MMXEXT(mm_flags)) {
#if ARCH_X86_32
                c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
                c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
                c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
                c->h264_h_loop_filter_luma         = ff_deblock_h_luma_10_mmxext;
                c->h264_v_loop_filter_luma_intra   = ff_deblock_v_luma_intra_10_mmxext;
                c->h264_h_loop_filter_luma_intra   = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
                c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
                if (EXTERNAL_SSE2(mm_flags)) {
                    c->h264_idct_add     = ff_h264_idct_add_10_sse2;
                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;

                    c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
                    c->h264_idct8_add  = ff_h264_idct8_add_10_sse2;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */

                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;

                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;

                    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_sse2;
                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_sse2;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
                }
                if (EXTERNAL_SSE4(mm_flags)) {
                    c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
                    c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
                    c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;

                    c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
                    c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
                    c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
                }
                if (EXTERNAL_AVX(mm_flags)) {
                    c->h264_idct_dc_add  =
                    c->h264_idct_add     = ff_h264_idct_add_10_avx;
                    c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;

                    c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
                    if (chroma_format_idc == 1)
                        c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
                    c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
                    c->h264_idct8_add  = ff_h264_idct8_add_10_avx;
                    c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */

                    c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_avx;
                    c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma       = ff_deblock_v_luma_10_avx;
                    c->h264_h_loop_filter_luma       = ff_deblock_h_luma_10_avx;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
                }
            }
        }
    }
#endif
}
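
/* Note on ordering (a reading aid): the generic ff_h264dsp_init() is
 * expected to install C versions of every function pointer before calling
 * this initializer, so the blocks above only ever overwrite pointers with
 * progressively better variants (MMX -> MMXEXT -> SSE2 -> SSSE3 -> SSE4 ->
 * AVX); the last assignment that runs on the host CPU wins. */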
632
project/jni/ffmpeg/libavcodec/x86/idct_mmx.c
Normal file
@@ -0,0 +1,632 @@
/*
 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with mpeg2dec; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavcodec/dsputil.h"

#include "libavutil/x86/asm.h"
#include "dsputil_mmx.h"

#if HAVE_INLINE_ASM

#define ROW_SHIFT 11
#define COL_SHIFT 6

#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
#define rounder(bias) {round (bias), round (bias)}
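
/* round()/rounder() fold the +0.5 rounding term into a bias scaled by
 * 2^ROW_SHIFT and duplicate it into both 32-bit halves of an MMX register.
 * For example (illustration only), round(0.5) = (int)(1.0 * 2048) = 2048,
 * so rounder(0.5) expands to { 2048, 2048 }. */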

#if 0
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int C1, C2, C3, C4, C5, C6, C7;
    int a0, a1, a2, a3, b0, b1, b2, b3;

    row += offset;

    C1 = table[1];
    C2 = table[2];
    C3 = table[3];
    C4 = table[4];
    C5 = table[5];
    C6 = table[6];
    C7 = table[7];

    a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + *rounder;
    a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + *rounder;
    a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + *rounder;
    a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + *rounder;

    b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
    b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
    b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
    b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];

    row[0] = (a0 + b0) >> ROW_SHIFT;
    row[1] = (a1 + b1) >> ROW_SHIFT;
    row[2] = (a2 + b2) >> ROW_SHIFT;
    row[3] = (a3 + b3) >> ROW_SHIFT;
    row[4] = (a3 - b3) >> ROW_SHIFT;
    row[5] = (a2 - b2) >> ROW_SHIFT;
    row[6] = (a1 - b1) >> ROW_SHIFT;
    row[7] = (a0 - b0) >> ROW_SHIFT;
}
#endif


/* MMXEXT row IDCT */

#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2, -c4, -c2,   \
                                              c4,  c6,  c4,  c6,   \
                                              c1,  c3, -c1, -c5,   \
                                              c5,  c7,  c3, -c7,   \
                                              c4, -c6,  c4, -c6,   \
                                             -c4,  c2,  c4, -c2,   \
                                              c5, -c1,  c3, -c1,   \
                                              c7,  c3,  c7, -c5 }
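
/* The coefficients above are stored pre-interleaved in the order consumed
 * by the pmaddwd pairings in mmxext_row_head()/mmxext_row() below (even and
 * odd input words shuffled together), not in natural c1..c7 order. */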

static inline void mmxext_row_head (int16_t * const row, const int offset,
                                    const int16_t * const table)
{
    __asm__ volatile(
        "movq (%0), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */

        "movq 8(%0), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */
        "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */

        "movq (%1), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */
        "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */

        "movq 8(%1), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */
        "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */

        "pshufw $0x4e, %%mm2, %%mm2 \n\t" /* mm2 = x2 x0 x6 x4 */
        :: "r" ((row+offset)), "r" (table)
    );
}

static inline void mmxext_row (const int16_t * const table,
                               const int32_t * const rounder)
{
    __asm__ volatile (
        "movq 16(%0), %%mm1 \n\t" /* mm1 = -C5 -C1 C3 C1 */
        "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */

        "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
        "pshufw $0x4e, %%mm6, %%mm6 \n\t" /* mm6 = x3 x1 x7 x5 */

        "movq 24(%0), %%mm7 \n\t" /* mm7 = -C7 C3 C7 C5 */
        "pmaddwd %%mm5, %%mm1 \n\t" /* mm1= -C1*x5-C5*x7 C1*x1+C3*x3 */

        "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */
        "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */

        "pmaddwd 40(%0), %%mm2 \n\t" /* mm2= C4*x0-C2*x2 -C4*x4+C2*x6 */
        "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */

        "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
        "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */

        "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
        "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */

        "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */
        "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */

        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */
        "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */

        "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */
        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */

        "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */
        "movq %%mm0, %%mm4 \n\t" /* mm4 = a3 a2 + rounder */

        "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */
        "psubd %%mm5, %%mm4 \n\t" /* mm4 = a3-b3 a2-b2 + rounder */
        : : "r" (table), "r" (rounder));
}

static inline void mmxext_row_tail (int16_t * const row, const int store)
{
    __asm__ volatile (
        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */

        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */

        "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */

        "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */

        "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */
        "pshufw $0xb1, %%mm4, %%mm4 \n\t" /* mm4 = y7 y6 y5 y4 */

        /* slot */

        "movq %%mm4, 8(%0) \n\t" /* save y7 y6 y5 y4 */
        :: "r" (row+store)
    );
}

static inline void mmxext_row_mid (int16_t * const row, const int store,
                                   const int offset,
                                   const int16_t * const table)
{
    __asm__ volatile (
        "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */
        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */

        "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */
        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm4 \n\t" /* mm4 = y4 y5 */

        "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */
        "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */

        "packssdw %%mm3, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */
        "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */

        "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */
        "pshufw $0xb1, %%mm4, %%mm4\n\t" /* mm4 = y7 y6 y5 y4 */

        "movq (%3), %%mm3 \n\t" /* mm3 = -C2 -C4 C2 C4 */
        "movq %%mm4, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */

        "pmaddwd %%mm0, %%mm3 \n\t" /* mm3= -C4*x4-C2*x6 C4*x0+C2*x2 */

        "movq 8(%3), %%mm4 \n\t" /* mm4 = C6 C4 C6 C4 */
        "pshufw $0x4e, %%mm2, %%mm2\n\t" /* mm2 = x2 x0 x6 x4 */
        :: "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table)
    );
}


/* MMX row IDCT */

#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
                                           c4,  c6, -c4, -c2,   \
                                           c1,  c3,  c3, -c7,   \
                                           c5,  c7, -c1, -c5,   \
                                           c4, -c6,  c4, -c2,   \
                                          -c4,  c2,  c4, -c6,   \
                                           c5, -c1,  c7, -c5,   \
                                           c7,  c3,  c3, -c1 }

static inline void mmx_row_head (int16_t * const row, const int offset,
                                 const int16_t * const table)
{
    __asm__ volatile (
        "movq (%0), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */

        "movq 8(%0), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */
        "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */

        "movq (%1), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */
        "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */

        "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */

        "movq 8(%1), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */
        "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */

        "movq 16(%1), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */
        "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */
        :: "r" ((row+offset)), "r" (table)
    );
}

static inline void mmx_row (const int16_t * const table,
                            const int32_t * const rounder)
{
    __asm__ volatile (
        "pmaddwd %%mm2, %%mm4 \n\t" /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
        "punpckldq %%mm5, %%mm5 \n\t" /* mm5 = x3 x1 x3 x1 */

        "pmaddwd 32(%0), %%mm0 \n\t" /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
        "punpckhdq %%mm6, %%mm6 \n\t" /* mm6 = x7 x5 x7 x5 */

        "movq 24(%0), %%mm7 \n\t" /* mm7 = -C5 -C1 C7 C5 */
        "pmaddwd %%mm5, %%mm1 \n\t" /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */

        "paddd (%1), %%mm3 \n\t" /* mm3 += rounder */
        "pmaddwd %%mm6, %%mm7 \n\t" /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */

        "pmaddwd 40(%0), %%mm2 \n\t" /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
        "paddd %%mm4, %%mm3 \n\t" /* mm3 = a1 a0 + rounder */

        "pmaddwd 48(%0), %%mm5 \n\t" /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
        "movq %%mm3, %%mm4 \n\t" /* mm4 = a1 a0 + rounder */

        "pmaddwd 56(%0), %%mm6 \n\t" /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
        "paddd %%mm7, %%mm1 \n\t" /* mm1 = b1 b0 */

        "paddd (%1), %%mm0 \n\t" /* mm0 += rounder */
        "psubd %%mm1, %%mm3 \n\t" /* mm3 = a1-b1 a0-b0 + rounder */

        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm3 \n\t" /* mm3 = y6 y7 */
        "paddd %%mm4, %%mm1 \n\t" /* mm1 = a1+b1 a0+b0 + rounder */

        "paddd %%mm2, %%mm0 \n\t" /* mm0 = a3 a2 + rounder */
        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm1 \n\t" /* mm1 = y1 y0 */

        "paddd %%mm6, %%mm5 \n\t" /* mm5 = b3 b2 */
        "movq %%mm0, %%mm7 \n\t" /* mm7 = a3 a2 + rounder */

        "paddd %%mm5, %%mm0 \n\t" /* mm0 = a3+b3 a2+b2 + rounder */
        "psubd %%mm5, %%mm7 \n\t" /* mm7 = a3-b3 a2-b2 + rounder */
        :: "r" (table), "r" (rounder)
    );
}

static inline void mmx_row_tail (int16_t * const row, const int store)
{
    __asm__ volatile (
        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */

        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */

        "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */

        "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */

        "movq %%mm1, (%0) \n\t" /* save y3 y2 y1 y0 */
        "movq %%mm7, %%mm4 \n\t" /* mm4 = y6 y7 y4 y5 */

        "pslld $16, %%mm7 \n\t" /* mm7 = y7 0 y5 0 */

        "psrld $16, %%mm4 \n\t" /* mm4 = 0 y6 0 y4 */

        "por %%mm4, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */

        /* slot */

        "movq %%mm7, 8(%0) \n\t" /* save y7 y6 y5 y4 */
        :: "r" (row+store)
    );
}

static inline void mmx_row_mid (int16_t * const row, const int store,
                                const int offset, const int16_t * const table)
{

    __asm__ volatile (
        "movq (%0,%1), %%mm2 \n\t" /* mm2 = x6 x4 x2 x0 */
        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm0 \n\t" /* mm0 = y3 y2 */

        "movq 8(%0,%1), %%mm5 \n\t" /* mm5 = x7 x5 x3 x1 */
        "psrad $" AV_STRINGIFY(ROW_SHIFT) ", %%mm7 \n\t" /* mm7 = y4 y5 */

        "packssdw %%mm0, %%mm1 \n\t" /* mm1 = y3 y2 y1 y0 */
        "movq %%mm5, %%mm6 \n\t" /* mm6 = x7 x5 x3 x1 */

        "packssdw %%mm3, %%mm7 \n\t" /* mm7 = y6 y7 y4 y5 */
        "movq %%mm2, %%mm0 \n\t" /* mm0 = x6 x4 x2 x0 */

        "movq %%mm1, (%0,%2) \n\t" /* save y3 y2 y1 y0 */
        "movq %%mm7, %%mm1 \n\t" /* mm1 = y6 y7 y4 y5 */

        "punpckldq %%mm0, %%mm0 \n\t" /* mm0 = x2 x0 x2 x0 */
        "psrld $16, %%mm7 \n\t" /* mm7 = 0 y6 0 y4 */

        "movq (%3), %%mm3 \n\t" /* mm3 = C6 C4 C2 C4 */
        "pslld $16, %%mm1 \n\t" /* mm1 = y7 0 y5 0 */

        "movq 8(%3), %%mm4 \n\t" /* mm4 = -C2 -C4 C6 C4 */
        "por %%mm1, %%mm7 \n\t" /* mm7 = y7 y6 y5 y4 */

        "movq 16(%3), %%mm1 \n\t" /* mm1 = -C7 C3 C3 C1 */
        "punpckhdq %%mm2, %%mm2 \n\t" /* mm2 = x6 x4 x6 x4 */

        "movq %%mm7, 8(%0,%2) \n\t" /* save y7 y6 y5 y4 */
        "pmaddwd %%mm0, %%mm3 \n\t" /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
        : : "r" (row), "r" ((x86_reg) (2*offset)), "r" ((x86_reg) (2*store)), "r" (table)
    );
}


#if 0
/* C column IDCT - it is just here to document the MMXEXT and MMX versions */
static inline void idct_col (int16_t * col, int offset)
{
/* multiplication - as implemented on mmx */
#define F(c,x) (((c) * (x)) >> 16)

/* saturation - it helps us handle torture test cases */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

    int16_t x0, x1, x2, x3, x4, x5, x6, x7;
    int16_t y0, y1, y2, y3, y4, y5, y6, y7;
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;

    col += offset;

    x0 = col[0*8];
    x1 = col[1*8];
    x2 = col[2*8];
    x3 = col[3*8];
    x4 = col[4*8];
    x5 = col[5*8];
    x6 = col[6*8];
    x7 = col[7*8];

    u04 = S (x0 + x4);
    v04 = S (x0 - x4);
    u26 = S (F (T2, x6) + x2);
    v26 = S (F (T2, x2) - x6);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    u17 = S (F (T1, x7) + x1);
    v17 = S (F (T1, x1) - x7);
    u35 = S (F (T3, x5) + x3);
    v35 = S (F (T3, x3) - x5);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    y0 = S (a0 + b0) >> COL_SHIFT;
    y1 = S (a1 + b1) >> COL_SHIFT;
    y2 = S (a2 + b2) >> COL_SHIFT;
    y3 = S (a3 + b3) >> COL_SHIFT;

    y4 = S (a3 - b3) >> COL_SHIFT;
    y5 = S (a2 - b2) >> COL_SHIFT;
    y6 = S (a1 - b1) >> COL_SHIFT;
    y7 = S (a0 - b0) >> COL_SHIFT;

    col[0*8] = y0;
    col[1*8] = y1;
    col[2*8] = y2;
    col[3*8] = y3;
    col[4*8] = y4;
    col[5*8] = y5;
    col[6*8] = y6;
    col[7*8] = y7;
}
#endif


/* MMX column IDCT */
static inline void idct_col (int16_t * const col, const int offset)
{
#define T1 13036
#define T2 27146
#define T3 43790
#define C4 23170
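/* These look like the usual fixed-point column constants:
 * T1 = tan(1*pi/16), T2 = tan(2*pi/16) and T3 = tan(3*pi/16) scaled by 2^16
 * (e.g. 0.414214 * 65536 ~= 27146), and C4 = cos(pi/4) scaled by 2^15. */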

    DECLARE_ALIGNED(8, static const short, t1_vector)[] = {
        T1,T1,T1,T1,
        T2,T2,T2,T2,
        T3,T3,T3,T3,
        C4,C4,C4,C4
    };

    /* column code adapted from Peter Gubanov */
    /* http://www.elecard.com/peter/idct.shtml */

    __asm__ volatile (
        "movq (%0), %%mm0 \n\t" /* mm0 = T1 */

        "movq 2*8(%1), %%mm1 \n\t" /* mm1 = x1 */
        "movq %%mm0, %%mm2 \n\t" /* mm2 = T1 */

        "movq 7*2*8(%1), %%mm4 \n\t" /* mm4 = x7 */
        "pmulhw %%mm1, %%mm0 \n\t" /* mm0 = T1*x1 */

        "movq 16(%0), %%mm5 \n\t" /* mm5 = T3 */
        "pmulhw %%mm4, %%mm2 \n\t" /* mm2 = T1*x7 */

        "movq 2*5*8(%1), %%mm6 \n\t" /* mm6 = x5 */
        "movq %%mm5, %%mm7 \n\t" /* mm7 = T3-1 */

        "movq 3*8*2(%1), %%mm3 \n\t" /* mm3 = x3 */
        "psubsw %%mm4, %%mm0 \n\t" /* mm0 = v17 */

        "movq 8(%0), %%mm4 \n\t" /* mm4 = T2 */
        "pmulhw %%mm3, %%mm5 \n\t" /* mm5 = (T3-1)*x3 */

        "paddsw %%mm2, %%mm1 \n\t" /* mm1 = u17 */
        "pmulhw %%mm6, %%mm7 \n\t" /* mm7 = (T3-1)*x5 */

        /* slot */

        "movq %%mm4, %%mm2 \n\t" /* mm2 = T2 */
        "paddsw %%mm3, %%mm5 \n\t" /* mm5 = T3*x3 */

        "pmulhw 2*8*2(%1), %%mm4 \n\t" /* mm4 = T2*x2 */
        "paddsw %%mm6, %%mm7 \n\t" /* mm7 = T3*x5 */

        "psubsw %%mm6, %%mm5 \n\t" /* mm5 = v35 */
        "paddsw %%mm3, %%mm7 \n\t" /* mm7 = u35 */

        "movq 6*8*2(%1), %%mm3 \n\t" /* mm3 = x6 */
        "movq %%mm0, %%mm6 \n\t" /* mm6 = v17 */

        "pmulhw %%mm3, %%mm2 \n\t" /* mm2 = T2*x6 */
        "psubsw %%mm5, %%mm0 \n\t" /* mm0 = b3 */

        "psubsw %%mm3, %%mm4 \n\t" /* mm4 = v26 */
        "paddsw %%mm6, %%mm5 \n\t" /* mm5 = v12 */

        "movq %%mm0, 3*8*2(%1)\n\t" /* save b3 in scratch0 */
        "movq %%mm1, %%mm6 \n\t" /* mm6 = u17 */

        "paddsw 2*8*2(%1), %%mm2 \n\t" /* mm2 = u26 */
        "paddsw %%mm7, %%mm6 \n\t" /* mm6 = b0 */

        "psubsw %%mm7, %%mm1 \n\t" /* mm1 = u12 */
        "movq %%mm1, %%mm7 \n\t" /* mm7 = u12 */

        "movq 0*8(%1), %%mm3 \n\t" /* mm3 = x0 */
        "paddsw %%mm5, %%mm1 \n\t" /* mm1 = u12+v12 */

        "movq 24(%0), %%mm0 \n\t" /* mm0 = C4/2 */
        "psubsw %%mm5, %%mm7 \n\t" /* mm7 = u12-v12 */

        "movq %%mm6, 5*8*2(%1)\n\t" /* save b0 in scratch1 */
        "pmulhw %%mm0, %%mm1 \n\t" /* mm1 = b1/2 */

        "movq %%mm4, %%mm6 \n\t" /* mm6 = v26 */
        "pmulhw %%mm0, %%mm7 \n\t" /* mm7 = b2/2 */

        "movq 4*8*2(%1), %%mm5 \n\t" /* mm5 = x4 */
        "movq %%mm3, %%mm0 \n\t" /* mm0 = x0 */

        "psubsw %%mm5, %%mm3 \n\t" /* mm3 = v04 */
        "paddsw %%mm5, %%mm0 \n\t" /* mm0 = u04 */

        "paddsw %%mm3, %%mm4 \n\t" /* mm4 = a1 */
        "movq %%mm0, %%mm5 \n\t" /* mm5 = u04 */

        "psubsw %%mm6, %%mm3 \n\t" /* mm3 = a2 */
        "paddsw %%mm2, %%mm5 \n\t" /* mm5 = a0 */

        "paddsw %%mm1, %%mm1 \n\t" /* mm1 = b1 */
        "psubsw %%mm2, %%mm0 \n\t" /* mm0 = a3 */

        "paddsw %%mm7, %%mm7 \n\t" /* mm7 = b2 */
        "movq %%mm3, %%mm2 \n\t" /* mm2 = a2 */

        "movq %%mm4, %%mm6 \n\t" /* mm6 = a1 */
        "paddsw %%mm7, %%mm3 \n\t" /* mm3 = a2+b2 */

        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y2 */
        "paddsw %%mm1, %%mm4\n\t" /* mm4 = a1+b1 */

        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y1 */
        "psubsw %%mm1, %%mm6 \n\t" /* mm6 = a1-b1 */

        "movq 5*8*2(%1), %%mm1 \n\t" /* mm1 = b0 */
        "psubsw %%mm7, %%mm2 \n\t" /* mm2 = a2-b2 */

        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm6\n\t" /* mm6 = y6 */
        "movq %%mm5, %%mm7 \n\t" /* mm7 = a0 */

        "movq %%mm4, 1*8*2(%1)\n\t" /* save y1 */
        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm2\n\t" /* mm2 = y5 */

        "movq %%mm3, 2*8*2(%1)\n\t" /* save y2 */
        "paddsw %%mm1, %%mm5 \n\t" /* mm5 = a0+b0 */

        "movq 3*8*2(%1), %%mm4 \n\t" /* mm4 = b3 */
        "psubsw %%mm1, %%mm7 \n\t" /* mm7 = a0-b0 */

        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm5\n\t" /* mm5 = y0 */
        "movq %%mm0, %%mm3 \n\t" /* mm3 = a3 */

        "movq %%mm2, 5*8*2(%1)\n\t" /* save y5 */
        "psubsw %%mm4, %%mm3 \n\t" /* mm3 = a3-b3 */

        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm7\n\t" /* mm7 = y7 */
        "paddsw %%mm0, %%mm4 \n\t" /* mm4 = a3+b3 */

        "movq %%mm5, 0*8*2(%1)\n\t" /* save y0 */
        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm3\n\t" /* mm3 = y4 */

        "movq %%mm6, 6*8*2(%1)\n\t" /* save y6 */
        "psraw $" AV_STRINGIFY(COL_SHIFT) ", %%mm4\n\t" /* mm4 = y3 */

        "movq %%mm7, 7*8*2(%1)\n\t" /* save y7 */

        "movq %%mm3, 4*8*2(%1)\n\t" /* save y4 */

        "movq %%mm4, 3*8*2(%1)\n\t" /* save y3 */
        :: "r" (t1_vector), "r" (col+offset)
    );

#undef T1
#undef T2
#undef T3
#undef C4
}


DECLARE_ALIGNED(8, static const int32_t, rounder0)[] =
    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
DECLARE_ALIGNED(8, static const int32_t, rounder4)[] = rounder (0);
DECLARE_ALIGNED(8, static const int32_t, rounder1)[] =
    rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder7)[] =
    rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder2)[] =
    rounder (0.60355339059); /* C2 * (C6+C2)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder6)[] =
    rounder (-0.25); /* C2 * (C6-C2)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder3)[] =
    rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */
DECLARE_ALIGNED(8, static const int32_t, rounder5)[] =
    rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */

#undef COL_SHIFT
#undef ROW_SHIFT

#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
void idct (int16_t * const block)                                       \
{                                                                       \
    DECLARE_ALIGNED(16, static const int16_t, table04)[] =              \
        table (22725, 21407, 19266, 16384, 12873, 8867, 4520);          \
    DECLARE_ALIGNED(16, static const int16_t, table17)[] =              \
        table (31521, 29692, 26722, 22725, 17855, 12299, 6270);         \
    DECLARE_ALIGNED(16, static const int16_t, table26)[] =              \
        table (29692, 27969, 25172, 21407, 16819, 11585, 5906);         \
    DECLARE_ALIGNED(16, static const int16_t, table35)[] =              \
        table (26722, 25172, 22654, 19266, 15137, 10426, 5315);         \
                                                                        \
    idct_row_head (block, 0*8, table04);                                \
    idct_row (table04, rounder0);                                       \
    idct_row_mid (block, 0*8, 4*8, table04);                            \
    idct_row (table04, rounder4);                                       \
    idct_row_mid (block, 4*8, 1*8, table17);                            \
    idct_row (table17, rounder1);                                       \
    idct_row_mid (block, 1*8, 7*8, table17);                            \
    idct_row (table17, rounder7);                                       \
    idct_row_mid (block, 7*8, 2*8, table26);                            \
    idct_row (table26, rounder2);                                       \
    idct_row_mid (block, 2*8, 6*8, table26);                            \
    idct_row (table26, rounder6);                                       \
    idct_row_mid (block, 6*8, 3*8, table35);                            \
    idct_row (table35, rounder3);                                       \
    idct_row_mid (block, 3*8, 5*8, table35);                            \
    idct_row (table35, rounder5);                                       \
    idct_row_tail (block, 5*8);                                         \
                                                                        \
    idct_col (block, 0);                                                \
    idct_col (block, 4);                                                \
}

declare_idct (ff_mmxext_idct, mmxext_table,
              mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)

declare_idct (ff_mmx_idct, mmx_table,
              mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
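
/* Note how declare_idct() visits the rows in the order 0,4,1,7,2,6,3,5:
 * rows are handled in pairs that share a coefficient table (0/4, 1/7, 2/6
 * and 3/5), so each table is live for two consecutive rows, while every
 * row still gets its own rounderN bias. */
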
#endif /* HAVE_INLINE_ASM */
558
project/jni/ffmpeg/libavcodec/x86/idct_mmx_xvid.c
Normal file
@@ -0,0 +1,558 @@
/*
 * XVID MPEG-4 VIDEO CODEC
 * - MMX and XMM forward discrete cosine transform -
 *
 * Copyright(C) 2001 Peter Ross <pross@xvid.org>
 *
 * Originally provided by Intel at AP-922
 * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
 * (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
 * but in a limited edition.
 * New macro implements a column part for precise iDCT
 * The routine precision now satisfies IEEE standard 1180-1990.
 *
 * Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
 * Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
 *
 * http://www.elecard.com/peter/idct.html
 * http://www.linuxvideo.org/mpeg2dec/
 *
 * These examples contain code fragments for first stage iDCT 8x8
 * (for rows) and first stage DCT 8x8 (for columns)
 *
 * conversion to gcc syntax by Michael Niedermayer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with FFmpeg; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <inttypes.h>

#include "config.h"
#include "libavcodec/avcodec.h"
#include "libavutil/mem.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"

#if HAVE_INLINE_ASM

//=============================================================================
// Macros and other preprocessor constants
//=============================================================================

#define BITS_INV_ACC    5                        // 4 or 5 for IEEE
#define SHIFT_INV_ROW   (16 - BITS_INV_ACC)      //11
#define SHIFT_INV_COL   (1 + BITS_INV_ACC)       //6
#define RND_INV_ROW     (1024 * (6 - BITS_INV_ACC))
#define RND_INV_COL     (16 * (BITS_INV_ACC - 3))
#define RND_INV_CORR    (RND_INV_COL - 1)

#define BITS_FRW_ACC    3                        // 2 or 3 for accuracy
#define SHIFT_FRW_COL   BITS_FRW_ACC
#define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17)
#define RND_FRW_ROW     (262144*(BITS_FRW_ACC - 1))


//-----------------------------------------------------------------------------
// Various memory constants (trigonometric values or rounding values)
//-----------------------------------------------------------------------------


DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
  13036,13036,13036,13036,        // tg * (2<<16) + 0.5
  27146,27146,27146,27146,        // tg * (2<<16) + 0.5
 -21746,-21746,-21746,-21746,     // tg * (2<<16) + 0.5
  23170,23170,23170,23170};       // cos * (2<<15) + 0.5

DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
  65536,65536,
   3597, 3597,
   2260, 2260,
   1203, 1203,
      0,    0,
    120,  120,
    512,  512,
    512,  512};
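
/* rounder_0 holds one pair of 32-bit biases per row index, added in before
 * the psrad by SHIFT_INV_ROW (= 11) in the row macros below; as with the
 * rounder0..rounder7 constants in idct_mmx.c above, each row gets its own
 * rounding bias. */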

//-----------------------------------------------------------------------------
//
// The first stage iDCT 8x8 - inverse DCTs of rows
//
//-----------------------------------------------------------------------------
// The 8-point inverse DCT direct algorithm
//-----------------------------------------------------------------------------
//
// static const short w[32] = {
//     FIX(cos_4_16),  FIX(cos_2_16),  FIX(cos_4_16),  FIX(cos_6_16),
//     FIX(cos_4_16),  FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
//     FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16),  FIX(cos_2_16),
//     FIX(cos_4_16), -FIX(cos_2_16),  FIX(cos_4_16), -FIX(cos_6_16),
//     FIX(cos_1_16),  FIX(cos_3_16),  FIX(cos_5_16),  FIX(cos_7_16),
//     FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
//     FIX(cos_5_16), -FIX(cos_1_16),  FIX(cos_7_16),  FIX(cos_3_16),
//     FIX(cos_7_16), -FIX(cos_5_16),  FIX(cos_3_16), -FIX(cos_1_16) };
//
// #define DCT_8_INV_ROW(x, y)
// {
//     int a0, a1, a2, a3, b0, b1, b2, b3;
//
//     a0 = x[0] * w[ 0] + x[2] * w[ 1] + x[4] * w[ 2] + x[6] * w[ 3];
//     a1 = x[0] * w[ 4] + x[2] * w[ 5] + x[4] * w[ 6] + x[6] * w[ 7];
//     a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
//     a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
//     b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
//     b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
//     b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
//     b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
//
//     y[0] = SHIFT_ROUND ( a0 + b0 );
//     y[1] = SHIFT_ROUND ( a1 + b1 );
//     y[2] = SHIFT_ROUND ( a2 + b2 );
//     y[3] = SHIFT_ROUND ( a3 + b3 );
//     y[4] = SHIFT_ROUND ( a3 - b3 );
//     y[5] = SHIFT_ROUND ( a2 - b2 );
//     y[6] = SHIFT_ROUND ( a1 - b1 );
//     y[7] = SHIFT_ROUND ( a0 - b0 );
// }
//
//-----------------------------------------------------------------------------
//
// In this implementation the outputs of the iDCT-1D are multiplied
//     for rows 0,4 - by cos_4_16,
//     for rows 1,7 - by cos_1_16,
//     for rows 2,6 - by cos_2_16,
//     for rows 3,5 - by cos_3_16
// and are shifted to the left for better accuracy
//
// For the constants used,
//     FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
//
//-----------------------------------------------------------------------------

//-----------------------------------------------------------------------------
// Tables for mmx processors
//-----------------------------------------------------------------------------

// Table for rows 0,4 - constants are multiplied by cos_4_16
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = {
  16384,16384,16384,-16384,    // movq-> w06 w04 w02 w00
  21407,8867,8867,-21407,      // w07 w05 w03 w01
  16384,-16384,16384,16384,    // w14 w12 w10 w08
  -8867,21407,-21407,-8867,    // w15 w13 w11 w09
  22725,12873,19266,-22725,    // w22 w20 w18 w16
  19266,4520,-4520,-12873,     // w23 w21 w19 w17
  12873,4520,4520,19266,       // w30 w28 w26 w24
  -22725,19266,-12873,-22725,  // w31 w29 w27 w25
// Table for rows 1,7 - constants are multiplied by cos_1_16
  22725,22725,22725,-22725,    // movq-> w06 w04 w02 w00
  29692,12299,12299,-29692,    // w07 w05 w03 w01
  22725,-22725,22725,22725,    // w14 w12 w10 w08
  -12299,29692,-29692,-12299,  // w15 w13 w11 w09
  31521,17855,26722,-31521,    // w22 w20 w18 w16
  26722,6270,-6270,-17855,     // w23 w21 w19 w17
  17855,6270,6270,26722,       // w30 w28 w26 w24
  -31521,26722,-17855,-31521,  // w31 w29 w27 w25
// Table for rows 2,6 - constants are multiplied by cos_2_16
  21407,21407,21407,-21407,    // movq-> w06 w04 w02 w00
  27969,11585,11585,-27969,    // w07 w05 w03 w01
  21407,-21407,21407,21407,    // w14 w12 w10 w08
  -11585,27969,-27969,-11585,  // w15 w13 w11 w09
  29692,16819,25172,-29692,    // w22 w20 w18 w16
  25172,5906,-5906,-16819,     // w23 w21 w19 w17
  16819,5906,5906,25172,       // w30 w28 w26 w24
  -29692,25172,-16819,-29692,  // w31 w29 w27 w25
// Table for rows 3,5 - constants are multiplied by cos_3_16
  19266,19266,19266,-19266,    // movq-> w06 w04 w02 w00
  25172,10426,10426,-25172,    // w07 w05 w03 w01
  19266,-19266,19266,19266,    // w14 w12 w10 w08
  -10426,25172,-25172,-10426,  // w15 w13 w11 w09
  26722,15137,22654,-26722,    // w22 w20 w18 w16
  22654,5315,-5315,-15137,     // w23 w21 w19 w17
  15137,5315,5315,22654,       // w30 w28 w26 w24
  -26722,22654,-15137,-26722,  // w31 w29 w27 w25
};
//-----------------------------------------------------------------------------
// Tables for xmm processors
//-----------------------------------------------------------------------------

// %3 for rows 0,4 - constants are multiplied by cos_4_16
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = {
  16384,21407,16384,8867,      // movq-> w05 w04 w01 w00
  16384,8867,-16384,-21407,    // w07 w06 w03 w02
  16384,-8867,16384,-21407,    // w13 w12 w09 w08
  -16384,21407,16384,-8867,    // w15 w14 w11 w10
  22725,19266,19266,-4520,     // w21 w20 w17 w16
  12873,4520,-22725,-12873,    // w23 w22 w19 w18
  12873,-22725,4520,-12873,    // w29 w28 w25 w24
  4520,19266,19266,-22725,     // w31 w30 w27 w26
// %3 for rows 1,7 - constants are multiplied by cos_1_16
  22725,29692,22725,12299,     // movq-> w05 w04 w01 w00
  22725,12299,-22725,-29692,   // w07 w06 w03 w02
  22725,-12299,22725,-29692,   // w13 w12 w09 w08
  -22725,29692,22725,-12299,   // w15 w14 w11 w10
  31521,26722,26722,-6270,     // w21 w20 w17 w16
  17855,6270,-31521,-17855,    // w23 w22 w19 w18
  17855,-31521,6270,-17855,    // w29 w28 w25 w24
  6270,26722,26722,-31521,     // w31 w30 w27 w26
// %3 for rows 2,6 - constants are multiplied by cos_2_16
  21407,27969,21407,11585,     // movq-> w05 w04 w01 w00
  21407,11585,-21407,-27969,   // w07 w06 w03 w02
  21407,-11585,21407,-27969,   // w13 w12 w09 w08
  -21407,27969,21407,-11585,   // w15 w14 w11 w10
  29692,25172,25172,-5906,     // w21 w20 w17 w16
  16819,5906,-29692,-16819,    // w23 w22 w19 w18
  16819,-29692,5906,-16819,    // w29 w28 w25 w24
  5906,25172,25172,-29692,     // w31 w30 w27 w26
// %3 for rows 3,5 - constants are multiplied by cos_3_16
  19266,25172,19266,10426,     // movq-> w05 w04 w01 w00
  19266,10426,-19266,-25172,   // w07 w06 w03 w02
  19266,-10426,19266,-25172,   // w13 w12 w09 w08
  -19266,25172,19266,-10426,   // w15 w14 w11 w10
  26722,22654,22654,-5315,     // w21 w20 w17 w16
  15137,5315,-26722,-15137,    // w23 w22 w19 w18
  15137,-26722,5315,-15137,    // w29 w28 w25 w24
  5315,22654,22654,-26722,     // w31 w30 w27 w26
};
//=============================================================================
// Helper macros for the code
//=============================================================================

//-----------------------------------------------------------------------------
// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER )
//-----------------------------------------------------------------------------

#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\
    "movq " #A1 ",%%mm0              \n\t"/* 0 ; x3 x2 x1 x0*/\
    "movq 8+" #A1 ",%%mm1            \n\t"/* 1 ; x7 x6 x5 x4*/\
    "movq %%mm0,%%mm2                \n\t"/* 2 ; x3 x2 x1 x0*/\
    "movq " #A3 ",%%mm3              \n\t"/* 3 ; w06 w04 w02 w00*/\
    "punpcklwd %%mm1,%%mm0           \n\t"/* x5 x1 x4 x0*/\
    "movq %%mm0,%%mm5                \n\t"/* 5 ; x5 x1 x4 x0*/\
    "punpckldq %%mm0,%%mm0           \n\t"/* x4 x0 x4 x0*/\
    "movq 8+" #A3 ",%%mm4            \n\t"/* 4 ; w07 w05 w03 w01*/\
    "punpckhwd %%mm1,%%mm2           \n\t"/* 1 ; x7 x3 x6 x2*/\
    "pmaddwd %%mm0,%%mm3             \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\
    "movq %%mm2,%%mm6                \n\t"/* 6 ; x7 x3 x6 x2*/\
    "movq 32+" #A3 ",%%mm1           \n\t"/* 1 ; w22 w20 w18 w16*/\
    "punpckldq %%mm2,%%mm2           \n\t"/* x6 x2 x6 x2*/\
    "pmaddwd %%mm2,%%mm4             \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\
    "punpckhdq %%mm5,%%mm5           \n\t"/* x5 x1 x5 x1*/\
    "pmaddwd 16+" #A3 ",%%mm0        \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\
    "punpckhdq %%mm6,%%mm6           \n\t"/* x7 x3 x7 x3*/\
    "movq 40+" #A3 ",%%mm7           \n\t"/* 7 ; w23 w21 w19 w17*/\
    "pmaddwd %%mm5,%%mm1             \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\
    "paddd " #A4 ",%%mm3             \n\t"/* +%4*/\
    "pmaddwd %%mm6,%%mm7             \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\
    "pmaddwd 24+" #A3 ",%%mm2        \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\
    "paddd %%mm4,%%mm3               \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
    "pmaddwd 48+" #A3 ",%%mm5        \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\
    "movq %%mm3,%%mm4                \n\t"/* 4 ; a1 a0*/\
    "pmaddwd 56+" #A3 ",%%mm6        \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\
    "paddd %%mm7,%%mm1               \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
    "paddd " #A4 ",%%mm0             \n\t"/* +%4*/\
    "psubd %%mm1,%%mm3               \n\t"/* a1-b1 a0-b0*/\
    "psrad $11,%%mm3                 \n\t"/* y6=a1-b1 y7=a0-b0*/\
    "paddd %%mm4,%%mm1               \n\t"/* 4 ; a1+b1 a0+b0*/\
    "paddd %%mm2,%%mm0               \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\
    "psrad $11,%%mm1                 \n\t"/* y1=a1+b1 y0=a0+b0*/\
    "paddd %%mm6,%%mm5               \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\
    "movq %%mm0,%%mm4                \n\t"/* 4 ; a3 a2*/\
    "paddd %%mm5,%%mm0               \n\t"/* a3+b3 a2+b2*/\
    "psubd %%mm5,%%mm4               \n\t"/* 5 ; a3-b3 a2-b2*/\
    "psrad $11,%%mm0                 \n\t"/* y3=a3+b3 y2=a2+b2*/\
    "psrad $11,%%mm4                 \n\t"/* y4=a3-b3 y5=a2-b2*/\
    "packssdw %%mm0,%%mm1            \n\t"/* 0 ; y3 y2 y1 y0*/\
    "packssdw %%mm3,%%mm4            \n\t"/* 3 ; y6 y7 y4 y5*/\
    "movq %%mm4,%%mm7                \n\t"/* 7 ; y6 y7 y4 y5*/\
    "psrld $16,%%mm4                 \n\t"/* 0 y6 0 y4*/\
    "pslld $16,%%mm7                 \n\t"/* y7 0 y5 0*/\
    "movq %%mm1," #A2 "              \n\t"/* 1 ; save y3 y2 y1 y0*/\
    "por %%mm4,%%mm7                 \n\t"/* 4 ; y7 y6 y5 y4*/\
    "movq %%mm7,8 +" #A2 "           \n\t"/* 7 ; save y7 y6 y5 y4*/\


//-----------------------------------------------------------------------------
|
||||
// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\
    "movq " #A1 ",%%mm0              \n\t"/* 0 ; x3 x2 x1 x0*/\
    "movq 8+" #A1 ",%%mm1            \n\t"/* 1 ; x7 x6 x5 x4*/\
    "movq %%mm0,%%mm2                \n\t"/* 2 ; x3 x2 x1 x0*/\
    "movq " #A3 ",%%mm3              \n\t"/* 3 ; w05 w04 w01 w00*/\
    "pshufw $0x88,%%mm0,%%mm0        \n\t"/* x2 x0 x2 x0*/\
    "movq 8+" #A3 ",%%mm4            \n\t"/* 4 ; w07 w06 w03 w02*/\
    "movq %%mm1,%%mm5                \n\t"/* 5 ; x7 x6 x5 x4*/\
    "pmaddwd %%mm0,%%mm3             \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\
    "movq 32+" #A3 ",%%mm6           \n\t"/* 6 ; w21 w20 w17 w16*/\
    "pshufw $0x88,%%mm1,%%mm1        \n\t"/* x6 x4 x6 x4*/\
    "pmaddwd %%mm1,%%mm4             \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\
    "movq 40+" #A3 ",%%mm7           \n\t"/* 7 ; w23 w22 w19 w18*/\
    "pshufw $0xdd,%%mm2,%%mm2        \n\t"/* x3 x1 x3 x1*/\
    "pmaddwd %%mm2,%%mm6             \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\
    "pshufw $0xdd,%%mm5,%%mm5        \n\t"/* x7 x5 x7 x5*/\
    "pmaddwd %%mm5,%%mm7             \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\
    "paddd " #A4 ",%%mm3             \n\t"/* +%4*/\
    "pmaddwd 16+" #A3 ",%%mm0        \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\
    "paddd %%mm4,%%mm3               \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
    "pmaddwd 24+" #A3 ",%%mm1        \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\
    "movq %%mm3,%%mm4                \n\t"/* 4 ; a1 a0*/\
    "pmaddwd 48+" #A3 ",%%mm2        \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\
    "paddd %%mm7,%%mm6               \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
    "pmaddwd 56+" #A3 ",%%mm5        \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\
    "paddd %%mm6,%%mm3               \n\t"/* a1+b1 a0+b0*/\
    "paddd " #A4 ",%%mm0             \n\t"/* +%4*/\
    "psrad $11,%%mm3                 \n\t"/* y1=a1+b1 y0=a0+b0*/\
    "paddd %%mm1,%%mm0               \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\
    "psubd %%mm6,%%mm4               \n\t"/* 6 ; a1-b1 a0-b0*/\
    "movq %%mm0,%%mm7                \n\t"/* 7 ; a3 a2*/\
    "paddd %%mm5,%%mm2               \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\
    "paddd %%mm2,%%mm0               \n\t"/* a3+b3 a2+b2*/\
    "psrad $11,%%mm4                 \n\t"/* y6=a1-b1 y7=a0-b0*/\
    "psubd %%mm2,%%mm7               \n\t"/* 2 ; a3-b3 a2-b2*/\
    "psrad $11,%%mm0                 \n\t"/* y3=a3+b3 y2=a2+b2*/\
    "psrad $11,%%mm7                 \n\t"/* y4=a3-b3 y5=a2-b2*/\
    "packssdw %%mm0,%%mm3            \n\t"/* 0 ; y3 y2 y1 y0*/\
    "packssdw %%mm4,%%mm7            \n\t"/* 4 ; y6 y7 y4 y5*/\
    "movq %%mm3, " #A2 "             \n\t"/* 3 ; save y3 y2 y1 y0*/\
    "pshufw $0xb1,%%mm7,%%mm7        \n\t"/* y7 y6 y5 y4*/\
    "movq %%mm7,8 +" #A2 "           \n\t"/* 7 ; save y7 y6 y5 y4*/\


//-----------------------------------------------------------------------------
|
||||
//
|
||||
// The first stage DCT 8x8 - forward DCTs of columns
|
||||
//
|
||||
// The %2puts are multiplied
|
||||
// for rows 0,4 - on cos_4_16,
|
||||
// for rows 1,7 - on cos_1_16,
|
||||
// for rows 2,6 - on cos_2_16,
|
||||
// for rows 3,5 - on cos_3_16
|
||||
// and are shifted to the left for rise of accuracy
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// The 8-point scaled forward DCT algorithm (26a8m)
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// #define DCT_8_FRW_COL(x, y)
|
||||
//{
|
||||
// short t0, t1, t2, t3, t4, t5, t6, t7;
|
||||
// short tp03, tm03, tp12, tm12, tp65, tm65;
|
||||
// short tp465, tm465, tp765, tm765;
|
||||
//
|
||||
// t0 = LEFT_SHIFT ( x[0] + x[7] );
|
||||
// t1 = LEFT_SHIFT ( x[1] + x[6] );
|
||||
// t2 = LEFT_SHIFT ( x[2] + x[5] );
|
||||
// t3 = LEFT_SHIFT ( x[3] + x[4] );
|
||||
// t4 = LEFT_SHIFT ( x[3] - x[4] );
|
||||
// t5 = LEFT_SHIFT ( x[2] - x[5] );
|
||||
// t6 = LEFT_SHIFT ( x[1] - x[6] );
|
||||
// t7 = LEFT_SHIFT ( x[0] - x[7] );
|
||||
//
|
||||
// tp03 = t0 + t3;
|
||||
// tm03 = t0 - t3;
|
||||
// tp12 = t1 + t2;
|
||||
// tm12 = t1 - t2;
|
||||
//
|
||||
// y[0] = tp03 + tp12;
|
||||
// y[4] = tp03 - tp12;
|
||||
//
|
||||
// y[2] = tm03 + tm12 * tg_2_16;
|
||||
// y[6] = tm03 * tg_2_16 - tm12;
|
||||
//
|
||||
// tp65 =(t6 +t5 )*cos_4_16;
|
||||
// tm65 =(t6 -t5 )*cos_4_16;
|
||||
//
|
||||
// tp765 = t7 + tp65;
|
||||
// tm765 = t7 - tp65;
|
||||
// tp465 = t4 + tm65;
|
||||
// tm465 = t4 - tm65;
|
||||
//
|
||||
// y[1] = tp765 + tp465 * tg_1_16;
|
||||
// y[7] = tp765 * tg_1_16 - tp465;
|
||||
// y[5] = tm765 * tg_3_16 + tm465;
|
||||
// y[3] = tm765 - tm465 * tg_3_16;
|
||||
//}
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
// DCT_8_INV_COL_4 INP,OUT
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
#define DCT_8_INV_COL(A1,A2)\
|
||||
"movq 2*8(%3),%%mm0\n\t"\
|
||||
"movq 16*3+" #A1 ",%%mm3\n\t"\
|
||||
"movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\
|
||||
"movq 16*5+" #A1 ",%%mm5\n\t"\
|
||||
"pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\
|
||||
"movq (%3),%%mm4\n\t"\
|
||||
"pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\
|
||||
"movq 16*7+" #A1 ",%%mm7\n\t"\
|
||||
"movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\
|
||||
"movq 16*1+" #A1 ",%%mm6\n\t"\
|
||||
"pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\
|
||||
"paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\
|
||||
"pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\
|
||||
"paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\
|
||||
"psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\
|
||||
"movq 3*8(%3),%%mm3\n\t"\
|
||||
"paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\
|
||||
"paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\
|
"psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\
"movq %%mm4,%%mm5 \n\t"/* tp17*/\
"movq %%mm2,%%mm6 \n\t"/* tm17*/\
"paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\
"psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\
"psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\
"paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\
"movq 1*8(%3),%%mm7\n\t"\
"movq %%mm4,%%mm1 \n\t"/* t1*/\
"movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\
"paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\
"movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\
"psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\
"movq 2*16+" #A1 ",%%mm5\n\t"\
"movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\
"movq 6*16+" #A1 ",%%mm6\n\t"\
"pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\
"pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\
"pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\
"movq 0*16+" #A1 ",%%mm2\n\t"\
"pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\
"psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\
"movq %%mm2,%%mm3 \n\t"/* x0*/\
"movq 4*16+" #A1 ",%%mm6\n\t"\
"paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\
"paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\
"psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\
"movq %%mm2,%%mm5 \n\t"/* tp04*/\
"movq %%mm3,%%mm6 \n\t"/* tm04*/\
"psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\
"paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\
"paddsw %%mm1,%%mm1 \n\t"/* b1*/\
"paddsw %%mm4,%%mm4 \n\t"/* b2*/\
"paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\
"psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\
"movq %%mm3,%%mm7 \n\t"/* a1*/\
"movq %%mm6,%%mm0 \n\t"/* a2*/\
"paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\
"paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\
"psraw $6,%%mm3 \n\t"/* dst1*/\
"psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\
"psraw $6,%%mm6 \n\t"/* dst2*/\
"psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\
"movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\
"psraw $6,%%mm7 \n\t"/* dst6*/\
"movq %%mm5,%%mm4 \n\t"/* a0*/\
"psraw $6,%%mm0 \n\t"/* dst5*/\
"movq %%mm3,1*16+" #A2 "\n\t"\
"paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\
"movq %%mm6,2*16+" #A2 "\n\t"\
"psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\
"movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\
"psraw $6,%%mm5 \n\t"/* dst0*/\
"movq %%mm2,%%mm6 \n\t"/* a3*/\
"psraw $6,%%mm4 \n\t"/* dst7*/\
"movq %%mm0,5*16+" #A2 "\n\t"\
"paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\
"movq %%mm7,6*16+" #A2 "\n\t"\
"psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\
"movq %%mm5,0*16+" #A2 "\n\t"\
"psraw $6,%%mm2 \n\t"/* dst3*/\
"movq %%mm4,7*16+" #A2 "\n\t"\
"psraw $6,%%mm6 \n\t"/* dst4*/\
"movq %%mm2,3*16+" #A2 "\n\t"\
"movq %%mm6,4*16+" #A2 "\n\t"

//=============================================================================
// Code
//=============================================================================

//-----------------------------------------------------------------------------
// void idct_mmx(uint16_t block[64]);
//-----------------------------------------------------------------------------


void ff_idct_xvid_mmx(short *block){
__asm__ volatile(
//# Process each row
DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))

//# Process the columns (4 at a time)
DCT_8_INV_COL(0(%0), 0(%0))
DCT_8_INV_COL(8(%0), 8(%0))
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16));
}
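
/* The two passes above are the separable 2-D inverse DCT: an 8-point IDCT
 * over every row, then over every column. A scalar sketch of the structure
 * (idct8() is a hypothetical 1-D helper, not part of this file):
 *
 *     void idct_2d_ref(short block[64])
 *     {
 *         for (int r = 0; r < 8; r++)    // 1-D IDCT along each row
 *             idct8(&block[8 * r], 1);   //   stride 1 within a row
 *         for (int c = 0; c < 8; c++)    // then along each column
 *             idct8(&block[c], 8);       //   stride 8 down a column
 *     }
 *
 * The column pass runs four columns at a time because a 64-bit MMX register
 * holds four 16-bit coefficients, hence the two DCT_8_INV_COL calls for
 * columns 0-3 (offset 0) and 4-7 (offset 8 bytes). */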

//-----------------------------------------------------------------------------
// void idct_xmm(uint16_t block[64]);
//-----------------------------------------------------------------------------


void ff_idct_xvid_mmxext(short *block)
{
__asm__ volatile(
//# Process each row
DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))

//# Process the columns (4 at a time)
DCT_8_INV_COL(0(%0), 0(%0))
DCT_8_INV_COL(8(%0), 8(%0))
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
}

void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_idct_xvid_mmx(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}

void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_idct_xvid_mmx(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}

void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_idct_xvid_mmxext(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}

void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, DCTELEM *block)
{
ff_idct_xvid_mmxext(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}

#endif /* HAVE_INLINE_ASM */
408
project/jni/ffmpeg/libavcodec/x86/idct_sse2_xvid.c
Normal file
@@ -0,0 +1,408 @@
/*
 * XVID MPEG-4 VIDEO CODEC
 * - SSE2 inverse discrete cosine transform -
 *
 * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
 *
 * Conversion to gcc syntax with modifications
 * by Alexander Strange <astrange@ithinksw.com>
 *
 * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
 *
 * This file is part of FFmpeg.
 *
 * Vertical pass is an implementation of the scheme:
 * Loeffler C., Ligtenberg A., and Moschytz C.S.:
 * Practical Fast 1D DCT Algorithm with Eleven Multiplications,
 * Proc. ICASSP 1989, 988-991.
 *
 * Horizontal pass is a double 4x4 vector/matrix multiplication,
 * (see also Intel's Application Note 922:
 * http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
 * Copyright (C) 1999 Intel Corporation)
 *
 * More details at http://skal.planet-d.net/coding/dct.html
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with FFmpeg; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "idct_xvid.h"
#include "dsputil_mmx.h"

#if HAVE_INLINE_ASM

/**
 * @file
 * @brief SSE2 idct compatible with xvidmmx
 */

#define X8(x) x,x,x,x,x,x,x,x

#define ROW_SHIFT 11
#define COL_SHIFT 6

DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
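
/* These constants are fixed-point factors with an implicit /65536: pmulhw
 * keeps only the high 16 bits of each signed 16x16 product, i.e. (a*b)>>16,
 * so a word c stands for the real factor c/65536 (27146/65536 = 0.41421 =
 * tan(2*pi/16), matching the comment above). Factors of 0.5 or more do not
 * fit in a signed word at this scale, which is why tan3 stores
 * tan(3*pi/16)-1 (43790 wraps to -21746) and sqrt2 stores 0.5/sqrt(2);
 * the column pass compensates with an extra paddsw of the input. */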

DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
};

DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
};

DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
};

DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
};

DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
65536, 65536, 65536, 65536,
3597, 3597, 3597, 3597,
2260, 2260, 2260, 2260,
1203, 1203, 1203, 1203,
120, 120, 120, 120,
512, 512, 512, 512
};

// Temporary storage before the column pass
#define ROW1 "%%xmm6"
#define ROW3 "%%xmm4"
#define ROW5 "%%xmm5"
#define ROW7 "%%xmm7"

#define CLEAR_ODD(r) "pxor "r","r" \n\t"
#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"

#if ARCH_X86_64

# define ROW0 "%%xmm8"
# define REG0 ROW0
# define ROW2 "%%xmm9"
# define REG2 ROW2
# define ROW4 "%%xmm10"
# define REG4 ROW4
# define ROW6 "%%xmm11"
# define REG6 ROW6
# define CLEAR_EVEN(r) CLEAR_ODD(r)
# define PUT_EVEN(dst) PUT_ODD(dst)
# define XMMS "%%xmm12"
# define MOV_32_ONLY "#"
# define SREG2 REG2
# define TAN3 "%%xmm13"
# define TAN1 "%%xmm14"

#else

# define ROW0 "(%0)"
# define REG0 "%%xmm4"
# define ROW2 "2*16(%0)"
# define REG2 "%%xmm4"
# define ROW4 "4*16(%0)"
# define REG4 "%%xmm6"
# define ROW6 "6*16(%0)"
# define REG6 "%%xmm6"
# define CLEAR_EVEN(r)
# define PUT_EVEN(dst) \
"pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
"movdqa %%xmm2, "dst" \n\t"
# define XMMS "%%xmm2"
# define MOV_32_ONLY "movdqa "
# define SREG2 "%%xmm7"
# define TAN3 "%%xmm0"
# define TAN1 "%%xmm2"

#endif

#define ROUND(x) "paddd "MANGLE(x)

#define JZ(reg, to) \
"testl "reg","reg" \n\t" \
"jz "to" \n\t"

#define JNZ(reg, to) \
"testl "reg","reg" \n\t" \
"jnz "to" \n\t"

#define TEST_ONE_ROW(src, reg, clear) \
clear \
"movq "src", %%mm1 \n\t" \
"por 8+"src", %%mm1 \n\t" \
"paddusb %%mm0, %%mm1 \n\t" \
"pmovmskb %%mm1, "reg" \n\t"

#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
clear1 \
clear2 \
"movq "row1", %%mm1 \n\t" \
"por 8+"row1", %%mm1 \n\t" \
"movq "row2", %%mm2 \n\t" \
"por 8+"row2", %%mm2 \n\t" \
"paddusb %%mm0, %%mm1 \n\t" \
"paddusb %%mm0, %%mm2 \n\t" \
"pmovmskb %%mm1, "reg1" \n\t" \
"pmovmskb %%mm2, "reg2" \n\t"
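
/* How the row-skip test works: m127 is 0x7f in every byte, and paddusb
 * saturates, so after the add a byte's most significant bit is set exactly
 * when the source byte was nonzero (0+127 = 127, MSB clear; anything >= 1
 * saturates to >= 128). pmovmskb gathers those MSBs, so the destination
 * register is zero iff the whole 16-byte row (the two movq/por halves) is
 * zero and its transform can be skipped. */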

///IDCT pass on rows.
#define iMTX_MULT(src, table, rounder, put) \
"movdqa "src", %%xmm3 \n\t" \
"movdqa %%xmm3, %%xmm0 \n\t" \
"pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
"punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
"pmaddwd "table", %%xmm0 \n\t" \
"pmaddwd 16+"table", %%xmm1 \n\t" \
"pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
"punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
"pmaddwd 32+"table", %%xmm2 \n\t" \
"pmaddwd 48+"table", %%xmm3 \n\t" \
"paddd %%xmm1, %%xmm0 \n\t" \
"paddd %%xmm3, %%xmm2 \n\t" \
rounder", %%xmm0 \n\t" \
"movdqa %%xmm2, %%xmm3 \n\t" \
"paddd %%xmm0, %%xmm2 \n\t" \
"psubd %%xmm3, %%xmm0 \n\t" \
"psrad $11, %%xmm2 \n\t" \
"psrad $11, %%xmm0 \n\t" \
"packssdw %%xmm0, %%xmm2 \n\t" \
put \
"1: \n\t"
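
/* One iMTX_MULT is the entire horizontal pass for one row: the eight
 * coefficients are split into even/odd quads, each quad is multiplied
 * against 4x4 slices of the table with pmaddwd, and the two partial-sum
 * vectors are combined as sum and difference (the cosine symmetry lets
 * outputs 0-3 and 7-4 share them) before rounding and shifting by
 * ROW_SHIFT = 11. As a scalar reference for the arithmetic (a sketch, not
 * code from this file):
 *
 *     dst[i] = (sum_j tab[i][j] * src[j] + round) >> 11;
 */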

#define iLLM_HEAD \
"movdqa "MANGLE(tan3)", "TAN3" \n\t" \
"movdqa "MANGLE(tan1)", "TAN1" \n\t" \

///IDCT pass on columns.
#define iLLM_PASS(dct) \
"movdqa "TAN3", %%xmm1 \n\t" \
"movdqa "TAN1", %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"pmulhw %%xmm5, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN3" \n\t" \
"paddsw %%xmm5, %%xmm1 \n\t" \
"psubsw %%xmm5, "TAN3" \n\t" \
"paddsw %%xmm4, %%xmm1 \n\t" \
"pmulhw %%xmm7, %%xmm3 \n\t" \
"pmulhw %%xmm6, "TAN1" \n\t" \
"paddsw %%xmm6, %%xmm3 \n\t" \
"psubsw %%xmm7, "TAN1" \n\t" \
"movdqa %%xmm3, %%xmm7 \n\t" \
"movdqa "TAN1", %%xmm6 \n\t" \
"psubsw %%xmm1, %%xmm3 \n\t" \
"psubsw "TAN3", "TAN1" \n\t" \
"paddsw %%xmm7, %%xmm1 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa %%xmm3, %%xmm6 \n\t" \
"psubsw "TAN3", %%xmm3 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
"pmulhw %%xmm4, %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw "TAN3", "TAN3" \n\t" \
"paddsw %%xmm3, %%xmm3 \n\t" \
"movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
MOV_32_ONLY ROW2", "REG2" \n\t" \
MOV_32_ONLY ROW6", "REG6" \n\t" \
"movdqa %%xmm7, %%xmm5 \n\t" \
"pmulhw "REG6", %%xmm7 \n\t" \
"pmulhw "REG2", %%xmm5 \n\t" \
"paddsw "REG2", %%xmm7 \n\t" \
"psubsw "REG6", %%xmm5 \n\t" \
MOV_32_ONLY ROW0", "REG0" \n\t" \
MOV_32_ONLY ROW4", "REG4" \n\t" \
MOV_32_ONLY" "TAN1", (%0) \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw "REG4", "REG0" \n\t" \
"paddsw "XMMS", "REG4" \n\t" \
"movdqa "REG4", "XMMS" \n\t" \
"psubsw %%xmm7, "REG4" \n\t" \
"paddsw "XMMS", %%xmm7 \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm5, "REG0" \n\t" \
"paddsw "XMMS", %%xmm5 \n\t" \
"movdqa %%xmm5, "XMMS" \n\t" \
"psubsw "TAN3", %%xmm5 \n\t" \
"paddsw "XMMS", "TAN3" \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm3, "REG0" \n\t" \
"paddsw "XMMS", %%xmm3 \n\t" \
MOV_32_ONLY" (%0), "TAN1" \n\t" \
"psraw $6, %%xmm5 \n\t" \
"psraw $6, "REG0" \n\t" \
"psraw $6, "TAN3" \n\t" \
"psraw $6, %%xmm3 \n\t" \
"movdqa "TAN3", 1*16("dct") \n\t" \
"movdqa %%xmm3, 2*16("dct") \n\t" \
"movdqa "REG0", 5*16("dct") \n\t" \
"movdqa %%xmm5, 6*16("dct") \n\t" \
"movdqa %%xmm7, %%xmm0 \n\t" \
"movdqa "REG4", %%xmm4 \n\t" \
"psubsw %%xmm1, %%xmm7 \n\t" \
"psubsw "TAN1", "REG4" \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN1" \n\t" \
"psraw $6, %%xmm1 \n\t" \
"psraw $6, %%xmm7 \n\t" \
"psraw $6, "TAN1" \n\t" \
"psraw $6, "REG4" \n\t" \
"movdqa %%xmm1, ("dct") \n\t" \
"movdqa "TAN1", 3*16("dct") \n\t" \
"movdqa "REG4", 4*16("dct") \n\t" \
"movdqa %%xmm7, 7*16("dct") \n\t"

///IDCT pass on columns, assuming rows 4-7 are zero.
#define iLLM_PASS_SPARSE(dct) \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw %%xmm4, "TAN3" \n\t" \
"movdqa %%xmm6, %%xmm3 \n\t" \
"pmulhw %%xmm6, "TAN1" \n\t" \
"movdqa %%xmm4, %%xmm1 \n\t" \
"psubsw %%xmm1, %%xmm3 \n\t" \
"paddsw %%xmm6, %%xmm1 \n\t" \
"movdqa "TAN1", %%xmm6 \n\t" \
"psubsw "TAN3", "TAN1" \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa %%xmm3, %%xmm6 \n\t" \
"psubsw "TAN3", %%xmm3 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
"pmulhw %%xmm4, %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw "TAN3", "TAN3" \n\t" \
"paddsw %%xmm3, %%xmm3 \n\t" \
"movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
MOV_32_ONLY ROW2", "SREG2" \n\t" \
"pmulhw "SREG2", %%xmm5 \n\t" \
MOV_32_ONLY ROW0", "REG0" \n\t" \
"movdqa "REG0", %%xmm6 \n\t" \
"psubsw "SREG2", %%xmm6 \n\t" \
"paddsw "REG0", "SREG2" \n\t" \
MOV_32_ONLY" "TAN1", (%0) \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm5, "REG0" \n\t" \
"paddsw "XMMS", %%xmm5 \n\t" \
"movdqa %%xmm5, "XMMS" \n\t" \
"psubsw "TAN3", %%xmm5 \n\t" \
"paddsw "XMMS", "TAN3" \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm3, "REG0" \n\t" \
"paddsw "XMMS", %%xmm3 \n\t" \
MOV_32_ONLY" (%0), "TAN1" \n\t" \
"psraw $6, %%xmm5 \n\t" \
"psraw $6, "REG0" \n\t" \
"psraw $6, "TAN3" \n\t" \
"psraw $6, %%xmm3 \n\t" \
"movdqa "TAN3", 1*16("dct") \n\t" \
"movdqa %%xmm3, 2*16("dct") \n\t" \
"movdqa "REG0", 5*16("dct") \n\t" \
"movdqa %%xmm5, 6*16("dct") \n\t" \
"movdqa "SREG2", %%xmm0 \n\t" \
"movdqa %%xmm6, %%xmm4 \n\t" \
"psubsw %%xmm1, "SREG2" \n\t" \
"psubsw "TAN1", %%xmm6 \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN1" \n\t" \
"psraw $6, %%xmm1 \n\t" \
"psraw $6, "SREG2" \n\t" \
"psraw $6, "TAN1" \n\t" \
"psraw $6, %%xmm6 \n\t" \
"movdqa %%xmm1, ("dct") \n\t" \
"movdqa "TAN1", 3*16("dct") \n\t" \
"movdqa %%xmm6, 4*16("dct") \n\t" \
"movdqa "SREG2", 7*16("dct") \n\t"

inline void ff_idct_xvid_sse2(short *block)
{
__asm__ volatile(
"movq "MANGLE(m127)", %%mm0 \n\t"
iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))

TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
JZ("%%eax", "1f")
iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))

TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
iLLM_HEAD
".p2align 4 \n\t"
JNZ("%%ecx", "2f")
JNZ("%%eax", "3f")
JNZ("%%edx", "4f")
JNZ("%%esi", "5f")
iLLM_PASS_SPARSE("%0")
"jmp 6f \n\t"
"2: \n\t"
iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
"3: \n\t"
iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
JZ("%%edx", "1f")
"4: \n\t"
iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
JZ("%%esi", "1f")
"5: \n\t"
iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
#if !ARCH_X86_64
iLLM_HEAD
#endif
iLLM_PASS("%0")
"6: \n\t"
: "+r"(block)
:
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
#if ARCH_X86_64
XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14",)
#endif
"%eax", "%ecx", "%edx", "%esi", "memory"
);
}
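
/* Dispatch logic above: rows 0-2 are always transformed. TEST_TWO_ROWS /
 * TEST_ONE_ROW leave a nonzero flag in eax/ecx/edx/esi for each nonempty
 * row among 3-7. If rows 4-7 are all empty (common for coarsely quantized
 * blocks), the cheaper iLLM_PASS_SPARSE column pass runs; otherwise the
 * JNZ chain falls into labels 2:-5: to finish the remaining row transforms
 * before the full iLLM_PASS. */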

void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
{
ff_idct_xvid_sse2(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}

void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
{
ff_idct_xvid_sse2(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}

#endif /* HAVE_INLINE_ASM */
45
project/jni/ffmpeg/libavcodec/x86/idct_xvid.h
Normal file
@@ -0,0 +1,45 @@
/*
 * XVID MPEG-4 VIDEO CODEC
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * header for Xvid IDCT functions
 */

#ifndef AVCODEC_X86_IDCT_XVID_H
#define AVCODEC_X86_IDCT_XVID_H

#include <stdint.h>

#include "libavcodec/dsputil.h"

void ff_idct_xvid_mmx(short *block);
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block);
void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block);

void ff_idct_xvid_mmxext(short *block);
void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, DCTELEM *block);
void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, DCTELEM *block);

void ff_idct_xvid_sse2(short *block);
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block);
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block);

#endif /* AVCODEC_X86_IDCT_XVID_H */
724
project/jni/ffmpeg/libavcodec/x86/imdct36.asm
Normal file
@@ -0,0 +1,724 @@
;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

align 16
ps_mask: dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0, 0, ~0
ps_mask3: dd 0, 0, 0, ~0
ps_mask4: dd 0, ~0, 0, 0

ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097

ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000

ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
dd 1.0, 0.70710678118654752439, 0.0, 0.0

ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
dd 1.0, 0.70710678118654752439, 0.0, 0.0

costabs: times 4 dd 0.98480773
times 4 dd 0.93969262
times 4 dd 0.86602539
times 4 dd -0.76604444
times 4 dd -0.64278764
times 4 dd 0.50000000
times 4 dd -0.50000000
times 4 dd -0.34202015
times 4 dd -0.17364818
times 4 dd 0.50190992
times 4 dd 0.51763808
times 4 dd 0.55168896
times 4 dd 0.61038726
times 4 dd 0.70710677
times 4 dd 0.87172341
times 4 dd 1.18310082
times 4 dd 1.93185163
times 4 dd 5.73685646

%define SBLIMIT 32
SECTION_TEXT

%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
pshufd %1, %2, %3
%else
shufps %1, %2, %2, %3
%endif
%endmacro

; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
shufps %1, %2, %3, 0x4e
%else
movlhps %1, %3
movhlps %1, %2
%endif
%endmacro

; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
%macro ROTLEFT 3
%if cpuflag(ssse3)
palignr %1, %3, %2, 12
%else
BUILDINVHIGHLOW %1, %2, %3
shufps %1, %1, %3, 0x99
%endif
%endmacro

%macro INVERTHL 2
%if cpuflag(sse2)
PSHUFD %1, %2, 0x4e
%else
movhlps %1, %2
movlhps %1, %2
%endif
%endmacro

%macro BUTTERF 3
INVERTHL %2, %1
xorps %1, [ps_p1p1m1m1]
addps %1, %2
%if cpuflag(sse3)
mulps %1, %1, [ps_cosh_sse3 + %3]
PSHUFD %2, %1, 0xb1
addsubps %1, %1, %2
%else
mulps %1, [ps_cosh + %3]
PSHUFD %2, %1, 0xb1
xorps %1, [ps_p1m1p1m1]
addps %1, %2
%endif
%endmacro
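
; BUTTERF notes: INVERTHL plus the ps_p1p1m1m1 sign mask turn {a,b,c,d}
; into the butterfly {a+c, b+d, a-c, b-d}, which is then scaled by one row
; of the cosine table. With SSE3, addsubps (per-lane alternating
; subtract/add) pairs with the sign-flipped ps_cosh_sse3 constants; before
; SSE3 the same result comes from plain ps_cosh plus an explicit xorps with
; ps_p1m1p1m1 followed by addps.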

%macro STORE 4
movhlps %2, %1
movss [%3 ], %1
movss [%3 + 2*%4], %2
shufps %1, %1, 0xb1
movss [%3 + %4], %1
movhlps %2, %1
movss [%3 + 3*%4], %2
%endmacro

%macro LOAD 4
movlps %1, [%3 ]
movhps %1, [%3 + %4]
movlps %2, [%3 + 2*%4]
movhps %2, [%3 + 3*%4]
shufps %1, %2, 0x88
%endmacro

%macro LOADA64 2
%if cpuflag(avx)
movu %1, [%2]
%else
movlps %1, [%2]
movhps %1, [%2 + 8]
%endif
%endmacro

%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win

; for(i=17;i>=1;i--) in[i] += in[i-1];
LOADA64 m0, inq
LOADA64 m1, inq + 16

ROTLEFT m5, m0, m1

PSHUFD m6, m0, 0x93
andps m6, m6, [ps_mask]
addps m0, m0, m6

LOADA64 m2, inq + 32

ROTLEFT m7, m1, m2

addps m1, m1, m5
LOADA64 m3, inq + 48

ROTLEFT m5, m2, m3

xorps m4, m4, m4
movlps m4, [inq+64]
BUILDINVHIGHLOW m6, m3, m4
shufps m6, m6, m4, 0xa9

addps m4, m4, m6
addps m2, m2, m7
addps m3, m3, m5

; for(i=17;i>=3;i-=2) in[i] += in[i-2];
movlhps m5, m5, m0
andps m5, m5, [ps_mask3]

BUILDINVHIGHLOW m7, m0, m1
andps m7, m7, [ps_mask2]

addps m0, m0, m5

BUILDINVHIGHLOW m6, m1, m2
andps m6, m6, [ps_mask2]

addps m1, m1, m7

BUILDINVHIGHLOW m7, m2, m3
andps m7, m7, [ps_mask2]

addps m2, m2, m6

movhlps m6, m6, m3
andps m6, m6, [ps_mask4]

addps m3, m3, m7
addps m4, m4, m6

; Populate tmp[]
movlhps m6, m1, m5 ; zero out high values
subps m6, m6, m4

subps m5, m0, m3

%if ARCH_X86_64
SWAP m5, m8
%endif

mulps m7, m2, [ps_val1]

%if ARCH_X86_64
mulps m5, m8, [ps_val2]
%else
mulps m5, m5, [ps_val2]
%endif
addps m7, m7, m5

mulps m5, m6, [ps_val1]
subps m7, m7, m5

%if ARCH_X86_64
SWAP m5, m8
%else
subps m5, m0, m3
%endif

subps m5, m5, m6
addps m5, m5, m2

shufps m6, m4, m3, 0xe4
subps m6, m6, m2
mulps m6, m6, [ps_val3]

addps m4, m4, m1
mulps m4, m4, [ps_val4]

shufps m1, m1, m0, 0xe4
addps m1, m1, m2
mulps m1, m1, [ps_val5]

mulps m3, m3, [ps_val6]
mulps m0, m0, [ps_val7]
addps m0, m0, m3

xorps m2, m1, [ps_p1p1m1m1]
subps m2, m2, m4
addps m2, m2, m0

addps m3, m4, m0
subps m3, m3, m6
xorps m3, m3, [ps_p1p1m1m1]

shufps m0, m0, m4, 0xe4
subps m0, m0, m1
addps m0, m0, m6

BUILDINVHIGHLOW m4, m2, m3
shufps m3, m3, m2, 0x4e

; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}

BUTTERF m0, m1, 0
BUTTERF m7, m2, 16
BUTTERF m3, m6, 32
BUTTERF m4, m1, 48

mulps m5, m5, [ps_cosh + 64]
PSHUFD m1, m5, 0xe1
xorps m5, m5, [ps_p1m1p1m1]
addps m5, m5, m1

; permutes:
; m0 0 1 2 3 => 2 6 10 14 m1
; m7 4 5 6 7 => 3 7 11 15 m2
; m3 8 9 10 11 => 17 13 9 5 m3
; m4 12 13 14 15 => 16 12 8 4 m5
; m5 16 17 xx xx => 0 1 xx xx m0

unpckhps m1, m0, m7
unpckhps m6, m3, m4
movhlps m2, m6, m1
movlhps m1, m1, m6

unpcklps m5, m5, m4
unpcklps m3, m3, m7
movhlps m4, m3, m5
movlhps m5, m5, m3
SWAP m4, m3
; permutation done

PSHUFD m6, m2, 0xb1
movss m4, [bufq + 4*68]
movss m7, [bufq + 4*64]
unpcklps m7, m7, m4
mulps m6, m6, [winq + 16*4]
addps m6, m6, m7
movss [outq + 64*SBLIMIT], m6
shufps m6, m6, m6, 0xb1
movss [outq + 68*SBLIMIT], m6

mulps m6, m3, [winq + 4*4]
LOAD m4, m7, bufq + 4*16, 16
addps m6, m6, m4
STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT

shufps m4, m0, m3, 0xb5
mulps m4, m4, [winq + 8*4]
LOAD m7, m6, bufq + 4*32, 16
addps m4, m4, m7
STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT

shufps m3, m3, m2, 0xb1
mulps m3, m3, [winq + 12*4]
LOAD m7, m6, bufq + 4*48, 16
addps m3, m3, m7
STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT

mulps m2, m2, [winq]
LOAD m6, m7, bufq, 16
addps m2, m2, m6
STORE m2, m7, outq, 4*SBLIMIT

mulps m4, m1, [winq + 20*4]
STORE m4, m7, bufq, 16

mulps m3, m5, [winq + 24*4]
STORE m3, m7, bufq + 4*16, 16

shufps m0, m0, m5, 0xb0
mulps m0, m0, [winq + 28*4]
STORE m0, m7, bufq + 4*32, 16

shufps m5, m5, m1, 0xb1
mulps m5, m5, [winq + 32*4]
STORE m5, m7, bufq + 4*48, 16

shufps m1, m1, m1, 0xb1
mulps m1, m1, [winq + 36*4]
movss [bufq + 4*64], m1
shufps m1, m1, 0xb1
movss [bufq + 4*68], m1
RET
%endmacro
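
; x86inc convention: each INIT_XMM <set> / DEFINE_IMDCT pair below
; re-expands the macro with cpuflag() answering for that instruction set,
; so the one cglobal body yields separate imdct36_float entry points
; (suffixed _sse, _sse2, _sse3, _ssse3, _avx) from the same source.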

INIT_XMM sse
DEFINE_IMDCT

INIT_XMM sse2
DEFINE_IMDCT

INIT_XMM sse3
DEFINE_IMDCT

INIT_XMM ssse3
DEFINE_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif

INIT_XMM sse

%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
movaps m%1, SPILLED(%2)
%endmacro
%endif
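
; Spilling strategy for the four-block version: x86-64 has xmm8-xmm15, so
; SPILL/UNSPILL degenerate to register renames (SWAP). 32-bit x86 only has
; xmm0-xmm7, so the same logical registers live in the tmpq scratch area
; instead and SPILLED(x) becomes an aligned memory operand for movaps.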

%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
movlps m0, [inq+64]
movhps m0, [inq+64 + 72]
movlps m3, [inq+64 + 2*72]
movhps m3, [inq+64 + 3*72]

shufps m5, m0, m3, 0xdd
shufps m0, m0, m3, 0x88

mova m1, [inq+48]
movu m6, [inq+48 + 72]
mova m7, [inq+48 + 2*72]
movu m3, [inq+48 + 3*72]

TRANSPOSE4x4PS 1, 6, 7, 3, 4

addps m4, m6, m7
mova [tmpq+4*28], m4

addps m7, m3
addps m6, m1
addps m3, m0
addps m0, m5
addps m0, m7
addps m7, m6
mova [tmpq+4*12], m7
SPILL 3, 12

mova m4, [inq+32]
movu m5, [inq+32 + 72]
mova m2, [inq+32 + 2*72]
movu m7, [inq+32 + 3*72]

TRANSPOSE4x4PS 4, 5, 2, 7, 3

addps m1, m7
SPILL 1, 11

addps m3, m5, m2
SPILL 3, 13

addps m7, m2
addps m5, m4
addps m6, m7
mova [tmpq], m6
addps m7, m5
mova [tmpq+4*16], m7

mova m2, [inq+16]
movu m7, [inq+16 + 72]
mova m1, [inq+16 + 2*72]
movu m6, [inq+16 + 3*72]

TRANSPOSE4x4PS 2, 7, 1, 6, 3

addps m4, m6
addps m6, m1
addps m1, m7
addps m7, m2
addps m5, m6
SPILL 5, 15
addps m6, m7
mulps m6, [costabs + 16*2]
mova [tmpq+4*8], m6
SPILL 1, 10
SPILL 0, 14

mova m1, [inq]
movu m6, [inq + 72]
mova m3, [inq + 2*72]
movu m5, [inq + 3*72]

TRANSPOSE4x4PS 1, 6, 3, 5, 0

addps m2, m5
addps m5, m3
addps m7, m5
addps m3, m6
addps m6, m1
SPILL 7, 8
addps m5, m6
SPILL 6, 9
addps m6, m4, SPILLED(12)
subps m6, m2
UNSPILL 7, 11
SPILL 5, 11
subps m5, m1, m7
mulps m7, [costabs + 16*5]
addps m7, m1
mulps m0, m6, [costabs + 16*6]
addps m0, m5
mova [tmpq+4*24], m0
addps m6, m5
mova [tmpq+4*4], m6
addps m6, m4, m2
mulps m6, [costabs + 16*1]
subps m4, SPILLED(12)
mulps m4, [costabs + 16*8]
addps m2, SPILLED(12)
mulps m2, [costabs + 16*3]
subps m5, m7, m6
subps m5, m2
addps m6, m7
addps m6, m4
addps m7, m2
subps m7, m4
mova [tmpq+4*20], m7
mova m2, [tmpq+4*28]
mova [tmpq+4*28], m5
UNSPILL 7, 13
subps m5, m7, m2
mulps m5, [costabs + 16*7]
UNSPILL 1, 10
mulps m1, [costabs + 16*2]
addps m4, m3, m2
mulps m4, [costabs + 16*4]
addps m2, m7
addps m7, m3
mulps m7, [costabs]
subps m3, m2
mulps m3, [costabs + 16*2]
addps m2, m7, m5
addps m2, m1
SPILL 2, 10
addps m7, m4
subps m7, m1
SPILL 7, 12
subps m5, m4
subps m5, m1
UNSPILL 0, 14
SPILL 5, 13
addps m1, m0, SPILLED(15)
subps m1, SPILLED(8)
mova m4, [costabs + 16*5]
mulps m4, [tmpq]
UNSPILL 2, 9
addps m4, m2
subps m2, [tmpq]
mulps m5, m1, [costabs + 16*6]
addps m5, m2
SPILL 5, 9
addps m2, m1
SPILL 2, 14
UNSPILL 5, 15
subps m7, m5, m0
addps m5, SPILLED(8)
mulps m5, [costabs + 16*1]
mulps m7, [costabs + 16*8]
addps m0, SPILLED(8)
mulps m0, [costabs + 16*3]
subps m2, m4, m5
subps m2, m0
SPILL 2, 15
addps m5, m4
addps m5, m7
addps m4, m0
subps m4, m7
SPILL 4, 8
mova m7, [tmpq+4*16]
mova m2, [tmpq+4*12]
addps m0, m7, m2
subps m0, SPILLED(11)
mulps m0, [costabs + 16*2]
addps m4, m7, SPILLED(11)
mulps m4, [costabs]
subps m7, m2
mulps m7, [costabs + 16*7]
addps m2, SPILLED(11)
mulps m2, [costabs + 16*4]
addps m1, m7, [tmpq+4*8]
addps m1, m4
addps m4, m2
subps m4, [tmpq+4*8]
SPILL 4, 11
subps m7, m2
subps m7, [tmpq+4*8]
addps m4, m6, SPILLED(10)
subps m6, SPILLED(10)
addps m2, m5, m1
mulps m2, [costabs + 16*9]
subps m5, m1
mulps m5, [costabs + 16*17]
subps m1, m4, m2
addps m4, m2
mulps m2, m1, [winq+4*36]
addps m2, [bufq+4*36]
mova [outq+1152], m2
mulps m1, [winq+4*32]
addps m1, [bufq+4*32]
mova [outq+1024], m1
mulps m1, m4, [winq+4*116]
mova [bufq+4*36], m1
mulps m4, [winq+4*112]
mova [bufq+4*32], m4
addps m2, m6, m5
subps m6, m5
mulps m1, m6, [winq+4*68]
addps m1, [bufq+4*68]
mova [outq+2176], m1
mulps m6, [winq]
addps m6, [bufq]
mova [outq], m6
mulps m1, m2, [winq+4*148]
mova [bufq+4*68], m1
mulps m2, [winq+4*80]
mova [bufq], m2
addps m5, m3, [tmpq+4*24]
mova m2, [tmpq+4*24]
subps m2, m3
mova m1, SPILLED(9)
subps m1, m0
mulps m1, [costabs + 16*10]
addps m0, SPILLED(9)
mulps m0, [costabs + 16*16]
addps m6, m5, m1
subps m5, m1
mulps m3, m5, [winq+4*40]
addps m3, [bufq+4*40]
mova [outq+1280], m3
mulps m5, [winq+4*28]
addps m5, [bufq+4*28]
mova [outq+896], m5
mulps m1, m6, [winq+4*120]
mova [bufq+4*40], m1
mulps m6, [winq+4*108]
mova [bufq+4*28], m6
addps m1, m2, m0
subps m2, m0
mulps m5, m2, [winq+4*64]
addps m5, [bufq+4*64]
mova [outq+2048], m5
mulps m2, [winq+4*4]
addps m2, [bufq+4*4]
mova [outq+128], m2
mulps m0, m1, [winq+4*144]
mova [bufq+4*64], m0
mulps m1, [winq+4*84]
mova [bufq+4*4], m1
mova m1, [tmpq+4*28]
mova m5, m1
addps m1, SPILLED(13)
subps m5, SPILLED(13)
UNSPILL 3, 15
addps m2, m7, m3
mulps m2, [costabs + 16*11]
subps m3, m7
mulps m3, [costabs + 16*15]
addps m0, m2, m1
subps m1, m2
SWAP m0, m2
mulps m6, m1, [winq+4*44]
addps m6, [bufq+4*44]
mova [outq+1408], m6
mulps m1, [winq+4*24]
addps m1, [bufq+4*24]
mova [outq+768], m1
mulps m0, m2, [winq+4*124]
mova [bufq+4*44], m0
mulps m2, [winq+4*104]
mova [bufq+4*24], m2
addps m0, m5, m3
subps m5, m3
mulps m1, m5, [winq+4*60]
addps m1, [bufq+4*60]
mova [outq+1920], m1
mulps m5, [winq+4*8]
addps m5, [bufq+4*8]
mova [outq+256], m5
mulps m1, m0, [winq+4*140]
mova [bufq+4*60], m1
mulps m0, [winq+4*88]
mova [bufq+4*8], m0
mova m1, [tmpq+4*20]
addps m1, SPILLED(12)
mova m2, [tmpq+4*20]
subps m2, SPILLED(12)
UNSPILL 7, 8
subps m0, m7, SPILLED(11)
addps m7, SPILLED(11)
mulps m4, m7, [costabs + 16*12]
mulps m0, [costabs + 16*14]
addps m5, m1, m4
subps m1, m4
mulps m7, m1, [winq+4*48]
addps m7, [bufq+4*48]
mova [outq+1536], m7
mulps m1, [winq+4*20]
addps m1, [bufq+4*20]
mova [outq+640], m1
mulps m1, m5, [winq+4*128]
mova [bufq+4*48], m1
mulps m5, [winq+4*100]
mova [bufq+4*20], m5
addps m6, m2, m0
subps m2, m0
mulps m1, m2, [winq+4*56]
addps m1, [bufq+4*56]
mova [outq+1792], m1
mulps m2, [winq+4*12]
addps m2, [bufq+4*12]
mova [outq+384], m2
mulps m0, m6, [winq+4*136]
mova [bufq+4*56], m0
mulps m6, [winq+4*92]
mova [bufq+4*12], m6
UNSPILL 0, 14
mulps m0, [costabs + 16*13]
mova m3, [tmpq+4*4]
addps m2, m0, m3
subps m3, m0
mulps m0, m3, [winq+4*52]
addps m0, [bufq+4*52]
mova [outq+1664], m0
mulps m3, [winq+4*16]
addps m3, [bufq+4*16]
mova [outq+512], m3
mulps m0, m2, [winq+4*132]
mova [bufq+4*52], m0
mulps m2, [winq+4*96]
mova [bufq+4*16], m2
RET
%endmacro

INIT_XMM sse
DEFINE_FOUR_IMDCT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif
153
project/jni/ffmpeg/libavcodec/x86/lpc.c
Normal file
@@ -0,0 +1,153 @@
/*
 * MMX optimized LPC DSP utils
 * Copyright (c) 2007 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/asm.h"
#include "libavutil/cpu.h"
#include "libavcodec/lpc.h"

#if HAVE_SSE2_INLINE

static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
double *w_data)
{
double c = 2.0 / (len-1.0);
int n2 = len>>1;
x86_reg i = -n2*sizeof(int32_t);
x86_reg j = n2*sizeof(int32_t);
__asm__ volatile(
"movsd %4, %%xmm7 \n\t"
"movapd "MANGLE(ff_pd_1)", %%xmm6 \n\t"
"movapd "MANGLE(ff_pd_2)", %%xmm5 \n\t"
"movlhps %%xmm7, %%xmm7 \n\t"
"subpd %%xmm5, %%xmm7 \n\t"
"addsd %%xmm6, %%xmm7 \n\t"
"test $1, %5 \n\t"
"jz 2f \n\t"
#define WELCH(MOVPD, offset)\
"1: \n\t"\
"movapd %%xmm7, %%xmm1 \n\t"\
"mulpd %%xmm1, %%xmm1 \n\t"\
"movapd %%xmm6, %%xmm0 \n\t"\
"subpd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
"cvtpi2pd (%3,%0), %%xmm2 \n\t"\
"cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
"mulpd %%xmm0, %%xmm2 \n\t"\
"mulpd %%xmm1, %%xmm3 \n\t"\
"movapd %%xmm2, (%2,%0,2) \n\t"\
MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
"subpd %%xmm5, %%xmm7 \n\t"\
"sub $8, %1 \n\t"\
"add $8, %0 \n\t"\
"jl 1b \n\t"\

WELCH("movupd", -1)
"jmp 3f \n\t"
"2: \n\t"
WELCH("movapd", -2)
"3: \n\t"
:"+&r"(i), "+&r"(j)
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm5", "%xmm6", "%xmm7")
);
#undef WELCH
}
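
/* Scalar reference for the window applied above (a sketch of the math, not
 * the C fallback verbatim): with c = 2.0/(len-1), sample k of the Welch
 * window is 1 - (c*k - 1)^2, an inverted parabola over the block.
 *
 *     static void welch_window_ref(const int32_t *data, int len, double *w)
 *     {
 *         double c = 2.0 / (len - 1.0);
 *         for (int k = 0; k < len; k++) {
 *             double t = c * k - 1.0;          // runs from -1 to +1
 *             w[k] = data[k] * (1.0 - t * t);  // weight times sample
 *         }
 *     }
 *
 * The SSE2 code exploits the symmetry of the window, producing the first
 * and second halves of w_data in the same loop iteration. */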

static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
double *autoc)
{
int j;

if((x86_reg)data & 15)
data++;

for(j=0; j<lag; j+=2){
x86_reg i = -len*sizeof(double);
if(j == lag-2) {
__asm__ volatile(
"movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t"
"movsd "MANGLE(ff_pd_1)", %%xmm2 \n\t"
"1: \n\t"
"movapd (%2,%0), %%xmm3 \n\t"
"movupd -8(%3,%0), %%xmm4 \n\t"
"movapd (%3,%0), %%xmm5 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm5 \n\t"
"mulpd -16(%3,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm5, %%xmm0 \n\t"
"addpd %%xmm3, %%xmm2 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"movhlps %%xmm2, %%xmm5 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"addsd %%xmm5, %%xmm2 \n\t"
"movsd %%xmm0, (%1) \n\t"
"movsd %%xmm1, 8(%1) \n\t"
"movsd %%xmm2, 16(%1) \n\t"
:"+&r"(i)
:"r"(autoc+j), "r"(data+len), "r"(data+len-j)
:"memory"
);
} else {
__asm__ volatile(
"movsd "MANGLE(ff_pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(ff_pd_1)", %%xmm1 \n\t"
"1: \n\t"
"movapd (%3,%0), %%xmm3 \n\t"
"movupd -8(%4,%0), %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd (%4,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm3, %%xmm0 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"movsd %%xmm0, %1 \n\t"
"movsd %%xmm1, %2 \n\t"
:"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
:"r"(data+len), "r"(data+len-j)
);
}
}
}
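
/* Reference for the loops above: every accumulator is seeded with 1.0 (the
 * ff_pd_1 loads), so the result is the biased autocorrelation
 *
 *     autoc[j] = 1.0 + sum_i data[i] * data[i - j]
 *
 * computed two lags per pass, or three in the j == lag-2 tail (hence the
 * third accumulator and the -8/-16 byte offsets selecting the j+1 and j+2
 * lags). */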

#endif /* HAVE_SSE2_INLINE */

av_cold void ff_lpc_init_x86(LPCContext *c)
{
#if HAVE_SSE2_INLINE
int mm_flags = av_get_cpu_flags();

if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
#endif /* HAVE_SSE2_INLINE */
}
130
project/jni/ffmpeg/libavcodec/x86/mathops.h
Normal file
@@ -0,0 +1,130 @@
/*
 * simple math operations
 * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_MATHOPS_H
#define AVCODEC_X86_MATHOPS_H

#include "config.h"
#include "libavutil/common.h"

#if HAVE_INLINE_ASM

#if ARCH_X86_32

#define MULL MULL
static av_always_inline av_const int MULL(int a, int b, unsigned shift)
{
int rt, dummy;
__asm__ (
"imull %3 \n\t"
"shrdl %4, %%edx, %%eax \n\t"
:"=a"(rt), "=d"(dummy)
:"a"(a), "rm"(b), "ci"((uint8_t)shift)
);
return rt;
}
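
/* MULL is ((int64_t)a * b) >> shift done without a 64-bit temporary:
 * imull leaves the full signed product in edx:eax, and shrdl shifts that
 * register pair right into eax. MULL(a, b, 16), for example, is the
 * classic Q16 fixed-point multiply. */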

#define MULH MULH
static av_always_inline av_const int MULH(int a, int b)
{
int rt, dummy;
__asm__ (
"imull %3"
:"=d"(rt), "=a"(dummy)
:"a"(a), "rm"(b)
);
return rt;
}

#define MUL64 MUL64
static av_always_inline av_const int64_t MUL64(int a, int b)
{
int64_t rt;
__asm__ (
"imull %2"
:"=A"(rt)
:"a"(a), "rm"(b)
);
return rt;
}

#endif /* ARCH_X86_32 */

#if HAVE_CMOV
/* median of 3 */
#define mid_pred mid_pred
static inline av_const int mid_pred(int a, int b, int c)
{
int i=b;
__asm__ volatile(
"cmp %2, %1 \n\t"
"cmovg %1, %0 \n\t"
"cmovg %2, %1 \n\t"
"cmp %3, %1 \n\t"
"cmovl %3, %1 \n\t"
"cmp %1, %0 \n\t"
"cmovg %1, %0 \n\t"
:"+&r"(i), "+&r"(a)
:"r"(b), "r"(c)
);
return i;
}
#endif

#if HAVE_CMOV
#define COPY3_IF_LT(x, y, a, b, c, d)\
__asm__ volatile(\
"cmpl %0, %3 \n\t"\
"cmovl %3, %0 \n\t"\
"cmovl %4, %1 \n\t"\
"cmovl %5, %2 \n\t"\
: "+&r" (x), "+&r" (a), "+r" (c)\
: "r" (y), "r" (b), "r" (d)\
);
#endif

#define MASK_ABS(mask, level) \
__asm__ ("cltd \n\t" \
"xorl %1, %0 \n\t" \
"subl %1, %0 \n\t" \
: "+a"(level), "=&d"(mask))

// avoid +32 for shift optimization (gcc should do that ...)
#define NEG_SSR32 NEG_SSR32
static inline int32_t NEG_SSR32( int32_t a, int8_t s){
__asm__ ("sarl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}

#define NEG_USR32 NEG_USR32
static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
__asm__ ("shrl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}

#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_MATHOPS_H */
181
project/jni/ffmpeg/libavcodec/x86/mlpdsp.c
Normal file
@@ -0,0 +1,181 @@
/*
 * MLP DSP functions x86-optimized
 * Copyright (c) 2009 Ramiro Polla
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86/asm.h"
#include "libavcodec/mlpdsp.h"
#include "libavcodec/mlp.h"

#if HAVE_7REGS && HAVE_INLINE_ASM

extern char ff_mlp_firorder_8;
extern char ff_mlp_firorder_7;
extern char ff_mlp_firorder_6;
extern char ff_mlp_firorder_5;
extern char ff_mlp_firorder_4;
extern char ff_mlp_firorder_3;
extern char ff_mlp_firorder_2;
extern char ff_mlp_firorder_1;
extern char ff_mlp_firorder_0;

extern char ff_mlp_iirorder_4;
extern char ff_mlp_iirorder_3;
extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_0;

static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
&ff_mlp_firorder_2, &ff_mlp_firorder_3,
&ff_mlp_firorder_4, &ff_mlp_firorder_5,
&ff_mlp_firorder_6, &ff_mlp_firorder_7,
&ff_mlp_firorder_8 };
static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
&ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
&ff_mlp_iirorder_4 };
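
/* The filters below are fully unrolled as chains of labelled
 * multiply-accumulate steps (ff_mlp_firorder_8 down to _0). These tables
 * let the code "jmp *%5" / "jmp *%6" into the middle of a chain: landing
 * firorder (or iirorder) steps before its end executes exactly that many
 * taps with no per-tap loop test. */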

#if ARCH_X86_64

#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"movslq "offset"+"offs"(%0), %%rax\n\t" \
"movslq "offset"+"offc"(%1), %%rdx\n\t" \
"imul %%rdx, %%rax\n\t" \
"add %%rax, %%rsi\n\t"

#define FIRMULREG(label, offset, firc)\
LABEL_MANGLE(label)": \n\t" \
"movslq "#offset"(%0), %%rax\n\t" \
"imul %"#firc", %%rax\n\t" \
"add %%rax, %%rsi\n\t"

#define CLEAR_ACCUM \
"xor %%rsi, %%rsi\n\t"

#define SHIFT_ACCUM \
"shr %%cl, %%rsi\n\t"

#define ACCUM "%%rdx"
#define RESULT "%%rsi"
#define RESULT32 "%%esi"

#else /* if ARCH_X86_32 */

#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"mov "offset"+"offs"(%0), %%eax\n\t" \
"imull "offset"+"offc"(%1) \n\t" \
"add %%eax , %%esi\n\t" \
"adc %%edx , %%ecx\n\t"

#define FIRMULREG(label, offset, firc) \
MLPMUL(label, #offset, "0", "0")

#define CLEAR_ACCUM \
"xor %%esi, %%esi\n\t" \
"xor %%ecx, %%ecx\n\t"

#define SHIFT_ACCUM \
"mov %%ecx, %%edx\n\t" \
"mov %%esi, %%eax\n\t" \
"movzbl %7 , %%ecx\n\t" \
"shrd %%cl, %%edx, %%eax\n\t" \

#define ACCUM "%%edx"
#define RESULT "%%eax"
#define RESULT32 "%%eax"

#endif /* !ARCH_X86_64 */

#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)

#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)

static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer)
{
const void *firjump = firtable[firorder];
const void *iirjump = iirtable[iirorder];

blocksize = -blocksize;

__asm__ volatile(
"1: \n\t"
CLEAR_ACCUM
"jmp *%5 \n\t"
FIRMUL (ff_mlp_firorder_8, 0x1c )
FIRMUL (ff_mlp_firorder_7, 0x18 )
FIRMUL (ff_mlp_firorder_6, 0x14 )
FIRMUL (ff_mlp_firorder_5, 0x10 )
FIRMUL (ff_mlp_firorder_4, 0x0c )
FIRMULREG(ff_mlp_firorder_3, 0x08,10)
FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
"jmp *%6 \n\t"
IIRMUL (ff_mlp_iirorder_4, 0x0c )
IIRMUL (ff_mlp_iirorder_3, 0x08 )
IIRMUL (ff_mlp_iirorder_2, 0x04 )
IIRMUL (ff_mlp_iirorder_1, 0x00 )
LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
SHIFT_ACCUM
"mov "RESULT" ,"ACCUM" \n\t"
"add (%2) ,"RESULT" \n\t"
"and %4 ,"RESULT" \n\t"
"sub $4 , %0 \n\t"
"mov "RESULT32", (%0) \n\t"
"mov "RESULT32", (%2) \n\t"
"add $"BINC" , %2 \n\t"
"sub "ACCUM" ,"RESULT" \n\t"
"mov "RESULT32","IOFFS"(%0) \n\t"
"incl %3 \n\t"
"js 1b \n\t"
: /* 0*/"+r"(state),
/* 1*/"+r"(coeff),
/* 2*/"+r"(sample_buffer),
#if ARCH_X86_64
/* 3*/"+r"(blocksize)
: /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
/* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
, /* 8*/"r"((int64_t)coeff[0])
, /* 9*/"r"((int64_t)coeff[1])
, /*10*/"r"((int64_t)coeff[2])
: "rax", "rdx", "rsi"
#else /* ARCH_X86_32 */
/* 3*/"+m"(blocksize)
: /* 4*/"m"( mask), /* 5*/"m"(firjump),
/* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
: "eax", "edx", "esi", "ecx"
#endif /* !ARCH_X86_64 */
);
}

#endif /* HAVE_7REGS && HAVE_INLINE_ASM */

void ff_mlpdsp_init_x86(MLPDSPContext *c)
{
#if HAVE_7REGS && HAVE_INLINE_ASM
c->mlp_filter_channel = mlp_filter_channel_x86;
#endif
}
473
project/jni/ffmpeg/libavcodec/x86/motion_est.c
Normal file
@@ -0,0 +1,473 @@
/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"

#if HAVE_INLINE_ASM

DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};

DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;

static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%"REG_a"), %%mm4 \n\t"
"add %3, %%"REG_a" \n\t"
"psubusb %%mm0, %%mm2 \n\t"
"psubusb %%mm4, %%mm0 \n\t"
"movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"movq (%2, %%"REG_a"), %%mm5 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %3, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}
|
||||
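/* A rough scalar sketch of what sad8_1_mmx computes (illustration only, not
 * part of the build): plain MMX has no unsigned absolute-difference
 * instruction, so the loop forms |a - b| as (a SUBUS b) OR (b SUBUS a) with
 * psubusb/por, then widens the bytes against the zeroed %mm7 and keeps eight
 * 16-bit partial sums in %mm6. It assumes the caller (the PIX_SAD wrappers
 * below) has already zeroed %mm6 and %mm7.
 *
 *     int sad8_ref(const uint8_t *blk1, const uint8_t *blk2,
 *                  int stride, int h)
 *     {
 *         int sum = 0;
 *         for (int y = 0; y < h; y++)
 *             for (int x = 0; x < 8; x++)
 *                 sum += abs(blk1[y * stride + x] - blk2[y * stride + x]);
 *         return sum;
 *     }
 */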
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}

static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
int ret;
__asm__ volatile(
"pxor %%xmm2, %%xmm2 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movdqu (%1), %%xmm0 \n\t"
"movdqu (%1, %4), %%xmm1 \n\t"
"psadbw (%2), %%xmm0 \n\t"
"psadbw (%2, %4), %%xmm1 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"lea (%1,%4,2), %1 \n\t"
"lea (%2,%4,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
"movhlps %%xmm2, %%xmm0 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"movd %%xmm2, %3 \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
: "r" ((x86_reg)stride)
);
return ret;
}

static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"pavgb 1(%1), %%mm0 \n\t"
"pavgb 1(%1, %3), %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}

static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"add %3, %1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"pavgb %%mm1, %%mm0 \n\t"
"pavgb %%mm2, %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}

static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
"movq "MANGLE(bone)", %%mm5 \n\t"
"movq (%1), %%mm0 \n\t"
"pavgb 1(%1), %%mm0 \n\t"
"add %3, %1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1,%3), %%mm2 \n\t"
"pavgb 1(%1), %%mm1 \n\t"
"pavgb 1(%1,%3), %%mm2 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"pavgb %%mm1, %%mm0 \n\t"
"pavgb %%mm2, %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2,%3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}

static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"movq (%3, %%"REG_a"), %%mm4 \n\t"
"movq (%3, %%"REG_a"), %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"psrlw $1, %%mm1 \n\t"
"psrlw $1, %%mm3 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm2, %%mm1 \n\t"
"por %%mm4, %%mm1 \n\t"
"movq %%mm1, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}

static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq 1(%2, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm5, %%mm0 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"movq (%3, %%"REG_a"), %%mm4 \n\t"
"movq (%3, %%"REG_a"), %%mm5 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"psubusb %%mm0, %%mm4 \n\t"
"psubusb %%mm5, %%mm0 \n\t"
"por %%mm4, %%mm0 \n\t"
"movq %%mm0, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm3, %%mm1 \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}

static inline int sum_mmx(void)
{
int ret;
__asm__ volatile(
"movq %%mm6, %%mm0 \n\t"
"psrlq $32, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psrlq $16, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
: "=r" (ret)
);
return ret&0xFFFF;
}

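/* sum_mmx() above folds the four 16-bit lanes of the %mm6 accumulator into
 * one scalar: shift right by 32 bits and add, then by 16 and add, so the low
 * word ends up holding lane0+lane1+lane2+lane3, and the final &0xFFFF
 * discards the garbage in the upper lanes. With MMXEXT (sum_mmxext below)
 * psadbw already left a single scalar sum in %mm6, so a plain movd suffices.
 * Scalar picture (illustration only):
 *
 *     uint16_t v[4];   // the four lanes of mm6
 *     int ret = (uint16_t)(v[0] + v[1] + v[2] + v[3]);
 */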
static inline int sum_mmxext(void)
{
int ret;
__asm__ volatile(
"movd %%mm6, %0 \n\t"
: "=r" (ret)
);
return ret;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
}


#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t":);\
\
sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
::);\
\
sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t":);\
\
sad8_1_ ## suf(blk1 , blk2 , stride, h);\
sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
::);\
\
sad8_4_ ## suf(blk1 , blk2 , stride, h);\
sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\

PIX_SAD(mmx)
PIX_SAD(mmxext)

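/* Each PIX_SAD(suf) instantiation above expands into eight entry points
 * (sad8/sad16 for the plain, x2, y2 and xy2 cases). The wrappers zero %mm7
 * (the unpack constant) and %mm6 (the accumulator), preload round_tab[1]
 * into %mm5 where half-pel averaging needs rounding, call the matching
 * sad8_*_<suf> kernel -- twice, with an offset of 8, for the 16-wide
 * versions -- and return sum_<suf>(). */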
#endif /* HAVE_INLINE_ASM */

void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
#if HAVE_INLINE_ASM
int mm_flags = av_get_cpu_flags();

if (mm_flags & AV_CPU_FLAG_MMX) {
c->pix_abs[0][0] = sad16_mmx;
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
c->pix_abs[0][3] = sad16_xy2_mmx;
c->pix_abs[1][0] = sad8_mmx;
c->pix_abs[1][1] = sad8_x2_mmx;
c->pix_abs[1][2] = sad8_y2_mmx;
c->pix_abs[1][3] = sad8_xy2_mmx;

c->sad[0]= sad16_mmx;
c->sad[1]= sad8_mmx;
}
if (mm_flags & AV_CPU_FLAG_MMXEXT) {
c->pix_abs[0][0] = sad16_mmxext;
c->pix_abs[1][0] = sad8_mmxext;

c->sad[0] = sad16_mmxext;
c->sad[1] = sad8_mmxext;

if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->pix_abs[0][1] = sad16_x2_mmxext;
c->pix_abs[0][2] = sad16_y2_mmxext;
c->pix_abs[0][3] = sad16_xy2_mmxext;
c->pix_abs[1][1] = sad8_x2_mmxext;
c->pix_abs[1][2] = sad8_y2_mmxext;
c->pix_abs[1][3] = sad8_xy2_mmxext;
}
}
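/* The guard below enables the SSE2 16x16 SAD only on CPUs that do not report
 * 3DNow (presumably because the unaligned movdqu path was not a win on those
 * AMD parts) and never for Snow; both exclusions are carried over unchanged
 * from the original code. */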
if ((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0]= sad16_sse2;
}
#endif /* HAVE_INLINE_ASM */
}
272
project/jni/ffmpeg/libavcodec/x86/mpegaudiodec.c
Normal file
@@ -0,0 +1,272 @@
/*
* MMX optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegaudiodsp.h"

#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

DECL(sse)
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)

void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
float *tmpbuf);

DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_SSE2_INLINE

#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

#define SUM8(op, sum, w, p) \
{ \
op(sum, (w)[0 * 64], (p)[0 * 64]); \
op(sum, (w)[1 * 64], (p)[1 * 64]); \
op(sum, (w)[2 * 64], (p)[2 * 64]); \
op(sum, (w)[3 * 64], (p)[3 * 64]); \
op(sum, (w)[4 * 64], (p)[4 * 64]); \
op(sum, (w)[5 * 64], (p)[5 * 64]); \
op(sum, (w)[6 * 64], (p)[6 * 64]); \
op(sum, (w)[7 * 64], (p)[7 * 64]); \
}

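/* SUM8 above is the scalar 8-tap kernel of the MPEG audio synthesis
 * filterbank: the window w and the sample buffer p are organized in 32
 * interleaved phases, so consecutive taps of one phase sit 64 floats apart,
 * hence the (w)[k * 64] / (p)[k * 64] strides. MACS accumulates a
 * multiply-add, MLSS a multiply-subtract. */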
static void apply_window(const float *buf, const float *win1,
const float *win2, float *sum1, float *sum2, int len)
{
x86_reg count = - 4*len;
const float *win1a = win1+len;
const float *win2a = win2+len;
const float *bufa = buf+len;
float *sum1a = sum1+len;
float *sum2a = sum2+len;


#define MULT(a, b) \
"movaps " #a "(%1,%0), %%xmm1 \n\t" \
"movaps " #a "(%3,%0), %%xmm2 \n\t" \
"mulps %%xmm2, %%xmm1 \n\t" \
"subps %%xmm1, %%xmm0 \n\t" \
"mulps " #b "(%2,%0), %%xmm2 \n\t" \
"subps %%xmm2, %%xmm4 \n\t" \

__asm__ volatile(
"1: \n\t"
"xorps %%xmm0, %%xmm0 \n\t"
"xorps %%xmm4, %%xmm4 \n\t"

MULT( 0, 0)
MULT( 256, 64)
MULT( 512, 128)
MULT( 768, 192)
MULT(1024, 256)
MULT(1280, 320)
MULT(1536, 384)
MULT(1792, 448)

"movaps %%xmm0, (%4,%0) \n\t"
"movaps %%xmm4, (%5,%0) \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
:"+&r"(count)
:"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
);

#undef MULT
}

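/* apply_window() above runs the same 8-tap dot products with SSE, four
 * phases at a time: each MULT(a, b) step handles one tap, loading a 16-byte
 * slice of the first window (%1) and of the sample buffer (%3), and
 * subtracting the products from the %xmm0 (sum1) and %xmm4 (sum2)
 * accumulators -- the sums are built negated and compensated by the caller.
 * count advances 16 bytes (4 floats) per iteration until it reaches zero. */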
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
int incr)
{
LOCAL_ALIGNED_16(float, suma, [17]);
LOCAL_ALIGNED_16(float, sumb, [17]);
LOCAL_ALIGNED_16(float, sumc, [17]);
LOCAL_ALIGNED_16(float, sumd, [17]);

float sum;

/* copy to avoid wrap */
__asm__ volatile(
"movaps 0(%0), %%xmm0 \n\t" \
"movaps 16(%0), %%xmm1 \n\t" \
"movaps 32(%0), %%xmm2 \n\t" \
"movaps 48(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 0(%1) \n\t" \
"movaps %%xmm1, 16(%1) \n\t" \
"movaps %%xmm2, 32(%1) \n\t" \
"movaps %%xmm3, 48(%1) \n\t" \
"movaps 64(%0), %%xmm0 \n\t" \
"movaps 80(%0), %%xmm1 \n\t" \
"movaps 96(%0), %%xmm2 \n\t" \
"movaps 112(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 64(%1) \n\t" \
"movaps %%xmm1, 80(%1) \n\t" \
"movaps %%xmm2, 96(%1) \n\t" \
"movaps %%xmm3, 112(%1) \n\t"
::"r"(in), "r"(in+512)
:"memory"
);

apply_window(in + 16, win , win + 512, suma, sumc, 16);
apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

SUM8(MACS, suma[0], win + 32, in + 48);

sumc[ 0] = 0;
sumb[16] = 0;
sumd[16] = 0;

#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
"movups " #sumd "(%4), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"subps " #suma "(%1), %%xmm0 \n\t" \
"movaps %%xmm0," #out1 "(%0) \n\t" \
\
"movups " #sumc "(%3), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"addps " #sumb "(%2), %%xmm0 \n\t" \
"movaps %%xmm0," #out2 "(%0) \n\t"

if (incr == 1) {
__asm__ volatile(
SUMS( 0, 48, 4, 52, 0, 112)
SUMS(16, 32, 20, 36, 16, 96)
SUMS(32, 16, 36, 20, 32, 80)
SUMS(48, 0, 52, 4, 48, 64)

:"+&r"(out)
:"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
:"memory"
);
out += 16*incr;
} else {
int j;
float *out2 = out + 32 * incr;
out[0 ] = -suma[ 0];
out += incr;
out2 -= incr;
for(j=1;j<16;j++) {
*out = -suma[ j] + sumd[16-j];
*out2 = sumb[16-j] + sumc[ j];
out += incr;
out2 -= incr;
}
}

sum = 0;
SUM8(MLSS, sum, win + 16 + 32, in + 32);
*out = sum;
}

#endif /* HAVE_SSE2_INLINE */

#if HAVE_YASM
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
int count, int switch_point, int block_type) \
{ \
int align_end = count - (count & 3); \
int j; \
for (j = 0; j < align_end; j+= 4) { \
LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
in += 4*18; \
buf += 4*18; \
out += 4; \
} \
for (; j < count; j++) { \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
int win_idx = (switch_point && j < 2) ? 0 : block_type; \
float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
\
ff_imdct36_float_ ## CPU1(out, buf, in, win); \
\
in += 18; \
buf++; \
out++; \
} \
}

#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */

void ff_mpadsp_init_x86(MPADSPContext *s)
{
int mm_flags = av_get_cpu_flags();

int i, j;
for (j = 0; j < 4; j++) {
for (i = 0; i < 40; i ++) {
mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
}
}

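/* The loop above interleaves the scalar window tables into mdct_win_sse so
 * the four-block SSE/AVX IMDCT can fetch matching coefficients for four
 * consecutive blocks with one aligned load: plane [0] alternates the normal
 * window and its +4 (odd-block) variant, while plane [1] is the switch-point
 * layout that substitutes window 0/4 for the first two blocks. */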
#if HAVE_SSE2_INLINE
if (mm_flags & AV_CPU_FLAG_SSE2) {
s->apply_window_float = apply_window_mp3;
}
#endif /* HAVE_SSE2_INLINE */

#if HAVE_YASM
if (EXTERNAL_AVX(mm_flags)) {
s->imdct36_blocks_float = imdct36_blocks_avx;
} else if (EXTERNAL_SSSE3(mm_flags)) {
s->imdct36_blocks_float = imdct36_blocks_ssse3;
} else if (EXTERNAL_SSE3(mm_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse3;
} else if (EXTERNAL_SSE2(mm_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
} else if (EXTERNAL_SSE(mm_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
#endif /* HAVE_YASM */
}
600
project/jni/ffmpeg/libavcodec/x86/mpegvideo.c
Normal file
@@ -0,0 +1,600 @@
/*
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
* h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"

#if HAVE_INLINE_ASM

static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
x86_reg level, qmul, qadd, nCoeffs;

qmul = qscale << 1;

av_assert2(s->block_last_index[n]>=0 || s->h263_aic);

if (!s->h263_aic) {
if (n < 4)
level = block[0] * s->y_dc_scale;
else
level = block[0] * s->c_dc_scale;
qadd = (qscale - 1) | 1;
}else{
qadd = 0;
level= block[0];
}
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];

__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"

"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"

"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"

"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0

"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"

"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"

"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"

"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0

"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"

"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"

"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
block[0]= level;
}

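/* Scalar sketch of the H.263 dequantizer implemented above (illustration
 * only, not part of the build): every nonzero coefficient is scaled by qmul
 * and pushed away from zero by qadd, with the sign and the keep-zero cases
 * handled by the pcmpgtw/pxor mask tricks.
 *
 *     for (i = 0; i <= nCoeffs; i++) {
 *         int level = block[i];
 *         if (level > 0)      level = level * qmul + qadd;
 *         else if (level < 0) level = level * qmul - qadd;
 *         block[i] = level;
 *     }
 */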
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
x86_reg qmul, qadd, nCoeffs;

qmul = qscale << 1;
qadd = (qscale - 1) | 1;

av_assert2(s->block_last_index[n]>=0 || s->h263_aic);

nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];

__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"

"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"

"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"

"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0

"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"

"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"

"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"

"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0

"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"

"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"

"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
}

/*
We can assume that the result of the two 16-bit multiplications fits in
16 bits, so only the PMULLW instruction is needed here and a full-width
multiplication can be avoided.
=====================================================
Full formula for the multiplication of two integers, each represented
as high:low 16-bit words:
input: value1 = high1:low1
value2 = high2:low2
output: value3 = value1*value2
value3 = high3:low3 (on overflow: wraps around modulo 2^32)
This means that for 0x123456 * 0x123456 the exact result is 0x766cb0ce4,
but this algorithm computes only 0x66cb0ce4, limited by the 16-bit size
of the operands.
---------------------------------
tlow1 = high1*low2
tlow2 = high2*low1
tlow1 = tlow1 + tlow2
high3:low3 = low1*low2
high3 += tlow1
*/
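/* The same decomposition in C (illustration only, not part of the build):
 * a 32x32 multiply kept modulo 2^32, built from 16-bit halves exactly as
 * described above.
 *
 *     uint32_t mul32(uint32_t v1, uint32_t v2)
 *     {
 *         uint16_t high1 = v1 >> 16, low1 = v1 & 0xFFFF;
 *         uint16_t high2 = v2 >> 16, low2 = v2 & 0xFFFF;
 *         uint32_t t = (uint32_t)high1 * low2 + (uint32_t)high2 * low1;
 *         return (uint32_t)low1 * low2 + (t << 16);
 *     }
 */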
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;

av_assert2(s->block_last_index[n]>=0);

nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;

if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
/* XXX: only mpeg1 */
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"

"add $16, %%"REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
block[0]= block0;
}

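/* Scalar sketch of the MPEG-1 intra path above (illustration only): the
 * psubw/por pair against %mm7 (a vector of ones) is the usual "round toward
 * zero and force odd" step after the 3-bit shift, and the pcmpeqw/pandn
 * mask keeps coefficients that were zero at zero.
 *
 *     int level = block[i];
 *     int q     = qscale * quant_matrix[i];
 *     level     = (abs(level) * q) >> 3;
 *     level     = (level - 1) | 1;                 // make odd
 *     block[i]  = block[i] < 0 ? -level : level;
 */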
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;

av_assert2(s->block_last_index[n]>=0);

nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;

quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"

"add $16, %%"REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
}

static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;

av_assert2(s->block_last_index[n]>=0);

if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];

if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"

"add $16, %%"REG_a" \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
block[0]= block0;
//Note, we do not do mismatch control for intra as errors cannot accumulate
}

static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;

av_assert2(s->block_last_index[n]>=0);

if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];

quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlq $48, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psrlw $4, %%mm0 \n\t"
"psrlw $4, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"

"add $16, %%"REG_a" \n\t"
"jng 1b \n\t"
"movd 124(%0, %3), %%mm0 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $16, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"pslld $31, %%mm7 \n\t"
"psrlq $15, %%mm7 \n\t"
"pxor %%mm7, %%mm0 \n\t"
"movd %%mm0, 124(%0, %3) \n\t"

::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
: "%"REG_a, "memory"
);
}

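/* The epilogue of the inter MPEG-2 routine above (after the loop) is the
 * mismatch control the standard requires: the pxor accumulation into %mm7
 * tracks the XOR of every dequantized coefficient, the psrlq/pxor ladder
 * folds that down to a single parity bit, and the final movd/pxor toggles
 * the least significant bit of the last coefficient when the coefficient
 * sum comes out even (%mm7 is pre-seeded to invert the parity test). */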
static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];

s->dct_count[intra]++;

__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"movq (%0), %%mm2 \n\t"
"movq 8(%0), %%mm3 \n\t"
"pcmpgtw %%mm2, %%mm0 \n\t"
"pcmpgtw %%mm3, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psubusw (%2), %%mm2 \n\t"
"psubusw 8(%2), %%mm3 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm3, 8(%0) \n\t"
"movq %%mm4, %%mm2 \n\t"
"movq %%mm5, %%mm3 \n\t"
"punpcklwd %%mm7, %%mm4 \n\t"
"punpckhwd %%mm7, %%mm2 \n\t"
"punpcklwd %%mm7, %%mm5 \n\t"
"punpckhwd %%mm7, %%mm3 \n\t"
"paddd (%1), %%mm4 \n\t"
"paddd 8(%1), %%mm2 \n\t"
"paddd 16(%1), %%mm5 \n\t"
"paddd 24(%1), %%mm3 \n\t"
"movq %%mm4, (%1) \n\t"
"movq %%mm2, 8(%1) \n\t"
"movq %%mm5, 16(%1) \n\t"
"movq %%mm3, 24(%1) \n\t"
"add $16, %0 \n\t"
"add $32, %1 \n\t"
"add $16, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
);
}

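/* denoise_dct gathers per-position statistics while it filters. Scalar
 * sketch of the MMX loop above and the SSE2 variant below (illustration
 * only): the absolute value is accumulated into dct_error_sum before the
 * saturating subtraction (psubusw) of the learned offset.
 *
 *     for (i = 0; i < 64; i++) {
 *         int level = abs(block[i]);
 *         sum[i]   += level;
 *         level     = FFMAX(level - offset[i], 0);
 *         block[i]  = block[i] < 0 ? -level : level;
 *     }
 */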
static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];

s->dct_count[intra]++;

__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"1: \n\t"
"pxor %%xmm0, %%xmm0 \n\t"
"pxor %%xmm1, %%xmm1 \n\t"
"movdqa (%0), %%xmm2 \n\t"
"movdqa 16(%0), %%xmm3 \n\t"
"pcmpgtw %%xmm2, %%xmm0 \n\t"
"pcmpgtw %%xmm3, %%xmm1 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, %%xmm4 \n\t"
"movdqa %%xmm3, %%xmm5 \n\t"
"psubusw (%2), %%xmm2 \n\t"
"psubusw 16(%2), %%xmm3 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm3, 16(%0) \n\t"
"movdqa %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm5, %%xmm0 \n\t"
"punpcklwd %%xmm7, %%xmm4 \n\t"
"punpckhwd %%xmm7, %%xmm6 \n\t"
"punpcklwd %%xmm7, %%xmm5 \n\t"
"punpckhwd %%xmm7, %%xmm0 \n\t"
"paddd (%1), %%xmm4 \n\t"
"paddd 16(%1), %%xmm6 \n\t"
"paddd 32(%1), %%xmm5 \n\t"
"paddd 48(%1), %%xmm0 \n\t"
"movdqa %%xmm4, (%1) \n\t"
"movdqa %%xmm6, 16(%1) \n\t"
"movdqa %%xmm5, 32(%1) \n\t"
"movdqa %%xmm0, 48(%1) \n\t"
"add $32, %0 \n\t"
"add $64, %1 \n\t"
"add $32, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}

#endif /* HAVE_INLINE_ASM */

void ff_MPV_common_init_x86(MpegEncContext *s)
{
#if HAVE_INLINE_ASM
int mm_flags = av_get_cpu_flags();

if (mm_flags & AV_CPU_FLAG_MMX) {
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
if(!(s->flags & CODEC_FLAG_BITEXACT))
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;

if (mm_flags & AV_CPU_FLAG_SSE2) {
s->denoise_dct= denoise_dct_sse2;
} else {
s->denoise_dct= denoise_dct_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
}
106
project/jni/ffmpeg/libavcodec/x86/mpegvideoenc.c
Normal file
@@ -0,0 +1,106 @@
/*
* The simplest mpeg encoder (well, it was the simplest!)
* Copyright (c) 2000,2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_mmx.h"

extern uint16_t ff_inv_zigzag_direct16[64];

#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#define RENAME(a) a ## _MMX
#define RENAMEl(a) a ## _mmx
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMX_INLINE */

#if HAVE_MMXEXT_INLINE
#undef COMPILE_TEMPLATE_SSSE3
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _MMXEXT
#define RENAMEl(a) a ## _mmxext
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMXEXT_INLINE */

#if HAVE_SSE2_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSE2
#define RENAMEl(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSSE3_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 1
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSSE3
#define RENAMEl(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */

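/* The four template inclusions above compile dct_quantize once per
 * instruction set: each pass redefines the COMPILE_TEMPLATE_* switches and
 * the RENAME/RENAMEl token-pasting macros, so mpegvideoenc_template.c emits
 * dct_quantize_MMX, _MMXEXT, _SSE2 and _SSSE3. Note the SSSE3 build still
 * links against the _sse2 fdct, hence RENAMEl(a) a ## _sse2 in that pass. */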
void ff_dct_encode_init_x86(MpegEncContext *s)
{
int mm_flags = av_get_cpu_flags();
const int dct_algo = s->avctx->dct_algo;

if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
#if HAVE_MMX_INLINE
if (INLINE_MMX(mm_flags))
s->dct_quantize = dct_quantize_MMX;
#endif
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(mm_flags))
s->dct_quantize = dct_quantize_MMXEXT;
#endif
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(mm_flags))
s->dct_quantize = dct_quantize_SSE2;
#endif
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(mm_flags))
s->dct_quantize = dct_quantize_SSSE3;
#endif
}
}
364
project/jni/ffmpeg/libavcodec/x86/mpegvideoenc_template.c
Normal file
@@ -0,0 +1,364 @@
/*
* MPEG video MMX templates
*
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#undef MMREG_WIDTH
#undef MM
#undef MOVQ
#undef SPREADW
#undef PMAXW
#undef PMAX
#undef SAVE_SIGN
#undef RESTORE_SIGN

#if COMPILE_TEMPLATE_SSE2
#define MMREG_WIDTH "16"
#define MM "%%xmm"
#define MOVQ "movdqa"
#define SPREADW(a) \
"pshuflw $0, "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"movhlps "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define MMREG_WIDTH "8"
#define MM "%%mm"
#define MOVQ "movq"
#if COMPILE_TEMPLATE_MMXEXT
#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"pshufw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshufw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define SPREADW(a) \
"punpcklwd "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) \
"psubusw "a", "b" \n\t"\
"paddw "a", "b" \n\t"
#define PMAX(a,b) \
"movq "a", "b" \n\t"\
"psrlq $32, "a" \n\t"\
PMAXW(b, a)\
"movq "a", "b" \n\t"\
"psrlq $16, "a" \n\t"\
PMAXW(b, a)

#endif
#endif

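/* On plain MMX there is no pmaxsw, so the PMAXW fallback above synthesizes
 * an unsigned max from saturating arithmetic. Illustration of the two-step
 * trick (for the small nonnegative indices tracked here):
 *
 *     t = (b > a) ? b - a : 0;   // psubusw
 *     b = t + a;                 // paddw  -> max(a, b)
 */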
#if COMPILE_TEMPLATE_SSSE3
#define SAVE_SIGN(a,b) \
"movdqa "b", "a" \n\t"\
"pabsw "b", "b" \n\t"
#define RESTORE_SIGN(a,b) \
"psignw "a", "b" \n\t"
#else
#define SAVE_SIGN(a,b) \
"pxor "a", "a" \n\t"\
"pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" /* ABS(block[i]) */
#define RESTORE_SIGN(a,b) \
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
#endif

static int RENAME(dct_quantize)(MpegEncContext *s,
DCTELEM *block, int n,
int qscale, int *overflow)
{
x86_reg last_non_zero_p1;
int level=0, q; //=0 is because gcc says uninitialized ...
const uint16_t *qmat, *bias;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);

av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?

//s->fdct (block);
RENAMEl(ff_fdct) (block); //cannot be anything else ...

if(s->dct_error_sum)
s->denoise_dct(s, block);

if (s->mb_intra) {
int dummy;
if (n < 4){
q = s->y_dc_scale;
bias = s->q_intra_matrix16[qscale][1];
qmat = s->q_intra_matrix16[qscale][0];
}else{
q = s->c_dc_scale;
bias = s->q_chroma_intra_matrix16[qscale][1];
qmat = s->q_chroma_intra_matrix16[qscale][0];
}
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
__asm__ volatile (
"mul %%ecx \n\t"
: "=d" (level), "=a"(dummy)
: "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
);
} else
/* For AIC we skip quant/dequant of INTRADC */
level = (block[0] + 4)>>3;

block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
} else {
last_non_zero_p1 = 0;
bias = s->q_inter_matrix16[qscale][1];
qmat = s->q_inter_matrix16[qscale][0];
}

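/* The intra-DC quantization just above divides by 2*q without a div
 * instruction: "mul %%ecx" multiplies (block[0]>>2) + q by ff_inverse[2*q],
 * a precomputed ~2^32/(2*q) reciprocal, and keeps the high 32 bits from
 * %edx -- i.e. level = ((block[0]>>2) + q) / (2*q), a round-to-nearest
 * fixed-point division. */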
if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){

__asm__ volatile(
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
MOVQ" (%2), "MM"5 \n\t" // qmat[0]
"pxor "MM"6, "MM"6 \n\t"
"psubw (%3), "MM"6 \n\t" // -bias[0]
"mov $-128, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
"psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"REG_a" \n\t"
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}else{ // FMT_H263
__asm__ volatile(
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
"mov $-128, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0]
"paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"REG_a" \n\t"
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
__asm__ volatile(
"movd %1, "MM"1 \n\t" // max_qcoeff
SPREADW(MM"1")
"psubusw "MM"1, "MM"4 \n\t"
"packuswb "MM"4, "MM"4 \n\t"
#if COMPILE_TEMPLATE_SSE2
"packuswb "MM"4, "MM"4 \n\t"
#endif
"movd "MM"4, %0 \n\t" // *overflow
: "=g" (*overflow)
: "g" (s->max_qcoeff)
);

if(s->mb_intra) block[0]= level;
else block[0]= temp_block[0];

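/* What follows is the inverse zigzag scatter, fully unrolled: the asm above
 * stored the quantized coefficients in scan order in temp_block, and these
 * assignments copy them back into block[] in the layout expected by the
 * selected IDCT (simple, libmpeg2 or plain order). The staggered
 * "if(last_non_zero_p1 <= N) goto end;" checks skip the tail once every
 * remaining coefficient is known to be zero. */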
if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
if(last_non_zero_p1 <= 1) goto end;
block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
block[0x20] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
block[0x09] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
block[0x0C] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
if(last_non_zero_p1 <= 1) goto end;
block[0x04] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
block[0x05] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1C] = temp_block[0x19];
block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else{
if(last_non_zero_p1 <= 1) goto end;
block[0x01] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
block[0x03] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x19] = temp_block[0x19];
block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
|
||||
if(last_non_zero_p1 <= 16) goto end;
|
||||
block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
|
||||
block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
|
||||
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
|
||||
block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
|
||||
if(last_non_zero_p1 <= 24) goto end;
|
||||
block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
|
||||
block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
|
||||
block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
|
||||
block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
|
||||
if(last_non_zero_p1 <= 32) goto end;
|
||||
block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
|
||||
block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
|
||||
block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
|
||||
block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
|
||||
if(last_non_zero_p1 <= 40) goto end;
|
||||
block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
|
||||
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
|
||||
block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
|
||||
block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
|
||||
if(last_non_zero_p1 <= 48) goto end;
|
||||
block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
|
||||
block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
|
||||
block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
|
||||
block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
|
||||
if(last_non_zero_p1 <= 56) goto end;
|
||||
block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
|
||||
block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
|
||||
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
|
||||
block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
|
||||
}
|
||||
end:
|
||||
return last_non_zero_p1 - 1;
|
||||
}
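
The three unrolled branches above all perform the same copy, specialized for one IDCT coefficient permutation each: temp_block holds the quantized coefficients at their raster positions, and they are written back to block[] at the position the chosen IDCT expects, walking the positions in zigzag scan order so the copy can stop early at last_non_zero_p1. A minimal scalar sketch of that idea (the table names zigzag and perm are illustrative, not the template's actual fields; the unrolled code also copies in groups, so it may write a few positions past last_non_zero_p1 within the final group):

static void depermute_sketch(int16_t *block, const int16_t *temp_block,
                             const uint8_t *zigzag, const uint8_t *perm,
                             int last_non_zero_p1)
{
    int i;
    for (i = 1; i < last_non_zero_p1; i++) {
        int raster = zigzag[i];          /* raster index of the i-th scanned coeff */
        block[perm[raster]] = temp_block[raster];
    }
}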
|
||||
173
project/jni/ffmpeg/libavcodec/x86/pngdsp.asm
Normal file
@@ -0,0 +1,173 @@
|
||||
;******************************************************************************
|
||||
;* x86 optimizations for PNG decoding
|
||||
;*
|
||||
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
;*
|
||||
;* Libav is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* Libav is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with Libav; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
cextern pw_255
|
||||
|
||||
SECTION_TEXT
|
||||
|
||||
; %1 = nr. of xmm registers used
|
||||
%macro ADD_BYTES_FN 1
|
||||
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
|
||||
%if ARCH_X86_64
|
||||
movsxd waq, wad
|
||||
%endif
|
||||
xor iq, iq
|
||||
|
||||
; vector loop
|
||||
mov wq, waq
|
||||
and waq, ~(mmsize*2-1)
|
||||
jmp .end_v
|
||||
.loop_v:
|
||||
mova m0, [src1q+iq]
|
||||
mova m1, [src1q+iq+mmsize]
|
||||
paddb m0, [src2q+iq]
|
||||
paddb m1, [src2q+iq+mmsize]
|
||||
mova [dstq+iq ], m0
|
||||
mova [dstq+iq+mmsize], m1
|
||||
add iq, mmsize*2
|
||||
.end_v:
|
||||
cmp iq, waq
|
||||
jl .loop_v
|
||||
|
||||
%if mmsize == 16
|
||||
; 8-byte mmx loop for the remainder
|
||||
mov waq, wq
|
||||
and waq, ~7
|
||||
jmp .end_l
|
||||
.loop_l:
|
||||
movq mm0, [src1q+iq]
|
||||
paddb mm0, [src2q+iq]
|
||||
movq [dstq+iq ], mm0
|
||||
add iq, 8
|
||||
.end_l:
|
||||
cmp iq, waq
|
||||
jl .loop_l
|
||||
%endif
|
||||
|
||||
; scalar loop for leftover
|
||||
jmp .end_s
|
||||
.loop_s:
|
||||
mov wab, [src1q+iq]
|
||||
add wab, [src2q+iq]
|
||||
mov [dstq+iq], wab
|
||||
inc iq
|
||||
.end_s:
|
||||
cmp iq, wq
|
||||
jl .loop_s
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
ADD_BYTES_FN 0
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
ADD_BYTES_FN 2
|
||||
|
||||
%macro ADD_PAETH_PRED_FN 1
|
||||
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
|
||||
%if ARCH_X86_64
|
||||
movsxd bppq, bppd
|
||||
movsxd wq, wd
|
||||
%endif
|
||||
lea endq, [dstq+wq-(mmsize/2-1)]
|
||||
sub topq, dstq
|
||||
sub srcq, dstq
|
||||
sub dstq, bppq
|
||||
pxor m7, m7
|
||||
|
||||
PUSH dstq
|
||||
lea cntrq, [bppq-1]
|
||||
shr cntrq, 2 + mmsize/16
|
||||
.bpp_loop:
|
||||
lea dstq, [dstq+cntrq*(mmsize/2)]
|
||||
movh m0, [dstq]
|
||||
movh m1, [topq+dstq]
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
add dstq, bppq
|
||||
.loop:
|
||||
mova m2, m1
|
||||
movh m1, [topq+dstq]
|
||||
mova m3, m2
|
||||
punpcklbw m1, m7
|
||||
mova m4, m2
|
||||
psubw m3, m1
|
||||
psubw m4, m0
|
||||
mova m5, m3
|
||||
paddw m5, m4
|
||||
%if cpuflag(ssse3)
|
||||
pabsw m3, m3
|
||||
pabsw m4, m4
|
||||
pabsw m5, m5
|
||||
%else ; !cpuflag(ssse3)
|
||||
psubw m7, m5
|
||||
pmaxsw m5, m7
|
||||
pxor m6, m6
|
||||
pxor m7, m7
|
||||
psubw m6, m3
|
||||
psubw m7, m4
|
||||
pmaxsw m3, m6
|
||||
pmaxsw m4, m7
|
||||
pxor m7, m7
|
||||
%endif ; cpuflag(ssse3)
|
||||
mova m6, m4
|
||||
pminsw m6, m5
|
||||
pcmpgtw m3, m6
|
||||
pcmpgtw m4, m5
|
||||
mova m6, m4
|
||||
pand m4, m3
|
||||
pandn m6, m3
|
||||
pandn m3, m0
|
||||
movh m0, [srcq+dstq]
|
||||
pand m6, m1
|
||||
pand m2, m4
|
||||
punpcklbw m0, m7
|
||||
paddw m0, m6
|
||||
paddw m3, m2
|
||||
paddw m0, m3
|
||||
pand m0, [pw_255]
|
||||
mova m3, m0
|
||||
packuswb m3, m3
|
||||
movh [dstq], m3
|
||||
add dstq, bppq
|
||||
cmp dstq, endq
|
||||
jle .loop
|
||||
|
||||
mov dstq, [rsp]
|
||||
dec cntrq
|
||||
jge .bpp_loop
|
||||
POP dstq
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
ADD_PAETH_PRED_FN 0
|
||||
|
||||
INIT_MMX ssse3
|
||||
ADD_PAETH_PRED_FN 0
|
||||
49
project/jni/ffmpeg/libavcodec/x86/pngdsp_init.c
Normal file
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* x86 PNG optimizations.
|
||||
 * Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/common.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/pngdsp.h"
|
||||
|
||||
void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
|
||||
uint8_t *top, int w, int bpp);
|
||||
void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
|
||||
uint8_t *top, int w, int bpp);
|
||||
void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
|
||||
uint8_t *src2, int w);
|
||||
void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1,
|
||||
uint8_t *src2, int w);
|
||||
|
||||
void ff_pngdsp_init_x86(PNGDSPContext *dsp)
|
||||
{
|
||||
int flags = av_get_cpu_flags();
|
||||
|
||||
#if ARCH_X86_32
|
||||
if (EXTERNAL_MMX(flags))
|
||||
dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
|
||||
#endif
|
||||
if (EXTERNAL_MMXEXT(flags))
|
||||
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
|
||||
if (EXTERNAL_SSE2(flags))
|
||||
dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
|
||||
if (EXTERNAL_SSSE3(flags))
|
||||
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
|
||||
}
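
For reference, the add_png_paeth_prediction kernels declared above implement the standard PNG Paeth filter (RFC 2083): each output byte adds the source byte to whichever of the left, top, or top-left neighbour best predicts it. A scalar sketch (illustrative, not FFmpeg's exact C fallback):

#include <stdlib.h>

static int paeth_predict(int a, int b, int c) /* left, top, top-left */
{
    int p  = a + b - c;
    int pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
    if (pa <= pb && pa <= pc) return a;
    if (pb <= pc)             return b;
    return c;
}
/* per byte: dst[i] = (src[i] + paeth_predict(dst[i-bpp], top[i], top[i-bpp])) & 0xFF */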
|
||||
326
project/jni/ffmpeg/libavcodec/x86/proresdsp.asm
Normal file
@@ -0,0 +1,326 @@
|
||||
;******************************************************************************
|
||||
;* x86-SIMD-optimized IDCT for prores
|
||||
;* this is identical to "simple" IDCT written by Michael Niedermayer
|
||||
;* except for the clip range
|
||||
;*
|
||||
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
;*
|
||||
;* This file is part of FFmpeg.
|
||||
;*
|
||||
;* FFmpeg is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* FFmpeg is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with FFmpeg; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
|
||||
%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
|
||||
%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
|
||||
%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
|
||||
%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
|
||||
%define W6sh2 8867 ; W6 = 35468 = 8867<<2
|
||||
%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
|
||||
|
||||
%if ARCH_X86_64
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
w4_plus_w2: times 4 dw W4sh2, +W2sh2
|
||||
w4_min_w2: times 4 dw W4sh2, -W2sh2
|
||||
w4_plus_w6: times 4 dw W4sh2, +W6sh2
|
||||
w4_min_w6: times 4 dw W4sh2, -W6sh2
|
||||
w1_plus_w3: times 4 dw W1sh2, +W3sh2
|
||||
w3_min_w1: times 4 dw W3sh2, -W1sh2
|
||||
w7_plus_w3: times 4 dw W7sh2, +W3sh2
|
||||
w3_min_w7: times 4 dw W3sh2, -W7sh2
|
||||
w1_plus_w5: times 4 dw W1sh2, +W5sh2
|
||||
w5_min_w1: times 4 dw W5sh2, -W1sh2
|
||||
w5_plus_w7: times 4 dw W5sh2, +W7sh2
|
||||
w7_min_w5: times 4 dw W7sh2, -W5sh2
|
||||
pw_88: times 8 dw 0x2008
|
||||
|
||||
cextern pw_1
|
||||
cextern pw_4
|
||||
cextern pw_512
|
||||
cextern pw_1019
|
||||
|
||||
section .text align=16
|
||||
|
||||
; interleave data while maintaining source
|
||||
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
|
||||
%macro SBUTTERFLY3 5
|
||||
punpckl%1 m%2, m%4, m%5
|
||||
punpckh%1 m%3, m%4, m%5
|
||||
%endmacro
|
||||
|
||||
; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
|
||||
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
|
||||
; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
|
||||
%macro SUMSUB_SHPK 7
|
||||
psubd %3, %1, %5 ; { a0 - b0 }[0-3]
|
||||
psubd %4, %2, %6 ; { a0 - b0 }[4-7]
|
||||
paddd %1, %5 ; { a0 + b0 }[0-3]
|
||||
paddd %2, %6 ; { a0 + b0 }[4-7]
|
||||
psrad %1, %7
|
||||
psrad %2, %7
|
||||
psrad %3, %7
|
||||
psrad %4, %7
|
||||
packssdw %1, %2 ; row[0]
|
||||
packssdw %3, %4 ; row[7]
|
||||
%endmacro
|
||||
|
||||
; %1 = row or col (for rounding variable)
|
||||
; %2 = number of bits to shift at the end
|
||||
%macro IDCT_1D 2
|
||||
; a0 = (W4 * row[0]) + (1 << (15 - 1));
|
||||
; a1 = a0;
|
||||
; a2 = a0;
|
||||
; a3 = a0;
|
||||
; a0 += W2 * row[2];
|
||||
; a1 += W6 * row[2];
|
||||
; a2 -= W6 * row[2];
|
||||
; a3 -= W2 * row[2];
|
||||
%ifidn %1, col
|
||||
paddw m10,[pw_88]
|
||||
%endif
|
||||
%ifidn %1, row
|
||||
paddw m10,[pw_1]
|
||||
%endif
|
||||
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
|
||||
pmaddwd m2, m0, [w4_plus_w6]
|
||||
pmaddwd m3, m1, [w4_plus_w6]
|
||||
pmaddwd m4, m0, [w4_min_w6]
|
||||
pmaddwd m5, m1, [w4_min_w6]
|
||||
pmaddwd m6, m0, [w4_min_w2]
|
||||
pmaddwd m7, m1, [w4_min_w2]
|
||||
pmaddwd m0, [w4_plus_w2]
|
||||
pmaddwd m1, [w4_plus_w2]
|
||||
|
||||
; a0: -1*row[0]-1*row[2]
|
||||
; a1: -1*row[0]
|
||||
; a2: -1*row[0]
|
||||
; a3: -1*row[0]+1*row[2]
|
||||
|
||||
; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
|
||||
; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
|
||||
; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
|
||||
; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
|
||||
SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
|
||||
pmaddwd m10, m8, [w4_plus_w6]
|
||||
pmaddwd m11, m9, [w4_plus_w6]
|
||||
paddd m0, m10 ; a0[0-3]
|
||||
paddd m1, m11 ; a0[4-7]
|
||||
pmaddwd m10, m8, [w4_min_w6]
|
||||
pmaddwd m11, m9, [w4_min_w6]
|
||||
paddd m6, m10 ; a3[0-3]
|
||||
paddd m7, m11 ; a3[4-7]
|
||||
pmaddwd m10, m8, [w4_min_w2]
|
||||
pmaddwd m11, m9, [w4_min_w2]
|
||||
pmaddwd m8, [w4_plus_w2]
|
||||
pmaddwd m9, [w4_plus_w2]
|
||||
psubd m4, m10 ; a2[0-3] intermediate
|
||||
psubd m5, m11 ; a2[4-7] intermediate
|
||||
psubd m2, m8 ; a1[0-3] intermediate
|
||||
psubd m3, m9 ; a1[4-7] intermediate
|
||||
|
||||
; load/store
|
||||
mova [r2+ 0], m0
|
||||
mova [r2+ 32], m2
|
||||
mova [r2+ 64], m4
|
||||
mova [r2+ 96], m6
|
||||
mova m10,[r2+ 16] ; { row[1] }[0-7]
|
||||
mova m8, [r2+ 48] ; { row[3] }[0-7]
|
||||
mova m13,[r2+ 80] ; { row[5] }[0-7]
|
||||
mova m14,[r2+112] ; { row[7] }[0-7]
|
||||
mova [r2+ 16], m1
|
||||
mova [r2+ 48], m3
|
||||
mova [r2+ 80], m5
|
||||
mova [r2+112], m7
|
||||
%ifidn %1, row
|
||||
pmullw m10,[r3+ 16]
|
||||
pmullw m8, [r3+ 48]
|
||||
pmullw m13,[r3+ 80]
|
||||
pmullw m14,[r3+112]
|
||||
%endif
|
||||
|
||||
; b0 = MUL(W1, row[1]);
|
||||
; MAC(b0, W3, row[3]);
|
||||
; b1 = MUL(W3, row[1]);
|
||||
; MAC(b1, -W7, row[3]);
|
||||
; b2 = MUL(W5, row[1]);
|
||||
; MAC(b2, -W1, row[3]);
|
||||
; b3 = MUL(W7, row[1]);
|
||||
; MAC(b3, -W5, row[3]);
|
||||
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
|
||||
pmaddwd m2, m0, [w3_min_w7]
|
||||
pmaddwd m3, m1, [w3_min_w7]
|
||||
pmaddwd m4, m0, [w5_min_w1]
|
||||
pmaddwd m5, m1, [w5_min_w1]
|
||||
pmaddwd m6, m0, [w7_min_w5]
|
||||
pmaddwd m7, m1, [w7_min_w5]
|
||||
pmaddwd m0, [w1_plus_w3]
|
||||
pmaddwd m1, [w1_plus_w3]
|
||||
|
||||
; b0: +1*row[1]+2*row[3]
|
||||
; b1: +2*row[1]-1*row[3]
|
||||
; b2: -1*row[1]-1*row[3]
|
||||
; b3: +1*row[1]+1*row[3]
|
||||
|
||||
; MAC(b0, W5, row[5]);
|
||||
; MAC(b0, W7, row[7]);
|
||||
; MAC(b1, -W1, row[5]);
|
||||
; MAC(b1, -W5, row[7]);
|
||||
; MAC(b2, W7, row[5]);
|
||||
; MAC(b2, W3, row[7]);
|
||||
; MAC(b3, W3, row[5]);
|
||||
; MAC(b3, -W1, row[7]);
|
||||
SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
|
||||
|
||||
; b0: -1*row[5]+1*row[7]
|
||||
; b1: -1*row[5]+1*row[7]
|
||||
; b2: +1*row[5]+2*row[7]
|
||||
; b3: +2*row[5]-1*row[7]
|
||||
|
||||
pmaddwd m10, m8, [w1_plus_w5]
|
||||
pmaddwd m11, m9, [w1_plus_w5]
|
||||
pmaddwd m12, m8, [w5_plus_w7]
|
||||
pmaddwd m13, m9, [w5_plus_w7]
|
||||
psubd m2, m10 ; b1[0-3]
|
||||
psubd m3, m11 ; b1[4-7]
|
||||
paddd m0, m12 ; b0[0-3]
|
||||
paddd m1, m13 ; b0[4-7]
|
||||
pmaddwd m12, m8, [w7_plus_w3]
|
||||
pmaddwd m13, m9, [w7_plus_w3]
|
||||
pmaddwd m8, [w3_min_w1]
|
||||
pmaddwd m9, [w3_min_w1]
|
||||
paddd m4, m12 ; b2[0-3]
|
||||
paddd m5, m13 ; b2[4-7]
|
||||
paddd m6, m8 ; b3[0-3]
|
||||
paddd m7, m9 ; b3[4-7]
|
||||
|
||||
; row[0] = (a0 + b0) >> 15;
|
||||
; row[7] = (a0 - b0) >> 15;
|
||||
; row[1] = (a1 + b1) >> 15;
|
||||
; row[6] = (a1 - b1) >> 15;
|
||||
; row[2] = (a2 + b2) >> 15;
|
||||
; row[5] = (a2 - b2) >> 15;
|
||||
; row[3] = (a3 + b3) >> 15;
|
||||
; row[4] = (a3 - b3) >> 15;
|
||||
mova m8, [r2+ 0] ; a0[0-3]
|
||||
mova m9, [r2+16] ; a0[4-7]
|
||||
SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
|
||||
mova m0, [r2+32] ; a1[0-3]
|
||||
mova m1, [r2+48] ; a1[4-7]
|
||||
SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
|
||||
mova m1, [r2+64] ; a2[0-3]
|
||||
mova m2, [r2+80] ; a2[4-7]
|
||||
SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
|
||||
mova m2, [r2+96] ; a3[0-3]
|
||||
mova m3, [r2+112] ; a3[4-7]
|
||||
SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
|
||||
%endmacro
|
||||
|
||||
; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
|
||||
; DCTELEM *block, const int16_t *qmat);
|
||||
%macro idct_put_fn 1
|
||||
cglobal prores_idct_put_10, 4, 4, %1
|
||||
movsxd r1, r1d
|
||||
pxor m15, m15 ; zero
|
||||
|
||||
; for (i = 0; i < 8; i++)
|
||||
; idctRowCondDC(block + i*8);
|
||||
mova m10,[r2+ 0] ; { row[0] }[0-7]
|
||||
mova m8, [r2+32] ; { row[2] }[0-7]
|
||||
mova m13,[r2+64] ; { row[4] }[0-7]
|
||||
mova m12,[r2+96] ; { row[6] }[0-7]
|
||||
|
||||
pmullw m10,[r3+ 0]
|
||||
pmullw m8, [r3+32]
|
||||
pmullw m13,[r3+64]
|
||||
pmullw m12,[r3+96]
|
||||
|
||||
IDCT_1D row, 15
|
||||
|
||||
; transpose for second part of IDCT
|
||||
TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
|
||||
mova [r2+ 16], m0
|
||||
mova [r2+ 48], m2
|
||||
mova [r2+ 80], m11
|
||||
mova [r2+112], m10
|
||||
SWAP 8, 10
|
||||
SWAP 1, 8
|
||||
SWAP 4, 13
|
||||
SWAP 9, 12
|
||||
|
||||
; for (i = 0; i < 8; i++)
|
||||
; idctSparseColAdd(dest + i, line_size, block + i);
|
||||
IDCT_1D col, 18
|
||||
|
||||
; clip/store
|
||||
mova m3, [pw_4]
|
||||
mova m5, [pw_1019]
|
||||
pmaxsw m8, m3
|
||||
pmaxsw m0, m3
|
||||
pmaxsw m1, m3
|
||||
pmaxsw m2, m3
|
||||
pmaxsw m4, m3
|
||||
pmaxsw m11, m3
|
||||
pmaxsw m9, m3
|
||||
pmaxsw m10, m3
|
||||
pminsw m8, m5
|
||||
pminsw m0, m5
|
||||
pminsw m1, m5
|
||||
pminsw m2, m5
|
||||
pminsw m4, m5
|
||||
pminsw m11, m5
|
||||
pminsw m9, m5
|
||||
pminsw m10, m5
|
||||
|
||||
lea r2, [r1*3]
|
||||
mova [r0 ], m8
|
||||
mova [r0+r1 ], m0
|
||||
mova [r0+r1*2], m1
|
||||
mova [r0+r2 ], m2
|
||||
lea r0, [r0+r1*4]
|
||||
mova [r0 ], m4
|
||||
mova [r0+r1 ], m11
|
||||
mova [r0+r1*2], m9
|
||||
mova [r0+r2 ], m10
|
||||
RET
|
||||
%endmacro
|
||||
|
||||
%macro SIGNEXTEND 2-3
|
||||
%if cpuflag(sse4) ; dstlow, dsthigh
|
||||
movhlps %2, %1
|
||||
pmovsxwd %1, %1
|
||||
pmovsxwd %2, %2
|
||||
%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
|
||||
pxor %3, %3
|
||||
pcmpgtw %3, %1
|
||||
mova %2, %1
|
||||
punpcklwd %1, %3
|
||||
punpckhwd %2, %3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
idct_put_fn 16
|
||||
INIT_XMM sse4
|
||||
idct_put_fn 16
|
||||
%if HAVE_AVX_EXTERNAL
|
||||
INIT_XMM avx
|
||||
idct_put_fn 16
|
||||
%endif
|
||||
|
||||
%endif
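
The pmaxsw/pminsw pairs against pw_4 and pw_1019 in idct_put_fn above are what distinguish this IDCT from the generic "simple" one: 10-bit ProRes output is clamped to the legal sample range. A scalar equivalent of that clamp (a sketch, not an FFmpeg API):

static inline int prores_clip_10(int v)
{
    return v < 4 ? 4 : v > 1019 ? 1019 : v;
}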
|
||||
56
project/jni/ffmpeg/libavcodec/x86/proresdsp_init.c
Normal file
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Apple ProRes compatible decoder
|
||||
*
|
||||
* Copyright (c) 2010-2011 Maxim Poliakovski
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/proresdsp.h"
|
||||
|
||||
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
|
||||
DCTELEM *block, const int16_t *qmat);
|
||||
void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
|
||||
DCTELEM *block, const int16_t *qmat);
|
||||
void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
|
||||
DCTELEM *block, const int16_t *qmat);
|
||||
|
||||
void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx)
|
||||
{
|
||||
#if ARCH_X86_64
|
||||
int flags = av_get_cpu_flags();
|
||||
|
||||
if(avctx->flags & CODEC_FLAG_BITEXACT)
|
||||
return;
|
||||
|
||||
if (EXTERNAL_SSE2(flags)) {
|
||||
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
|
||||
dsp->idct_put = ff_prores_idct_put_10_sse2;
|
||||
}
|
||||
|
||||
if (EXTERNAL_SSE4(flags)) {
|
||||
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
|
||||
dsp->idct_put = ff_prores_idct_put_10_sse4;
|
||||
}
|
||||
|
||||
if (EXTERNAL_AVX(flags)) {
|
||||
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
|
||||
dsp->idct_put = ff_prores_idct_put_10_avx;
|
||||
}
|
||||
#endif /* ARCH_X86_64 */
|
||||
}
|
||||
196
project/jni/ffmpeg/libavcodec/x86/rv34dsp.asm
Normal file
@@ -0,0 +1,196 @@
|
||||
;******************************************************************************
|
||||
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
|
||||
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
;*
|
||||
;* Libav is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* Libav is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with Libav; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
pw_row_coeffs: times 4 dw 13
|
||||
times 4 dw 17
|
||||
times 4 dw 7
|
||||
pd_512: times 2 dd 0x200
|
||||
pw_col_coeffs: dw 13, 13, 13, -13
|
||||
dw 17, 7, 7, -17
|
||||
dw 13, -13, 13, 13
|
||||
dw -7, 17, -17, -7
|
||||
|
||||
SECTION .text
|
||||
|
||||
%macro IDCT_DC_NOROUND 1
|
||||
imul %1, 13*13*3
|
||||
sar %1, 11
|
||||
%endmacro
|
||||
|
||||
%macro IDCT_DC_ROUND 1
|
||||
imul %1, 13*13
|
||||
add %1, 0x200
|
||||
sar %1, 10
|
||||
%endmacro
|
||||
|
||||
%macro rv34_idct 1
|
||||
cglobal rv34_idct_%1, 1, 2, 0
|
||||
movsx r1, word [r0]
|
||||
IDCT_DC r1
|
||||
movd m0, r1d
|
||||
pshufw m0, m0, 0
|
||||
movq [r0+ 0], m0
|
||||
movq [r0+ 8], m0
|
||||
movq [r0+16], m0
|
||||
movq [r0+24], m0
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
%define IDCT_DC IDCT_DC_ROUND
|
||||
rv34_idct dc
|
||||
%define IDCT_DC IDCT_DC_NOROUND
|
||||
rv34_idct dc_noround
|
||||
|
||||
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
|
||||
INIT_MMX mmx
|
||||
cglobal rv34_idct_dc_add, 3, 3
|
||||
; calculate DC
|
||||
IDCT_DC_ROUND r2
|
||||
pxor m1, m1
|
||||
movd m0, r2d
|
||||
psubw m1, m0
|
||||
packuswb m0, m0
|
||||
packuswb m1, m1
|
||||
punpcklbw m0, m0
|
||||
punpcklbw m1, m1
|
||||
punpcklwd m0, m0
|
||||
punpcklwd m1, m1
|
||||
|
||||
; add DC
|
||||
lea r2, [r0+r1*2]
|
||||
movh m2, [r0]
|
||||
movh m3, [r0+r1]
|
||||
movh m4, [r2]
|
||||
movh m5, [r2+r1]
|
||||
paddusb m2, m0
|
||||
paddusb m3, m0
|
||||
paddusb m4, m0
|
||||
paddusb m5, m0
|
||||
psubusb m2, m1
|
||||
psubusb m3, m1
|
||||
psubusb m4, m1
|
||||
psubusb m5, m1
|
||||
movh [r0], m2
|
||||
movh [r0+r1], m3
|
||||
movh [r2], m4
|
||||
movh [r2+r1], m5
|
||||
RET
|
||||
|
||||
; Load coeffs and perform row transform
|
||||
; Output: coeffs in mm[0467], rounder in mm5
|
||||
%macro ROW_TRANSFORM 1
|
||||
pxor mm7, mm7
|
||||
mova mm0, [%1+ 0*8]
|
||||
mova mm1, [%1+ 1*8]
|
||||
mova mm2, [%1+ 2*8]
|
||||
mova mm3, [%1+ 3*8]
|
||||
mova [%1+ 0*8], mm7
|
||||
mova [%1+ 1*8], mm7
|
||||
mova [%1+ 2*8], mm7
|
||||
mova [%1+ 3*8], mm7
|
||||
mova mm4, mm0
|
||||
mova mm6, [pw_row_coeffs+ 0]
|
||||
paddsw mm0, mm2 ; b0 + b2
|
||||
psubsw mm4, mm2 ; b0 - b2
|
||||
pmullw mm0, mm6 ; *13 = z0
|
||||
pmullw mm4, mm6 ; *13 = z1
|
||||
mova mm5, mm1
|
||||
pmullw mm1, [pw_row_coeffs+ 8] ; b1*17
|
||||
pmullw mm5, [pw_row_coeffs+16] ; b1* 7
|
||||
mova mm7, mm3
|
||||
pmullw mm3, [pw_row_coeffs+ 8] ; b3*17
|
||||
pmullw mm7, [pw_row_coeffs+16] ; b3* 7
|
||||
paddsw mm1, mm7 ; z3 = b1*17 + b3* 7
|
||||
psubsw mm5, mm3 ; z2 = b1* 7 - b3*17
|
||||
mova mm7, mm0
|
||||
mova mm6, mm4
|
||||
paddsw mm0, mm1 ; z0 + z3
|
||||
psubsw mm7, mm1 ; z0 - z3
|
||||
paddsw mm4, mm5 ; z1 + z2
|
||||
psubsw mm6, mm5 ; z1 - z2
|
||||
mova mm5, [pd_512] ; 0x200
|
||||
%endmacro
|
||||
|
||||
; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, DCTELEM *block);
|
||||
%macro COL_TRANSFORM 4
|
||||
pshufw mm3, %2, 0xDD ; col. 1,3,1,3
|
||||
pshufw %2, %2, 0x88 ; col. 0,2,0,2
|
||||
pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
|
||||
pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
|
||||
paddd %2, mm5
|
||||
pshufw mm1, %2, 01001110b ; z1 | z0
|
||||
pshufw mm2, mm3, 01001110b ; z2 | z3
|
||||
paddd %2, mm3 ; z0+z3 | z1+z2
|
||||
psubd mm1, mm2 ; z1-z2 | z0-z3
|
||||
movd mm3, %1
|
||||
psrad %2, 10
|
||||
pxor mm2, mm2
|
||||
psrad mm1, 10
|
||||
punpcklbw mm3, mm2
|
||||
packssdw %2, mm1
|
||||
paddw %2, mm3
|
||||
packuswb %2, %2
|
||||
movd %1, %2
|
||||
%endmacro
|
||||
INIT_MMX mmxext
|
||||
cglobal rv34_idct_add, 3,3,0, d, s, b
|
||||
ROW_TRANSFORM bq
|
||||
COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
|
||||
mova mm0, [pw_col_coeffs+ 0]
|
||||
COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
|
||||
mova mm4, [pw_col_coeffs+ 8]
|
||||
lea dq, [dq + 2*sq]
|
||||
COL_TRANSFORM [dq], mm6, mm0, mm4
|
||||
COL_TRANSFORM [dq+sq], mm7, mm0, mm4
|
||||
ret
|
||||
|
||||
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
|
||||
INIT_XMM sse4
|
||||
cglobal rv34_idct_dc_add, 3, 3, 6
|
||||
; load data
|
||||
IDCT_DC_ROUND r2
|
||||
pxor m1, m1
|
||||
|
||||
; calculate DC
|
||||
movd m0, r2d
|
||||
lea r2, [r0+r1*2]
|
||||
movd m2, [r0]
|
||||
movd m3, [r0+r1]
|
||||
pshuflw m0, m0, 0
|
||||
movd m4, [r2]
|
||||
movd m5, [r2+r1]
|
||||
punpcklqdq m0, m0
|
||||
punpckldq m2, m3
|
||||
punpckldq m4, m5
|
||||
punpcklbw m2, m1
|
||||
punpcklbw m4, m1
|
||||
paddw m2, m0
|
||||
paddw m4, m0
|
||||
packuswb m2, m4
|
||||
movd [r0], m2
|
||||
pextrd [r0+r1], m2, 1
|
||||
pextrd [r2], m2, 2
|
||||
pextrd [r2+r1], m2, 3
|
||||
RET
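
Pulling the comments in ROW_TRANSFORM and COL_TRANSFORM together: the RV34 inverse transform is a 4-point butterfly with integer coefficients 13, 17, and 7, and the column pass adds the 0x200 rounder before shifting right by 10. A scalar model of one 1-D pass (a sketch distilled from the asm comments, not code from the tree):

static void rv34_idct_1d(int out[4], const int b[4])
{
    int z0 = 13 * (b[0] + b[2]);
    int z1 = 13 * (b[0] - b[2]);
    int z3 = 17 * b[1] +  7 * b[3];
    int z2 =  7 * b[1] - 17 * b[3];
    out[0] = z0 + z3;
    out[1] = z1 + z2;
    out[2] = z1 - z2;
    out[3] = z0 - z3;
}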
|
||||
46
project/jni/ffmpeg/libavcodec/x86/rv34dsp_init.c
Normal file
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* RV30/40 MMX/SSE2 optimizations
|
||||
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||
*
|
||||
* This file is part of Libav.
|
||||
*
|
||||
* Libav is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* Libav is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with Libav; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/x86/asm.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "libavcodec/dsputil.h"
|
||||
#include "libavcodec/rv34dsp.h"
|
||||
|
||||
void ff_rv34_idct_dc_mmxext(DCTELEM *block);
|
||||
void ff_rv34_idct_dc_noround_mmxext(DCTELEM *block);
|
||||
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
|
||||
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
|
||||
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, DCTELEM *block);
|
||||
|
||||
av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c, DSPContext *dsp)
|
||||
{
|
||||
int mm_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(mm_flags))
|
||||
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
|
||||
if (EXTERNAL_MMXEXT(mm_flags)) {
|
||||
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
|
||||
c->rv34_idct_add = ff_rv34_idct_add_mmxext;
|
||||
}
|
||||
if (EXTERNAL_SSE4(mm_flags))
|
||||
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
|
||||
}
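
The DC-only paths wired up above fold both transform passes into one scale of the single coefficient; scalar equivalents of the IDCT_DC_ROUND and IDCT_DC_NOROUND macros from rv34dsp.asm:

static int rv34_dc_round(int dc)   { return (13 * 13 * dc + 0x200) >> 10; }
static int rv34_dc_noround(int dc) { return (13 * 13 * 3 * dc) >> 11; }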
|
||||
505
project/jni/ffmpeg/libavcodec/x86/rv40dsp.asm
Normal file
@@ -0,0 +1,505 @@
|
||||
;******************************************************************************
|
||||
;* MMX/SSE2-optimized functions for the RV40 decoder
|
||||
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
|
||||
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
|
||||
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
|
||||
;*
|
||||
;* This file is part of Libav.
|
||||
;*
|
||||
;* Libav is free software; you can redistribute it and/or
|
||||
;* modify it under the terms of the GNU Lesser General Public
|
||||
;* License as published by the Free Software Foundation; either
|
||||
;* version 2.1 of the License, or (at your option) any later version.
|
||||
;*
|
||||
;* Libav is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
;* Lesser General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU Lesser General Public
|
||||
;* License along with Libav; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
;******************************************************************************
|
||||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
align 16
|
||||
pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
|
||||
|
||||
sixtap_filter_hb_m: times 8 db 1, -5
|
||||
times 8 db 52, 20
|
||||
; multiplied by 2 to have the same shift
|
||||
times 8 db 2, -10
|
||||
times 8 db 40, 40
|
||||
; back to normal
|
||||
times 8 db 1, -5
|
||||
times 8 db 20, 52
|
||||
|
||||
sixtap_filter_v_m: times 8 dw 1
|
||||
times 8 dw -5
|
||||
times 8 dw 52
|
||||
times 8 dw 20
|
||||
; multiplied by 2 to have the same shift
|
||||
times 8 dw 2
|
||||
times 8 dw -10
|
||||
times 8 dw 40
|
||||
times 8 dw 40
|
||||
; back to normal
|
||||
times 8 dw 1
|
||||
times 8 dw -5
|
||||
times 8 dw 20
|
||||
times 8 dw 52
|
||||
|
||||
%ifdef PIC
|
||||
%define sixtap_filter_hw picregq
|
||||
%define sixtap_filter_hb picregq
|
||||
%define sixtap_filter_v picregq
|
||||
%define npicregs 1
|
||||
%else
|
||||
%define sixtap_filter_hw sixtap_filter_hw_m
|
||||
%define sixtap_filter_hb sixtap_filter_hb_m
|
||||
%define sixtap_filter_v sixtap_filter_v_m
|
||||
%define npicregs 0
|
||||
%endif
|
||||
|
||||
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
|
||||
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
|
||||
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
|
||||
|
||||
cextern pw_32
|
||||
cextern pw_16
|
||||
cextern pw_512
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; subpel MC functions:
|
||||
;
|
||||
; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
|
||||
; uint8_t *src, int srcstride,
|
||||
; int len, int m);
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro LOAD 2
|
||||
%if WIN64
|
||||
movsxd %1q, %1d
|
||||
%endif
|
||||
%ifdef PIC
|
||||
add %1q, picregq
|
||||
%else
|
||||
add %1q, %2
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
%macro STORE 3
|
||||
%ifidn %3, avg
|
||||
movh %2, [dstq]
|
||||
%endif
|
||||
packuswb %1, %1
|
||||
%ifidn %3, avg
|
||||
%if cpuflag(3dnow)
|
||||
pavgusb %1, %2
|
||||
%else
|
||||
pavgb %1, %2
|
||||
%endif
|
||||
%endif
|
||||
movh [dstq], %1
|
||||
%endmacro
|
||||
|
||||
%macro FILTER_V 1
|
||||
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
|
||||
%ifdef PIC
|
||||
lea picregq, [sixtap_filter_v_m]
|
||||
%endif
|
||||
pxor m7, m7
|
||||
LOAD my, sixtap_filter_v
|
||||
|
||||
; read 5 lines
|
||||
sub srcq, srcstrideq
|
||||
sub srcq, srcstrideq
|
||||
movh m0, [srcq]
|
||||
movh m1, [srcq+srcstrideq]
|
||||
movh m2, [srcq+srcstrideq*2]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
add srcq, srcstrideq
|
||||
movh m3, [srcq]
|
||||
movh m4, [srcq+srcstrideq]
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
punpcklbw m4, m7
|
||||
|
||||
%ifdef m8
|
||||
mova m8, [myq+ 0]
|
||||
mova m9, [myq+16]
|
||||
mova m10, [myq+32]
|
||||
mova m11, [myq+48]
|
||||
%define COEFF05 m8
|
||||
%define COEFF14 m9
|
||||
%define COEFF2 m10
|
||||
%define COEFF3 m11
|
||||
%else
|
||||
%define COEFF05 [myq+ 0]
|
||||
%define COEFF14 [myq+16]
|
||||
%define COEFF2 [myq+32]
|
||||
%define COEFF3 [myq+48]
|
||||
%endif
|
||||
.nextrow:
|
||||
mova m6, m1
|
||||
movh m5, [srcq+2*srcstrideq] ; read new row
|
||||
paddw m6, m4
|
||||
punpcklbw m5, m7
|
||||
pmullw m6, COEFF14
|
||||
paddw m0, m5
|
||||
pmullw m0, COEFF05
|
||||
paddw m6, m0
|
||||
mova m0, m1
|
||||
paddw m6, [pw_32]
|
||||
mova m1, m2
|
||||
pmullw m2, COEFF2
|
||||
paddw m6, m2
|
||||
mova m2, m3
|
||||
pmullw m3, COEFF3
|
||||
paddw m6, m3
|
||||
|
||||
; round/clip/store
|
||||
mova m3, m4
|
||||
psraw m6, 6
|
||||
mova m4, m5
|
||||
STORE m6, m5, %1
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
%macro FILTER_H 1
|
||||
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
|
||||
%ifdef PIC
|
||||
lea picregq, [sixtap_filter_v_m]
|
||||
%endif
|
||||
pxor m7, m7
|
||||
LOAD mx, sixtap_filter_v
|
||||
mova m6, [pw_32]
|
||||
%ifdef m8
|
||||
mova m8, [mxq+ 0]
|
||||
mova m9, [mxq+16]
|
||||
mova m10, [mxq+32]
|
||||
mova m11, [mxq+48]
|
||||
%define COEFF05 m8
|
||||
%define COEFF14 m9
|
||||
%define COEFF2 m10
|
||||
%define COEFF3 m11
|
||||
%else
|
||||
%define COEFF05 [mxq+ 0]
|
||||
%define COEFF14 [mxq+16]
|
||||
%define COEFF2 [mxq+32]
|
||||
%define COEFF3 [mxq+48]
|
||||
%endif
|
||||
.nextrow:
|
||||
movq m0, [srcq-2]
|
||||
movq m5, [srcq+3]
|
||||
movq m1, [srcq-1]
|
||||
movq m4, [srcq+2]
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m5, m7
|
||||
punpcklbw m1, m7
|
||||
punpcklbw m4, m7
|
||||
movq m2, [srcq-0]
|
||||
movq m3, [srcq+1]
|
||||
paddw m0, m5
|
||||
paddw m1, m4
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
pmullw m0, COEFF05
|
||||
pmullw m1, COEFF14
|
||||
pmullw m2, COEFF2
|
||||
pmullw m3, COEFF3
|
||||
paddw m0, m6
|
||||
paddw m1, m2
|
||||
paddw m0, m3
|
||||
paddw m0, m1
|
||||
psraw m0, 6
|
||||
STORE m0, m1, %1
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
%if ARCH_X86_32
|
||||
INIT_MMX mmx
|
||||
FILTER_V put
|
||||
FILTER_H put
|
||||
|
||||
INIT_MMX mmxext
|
||||
FILTER_V avg
|
||||
FILTER_H avg
|
||||
|
||||
INIT_MMX 3dnow
|
||||
FILTER_V avg
|
||||
FILTER_H avg
|
||||
%endif
|
||||
|
||||
INIT_XMM sse2
|
||||
FILTER_H put
|
||||
FILTER_H avg
|
||||
FILTER_V put
|
||||
FILTER_V avg
|
||||
|
||||
%macro FILTER_SSSE3 1
|
||||
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
|
||||
%ifdef PIC
|
||||
lea picregq, [sixtap_filter_hb_m]
|
||||
%endif
|
||||
|
||||
; read 5 lines
|
||||
sub srcq, srcstrideq
|
||||
LOAD my, sixtap_filter_hb
|
||||
sub srcq, srcstrideq
|
||||
movh m0, [srcq]
|
||||
movh m1, [srcq+srcstrideq]
|
||||
movh m2, [srcq+srcstrideq*2]
|
||||
lea srcq, [srcq+srcstrideq*2]
|
||||
add srcq, srcstrideq
|
||||
mova m5, [myq]
|
||||
movh m3, [srcq]
|
||||
movh m4, [srcq+srcstrideq]
|
||||
lea srcq, [srcq+2*srcstrideq]
|
||||
|
||||
.nextrow:
|
||||
mova m6, m2
|
||||
punpcklbw m0, m1
|
||||
punpcklbw m6, m3
|
||||
pmaddubsw m0, m5
|
||||
pmaddubsw m6, [myq+16]
|
||||
movh m7, [srcq] ; read new row
|
||||
paddw m6, m0
|
||||
mova m0, m1
|
||||
mova m1, m2
|
||||
mova m2, m3
|
||||
mova m3, m4
|
||||
mova m4, m7
|
||||
punpcklbw m7, m3
|
||||
pmaddubsw m7, m5
|
||||
paddw m6, m7
|
||||
pmulhrsw m6, [pw_512]
|
||||
STORE m6, m7, %1
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
REP_RET
|
||||
|
||||
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
|
||||
%ifdef PIC
|
||||
lea picregq, [sixtap_filter_hb_m]
|
||||
%endif
|
||||
mova m3, [filter_h6_shuf2]
|
||||
mova m4, [filter_h6_shuf3]
|
||||
LOAD mx, sixtap_filter_hb
|
||||
mova m5, [mxq] ; set up 6tap filter in bytes
|
||||
mova m6, [mxq+16]
|
||||
mova m7, [filter_h6_shuf1]
|
||||
|
||||
.nextrow:
|
||||
movu m0, [srcq-2]
|
||||
mova m1, m0
|
||||
mova m2, m0
|
||||
pshufb m0, m7
|
||||
pshufb m1, m3
|
||||
pshufb m2, m4
|
||||
pmaddubsw m0, m5
|
||||
pmaddubsw m1, m6
|
||||
pmaddubsw m2, m5
|
||||
paddw m0, m1
|
||||
paddw m0, m2
|
||||
pmulhrsw m0, [pw_512]
|
||||
STORE m0, m1, %1
|
||||
|
||||
; go to next line
|
||||
add dstq, dststrideq
|
||||
add srcq, srcstrideq
|
||||
dec heightd ; next row
|
||||
jg .nextrow
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_XMM ssse3
|
||||
FILTER_SSSE3 put
|
||||
FILTER_SSSE3 avg
|
||||
|
||||
; %1=1 if 5-bit weights (0 for 14-bit), %2=dst, %3=src1, %4=src2, %5=stride if sse2
|
||||
%macro RV40_WCORE 4-5
|
||||
movh m4, [%3 + r6 + 0]
|
||||
movh m5, [%4 + r6 + 0]
|
||||
%if %0 == 4
|
||||
%define OFFSET r6 + mmsize / 2
|
||||
%else
|
||||
; 8x8 block and sse2, stride was provided
|
||||
%define OFFSET r6
|
||||
add r6, r5
|
||||
%endif
|
||||
movh m6, [%3 + OFFSET]
|
||||
movh m7, [%4 + OFFSET]
|
||||
|
||||
%if %1 == 0
|
||||
; 14bits weights
|
||||
punpcklbw m4, m0
|
||||
punpcklbw m5, m0
|
||||
punpcklbw m6, m0
|
||||
punpcklbw m7, m0
|
||||
|
||||
psllw m4, 7
|
||||
psllw m5, 7
|
||||
psllw m6, 7
|
||||
psllw m7, 7
|
||||
pmulhw m4, m3
|
||||
pmulhw m5, m2
|
||||
pmulhw m6, m3
|
||||
pmulhw m7, m2
|
||||
|
||||
paddw m4, m5
|
||||
paddw m6, m7
|
||||
%else
|
||||
; 5bits weights
|
||||
%if cpuflag(ssse3)
|
||||
punpcklbw m4, m5
|
||||
punpcklbw m6, m7
|
||||
|
||||
pmaddubsw m4, m3
|
||||
pmaddubsw m6, m3
|
||||
%else
|
||||
punpcklbw m4, m0
|
||||
punpcklbw m5, m0
|
||||
punpcklbw m6, m0
|
||||
punpcklbw m7, m0
|
||||
|
||||
pmullw m4, m3
|
||||
pmullw m5, m2
|
||||
pmullw m6, m3
|
||||
pmullw m7, m2
|
||||
paddw m4, m5
|
||||
paddw m6, m7
|
||||
%endif
|
||||
|
||||
%endif
|
||||
|
||||
; bias and shift down
|
||||
%if cpuflag(ssse3)
|
||||
pmulhrsw m4, m1
|
||||
pmulhrsw m6, m1
|
||||
%else
|
||||
paddw m4, m1
|
||||
paddw m6, m1
|
||||
psrlw m4, 5
|
||||
psrlw m6, 5
|
||||
%endif
|
||||
|
||||
packuswb m4, m6
|
||||
%if %0 == 5
|
||||
; Only called for 8x8 blocks and sse2
|
||||
sub r6, r5
|
||||
movh [%2 + r6], m4
|
||||
add r6, r5
|
||||
movhps [%2 + r6], m4
|
||||
%else
|
||||
mova [%2 + r6], m4
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
|
||||
%macro MAIN_LOOP 2
|
||||
%if mmsize == 8
|
||||
RV40_WCORE %2, r0, r1, r2
|
||||
%if %1 == 16
|
||||
RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
|
||||
%endif
|
||||
|
||||
; Prepare for next loop
|
||||
add r6, r5
|
||||
%else
|
||||
%ifidn %1, 8
|
||||
RV40_WCORE %2, r0, r1, r2, r5
|
||||
; Prepare 2 next lines
|
||||
add r6, r5
|
||||
%else
|
||||
RV40_WCORE %2, r0, r1, r2
|
||||
; Prepare single next line
|
||||
add r6, r5
|
||||
%endif
|
||||
%endif
|
||||
|
||||
%endmacro
|
||||
|
||||
; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
|
||||
; %1=size %2=num of xmm regs
|
||||
; The weights are FP0.14 notation of fractions depending on pts.
|
||||
; For timebases without rounding error (i.e. PAL), the fractions
|
||||
; can be simplified, and several operations can be avoided.
|
||||
; Therefore, we check here whether they are multiples of 2^9 for
|
||||
; those simplifications to occur.
|
||||
%macro RV40_WEIGHT 3
|
||||
cglobal rv40_weight_func_%1_%2, 6, 7, 8
|
||||
%if cpuflag(ssse3)
|
||||
mova m1, [pw_1024]
|
||||
%else
|
||||
mova m1, [pw_16]
|
||||
%endif
|
||||
pxor m0, m0
|
||||
; Set loop counter and increments
|
||||
mov r6, r5
|
||||
shl r6, %3
|
||||
add r0, r6
|
||||
add r1, r6
|
||||
add r2, r6
|
||||
neg r6
|
||||
|
||||
movd m2, r3d
|
||||
movd m3, r4d
|
||||
%ifidn %1,rnd
|
||||
%define RND 0
|
||||
SPLATW m2, m2
|
||||
%else
|
||||
%define RND 1
|
||||
%if cpuflag(ssse3)
|
||||
punpcklbw m3, m2
|
||||
%else
|
||||
SPLATW m2, m2
|
||||
%endif
|
||||
%endif
|
||||
SPLATW m3, m3
|
||||
|
||||
.loop:
|
||||
MAIN_LOOP %2, RND
|
||||
jnz .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
INIT_MMX mmxext
|
||||
RV40_WEIGHT rnd, 8, 3
|
||||
RV40_WEIGHT rnd, 16, 4
|
||||
RV40_WEIGHT nornd, 8, 3
|
||||
RV40_WEIGHT nornd, 16, 4
|
||||
|
||||
INIT_XMM sse2
|
||||
RV40_WEIGHT rnd, 8, 3
|
||||
RV40_WEIGHT rnd, 16, 4
|
||||
RV40_WEIGHT nornd, 8, 3
|
||||
RV40_WEIGHT nornd, 16, 4
|
||||
|
||||
INIT_XMM ssse3
|
||||
RV40_WEIGHT rnd, 8, 3
|
||||
RV40_WEIGHT rnd, 16, 4
|
||||
RV40_WEIGHT nornd, 8, 3
|
||||
RV40_WEIGHT nornd, 16, 4
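
As the comment above RV40_WEIGHT notes, these kernels compute a weighted average of two predictions with FP0.14 weights, switching to a cheaper 5-bit path when both weights are multiples of 2^9. A rough scalar model (a sketch only; the exact operand pairing and per-variant rounding follow the asm, and the function name is illustrative):

static void rv40_weight_sketch(uint8_t *dst, const uint8_t *src1,
                               const uint8_t *src2, int w1, int w2, int n)
{
    int i;
    for (i = 0; i < n; i++)  /* w1 + w2 == 1 << 14 for a plain average */
        dst[i] = (src1[i] * w1 + src2[i] * w2 + (1 << 13)) >> 14;
}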
|
||||
242
project/jni/ffmpeg/libavcodec/x86/rv40dsp_init.c
Normal file
@@ -0,0 +1,242 @@
|
||||
/*
|
||||
* RV40 decoder motion compensation functions x86-optimised
|
||||
* Copyright (c) 2008 Konstantin Shishkov
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file
|
||||
* RV40 decoder motion compensation functions x86-optimised
|
||||
* 2,0 and 0,2 have h264 equivalents.
|
||||
* 3,3 is bugged in the rv40 format and maps to _xy2 version
|
||||
*/
|
||||
|
||||
#include "libavcodec/rv34dsp.h"
|
||||
#include "libavutil/mem.h"
|
||||
#include "libavutil/x86/cpu.h"
|
||||
#include "dsputil_mmx.h"
|
||||
|
||||
#if HAVE_YASM
|
||||
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
|
||||
int stride, int h, int x, int y);
|
||||
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
|
||||
int stride, int h, int x, int y);
|
||||
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
|
||||
int stride, int h, int x, int y);
|
||||
|
||||
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
|
||||
int stride, int h, int x, int y);
|
||||
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
|
||||
int stride, int h, int x, int y);
|
||||
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
|
||||
int stride, int h, int x, int y);
|
||||
|
||||
#define DECLARE_WEIGHT(opt) \
|
||||
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
|
||||
int w1, int w2, ptrdiff_t stride); \
|
||||
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
|
||||
int w1, int w2, ptrdiff_t stride); \
|
||||
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
|
||||
int w1, int w2, ptrdiff_t stride); \
|
||||
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
|
||||
int w1, int w2, ptrdiff_t stride);
|
||||
DECLARE_WEIGHT(mmxext)
|
||||
DECLARE_WEIGHT(sse2)
|
||||
DECLARE_WEIGHT(ssse3)
|
||||
|
||||
/** @{ */
|
||||
/**
|
||||
* Define one qpel function.
|
||||
* LOOPSIZE must be already set to the number of pixels processed per
|
||||
* iteration in the inner loop of the called functions.
|
||||
* COFF(x) must be already defined so as to provide the offset into any
|
||||
* array of coeffs used by the called function for the qpel position x.
|
||||
*/
|
||||
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
|
||||
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
|
||||
uint8_t *src, \
|
||||
int stride) \
|
||||
{ \
|
||||
int i; \
|
||||
if (PH && PV) { \
|
||||
DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \
|
||||
uint8_t *tmpptr = tmp + SIZE * 2; \
|
||||
src -= stride * 2; \
|
||||
\
|
||||
for (i = 0; i < SIZE; i += LOOPSIZE) \
|
||||
ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
|
||||
SIZE + 5, HCOFF(PH)); \
|
||||
for (i = 0; i < SIZE; i += LOOPSIZE) \
|
||||
ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
|
||||
SIZE, SIZE, VCOFF(PV)); \
|
||||
} else if (PV) { \
|
||||
for (i = 0; i < SIZE; i += LOOPSIZE) \
|
||||
ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
|
||||
stride, SIZE, VCOFF(PV)); \
|
||||
} else { \
|
||||
for (i = 0; i < SIZE; i += LOOPSIZE) \
|
||||
ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
|
||||
stride, SIZE, HCOFF(PH)); \
|
||||
} \
|
||||
};
|
||||
|
||||
/** Declare functions for sizes 8 and 16 and given operations
|
||||
* and qpel position. */
|
||||
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
|
||||
QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
|
||||
QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
|
||||
|
||||
/** Declare all functions for all sizes and qpel positions */
|
||||
#define QPEL_MC_DECL(OP, OPT) \
|
||||
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
|
||||
const uint8_t *src, \
|
||||
ptrdiff_t srcStride, \
|
||||
int len, int m); \
|
||||
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
|
||||
const uint8_t *src, \
|
||||
ptrdiff_t srcStride, \
|
||||
int len, int m); \
|
||||
QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
|
||||
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
|
||||
/** @} */
|
||||
|
||||
#define LOOPSIZE 8
|
||||
#define HCOFF(x) (32 * (x - 1))
|
||||
#define VCOFF(x) (32 * (x - 1))
|
||||
QPEL_MC_DECL(put_, _ssse3)
|
||||
QPEL_MC_DECL(avg_, _ssse3)
|
||||
|
||||
#undef LOOPSIZE
|
||||
#undef HCOFF
|
||||
#undef VCOFF
|
||||
#define LOOPSIZE 8
|
||||
#define HCOFF(x) (64 * (x - 1))
|
||||
#define VCOFF(x) (64 * (x - 1))
|
||||
QPEL_MC_DECL(put_, _sse2)
|
||||
QPEL_MC_DECL(avg_, _sse2)
|
||||
|
||||
#if ARCH_X86_32
|
||||
#undef LOOPSIZE
|
||||
#undef HCOFF
|
||||
#undef VCOFF
|
||||
#define LOOPSIZE 4
|
||||
#define HCOFF(x) (64 * (x - 1))
|
||||
#define VCOFF(x) (64 * (x - 1))
|
||||
|
||||
QPEL_MC_DECL(put_, _mmx)
|
||||
|
||||
#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
|
||||
#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
|
||||
QPEL_MC_DECL(avg_, _mmxext)
|
||||
|
||||
#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
|
||||
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
|
||||
QPEL_MC_DECL(avg_, _3dnow)
|
||||
#endif
|
||||
|
||||
/** @{ */
|
||||
/** Set one function */
|
||||
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
|
||||
c->OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT;
|
||||
|
||||
/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
|
||||
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
|
||||
QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
|
||||
QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
|
||||
|
||||
/** Set all functions for all sizes and qpel positions */
|
||||
#define QPEL_MC_SET(OP, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 0, 1, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 0, 3, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 1, 0, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 1, 1, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 1, 2, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 1, 3, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 2, 1, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 2, 2, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 2, 3, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 3, 0, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 3, 1, OPT) \
|
||||
QPEL_FUNCS_SET (OP, 3, 2, OPT)
|
||||
/** @} */
|
||||
|
||||
#endif /* HAVE_YASM */
|
||||
|
||||
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
|
||||
{
|
||||
#if HAVE_YASM
|
||||
int mm_flags = av_get_cpu_flags();
|
||||
|
||||
if (EXTERNAL_MMX(mm_flags)) {
|
||||
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
|
||||
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
|
||||
#if HAVE_MMX_INLINE
|
||||
c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
|
||||
c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
|
||||
c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
|
||||
c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
|
||||
#endif /* HAVE_MMX_INLINE */
|
||||
#if ARCH_X86_32
|
||||
QPEL_MC_SET(put_, _mmx)
|
||||
#endif
|
||||
}
|
||||
if (EXTERNAL_MMXEXT(mm_flags)) {
|
||||
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
|
||||
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
|
||||
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
|
||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
|
||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
|
||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
|
||||
#if ARCH_X86_32
|
||||
QPEL_MC_SET(avg_, _mmxext)
|
||||
#endif
|
||||
} else if (EXTERNAL_AMD3DNOW(mm_flags)) {
|
||||
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
|
||||
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
|
||||
#if ARCH_X86_32
|
||||
QPEL_MC_SET(avg_, _3dnow)
|
||||
#endif
|
||||
}
|
||||
if (EXTERNAL_SSE2(mm_flags)) {
|
||||
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
|
||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
|
||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
|
||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
|
||||
QPEL_MC_SET(put_, _sse2)
|
||||
QPEL_MC_SET(avg_, _sse2)
|
||||
}
|
||||
if (EXTERNAL_SSSE3(mm_flags)) {
|
||||
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
|
||||
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
|
||||
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
|
||||
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
|
||||
QPEL_MC_SET(put_, _ssse3)
|
||||
QPEL_MC_SET(avg_, _ssse3)
|
||||
}
|
||||
#endif /* HAVE_YASM */
|
||||
}
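
The dispatch indexing used by QPEL_FUNC_SET above packs the block size and the quarter-pel position into the two table indices: 2 - SIZE/8 gives 0 for 16x16 and 1 for 8x8, and 4 * PV + PH matches dsputil's pixels_tab layout (the mc##PH##PV suffix in the function names). Illustrative helper:

static inline int rv40_qpel_idx(int ph, int pv) { return 4 * pv + ph; }
/* e.g. the mc21 function (ph=2, pv=1) is stored at pixels_tab[size_idx][6] */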
|
||||
180
project/jni/ffmpeg/libavcodec/x86/sbrdsp.asm
Normal file
@@ -0,0 +1,180 @@
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0

SECTION_TEXT

INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
    mov r2, r1
    xorps m0, m0
    xorps m1, m1
    sar r2, 3
    jz .prepare
.loop:
    movu m2, [r0 + 0]
    movu m3, [r0 + 16]
    movu m4, [r0 + 32]
    movu m5, [r0 + 48]
    mulps m2, m2
    mulps m3, m3
    mulps m4, m4
    mulps m5, m5
    addps m0, m2
    addps m1, m3
    addps m0, m4
    addps m1, m5
    add r0, 64
    dec r2
    jnz .loop
.prepare:
    and r1, 7
    sar r1, 1
    jz .end
    ; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
    movu m2, [r0]
    add r0, 16
    mulps m2, m2
    dec r1
    addps m0, m2
    jnz .endloop
.end:
    addps m0, m1
    movhlps m2, m0
    addps m0, m2
    movss m1, m0
    shufps m0, m0, 1
    addss m0, m1
%if ARCH_X86_64 == 0
    movss r0m, m0
    fld dword r0m
%endif
    RET

%define STEP 40*4*2
cglobal sbr_hf_g_filt, 5, 6, 5
    lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
    mov r5, r3
    and r3, 0xFC
    lea r2, [r2 + r3*4]
    lea r0, [r0 + r3*8]
    neg r3
    jz .loop1
.loop4:
    movlps m0, [r2 + 4*r3 + 0]
    movlps m1, [r2 + 4*r3 + 8]
    movlps m2, [r1 + 0*STEP]
    movlps m3, [r1 + 2*STEP]
    movhps m2, [r1 + 1*STEP]
    movhps m3, [r1 + 3*STEP]
    unpcklps m0, m0
    unpcklps m1, m1
    mulps m0, m2
    mulps m1, m3
    movu [r0 + 8*r3 + 0], m0
    movu [r0 + 8*r3 + 16], m1
    add r1, 4*STEP
    add r3, 4
    jnz .loop4
    and r5, 3 ; number of single element loops
    jz .end
.loop1: ; element 0 and 1 can be computed at the same time
    movss m0, [r2]
    movlps m2, [r1]
    unpcklps m0, m0
    mulps m2, m0
    movlps [r0], m2
    add r0, 8
    add r2, 4
    add r1, STEP
    dec r5
    jnz .loop1
.end:
    RET

; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
;                          const float alpha0[2], const float alpha1[2],
;                          float bw, int start, int end)
;
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
    ; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
    movss bw, BWm
%endif
    movlps m2, [alpha1q]
    movlps m1, [alpha0q]
    shufps bw, bw, 0
    mulps m2, bw ; (a1[0] a1[1])*bw
    mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
    mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
    mova m3, m1
    mova m4, m2

    ; Set pointers
%if ARCH_X86_64 == 0 || WIN64
    ; start and end 6th and 7th args on stack
    mov r2d, Sm
    mov r3d, Em
%define start r2q
%define end r3q
%else
    ; BW does not actually occupy a register, so shift by 1
%define start BWq
%define end Sq
%endif
    sub start, end ; neg num of loops
    lea X_highq, [X_highq + end*2*4]
    lea X_lowq, [X_lowq + end*2*4 - 2*2*4]
    shl start, 3 ; offset from num loops

    mova m0, [X_lowq + start]
    shufps m3, m3, q1111
    shufps m4, m4, q1111
    xorps m3, [ps_mask]
    shufps m1, m1, q0000
    shufps m2, m2, q0000
    xorps m4, [ps_mask]
.loop2:
    movu m7, [X_lowq + start + 8] ; BbCc
    mova m6, m0
    mova m5, m7
    shufps m0, m0, q2301 ; aAbB
    shufps m7, m7, q2301 ; bBcC
    mulps m0, m4
    mulps m7, m3
    mulps m6, m2
    mulps m5, m1
    addps m7, m0
    mova m0, [X_lowq + start +16] ; CcDd
    addps m7, m0
    addps m6, m5
    addps m7, m6
    mova [X_highq + start], m7
    add start, 16
    jnz .loop2
    RET
43
project/jni/ffmpeg/libavcodec/x86/sbrdsp_init.c
Normal file
@@ -0,0 +1,43 @@
/*
 * AAC Spectral Band Replication decoding functions
 * Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbrdsp.h"

float ff_sbr_sum_square_sse(float (*x)[2], int n);
void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
                          const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
                       const float alpha0[2], const float alpha1[2],
                       float bw, int start, int end);

void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
    int mm_flags = av_get_cpu_flags();

    if (EXTERNAL_SSE(mm_flags)) {
        s->sum_square = ff_sbr_sum_square_sse;
        s->hf_g_filt = ff_sbr_hf_g_filt_sse;
        s->hf_gen = ff_sbr_hf_gen_sse;
    }
}
1168
project/jni/ffmpeg/libavcodec/x86/simple_idct.c
Normal file
File diff suppressed because it is too large
902
project/jni/ffmpeg/libavcodec/x86/snowdsp.c
Normal file
@@ -0,0 +1,902 @@
/*
 * MMX and SSE2 optimized snow DSP utils
 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/snow.h"
#include "libavcodec/dwt.h"
#include "dsputil_mmx.h"

#if HAVE_INLINE_ASM

static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;
        IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
        // (the first time erroneously), we allow the SSE2 code to run an extra pass.
        // The savings in code and time are well worth having to store this value and
        // calculate b[0] correctly afterwards.

        i = 0;
        __asm__ volatile(
            "pcmpeqd %%xmm7, %%xmm7 \n\t"
            "pcmpeqd %%xmm3, %%xmm3 \n\t"
            "psllw $1, %%xmm3 \n\t"
            "paddw %%xmm7, %%xmm3 \n\t"
            "psllw $13, %%xmm3 \n\t"
            ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw %%xmm1, %%xmm2 \n\t"
                "paddw %%xmm5, %%xmm6 \n\t"
                "paddw %%xmm7, %%xmm2 \n\t"
                "paddw %%xmm7, %%xmm6 \n\t"
                "pmulhw %%xmm3, %%xmm2 \n\t"
                "pmulhw %%xmm3, %%xmm6 \n\t"
                "paddw (%0), %%xmm2 \n\t"
                "paddw 16(%0), %%xmm6 \n\t"
                "movdqa %%xmm2, (%0) \n\t"
                "movdqa %%xmm6, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }
        for(; i<w_r-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm1 \n\t"
                "movdqu 16(%1), %%xmm5 \n\t"
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw %%xmm1, %%xmm2 \n\t"
                "paddw %%xmm5, %%xmm6 \n\t"
                "movdqa (%0), %%xmm0 \n\t"
                "movdqa 16(%0), %%xmm4 \n\t"
                "psubw %%xmm2, %%xmm0 \n\t"
                "psubw %%xmm6, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;
        IDWTELEM b_0 = b[0];

        i = 0;
        __asm__ volatile(
            "psllw $15, %%xmm7 \n\t"
            "pcmpeqw %%xmm6, %%xmm6 \n\t"
            "psrlw $13, %%xmm6 \n\t"
            "paddw %%xmm7, %%xmm6 \n\t"
            ::);
        for(; i<w_l-15; i+=16){
            __asm__ volatile(
                "movdqu (%1), %%xmm0 \n\t"
                "movdqu 16(%1), %%xmm4 \n\t"
                "movdqu 2(%1), %%xmm1 \n\t"
                "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm4 \n\t"
                "paddw %%xmm7, %%xmm1 \n\t"
                "paddw %%xmm7, %%xmm5 \n\t"
                "pavgw %%xmm1, %%xmm0 \n\t"
                "pavgw %%xmm5, %%xmm4 \n\t"
                "psubw %%xmm7, %%xmm0 \n\t"
                "psubw %%xmm7, %%xmm4 \n\t"
                "psraw $1, %%xmm0 \n\t"
                "psraw $1, %%xmm4 \n\t"
                "movdqa (%0), %%xmm1 \n\t"
                "movdqa 16(%0), %%xmm5 \n\t"
                "paddw %%xmm1, %%xmm0 \n\t"
                "paddw %%xmm5, %%xmm4 \n\t"
                "psraw $2, %%xmm0 \n\t"
                "psraw $2, %%xmm4 \n\t"
                "paddw %%xmm1, %%xmm0 \n\t"
                "paddw %%xmm5, %%xmm4 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm4, 16(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;

        i = 0;
        for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
            temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
        }
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movdqu 2(%1), %%xmm2 \n\t"
                "movdqu 18(%1), %%xmm6 \n\t"
                "paddw (%1), %%xmm2 \n\t"
                "paddw 16(%1), %%xmm6 \n\t"
                "movdqu (%0), %%xmm0 \n\t"
                "movdqu 16(%0), %%xmm4 \n\t"
                "paddw %%xmm2, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm4 \n\t"
                "psraw $1, %%xmm2 \n\t"
                "psraw $1, %%xmm6 \n\t"
                "paddw %%xmm0, %%xmm2 \n\t"
                "paddw %%xmm4, %%xmm6 \n\t"
                "movdqa %%xmm2, (%2) \n\t"
                "movdqa %%xmm6, 16(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x3E) != 0x3E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=62; i>=0; i-=64){
            __asm__ volatile(
                "movdqa (%1), %%xmm0 \n\t"
                "movdqa 16(%1), %%xmm2 \n\t"
                "movdqa 32(%1), %%xmm4 \n\t"
                "movdqa 48(%1), %%xmm6 \n\t"
                "movdqa (%1), %%xmm1 \n\t"
                "movdqa 16(%1), %%xmm3 \n\t"
                "movdqa 32(%1), %%xmm5 \n\t"
                "movdqa 48(%1), %%xmm7 \n\t"
                "punpcklwd (%2), %%xmm0 \n\t"
                "punpcklwd 16(%2), %%xmm2 \n\t"
                "punpcklwd 32(%2), %%xmm4 \n\t"
                "punpcklwd 48(%2), %%xmm6 \n\t"
                "movdqa %%xmm0, (%0) \n\t"
                "movdqa %%xmm2, 32(%0) \n\t"
                "movdqa %%xmm4, 64(%0) \n\t"
                "movdqa %%xmm6, 96(%0) \n\t"
                "punpckhwd (%2), %%xmm1 \n\t"
                "punpckhwd 16(%2), %%xmm3 \n\t"
                "punpckhwd 32(%2), %%xmm5 \n\t"
                "punpckhwd 48(%2), %%xmm7 \n\t"
                "movdqa %%xmm1, 16(%0) \n\t"
                "movdqa %%xmm3, 48(%0) \n\t"
                "movdqa %%xmm5, 80(%0) \n\t"
                "movdqa %%xmm7, 112(%0) \n\t"
                :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
                : "memory"
            );
        }
    }
}

static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
    const int w2= (width+1)>>1;
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;

        i = 1;
        b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "pcmpeqw %%mm3, %%mm3 \n\t"
            "psllw $1, %%mm3 \n\t"
            "paddw %%mm7, %%mm3 \n\t"
            "psllw $13, %%mm3 \n\t"
            ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddw 2(%1), %%mm2 \n\t"
                "paddw 10(%1), %%mm6 \n\t"
                "paddw %%mm7, %%mm2 \n\t"
                "paddw %%mm7, %%mm6 \n\t"
                "pmulhw %%mm3, %%mm2 \n\t"
                "pmulhw %%mm3, %%mm6 \n\t"
                "paddw (%0), %%mm2 \n\t"
                "paddw 8(%0), %%mm6 \n\t"
                "movq %%mm2, (%0) \n\t"
                "movq %%mm6, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
    }

    { // Lift 1
        IDWTELEM * const dst = b+w2;

        i = 0;
        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm2 \n\t"
                "movq 8(%1), %%mm6 \n\t"
                "paddw 2(%1), %%mm2 \n\t"
                "paddw 10(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "psubw %%mm2, %%mm0 \n\t"
                "psubw %%mm6, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&dst[i]), "r"(&b[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        IDWTELEM * const ref = b+w2 - 1;

        i = 1;
        b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
        __asm__ volatile(
            "psllw $15, %%mm7 \n\t"
            "pcmpeqw %%mm6, %%mm6 \n\t"
            "psrlw $13, %%mm6 \n\t"
            "paddw %%mm7, %%mm6 \n\t"
            ::);
        for(; i<w_l-7; i+=8){
            __asm__ volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm4 \n\t"
                "movq 2(%1), %%mm1 \n\t"
                "movq 10(%1), %%mm5 \n\t"
                "paddw %%mm6, %%mm0 \n\t"
                "paddw %%mm6, %%mm4 \n\t"
                "paddw %%mm7, %%mm1 \n\t"
                "paddw %%mm7, %%mm5 \n\t"
                "pavgw %%mm1, %%mm0 \n\t"
                "pavgw %%mm5, %%mm4 \n\t"
                "psubw %%mm7, %%mm0 \n\t"
                "psubw %%mm7, %%mm4 \n\t"
                "psraw $1, %%mm0 \n\t"
                "psraw $1, %%mm4 \n\t"
                "movq (%0), %%mm1 \n\t"
                "movq 8(%0), %%mm5 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm5, %%mm4 \n\t"
                "psraw $2, %%mm0 \n\t"
                "psraw $2, %%mm4 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm5, %%mm4 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm4, 8(%0) \n\t"
                :: "r"(&b[i]), "r"(&ref[i])
                : "memory"
            );
        }
        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
    }

    { // Lift 3
        IDWTELEM * const src = b+w2;
        i = 0;

        for(; i<w_r-7; i+=8){
            __asm__ volatile(
                "movq 2(%1), %%mm2 \n\t"
                "movq 10(%1), %%mm6 \n\t"
                "paddw (%1), %%mm2 \n\t"
                "paddw 8(%1), %%mm6 \n\t"
                "movq (%0), %%mm0 \n\t"
                "movq 8(%0), %%mm4 \n\t"
                "paddw %%mm2, %%mm0 \n\t"
                "paddw %%mm6, %%mm4 \n\t"
                "psraw $1, %%mm2 \n\t"
                "psraw $1, %%mm6 \n\t"
                "paddw %%mm0, %%mm2 \n\t"
                "paddw %%mm4, %%mm6 \n\t"
                "movq %%mm2, (%2) \n\t"
                "movq %%mm6, 8(%2) \n\t"
                :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
                : "memory"
            );
        }
        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
    }

    {
        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0x1E) != 0x1E; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=30; i>=0; i-=32){
            __asm__ volatile(
                "movq (%1), %%mm0 \n\t"
                "movq 8(%1), %%mm2 \n\t"
                "movq 16(%1), %%mm4 \n\t"
                "movq 24(%1), %%mm6 \n\t"
                "movq (%1), %%mm1 \n\t"
                "movq 8(%1), %%mm3 \n\t"
                "movq 16(%1), %%mm5 \n\t"
                "movq 24(%1), %%mm7 \n\t"
                "punpcklwd (%2), %%mm0 \n\t"
                "punpcklwd 8(%2), %%mm2 \n\t"
                "punpcklwd 16(%2), %%mm4 \n\t"
                "punpcklwd 24(%2), %%mm6 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm2, 16(%0) \n\t"
                "movq %%mm4, 32(%0) \n\t"
                "movq %%mm6, 48(%0) \n\t"
                "punpckhwd (%2), %%mm1 \n\t"
                "punpckhwd 8(%2), %%mm3 \n\t"
                "punpckhwd 16(%2), %%mm5 \n\t"
                "punpckhwd 24(%2), %%mm7 \n\t"
                "movq %%mm1, 8(%0) \n\t"
                "movq %%mm3, 24(%0) \n\t"
                "movq %%mm5, 40(%0) \n\t"
                "movq %%mm7, 56(%0) \n\t"
                :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
                : "memory"
            );
        }
    }
}

#if HAVE_7REGS
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
    ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
    ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
    ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
    ""op" 48("r",%%"REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
    "psubw %%"s0", %%"t0" \n\t"\
    "psubw %%"s1", %%"t1" \n\t"\
    "psubw %%"s2", %%"t2" \n\t"\
    "psubw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
    "movdqa %%"s0", ("w",%%"REG_d") \n\t"\
    "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
    "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
    "movdqa %%"s3", 48("w",%%"REG_d") \n\t"

#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
    "psraw $"n", %%"t0" \n\t"\
    "psraw $"n", %%"t1" \n\t"\
    "psraw $"n", %%"t2" \n\t"\
    "psraw $"n", %%"t3" \n\t"

#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
    "paddw %%"s0", %%"t0" \n\t"\
    "paddw %%"s1", %%"t1" \n\t"\
    "paddw %%"s2", %%"t2" \n\t"\
    "paddw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
    "pmulhw %%"s0", %%"t0" \n\t"\
    "pmulhw %%"s1", %%"t1" \n\t"\
    "pmulhw %%"s2", %%"t2" \n\t"\
    "pmulhw %%"s3", %%"t3" \n\t"

#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movdqa %%"s0", %%"t0" \n\t"\
    "movdqa %%"s1", %%"t1" \n\t"\
    "movdqa %%"s2", %%"t2" \n\t"\
    "movdqa %%"s3", %%"t3" \n\t"

static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;

    while(i & 0x1F)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    i+=i;

    __asm__ volatile (
        "jmp 2f \n\t"
        "1: \n\t"
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")


        "pcmpeqw %%xmm0, %%xmm0 \n\t"
        "pcmpeqw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm2, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "psllw $13, %%xmm2 \n\t"
        snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")

        "pcmpeqw %%xmm7, %%xmm7 \n\t"
        "pcmpeqw %%xmm5, %%xmm5 \n\t"
        "psllw $15, %%xmm7 \n\t"
        "psrlw $13, %%xmm5 \n\t"
        "paddw %%xmm7, %%xmm5 \n\t"
        snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
        "movq (%2,%%"REG_d"), %%xmm1 \n\t"
        "movq 8(%2,%%"REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm0 \n\t"
        "pavgw %%xmm3, %%xmm2 \n\t"
        "movq 16(%2,%%"REG_d"), %%xmm1 \n\t"
        "movq 24(%2,%%"REG_d"), %%xmm3 \n\t"
        "paddw %%xmm7, %%xmm1 \n\t"
        "paddw %%xmm7, %%xmm3 \n\t"
        "pavgw %%xmm1, %%xmm4 \n\t"
        "pavgw %%xmm3, %%xmm6 \n\t"
        snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")

        snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
        snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
        snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")

        "2: \n\t"
        "sub $64, %%"REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}

#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
    ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
    ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
    ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
    ""op" 24("r",%%"REG_d"), %%"t3" \n\t"

#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
    snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
    "movq %%"s0", ("w",%%"REG_d") \n\t"\
    "movq %%"s1", 8("w",%%"REG_d") \n\t"\
    "movq %%"s2", 16("w",%%"REG_d") \n\t"\
    "movq %%"s3", 24("w",%%"REG_d") \n\t"

#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
    "movq %%"s0", %%"t0" \n\t"\
    "movq %%"s1", %%"t1" \n\t"\
    "movq %%"s2", %%"t2" \n\t"\
    "movq %%"s3", %%"t3" \n\t"


static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
    x86_reg i = width;
    while(i & 15)
    {
        i--;
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
    i+=i;
    __asm__ volatile(
        "jmp 2f \n\t"
        "1: \n\t"

        snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
        "pcmpeqw %%mm0, %%mm0 \n\t"
        "pcmpeqw %%mm2, %%mm2 \n\t"
        "paddw %%mm2, %%mm2 \n\t"
        "paddw %%mm0, %%mm2 \n\t"
        "psllw $13, %%mm2 \n\t"
        snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
        snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
        snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "pcmpeqw %%mm5, %%mm5 \n\t"
        "psllw $15, %%mm7 \n\t"
        "psrlw $13, %%mm5 \n\t"
        "paddw %%mm7, %%mm5 \n\t"
        snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
        "movq (%2,%%"REG_d"), %%mm1 \n\t"
        "movq 8(%2,%%"REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm0 \n\t"
        "pavgw %%mm3, %%mm2 \n\t"
        "movq 16(%2,%%"REG_d"), %%mm1 \n\t"
        "movq 24(%2,%%"REG_d"), %%mm3 \n\t"
        "paddw %%mm7, %%mm1 \n\t"
        "paddw %%mm7, %%mm3 \n\t"
        "pavgw %%mm1, %%mm4 \n\t"
        "pavgw %%mm3, %%mm6 \n\t"
        snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")

        snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
        snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
        snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
        snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")

        "2: \n\t"
        "sub $32, %%"REG_d" \n\t"
        "jge 1b \n\t"
        :"+d"(i)
        :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#endif //HAVE_7REGS

#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    x86_reg tmp;\
    __asm__ volatile(\
        "mov %7, %%"REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"REG_S" \n\t"\
        "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
        "pcmpeqd %%xmm3, %%xmm3 \n\t"\
        "psllw $15, %%xmm3 \n\t"\
        "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %3, %%"REG_D" \n\t"

#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movq (%%"REG_d"), %%"out_reg1" \n\t"\
    "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movq (%%"REG_d"), %%"out_reg1" \n\t"\
    "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
    "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
    "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
    "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
    "punpcklbw %%xmm7, %%xmm0 \n\t"\
    "punpcklbw %%xmm7, %%xmm4 \n\t"\
    "pmullw %%xmm0, %%"out_reg1" \n\t"\
    "pmullw %%xmm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
    "paddusw %%xmm2, %%xmm1 \n\t"\
    "paddusw %%xmm6, %%xmm5 \n\t"

#define snow_inner_add_yblock_sse2_end_common1\
    "add $32, %%"REG_S" \n\t"\
    "add %%"REG_c", %0 \n\t"\
    "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
    "add %%"REG_c", (%%"REG_a") \n\t"

#define snow_inner_add_yblock_sse2_end_common2\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :\
    "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
    "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

#define snow_inner_add_yblock_sse2_end_8\
    "sal $1, %%"REG_c" \n\t"\
    "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "sar $1, %%"REG_c" \n\t"\
    "sub $2, %2 \n\t"\
    snow_inner_add_yblock_sse2_end_common2

#define snow_inner_add_yblock_sse2_end_16\
    "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\
    snow_inner_add_yblock_sse2_end_common1\
    "dec %2 \n\t"\
    snow_inner_add_yblock_sse2_end_common2

static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                                       int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_8("2", "8")
    snow_inner_add_yblock_sse2_accum_8("1", "128")
    snow_inner_add_yblock_sse2_accum_8("0", "136")

    "mov %0, %%"REG_d" \n\t"
    "movdqa (%%"REG_D"), %%xmm0 \n\t"
    "movdqa %%xmm1, %%xmm2 \n\t"

    "punpckhwd %%xmm7, %%xmm1 \n\t"
    "punpcklwd %%xmm7, %%xmm2 \n\t"
    "paddd %%xmm2, %%xmm0 \n\t"
    "movdqa 16(%%"REG_D"), %%xmm2 \n\t"
    "paddd %%xmm1, %%xmm2 \n\t"
    "paddd %%xmm3, %%xmm0 \n\t"
    "paddd %%xmm3, %%xmm2 \n\t"

    "mov %1, %%"REG_D" \n\t"
    "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
    "add %3, %%"REG_D" \n\t"

    "movdqa (%%"REG_D"), %%xmm4 \n\t"
    "movdqa %%xmm5, %%xmm6 \n\t"
    "punpckhwd %%xmm7, %%xmm5 \n\t"
    "punpcklwd %%xmm7, %%xmm6 \n\t"
    "paddd %%xmm6, %%xmm4 \n\t"
    "movdqa 16(%%"REG_D"), %%xmm6 \n\t"
    "paddd %%xmm5, %%xmm6 \n\t"
    "paddd %%xmm3, %%xmm4 \n\t"
    "paddd %%xmm3, %%xmm6 \n\t"

    "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm2, %%xmm0 \n\t"
    "packuswb %%xmm7, %%xmm0 \n\t"
    "movq %%xmm0, (%%"REG_d") \n\t"

    "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
    "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
    "packssdw %%xmm6, %%xmm4 \n\t"
    "packuswb %%xmm7, %%xmm4 \n\t"
    "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
    snow_inner_add_yblock_sse2_end_8
}

static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                                int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_sse2_header
    snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
    snow_inner_add_yblock_sse2_accum_16("2", "16")
    snow_inner_add_yblock_sse2_accum_16("1", "512")
    snow_inner_add_yblock_sse2_accum_16("0", "528")

    "mov %0, %%"REG_d" \n\t"
    "psrlw $4, %%xmm1 \n\t"
    "psrlw $4, %%xmm5 \n\t"
    "paddw (%%"REG_D"), %%xmm1 \n\t"
    "paddw 16(%%"REG_D"), %%xmm5 \n\t"
    "paddw %%xmm3, %%xmm1 \n\t"
    "paddw %%xmm3, %%xmm5 \n\t"
    "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
    "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
    "packuswb %%xmm5, %%xmm1 \n\t"

    "movdqu %%xmm1, (%%"REG_d") \n\t"

    snow_inner_add_yblock_sse2_end_16
}

#define snow_inner_add_yblock_mmx_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    x86_reg tmp;\
    __asm__ volatile(\
        "mov %7, %%"REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"REG_S" \n\t"\
        "pxor %%mm7, %%mm7 \n\t" /* 0 */\
        "pcmpeqd %%mm3, %%mm3 \n\t"\
        "psllw $15, %%mm3 \n\t"\
        "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %3, %%"REG_D" \n\t"

#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
    "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
    "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
    "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
    "punpcklbw %%mm7, %%"out_reg1" \n\t"\
    "punpcklbw %%mm7, %%"out_reg2" \n\t"\
    "movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
    "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "pmullw %%mm0, %%"out_reg1" \n\t"\
    "pmullw %%mm4, %%"out_reg2" \n\t"

#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
    snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
    "paddusw %%mm2, %%mm1 \n\t"\
    "paddusw %%mm6, %%mm5 \n\t"

#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
    "mov %0, %%"REG_d" \n\t"\
    "psrlw $4, %%mm1 \n\t"\
    "psrlw $4, %%mm5 \n\t"\
    "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
    "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
    "paddw %%mm3, %%mm1 \n\t"\
    "paddw %%mm3, %%mm5 \n\t"\
    "psraw $4, %%mm1 \n\t"\
    "psraw $4, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm1 \n\t"\
    "movq %%mm1, "write_offset"(%%"REG_d") \n\t"

#define snow_inner_add_yblock_mmx_end(s_step)\
    "add $"s_step", %%"REG_S" \n\t"\
    "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
    "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
    "add %%"REG_c", (%%"REG_a") \n\t"\
    "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
    "add %%"REG_c", %0 \n\t"\
    "dec %2 \n\t"\
    "jnz 1b \n\t"\
    :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
    :\
    "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
    "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");

static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                              int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "8", "0")
    snow_inner_add_yblock_mmx_accum("1", "128", "0")
    snow_inner_add_yblock_mmx_accum("0", "136", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")
    snow_inner_add_yblock_mmx_end("16")
}

static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
                                               int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    snow_inner_add_yblock_mmx_header
    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
    snow_inner_add_yblock_mmx_accum("2", "16", "0")
    snow_inner_add_yblock_mmx_accum("1", "512", "0")
    snow_inner_add_yblock_mmx_accum("0", "528", "0")
    snow_inner_add_yblock_mmx_mix("0", "0")

    snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
    snow_inner_add_yblock_mmx_accum("2", "24", "8")
    snow_inner_add_yblock_mmx_accum("1", "520", "8")
    snow_inner_add_yblock_mmx_accum("0", "536", "8")
    snow_inner_add_yblock_mmx_mix("16", "8")
    snow_inner_add_yblock_mmx_end("32")
}

static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                          int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){

    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16) {
        if (!(b_h & 1))
            inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
        else
            inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    } else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
                                         int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else if (b_w == 8 && obmc_stride == 16)
        inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}

#endif /* HAVE_INLINE_ASM */

void ff_dwt_init_x86(DWTContext *c)
{
#if HAVE_INLINE_ASM
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
            c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#if HAVE_7REGS
            c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
            c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
        }
        else{
            if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#if HAVE_7REGS
                c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
            }
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
        }
    }
#endif /* HAVE_INLINE_ASM */
}
48
project/jni/ffmpeg/libavcodec/x86/v210-init.c
Normal file
@@ -0,0 +1,48 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavcodec/v210dec.h"

extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);

extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);

av_cold void v210_x86_init(V210DecContext *s)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_YASM
    if (s->aligned_input) {
        if (cpu_flags & AV_CPU_FLAG_SSSE3)
            s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;

        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
            s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
    }
    else {
        if (cpu_flags & AV_CPU_FLAG_SSSE3)
            s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;

        if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
            s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
    }
#endif
}
88
project/jni/ffmpeg/libavcodec/x86/v210.asm
Normal file
@@ -0,0 +1,88 @@
;******************************************************************************
;* V210 SIMD unpack
;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

v210_mask: times 4 dd 0x3ff
v210_mult: dw 64,4,64,4,64,4,64,4
v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1

SECTION .text

%macro v210_planar_unpack 2

; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
cglobal v210_planar_unpack_%1_%2, 5, 5, 7
    movsxdifnidn r4, r4d
    lea r1, [r1+2*r4]
    add r2, r4
    add r3, r4
    neg r4

    mova m3, [v210_mult]
    mova m4, [v210_mask]
    mova m5, [v210_luma_shuf]
    mova m6, [v210_chroma_shuf]
.loop
%ifidn %1, unaligned
    movu m0, [r0]
%else
    mova m0, [r0]
%endif

    pmullw m1, m0, m3
    psrld m0, 10
    psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5
    pand m0, m4 ; y0 __ u1 __ y3 __ v2 __

    shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
    pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
    movu [r1+2*r4], m2

    shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
    pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
    movq [r2+r4], m1
    movhps [r3+r4], m1

    add r0, mmsize
    add r4, 6
    jl .loop

    REP_RET
%endmacro

INIT_XMM
v210_planar_unpack unaligned, ssse3
%if HAVE_AVX_EXTERNAL
INIT_AVX
v210_planar_unpack unaligned, avx
%endif

INIT_XMM
v210_planar_unpack aligned, ssse3
%if HAVE_AVX_EXTERNAL
INIT_AVX
v210_planar_unpack aligned, avx
%endif
317
project/jni/ffmpeg/libavcodec/x86/vc1dsp.asm
Normal file
@@ -0,0 +1,317 @@
;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_4
cextern pw_5

section .text

; dst_low, dst_high (src), zero
; zero-extends one vector from 8 to 16 bits
%macro UNPACK_8TO16 4
    mova m%2, m%3
    punpckh%1 m%3, m%4
    punpckl%1 m%2, m%4
%endmacro

%macro STORE_4_WORDS 6
%if cpuflag(sse4)
    pextrw %1, %5, %6+0
    pextrw %2, %5, %6+1
    pextrw %3, %5, %6+2
    pextrw %4, %5, %6+3
%else
    movd %6d, %5
%if mmsize==16
    psrldq %5, 4
%else
    psrlq %5, 32
%endif
    mov %1, %6w
    shr %6, 16
    mov %2, %6w
    movd %6d, %5
    mov %3, %6w
    shr %6, 16
    mov %4, %6w
%endif
%endmacro

; in: p1 p0 q0 q1, clobbers p0
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
%macro VC1_LOOP_FILTER_A0 4
    psubw %1, %4
    psubw %2, %3
    paddw %1, %1
    pmullw %2, [pw_5]
    psubw %1, %2
    paddw %1, [pw_4]
    psraw %1, 3
%endmacro

; in: p0 q0 a0 a1 a2
;     m0 m1 m7 m6 m5
; %1: size
; out: m0=p0' m1=q0'
%macro VC1_FILTER 1
    PABSW m4, m7
    PABSW m3, m6
    PABSW m2, m5
    mova m6, m4
    pminsw m3, m2
    pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
    psubw m3, m4
    pmullw m3, [pw_5] ; 5*(a3 - a0)
    PABSW m2, m3
    psraw m2, 3 ; abs(d/8)
    pxor m7, m3 ; d_sign ^= a0_sign

    pxor m5, m5
    movd m3, r2d
%if %1 > 4
    punpcklbw m3, m3
%endif
    punpcklbw m3, m5
    pcmpgtw m3, m4 ; if (a0 < pq)
    pand m6, m3

    mova m3, m0
    psubw m3, m1
    PABSW m4, m3
    psraw m4, 1
    pxor m3, m7 ; d_sign ^ clip_sign
    psraw m3, 15
    pminsw m2, m4 ; min(d, clip)
    pcmpgtw m4, m5
    pand m6, m4 ; filt3 (C return value)

    ; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
    pshuflw m4, m6, 0xaa
%if %1 > 4
    pshufhw m4, m4, 0xaa
%endif
%else
    pshufw m4, m6, 0xaa
%endif
    pandn m3, m4
    pand m2, m6
    pand m3, m2 ; d final

    psraw m7, 15
    pxor m3, m7
    psubw m3, m7
    psubw m0, m3
    paddw m1, m3
    packuswb m0, m0
    packuswb m1, m1
%endmacro

; 1st param: size of filter
; 2nd param: mov suffix equivalent to the filter size
%macro VC1_V_LOOP_FILTER 2
    pxor m5, m5
    mov%2 m6, [r4]
    mov%2 m4, [r4+r1]
    mov%2 m7, [r4+2*r1]
    mov%2 m0, [r4+r3]
    punpcklbw m6, m5
    punpcklbw m4, m5
    punpcklbw m7, m5
    punpcklbw m0, m5

    VC1_LOOP_FILTER_A0 m6, m4, m7, m0
    mov%2 m1, [r0]
    mov%2 m2, [r0+r1]
    punpcklbw m1, m5
    punpcklbw m2, m5
    mova m4, m0
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2
    mov%2 m3, [r0+2*r1]
    mov%2 m4, [r0+r3]
    punpcklbw m3, m5
    punpcklbw m4, m5
    mova m5, m1
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4

    VC1_FILTER %1
    mov%2 [r4+r3], m0
    mov%2 [r0], m1
%endmacro

; 1st param: size of filter
; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
; 2nd (optional) param: temp register to use for storing words
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
    movq m0, [r0 -4]
    movq m1, [r0+ r1-4]
    movq m2, [r0+2*r1-4]
    movq m3, [r0+ r3-4]
    TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
    movq m0, [r0 -4]
    movq m4, [r0+ r1-4]
    movq m1, [r0+2*r1-4]
    movq m5, [r0+ r3-4]
    movq m2, [r4 -4]
    movq m6, [r4+ r1-4]
    movq m3, [r4+2*r1-4]
    movq m7, [r4+ r3-4]
    punpcklbw m0, m4
    punpcklbw m1, m5
    punpcklbw m2, m6
    punpcklbw m3, m7
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
    pxor m5, m5

    UNPACK_8TO16 bw, 6, 0, 5
    UNPACK_8TO16 bw, 7, 1, 5
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1
    UNPACK_8TO16 bw, 4, 2, 5
    mova m0, m1 ; m0 = p0
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2
    UNPACK_8TO16 bw, 1, 3, 5
    mova m5, m4
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3
    SWAP 1, 4 ; m1 = q0

    VC1_FILTER %1
    punpcklbw m0, m1
%if %0 > 1
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
    psrldq m0, 4
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro


%macro START_V_FILTER 0
    mov r4, r0
    lea r3, [4*r1]
    sub r4, r3
    lea r3, [r1+2*r1]
    imul r2, 0x01010101
%endmacro

%macro START_H_FILTER 1
    lea r3, [r1+2*r1]
%if %1 > 4
    lea r4, [r0+4*r1]
%endif
    imul r2, 0x01010101
%endmacro

%macro VC1_LF 0
cglobal vc1_v_loop_filter_internal
    VC1_V_LOOP_FILTER 4, d
    ret

cglobal vc1_h_loop_filter_internal
    VC1_H_LOOP_FILTER 4, r4
    ret

; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    RET

; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,0
    START_V_FILTER
    call vc1_v_loop_filter_internal
    add r4, 4
    add r0, 4
    call vc1_v_loop_filter_internal
    RET

; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,0
    START_H_FILTER 4
    call vc1_h_loop_filter_internal
    lea r0, [r0+4*r1]
    call vc1_h_loop_filter_internal
    RET
%endmacro

INIT_MMX mmxext
VC1_LF

INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
    START_V_FILTER
    VC1_V_LOOP_FILTER 4, d
    RET

; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
    START_H_FILTER 4
    VC1_H_LOOP_FILTER 4, r4
    RET

INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
    START_V_FILTER
    VC1_V_LOOP_FILTER 8, q
    RET

; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8, r5
    RET

INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,8
    START_H_FILTER 8
    VC1_H_LOOP_FILTER 8
    RET
29
project/jni/ffmpeg/libavcodec/x86/vc1dsp.h
Normal file
@@ -0,0 +1,29 @@
/*
 * VC-1 and WMV3 decoder - X86 DSP init functions
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_VC1DSP_H
#define AVCODEC_X86_VC1DSP_H

#include "libavcodec/vc1dsp.h"

void ff_vc1dsp_init_mmx(VC1DSPContext *dsp);
void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp);

#endif /* AVCODEC_X86_VC1DSP_H */
123
project/jni/ffmpeg/libavcodec/x86/vc1dsp_init.c
Normal file
@@ -0,0 +1,123 @@
/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "vc1dsp.h"
#include "config.h"

#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \
    ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
    ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \
    ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}
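
/* Each 16-pixel loop filter above is synthesized from two 8-pixel calls:
 * offset by 8 pixels for the vertical-edge filter and by 8 lines
 * (8*stride) for the horizontal one. */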

#if HAVE_YASM
LOOP_FILTER(mmxext)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)

void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);

static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
    ff_vc1_h_loop_filter8_sse4(src, stride, pq);
    ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
#endif /* HAVE_YASM */

void ff_put_vc1_chroma_mc8_nornd_mmx   (uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow (uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3 (uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3 (uint8_t *dst, uint8_t *src,
                                        int stride, int h, int x, int y);


av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
{
    int mm_flags = av_get_cpu_flags();

    if (INLINE_MMX(mm_flags))
        ff_vc1dsp_init_mmx(dsp);

    if (INLINE_MMXEXT(mm_flags))
        ff_vc1dsp_init_mmxext(dsp);

#define ASSIGN_LF(EXT) \
        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_ ## EXT; \
        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_ ## EXT; \
        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_ ## EXT; \
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_ ## EXT; \
        dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
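
/* The flag checks below run from oldest to newest extension; later blocks
 * deliberately overwrite earlier assignments, so the fastest variant the
 * CPU supports ends up in the context. */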

#if HAVE_YASM
    if (mm_flags & AV_CPU_FLAG_MMX) {
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        ASSIGN_LF(mmxext);
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
    } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
    }

    if (mm_flags & AV_CPU_FLAG_SSE2) {
        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_sse2;
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse2;
        dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
    }
    if (mm_flags & AV_CPU_FLAG_SSSE3) {
        ASSIGN_LF(ssse3);
        dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
        dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
    }
    if (mm_flags & AV_CPU_FLAG_SSE4) {
        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse4;
        dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
    }
#endif /* HAVE_YASM */
}
747
project/jni/ffmpeg/libavcodec/x86/vc1dsp_mmx.c
Normal file
@@ -0,0 +1,747 @@
/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "dsputil_mmx.h"
#include "libavcodec/vc1dsp.h"
#include "vc1dsp.h"

#if HAVE_INLINE_ASM

#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"

/** Add rounder from mm7 to mm3 and pack result at destination */
#define NORMALIZE_MMX(SHIFT) \
    "paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
    "paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
    "psraw "SHIFT", %%mm3 \n\t" \
    "psraw "SHIFT", %%mm4 \n\t"

#define TRANSFER_DO_PACK(OP) \
    "packuswb %%mm4, %%mm3 \n\t" \
    OP((%2), %%mm3) \
    "movq %%mm3, (%2) \n\t"

#define TRANSFER_DONT_PACK(OP) \
    OP(0(%2), %%mm3) \
    OP(8(%2), %%mm4) \
    "movq %%mm3, 0(%2) \n\t" \
    "movq %%mm4, 8(%2) \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
    "movd "ROUND", %%mm7 \n\t" \
    "punpcklwd %%mm7, %%mm7 \n\t" \
    "punpckldq %%mm7, %%mm7 \n\t"

#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
    "paddw %%mm"#R2", %%mm"#R1" \n\t" \
    "movd (%0,%3), %%mm"#R0" \n\t" \
    "pmullw %%mm6, %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R0" \n\t" \
    "movd (%0,%2), %%mm"#R3" \n\t" \
    "psubw %%mm"#R0", %%mm"#R1" \n\t" \
    "punpcklbw %%mm0, %%mm"#R3" \n\t" \
    "paddw %%mm7, %%mm"#R1" \n\t" \
    "psubw %%mm"#R3", %%mm"#R1" \n\t" \
    "psraw %4, %%mm"#R1" \n\t" \
    "movq %%mm"#R1", "#OFF"(%1) \n\t" \
    "add %2, %0 \n\t"

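/* SHIFT2_LINE emits one row of the (-1, 9, 9, -1) vertical filter:
 * R1 = (9*(R1 + R2) - R0 - R3 + rounder) >> %4, stores the unclamped
 * 16-bit result at OFF(%1) and steps the source pointer one line down. */
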
/** Sacrificing mm6 allows pipelining loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                       const uint8_t *src, x86_reg stride,
                                       int rnd, int64_t shift)
{
    __asm__ volatile(
        "mov $3, %%"REG_c" \n\t"
        LOAD_ROUNDER_MMX("%5")
        "movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
        "1: \n\t"
        "movd (%0), %%mm2 \n\t"
        "add %2, %0 \n\t"
        "movd (%0), %%mm3 \n\t"
        "punpcklbw %%mm0, %%mm2 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        SHIFT2_LINE(  0, 1, 2, 3, 4)
        SHIFT2_LINE( 24, 2, 3, 4, 1)
        SHIFT2_LINE( 48, 3, 4, 1, 2)
        SHIFT2_LINE( 72, 4, 1, 2, 3)
        SHIFT2_LINE( 96, 1, 2, 3, 4)
        SHIFT2_LINE(120, 2, 3, 4, 1)
        SHIFT2_LINE(144, 3, 4, 1, 2)
        SHIFT2_LINE(168, 4, 1, 2, 3)
        "sub %6, %0 \n\t"
        "add $8, %1 \n\t"
        "dec %%"REG_c" \n\t"
        "jnz 1b \n\t"
        : "+r"(src), "+r"(dst)
        : "r"(stride), "r"(-2*stride),
          "m"(shift), "m"(rnd), "r"(9*stride-4)
        : "%"REG_c, "memory"
    );
}

/**
 * Data is already unpacked, so some operations can directly be made from
 * memory.
 */
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
                                             const int16_t *src, int rnd)\
{\
    int h = 8;\
\
    src -= 1;\
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
    __asm__ volatile(\
        LOAD_ROUNDER_MMX("%4")\
        "movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
        "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
        "1: \n\t"\
        "movq 2*0+0(%1), %%mm1 \n\t"\
        "movq 2*0+8(%1), %%mm2 \n\t"\
        "movq 2*1+0(%1), %%mm3 \n\t"\
        "movq 2*1+8(%1), %%mm4 \n\t"\
        "paddw 2*3+0(%1), %%mm1 \n\t"\
        "paddw 2*3+8(%1), %%mm2 \n\t"\
        "paddw 2*2+0(%1), %%mm3 \n\t"\
        "paddw 2*2+8(%1), %%mm4 \n\t"\
        "pmullw %%mm5, %%mm3 \n\t"\
        "pmullw %%mm5, %%mm4 \n\t"\
        "psubw %%mm1, %%mm3 \n\t"\
        "psubw %%mm2, %%mm4 \n\t"\
        NORMALIZE_MMX("$7")\
        /* Remove bias */\
        "paddw %%mm6, %%mm3 \n\t"\
        "paddw %%mm6, %%mm4 \n\t"\
        TRANSFER_DO_PACK(OP)\
        "add $24, %1 \n\t"\
        "add %3, %2 \n\t"\
        "decl %0 \n\t"\
        "jnz 1b \n\t"\
        : "+r"(h), "+r" (src), "+r" (dst)\
        : "r"(stride), "m"(rnd)\
        : "memory"\
    );\
}

VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)


/**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice mm6 for the *9 factor.
 */
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
                                     x86_reg stride, int rnd, x86_reg offset)\
{\
    rnd = 8-rnd;\
    __asm__ volatile(\
        "mov $8, %%"REG_c" \n\t"\
        LOAD_ROUNDER_MMX("%5")\
        "movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
        "1: \n\t"\
        "movd 0(%0 ), %%mm3 \n\t"\
        "movd 4(%0 ), %%mm4 \n\t"\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "add %2, %0 \n\t"\
        "punpcklbw %%mm0, %%mm3 \n\t"\
        "punpcklbw %%mm0, %%mm4 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "paddw %%mm1, %%mm3 \n\t"\
        "paddw %%mm2, %%mm4 \n\t"\
        "movd 0(%0,%3), %%mm1 \n\t"\
        "movd 4(%0,%3), %%mm2 \n\t"\
        "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
        "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
        "movd 0(%0,%2), %%mm1 \n\t"\
        "movd 4(%0,%2), %%mm2 \n\t"\
        "punpcklbw %%mm0, %%mm1 \n\t"\
        "punpcklbw %%mm0, %%mm2 \n\t"\
        "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
        "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
        NORMALIZE_MMX("$4")\
        "packuswb %%mm4, %%mm3 \n\t"\
        OP((%1), %%mm3)\
        "movq %%mm3, (%1) \n\t"\
        "add %6, %0 \n\t"\
        "add %4, %1 \n\t"\
        "dec %%"REG_c" \n\t"\
        "jnz 1b \n\t"\
        : "+r"(src), "+r"(dst)\
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
          "g"(stride-offset)\
        : "%"REG_c, "memory"\
    );\
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK  Macro unpacking arguments from 8 to 16bits (can be empty).
 * @param MOVQ    "movd 1" or "movq 2", if data read is already unpacked.
 * @param A1      Address of 1st tap (beware of unpacked/packed).
 * @param A2      Address of 2nd tap
 * @param A3      Address of 3rd tap
 * @param A4      Address of 4th tap
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
    MOVQ "*0+"A1", %%mm1 \n\t" \
    MOVQ "*4+"A1", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
    MOVQ "*0+"A2", %%mm3 \n\t" \
    MOVQ "*4+"A2", %%mm4 \n\t" \
    UNPACK("%%mm3") \
    UNPACK("%%mm4") \
    "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
    "pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
    "psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
    MOVQ "*0+"A4", %%mm1 \n\t" \
    MOVQ "*4+"A4", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "psllw $2, %%mm1 \n\t" /* 4* */ \
    "psllw $2, %%mm2 \n\t" /* 4* */ \
    "psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
    "psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
    MOVQ "*0+"A3", %%mm1 \n\t" \
    MOVQ "*4+"A3", %%mm2 \n\t" \
    UNPACK("%%mm1") \
    UNPACK("%%mm2") \
    "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
    "pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
    "paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
    "paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */

/**
 * Macro to build the vertical 16bits version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride, \
                                 int rnd, int64_t shift) \
{ \
    int h = 8; \
    src -= src_stride; \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%5") \
        "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6") \
        TRANSFER_DONT_PACK(OP_PUT) \
        /* Last 3 (in fact 4) bytes on the line */ \
        "movd 8+"A1", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "movq %%mm1, %%mm3 \n\t" \
        "paddw %%mm1, %%mm1 \n\t" \
        "paddw %%mm3, %%mm1 \n\t" /* 3* */ \
        "movd 8+"A2", %%mm3 \n\t" \
        DO_UNPACK("%%mm3") \
        "pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
        "psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
        "movd 8+"A3", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
        "paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
        "movd 8+"A4", %%mm1 \n\t" \
        DO_UNPACK("%%mm1") \
        "psllw $2, %%mm1 \n\t" /* 4* */ \
        "psubw %%mm1, %%mm3 \n\t" \
        "paddw %%mm7, %%mm3 \n\t" \
        "psraw %6, %%mm3 \n\t" \
        "movq %%mm3, 16(%2) \n\t" \
        "add %3, %1 \n\t" \
        "add $24, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(src_stride), "r"(3*src_stride), \
          "m"(rnd), "m"(shift) \
        : "memory" \
    ); \
}

/**
 * Macro to build the horizontal 16bits version of vc1_put_shift[13].
 * Here, offset=16bits, so parameters passed A1 to A4 should be simple.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{ \
    int h = 8; \
    src -= 1; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%4") \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7") \
        /* Remove bias */ \
        "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
        "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
        TRANSFER_DO_PACK(OP) \
        "add $24, %1 \n\t" \
        "add %3, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(stride), "m"(rnd) \
        : "memory" \
    ); \
}

/**
 * Macro to build the 8bits, any direction, version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (offset) and %4 (3*offset).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                              x86_reg stride, int rnd, x86_reg offset) \
{ \
    int h = 8; \
    src -= offset; \
    rnd = 32-rnd; \
    __asm__ volatile ( \
        LOAD_ROUNDER_MMX("%6") \
        "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
        "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
        ".p2align 3 \n\t" \
        "1: \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6") \
        TRANSFER_DO_PACK(OP) \
        "add %5, %1 \n\t" \
        "add %5, %2 \n\t" \
        "decl %0 \n\t" \
        "jnz 1b \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
        : "memory" \
    ); \
}

/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

/**
 * Interpolate fractional pel values by applying proper vertical then
 * horizontal filter.
 *
 * @param dst    Destination buffer for interpolated pels.
 * @param src    Source buffer.
 * @param stride Stride for both src and dst buffers.
 * @param hmode  Horizontal filter (expressed in quarter pixels shift).
 * @param vmode  Vertical filter.
 * @param rnd    Rounding bias.
 */
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)\
{\
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
        { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
        { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
        { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
    __asm__ volatile(\
        "pxor %%mm0, %%mm0 \n\t"\
        ::: "memory"\
    );\
\
    if (vmode) { /* Vertical filter to apply */\
        if (hmode) { /* Horizontal filter to apply, output to tmp */\
            static const int shift_value[] = { 0, 5, 1, 5 };\
            int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
            int r;\
            DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
            r = (1<<(shift-1)) + rnd-1;\
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;\
        }\
        else { /* No horizontal filter, output 8 lines to dst */\
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;\
        }\
    }\
\
    /* Horizontal mode with no vertical mode */\
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)

/** Macro to ease bicubic filter interpolation functions declarations */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  int stride, int rnd) \
{ \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}
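
/* DECLARE_FUNCTION expands thin wrappers for every fractional-pel (a, b)
 * combination except (0, 0); that full-pel case is covered by the external
 * ff_put/avg_vc1_mspel_mc00_* functions assigned in the init code below. */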

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
                                        DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (17 * dc + 64) >> 7;
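    /* Splat dc (clamped at 0) into all 8 bytes of mm0 and -dc likewise into
     * mm1; paddusb with mm0 followed by psubusb with mm1 then adds the
     * signed dc to each pixel with saturation at 0 and 255. */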
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
                                        DCTELEM *block)
{
    int dc = block[0];
    dc = (17 * dc + 4) >> 3;
    dc = (12 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
                                        DCTELEM *block)
{
    int dc = block[0];
    dc = ( 3 * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
                                        DCTELEM *block)
{
    int dc = block[0];
    dc = (3 * dc + 1) >> 1;
    dc = (3 * dc + 16) >> 5;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
    dest += 4*linesize;
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dest+0*linesize)),
         "+m"(*(uint32_t*)(dest+1*linesize)),
         "+m"(*(uint32_t*)(dest+2*linesize)),
         "+m"(*(uint32_t*)(dest+3*linesize))
    );
}

av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
    dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
    dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
    dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
    dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
    dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;

    dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
    dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
    dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
    dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
}

av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;

    dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
    dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;

    dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
    dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
    dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
    dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_INLINE_ASM */
612
project/jni/ffmpeg/libavcodec/x86/videodsp.asm
Normal file
@@ -0,0 +1,612 @@
;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width

%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
    mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
    mov r4, r4m ; end_y
    mov r5, r5m ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov w_reg, r7m
    sub w_reg, r6m ; w = end_x - start_x
    sub r5, r4
%if ARCH_X86_64
    sub r4, r3
%else
    sub r4, dword r3m
%endif
    cmp w_reg, 22
    jg .slow_v_extend_loop
%if ARCH_X86_32
    mov r2, r2m ; linesize
%endif
    sal w_reg, 7 ; w * 128
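    ; each fixed-width loop body below is aligned to 128 bytes, so the
    ; handler for width w sits at table base + w*128 and is computed
    ; arithmetically rather than selected by branching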
%ifdef PIC
    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:

    ; horizontal extend (left/right)
    mov w_reg, r6m ; start_x
    sub r0, w_reg
%if ARCH_X86_64
    mov r3, r0 ; backup of buf+block_h*linesize
    mov r5, r8
%else
    mov r0m, r0 ; backup of buf+block_h*linesize
    mov r5, r5m
%endif
    test w_reg, w_reg
    jz .right_extend
    cmp w_reg, 22
    jg .slow_left_extend_loop
    mov r1, w_reg
    dec w_reg
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar w_reg, 1
    sal w_reg, 6
    ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea rax, [.emuedge_extend_left_2]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
    mov r0, r0m
    mov r5, r5m
%endif
    mov w_reg, r7m ; end_x
    mov r1, r8m ; block_w
    mov r4, r1
    sub r1, w_reg
    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
    cmp r1, 22
    jg .slow_right_extend_loop
    dec r1
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar r1, 1
    sal r1, 6
%ifdef PIC
    lea rax, [.emuedge_extend_right_2]
    add r1, rax
%else
    lea r1, [.emuedge_extend_right_2+r1]
%endif
    call r1
.h_extend_end:
    RET

%if ARCH_X86_64
%define vall al
%define valh ah
%define valw ax
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r7w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall bl
%define valh bh
%define valw bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif

%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%src_off) == 8
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov valw2, [r1+%%src_off]
%elifidn %1, body
    mov valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%elifidn %1, body
    mov [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%if ARCH_X86_64
    test r3 , r3 ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
    cmp dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES top, %%n ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES top, %%n ; write bytes
    add r0 , r2 ; dst += linesize
%if ARCH_X86_64
    dec r3d
%else ; ARCH_X86_32
    dec dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
    READ_NUM_BYTES body, %%n ; read bytes
    WRITE_NUM_BYTES body, %%n ; write bytes
    add r0 , r2 ; dst += linesize
    add r1 , r2 ; src += linesize
    dec r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)

    ; copy bottom pixels
    test r5 , r5 ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n ; goto end
    sub r1 , r2 ; src -= linesize
    READ_NUM_BYTES bottom, %%n ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES bottom, %%n ; write bytes
    add r0 , r2 ; dst += linesize
    dec r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8

%macro READ_V_PIXEL 2
    mov vall, %2
    mov valh, vall
%if %1 >= 8
    movd mm0, vald
%if cpuflag(mmxext)
    pshufw mm0, mm0, 0
%else ; mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%endif ; mmxext/mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov [%2+%%dst_off] , valw
    mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+r1] ; read pixels
    WRITE_V_PIXEL %%n, r0 ; write pixels
    dec r5
    jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%if ARCH_X86_64
    sub r3, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
    dec r8
%else ; ARCH_X86_32
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
    dec r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%if ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
%macro V_COPY_NPX 4-5
%if %0 == 4
    test w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3 %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add cnt_reg, %4
%if %0 == 5
    sub w_reg, %4
    test w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 2
%ifidn %1, bottom
    sub r1, linesize
%endif
.%1_copy_loop:
    xor cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; sse
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax , mov, 8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; sse
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1
    mov w_reg, cnt_reg
%ifidn %1, body
    add r1, linesize
%endif
    add r0, linesize
    dec %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
    push r8 ; save old value of block_h
    test r3, r3
%define cnt_reg r8
    jz .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3
%else
    cmp dword r3m, 0
%define cnt_reg r2
    je .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
%endif

.do_body_copy:
    V_COPY_ROW body, r4

%if ARCH_X86_64
    pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
    test r5, r5
%if ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
    mov r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    mov r4, 8
    sub r0, linesize
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add r4, 8
    cmp r4, w_reg
    jle .left_extend_8px_loop
    sub r4, 8
    cmp r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov [r0+r4], valw
    add r4, 2
    cmp r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec r5
    jnz .slow_left_extend_loop
%if ARCH_X86_32
    mov r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea r1, [r4-8]
    sub buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub r1, 8
    cmp r1, w_reg
    jge .right_extend_8px_loop
    add r1, 8
    cmp r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub r1, 2
    mov [buf_reg+r1], valw
    cmp r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro

emu_edge sse
%if ARCH_X86_32
emu_edge mmx
%endif

%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    %1 [bufq]
    add bufq, strideq
    dec hd
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif
127
project/jni/ffmpeg/libavcodec/x86/videodsp_init.c
Normal file
@@ -0,0 +1,127 @@
/*
 * Copyright (C) 2002-2012 Michael Niedermayer
 * Copyright (C) 2012 Ronald S. Bultje
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/videodsp.h"

#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
                                x86_reg linesize, x86_reg start_y,
                                x86_reg end_y, x86_reg block_h,
                                x86_reg start_x, x86_reg end_x,
                                x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;

static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t linesize_arg,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;
    int linesize = linesize_arg;

    if (!w || !h)
        return;

    if (src_y >= h) {
        src -= src_y*linesize;
        src_y_add = h - 1;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src -= src_y*linesize;
        src_y_add = 1 - block_h;
        src_y = 1 - block_h;
    }
    if (src_x >= w) {
        src += w - 1 - src_x;
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src += 1 - block_w - src_x;
        src_x = 1 - block_w;
    }
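
    /* After the clamps above, src_x/src_y lie in (-block_w, w) and
     * (-block_h, h) respectively, so the block always overlaps the source
     * plane and the copy region computed below is non-empty. */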

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y = FFMIN(block_h, h-src_y);
    end_x = FFMIN(block_w, w-src_x);
    av_assert2(start_x < end_x && block_w > 0);
    av_assert2(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}

#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             ptrdiff_t linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             ptrdiff_t linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */

void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);

void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
#if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (bpc <= 8 && mm_flags & AV_CPU_FLAG_MMX) {
        ctx->emulated_edge_mc = emulated_edge_mc_mmx;
    }
    if (mm_flags & AV_CPU_FLAG_3DNOW) {
        ctx->prefetch = ff_prefetch_3dnow;
    }
#endif /* ARCH_X86_32 */
    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        ctx->prefetch = ff_prefetch_mmxext;
    }
    if (bpc <= 8 && mm_flags & AV_CPU_FLAG_SSE) {
        ctx->emulated_edge_mc = emulated_edge_mc_sse;
    }
#endif /* HAVE_YASM */
}
654
project/jni/ffmpeg/libavcodec/x86/vp3dsp.asm
Normal file
@@ -0,0 +1,654 @@
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

; MMX-optimized functions cribbed from the original VP3 source code.

SECTION_RODATA

vp3_idct_data: times 8 dw 64277
               times 8 dw 60547
               times 8 dw 54491
               times 8 dw 46341
               times 8 dw 36410
               times 8 dw 25080
               times 8 dw 12785

cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
cextern pb_80
cextern pb_81

cextern pw_8

SECTION .text

; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
%macro VP3_LOOP_FILTER 0
    movq          m7, m6
    pand          m6, [pb_7]    ; p0&7
    psrlw         m7, 3
    pand          m7, [pb_1F]   ; p0>>3
    movq          m3, m2        ; p2
    pxor          m2, m4
    pand          m2, [pb_1]    ; (p2^p1)&1
    movq          m5, m2
    paddb         m2, m2
    paddb         m2, m5        ; 3*(p2^p1)&1
    paddb         m2, m6        ; extra bits lost in shifts
    pcmpeqb       m0, m0
    pxor          m1, m0        ; 255 - p3
    pavgb         m1, m2        ; (256 - p3 + extrabits) >> 1
    pxor          m0, m4        ; 255 - p1
    pavgb         m0, m3        ; (256 + p2-p1) >> 1
    paddb         m1, [pb_3]
    pavgb         m1, m0        ; 128+2+(  p2-p1  - p3) >> 2
    pavgb         m1, m0        ; 128+1+(3*(p2-p1) - p3) >> 3
    paddusb       m7, m1        ; d+128+1
    movq          m6, [pb_81]
    psubusb       m6, m7
    psubusb       m7, [pb_81]

    movq          m5, [r2+516]  ; flim
    pminub        m6, m5
    pminub        m7, m5
    movq          m0, m6
    movq          m1, m7
    paddb         m6, m6
    paddb         m7, m7
    pminub        m6, m5
    pminub        m7, m5
    psubb         m6, m0
    psubb         m7, m1
    paddusb       m4, m7
    psubusb       m4, m6
    psubusb       m3, m7
    paddusb       m3, m6
%endmacro

%macro STORE_4_WORDS 1
    movd         r2d, %1
    mov  [r0     -1], r2w
    psrlq         %1, 32
    shr           r2, 16
    mov  [r0+r1  -1], r2w
    movd         r2d, %1
    mov  [r0+r1*2-1], r2w
    shr           r2, 16
    mov  [r0+r3  -1], r2w
%endmacro

INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    mov           r3, r1
    neg           r1
    movq          m6, [r0+r1*2]
    movq          m4, [r0+r1  ]
    movq          m2, [r0     ]
    movq          m1, [r0+r3  ]

    VP3_LOOP_FILTER

    movq     [r0+r1], m4
    movq     [r0   ], m3
    RET

cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    lea           r3, [r1*3]

    movd          m6, [r0     -2]
    movd          m4, [r0+r1  -2]
    movd          m2, [r0+r1*2-2]
    movd          m1, [r0+r3  -2]
    lea           r0, [r0+r1*4  ]
    punpcklbw     m6, [r0     -2]
    punpcklbw     m4, [r0+r1  -2]
    punpcklbw     m2, [r0+r1*2-2]
    punpcklbw     m1, [r0+r3  -2]
    sub           r0, r3
    sub           r0, r1

    TRANSPOSE4x4B 6, 4, 2, 1, 0
    VP3_LOOP_FILTER
    SBUTTERFLY    bw, 4, 3, 5

    STORE_4_WORDS m4
    lea           r0, [r0+r1*4  ]
    STORE_4_WORDS m3
    RET

; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
    movq          m2, I(3)
    movq          m6, C(3)
    movq          m4, m2
    movq          m7, J(5)
    pmulhw        m4, m6        ; r4 = c3*i3 - i3
    movq          m1, C(5)
    pmulhw        m6, m7        ; r6 = c3*i5 - i5
    movq          m5, m1
    pmulhw        m1, m2        ; r1 = c5*i3 - i3
    movq          m3, I(1)
    pmulhw        m5, m7        ; r5 = c5*i5 - i5
    movq          m0, C(1)
    paddw         m4, m2        ; r4 = c3*i3
    paddw         m6, m7        ; r6 = c3*i5
    paddw         m2, m1        ; r2 = c5*i3
    movq          m1, J(7)
    paddw         m7, m5        ; r7 = c5*i5
    movq          m5, m0        ; r5 = c1
    pmulhw        m0, m3        ; r0 = c1*i1 - i1
    paddsw        m4, m7        ; r4 = C = c3*i3 + c5*i5
    pmulhw        m5, m1        ; r5 = c1*i7 - i7
    movq          m7, C(7)
    psubsw        m6, m2        ; r6 = D = c3*i5 - c5*i3
    paddw         m0, m3        ; r0 = c1*i1
    pmulhw        m3, m7        ; r3 = c7*i1
    movq          m2, I(2)
    pmulhw        m7, m1        ; r7 = c7*i7
    paddw         m5, m1        ; r5 = c1*i7
    movq          m1, m2        ; r1 = i2
    pmulhw        m2, C(2)      ; r2 = c2*i2 - i2
    psubsw        m3, m5        ; r3 = B = c7*i1 - c1*i7
    movq          m5, J(6)
    paddsw        m0, m7        ; r0 = A = c1*i1 + c7*i7
    movq          m7, m5        ; r7 = i6
    psubsw        m0, m4        ; r0 = A - C
    pmulhw        m5, C(2)      ; r5 = c2*i6 - i6
    paddw         m2, m1        ; r2 = c2*i2
    pmulhw        m1, C(6)      ; r1 = c6*i2
    paddsw        m4, m4        ; r4 = C + C
    paddsw        m4, m0        ; r4 = C. = A + C
    psubsw        m3, m6        ; r3 = B - D
    paddw         m5, m7        ; r5 = c2*i6
    paddsw        m6, m6        ; r6 = D + D
    pmulhw        m7, C(6)      ; r7 = c6*i6
    paddsw        m6, m3        ; r6 = D. = B + D
    movq        I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; r1 = H = c6*i2 - c2*i6
    movq          m4, C(4)
    movq          m5, m3        ; r5 = B - D
    pmulhw        m3, m4        ; r3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; r7 = G = c2*i2 + c6*i6
    movq        I(2), m6        ; save D. at I(2)
    movq          m2, m0        ; r2 = A - C
    movq          m6, I(0)
    pmulhw        m0, m4        ; r0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; r5 = B. = c4 * (B - D)
    movq          m3, J(4)
    psubsw        m5, m1        ; r5 = B.. = B. - H
    paddw         m2, m0        ; r2 = A. = c4 * (A - C)
    psubsw        m6, m3        ; r6 = i0 - i4
    movq          m0, m6
    pmulhw        m6, m4        ; r6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; r3 = i4 + i4
    paddsw        m1, m1        ; r1 = H + H
    paddsw        m3, m0        ; r3 = i0 + i4
    paddsw        m1, m5        ; r1 = H. = B + H
    pmulhw        m4, m3        ; r4 = (c4 - 1) * (i0 + i4)
    paddsw        m6, m0        ; r6 = F = c4 * (i0 - i4)
    psubsw        m6, m2        ; r6 = F. = F - A.
    paddsw        m2, m2        ; r2 = A. + A.
    movq          m0, I(1)      ; r0 = C.
    paddsw        m2, m6        ; r2 = A.. = F + A.
    paddw         m4, m3        ; r4 = E = c4 * (i0 + i4)
    psubsw        m2, m1        ; r2 = R2 = A.. - H.
%endmacro

; RowIDCT gets ready to transpose
%macro RowIDCT 0
    BeginIDCT
    movq          m3, I(2)      ; r3 = D.
    psubsw        m4, m7        ; r4 = E. = E - G
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m7, m7        ; r7 = G + G
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    paddsw        m7, m4        ; r7 = G. = E + G
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m3, m3
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    paddsw        m5, m5
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m0, m0
    movq        I(1), m1        ; save R1
    paddsw        m0, m7        ; r0 = R0 = G. + C.
%endmacro

; Column IDCT normalizes and stores final results
%macro ColumnIDCT 0
    BeginIDCT
    paddsw        m2, OC_8      ; adjust R2 (and R1) for shift
    paddsw        m1, m1        ; r1 = H. + H.
    paddsw        m1, m2        ; r1 = R1 = A.. + H.
    psraw         m2, 4         ; r2 = NR2
    psubsw        m4, m7        ; r4 = E. = E - G
    psraw         m1, 4         ; r1 = NR1
    movq          m3, I(2)      ; r3 = D.
    paddsw        m7, m7        ; r7 = G + G
    movq        I(2), m2        ; store NR2 at I2
    paddsw        m7, m4        ; r7 = G. = E + G
    movq        I(1), m1        ; store NR1 at I1
    psubsw        m4, m3        ; r4 = R4 = E. - D.
    paddsw        m4, OC_8      ; adjust R4 (and R3) for shift
    paddsw        m3, m3        ; r3 = D. + D.
    paddsw        m3, m4        ; r3 = R3 = E. + D.
    psraw         m4, 4         ; r4 = NR4
    psubsw        m6, m5        ; r6 = R6 = F. - B..
    psraw         m3, 4         ; r3 = NR3
    paddsw        m6, OC_8      ; adjust R6 (and R5) for shift
    paddsw        m5, m5        ; r5 = B.. + B..
    paddsw        m5, m6        ; r5 = R5 = F. + B..
    psraw         m6, 4         ; r6 = NR6
    movq        J(4), m4        ; store NR4 at J4
    psraw         m5, 4         ; r5 = NR5
    movq        I(3), m3        ; store NR3 at I3
    psubsw        m7, m0        ; r7 = R7 = G. - C.
    paddsw        m7, OC_8      ; adjust R7 (and R0) for shift
    paddsw        m0, m0        ; r0 = C. + C.
    paddsw        m0, m7        ; r0 = R0 = G. + C.
    psraw         m7, 4         ; r7 = NR7
    movq        J(6), m6        ; store NR6 at J6
    psraw         m0, 4         ; r0 = NR0
    movq        J(5), m5        ; store NR5 at J5
    movq        J(7), m7        ; store NR7 at J7
    movq        I(0), m0        ; store NR0 at I0
%endmacro

; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
;   r0 = a3 a2 a1 a0
; I(1) = b3 b2 b1 b0
;   r2 = c3 c2 c1 c0
;   r3 = d3 d2 d1 d0
;
;   r4 = e3 e2 e1 e0
;   r5 = f3 f2 f1 f0
;   r6 = g3 g2 g1 g0
;   r7 = h3 h2 h1 h0
;
; At exit, we have:
;
; I(0) = d0 c0 b0 a0
; I(1) = d1 c1 b1 a1
; I(2) = d2 c2 b2 a2
; I(3) = d3 c3 b3 a3
;
; J(4) = h0 g0 f0 e0
; J(5) = h1 g1 f1 e1
; J(6) = h2 g2 f2 e2
; J(7) = h3 g3 f3 e3
;
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
;
; Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
    movq          m1, m4        ; r1 = e3 e2 e1 e0
    punpcklwd     m4, m5        ; r4 = f1 e1 f0 e0
    movq        I(0), m0        ; save a3 a2 a1 a0
    punpckhwd     m1, m5        ; r1 = f3 e3 f2 e2
    movq          m0, m6        ; r0 = g3 g2 g1 g0
    punpcklwd     m6, m7        ; r6 = h1 g1 h0 g0
    movq          m5, m4        ; r5 = f1 e1 f0 e0
    punpckldq     m4, m6        ; r4 = h0 g0 f0 e0 = R4
    punpckhdq     m5, m6        ; r5 = h1 g1 f1 e1 = R5
    movq          m6, m1        ; r6 = f3 e3 f2 e2
    movq        J(4), m4
    punpckhwd     m0, m7        ; r0 = h3 g3 h2 g2
    movq        J(5), m5
    punpckhdq     m6, m0        ; r6 = h3 g3 f3 e3 = R7
    movq          m4, I(0)      ; r4 = a3 a2 a1 a0
    punpckldq     m1, m0        ; r1 = h2 g2 f2 e2 = R6
    movq          m5, I(1)      ; r5 = b3 b2 b1 b0
    movq          m0, m4        ; r0 = a3 a2 a1 a0
    movq        J(7), m6
    punpcklwd     m0, m5        ; r0 = b1 a1 b0 a0
    movq        J(6), m1
    punpckhwd     m4, m5        ; r4 = b3 a3 b2 a2
    movq          m5, m2        ; r5 = c3 c2 c1 c0
    punpcklwd     m2, m3        ; r2 = d1 c1 d0 c0
    movq          m1, m0        ; r1 = b1 a1 b0 a0
    punpckldq     m0, m2        ; r0 = d0 c0 b0 a0 = R0
    punpckhdq     m1, m2        ; r1 = d1 c1 b1 a1 = R1
    movq          m2, m4        ; r2 = b3 a3 b2 a2
    movq        I(0), m0
    punpckhwd     m5, m3        ; r5 = d3 c3 d2 c2
    movq        I(1), m1
    punpckhdq     m4, m5        ; r4 = d3 c3 b3 a3 = R3
    punpckldq     m2, m5        ; r2 = d2 c2 b2 a2 = R2
    movq        I(3), m4
    movq        I(2), m2
%endmacro

%macro VP3_1D_IDCT_SSE2 0
    movdqa        m2, I(3)      ; xmm2 = i3
    movdqa        m6, C(3)      ; xmm6 = c3
    movdqa        m4, m2        ; xmm4 = i3
    movdqa        m7, I(5)      ; xmm7 = i5
    pmulhw        m4, m6        ; xmm4 = c3 * i3 - i3
    movdqa        m1, C(5)      ; xmm1 = c5
    pmulhw        m6, m7        ; xmm6 = c3 * i5 - i5
    movdqa        m5, m1        ; xmm5 = c5
    pmulhw        m1, m2        ; xmm1 = c5 * i3 - i3
    movdqa        m3, I(1)      ; xmm3 = i1
    pmulhw        m5, m7        ; xmm5 = c5 * i5 - i5
    movdqa        m0, C(1)      ; xmm0 = c1
    paddw         m4, m2        ; xmm4 = c3 * i3
    paddw         m6, m7        ; xmm6 = c3 * i5
    paddw         m2, m1        ; xmm2 = c5 * i3
    movdqa        m1, I(7)      ; xmm1 = i7
    paddw         m7, m5        ; xmm7 = c5 * i5
    movdqa        m5, m0        ; xmm5 = c1
    pmulhw        m0, m3        ; xmm0 = c1 * i1 - i1
    paddsw        m4, m7        ; xmm4 = c3 * i3 + c5 * i5 = C
    pmulhw        m5, m1        ; xmm5 = c1 * i7 - i7
    movdqa        m7, C(7)      ; xmm7 = c7
    psubsw        m6, m2        ; xmm6 = c3 * i5 - c5 * i3 = D
    paddw         m0, m3        ; xmm0 = c1 * i1
    pmulhw        m3, m7        ; xmm3 = c7 * i1
    movdqa        m2, I(2)      ; xmm2 = i2
    pmulhw        m7, m1        ; xmm7 = c7 * i7
    paddw         m5, m1        ; xmm5 = c1 * i7
    movdqa        m1, m2        ; xmm1 = i2
    pmulhw        m2, C(2)      ; xmm2 = c2 * i2 - i2
    psubsw        m3, m5        ; xmm3 = c7 * i1 - c1 * i7 = B
    movdqa        m5, I(6)      ; xmm5 = i6
    paddsw        m0, m7        ; xmm0 = c1 * i1 + c7 * i7 = A
    movdqa        m7, m5        ; xmm7 = i6
    psubsw        m0, m4        ; xmm0 = A - C
    pmulhw        m5, C(2)      ; xmm5 = c2 * i6 - i6
    paddw         m2, m1        ; xmm2 = c2 * i2
    pmulhw        m1, C(6)      ; xmm1 = c6 * i2
    paddsw        m4, m4        ; xmm4 = C + C
    paddsw        m4, m0        ; xmm4 = A + C = C.
    psubsw        m3, m6        ; xmm3 = B - D
    paddw         m5, m7        ; xmm5 = c2 * i6
    paddsw        m6, m6        ; xmm6 = D + D
    pmulhw        m7, C(6)      ; xmm7 = c6 * i6
    paddsw        m6, m3        ; xmm6 = B + D = D.
    movdqa      I(1), m4        ; save C. at I(1)
    psubsw        m1, m5        ; xmm1 = c6 * i2 - c2 * i6 = H
    movdqa        m4, C(4)      ; xmm4 = c4
    movdqa        m5, m3        ; xmm5 = B - D
    pmulhw        m3, m4        ; xmm3 = (c4 - 1) * (B - D)
    paddsw        m7, m2        ; xmm7 = c2 * i2 + c6 * i6 = G
    movdqa      I(2), m6        ; save D. at I(2)
    movdqa        m2, m0        ; xmm2 = A - C
    movdqa        m6, I(0)      ; xmm6 = i0
    pmulhw        m0, m4        ; xmm0 = (c4 - 1) * (A - C)
    paddw         m5, m3        ; xmm5 = c4 * (B - D) = B.
    movdqa        m3, I(4)      ; xmm3 = i4
    psubsw        m5, m1        ; xmm5 = B. - H = B..
    paddw         m2, m0        ; xmm2 = c4 * (A - C) = A.
    psubsw        m6, m3        ; xmm6 = i0 - i4
    movdqa        m0, m6        ; xmm0 = i0 - i4
    pmulhw        m6, m4        ; xmm6 = (c4 - 1) * (i0 - i4)
    paddsw        m3, m3        ; xmm3 = i4 + i4
    paddsw        m1, m1        ; xmm1 = H + H
    paddsw        m3, m0        ; xmm3 = i0 + i4
    paddsw        m1, m5        ; xmm1 = B. + H = H.
    pmulhw        m4, m3        ; xmm4 = (c4 - 1) * (i0 + i4)
    paddw         m6, m0        ; xmm6 = c4 * (i0 - i4) = F
    psubsw        m6, m2        ; xmm6 = F - A. = F.
    paddsw        m2, m2        ; xmm2 = A. + A.
    movdqa        m0, I(1)      ; load C. from I(1)
    paddsw        m2, m6        ; xmm2 = F + A. = A..
    paddw         m4, m3        ; xmm4 = c4 * (i0 + i4) = E
    psubsw        m2, m1        ; xmm2 = A.. - H. = R2
    ADD(m2)                     ; adjust R2 and R1 before shifting
    paddsw        m1, m1        ; xmm1 = H. + H.
    paddsw        m1, m2        ; xmm1 = A.. + H. = R1
    SHIFT(m2)                   ; xmm2 = op2
    psubsw        m4, m7        ; xmm4 = E - G = E.
    SHIFT(m1)                   ; xmm1 = op1
    movdqa        m3, I(2)      ; load D. from I(2)
    paddsw        m7, m7        ; xmm7 = G + G
    paddsw        m7, m4        ; xmm7 = E + G = G.
    psubsw        m4, m3        ; xmm4 = E. - D. = R4
    ADD(m4)                     ; adjust R4 and R3 before shifting
    paddsw        m3, m3        ; xmm3 = D. + D.
    paddsw        m3, m4        ; xmm3 = E. + D. = R3
    SHIFT(m4)                   ; xmm4 = op4
    psubsw        m6, m5        ; xmm6 = F. - B.. = R6
    SHIFT(m3)                   ; xmm3 = op3
    ADD(m6)                     ; adjust R6 and R5 before shifting
    paddsw        m5, m5        ; xmm5 = B.. + B..
    paddsw        m5, m6        ; xmm5 = F. + B.. = R5
    SHIFT(m6)                   ; xmm6 = op6
    SHIFT(m5)                   ; xmm5 = op5
    psubsw        m7, m0        ; xmm7 = G. - C. = R7
    ADD(m7)                     ; adjust R7 and R0 before shifting
    paddsw        m0, m0        ; xmm0 = C. + C.
    paddsw        m0, m7        ; xmm0 = G. + C. = R0
    SHIFT(m7)                   ; xmm7 = op7
    SHIFT(m0)                   ; xmm0 = op0
%endmacro

%macro PUT_BLOCK 8
    movdqa      O(0), m%1
    movdqa      O(1), m%2
    movdqa      O(2), m%3
    movdqa      O(3), m%4
    movdqa      O(4), m%5
    movdqa      O(5), m%6
    movdqa      O(6), m%7
    movdqa      O(7), m%8
%endmacro

%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
    VP3_1D_IDCT_SSE2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7

%define SHIFT(x) psraw  x, 4
%define ADD(x)   paddsw x, [pw_8]
    VP3_1D_IDCT_SSE2
    PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
    ; eax = quantized input
    ; ebx = dequantizer matrix
    ; ecx = IDCT constants
    ;  M(I) = ecx + MaskOffset(0) + I * 8
    ;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
    ; edx = output
    ; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]

    ; at this point, function has completed dequantization + dezigzag +
    ; partial transposition; now do the idct itself
%define I(x) [%1+16* x     ]
%define J(x) [%1+16*(x-4)+8]
    RowIDCT
    Transpose

%define I(x) [%1+16* x    +64]
%define J(x) [%1+16*(x-4)+72]
    RowIDCT
    Transpose

%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
    ColumnIDCT

%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
    ColumnIDCT
%endif ; mmsize == 16/8
%endmacro

%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
    VP3_IDCT      r2

    movsxdifnidn  r1, r1d
    mova          m4, [pb_80]
    lea           r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
    mova          m0, [r2+mmsize*0+%%i]
    mova          m1, [r2+mmsize*2+%%i]
    mova          m2, [r2+mmsize*4+%%i]
    mova          m3, [r2+mmsize*6+%%i]
    packsswb      m0, [r2+mmsize*1+%%i]
    packsswb      m1, [r2+mmsize*3+%%i]
    packsswb      m2, [r2+mmsize*5+%%i]
    packsswb      m3, [r2+mmsize*7+%%i]
    paddb         m0, m4
    paddb         m1, m4
    paddb         m2, m4
    paddb         m3, m4
    movq   [r0     ], m0
%if mmsize == 8
    movq   [r0+r1  ], m1
    movq   [r0+r1*2], m2
    movq   [r0+r3  ], m3
%else
    movhps [r0+r1  ], m0
    movq   [r0+r1*2], m1
    movhps [r0+r3  ], m1
%endif
%if %%i == 0
    lea           r0, [r0+r1*4]
%endif
%if mmsize == 16
    movq   [r0     ], m2
    movhps [r0+r1  ], m2
    movq   [r0+r1*2], m3
    movhps [r0+r3  ], m3
%endif
%assign %%i %%i+64
%endrep
    RET

cglobal vp3_idct_add, 3, 4, 9
    VP3_IDCT      r2

    mov           r3, 4
    pxor          m4, m4
    movsxdifnidn  r1, r1d
.loop:
    movq          m0, [r0]
    movq          m1, [r0+r1]
%if mmsize == 8
    mova          m2, m0
    mova          m3, m1
%endif
    punpcklbw     m0, m4
    punpcklbw     m1, m4
%if mmsize == 8
    punpckhbw     m2, m4
    punpckhbw     m3, m4
%endif
    paddsw        m0, [r2+ 0]
    paddsw        m1, [r2+16]
%if mmsize == 8
    paddsw        m2, [r2+ 8]
    paddsw        m3, [r2+24]
    packuswb      m0, m2
    packuswb      m1, m3
%else ; mmsize == 16
    packuswb      m0, m1
%endif
    movq     [r0   ], m0
%if mmsize == 8
    movq     [r0+r1], m1
%else ; mmsize == 16
    movhps   [r0+r1], m0
%endif
    lea           r0, [r0+r1*2]
    add           r2, 32
    dec           r3
    jg .loop
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif

INIT_XMM sse2
vp3_idct_funcs

%macro DC_ADD 0
    movq          m2, [r0     ]
    movq          m3, [r0+r1  ]
    paddusb       m2, m0
    movq          m4, [r0+r1*2]
    paddusb       m3, m0
    movq          m5, [r0+r3  ]
    paddusb       m4, m0
    paddusb       m5, m0
    psubusb       m2, m1
    psubusb       m3, m1
    movq   [r0     ], m2
    psubusb       m4, m1
    movq   [r0+r1  ], m3
    psubusb       m5, m1
    movq   [r0+r1*2], m4
    movq   [r0+r3  ], m5
%endmacro

INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
    movsxd        r1, r1d
%endif
    lea           r3, [r1*3]
    movsx         r2, word [r2]
    add           r2, 15
    sar           r2, 5
    movd          m0, r2d
    pshufw        m0, m0, 0x0
    pxor          m1, m1
    psubw         m1, m0
    packuswb      m0, m0
    packuswb      m1, m1
    DC_ADD
    lea           r0, [r0+r1*4]
    DC_ADD
    RET
68
project/jni/ffmpeg/libavcodec/x86/vp3dsp_init.c
Normal file
@@ -0,0 +1,68 @@
/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/vp3dsp.h"
#include "config.h"

void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);

void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);

void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
                               const DCTELEM *block);

void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
                                 int *bounding_values);
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
                                 int *bounding_values);

av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
    int cpuflags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_MMX(cpuflags)) {
        c->idct_put  = ff_vp3_idct_put_mmx;
        c->idct_add  = ff_vp3_idct_add_mmx;
        c->idct_perm = FF_PARTTRANS_IDCT_PERM;
    }
#endif

    if (EXTERNAL_MMXEXT(cpuflags)) {
        c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;

        if (!(flags & CODEC_FLAG_BITEXACT)) {
            c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
            c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpuflags)) {
        c->idct_put  = ff_vp3_idct_put_sse2;
        c->idct_add  = ff_vp3_idct_add_sse2;
        c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
    }
}
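
/*
 * Editorial sketch (not in the original commit): the scalar equivalent of
 * the DC-only IDCT add that ff_vp3_idct_dc_add_mmxext() implements, shown
 * for reference; the (dc + 15) >> 5 rounding matches the asm. Kept out of
 * the build.
 */
#if 0
static void vp3_idct_dc_add_c_sketch(uint8_t *dest, int line_size,
                                     const DCTELEM *block)
{
    int i, j, dc = (block[0] + 15) >> 5;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            dest[j] = av_clip_uint8(dest[j] + dc);  /* saturate, like paddusb/psubusb */
        dest += line_size;
    }
}
#endif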
54
project/jni/ffmpeg/libavcodec/x86/vp56_arith.h
Normal file
@@ -0,0 +1,54 @@
/**
 * VP5 and VP6 compatible video decoder (arith decoder)
 *
 * Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
 * Copyright (C) 2010 Eli Friedman
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVCODEC_X86_VP56_ARITH_H
#define AVCODEC_X86_VP56_ARITH_H

#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
#define vp56_rac_get_prob vp56_rac_get_prob
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
    unsigned int code_word = vp56_rac_renorm(c);
    unsigned int high      = c->high;
    unsigned int low       = 1 + (((high - 1) * prob) >> 8);
    unsigned int low_shift = low << 16;
    int bit = 0;

    __asm__(
        "subl  %4, %1      \n\t"
        "subl  %3, %2      \n\t"
        "leal (%2, %3), %3 \n\t"
        "setae %b0         \n\t"
        "cmovb %4, %1      \n\t"
        "cmovb %3, %2      \n\t"
        : "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
        : "r"(low)
    );

    c->high      = high;
    c->code_word = code_word;
    return bit;
}
#endif
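
/*
 * Editorial sketch (not part of the original commit): the branch-free C
 * logic that the cmov-based asm above computes, mirroring the generic
 * vp56_rac_get_prob() in vp56.h. Kept out of the build.
 */
#if 0
static av_always_inline int vp56_rac_get_prob_c_sketch(VP56RangeCoder *c,
                                                       uint8_t prob)
{
    unsigned int code_word = vp56_rac_renorm(c);
    unsigned int low       = 1 + (((c->high - 1) * prob) >> 8);
    unsigned int low_shift = low << 16;
    int          bit       = code_word >= low_shift;

    /* the asm's cmovb pair selects between the two outcomes without a branch */
    c->high      = bit ? c->high - low : low;
    c->code_word = bit ? code_word - low_shift : code_word;
    return bit;
}
#endif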

#endif /* AVCODEC_X86_VP56_ARITH_H */
170
project/jni/ffmpeg/libavcodec/x86/vp56dsp.asm
Normal file
@@ -0,0 +1,170 @@
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_64

SECTION .text

%macro DIAG4 6
%if mmsize == 8
    movq          m0, [%1+%2]
    movq          m1, [%1+%3]
    movq          m3, m0
    movq          m4, m1
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpckhbw     m3, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw        m1, [rsp+8*12] ; src[x   ] * biweight [1]
    pmullw        m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
    pmullw        m4, [rsp+8*12] ; src[x   ] * biweight [1]
    paddw         m0, m1
    paddw         m3, m4
    movq          m1, [%1+%4]
    movq          m2, [%1+%5]
    movq          m4, m1
    movq          m5, m2
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m4, m7
    punpckhbw     m5, m7
    pmullw        m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw        m2, [rsp+8*14] ; src[x+16] * biweight [3]
    pmullw        m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
    pmullw        m5, [rsp+8*14] ; src[x+16] * biweight [3]
    paddw         m1, m2
    paddw         m4, m5
    paddsw        m0, m1
    paddsw        m3, m4
    paddsw        m0, m6         ; Add 64
    paddsw        m3, m6         ; Add 64
    psraw         m0, 7
    psraw         m3, 7
    packuswb      m0, m3
    movq        [%6], m0
%else ; mmsize == 16
    movq          m0, [%1+%2]
    movq          m1, [%1+%3]
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4         ; src[x-8 ] * biweight [0]
    pmullw        m1, m5         ; src[x   ] * biweight [1]
    paddw         m0, m1
    movq          m1, [%1+%4]
    movq          m2, [%1+%5]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    pmullw        m1, m6         ; src[x+8 ] * biweight [2]
    pmullw        m2, m3         ; src[x+16] * biweight [3]
    paddw         m1, m2
    paddsw        m0, m1
    paddsw        m0, [pw_64]    ; Add 64
    psraw         m0, 7
    packuswb      m0, m0
    movq        [%6], m0
%endif ; mmsize == 8/16
%endmacro

%macro SPLAT4REGS 0
%if mmsize == 8
    movq          m5, m3
    punpcklwd     m3, m3
    movq          m4, m3
    punpckldq     m3, m3
    punpckhdq     m4, m4
    punpckhwd     m5, m5
    movq          m2, m5
    punpckhdq     m2, m2
    punpckldq     m5, m5
    movq [rsp+8*11], m3
    movq [rsp+8*12], m4
    movq [rsp+8*13], m5
    movq [rsp+8*14], m2
%else ; mmsize == 16
    pshuflw       m4, m3, 0x0
    pshuflw       m5, m3, 0x55
    pshuflw       m6, m3, 0xAA
    pshuflw       m3, m3, 0xFF
    punpcklqdq    m4, m4
    punpcklqdq    m5, m5
    punpcklqdq    m6, m6
    punpcklqdq    m3, m3
%endif ; mmsize == 8/16
%endmacro

%macro vp6_filter_diag4 0
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
;                                const int16_t h_weight[4], const int16_t v_weights[4])
cglobal vp6_filter_diag4, 5, 7, 8
    mov           r5, rsp          ; backup stack pointer
    and          rsp, ~(mmsize-1)  ; align stack
%if mmsize == 16
    sub          rsp, 8*11
%else
    sub          rsp, 8*15
    movq          m6, [pw_64]
%endif
%if ARCH_X86_64
    movsxd        r2, r2d
%endif

    sub           r1, r2

    pxor          m7, m7
    movq          m3, [r3]
    SPLAT4REGS

    mov           r3, rsp
    mov           r6, 11
.nextrow:
    DIAG4         r1, -1, 0, 1, 2, r3
    add           r3, 8
    add           r1, r2
    dec           r6
    jnz .nextrow

    movq          m3, [r4]
    SPLAT4REGS

    lea           r3, [rsp+8]
    mov           r6, 8
.nextcol:
    DIAG4         r3, -8, 0, 8, 16, r0
    add           r3, 8
    add           r0, r2
    dec           r6
    jnz .nextcol

    mov          rsp, r5           ; restore stack pointer
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vp6_filter_diag4
%endif

INIT_XMM sse2
vp6_filter_diag4
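
/*
 * Editorial sketch (not part of the original commit): scalar C form of the
 * two-pass 4-tap filter that the DIAG4/SPLAT4REGS macros above vectorize.
 * The weights sum to 128, so +64 and >>7 match the asm rounding. Kept out
 * of any build; av_clip_uint8() is assumed from libavutil/common.h.
 */
#if 0
static void vp6_filter_diag4_c_sketch(uint8_t *dst, const uint8_t *src,
                                      int stride, const int16_t *h_weights,
                                      const int16_t *v_weights)
{
    uint8_t tmp[8 * 11], *t = tmp;
    int x, y;

    src -= stride;                 /* start one row above dst, as the asm does */
    for (y = 0; y < 11; y++) {     /* horizontal pass over 11 rows */
        for (x = 0; x < 8; x++) {
            int v = src[x - 1] * h_weights[0] + src[x    ] * h_weights[1] +
                    src[x + 1] * h_weights[2] + src[x + 2] * h_weights[3];
            t[x] = av_clip_uint8((v + 64) >> 7);
        }
        src += stride;
        t   += 8;
    }
    t = tmp + 8;                   /* vertical pass over the 8 output rows */
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++) {
            int v = t[x - 8] * v_weights[0] + t[x     ] * v_weights[1] +
                    t[x + 8] * v_weights[2] + t[x + 16] * v_weights[3];
            dst[x] = av_clip_uint8((v + 64) >> 7);
        }
        t   += 8;
        dst += stride;
    }
}
#endif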
49
project/jni/ffmpeg/libavcodec/x86/vp56dsp_init.c
Normal file
@@ -0,0 +1,49 @@
/*
 * VP6 MMX/SSE2 optimizations
 * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
 * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/vp56dsp.h"

void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride,
                             const int16_t *h_weights, const int16_t *v_weights);
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride,
                              const int16_t *h_weights, const int16_t *v_weights);

av_cold void ff_vp56dsp_init_x86(VP56DSPContext *c, enum AVCodecID codec)
{
    int mm_flags = av_get_cpu_flags();

    if (CONFIG_VP6_DECODER && codec == AV_CODEC_ID_VP6) {
#if ARCH_X86_32
        if (EXTERNAL_MMX(mm_flags)) {
            c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
        }
#endif

        if (EXTERNAL_SSE2(mm_flags)) {
            c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
        }
    }
}
2781
project/jni/ffmpeg/libavcodec/x86/vp8dsp.asm
Normal file
File diff suppressed because it is too large
442
project/jni/ffmpeg/libavcodec/x86/vp8dsp_init.c
Normal file
@@ -0,0 +1,442 @@
/*
 * VP8 DSP functions x86-optimized
 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/vp8dsp.h"

#if HAVE_YASM

/*
 * MC functions
 */
extern void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);

extern void ff_put_vp8_epel8_h4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h6_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v4_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v6_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);

extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);
extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                       uint8_t *src, ptrdiff_t srcstride,
                                       int height, int mx, int my);

extern void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);

extern void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);
extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
                                          uint8_t *src, ptrdiff_t srcstride,
                                          int height, int mx, int my);


extern void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
                                    uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);
extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
                                    uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);
extern void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
                                    uint8_t *src, ptrdiff_t srcstride,
                                    int height, int mx, int my);

#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst, dststride, src, srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst, dststride, src, srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
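
/*
 * Editorial illustration (not in the original commit): what one TAP_W16
 * instantiation below, TAP_W16(sse2, epel, h6), expands to. A 16-wide
 * filter is simply two 8-wide assembly calls on the left and right halves.
 */
#if 0
static void ff_put_vp8_epel16_h6_sse2(uint8_t *dst, ptrdiff_t dststride,
                                      uint8_t *src, ptrdiff_t srcstride,
                                      int height, int mx, int my)
{
    ff_put_vp8_epel8_h6_sse2(dst,     dststride, src,     srcstride,
                             height, mx, my);
    ff_put_vp8_epel8_h6_sse2(dst + 8, dststride, src + 8, srcstride,
                             height, mx, my);
}
#endif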

#if ARCH_X86_32
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
#endif

TAP_W16(sse2,  epel, h6)
TAP_W16(sse2,  epel, v6)
TAP_W16(sse2,  bilinear, h)
TAP_W16(sse2,  bilinear, v)

TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)

#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
    uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
    src -= srcstride * (TAPNUMY / 2 - 1); \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmpptr, SIZE, height, mx, my); \
}

#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16)

HVTAP(mmxext, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8)
#endif

HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)

#define HVTAPSSE2(x, y, w) \
HVTAP(sse2,  16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)

HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)

HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)

#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
        tmp, SIZE, src, srcstride, height + 1, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
        dst, dststride, tmp, SIZE, height, mx, my); \
}

HVBILIN(mmxext, 8,  4,  8)
#if ARCH_X86_32
HVBILIN(mmxext, 8,  8, 16)
HVBILIN(mmxext, 8, 16, 16)
#endif
HVBILIN(sse2,  8,  8, 16)
HVBILIN(sse2,  8, 16, 16)
HVBILIN(ssse3, 8,  4,  8)
HVBILIN(ssse3, 8,  8, 16)
HVBILIN(ssse3, 8, 16, 16)

extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
                                   ptrdiff_t stride);
extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16],
                                    ptrdiff_t stride);
extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16],
                                     ptrdiff_t stride);
extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16],
                                      ptrdiff_t stride);
extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16],
                                      ptrdiff_t stride);
extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16],
                                ptrdiff_t stride);
extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16],
                                ptrdiff_t stride);

#define DECLARE_LOOP_FILTER(NAME) \
extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
                                                 ptrdiff_t stride, \
                                                 int flim); \
extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
                                                 ptrdiff_t stride, \
                                                 int flim); \
extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
                                                    uint8_t *dstV, \
                                                    ptrdiff_t s, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
                                                    uint8_t *dstV, \
                                                    ptrdiff_t s, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
                                                    ptrdiff_t stride, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
                                                    uint8_t *dstV, \
                                                    ptrdiff_t s, \
                                                    int e, int i, int hvt); \
extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
                                                    uint8_t *dstV, \
                                                    ptrdiff_t s, \
                                                    int e, int i, int hvt);

DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)

#endif /* HAVE_YASM */

#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT

#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
    c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
    VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)

#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
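
/* Editorial note on the table layout: put_vp8_*_pixels_tab[IDX][v][h] is
 * indexed by block size (IDX: 0 = 16-wide, 1 = 8-wide, 2 = 4-wide), then by
 * the vertical and horizontal sub-pel filter class (for epel: 0 = copy,
 * 1 = 4-tap, 2 = 6-tap). The macros above fill every combination except
 * [0][0], which stays the plain pixel copy assigned in the init function
 * below. */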


av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_YASM
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmx;
        c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
        c->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_mmx;
        c->vp8_idct_add       = ff_vp8_idct_add_mmx;
        c->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_mmx;
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
        c->put_vp8_epel_pixels_tab[1][0][0]     =
        c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;

#if ARCH_X86_32
        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
#endif
    }

    /* note that 4-tap width=16 functions are missing because w=16
     * is only used for luma, and luma is always a copy or sixtap. */
    if (mm_flags & AV_CPU_FLAG_MMXEXT) {
        VP8_MC_FUNC(2, 4, mmxext);
        VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
#if ARCH_X86_32
        VP8_LUMA_MC_FUNC(0, 16, mmxext);
        VP8_MC_FUNC(1, 8, mmxext);
        VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
        VP8_BILINEAR_MC_FUNC(1, 8, mmxext);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
#endif
    }

    if (mm_flags & AV_CPU_FLAG_SSE) {
        c->vp8_idct_add    = ff_vp8_idct_add_sse;
        c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
        c->put_vp8_epel_pixels_tab[0][0][0]     =
        c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
    }

    if (mm_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
        VP8_LUMA_MC_FUNC(0, 16, sse2);
        VP8_MC_FUNC(1, 8, sse2);
        VP8_BILINEAR_MC_FUNC(0, 16, sse2);
        VP8_BILINEAR_MC_FUNC(1, 8, sse2);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
    }

    if (mm_flags & AV_CPU_FLAG_SSE2) {
        c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;

        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;

        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
    }

    if (mm_flags & AV_CPU_FLAG_SSSE3) {
        VP8_LUMA_MC_FUNC(0, 16, ssse3);
        VP8_MC_FUNC(1, 8, ssse3);
        VP8_MC_FUNC(2, 4, ssse3);
        VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
        VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
        VP8_BILINEAR_MC_FUNC(2, 4, ssse3);

        c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;

        c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
        c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
        c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;

        c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
        c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
        c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
        c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
    }

    if (mm_flags & AV_CPU_FLAG_SSE4) {
        c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;

        c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
        c->vp8_h_loop_filter16y     = ff_vp8_h_loop_filter16y_mbedge_sse4;
        c->vp8_h_loop_filter8uv     = ff_vp8_h_loop_filter8uv_mbedge_sse4;
    }
#endif /* HAVE_YASM */
}
80
project/jni/ffmpeg/libavcodec/x86/w64xmmtest.c
Normal file
@@ -0,0 +1,80 @@
/*
 * check XMM registers for clobbers on Win64
 * Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/avcodec.h"
#include "libavutil/x86/w64xmmtest.h"

wrap(avcodec_open2(AVCodecContext *avctx,
                   AVCodec *codec,
                   AVDictionary **options))
{
    testxmmclobbers(avcodec_open2, avctx, codec, options);
}

wrap(avcodec_decode_audio4(AVCodecContext *avctx,
                           AVFrame *frame,
                           int *got_frame_ptr,
                           AVPacket *avpkt))
{
    testxmmclobbers(avcodec_decode_audio4, avctx, frame,
                    got_frame_ptr, avpkt);
}

wrap(avcodec_decode_video2(AVCodecContext *avctx,
                           AVFrame *picture,
                           int *got_picture_ptr,
                           AVPacket *avpkt))
{
    testxmmclobbers(avcodec_decode_video2, avctx, picture,
                    got_picture_ptr, avpkt);
}

wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
                              AVSubtitle *sub,
                              int *got_sub_ptr,
                              AVPacket *avpkt))
{
    testxmmclobbers(avcodec_decode_subtitle2, avctx, sub,
                    got_sub_ptr, avpkt);
}

wrap(avcodec_encode_audio2(AVCodecContext *avctx,
                           AVPacket *avpkt,
                           const AVFrame *frame,
                           int *got_packet_ptr))
{
    testxmmclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
                    got_packet_ptr);
}

wrap(avcodec_encode_video(AVCodecContext *avctx,
                          uint8_t *buf, int buf_size,
                          const AVFrame *pict))
{
    testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict);
}

wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
                             uint8_t *buf, int buf_size,
                             const AVSubtitle *sub))
{
    testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
}