00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024 #include "fft.h"
00025
00026 DECLARE_ASM_CONST(16, int, ff_m1m1m1m1)[4] =
00027 { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
00028
00029 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
00030 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
00031
00032 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
00033 {
00034 int n = 1 << s->nbits;
00035
00036 ff_fft_dispatch_interleave_sse(z, s->nbits);
00037
00038 if(n <= 16) {
00039 x86_reg i = -8*n;
00040 __asm__ volatile(
00041 "1: \n"
00042 "movaps (%0,%1), %%xmm0 \n"
00043 "movaps %%xmm0, %%xmm1 \n"
00044 "unpcklps 16(%0,%1), %%xmm0 \n"
00045 "unpckhps 16(%0,%1), %%xmm1 \n"
00046 "movaps %%xmm0, (%0,%1) \n"
00047 "movaps %%xmm1, 16(%0,%1) \n"
00048 "add $32, %0 \n"
00049 "jl 1b \n"
00050 :"+r"(i)
00051 :"r"(z+n)
00052 :"memory"
00053 );
00054 }
00055 }
00056
00057 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
00058 {
00059 int n = 1 << s->nbits;
00060 int i;
00061 for(i=0; i<n; i+=2) {
00062 __asm__ volatile(
00063 "movaps %2, %%xmm0 \n"
00064 "movlps %%xmm0, %0 \n"
00065 "movhps %%xmm0, %1 \n"
00066 :"=m"(s->tmp_buf[s->revtab[i]]),
00067 "=m"(s->tmp_buf[s->revtab[i+1]])
00068 :"m"(z[i])
00069 );
00070 }
00071 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
00072 }
00073
00074 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
00075 {
00076 x86_reg j, k;
00077 long n = s->mdct_size;
00078 long n4 = n >> 2;
00079
00080 ff_imdct_half_sse(s, output+n4, input);
00081
00082 j = -n;
00083 k = n-16;
00084 __asm__ volatile(
00085 "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
00086 "1: \n"
00087 "movaps (%2,%1), %%xmm0 \n"
00088 "movaps (%3,%0), %%xmm1 \n"
00089 "shufps $0x1b, %%xmm0, %%xmm0 \n"
00090 "shufps $0x1b, %%xmm1, %%xmm1 \n"
00091 "xorps %%xmm7, %%xmm0 \n"
00092 "movaps %%xmm1, (%3,%1) \n"
00093 "movaps %%xmm0, (%2,%0) \n"
00094 "sub $16, %1 \n"
00095 "add $16, %0 \n"
00096 "jl 1b \n"
00097 :"+r"(j), "+r"(k)
00098 :"r"(output+n4), "r"(output+n4*3)
00099 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
00100 );
00101 }
00102