00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024
00025 #define CONFIG_FLOAT 1
00026 #include "libavcodec/mpegaudio.h"
00027
/*
 * Floating-point multiply-accumulate helpers.
 *
 * MACS(rt, ra, rb): rt += ra * rb
 * MLSS(rt, ra, rb): rt -= ra * rb
 *
 * rt must be a modifiable lvalue.  Both the lvalue and the whole expansion
 * are parenthesized so that the macro behaves as one expression (yielding
 * the updated rt) even when it is embedded in a larger expression.
 */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))
00030
/*
 * Apply op(sum, w[k * 64], p[k * 64]) for k = 0..7, in increasing k.
 *
 * op  - a three-argument accumulate macro such as MACS or MLSS
 * sum - the accumulator lvalue handed to op as its first argument
 * w,p - pointer expressions; elements 0, 64, 128, ..., 448 of each are read
 *
 * The body is wrapped in do { } while (0) so that "SUM8(...);" forms exactly
 * one statement and remains valid as the body of an unbraced if/else.
 */
#define SUM8(op, sum, w, p)                 \
do {                                        \
    op(sum, (w)[0 * 64], (p)[0 * 64]);      \
    op(sum, (w)[1 * 64], (p)[1 * 64]);      \
    op(sum, (w)[2 * 64], (p)[2 * 64]);      \
    op(sum, (w)[3 * 64], (p)[3 * 64]);      \
    op(sum, (w)[4 * 64], (p)[4 * 64]);      \
    op(sum, (w)[5 * 64], (p)[5 * 64]);      \
    op(sum, (w)[6 * 64], (p)[6 * 64]);      \
    op(sum, (w)[7 * 64], (p)[7 * 64]);      \
} while (0)
00042
00043 static void apply_window(const float *buf, const float *win1,
00044 const float *win2, float *sum1, float *sum2, int len)
00045 {
00046 x86_reg count = - 4*len;
00047 const float *win1a = win1+len;
00048 const float *win2a = win2+len;
00049 const float *bufa = buf+len;
00050 float *sum1a = sum1+len;
00051 float *sum2a = sum2+len;
00052
00053
00054 #define MULT(a, b) \
00055 "movaps " #a "(%1,%0), %%xmm1 \n\t" \
00056 "movaps " #a "(%3,%0), %%xmm2 \n\t" \
00057 "mulps %%xmm2, %%xmm1 \n\t" \
00058 "subps %%xmm1, %%xmm0 \n\t" \
00059 "mulps " #b "(%2,%0), %%xmm2 \n\t" \
00060 "subps %%xmm2, %%xmm4 \n\t" \
00061
00062 __asm__ volatile(
00063 "1: \n\t"
00064 "xorps %%xmm0, %%xmm0 \n\t"
00065 "xorps %%xmm4, %%xmm4 \n\t"
00066
00067 MULT( 0, 0)
00068 MULT( 256, 64)
00069 MULT( 512, 128)
00070 MULT( 768, 192)
00071 MULT(1024, 256)
00072 MULT(1280, 320)
00073 MULT(1536, 384)
00074 MULT(1792, 448)
00075
00076 "movaps %%xmm0, (%4,%0) \n\t"
00077 "movaps %%xmm4, (%5,%0) \n\t"
00078 "add $16, %0 \n\t"
00079 "jl 1b \n\t"
00080 :"+&r"(count)
00081 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00082 );
00083
00084 #undef MULT
00085 }
00086
00087 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00088 int incr)
00089 {
00090 LOCAL_ALIGNED_16(float, suma, [17]);
00091 LOCAL_ALIGNED_16(float, sumb, [17]);
00092 LOCAL_ALIGNED_16(float, sumc, [17]);
00093 LOCAL_ALIGNED_16(float, sumd, [17]);
00094
00095 float sum;
00096
00097
00098 memcpy(in + 512, in, 32 * sizeof(*in));
00099
00100 apply_window(in + 16, win , win + 512, suma, sumc, 16);
00101 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00102
00103 SUM8(MACS, suma[0], win + 32, in + 48);
00104
00105 sumc[ 0] = 0;
00106 sumb[16] = 0;
00107 sumd[16] = 0;
00108
00109 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
00110 "movups " #sumd "(%4), %%xmm0 \n\t" \
00111 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00112 "subps " #suma "(%1), %%xmm0 \n\t" \
00113 "movaps %%xmm0," #out1 "(%0) \n\t" \
00114 \
00115 "movups " #sumc "(%3), %%xmm0 \n\t" \
00116 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00117 "addps " #sumb "(%2), %%xmm0 \n\t" \
00118 "movaps %%xmm0," #out2 "(%0) \n\t"
00119
00120 if (incr == 1) {
00121 __asm__ volatile(
00122 SUMS( 0, 48, 4, 52, 0, 112)
00123 SUMS(16, 32, 20, 36, 16, 96)
00124 SUMS(32, 16, 36, 20, 32, 80)
00125 SUMS(48, 0, 52, 4, 48, 64)
00126
00127 :"+&r"(out)
00128 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00129 :"memory"
00130 );
00131 out += 16*incr;
00132 } else {
00133 int j;
00134 float *out2 = out + 32 * incr;
00135 out[0 ] = -suma[ 0];
00136 out += incr;
00137 out2 -= incr;
00138 for(j=1;j<16;j++) {
00139 *out = -suma[ j] + sumd[16-j];
00140 *out2 = sumb[16-j] + sumc[ j];
00141 out += incr;
00142 out2 -= incr;
00143 }
00144 }
00145
00146 sum = 0;
00147 SUM8(MLSS, sum, win + 16 + 32, in + 32);
00148 *out = sum;
00149 }
00150
00151 void ff_mpegaudiodec_init_mmx(MPADecodeContext *s)
00152 {
00153 int mm_flags = av_get_cpu_flags();
00154
00155 if (mm_flags & AV_CPU_FLAG_SSE2) {
00156 s->apply_window_mp3 = apply_window_mp3;
00157 }
00158 }