00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/fmtconvert.h"
00028
00029 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
00030 {
00031 x86_reg i = -4*len;
00032 __asm__ volatile(
00033 "movss %3, %%xmm4 \n"
00034 "shufps $0, %%xmm4, %%xmm4 \n"
00035 "1: \n"
00036 "cvtpi2ps (%2,%0), %%xmm0 \n"
00037 "cvtpi2ps 8(%2,%0), %%xmm1 \n"
00038 "cvtpi2ps 16(%2,%0), %%xmm2 \n"
00039 "cvtpi2ps 24(%2,%0), %%xmm3 \n"
00040 "movlhps %%xmm1, %%xmm0 \n"
00041 "movlhps %%xmm3, %%xmm2 \n"
00042 "mulps %%xmm4, %%xmm0 \n"
00043 "mulps %%xmm4, %%xmm2 \n"
00044 "movaps %%xmm0, (%1,%0) \n"
00045 "movaps %%xmm2, 16(%1,%0) \n"
00046 "add $32, %0 \n"
00047 "jl 1b \n"
00048 :"+r"(i)
00049 :"r"(dst+len), "r"(src+len), "m"(mul)
00050 );
00051 }
00052
00053 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
00054 {
00055 x86_reg i = -4*len;
00056 __asm__ volatile(
00057 "movss %3, %%xmm4 \n"
00058 "shufps $0, %%xmm4, %%xmm4 \n"
00059 "1: \n"
00060 "cvtdq2ps (%2,%0), %%xmm0 \n"
00061 "cvtdq2ps 16(%2,%0), %%xmm1 \n"
00062 "mulps %%xmm4, %%xmm0 \n"
00063 "mulps %%xmm4, %%xmm1 \n"
00064 "movaps %%xmm0, (%1,%0) \n"
00065 "movaps %%xmm1, 16(%1,%0) \n"
00066 "add $32, %0 \n"
00067 "jl 1b \n"
00068 :"+r"(i)
00069 :"r"(dst+len), "r"(src+len), "m"(mul)
00070 );
00071 }
00072
00073 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
00074 x86_reg reglen = len;
00075
00076 __asm__ volatile(
00077 "add %0 , %0 \n\t"
00078 "lea (%2,%0,2) , %2 \n\t"
00079 "add %0 , %1 \n\t"
00080 "neg %0 \n\t"
00081 "1: \n\t"
00082 "pf2id (%2,%0,2) , %%mm0 \n\t"
00083 "pf2id 8(%2,%0,2) , %%mm1 \n\t"
00084 "pf2id 16(%2,%0,2) , %%mm2 \n\t"
00085 "pf2id 24(%2,%0,2) , %%mm3 \n\t"
00086 "packssdw %%mm1 , %%mm0 \n\t"
00087 "packssdw %%mm3 , %%mm2 \n\t"
00088 "movq %%mm0 , (%1,%0) \n\t"
00089 "movq %%mm2 , 8(%1,%0) \n\t"
00090 "add $16 , %0 \n\t"
00091 " js 1b \n\t"
00092 "femms \n\t"
00093 :"+r"(reglen), "+r"(dst), "+r"(src)
00094 );
00095 }
00096
00097 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
00098 x86_reg reglen = len;
00099 __asm__ volatile(
00100 "add %0 , %0 \n\t"
00101 "lea (%2,%0,2) , %2 \n\t"
00102 "add %0 , %1 \n\t"
00103 "neg %0 \n\t"
00104 "1: \n\t"
00105 "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
00106 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
00107 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
00108 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
00109 "packssdw %%mm1 , %%mm0 \n\t"
00110 "packssdw %%mm3 , %%mm2 \n\t"
00111 "movq %%mm0 , (%1,%0) \n\t"
00112 "movq %%mm2 , 8(%1,%0) \n\t"
00113 "add $16 , %0 \n\t"
00114 " js 1b \n\t"
00115 "emms \n\t"
00116 :"+r"(reglen), "+r"(dst), "+r"(src)
00117 );
00118 }
00119
00120 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
00121 x86_reg reglen = len;
00122 __asm__ volatile(
00123 "add %0 , %0 \n\t"
00124 "lea (%2,%0,2) , %2 \n\t"
00125 "add %0 , %1 \n\t"
00126 "neg %0 \n\t"
00127 "1: \n\t"
00128 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
00129 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
00130 "packssdw %%xmm1 , %%xmm0 \n\t"
00131 "movdqa %%xmm0 , (%1,%0) \n\t"
00132 "add $16 , %0 \n\t"
00133 " js 1b \n\t"
00134 :"+r"(reglen), "+r"(dst), "+r"(src)
00135 );
00136 }
00137
/*
 * Six-channel float -> int16 interleaving converters. They are only declared
 * here; no definition is visible in this file (presumably they come from the
 * yasm-assembled sources when HAVE_YASM is set -- confirm against the build).
 */
00138 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
00139 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
00140 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
00141
/*
 * Without yasm, route the six-channel entry points to the generic per-channel
 * fallback generated by FLOAT_TO_INT16_INTERLEAVE, with the channel count
 * fixed to 6. The 3dn2 name maps onto the plain 3DNow! fallback.
 */
00142 #if !HAVE_YASM
00143 #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6)
00144 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
00145 #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
00146 #endif
/* The SSE2 instantiation reuses the SSE six-channel routine. */
00147 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
00148
/**
 * Instantiate the float -> int16 interleaving helpers for one CPU flavour.
 *
 * Expands to two functions:
 *  - float_to_int16_interleave_misc_<cpu>(): generic fallback for any channel
 *    count. Each channel is converted into a temporary buffer with
 *    float_to_int16_<cpu>() and then scattered into dst with a stride of
 *    `channels`. The temporary is a variable-length array on the stack, so
 *    its size grows with len.
 *  - float_to_int16_interleave_<cpu>(): dispatcher. One channel goes straight
 *    to float_to_int16_<cpu>(), two channels use the inline asm loop passed in
 *    `body`, six channels go to ff_float_to_int16_interleave6_<cpu>(), and
 *    every other count uses the generic fallback.
 *
 * @param cpu  suffix selecting float_to_int16_<cpu>() and
 *             ff_float_to_int16_interleave6_<cpu>()
 * @param body asm text of the stereo loop. When it starts, %0 holds the
 *             negated output size in bytes (-4*len), %1 points to the end of
 *             dst and %2 / %3 point to the end of src[0] / src[1]. The loop
 *             runs at least once, so len is expected to be positive and a
 *             multiple of the number of samples the body handles per pass.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    /* scratch space for one converted channel */\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        /* sample i of channel c lands at dst[i*channels + c] */\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            /* reglen = 4*len: bytes per input channel and bytes of output */\
            "shl $2, %0 \n"\
            /* move all three pointers to the end of their buffers */\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            /* the body indexes with a negative offset counting up to 0 */\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
            :\
            /* dst[] is written and the sources are read behind the compiler's back */\
            :"memory"\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
00182
/*
 * 3DNow! instantiation. The asm below is the stereo loop: %0 is the negative
 * byte offset, %1 the end of dst, %2 / %3 the ends of the two source
 * channels. One pass converts 4 samples from each channel and stores
 * 8 interleaved int16 values (16 bytes).
 */
00183 FLOAT_TO_INT16_INTERLEAVE(3dnow,
00184 "1: \n"
/* mm0/mm1 <- four samples of channel 0, mm2/mm3 <- four of channel 1 (int32) */
00185 "pf2id (%2,%0), %%mm0 \n"
00186 "pf2id 8(%2,%0), %%mm1 \n"
00187 "pf2id (%3,%0), %%mm2 \n"
00188 "pf2id 8(%3,%0), %%mm3 \n"
/* narrow to int16 with signed saturation: mm0 = channel 0, mm2 = channel 1 */
00189 "packssdw %%mm1, %%mm0 \n"
00190 "packssdw %%mm3, %%mm2 \n"
/* interleave the two channels word by word: low pairs in mm0, high in mm1 */
00191 "movq %%mm0, %%mm1 \n"
00192 "punpcklwd %%mm2, %%mm0 \n"
00193 "punpckhwd %%mm2, %%mm1 \n"
00194 "movq %%mm0, (%1,%0)\n"
00195 "movq %%mm1, 8(%1,%0)\n"
00196 "add $16, %0 \n"
00197 "js 1b \n"
/* leave MMX state before returning to C code */
00198 "femms \n"
00199 )
00200
/*
 * SSE instantiation. The asm below is the stereo loop: %0 is the negative
 * byte offset, %1 the end of dst, %2 / %3 the ends of the two source
 * channels. One pass converts 4 samples from each channel and stores
 * 8 interleaved int16 values (16 bytes).
 */
00201 FLOAT_TO_INT16_INTERLEAVE(sse,
00202 "1: \n"
/* mm0/mm1 <- four samples of channel 0, mm2/mm3 <- four of channel 1 (int32) */
00203 "cvtps2pi (%2,%0), %%mm0 \n"
00204 "cvtps2pi 8(%2,%0), %%mm1 \n"
00205 "cvtps2pi (%3,%0), %%mm2 \n"
00206 "cvtps2pi 8(%3,%0), %%mm3 \n"
/* narrow to int16 with signed saturation: mm0 = channel 0, mm2 = channel 1 */
00207 "packssdw %%mm1, %%mm0 \n"
00208 "packssdw %%mm3, %%mm2 \n"
/* interleave the two channels word by word: low pairs in mm0, high in mm1 */
00209 "movq %%mm0, %%mm1 \n"
00210 "punpcklwd %%mm2, %%mm0 \n"
00211 "punpckhwd %%mm2, %%mm1 \n"
00212 "movq %%mm0, (%1,%0)\n"
00213 "movq %%mm1, 8(%1,%0)\n"
00214 "add $16, %0 \n"
00215 "js 1b \n"
/* leave MMX state before returning to C code */
00216 "emms \n"
00217 )
00218
/*
 * SSE2 instantiation. The asm below is the stereo loop: %0 is the negative
 * byte offset, %1 the end of dst, %2 / %3 the ends of the two source
 * channels. One pass converts 4 samples from each channel and stores
 * 8 interleaved int16 values (16 bytes) with an aligned store.
 */
00219 FLOAT_TO_INT16_INTERLEAVE(sse2,
00220 "1: \n"
/* xmm0 <- four samples of channel 0, xmm1 <- four of channel 1 (int32) */
00221 "cvtps2dq (%2,%0), %%xmm0 \n"
00222 "cvtps2dq (%3,%0), %%xmm1 \n"
/* xmm0 = channel 0 in the low four words, channel 1 in the high four */
00223 "packssdw %%xmm1, %%xmm0 \n"
/* copy the channel 1 words down into xmm1, then interleave word by word */
00224 "movhlps %%xmm0, %%xmm1 \n"
00225 "punpcklwd %%xmm1, %%xmm0 \n"
00226 "movdqa %%xmm0, (%1,%0) \n"
00227 "add $16, %0 \n"
00228 "js 1b \n"
00229 )
00230
/**
 * Interleaving float -> int16 conversion for CPUs with extended 3DNow!.
 *
 * Only the six-channel layout has a dedicated routine; every other channel
 * count is handed to the plain 3DNow! dispatcher.
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels)
{
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }

    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
00237
00238 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
00239 {
00240 int mm_flags = av_get_cpu_flags();
00241
00242 if (mm_flags & AV_CPU_FLAG_MMX) {
00243
00244 if(mm_flags & AV_CPU_FLAG_3DNOW){
00245 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00246 c->float_to_int16 = float_to_int16_3dnow;
00247 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
00248 }
00249 }
00250 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
00251 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00252 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
00253 }
00254 }
00255 if(mm_flags & AV_CPU_FLAG_SSE){
00256 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
00257 c->float_to_int16 = float_to_int16_sse;
00258 c->float_to_int16_interleave = float_to_int16_interleave_sse;
00259 }
00260 if(mm_flags & AV_CPU_FLAG_SSE2){
00261 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
00262 c->float_to_int16 = float_to_int16_sse2;
00263 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
00264 }
00265 }
00266 }