#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_mmx.h"

/* {3,1,3,1,...}: mv-difference bias used by the loop-filter strength code in
 * field mode, where the vertical mv limit is halved (x components keep the
 * |dmv| >= 4 limit via bias 3, y components use |dmv| >= 2 via bias 1) */
DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1) = 0x0103010301030103ULL;

/***********************************/
/* IDCT */

void ff_h264_idct_add_mmx     (uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add_mmx    (uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add_sse2   (uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_mmx2 (uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride);

void ff_h264_idct_add16_mmx      (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct8_add4_mmx      (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16_mmx2     (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_mmx (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct8_add4_mmx2     (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct8_add4_sse2     (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_mmx       (uint8_t **dest, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_mmx2      (uint8_t **dest, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);

void ff_h264_idct_add16_sse2     (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);

/***********************************/
/* deblocking */

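/* One pass of the loop-filter strength computation for the edges of one
 * direction (dir).  Expects mm7 = ff_pb_1, mm6 = the mv-limit bias and
 * mm5 = 2*mm6; per edge it effectively computes
 *     bS = max(2 * min(nnz_a | nnz_b, 1), min(ref_mv_mismatch, 1))
 * i.e. strength 2 where either side has coded coefficients, otherwise 1
 * where the two sides use different references or their mv difference
 * reaches the limit. */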
#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir,    \
                                                 edges, step, mask_mv, dir, \
                                                 d_idx, mask_dir)           \
    do { \
        x86_reg b_idx; \
        mask_mv <<= 3; \
        for( b_idx=0; b_idx<edges; b_idx+=step ) { \
            if (!mask_dir) \
            __asm__ volatile( \
                    "pxor %%mm0, %%mm0 \n\t" \
                    :: \
            ); \
            if(!(mask_mv & b_idx)) { \
                if(bidir) { \
                    /* mm0 |= ref/mv mismatch, testing both orderings */ \
                    /* of the two reference lists                     */ \
                    __asm__ volatile( \
                        "movd         %a3(%0,%2), %%mm2 \n" \
                        "punpckldq    %a4(%0,%2), %%mm2 \n" \
                        "pshufw $0x44, 12(%0,%2), %%mm0 \n" \
                        "pshufw $0x44, 52(%0,%2), %%mm1 \n" \
                        "pshufw $0x4E,      %%mm2, %%mm3 \n" \
                        "psubb              %%mm2, %%mm0 \n" \
                        "psubb              %%mm3, %%mm1 \n" \
                        \
                        "por                %%mm1, %%mm0 \n" \
                        "movq    %a5(%1,%2,4), %%mm1 \n" \
                        "movq    %a6(%1,%2,4), %%mm2 \n" \
                        "movq               %%mm1, %%mm3 \n" \
                        "movq               %%mm2, %%mm4 \n" \
                        "psubw    48(%1,%2,4), %%mm1 \n" \
                        "psubw    56(%1,%2,4), %%mm2 \n" \
                        "psubw   208(%1,%2,4), %%mm3 \n" \
                        "psubw   216(%1,%2,4), %%mm4 \n" \
                        "packsswb           %%mm2, %%mm1 \n" \
                        "packsswb           %%mm4, %%mm3 \n" \
                        "paddb              %%mm6, %%mm1 \n" \
                        "paddb              %%mm6, %%mm3 \n" \
                        "psubusb            %%mm5, %%mm1 \n" \
                        "psubusb            %%mm5, %%mm3 \n" \
                        "packsswb           %%mm3, %%mm1 \n" \
                        \
                        "por                %%mm1, %%mm0 \n" \
                        "movq    %a7(%1,%2,4), %%mm1 \n" \
                        "movq    %a8(%1,%2,4), %%mm2 \n" \
                        "movq               %%mm1, %%mm3 \n" \
                        "movq               %%mm2, %%mm4 \n" \
                        "psubw    48(%1,%2,4), %%mm1 \n" \
                        "psubw    56(%1,%2,4), %%mm2 \n" \
                        "psubw   208(%1,%2,4), %%mm3 \n" \
                        "psubw   216(%1,%2,4), %%mm4 \n" \
                        "packsswb           %%mm2, %%mm1 \n" \
                        "packsswb           %%mm4, %%mm3 \n" \
                        "paddb              %%mm6, %%mm1 \n" \
                        "paddb              %%mm6, %%mm3 \n" \
                        "psubusb            %%mm5, %%mm1 \n" \
                        "psubusb            %%mm5, %%mm3 \n" \
                        "packsswb           %%mm3, %%mm1 \n" \
                        \
                        "pshufw $0x4E,      %%mm1, %%mm1 \n" \
                        "por                %%mm1, %%mm0 \n" \
                        "pshufw $0x4E,      %%mm0, %%mm1 \n" \
                        "pminub             %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx+12), \
                          "i"(d_idx+52), \
                          "i"(d_idx*4+48), \
                          "i"(d_idx*4+56), \
                          "i"(d_idx*4+208), \
                          "i"(d_idx*4+216) \
                    ); \
                } else { \
                    __asm__ volatile( \
                        "movd      12(%0,%2), %%mm0 \n" \
                        "psubb   %a3(%0,%2), %%mm0 \n" \
                        "movq    48(%1,%2,4), %%mm1 \n" \
                        "movq    56(%1,%2,4), %%mm2 \n" \
                        "psubw  %a4(%1,%2,4), %%mm1 \n" \
                        "psubw  %a5(%1,%2,4), %%mm2 \n" \
                        "packsswb      %%mm2, %%mm1 \n" \
                        "paddb         %%mm6, %%mm1 \n" \
                        "psubusb       %%mm5, %%mm1 \n" \
                        "packsswb      %%mm1, %%mm1 \n" \
                        "por           %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx+12), \
                          "i"(d_idx*4+48), \
                          "i"(d_idx*4+56) \
                    ); \
                } \
            } \
            /* mm1 = nonzero-coefficient flags of the two sides */ \
            __asm__ volatile( \
                "movd    12(%0,%1), %%mm1 \n" \
                "por   %a2(%0,%1), %%mm1 \n" \
                ::"r"(nz), \
                  "r"(b_idx), \
                  "i"(d_idx+12) \
            ); \
            /* bS = max(2*min(nnz,1), min(mismatch,1)), widened to words */ \
            __asm__ volatile( \
                "pminub    %%mm7, %%mm1 \n" \
                "pminub    %%mm7, %%mm0 \n" \
                "psllw        $1, %%mm1 \n" \
                "pxor      %%mm2, %%mm2 \n" \
                "pmaxub    %%mm0, %%mm1 \n" \
                "punpcklbw %%mm2, %%mm1 \n" \
                "movq %%mm1, %a1(%0,%2) \n" \
                ::"r"(bS), \
                  "i"(32*dir), \
                  "r"(b_idx) \
                :"memory" \
            ); \
        } \
    } while (0)
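
/* Compute the loop-filter strengths bS[dir][edge][pos] for all edges of a
 * macroblock.  nnz, ref and mv live on an 8-entry-wide scratch grid, so a
 * block's top neighbour sits at offset -8 and its left neighbour at -1
 * (the d_idx arguments below); mask_mv0/mask_mv1 let the ref/mv test be
 * skipped on selected edges, and field switches mm6 to the per-component
 * bias ff_pb_3_1. */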
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
    /* mm7 = 1 per byte, mm6 = mv-limit bias, mm5 = twice the bias */
    __asm__ volatile(
        "movq %0, %%mm7 \n"
        "movq %1, %%mm6 \n"
        ::"m"(ff_pb_1), "m"(ff_pb_3)
    );
    if(field)
        __asm__ volatile(
            "movq %0, %%mm6 \n"
            ::"m"(ff_pb_3_1)
        );
    __asm__ volatile(
        "movq %%mm6, %%mm5 \n"
        "paddb %%mm5, %%mm5 \n"
    :);

    step  <<= 3;
    edges <<= 3;
    h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir, edges, step, mask_mv1, 1, -8,  0);
    h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir,    32,    8, mask_mv0, 0, -1, -1);

    /* bS[0] is produced transposed; flip the 4x4 int16 matrix back.
     * TRANSPOSE4 leaves its result rows in mm0, mm3, mm4 and mm2. */
    __asm__ volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
        "movq %%mm0,   (%0) \n\t"
        "movq %%mm3,  8(%0) \n\t"
        "movq %%mm4, 16(%0) \n\t"
        "movq %%mm2, 24(%0) \n\t"
        ::"r"(bS[0])
        :"memory"
    );
}

#define LF_FUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
                                                       int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, OPT) \
void ff_x264_deblock_ ## DIR ## _ ## TYPE ## _ ## OPT (uint8_t *pix, int stride, \
                                                       int alpha, int beta);

LF_FUNC (h, chroma, mmxext)
LF_IFUNC(h, chroma_intra, mmxext)
LF_FUNC (v, chroma, mmxext)
LF_IFUNC(v, chroma_intra, mmxext)

LF_FUNC (h, luma, mmxext)
LF_IFUNC(h, luma_intra, mmxext)
#if HAVE_YASM && ARCH_X86_32
LF_FUNC (v8, luma, mmxext)
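/* The MMX register file is only 8 bytes wide, so a 16-pixel luma edge is
 * deblocked as two 8-pixel halves; in the tc0 variant a half is skipped
 * when both of its tc0 values are negative (nothing to filter).  These
 * wrappers are only needed on 32-bit builds, where SSE2 cannot be
 * assumed. */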
static void ff_x264_deblock_v_luma_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
        ff_x264_deblock_v8_luma_mmxext(pix+0, stride, alpha, beta, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        ff_x264_deblock_v8_luma_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
LF_IFUNC(v8, luma_intra, mmxext)
static void ff_x264_deblock_v_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
    ff_x264_deblock_v8_luma_intra_mmxext(pix+0, stride, alpha, beta);
    ff_x264_deblock_v8_luma_intra_mmxext(pix+8, stride, alpha, beta);
}
#endif

LF_FUNC (h, luma, sse2)
LF_IFUNC(h, luma_intra, sse2)
LF_FUNC (v, luma, sse2)
LF_IFUNC(v, luma_intra, sse2)

/***********************************/
/* weighted prediction */

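/* H264_WEIGHT/H264_BIWEIGHT only declare prototypes; the (bi)weighted
 * prediction functions themselves are implemented in external assembly. */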
#define H264_WEIGHT(W, H, OPT) \
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    int stride, int log2_denom, int weight, int offset);

#define H264_BIWEIGHT(W, H, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int log2_denom, int weightd, \
    int weights, int offset);

#define H264_BIWEIGHT_MMX(W,H) \
H264_WEIGHT (W, H, mmx2) \
H264_BIWEIGHT(W, H, mmx2)

#define H264_BIWEIGHT_MMX_SSE(W,H) \
H264_BIWEIGHT_MMX(W, H) \
H264_WEIGHT (W, H, sse2) \
H264_BIWEIGHT (W, H, sse2) \
H264_BIWEIGHT (W, H, ssse3)

H264_BIWEIGHT_MMX_SSE(16, 16)
H264_BIWEIGHT_MMX_SSE(16,  8)
H264_BIWEIGHT_MMX_SSE( 8, 16)
H264_BIWEIGHT_MMX_SSE( 8,  8)
H264_BIWEIGHT_MMX_SSE( 8,  4)
H264_BIWEIGHT_MMX    ( 4,  8)
H264_BIWEIGHT_MMX    ( 4,  4)
H264_BIWEIGHT_MMX    ( 4,  2)

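/* Wire the x86 implementations into the H264DSPContext.  Each capability
 * tier (MMX, MMX2, SSE2, SSSE3) overwrites pointers set by weaker tiers,
 * so the fastest version the CPU supports wins. */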
void ff_h264dsp_init_x86(H264DSPContext *c)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX2) {
        c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
    }
#if HAVE_YASM
    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->h264_idct_dc_add=
        c->h264_idct_add= ff_h264_idct_add_mmx;
        c->h264_idct8_dc_add=
        c->h264_idct8_add= ff_h264_idct8_add_mmx;

        c->h264_idct_add16     = ff_h264_idct_add16_mmx;
        c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
        c->h264_idct_add8      = ff_h264_idct_add8_mmx;
        c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
        c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;

        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
            c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
            c->h264_idct_add16 = ff_h264_idct_add16_mmx2;
            c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2;
            c->h264_idct_add8  = ff_h264_idct_add8_mmx2;
            c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;

            c->h264_v_loop_filter_chroma= ff_x264_deblock_v_chroma_mmxext;
            c->h264_h_loop_filter_chroma= ff_x264_deblock_h_chroma_mmxext;
            c->h264_v_loop_filter_chroma_intra= ff_x264_deblock_v_chroma_intra_mmxext;
            c->h264_h_loop_filter_chroma_intra= ff_x264_deblock_h_chroma_intra_mmxext;
#if ARCH_X86_32
            c->h264_v_loop_filter_luma= ff_x264_deblock_v_luma_mmxext;
            c->h264_h_loop_filter_luma= ff_x264_deblock_h_luma_mmxext;
            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
#endif
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

            if (mm_flags&AV_CPU_FLAG_SSE2) {
                c->h264_idct8_add = ff_h264_idct8_add_sse2;
                c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;

                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;

                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;

                /* the SSE2 luma deblock assumes a 16-byte-aligned stack */
#if HAVE_ALIGNED_STACK
                c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
                c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
                c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
                c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
#endif

                c->h264_idct_add16 = ff_h264_idct_add16_sse2;
                c->h264_idct_add8  = ff_h264_idct_add8_sse2;
                c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
            }
            if (mm_flags&AV_CPU_FLAG_SSSE3) {
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
            }
        }
    }
#endif
}