00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #undef REAL_MOVNTQ
00022 #undef MOVNTQ
00023 #undef PAVGB
00024 #undef PREFETCH
00025
00026 #if COMPILE_TEMPLATE_AMD3DNOW
00027 #define PREFETCH "prefetch"
00028 #elif COMPILE_TEMPLATE_MMX2
00029 #define PREFETCH "prefetchnta"
00030 #else
00031 #define PREFETCH " # nop"
00032 #endif
00033
00034 #if COMPILE_TEMPLATE_MMX2
00035 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
00036 #elif COMPILE_TEMPLATE_AMD3DNOW
00037 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
00038 #endif
00039
00040 #if COMPILE_TEMPLATE_MMX2
00041 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
00042 #else
00043 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
00044 #endif
00045 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
00046
00047 #if COMPILE_TEMPLATE_ALTIVEC
00048 #include "ppc/swscale_altivec_template.c"
00049 #endif
00050
00051 #define YSCALEYUV2YV12X(x, offset, dest, width) \
00052 __asm__ volatile(\
00053 "xor %%"REG_a", %%"REG_a" \n\t"\
00054 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
00055 "movq %%mm3, %%mm4 \n\t"\
00056 "lea " offset "(%0), %%"REG_d" \n\t"\
00057 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00058 ".p2align 4 \n\t" \
00059 "1: \n\t"\
00060 "movq 8(%%"REG_d"), %%mm0 \n\t" \
00061 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" \
00062 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" \
00063 "add $16, %%"REG_d" \n\t"\
00064 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00065 "test %%"REG_S", %%"REG_S" \n\t"\
00066 "pmulhw %%mm0, %%mm2 \n\t"\
00067 "pmulhw %%mm0, %%mm5 \n\t"\
00068 "paddw %%mm2, %%mm3 \n\t"\
00069 "paddw %%mm5, %%mm4 \n\t"\
00070 " jnz 1b \n\t"\
00071 "psraw $3, %%mm3 \n\t"\
00072 "psraw $3, %%mm4 \n\t"\
00073 "packuswb %%mm4, %%mm3 \n\t"\
00074 MOVNTQ(%%mm3, (%1, %%REGa))\
00075 "add $8, %%"REG_a" \n\t"\
00076 "cmp %2, %%"REG_a" \n\t"\
00077 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
00078 "movq %%mm3, %%mm4 \n\t"\
00079 "lea " offset "(%0), %%"REG_d" \n\t"\
00080 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00081 "jb 1b \n\t"\
00082 :: "r" (&c->redDither),\
00083 "r" (dest), "g" ((x86_reg)width)\
00084 : "%"REG_a, "%"REG_d, "%"REG_S\
00085 );
00086
00087 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
00088 __asm__ volatile(\
00089 "lea " offset "(%0), %%"REG_d" \n\t"\
00090 "xor %%"REG_a", %%"REG_a" \n\t"\
00091 "pxor %%mm4, %%mm4 \n\t"\
00092 "pxor %%mm5, %%mm5 \n\t"\
00093 "pxor %%mm6, %%mm6 \n\t"\
00094 "pxor %%mm7, %%mm7 \n\t"\
00095 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00096 ".p2align 4 \n\t"\
00097 "1: \n\t"\
00098 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" \
00099 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" \
00100 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
00101 "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" \
00102 "movq %%mm0, %%mm3 \n\t"\
00103 "punpcklwd %%mm1, %%mm0 \n\t"\
00104 "punpckhwd %%mm1, %%mm3 \n\t"\
00105 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" \
00106 "pmaddwd %%mm1, %%mm0 \n\t"\
00107 "pmaddwd %%mm1, %%mm3 \n\t"\
00108 "paddd %%mm0, %%mm4 \n\t"\
00109 "paddd %%mm3, %%mm5 \n\t"\
00110 "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" \
00111 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
00112 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
00113 "test %%"REG_S", %%"REG_S" \n\t"\
00114 "movq %%mm2, %%mm0 \n\t"\
00115 "punpcklwd %%mm3, %%mm2 \n\t"\
00116 "punpckhwd %%mm3, %%mm0 \n\t"\
00117 "pmaddwd %%mm1, %%mm2 \n\t"\
00118 "pmaddwd %%mm1, %%mm0 \n\t"\
00119 "paddd %%mm2, %%mm6 \n\t"\
00120 "paddd %%mm0, %%mm7 \n\t"\
00121 " jnz 1b \n\t"\
00122 "psrad $16, %%mm4 \n\t"\
00123 "psrad $16, %%mm5 \n\t"\
00124 "psrad $16, %%mm6 \n\t"\
00125 "psrad $16, %%mm7 \n\t"\
00126 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
00127 "packssdw %%mm5, %%mm4 \n\t"\
00128 "packssdw %%mm7, %%mm6 \n\t"\
00129 "paddw %%mm0, %%mm4 \n\t"\
00130 "paddw %%mm0, %%mm6 \n\t"\
00131 "psraw $3, %%mm4 \n\t"\
00132 "psraw $3, %%mm6 \n\t"\
00133 "packuswb %%mm6, %%mm4 \n\t"\
00134 MOVNTQ(%%mm4, (%1, %%REGa))\
00135 "add $8, %%"REG_a" \n\t"\
00136 "cmp %2, %%"REG_a" \n\t"\
00137 "lea " offset "(%0), %%"REG_d" \n\t"\
00138 "pxor %%mm4, %%mm4 \n\t"\
00139 "pxor %%mm5, %%mm5 \n\t"\
00140 "pxor %%mm6, %%mm6 \n\t"\
00141 "pxor %%mm7, %%mm7 \n\t"\
00142 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00143 "jb 1b \n\t"\
00144 :: "r" (&c->redDither),\
00145 "r" (dest), "g" ((x86_reg)width)\
00146 : "%"REG_a, "%"REG_d, "%"REG_S\
00147 );
00148
00149 #define YSCALEYUV2YV121 \
00150 "mov %2, %%"REG_a" \n\t"\
00151 ".p2align 4 \n\t" \
00152 "1: \n\t"\
00153 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
00154 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
00155 "psraw $7, %%mm0 \n\t"\
00156 "psraw $7, %%mm1 \n\t"\
00157 "packuswb %%mm1, %%mm0 \n\t"\
00158 MOVNTQ(%%mm0, (%1, %%REGa))\
00159 "add $8, %%"REG_a" \n\t"\
00160 "jnc 1b \n\t"
00161
00162 #define YSCALEYUV2YV121_ACCURATE \
00163 "mov %2, %%"REG_a" \n\t"\
00164 "pcmpeqw %%mm7, %%mm7 \n\t"\
00165 "psrlw $15, %%mm7 \n\t"\
00166 "psllw $6, %%mm7 \n\t"\
00167 ".p2align 4 \n\t" \
00168 "1: \n\t"\
00169 "movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
00170 "movq 8(%0, %%"REG_a", 2), %%mm1 \n\t"\
00171 "paddsw %%mm7, %%mm0 \n\t"\
00172 "paddsw %%mm7, %%mm1 \n\t"\
00173 "psraw $7, %%mm0 \n\t"\
00174 "psraw $7, %%mm1 \n\t"\
00175 "packuswb %%mm1, %%mm0 \n\t"\
00176 MOVNTQ(%%mm0, (%1, %%REGa))\
00177 "add $8, %%"REG_a" \n\t"\
00178 "jnc 1b \n\t"
00179
00180
00181
00182
00183
00184
00185
00186
00187 #define YSCALEYUV2PACKEDX_UV \
00188 __asm__ volatile(\
00189 "xor %%"REG_a", %%"REG_a" \n\t"\
00190 ".p2align 4 \n\t"\
00191 "nop \n\t"\
00192 "1: \n\t"\
00193 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
00194 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00195 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
00196 "movq %%mm3, %%mm4 \n\t"\
00197 ".p2align 4 \n\t"\
00198 "2: \n\t"\
00199 "movq 8(%%"REG_d"), %%mm0 \n\t" \
00200 "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" \
00201 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" \
00202 "add $16, %%"REG_d" \n\t"\
00203 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00204 "pmulhw %%mm0, %%mm2 \n\t"\
00205 "pmulhw %%mm0, %%mm5 \n\t"\
00206 "paddw %%mm2, %%mm3 \n\t"\
00207 "paddw %%mm5, %%mm4 \n\t"\
00208 "test %%"REG_S", %%"REG_S" \n\t"\
00209 " jnz 2b \n\t"\
00210
00211 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
00212 "lea "offset"(%0), %%"REG_d" \n\t"\
00213 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00214 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
00215 "movq "#dst1", "#dst2" \n\t"\
00216 ".p2align 4 \n\t"\
00217 "2: \n\t"\
00218 "movq 8(%%"REG_d"), "#coeff" \n\t" \
00219 "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" \
00220 "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" \
00221 "add $16, %%"REG_d" \n\t"\
00222 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00223 "pmulhw "#coeff", "#src1" \n\t"\
00224 "pmulhw "#coeff", "#src2" \n\t"\
00225 "paddw "#src1", "#dst1" \n\t"\
00226 "paddw "#src2", "#dst2" \n\t"\
00227 "test %%"REG_S", %%"REG_S" \n\t"\
00228 " jnz 2b \n\t"\
00229
00230 #define YSCALEYUV2PACKEDX \
00231 YSCALEYUV2PACKEDX_UV \
00232 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
00233
00234 #define YSCALEYUV2PACKEDX_END \
00235 :: "r" (&c->redDither), \
00236 "m" (dummy), "m" (dummy), "m" (dummy),\
00237 "r" (dest), "m" (dstW_reg) \
00238 : "%"REG_a, "%"REG_d, "%"REG_S \
00239 );
00240
00241 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
00242 __asm__ volatile(\
00243 "xor %%"REG_a", %%"REG_a" \n\t"\
00244 ".p2align 4 \n\t"\
00245 "nop \n\t"\
00246 "1: \n\t"\
00247 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
00248 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00249 "pxor %%mm4, %%mm4 \n\t"\
00250 "pxor %%mm5, %%mm5 \n\t"\
00251 "pxor %%mm6, %%mm6 \n\t"\
00252 "pxor %%mm7, %%mm7 \n\t"\
00253 ".p2align 4 \n\t"\
00254 "2: \n\t"\
00255 "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" \
00256 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" \
00257 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
00258 "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" \
00259 "movq %%mm0, %%mm3 \n\t"\
00260 "punpcklwd %%mm1, %%mm0 \n\t"\
00261 "punpckhwd %%mm1, %%mm3 \n\t"\
00262 "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" \
00263 "pmaddwd %%mm1, %%mm0 \n\t"\
00264 "pmaddwd %%mm1, %%mm3 \n\t"\
00265 "paddd %%mm0, %%mm4 \n\t"\
00266 "paddd %%mm3, %%mm5 \n\t"\
00267 "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" \
00268 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
00269 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
00270 "test %%"REG_S", %%"REG_S" \n\t"\
00271 "movq %%mm2, %%mm0 \n\t"\
00272 "punpcklwd %%mm3, %%mm2 \n\t"\
00273 "punpckhwd %%mm3, %%mm0 \n\t"\
00274 "pmaddwd %%mm1, %%mm2 \n\t"\
00275 "pmaddwd %%mm1, %%mm0 \n\t"\
00276 "paddd %%mm2, %%mm6 \n\t"\
00277 "paddd %%mm0, %%mm7 \n\t"\
00278 " jnz 2b \n\t"\
00279 "psrad $16, %%mm4 \n\t"\
00280 "psrad $16, %%mm5 \n\t"\
00281 "psrad $16, %%mm6 \n\t"\
00282 "psrad $16, %%mm7 \n\t"\
00283 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
00284 "packssdw %%mm5, %%mm4 \n\t"\
00285 "packssdw %%mm7, %%mm6 \n\t"\
00286 "paddw %%mm0, %%mm4 \n\t"\
00287 "paddw %%mm0, %%mm6 \n\t"\
00288 "movq %%mm4, "U_TEMP"(%0) \n\t"\
00289 "movq %%mm6, "V_TEMP"(%0) \n\t"\
00290
00291 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
00292 "lea "offset"(%0), %%"REG_d" \n\t"\
00293 "mov (%%"REG_d"), %%"REG_S" \n\t"\
00294 "pxor %%mm1, %%mm1 \n\t"\
00295 "pxor %%mm5, %%mm5 \n\t"\
00296 "pxor %%mm7, %%mm7 \n\t"\
00297 "pxor %%mm6, %%mm6 \n\t"\
00298 ".p2align 4 \n\t"\
00299 "2: \n\t"\
00300 "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" \
00301 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" \
00302 "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
00303 "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" \
00304 "movq %%mm0, %%mm3 \n\t"\
00305 "punpcklwd %%mm4, %%mm0 \n\t"\
00306 "punpckhwd %%mm4, %%mm3 \n\t"\
00307 "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" \
00308 "pmaddwd %%mm4, %%mm0 \n\t"\
00309 "pmaddwd %%mm4, %%mm3 \n\t"\
00310 "paddd %%mm0, %%mm1 \n\t"\
00311 "paddd %%mm3, %%mm5 \n\t"\
00312 "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" \
00313 "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
00314 "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
00315 "test %%"REG_S", %%"REG_S" \n\t"\
00316 "movq %%mm2, %%mm0 \n\t"\
00317 "punpcklwd %%mm3, %%mm2 \n\t"\
00318 "punpckhwd %%mm3, %%mm0 \n\t"\
00319 "pmaddwd %%mm4, %%mm2 \n\t"\
00320 "pmaddwd %%mm4, %%mm0 \n\t"\
00321 "paddd %%mm2, %%mm7 \n\t"\
00322 "paddd %%mm0, %%mm6 \n\t"\
00323 " jnz 2b \n\t"\
00324 "psrad $16, %%mm1 \n\t"\
00325 "psrad $16, %%mm5 \n\t"\
00326 "psrad $16, %%mm7 \n\t"\
00327 "psrad $16, %%mm6 \n\t"\
00328 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
00329 "packssdw %%mm5, %%mm1 \n\t"\
00330 "packssdw %%mm6, %%mm7 \n\t"\
00331 "paddw %%mm0, %%mm1 \n\t"\
00332 "paddw %%mm0, %%mm7 \n\t"\
00333 "movq "U_TEMP"(%0), %%mm3 \n\t"\
00334 "movq "V_TEMP"(%0), %%mm4 \n\t"\
00335
00336 #define YSCALEYUV2PACKEDX_ACCURATE \
00337 YSCALEYUV2PACKEDX_ACCURATE_UV \
00338 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
00339
00340 #define YSCALEYUV2RGBX \
00341 "psubw "U_OFFSET"(%0), %%mm3 \n\t" \
00342 "psubw "V_OFFSET"(%0), %%mm4 \n\t" \
00343 "movq %%mm3, %%mm2 \n\t" \
00344 "movq %%mm4, %%mm5 \n\t" \
00345 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
00346 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
00347 \
00348 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
00349 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
00350 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \
00351 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \
00352 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
00353 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
00354 \
00355 "paddw %%mm3, %%mm4 \n\t"\
00356 "movq %%mm2, %%mm0 \n\t"\
00357 "movq %%mm5, %%mm6 \n\t"\
00358 "movq %%mm4, %%mm3 \n\t"\
00359 "punpcklwd %%mm2, %%mm2 \n\t"\
00360 "punpcklwd %%mm5, %%mm5 \n\t"\
00361 "punpcklwd %%mm4, %%mm4 \n\t"\
00362 "paddw %%mm1, %%mm2 \n\t"\
00363 "paddw %%mm1, %%mm5 \n\t"\
00364 "paddw %%mm1, %%mm4 \n\t"\
00365 "punpckhwd %%mm0, %%mm0 \n\t"\
00366 "punpckhwd %%mm6, %%mm6 \n\t"\
00367 "punpckhwd %%mm3, %%mm3 \n\t"\
00368 "paddw %%mm7, %%mm0 \n\t"\
00369 "paddw %%mm7, %%mm6 \n\t"\
00370 "paddw %%mm7, %%mm3 \n\t"\
00371 \
00372 "packuswb %%mm0, %%mm2 \n\t"\
00373 "packuswb %%mm6, %%mm5 \n\t"\
00374 "packuswb %%mm3, %%mm4 \n\t"\
00375
00376 #define REAL_YSCALEYUV2PACKED(index, c) \
00377 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
00378 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
00379 "psraw $3, %%mm0 \n\t"\
00380 "psraw $3, %%mm1 \n\t"\
00381 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00382 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00383 "xor "#index", "#index" \n\t"\
00384 ".p2align 4 \n\t"\
00385 "1: \n\t"\
00386 "movq (%2, "#index"), %%mm2 \n\t" \
00387 "movq (%3, "#index"), %%mm3 \n\t" \
00388 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" \
00389 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" \
00390 "psubw %%mm3, %%mm2 \n\t" \
00391 "psubw %%mm4, %%mm5 \n\t" \
00392 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
00393 "pmulhw %%mm0, %%mm2 \n\t" \
00394 "pmulhw %%mm0, %%mm5 \n\t" \
00395 "psraw $7, %%mm3 \n\t" \
00396 "psraw $7, %%mm4 \n\t" \
00397 "paddw %%mm2, %%mm3 \n\t" \
00398 "paddw %%mm5, %%mm4 \n\t" \
00399 "movq (%0, "#index", 2), %%mm0 \n\t" \
00400 "movq (%1, "#index", 2), %%mm1 \n\t" \
00401 "movq 8(%0, "#index", 2), %%mm6 \n\t" \
00402 "movq 8(%1, "#index", 2), %%mm7 \n\t" \
00403 "psubw %%mm1, %%mm0 \n\t" \
00404 "psubw %%mm7, %%mm6 \n\t" \
00405 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
00406 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
00407 "psraw $7, %%mm1 \n\t" \
00408 "psraw $7, %%mm7 \n\t" \
00409 "paddw %%mm0, %%mm1 \n\t" \
00410 "paddw %%mm6, %%mm7 \n\t" \
00411
00412 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
00413
00414 #define REAL_YSCALEYUV2RGB_UV(index, c) \
00415 "xor "#index", "#index" \n\t"\
00416 ".p2align 4 \n\t"\
00417 "1: \n\t"\
00418 "movq (%2, "#index"), %%mm2 \n\t" \
00419 "movq (%3, "#index"), %%mm3 \n\t" \
00420 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" \
00421 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" \
00422 "psubw %%mm3, %%mm2 \n\t" \
00423 "psubw %%mm4, %%mm5 \n\t" \
00424 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
00425 "pmulhw %%mm0, %%mm2 \n\t" \
00426 "pmulhw %%mm0, %%mm5 \n\t" \
00427 "psraw $4, %%mm3 \n\t" \
00428 "psraw $4, %%mm4 \n\t" \
00429 "paddw %%mm2, %%mm3 \n\t" \
00430 "paddw %%mm5, %%mm4 \n\t" \
00431 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
00432 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
00433 "movq %%mm3, %%mm2 \n\t" \
00434 "movq %%mm4, %%mm5 \n\t" \
00435 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
00436 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
00437 \
00438
00439 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
00440 "movq ("#b1", "#index", 2), %%mm0 \n\t" \
00441 "movq ("#b2", "#index", 2), %%mm1 \n\t" \
00442 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \
00443 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \
00444 "psubw %%mm1, %%mm0 \n\t" \
00445 "psubw %%mm7, %%mm6 \n\t" \
00446 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
00447 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
00448 "psraw $4, %%mm1 \n\t" \
00449 "psraw $4, %%mm7 \n\t" \
00450 "paddw %%mm0, %%mm1 \n\t" \
00451 "paddw %%mm6, %%mm7 \n\t" \
00452
00453 #define REAL_YSCALEYUV2RGB_COEFF(c) \
00454 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
00455 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
00456 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
00457 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
00458 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
00459 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
00460 \
00461 "paddw %%mm3, %%mm4 \n\t"\
00462 "movq %%mm2, %%mm0 \n\t"\
00463 "movq %%mm5, %%mm6 \n\t"\
00464 "movq %%mm4, %%mm3 \n\t"\
00465 "punpcklwd %%mm2, %%mm2 \n\t"\
00466 "punpcklwd %%mm5, %%mm5 \n\t"\
00467 "punpcklwd %%mm4, %%mm4 \n\t"\
00468 "paddw %%mm1, %%mm2 \n\t"\
00469 "paddw %%mm1, %%mm5 \n\t"\
00470 "paddw %%mm1, %%mm4 \n\t"\
00471 "punpckhwd %%mm0, %%mm0 \n\t"\
00472 "punpckhwd %%mm6, %%mm6 \n\t"\
00473 "punpckhwd %%mm3, %%mm3 \n\t"\
00474 "paddw %%mm7, %%mm0 \n\t"\
00475 "paddw %%mm7, %%mm6 \n\t"\
00476 "paddw %%mm7, %%mm3 \n\t"\
00477 \
00478 "packuswb %%mm0, %%mm2 \n\t"\
00479 "packuswb %%mm6, %%mm5 \n\t"\
00480 "packuswb %%mm3, %%mm4 \n\t"\
00481
00482 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
00483
00484 #define YSCALEYUV2RGB(index, c) \
00485 REAL_YSCALEYUV2RGB_UV(index, c) \
00486 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
00487 REAL_YSCALEYUV2RGB_COEFF(c)
00488
00489 #define REAL_YSCALEYUV2PACKED1(index, c) \
00490 "xor "#index", "#index" \n\t"\
00491 ".p2align 4 \n\t"\
00492 "1: \n\t"\
00493 "movq (%2, "#index"), %%mm3 \n\t" \
00494 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" \
00495 "psraw $7, %%mm3 \n\t" \
00496 "psraw $7, %%mm4 \n\t" \
00497 "movq (%0, "#index", 2), %%mm1 \n\t" \
00498 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
00499 "psraw $7, %%mm1 \n\t" \
00500 "psraw $7, %%mm7 \n\t" \
00501
00502 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
00503
00504 #define REAL_YSCALEYUV2RGB1(index, c) \
00505 "xor "#index", "#index" \n\t"\
00506 ".p2align 4 \n\t"\
00507 "1: \n\t"\
00508 "movq (%2, "#index"), %%mm3 \n\t" \
00509 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4 \n\t" \
00510 "psraw $4, %%mm3 \n\t" \
00511 "psraw $4, %%mm4 \n\t" \
00512 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
00513 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
00514 "movq %%mm3, %%mm2 \n\t" \
00515 "movq %%mm4, %%mm5 \n\t" \
00516 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
00517 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
00518 \
00519 "movq (%0, "#index", 2), %%mm1 \n\t" \
00520 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
00521 "psraw $4, %%mm1 \n\t" \
00522 "psraw $4, %%mm7 \n\t" \
00523 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
00524 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
00525 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
00526 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
00527 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
00528 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
00529 \
00530 "paddw %%mm3, %%mm4 \n\t"\
00531 "movq %%mm2, %%mm0 \n\t"\
00532 "movq %%mm5, %%mm6 \n\t"\
00533 "movq %%mm4, %%mm3 \n\t"\
00534 "punpcklwd %%mm2, %%mm2 \n\t"\
00535 "punpcklwd %%mm5, %%mm5 \n\t"\
00536 "punpcklwd %%mm4, %%mm4 \n\t"\
00537 "paddw %%mm1, %%mm2 \n\t"\
00538 "paddw %%mm1, %%mm5 \n\t"\
00539 "paddw %%mm1, %%mm4 \n\t"\
00540 "punpckhwd %%mm0, %%mm0 \n\t"\
00541 "punpckhwd %%mm6, %%mm6 \n\t"\
00542 "punpckhwd %%mm3, %%mm3 \n\t"\
00543 "paddw %%mm7, %%mm0 \n\t"\
00544 "paddw %%mm7, %%mm6 \n\t"\
00545 "paddw %%mm7, %%mm3 \n\t"\
00546 \
00547 "packuswb %%mm0, %%mm2 \n\t"\
00548 "packuswb %%mm6, %%mm5 \n\t"\
00549 "packuswb %%mm3, %%mm4 \n\t"\
00550
00551 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
00552
00553 #define REAL_YSCALEYUV2PACKED1b(index, c) \
00554 "xor "#index", "#index" \n\t"\
00555 ".p2align 4 \n\t"\
00556 "1: \n\t"\
00557 "movq (%2, "#index"), %%mm2 \n\t" \
00558 "movq (%3, "#index"), %%mm3 \n\t" \
00559 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" \
00560 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" \
00561 "paddw %%mm2, %%mm3 \n\t" \
00562 "paddw %%mm5, %%mm4 \n\t" \
00563 "psrlw $8, %%mm3 \n\t" \
00564 "psrlw $8, %%mm4 \n\t" \
00565 "movq (%0, "#index", 2), %%mm1 \n\t" \
00566 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
00567 "psraw $7, %%mm1 \n\t" \
00568 "psraw $7, %%mm7 \n\t"
00569 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
00570
00571
00572 #define REAL_YSCALEYUV2RGB1b(index, c) \
00573 "xor "#index", "#index" \n\t"\
00574 ".p2align 4 \n\t"\
00575 "1: \n\t"\
00576 "movq (%2, "#index"), %%mm2 \n\t" \
00577 "movq (%3, "#index"), %%mm3 \n\t" \
00578 "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5 \n\t" \
00579 "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4 \n\t" \
00580 "paddw %%mm2, %%mm3 \n\t" \
00581 "paddw %%mm5, %%mm4 \n\t" \
00582 "psrlw $5, %%mm3 \n\t" \
00583 "psrlw $5, %%mm4 \n\t" \
00584 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
00585 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
00586 "movq %%mm3, %%mm2 \n\t" \
00587 "movq %%mm4, %%mm5 \n\t" \
00588 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
00589 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
00590 \
00591 "movq (%0, "#index", 2), %%mm1 \n\t" \
00592 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
00593 "psraw $4, %%mm1 \n\t" \
00594 "psraw $4, %%mm7 \n\t" \
00595 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
00596 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
00597 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
00598 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
00599 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
00600 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
00601 \
00602 "paddw %%mm3, %%mm4 \n\t"\
00603 "movq %%mm2, %%mm0 \n\t"\
00604 "movq %%mm5, %%mm6 \n\t"\
00605 "movq %%mm4, %%mm3 \n\t"\
00606 "punpcklwd %%mm2, %%mm2 \n\t"\
00607 "punpcklwd %%mm5, %%mm5 \n\t"\
00608 "punpcklwd %%mm4, %%mm4 \n\t"\
00609 "paddw %%mm1, %%mm2 \n\t"\
00610 "paddw %%mm1, %%mm5 \n\t"\
00611 "paddw %%mm1, %%mm4 \n\t"\
00612 "punpckhwd %%mm0, %%mm0 \n\t"\
00613 "punpckhwd %%mm6, %%mm6 \n\t"\
00614 "punpckhwd %%mm3, %%mm3 \n\t"\
00615 "paddw %%mm7, %%mm0 \n\t"\
00616 "paddw %%mm7, %%mm6 \n\t"\
00617 "paddw %%mm7, %%mm3 \n\t"\
00618 \
00619 "packuswb %%mm0, %%mm2 \n\t"\
00620 "packuswb %%mm6, %%mm5 \n\t"\
00621 "packuswb %%mm3, %%mm4 \n\t"\
00622
00623 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
00624
00625 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
00626 "movq (%1, "#index", 2), %%mm7 \n\t" \
00627 "movq 8(%1, "#index", 2), %%mm1 \n\t" \
00628 "psraw $7, %%mm7 \n\t" \
00629 "psraw $7, %%mm1 \n\t" \
00630 "packuswb %%mm1, %%mm7 \n\t"
00631 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
00632
00633 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
00634 "movq "#b", "#q2" \n\t" \
00635 "movq "#r", "#t" \n\t" \
00636 "punpcklbw "#g", "#b" \n\t" \
00637 "punpcklbw "#a", "#r" \n\t" \
00638 "punpckhbw "#g", "#q2" \n\t" \
00639 "punpckhbw "#a", "#t" \n\t" \
00640 "movq "#b", "#q0" \n\t" \
00641 "movq "#q2", "#q3" \n\t" \
00642 "punpcklwd "#r", "#q0" \n\t" \
00643 "punpckhwd "#r", "#b" \n\t" \
00644 "punpcklwd "#t", "#q2" \n\t" \
00645 "punpckhwd "#t", "#q3" \n\t" \
00646 \
00647 MOVNTQ( q0, (dst, index, 4))\
00648 MOVNTQ( b, 8(dst, index, 4))\
00649 MOVNTQ( q2, 16(dst, index, 4))\
00650 MOVNTQ( q3, 24(dst, index, 4))\
00651 \
00652 "add $8, "#index" \n\t"\
00653 "cmp "#dstw", "#index" \n\t"\
00654 " jb 1b \n\t"
00655 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
00656
00657 #define REAL_WRITERGB16(dst, dstw, index) \
00658 "pand "MANGLE(bF8)", %%mm2 \n\t" \
00659 "pand "MANGLE(bFC)", %%mm4 \n\t" \
00660 "pand "MANGLE(bF8)", %%mm5 \n\t" \
00661 "psrlq $3, %%mm2 \n\t"\
00662 \
00663 "movq %%mm2, %%mm1 \n\t"\
00664 "movq %%mm4, %%mm3 \n\t"\
00665 \
00666 "punpcklbw %%mm7, %%mm3 \n\t"\
00667 "punpcklbw %%mm5, %%mm2 \n\t"\
00668 "punpckhbw %%mm7, %%mm4 \n\t"\
00669 "punpckhbw %%mm5, %%mm1 \n\t"\
00670 \
00671 "psllq $3, %%mm3 \n\t"\
00672 "psllq $3, %%mm4 \n\t"\
00673 \
00674 "por %%mm3, %%mm2 \n\t"\
00675 "por %%mm4, %%mm1 \n\t"\
00676 \
00677 MOVNTQ(%%mm2, (dst, index, 2))\
00678 MOVNTQ(%%mm1, 8(dst, index, 2))\
00679 \
00680 "add $8, "#index" \n\t"\
00681 "cmp "#dstw", "#index" \n\t"\
00682 " jb 1b \n\t"
00683 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
00684
00685 #define REAL_WRITERGB15(dst, dstw, index) \
00686 "pand "MANGLE(bF8)", %%mm2 \n\t" \
00687 "pand "MANGLE(bF8)", %%mm4 \n\t" \
00688 "pand "MANGLE(bF8)", %%mm5 \n\t" \
00689 "psrlq $3, %%mm2 \n\t"\
00690 "psrlq $1, %%mm5 \n\t"\
00691 \
00692 "movq %%mm2, %%mm1 \n\t"\
00693 "movq %%mm4, %%mm3 \n\t"\
00694 \
00695 "punpcklbw %%mm7, %%mm3 \n\t"\
00696 "punpcklbw %%mm5, %%mm2 \n\t"\
00697 "punpckhbw %%mm7, %%mm4 \n\t"\
00698 "punpckhbw %%mm5, %%mm1 \n\t"\
00699 \
00700 "psllq $2, %%mm3 \n\t"\
00701 "psllq $2, %%mm4 \n\t"\
00702 \
00703 "por %%mm3, %%mm2 \n\t"\
00704 "por %%mm4, %%mm1 \n\t"\
00705 \
00706 MOVNTQ(%%mm2, (dst, index, 2))\
00707 MOVNTQ(%%mm1, 8(dst, index, 2))\
00708 \
00709 "add $8, "#index" \n\t"\
00710 "cmp "#dstw", "#index" \n\t"\
00711 " jb 1b \n\t"
00712 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
00713
00714 #define WRITEBGR24OLD(dst, dstw, index) \
00715 \
00716 "movq %%mm2, %%mm1 \n\t" \
00717 "movq %%mm5, %%mm6 \n\t" \
00718 "punpcklbw %%mm4, %%mm2 \n\t" \
00719 "punpcklbw %%mm7, %%mm5 \n\t" \
00720 "punpckhbw %%mm4, %%mm1 \n\t" \
00721 "punpckhbw %%mm7, %%mm6 \n\t" \
00722 "movq %%mm2, %%mm0 \n\t" \
00723 "movq %%mm1, %%mm3 \n\t" \
00724 "punpcklwd %%mm5, %%mm0 \n\t" \
00725 "punpckhwd %%mm5, %%mm2 \n\t" \
00726 "punpcklwd %%mm6, %%mm1 \n\t" \
00727 "punpckhwd %%mm6, %%mm3 \n\t" \
00728 \
00729 "movq %%mm0, %%mm4 \n\t" \
00730 "psrlq $8, %%mm0 \n\t" \
00731 "pand "MANGLE(bm00000111)", %%mm4 \n\t" \
00732 "pand "MANGLE(bm11111000)", %%mm0 \n\t" \
00733 "por %%mm4, %%mm0 \n\t" \
00734 "movq %%mm2, %%mm4 \n\t" \
00735 "psllq $48, %%mm2 \n\t" \
00736 "por %%mm2, %%mm0 \n\t" \
00737 \
00738 "movq %%mm4, %%mm2 \n\t" \
00739 "psrld $16, %%mm4 \n\t" \
00740 "psrlq $24, %%mm2 \n\t" \
00741 "por %%mm4, %%mm2 \n\t" \
00742 "pand "MANGLE(bm00001111)", %%mm2 \n\t" \
00743 "movq %%mm1, %%mm4 \n\t" \
00744 "psrlq $8, %%mm1 \n\t" \
00745 "pand "MANGLE(bm00000111)", %%mm4 \n\t" \
00746 "pand "MANGLE(bm11111000)", %%mm1 \n\t" \
00747 "por %%mm4, %%mm1 \n\t" \
00748 "movq %%mm1, %%mm4 \n\t" \
00749 "psllq $32, %%mm1 \n\t" \
00750 "por %%mm1, %%mm2 \n\t" \
00751 \
00752 "psrlq $32, %%mm4 \n\t" \
00753 "movq %%mm3, %%mm5 \n\t" \
00754 "psrlq $8, %%mm3 \n\t" \
00755 "pand "MANGLE(bm00000111)", %%mm5 \n\t" \
00756 "pand "MANGLE(bm11111000)", %%mm3 \n\t" \
00757 "por %%mm5, %%mm3 \n\t" \
00758 "psllq $16, %%mm3 \n\t" \
00759 "por %%mm4, %%mm3 \n\t" \
00760 \
00761 MOVNTQ(%%mm0, (dst))\
00762 MOVNTQ(%%mm2, 8(dst))\
00763 MOVNTQ(%%mm3, 16(dst))\
00764 "add $24, "#dst" \n\t"\
00765 \
00766 "add $8, "#index" \n\t"\
00767 "cmp "#dstw", "#index" \n\t"\
00768 " jb 1b \n\t"
00769
00770 #define WRITEBGR24MMX(dst, dstw, index) \
00771 \
00772 "movq %%mm2, %%mm1 \n\t" \
00773 "movq %%mm5, %%mm6 \n\t" \
00774 "punpcklbw %%mm4, %%mm2 \n\t" \
00775 "punpcklbw %%mm7, %%mm5 \n\t" \
00776 "punpckhbw %%mm4, %%mm1 \n\t" \
00777 "punpckhbw %%mm7, %%mm6 \n\t" \
00778 "movq %%mm2, %%mm0 \n\t" \
00779 "movq %%mm1, %%mm3 \n\t" \
00780 "punpcklwd %%mm5, %%mm0 \n\t" \
00781 "punpckhwd %%mm5, %%mm2 \n\t" \
00782 "punpcklwd %%mm6, %%mm1 \n\t" \
00783 "punpckhwd %%mm6, %%mm3 \n\t" \
00784 \
00785 "movq %%mm0, %%mm4 \n\t" \
00786 "movq %%mm2, %%mm6 \n\t" \
00787 "movq %%mm1, %%mm5 \n\t" \
00788 "movq %%mm3, %%mm7 \n\t" \
00789 \
00790 "psllq $40, %%mm0 \n\t" \
00791 "psllq $40, %%mm2 \n\t" \
00792 "psllq $40, %%mm1 \n\t" \
00793 "psllq $40, %%mm3 \n\t" \
00794 \
00795 "punpckhdq %%mm4, %%mm0 \n\t" \
00796 "punpckhdq %%mm6, %%mm2 \n\t" \
00797 "punpckhdq %%mm5, %%mm1 \n\t" \
00798 "punpckhdq %%mm7, %%mm3 \n\t" \
00799 \
00800 "psrlq $8, %%mm0 \n\t" \
00801 "movq %%mm2, %%mm6 \n\t" \
00802 "psllq $40, %%mm2 \n\t" \
00803 "por %%mm2, %%mm0 \n\t" \
00804 MOVNTQ(%%mm0, (dst))\
00805 \
00806 "psrlq $24, %%mm6 \n\t" \
00807 "movq %%mm1, %%mm5 \n\t" \
00808 "psllq $24, %%mm1 \n\t" \
00809 "por %%mm1, %%mm6 \n\t" \
00810 MOVNTQ(%%mm6, 8(dst))\
00811 \
00812 "psrlq $40, %%mm5 \n\t" \
00813 "psllq $8, %%mm3 \n\t" \
00814 "por %%mm3, %%mm5 \n\t" \
00815 MOVNTQ(%%mm5, 16(dst))\
00816 \
00817 "add $24, "#dst" \n\t"\
00818 \
00819 "add $8, "#index" \n\t"\
00820 "cmp "#dstw", "#index" \n\t"\
00821 " jb 1b \n\t"
00822
00823 #define WRITEBGR24MMX2(dst, dstw, index) \
00824 \
00825 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
00826 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
00827 "pshufw $0x50, %%mm2, %%mm1 \n\t" \
00828 "pshufw $0x50, %%mm4, %%mm3 \n\t" \
00829 "pshufw $0x00, %%mm5, %%mm6 \n\t" \
00830 \
00831 "pand %%mm0, %%mm1 \n\t" \
00832 "pand %%mm0, %%mm3 \n\t" \
00833 "pand %%mm7, %%mm6 \n\t" \
00834 \
00835 "psllq $8, %%mm3 \n\t" \
00836 "por %%mm1, %%mm6 \n\t"\
00837 "por %%mm3, %%mm6 \n\t"\
00838 MOVNTQ(%%mm6, (dst))\
00839 \
00840 "psrlq $8, %%mm4 \n\t" \
00841 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \
00842 "pshufw $0x55, %%mm4, %%mm3 \n\t" \
00843 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \
00844 \
00845 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \
00846 "pand %%mm7, %%mm3 \n\t" \
00847 "pand %%mm0, %%mm6 \n\t" \
00848 \
00849 "por %%mm1, %%mm3 \n\t" \
00850 "por %%mm3, %%mm6 \n\t"\
00851 MOVNTQ(%%mm6, 8(dst))\
00852 \
00853 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \
00854 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \
00855 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \
00856 \
00857 "pand %%mm7, %%mm1 \n\t" \
00858 "pand %%mm0, %%mm3 \n\t" \
00859 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \
00860 \
00861 "por %%mm1, %%mm3 \n\t"\
00862 "por %%mm3, %%mm6 \n\t"\
00863 MOVNTQ(%%mm6, 16(dst))\
00864 \
00865 "add $24, "#dst" \n\t"\
00866 \
00867 "add $8, "#index" \n\t"\
00868 "cmp "#dstw", "#index" \n\t"\
00869 " jb 1b \n\t"
00870
00871 #if COMPILE_TEMPLATE_MMX2
00872 #undef WRITEBGR24
00873 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index)
00874 #else
00875 #undef WRITEBGR24
00876 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
00877 #endif
00878
00879 #define REAL_WRITEYUY2(dst, dstw, index) \
00880 "packuswb %%mm3, %%mm3 \n\t"\
00881 "packuswb %%mm4, %%mm4 \n\t"\
00882 "packuswb %%mm7, %%mm1 \n\t"\
00883 "punpcklbw %%mm4, %%mm3 \n\t"\
00884 "movq %%mm1, %%mm7 \n\t"\
00885 "punpcklbw %%mm3, %%mm1 \n\t"\
00886 "punpckhbw %%mm3, %%mm7 \n\t"\
00887 \
00888 MOVNTQ(%%mm1, (dst, index, 2))\
00889 MOVNTQ(%%mm7, 8(dst, index, 2))\
00890 \
00891 "add $8, "#index" \n\t"\
00892 "cmp "#dstw", "#index" \n\t"\
00893 " jb 1b \n\t"
00894 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
00895
00896
00897 static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
00898 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
00899 uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
00900 {
00901 #if COMPILE_TEMPLATE_MMX
00902 if(!(c->flags & SWS_BITEXACT)) {
00903 if (c->flags & SWS_ACCURATE_RND) {
00904 if (uDest) {
00905 YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
00906 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
00907 }
00908 if (CONFIG_SWSCALE_ALPHA && aDest) {
00909 YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
00910 }
00911
00912 YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
00913 } else {
00914 if (uDest) {
00915 YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
00916 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
00917 }
00918 if (CONFIG_SWSCALE_ALPHA && aDest) {
00919 YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
00920 }
00921
00922 YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
00923 }
00924 return;
00925 }
00926 #endif
00927 #if COMPILE_TEMPLATE_ALTIVEC
00928 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
00929 chrFilter, chrSrc, chrFilterSize,
00930 dest, uDest, vDest, dstW, chrDstW);
00931 #else //COMPILE_TEMPLATE_ALTIVEC
00932 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
00933 chrFilter, chrSrc, chrFilterSize,
00934 alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
00935 #endif
00936 }
00937
00938 static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
00939 const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
00940 uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
00941 {
00942 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
00943 chrFilter, chrSrc, chrFilterSize,
00944 dest, uDest, dstW, chrDstW, dstFormat);
00945 }
00946
/**
 * Unscaled ("1-tap") vertical output: convert the 16-bit intermediate
 * luma/chroma (and optionally alpha) rows to 8-bit planar output.
 * Each sample is rounded ((x+64)>>7) and clipped to 0..255.
 *
 * The MMX path handles the four planes (alpha, luma, U, V) one after the
 * other with the YSCALEYUV2YV121[_ACCURATE] asm macros, walking each row
 * with a negative counter that reaches zero at the row end.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* Plane order: alpha, luma, U, V.  The V samples live at offset
         * VOFW inside the shared chroma buffer.  Pointers are advanced to
         * the row end so the asm can index with the negative counter. */
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {    /* skip planes the caller did not request */
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    /* C fallback: luma plane. */
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* Bit 8 set means val is outside 0..255 for the value range the
         * intermediate samples produce here; then clamp by sign. */
        if (val&256) {
            if (val<0) val=0;
            else val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            /* Same clip trick, testing both channels at once. */
            if ((u|v)&256) {
                if (u<0) u=0;
                else if (u>255) u=255;
                if (v<0) v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
01017
01018
/**
 * Vertical multi-tap scaling with output to an interleaved (packed)
 * pixel format: RGB32, BGR24, RGB555, RGB565 or YUYV422.
 *
 * MMX path: each case is pasted together from asm macro fragments —
 * YSCALEYUV2PACKEDX[_ACCURATE] performs the vertical filtering,
 * YSCALEYUV2RGBX the YUV->RGB conversion, and the WRITE* macros pack and
 * store the pixels.  SWS_ACCURATE_RND selects the higher-precision
 * rounding variants.  Formats without an asm writer, bit-exact mode, and
 * non-x86 builds fall through to the AltiVec/C implementations below.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
{
#if COMPILE_TEMPLATE_MMX
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    /* RGB path first; the Y/U/V results are parked in the
                     * context's scratch slots while the alpha plane is
                     * filtered through the same vertical-scale macro. */
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "movq %%mm2, "U_TEMP"(%0) \n\t"
                    "movq %%mm4, "V_TEMP"(%0) \n\t"
                    "movq %%mm5, "Y_TEMP"(%0) \n\t"
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
                    "movq "Y_TEMP"(%0), %%mm5 \n\t"
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX_ACCURATE
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha (all ones) */
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)

                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                /* WRITEBGR24 needs a separate destination register
                 * (3 bytes/pixel), hence the explicit operand list instead
                 * of YSCALEYUV2PACKEDX_END. */
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" /* REG_c = 3*REG_a */
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)


                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW_reg)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2/mm4/mm5 hold B/G/R at this point */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"

#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                /* No RGB conversion: just scale the filtered samples back
                 * to 8 bits and interleave as YUYV. */
                YSCALEYUV2PACKEDX_ACCURATE


                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        } else {
            /* Fast-rounding variants: same structure as above with the
             * plain YSCALEYUV2PACKEDX filtering macro. */
            switch(c->dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                } else {
                    YSCALEYUV2PACKEDX
                    YSCALEYUV2RGBX
                    "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha */
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    YSCALEYUV2PACKEDX_END
                }
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" /* REG_c = 3*REG_a */
                "add %4, %%"REG_c" \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW_reg)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"

#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "pxor %%mm7, %%mm7 \n\t"

#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX


                "psraw $3, %%mm3 \n\t"
                "psraw $3, %%mm4 \n\t"
                "psraw $3, %%mm1 \n\t"
                "psraw $3, %%mm7 \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    /* The AltiVec version handles the RGB formats without alpha; anything
     * else takes the generic C path. */
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
        (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA ||
         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
         c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB))
        ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
                               chrFilter, chrSrc, chrFilterSize,
                               dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       alpSrc, dest, dstW, dstY);
}
01204
/**
 * Bilinear (2-tap) vertical scaling to packed output: blends two source
 * rows with weights yalpha/uvalpha (out of 4096), then converts and
 * packs.  The 32-bit x86 variants need both %ebx and %ebp as asm
 * operands, so they spill %ebx into the context (ESP_OFFSET) and push
 * %ebp around the loop.  The x86-64 RGB32+alpha variant instead passes
 * the two alpha rows in extra registers (%r8 plus operands %6/%7).
 *
 * NOTE(review): the literal 8280(%5) offsets appear to address a
 * fixed SwsContext field (the destination width) — confirm against the
 * context layout before changing.
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    /* Complementary weights for the C fallback macros below. */
    int yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note XXX: we need to do this check/handling better
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7 */
                    "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7 */
                    "packuswb %%mm7, %%mm1 \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                    "a" (&c->redDither)
                    ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                /* 32-bit: not enough registers for the alpha rows, so
                 * stash them in context scratch fields and reload inside
                 * the asm after pushing the row-pointer operands. */
                *(const uint16_t **)(&c->u_temp)=abuf0;
                *(const uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push %0 \n\t"
                    "push %1 \n\t"
                    "mov "U_TEMP"(%5), %0 \n\t"
                    "mov "V_TEMP"(%5), %1 \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw $3, %%mm1 \n\t"
                    "psraw $3, %%mm7 \n\t"
                    "packuswb %%mm7, %%mm1 \n\t"
                    "pop %1 \n\t"
                    "pop %0 \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha */
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"
                /* mm2/mm4/mm5 hold B/G/R; add per-channel dither before
                 * truncating to 5/5/5 bits. */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor %%mm7, %%mm7 \n\t"

#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    /* Generic C fallback for every remaining format. */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
01353
/**
 * Single-source-row ("1-tap") variant of yuv2packed2: the luma comes
 * from one row only.  For uvalpha < 2048 the chroma is also taken from a
 * single row (YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 macros); otherwise the
 * two chroma rows are combined by the *1b macro variants.
 *
 * SWS_FULL_CHR_H_INT delegates to the 2-tap path with buf0 used for both
 * rows and yalpha 0, which produces the same result.
 *
 * NOTE(review): as in yuv2packed2, 8280(%5) looks like a fixed
 * SwsContext offset (destination width) — verify against the context
 * layout before touching it.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                                       const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    /* Aliases consumed by the C fallback macros at the bottom. */
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        /* alpha row goes in the %1 slot here */
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    /* per-channel dither before 5/5/5 truncation */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"

#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        } else {
            /* uvalpha >= 2048: use the *1b macros, which consume both
             * chroma rows. */
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                        "mov %4, %%"REG_b" \n\t"
                        "push %%"REG_BP" \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP" \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"

#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor %%mm7, %%mm7 \n\t"

#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                    "mov %4, %%"REG_b" \n\t"
                    "push %%"REG_BP" \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP" \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif
    /* C fallback: 1-row vs 2-row chroma, mirroring the asm split above. */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
01594
01595
01596
/**
 * Extract the luma plane from packed YUYV422 input: Y occupies the even
 * bytes (dst[i] = src[2*i]).  The MMX version masks out the chroma bytes
 * with the bm01010101 constant and repacks 16 bytes per iteration,
 * walking a negative index up to zero.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "pand %%mm2, %%mm0 \n\t"
        "pand %%mm2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
01621
/**
 * Extract the chroma planes from packed YUYV422 input:
 * dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3].  The MMX version shifts
 * out the Y bytes, then splits the interleaved UV stream into the two
 * planes using the bm01010101 mask.  src2 must alias src1 (asserted).
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
01655
/**
 * Take the high-order (odd) byte of each 16-bit sample from two separate
 * source planes: dstU[i] = src1[2*i+1], dstV[i] = src2[2*i+1].
 * Presumably used to reduce little-endian 16-bit chroma planes to 8 bits
 * — confirm against the callers that install this function pointer.
 */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
01687
01688
01689
/**
 * Extract the luma plane from packed UYVY422 input: Y occupies the odd
 * bytes (dst[i] = src[2*i+1]).  The MMX version shifts the chroma bytes
 * out instead of masking, then repacks 16 bytes per iteration.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "psrlw $8, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
01713
/**
 * Extract the chroma planes from packed UYVY422 input:
 * dstU[i] = src1[4*i], dstV[i] = src1[4*i+2].  Mirror image of
 * yuy2ToUV: the Y bytes are masked out first, then the interleaved UV
 * stream is split into the two planes.  src2 must alias src1 (asserted).
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",4), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",4), %%mm1 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "movd %%mm0, (%3, %%"REG_a") \n\t"
        "movd %%mm1, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
01747
/**
 * Take the low-order (even) byte of each 16-bit sample from two separate
 * source planes: dstU[i] = src1[2*i], dstV[i] = src2[2*i].  Counterpart
 * of LEToUV — presumably used for big-endian 16-bit chroma planes;
 * confirm against the callers that install this function pointer.
 */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq (%2, %%"REG_a",2), %%mm2 \n\t"
        "movq 8(%2, %%"REG_a",2), %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "pand %%mm4, %%mm2 \n\t"
        "pand %%mm4, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%3, %%"REG_a") \n\t"
        "movq %%mm2, (%4, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
01780
/**
 * De-interleave a packed two-channel byte stream into two planes:
 * dst1[i] = src[2*i], dst2[i] = src[2*i+1].  Shared worker for the
 * NV12/NV21 chroma readers below, which only differ in which plane
 * receives the even bytes.
 */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4 \n\t"
        "mov %0, %%"REG_a" \n\t"
        "1: \n\t"
        "movq (%1, %%"REG_a",2), %%mm0 \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pand %%mm4, %%mm0 \n\t"
        "pand %%mm4, %%mm1 \n\t"
        "psrlw $8, %%mm2 \n\t"
        "psrlw $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm2, (%3, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}
01814
/**
 * NV12 chroma reader: interleaved stream is U,V,U,V..., so the even
 * bytes go to dstU and the odd bytes to dstV.  src2 is unused.
 */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
01821
/**
 * NV21 chroma reader: interleaved stream is V,U,V,U..., so the plane
 * arguments are swapped relative to nv12ToUV.  src2 is unused.
 */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
01828
01829 #if COMPILE_TEMPLATE_MMX
/**
 * MMX luma extraction from packed 24-bit RGB/BGR.  The first two asm
 * statements preload mm5/mm6 with the coefficient pairs matching the
 * byte order (BGR vs RGB tables); the main loop then converts 4 pixels
 * (12 input bytes -> 4 output bytes) per iteration with pmaddwd,
 * rounding via ff_bgr24toYOffset and >>15.
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t"
            "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq "MANGLE(ff_bgr24toYOffset)", %%mm4 \n\t"
        "mov %2, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t" /* zero, for byte->word unpack */
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "movd 6(%0), %%mm2 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t" /* 4 pixels * 3 bytes consumed */
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "pmaddwd %%mm5, %%mm0 \n\t"
        "pmaddwd %%mm6, %%mm1 \n\t"
        "pmaddwd %%mm5, %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm4, %%mm0 \n\t" /* rounding offset */
        "paddd %%mm4, %%mm2 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "packssdw %%mm2, %%mm0 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t" /* 4 luma bytes emitted */
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dst+width), "g" ((x86_reg)-width)
        : "%"REG_a
    );
}
01882
/**
 * MMX chroma extraction from packed 24-bit RGB/BGR.  The coefficient
 * table for the requested byte order is passed in as operand %4
 * (ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]); its four quadwords hold
 * the U and V weight pairs, the last of which is cached in mm6.  Each
 * iteration converts 4 pixels (12 bytes) into 4 U and 4 V bytes, with
 * rounding via ff_bgr24toUVOffset and >>15.
 */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq 24(%4), %%mm6 \n\t" /* last coefficient pair */
        "mov %3, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t" /* zero, for byte->word unpack */
        "1: \n\t"
        PREFETCH" 64(%0) \n\t"
        /* first pixel pair */
        "movd (%0), %%mm0 \n\t"
        "movd 2(%0), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm3 \n\t"
        "pmaddwd (%4), %%mm0 \n\t"
        "pmaddwd 8(%4), %%mm1 \n\t"
        "pmaddwd 16(%4), %%mm2 \n\t"
        "pmaddwd %%mm6, %%mm3 \n\t"
        "paddd %%mm1, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        /* second pixel pair */
        "movd 6(%0), %%mm1 \n\t"
        "movd 8(%0), %%mm3 \n\t"
        "add $12, %0 \n\t" /* 4 pixels * 3 bytes consumed */
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "movq %%mm3, %%mm5 \n\t"
        "pmaddwd (%4), %%mm1 \n\t"
        "pmaddwd 8(%4), %%mm3 \n\t"
        "pmaddwd 16(%4), %%mm4 \n\t"
        "pmaddwd %%mm6, %%mm5 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm5, %%mm4 \n\t"
        /* round, shift and pack both channels */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3 \n\t"
        "paddd %%mm3, %%mm0 \n\t"
        "paddd %%mm3, %%mm2 \n\t"
        "paddd %%mm3, %%mm1 \n\t"
        "paddd %%mm3, %%mm4 \n\t"
        "psrad $15, %%mm0 \n\t"
        "psrad $15, %%mm2 \n\t"
        "psrad $15, %%mm1 \n\t"
        "psrad $15, %%mm4 \n\t"
        "packssdw %%mm1, %%mm0 \n\t"
        "packssdw %%mm4, %%mm2 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm2, %%mm2 \n\t"
        "movd %%mm0, (%1, %%"REG_a") \n\t"
        "movd %%mm2, (%2, %%"REG_a") \n\t"
        "add $4, %%"REG_a" \n\t"
        " js 1b \n\t"
        : "+r" (src)
        : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
        : "%"REG_a
    );
}
01940 #endif
01941
01942 static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
01943 {
01944 #if COMPILE_TEMPLATE_MMX
01945 RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
01946 #else
01947 int i;
01948 for (i=0; i<width; i++) {
01949 int b= src[i*3+0];
01950 int g= src[i*3+1];
01951 int r= src[i*3+2];
01952
01953 dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
01954 }
01955 #endif
01956 }
01957
01958 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
01959 {
01960 #if COMPILE_TEMPLATE_MMX
01961 RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
01962 #else
01963 int i;
01964 for (i=0; i<width; i++) {
01965 int b= src1[3*i + 0];
01966 int g= src1[3*i + 1];
01967 int r= src1[3*i + 2];
01968
01969 dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
01970 dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
01971 }
01972 #endif
01973 assert(src1 == src2);
01974 }
01975
01976 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
01977 {
01978 int i;
01979 for (i=0; i<width; i++) {
01980 int b= src1[6*i + 0] + src1[6*i + 3];
01981 int g= src1[6*i + 1] + src1[6*i + 4];
01982 int r= src1[6*i + 2] + src1[6*i + 5];
01983
01984 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
01985 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
01986 }
01987 assert(src1 == src2);
01988 }
01989
/**
 * Convert a line of packed RGB24 pixels to Y (luma).
 * Same math as bgr24ToY, but with R first in memory; the MMX path reuses
 * the bgr24 asm kernel, parameterized by PIX_FMT_RGB24.
 */
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++) {
        int r= src[i*3+0];  /* RGB byte order: red first */
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}
02005
/**
 * Convert a line of packed RGB24 pixels to U and V (chroma), one output
 * sample per input pixel.  Only src1 is read; src1 and src2 must alias
 * (asserted in both build paths).
 */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}
02024
02025 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
02026 {
02027 int i;
02028 assert(src1==src2);
02029 for (i=0; i<width; i++) {
02030 int r= src1[6*i + 0] + src1[6*i + 3];
02031 int g= src1[6*i + 1] + src1[6*i + 4];
02032 int b= src1[6*i + 2] + src1[6*i + 5];
02033
02034 dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
02035 dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
02036 }
02037 }
02038
02039
02040
/**
 * Horizontal scaler: for each of dstW outputs, applies a filterSize-tap FIR
 * filter to src starting at filterPos[i], producing a 15-bit clipped result
 * in dst (val>>7, saturated).
 * MMX builds have hand-unrolled special cases for filterSize 4 and 8 and a
 * generic tap loop otherwise; AltiVec builds call the AltiVec kernel; plain
 * C is the fallback.  srcW and xInc are only consumed by the AltiVec path.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) {
        /* Bias the pointers by -counter so the loop can count a negative
         * index up towards zero and exit on "jnc" (carry clear). */
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            /* ebx is the PIC register; preserve it manually */
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t"
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            /* two source positions per iteration, 4 taps each */
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 4), %%mm1 \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"

            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push %%"REG_b" \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "push %%"REG_BP" \n\t"
            "mov %%"REG_a", %%"REG_BP" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            /* first 4 taps of each of the two outputs */
            "movzwl (%2, %%"REG_BP"), %%eax \n\t"
            "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t"
            "movq (%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t"
            "movd (%3, %%"REG_a"), %%mm0 \n\t"
            "movd (%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"

            /* second 4 taps, accumulated on top */
            "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t"
            "movd 4(%3, %%"REG_a"), %%mm4 \n\t"
            "movd 4(%3, %%"REG_b"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm4 \n\t"
            "pmaddwd %%mm2, %%mm5 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "paddd %%mm5, %%mm3 \n\t"
            "movq %%mm0, %%mm4 \n\t"
            "punpckldq %%mm3, %%mm0 \n\t"
            "punpckhdq %%mm3, %%mm4 \n\t"
            "paddd %%mm4, %%mm0 \n\t"
            "psrad $7, %%mm0 \n\t"
            "packssdw %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%4, %%"REG_BP") \n\t"
            "add $4, %%"REG_BP" \n\t"
            " jnc 1b \n\t"

            "pop %%"REG_BP" \n\t"
#if defined(PIC)
            "pop %%"REG_b" \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        /* generic tap count: inner loop "2:" walks the taps 4 at a time,
         * comparing the advancing src pointer against src+filterSize */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;

        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor %%mm7, %%mm7 \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            "mov %2, %%"REG_c" \n\t"
            "movzwl (%%"REG_c", %0), %%eax \n\t"
            "movzwl 2(%%"REG_c", %0), %%edx \n\t"
            "mov %5, %%"REG_c" \n\t"
            "pxor %%mm4, %%mm4 \n\t"
            "pxor %%mm5, %%mm5 \n\t"
            "2: \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1, %6), %%mm3 \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "pmaddwd %%mm1, %%mm0 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "paddd %%mm3, %%mm5 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "add $8, %1 \n\t"
            "add $4, %%"REG_c" \n\t"
            "cmp %4, %%"REG_c" \n\t"
            " jb 2b \n\t"
            "add %6, %1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpckldq %%mm5, %%mm4 \n\t"
            "punpckhdq %%mm5, %%mm0 \n\t"
            "paddd %%mm0, %%mm4 \n\t"
            "psrad $7, %%mm4 \n\t"
            "packssdw %%mm4, %%mm4 \n\t"
            "mov %3, %%"REG_a" \n\t"
            "movd %%mm4, (%%"REG_a", %0) \n\t"
            "add $4, %0 \n\t"
            " jnc 1b \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
              "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* reference C implementation */
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;

        for (j=0; j<filterSize; j++) {

            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }

        dst[i] = FFMIN(val>>7, (1<<15)-1); /* clip to signed 15 bit */

    }
#endif
#endif
}
02215
02216
02217
02218 static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
02219 {
02220 int i;
02221 for (i = 0; i < width; i++) {
02222 dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12;
02223 dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12;
02224 }
02225 }
02226 static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
02227 {
02228 int i;
02229 for (i = 0; i < width; i++) {
02230 dst[i ] = (dst[i ]*1799 + 4081085)>>11;
02231 dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11;
02232 }
02233 }
02234 static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
02235 {
02236 int i;
02237 for (i = 0; i < width; i++)
02238 dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
02239 }
02240 static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
02241 {
02242 int i;
02243 for (i = 0; i < width; i++)
02244 dst[i] = (dst[i]*14071 + 33561947)>>14;
02245 }
02246
/* Inner step of the fast-bilinear scaler: with the two neighbouring source
 * pixels in edi/esi and the 16-bit phase in ecx, computes
 * (pix0<<16) + (pix1-pix0)*phase, then >>9, leaving the interpolated value
 * in esi; also reloads the destination base into REG_D from operand %1. */
#define FAST_BILINEAR_X86 \
    "subl %%edi, %%esi \n\t" \
    "imull %%ecx, %%esi \n\t" \
    "shll $16, %%edi \n\t" \
    "addl %%edi, %%esi \n\t" \
    "mov %1, %%"REG_D"\n\t" \
    "shrl $9, %%esi \n\t" \
02254
/**
 * Fast bilinear horizontal scaling of one luma line into 15-bit samples.
 * On x86 with MMX2 and canMMX2BeUsed, jumps into runtime-generated filter
 * code (c->lumMmx2FilterCode) and then fixes up the right edge in C;
 * otherwise uses the generic FAST_BILINEAR_X86 asm loop, or plain C on
 * non-x86 builds.
 * NOTE(review): the generic asm writes %edi via movzbl but does not list
 * it in the clobber list -- TODO confirm this is safe with the surrounding
 * register allocation.
 */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    /* ebx is the PIC register and cannot be clobbered; spill it to memory */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %5 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t"
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

#if ARCH_X86_64

/* Call one chunk of the runtime-generated scaler, then advance the source
 * pointer by the per-chunk increment stored in the filterPos table. */
#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add %%"REG_S", %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi \n\t"\
            "call *%4 \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add %%"REG_a", %%"REG_D" \n\t"\
            "xor %%"REG_a", %%"REG_a" \n\t"\

#endif

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
               "m" (mmx2FilterCode)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
        );
        /* right-edge fixup: replicate the last source pixel (scaled by 128) */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;

    /* generic fast-bilinear asm loop, two output samples per iteration;
     * REG_d tracks the integer source position, cx the 16-bit phase */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a" \n\t"
        "xor %%"REG_d", %%"REG_d" \n\t"
        "xorl %%ecx, %%ecx \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movzbl (%0, %%"REG_d"), %%edi \n\t"
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t"
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t"
        "adc %3, %%"REG_d" \n\t"

        "movzbl (%0, %%"REG_d"), %%edi \n\t"
        "movzbl 1(%0, %%"REG_d"), %%esi \n\t"
        FAST_BILINEAR_X86
        "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t"
        "addw %4, %%cx \n\t"
        "adc %3, %%"REG_d" \n\t"


        "add $2, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    }
#endif
#else
    /* portable C fallback: 16.16 fixed-point position, 7-bit blend */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif
}
02377
02378
02379 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
02380 const int16_t *hLumFilter,
02381 const int16_t *hLumFilterPos, int hLumFilterSize,
02382 uint8_t *formatConvBuffer,
02383 uint32_t *pal, int isAlpha)
02384 {
02385 void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
02386 void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
02387
02388 src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
02389
02390 if (toYV12) {
02391 toYV12(formatConvBuffer, src, srcW, pal);
02392 src= formatConvBuffer;
02393 }
02394
02395 if (!c->hyscale_fast) {
02396 c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
02397 } else {
02398 c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
02399 }
02400
02401 if (convertRange)
02402 convertRange(dst, dstWidth);
02403 }
02404
/**
 * Fast bilinear horizontal scaling of one chroma line pair (U from src1,
 * V from src2); V output is written VOFW elements past the U output.
 * On x86 with MMX2 and canMMX2BeUsed, runs the runtime-generated filter
 * code (c->chrMmx2FilterCode) once per plane and fixes up the right edge
 * in C; otherwise uses the FAST_BILINEAR_X86 asm loop, or plain C on
 * non-x86 builds.
 */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    /* ebx is the PIC register and cannot be clobbered; spill it to memory */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov %%"REG_b", %6 \n\t"
#endif
            "pxor %%mm7, %%mm7 \n\t"
            "mov %0, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "mov %2, %%"REG_d" \n\t"
            "mov %3, %%"REG_b" \n\t"
            "xor %%"REG_a", %%"REG_a" \n\t"
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

            /* U plane: 4 chunks of generated filter code */
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* V plane: restart with src2, destination offset by VOF */
            "xor %%"REG_a", %%"REG_a" \n\t"
            "mov %5, %%"REG_c" \n\t"
            "mov %1, %%"REG_D" \n\t"
            "add $"AV_STRINGIFY(VOF)", %%"REG_D" \n\t"
            PREFETCH" (%%"REG_c") \n\t"
            PREFETCH" 32(%%"REG_c") \n\t"
            PREFETCH" 64(%%"REG_c") \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b" \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
               "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
              ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
        );
        /* right-edge fixup: replicate the last source pixels (scaled by 128) */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {

            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif
    x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;
    /* generic fast-bilinear asm loop, both planes per iteration */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a" \n\t"
        "xor %%"REG_d", %%"REG_d" \n\t"
        "xorl %%ecx, %%ecx \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "mov %0, %%"REG_S" \n\t"
        "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t"
        "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t"
        FAST_BILINEAR_X86
        "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t"

        "movzbl (%5, %%"REG_d"), %%edi \n\t"
        "movzbl 1(%5, %%"REG_d"), %%esi \n\t"
        FAST_BILINEAR_X86
        "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t"

        "addw %4, %%cx \n\t"
        "adc %3, %%"REG_d" \n\t"
        "add $1, %%"REG_a" \n\t"
        "cmp %2, %%"REG_a" \n\t"
        " jb 1b \n\t"



/* gcc 3.3 on some targets miscompiles the "g" constraint here -- keep the
 * version split; TODO confirm the exact failing compiler versions */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
        :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#else
        :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#endif
           "r" (src2)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    }
#endif
#else
    /* portable C fallback: 16.16 fixed-point position, 7-bit blend */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);




        xpos+=xInc;
    }
#endif
}
02526
02527 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
02528 int srcW, int xInc, const int16_t *hChrFilter,
02529 const int16_t *hChrFilterPos, int hChrFilterSize,
02530 uint8_t *formatConvBuffer,
02531 uint32_t *pal)
02532 {
02533
02534 src1 += c->chrSrcOffset;
02535 src2 += c->chrSrcOffset;
02536
02537 if (c->chrToYV12) {
02538 c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02539 src1= formatConvBuffer;
02540 src2= formatConvBuffer+VOFW;
02541 }
02542
02543 if (!c->hcscale_fast) {
02544 c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
02545 c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
02546 } else {
02547 c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
02548 }
02549
02550 if (c->chrConvertRange)
02551 c->chrConvertRange(dst, dstWidth);
02552 }
02553
#define DEBUG_SWSCALE_BUFFERS 0
/* Debug logging, compiled out unless DEBUG_SWSCALE_BUFFERS is set.
 * Wrapped in do { } while (0) so the macro expands to a single statement:
 * the previous bare-`if` form would mis-bind an `else` following an
 * unbraced `if (cond) DEBUG_BUFFERS(...);` at a call site.
 * Relies on a variable `c` (the SwsContext) being in scope at the caller. */
#define DEBUG_BUFFERS(...) do {                         \
    if (DEBUG_SWSCALE_BUFFERS)                          \
        av_log(c, AV_LOG_DEBUG, __VA_ARGS__);           \
} while (0)
02556
/**
 * Main scaling entry point for one source slice.
 * Horizontally scales + format-converts incoming lines into the vertical
 * ring buffers (lumPixBuf / chrPixBuf / alpPixBuf), then, once enough
 * lines are buffered, vertically scales and converts them to the output
 * format one destination line at a time.  Ring-buffer state (dstY,
 * *BufIndex, lastIn*Buf) persists in the context across slice calls.
 * @return the number of destination lines written (dstY - lastDstY)
 */
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* local copies of the context fields used throughout the loop */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    /* round the chroma slice height UP (ceiling division) */
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;


    /* ring-buffer state carried across slice calls */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)) {
        /* packed input: all planes alias plane 0 */
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                  srcSliceY, srcSliceH, dstY, dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                  vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; /* warn only once per process */
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   " ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }



    /* first slice of a new frame: reset the ring-buffer state */
    if (srcSliceY ==0) {
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        /* first/last source lines needed for this destination line */
        const int firstLumSrcY= vLumFilterPos[dstY];
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY];
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1;
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1;
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1;
        int enough_lines;

        /* skip buffered lines the filter no longer needs */
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                      firstLumSrcY, lastLumSrcY, lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                      firstChrSrcY, lastChrSrcY, lastInChrBuf);

        /* does this slice contain all source lines this dst line needs? */
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* not enough input: just buffer whatever the slice provides */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                          lastLumSrcY, lastChrSrcY);
        }

        /* horizontally scale the needed luma (and alpha) lines into the ring buffer */
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                          lumBufIndex, lastInLumBuf);
        }
        /* horizontally scale the needed chroma lines into the ring buffer */
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

            /* chroma scaling can be skipped entirely for some formats */
            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                          chrBufIndex, lastInChrBuf);
        }

        /* wrap the ring-buffer indices */
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; /* we can't output a dstY line so let's try with the next slice */

#if COMPILE_TEMPLATE_MMX
        /* per-line dither state consumed by the MMX output kernels */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            /* normal path: use the accelerated (possibly asm) output functions */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND) {
                /* pack pointer/coefficient pairs in the APCK layout expected
                 * by the accurate-rounding MMX vertical filter */
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                    lumMmxFilter[s*i+APCK_COEF/4  ]=
                    lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                        alpMmxFilter[s*i+APCK_COEF/4  ]=
                        alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                    chrMmxFilter[s*i+APCK_COEF/4  ]=
                    chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                /* legacy 4-dword-per-tap layout: split pointer, doubled coeff */
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; /* skip chroma on non-sampled lines */
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL;
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) {
                    /* unscaled vertical case: plain copy/pass-through */
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else {
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) {
                    /* 1-tap luma, 2-tap chroma fast path */
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) {
                    /* 2-tap bilinear fast path */
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else {
                    /* general N-tap vertical filter */
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else {
            /* last two lines: use the plain C output functions (the asm ones
             * may over-read past the end of the buffers) */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL;
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL;
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                        dstFormat);
                } else {
                    yuv2yuvXinC(
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* destination wants an alpha plane but the source has none: fill opaque */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* exit MMX state so following FPU code works */
    if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory");
    else                            __asm__ volatile("emms"  :::"memory");
#endif

    /* store the ring-buffer state back into the context for the next slice */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
02922
/*
 * RENAME(sws_init_swScale) — populate the function-pointer table of a
 * SwsContext for this compilation template.  The file is compiled once per
 * CPU variant (plain C, MMX, MMX2, 3DNow, ... selected by the
 * COMPILE_TEMPLATE_* macros); RENAME() suffixes each template-local symbol
 * so the variants can coexist, while the non-RENAME()d converters
 * (bgr16ToY, palToUV, ...) are shared plain-C implementations.
 *
 * @param c context whose scaler/converter function pointers are filled in
 *          based on c->srcFormat, c->flags and the chroma subsampling of
 *          the source.  No return value; only fields of *c are written.
 */
02923 static void RENAME(sws_init_swScale)(SwsContext *c)
02924 {
02925 enum PixelFormat srcFormat = c->srcFormat;
02926
/* Vertical scaling / output-packing stages: always the template-local
 * (CPU-specific) versions for this variant. */
02927 c->yuv2nv12X = RENAME(yuv2nv12X );
02928 c->yuv2yuv1 = RENAME(yuv2yuv1 );
02929 c->yuv2yuvX = RENAME(yuv2yuvX );
02930 c->yuv2packed1 = RENAME(yuv2packed1 );
02931 c->yuv2packed2 = RENAME(yuv2packed2 );
02932 c->yuv2packedX = RENAME(yuv2packedX );
02933
/* Generic horizontal scaler; may be overridden by the fast-bilinear
 * specializations right below. */
02934 c->hScale = RENAME(hScale );
02935
02936 #if COMPILE_TEMPLATE_MMX
02937
/* NOTE: the #if/#else pair below selects the *condition* of a single
 * brace-less if; the { } body after it is shared by both branches.
 * On the MMX template the fast-bilinear path additionally requires the
 * runtime-generated MMX2 code to be usable (canMMX2BeUsed). */
02938 if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
02939 #else
02940 if (c->flags & SWS_FAST_BILINEAR)
02941 #endif
02942 {
02943 c->hyscale_fast = RENAME(hyscale_fast);
02944 c->hcscale_fast = RENAME(hcscale_fast);
02945 }
02946
/* Unscaled input->planar chroma (UV) converter, chosen by source format.
 * Formats not listed leave chrToYV12 == NULL (input presumably already
 * planar YUV needing no conversion — confirm against the callers). */
02947 c->chrToYV12 = NULL;
02948 switch(srcFormat) {
02949 case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break;
02950 case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break;
02951 case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break;
02952 case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break;
02953 case PIX_FMT_RGB8 :
02954 case PIX_FMT_BGR8 :
02955 case PIX_FMT_PAL8 :
02956 case PIX_FMT_BGR4_BYTE:
/* Palette-based formats share one converter working off c's palette. */
02957 case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
02958 case PIX_FMT_YUV420P16BE:
02959 case PIX_FMT_YUV422P16BE:
02960 case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
02961 case PIX_FMT_YUV420P16LE:
02962 case PIX_FMT_YUV422P16LE:
02963 case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
02964 }
/* RGB->UV converters come in two flavors: the *_half variants are used
 * when the destination chroma is horizontally subsampled, the plain ones
 * when it is not.  Only BGR24/RGB24 have CPU-specific (RENAME) versions. */
02965 if (c->chrSrcHSubSample) {
02966 switch(srcFormat) {
02967 case PIX_FMT_RGB48BE:
02968 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
02969 case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV_half; break;
02970 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half; break;
02971 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
02972 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
02973 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
02974 case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV_half; break;
02975 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half; break;
02976 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
02977 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
02978 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
02979 }
02980 } else {
02981 switch(srcFormat) {
02982 case PIX_FMT_RGB48BE:
02983 case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
02984 case PIX_FMT_RGB32 : c->chrToYV12 = bgr32ToUV; break;
02985 case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV; break;
02986 case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break;
02987 case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
02988 case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
02989 case PIX_FMT_BGR32 : c->chrToYV12 = rgb32ToUV; break;
02990 case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV; break;
02991 case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break;
02992 case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
02993 case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
02994 }
02995 }
02996
/* Luma (Y) and alpha (A) extraction converters, again keyed on format.
 * yuy2ToY/uyvyToY double as 16-bit-BE/LE and gray16 luma readers; they
 * apparently just pick every other byte (MSB vs LSB) — note the same
 * trick reuses yuy2ToY for Y400A's alpha below. */
02997 c->lumToYV12 = NULL;
02998 c->alpToYV12 = NULL;
02999 switch (srcFormat) {
03000 case PIX_FMT_YUYV422 :
03001 case PIX_FMT_YUV420P16BE:
03002 case PIX_FMT_YUV422P16BE:
03003 case PIX_FMT_YUV444P16BE:
03004 case PIX_FMT_Y400A :
03005 case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
03006 case PIX_FMT_UYVY422 :
03007 case PIX_FMT_YUV420P16LE:
03008 case PIX_FMT_YUV422P16LE:
03009 case PIX_FMT_YUV444P16LE:
03010 case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
03011 case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break;
03012 case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break;
03013 case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break;
03014 case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break;
03015 case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break;
03016 case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break;
03017 case PIX_FMT_RGB8 :
03018 case PIX_FMT_BGR8 :
03019 case PIX_FMT_PAL8 :
03020 case PIX_FMT_BGR4_BYTE:
03021 case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
03022 case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
03023 case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
03024 case PIX_FMT_RGB32 : c->lumToYV12 = bgr32ToY; break;
03025 case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY; break;
03026 case PIX_FMT_BGR32 : c->lumToYV12 = rgb32ToY; break;
03027 case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY; break;
03028 case PIX_FMT_RGB48BE:
03029 case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
03030 }
/* Alpha extraction is wired up only when the context actually allocated
 * an alpha pixel buffer. */
03031 if (c->alpPixBuf) {
03032 switch (srcFormat) {
03033 case PIX_FMT_RGB32 :
03034 case PIX_FMT_RGB32_1:
03035 case PIX_FMT_BGR32 :
03036 case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
/* Y400A = gray+alpha interleaved: alpha sits at the odd bytes, so the
 * yuy2 luma reader extracts it. */
03037 case PIX_FMT_Y400A : c->alpToYV12 = RENAME(yuy2ToY); break;
03038 }
03039 }
03040
/* Byte offsets into the interleaved source where each component's
 * converter should start reading (e.g. RGB48LE: skip the low byte so the
 * 8-bit path reads the MSBs — presumably; confirm against converters). */
03041 switch (srcFormat) {
03042 case PIX_FMT_Y400A :
03043 c->alpSrcOffset = 1;
03044 break;
03045 case PIX_FMT_RGB32 :
03046 case PIX_FMT_BGR32 :
03047 c->alpSrcOffset = 3;
03048 break;
03049 case PIX_FMT_RGB48LE:
03050 c->lumSrcOffset = 1;
03051 c->chrSrcOffset = 1;
03052 c->alpSrcOffset = 1;
03053 break;
03054 }
03055
/* Full-range (JPEG) <-> limited-range (MPEG) conversion is done on the
 * YUV intermediate, so it is skipped when the destination is any RGB
 * format (the yuv2rgb tables handle range there). */
03056 if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
03057 if (c->srcRange) {
03058 c->lumConvertRange = RENAME(lumRangeFromJpeg);
03059 c->chrConvertRange = RENAME(chrRangeFromJpeg);
03060 } else {
03061 c->lumConvertRange = RENAME(lumRangeToJpeg);
03062 c->chrConvertRange = RENAME(chrRangeToJpeg);
03063 }
03064 }
03065
/* Horizontal chroma scaling is unnecessary when either side carries no
 * chroma at all (gray / 1-bit monochrome sources or gray destinations). */
03066 if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
03067 srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
03068 c->needs_hcscale = 1;
03069 }