• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libswscale/x86/swscale_template.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
00003  *
00004  * This file is part of FFmpeg.
00005  *
00006  * FFmpeg is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * FFmpeg is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with FFmpeg; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00019  */
00020 
/*
 * Store/prefetch mnemonics used by the inline assembly in this template.
 * The previous definitions are dropped first so that the macros can be
 * redefined according to the current value of COMPILE_TEMPLATE_MMX2.
 */
00021 #undef REAL_MOVNTQ
00022 #undef MOVNTQ
00023 #undef MOVNTQ2
00024 #undef PREFETCH
00025 
/* PREFETCH: "prefetchnta" for MMX2 builds, otherwise an assembler comment. */
00026 #if COMPILE_TEMPLATE_MMX2
00027 #define PREFETCH "prefetchnta"
00028 #else
00029 #define PREFETCH  " # nop"
00030 #endif
00031 
/*
 * REAL_MOVNTQ(a,b) stringifies its operands into a complete 64-bit store
 * instruction ("movntq" for MMX2 builds, plain "movq" otherwise).
 * MOVNTQ2 is only the mnemonic; the caller appends the operands itself.
 */
00032 #if COMPILE_TEMPLATE_MMX2
00033 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
00034 #define MOVNTQ2 "movntq "
00035 #else
00036 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
00037 #define MOVNTQ2 "movq "
00038 #endif
/* Extra indirection so that macro arguments are expanded before '#' stringifies them. */
00039 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
00040 
/*
 * dither_8to16 is not wrapped in RENAME() and is only compiled when
 * COMPILE_TEMPLATE_MMX2 is 0 -- presumably so that it is defined exactly once
 * when this template is included for several instruction sets (TODO confirm
 * against the including file).
 */
00041 #if !COMPILE_TEMPLATE_MMX2
/**
 * Load 8 dither bytes and widen them to 16-bit words in MMX registers.
 *
 * On return %mm3 holds bytes 0..3 and %mm4 holds bytes 4..7 of the (optionally
 * rotated) dither row, each zero-extended to a 16-bit word; %mm0 is zero.
 *
 * @param srcDither pointer to at least 8 readable dither bytes
 * @param rot       if non-zero, the 64-bit value is first rotated right by
 *                  24 bits (psrlq 24 | psllq 40), i.e. by three bytes
 *
 * NOTE(review): the result is handed to the caller only through MMX register
 * state; the asm statements declare neither outputs nor clobbers, so this
 * relies on the compiler not touching %mm0/%mm3/%mm4 before the caller's own
 * asm statement runs.
 */
00042 static av_always_inline void
00043 dither_8to16(const uint8_t *srcDither, int rot)
00044 {
00045     if (rot) {
00046         __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
00047                          "movq       (%0), %%mm3\n\t"
00048                          "movq      %%mm3, %%mm4\n\t"
00049                          "psrlq       $24, %%mm3\n\t"
00050                          "psllq       $40, %%mm4\n\t"
00051                          "por       %%mm4, %%mm3\n\t"
00052                          "movq      %%mm3, %%mm4\n\t"
00053                          "punpcklbw %%mm0, %%mm3\n\t"
00054                          "punpckhbw %%mm0, %%mm4\n\t"
00055                          :: "r"(srcDither)
00056                          );
00057     } else {
00058         __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
00059                          "movq       (%0), %%mm3\n\t"
00060                          "movq      %%mm3, %%mm4\n\t"
00061                          "punpcklbw %%mm0, %%mm3\n\t"
00062                          "punpckhbw %%mm0, %%mm4\n\t"
00063                          :: "r"(srcDither)
00064                          );
00065     }
00066 }
00067 #endif
00068 
/**
 * Vertically filter one output line into 8-bit samples, with dithering.
 *
 * The asm walks a table starting at @p filter made of 16-byte entries: a
 * source line pointer at offset 0 and the coefficient qword at offset 8.
 * The table ends at the first entry whose pointer is NULL. For every group
 * of 8 output samples the accumulators start from the dither words >> 4,
 * each tap adds pmulhw(coefficient, source), and the sum is shifted right
 * by 3, packed with unsigned saturation and stored.
 *
 * @param filter     start of the pointer/coefficient table described above
 *                   (declared as int16_t *, accessed as raw bytes)
 * @param filterSize not referenced in this body; the NULL pointer entry
 *                   terminates the tap loop instead
 * @param src        not referenced in this body; source pointers are read
 *                   from the filter table
 * @param dest       output line
 * @param dstW       number of output samples; the loop handles 8 per pass
 * @param dither     8 dither bytes, forwarded to dither_8to16()
 * @param offset     passed as the rotate flag to dither_8to16() and used as
 *                   the initial sample index; stores go to
 *                   (dest - offset) + index and the loop ends once
 *                   index >= dstW + offset
 *
 * NOTE(review): the asm statement stores through dest but lists no "memory"
 * clobber, and it consumes %mm3/%mm4 left behind by the separate asm
 * statement inside dither_8to16().
 */
00069 static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
00070                            const int16_t **src, uint8_t *dest, int dstW,
00071                            const uint8_t *dither, int offset)
00072 {
          /* Leaves the widened dither words in %mm3 (low 4) and %mm4 (high 4). */
00073     dither_8to16(dither, offset);
          /*
           * Label 1 is the tap loop. The outer per-8-samples loop re-initialises
           * the accumulators and table pointer itself and then jumps back to the
           * same label.
           */
00074     __asm__ volatile(\
00075         "psraw        $4, %%mm3\n\t"
00076         "psraw        $4, %%mm4\n\t"
00077         "movq    %%mm3, %%mm6\n\t"
00078         "movq    %%mm4, %%mm7\n\t"
00079         "movl %3, %%ecx\n\t"
00080         "mov                                 %0, %%"REG_d"  \n\t"\
00081         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00082         ".p2align                             4             \n\t" /* FIXME Unroll? */\
00083         "1:                                                 \n\t"\
00084         "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
00085         "movq                (%%"REG_S", %%"REG_c", 2), %%mm2      \n\t" /* srcData */\
00086         "movq               8(%%"REG_S", %%"REG_c", 2), %%mm5      \n\t" /* srcData */\
00087         "add                                $16, %%"REG_d"  \n\t"\
00088         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00089         "test                         %%"REG_S", %%"REG_S"  \n\t"\
00090         "pmulhw                           %%mm0, %%mm2      \n\t"\
00091         "pmulhw                           %%mm0, %%mm5      \n\t"\
00092         "paddw                            %%mm2, %%mm3      \n\t"\
00093         "paddw                            %%mm5, %%mm4      \n\t"\
00094         " jnz                                1b             \n\t"\
00095         "psraw                               $3, %%mm3      \n\t"\
00096         "psraw                               $3, %%mm4      \n\t"\
00097         "packuswb                         %%mm4, %%mm3      \n\t"
00098         MOVNTQ2 "                         %%mm3, (%1, %%"REG_c")\n\t"
00099         "add                          $8, %%"REG_c"         \n\t"\
00100         "cmp                          %2, %%"REG_c"         \n\t"\
00101         "movq    %%mm6, %%mm3\n\t"
00102         "movq    %%mm7, %%mm4\n\t"
00103         "mov                                 %0, %%"REG_d"  \n\t"\
00104         "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00105         "jb                                  1b             \n\t"\
00106         :: "g" (filter),
00107            "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
00108         : "%"REG_d, "%"REG_S, "%"REG_c
00109     );
00110 }
00111 
/*
 * YSCALEYUV2PACKEDX_UV: opens an "__asm__ volatile(" statement (it is closed
 * by YSCALEYUV2PACKEDX_END) and emits the chroma half of the vertical filter.
 *
 * REG_a is zeroed once and used as the running index; label 1 marks the start
 * of the per-group loop that the WRITE* macros jump back to. The chroma
 * filter table lives at CHR_MMX_FILTER_OFFSET(%0) and is walked in 16-byte
 * entries (source pointer at +0, coefficient qword at +8) until a NULL
 * pointer is read. The V samples are read from the U pointer plus operand %6.
 *
 * Result: %mm3 = U sum, %mm4 = V sum, both seeded with VROUNDER_OFFSET(%0).
 * Uses REG_a, REG_d, REG_S, %mm0, %mm2, %mm5.
 */
00112 #define YSCALEYUV2PACKEDX_UV \
00113     __asm__ volatile(\
00114         "xor                   %%"REG_a", %%"REG_a"     \n\t"\
00115         ".p2align                      4                \n\t"\
00116         "nop                                            \n\t"\
00117         "1:                                             \n\t"\
00118         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
00119         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00120         "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
00121         "movq                      %%mm3, %%mm4         \n\t"\
00122         ".p2align                      4                \n\t"\
00123         "2:                                             \n\t"\
00124         "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
00125         "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
00126         "add                          %6, %%"REG_S"     \n\t" \
00127         "movq     (%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
00128         "add                         $16, %%"REG_d"     \n\t"\
00129         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00130         "pmulhw                    %%mm0, %%mm2         \n\t"\
00131         "pmulhw                    %%mm0, %%mm5         \n\t"\
00132         "paddw                     %%mm2, %%mm3         \n\t"\
00133         "paddw                     %%mm5, %%mm4         \n\t"\
00134         "test                  %%"REG_S", %%"REG_S"     \n\t"\
00135         " jnz                         2b                \n\t"\
00136 
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2): vertical filter
 * for a full-resolution plane (used for luma and for alpha), to be placed
 * inside an asm statement opened by YSCALEYUV2PACKEDX_UV.
 *
 *   offset     string constant: byte offset of the filter table from %0
 *   coeff      MMX register that receives each coefficient qword
 *   src1/src2  MMX scratch registers for the two groups of 4 source samples,
 *              read at (REG_S, REG_a, 2) and 8(REG_S, REG_a, 2)
 *   dst1/dst2  MMX accumulators, seeded with VROUNDER_OFFSET(%0)
 *
 * The table uses the same 16-byte pointer/coefficient entries as the chroma
 * loop and ends at a NULL pointer. Redefines local label 2; clobbers REG_d
 * and REG_S.
 */
00137 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
00138     "lea                "offset"(%0), %%"REG_d"     \n\t"\
00139     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00140     "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
00141     "movq                    "#dst1", "#dst2"       \n\t"\
00142     ".p2align                      4                \n\t"\
00143     "2:                                             \n\t"\
00144     "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
00145     "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
00146     "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
00147     "add                         $16, %%"REG_d"            \n\t"\
00148     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00149     "pmulhw                 "#coeff", "#src1"       \n\t"\
00150     "pmulhw                 "#coeff", "#src2"       \n\t"\
00151     "paddw                   "#src1", "#dst1"       \n\t"\
00152     "paddw                   "#src2", "#dst2"       \n\t"\
00153     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00154     " jnz                         2b                \n\t"\
00155 
/*
 * YSCALEYUV2PACKEDX: chroma pass followed by the luma pass.
 * Leaves %mm3 = U, %mm4 = V, %mm1 = first 4 luma sums, %mm7 = next 4 luma sums.
 * The asm statement is still open afterwards.
 */
00156 #define YSCALEYUV2PACKEDX \
00157     YSCALEYUV2PACKEDX_UV \
00158     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
00159 
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists that close the asm
 * statement opened by YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV.
 *
 *   %0      &c->redDither  (base address for all *_OFFSET / *_TEMP accesses)
 *   %1..%3  dummy          (placeholders keeping the later operand numbers fixed)
 *   %4      dest
 *   %5      dstW_reg
 *   %6      uv_off
 *
 * The expanding function must have c, dummy, dest, dstW_reg and uv_off in
 * scope. Declared clobbers: REG_a, REG_d, REG_S.
 */
00160 #define YSCALEYUV2PACKEDX_END                     \
00161         :: "r" (&c->redDither),                   \
00162             "m" (dummy), "m" (dummy), "m" (dummy),\
00163             "r" (dest), "m" (dstW_reg), "m"(uv_off) \
00164         : "%"REG_a, "%"REG_d, "%"REG_S            \
00165     );
00166 
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: higher-precision chroma vertical filter.
 * Opens an "__asm__ volatile(" statement (closed by YSCALEYUV2PACKEDX_END).
 *
 * Two taps are processed per iteration: samples from the line at offset 0 of
 * the table entry and from the line at APCK_PTR2 are interleaved word-wise
 * and multiplied-and-added (pmaddwd) with the coefficient qword at APCK_COEF.
 * Entries are APCK_SIZE bytes apart; the loop ends when the pointer at the
 * start of the next entry is NULL. V samples are read from the U pointer plus
 * operand %6.
 *
 * The 32-bit sums (%mm4/%mm5 for U, %mm6/%mm7 for V) are shifted right by 16,
 * packed to signed words, offset by VROUNDER_OFFSET(%0) and spilled to
 * U_TEMP(%0) and V_TEMP(%0). Label 1 marks the start of the per-group loop
 * that the WRITE* macros jump back to.
 */
00167 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
00168     __asm__ volatile(\
00169         "xor %%"REG_a", %%"REG_a"                       \n\t"\
00170         ".p2align                      4                \n\t"\
00171         "nop                                            \n\t"\
00172         "1:                                             \n\t"\
00173         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
00174         "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00175         "pxor                      %%mm4, %%mm4         \n\t"\
00176         "pxor                      %%mm5, %%mm5         \n\t"\
00177         "pxor                      %%mm6, %%mm6         \n\t"\
00178         "pxor                      %%mm7, %%mm7         \n\t"\
00179         ".p2align                      4                \n\t"\
00180         "2:                                             \n\t"\
00181         "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
00182         "add                          %6, %%"REG_S"      \n\t" \
00183         "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
00184         "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
00185         "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
00186         "movq                      %%mm0, %%mm3         \n\t"\
00187         "punpcklwd                 %%mm1, %%mm0         \n\t"\
00188         "punpckhwd                 %%mm1, %%mm3         \n\t"\
00189         "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
00190         "pmaddwd                   %%mm1, %%mm0         \n\t"\
00191         "pmaddwd                   %%mm1, %%mm3         \n\t"\
00192         "paddd                     %%mm0, %%mm4         \n\t"\
00193         "paddd                     %%mm3, %%mm5         \n\t"\
00194         "add                          %6, %%"REG_S"      \n\t" \
00195         "movq     (%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
00196         "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
00197         "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
00198         "test                  %%"REG_S", %%"REG_S"     \n\t"\
00199         "movq                      %%mm2, %%mm0         \n\t"\
00200         "punpcklwd                 %%mm3, %%mm2         \n\t"\
00201         "punpckhwd                 %%mm3, %%mm0         \n\t"\
00202         "pmaddwd                   %%mm1, %%mm2         \n\t"\
00203         "pmaddwd                   %%mm1, %%mm0         \n\t"\
00204         "paddd                     %%mm2, %%mm6         \n\t"\
00205         "paddd                     %%mm0, %%mm7         \n\t"\
00206         " jnz                         2b                \n\t"\
00207         "psrad                       $16, %%mm4         \n\t"\
00208         "psrad                       $16, %%mm5         \n\t"\
00209         "psrad                       $16, %%mm6         \n\t"\
00210         "psrad                       $16, %%mm7         \n\t"\
00211         "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
00212         "packssdw                  %%mm5, %%mm4         \n\t"\
00213         "packssdw                  %%mm7, %%mm6         \n\t"\
00214         "paddw                     %%mm0, %%mm4         \n\t"\
00215         "paddw                     %%mm0, %%mm6         \n\t"\
00216         "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
00217         "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
00218 
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset): higher-precision vertical filter for
 * a full-resolution plane (luma or alpha); offset is the string constant
 * giving the filter table's byte offset from %0.
 *
 * Same two-taps-per-iteration pmaddwd scheme and table layout (APCK_PTR2,
 * APCK_COEF, APCK_SIZE) as the accurate chroma loop. On exit:
 *   %mm1 = first 4 results, %mm7 = next 4 results (16-bit, rounder added)
 *   %mm3 = qword reloaded from U_TEMP(%0)
 *   %mm4 = qword reloaded from V_TEMP(%0)
 * All other MMX registers, REG_d and REG_S are overwritten.
 */
00219 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
00220     "lea                "offset"(%0), %%"REG_d"     \n\t"\
00221     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00222     "pxor                      %%mm1, %%mm1         \n\t"\
00223     "pxor                      %%mm5, %%mm5         \n\t"\
00224     "pxor                      %%mm7, %%mm7         \n\t"\
00225     "pxor                      %%mm6, %%mm6         \n\t"\
00226     ".p2align                      4                \n\t"\
00227     "2:                                             \n\t"\
00228     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
00229     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
00230     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
00231     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
00232     "movq                      %%mm0, %%mm3         \n\t"\
00233     "punpcklwd                 %%mm4, %%mm0         \n\t"\
00234     "punpckhwd                 %%mm4, %%mm3         \n\t"\
00235     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
00236     "pmaddwd                   %%mm4, %%mm0         \n\t"\
00237     "pmaddwd                   %%mm4, %%mm3         \n\t"\
00238     "paddd                     %%mm0, %%mm1         \n\t"\
00239     "paddd                     %%mm3, %%mm5         \n\t"\
00240     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
00241     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
00242     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
00243     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00244     "movq                      %%mm2, %%mm0         \n\t"\
00245     "punpcklwd                 %%mm3, %%mm2         \n\t"\
00246     "punpckhwd                 %%mm3, %%mm0         \n\t"\
00247     "pmaddwd                   %%mm4, %%mm2         \n\t"\
00248     "pmaddwd                   %%mm4, %%mm0         \n\t"\
00249     "paddd                     %%mm2, %%mm7         \n\t"\
00250     "paddd                     %%mm0, %%mm6         \n\t"\
00251     " jnz                         2b                \n\t"\
00252     "psrad                       $16, %%mm1         \n\t"\
00253     "psrad                       $16, %%mm5         \n\t"\
00254     "psrad                       $16, %%mm7         \n\t"\
00255     "psrad                       $16, %%mm6         \n\t"\
00256     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
00257     "packssdw                  %%mm5, %%mm1         \n\t"\
00258     "packssdw                  %%mm6, %%mm7         \n\t"\
00259     "paddw                     %%mm0, %%mm1         \n\t"\
00260     "paddw                     %%mm0, %%mm7         \n\t"\
00261     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
00262     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
00263 
/*
 * YSCALEYUV2PACKEDX_ACCURATE: accurate chroma pass followed by the accurate
 * luma pass. Leaves %mm3 = U, %mm4 = V (reloaded from U_TEMP/V_TEMP),
 * %mm1 / %mm7 = the two groups of 4 luma results. The asm statement is still
 * open afterwards.
 */
00264 #define YSCALEYUV2PACKEDX_ACCURATE \
00265     YSCALEYUV2PACKEDX_ACCURATE_UV \
00266     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
00267 
/*
 * YSCALEYUV2RGBX: convert the filtered YUV words to packed 8-bit B, G, R.
 *
 * Input : %mm3 = U, %mm4 = V, %mm1 = Y for pixels 0..3, %mm7 = Y for pixels
 *         4..7 (as left by YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE).
 * Output: %mm2 = 8 B bytes, %mm4 = 8 G bytes, %mm5 = 8 R bytes (unsigned
 *         saturated). %mm0, %mm1, %mm3, %mm6 and %mm7 are overwritten or
 *         hold intermediate values afterwards.
 *
 * Offsets and coefficients are read from U_OFFSET, V_OFFSET, Y_OFFSET,
 * UG_COEFF, VG_COEFF, UB_COEFF, VR_COEFF and Y_COEFF relative to %0. Each
 * chroma-derived term is duplicated (punpck?wd with itself) so that one
 * chroma value is applied to two neighbouring luma samples.
 */
00268 #define YSCALEYUV2RGBX \
00269     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
00270     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
00271     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
00272     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
00273     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
00274     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
00275     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00276     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
00277     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
00278     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
00279     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
00280     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
00281     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
00282     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00283     "paddw           %%mm3, %%mm4       \n\t"\
00284     "movq            %%mm2, %%mm0       \n\t"\
00285     "movq            %%mm5, %%mm6       \n\t"\
00286     "movq            %%mm4, %%mm3       \n\t"\
00287     "punpcklwd       %%mm2, %%mm2       \n\t"\
00288     "punpcklwd       %%mm5, %%mm5       \n\t"\
00289     "punpcklwd       %%mm4, %%mm4       \n\t"\
00290     "paddw           %%mm1, %%mm2       \n\t"\
00291     "paddw           %%mm1, %%mm5       \n\t"\
00292     "paddw           %%mm1, %%mm4       \n\t"\
00293     "punpckhwd       %%mm0, %%mm0       \n\t"\
00294     "punpckhwd       %%mm6, %%mm6       \n\t"\
00295     "punpckhwd       %%mm3, %%mm3       \n\t"\
00296     "paddw           %%mm7, %%mm0       \n\t"\
00297     "paddw           %%mm7, %%mm6       \n\t"\
00298     "paddw           %%mm7, %%mm3       \n\t"\
00299     /* mm2=B1, mm0=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00300     "packuswb        %%mm0, %%mm2       \n\t"\
00301     "packuswb        %%mm6, %%mm5       \n\t"\
00302     "packuswb        %%mm3, %%mm4       \n\t"\
00303 
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t):
 * interleave four registers of 8 bytes each (b, g, r, a) into 8 four-byte
 * pixels with byte order B, G, R, A in memory, store the 32 bytes at
 * dst + index*4 with MOVNTQ, add 8 to index and jump back to label 1 while
 * index is (unsigned) below dstw.
 *
 *   b, g, r, a      MMX registers with the packed channel bytes; b and r are
 *                   overwritten
 *   q0, q2, q3, t   MMX scratch registers
 */
00304 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
00305     "movq       "#b", "#q2"     \n\t" /* B */\
00306     "movq       "#r", "#t"      \n\t" /* R */\
00307     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
00308     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
00309     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
00310     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
00311     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
00312     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
00313     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
00314     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
00315     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
00316     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
00317 \
00318     MOVNTQ(   q0,   (dst, index, 4))\
00319     MOVNTQ(    b,  8(dst, index, 4))\
00320     MOVNTQ(   q2, 16(dst, index, 4))\
00321     MOVNTQ(   q3, 24(dst, index, 4))\
00322 \
00323     "add      $8, "#index"      \n\t"\
00324     "cmp "#dstw", "#index"      \n\t"\
00325     " jb      1b                \n\t"
/* Extra indirection so that the arguments are macro-expanded before stringification. */
00326 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
00327 
/**
 * Vertical filter + YUV->RGB conversion to 32-bit pixels, using the accurate
 * (pmaddwd-based) filter macros.
 *
 * Apart from c, dest and dstW, none of the parameters is referenced directly
 * in this body; the asm reaches the filter tables through fixed offsets from
 * &c->redDither (see YSCALEYUV2PACKEDX_END for the operand layout).
 *
 * @param c    scaler context; c->alpPixBuf selects the alpha path and
 *             c->uv_offx2 is passed to the asm as operand %6
 * @param dest output line
 * @param dstW number of output pixels; 8 pixels are written per iteration
 */
00328 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
00329                                    const int16_t **lumSrc, int lumFilterSize,
00330                                    const int16_t *chrFilter, const int16_t **chrUSrc,
00331                                    const int16_t **chrVSrc,
00332                                    int chrFilterSize, const int16_t **alpSrc,
00333                                    uint8_t *dest, int dstW, int dstY)
00334 {
00335     x86_reg dummy=0;
00336     x86_reg dstW_reg = dstW;
00337     x86_reg uv_off = c->uv_offx2;
00338 
00339     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
00340         YSCALEYUV2PACKEDX_ACCURATE
00341         YSCALEYUV2RGBX
              /*
               * Spill B, G, R because the alpha filter pass below overwrites
               * every MMX register. That pass ends by reloading U_TEMP into
               * %mm3 and V_TEMP into %mm4, so B and G come back in %mm3/%mm4.
               */
00342         "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
00343         "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
00344         "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
00345         YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
00346         "movq               "Y_TEMP"(%0), %%mm5         \n\t"
              /* Scale the filtered alpha down and pack it to 8 bytes in %mm1. */
00347         "psraw                        $3, %%mm1         \n\t"
00348         "psraw                        $3, %%mm7         \n\t"
00349         "packuswb                  %%mm7, %%mm1         \n\t"
00350         WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
00351         YSCALEYUV2PACKEDX_END
00352     } else {
00353         YSCALEYUV2PACKEDX_ACCURATE
00354         YSCALEYUV2RGBX
              /* No alpha plane: use all-ones (0xFF) as the fourth channel. */
00355         "pcmpeqd %%mm7, %%mm7 \n\t"
00356         WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
00357         YSCALEYUV2PACKEDX_END
00358     }
00359 }
00360 
/**
 * Vertical filter + YUV->RGB conversion to 32-bit pixels, using the faster
 * pmulhw-based filter macros (YSCALEYUV2PACKEDX).
 *
 * Apart from c, dest and dstW, none of the parameters is referenced directly
 * in this body; the asm reaches the filter tables through fixed offsets from
 * &c->redDither (see YSCALEYUV2PACKEDX_END for the operand layout).
 *
 * @param c    scaler context; c->alpPixBuf selects the alpha path and
 *             c->uv_offx2 is passed to the asm as operand %6
 * @param dest output line
 * @param dstW number of output pixels; 8 pixels are written per iteration
 */
00361 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
00362                                 const int16_t **lumSrc, int lumFilterSize,
00363                                 const int16_t *chrFilter, const int16_t **chrUSrc,
00364                                 const int16_t **chrVSrc,
00365                                 int chrFilterSize, const int16_t **alpSrc,
00366                                 uint8_t *dest, int dstW, int dstY)
00367 {
00368     x86_reg dummy=0;
00369     x86_reg dstW_reg = dstW;
00370     x86_reg uv_off = c->uv_offx2;
00371 
00372     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
00373         YSCALEYUV2PACKEDX
00374         YSCALEYUV2RGBX
              /*
               * Filter the alpha plane with registers that do not hold B/G/R
               * (%mm2, %mm4, %mm5); the sums land in %mm1/%mm7.
               */
00375         YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
00376         "psraw                        $3, %%mm1         \n\t"
00377         "psraw                        $3, %%mm7         \n\t"
00378         "packuswb                  %%mm7, %%mm1         \n\t"
00379         WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
00380         YSCALEYUV2PACKEDX_END
00381     } else {
00382         YSCALEYUV2PACKEDX
00383         YSCALEYUV2RGBX
              /* No alpha plane: use all-ones (0xFF) as the fourth channel. */
00384         "pcmpeqd %%mm7, %%mm7 \n\t"
00385         WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
00386         YSCALEYUV2PACKEDX_END
00387     }
00388 }
00389 
/*
 * REAL_WRITERGB16(dst, dstw, index): pack 8 pixels to 16 bits each and store
 * the 16 bytes at dst + index*2, then add 8 to index and jump back to label 1
 * while index is (unsigned) below dstw.
 *
 * Input : %mm2 = B, %mm4 = G, %mm5 = R (8 bytes each), %mm7 = 0.
 * Each output word is ((R & 0xF8) << 8) | ((G & 0xFC) << 3) | (B >> 3),
 * i.e. 5 bits of R, 6 bits of G, 5 bits of B.
 * Overwrites %mm1..%mm5.
 */
00390 #define REAL_WRITERGB16(dst, dstw, index) \
00391     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
00392     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
00393     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
00394     "psrlq           $3, %%mm2  \n\t"\
00395 \
00396     "movq         %%mm2, %%mm1  \n\t"\
00397     "movq         %%mm4, %%mm3  \n\t"\
00398 \
00399     "punpcklbw    %%mm7, %%mm3  \n\t"\
00400     "punpcklbw    %%mm5, %%mm2  \n\t"\
00401     "punpckhbw    %%mm7, %%mm4  \n\t"\
00402     "punpckhbw    %%mm5, %%mm1  \n\t"\
00403 \
00404     "psllq           $3, %%mm3  \n\t"\
00405     "psllq           $3, %%mm4  \n\t"\
00406 \
00407     "por          %%mm3, %%mm2  \n\t"\
00408     "por          %%mm4, %%mm1  \n\t"\
00409 \
00410     MOVNTQ(%%mm2,  (dst, index, 2))\
00411     MOVNTQ(%%mm1, 8(dst, index, 2))\
00412 \
00413     "add             $8, "#index"   \n\t"\
00414     "cmp        "#dstw", "#index"   \n\t"\
00415     " jb             1b             \n\t"
/* Extra indirection so that the arguments are macro-expanded before stringification. */
00416 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
00417 
/**
 * Vertical filter + YUV->RGB conversion to 16-bit (5-6-5) pixels, using the
 * accurate (pmaddwd-based) filter macros.
 *
 * Apart from c, dest and dstW, none of the parameters is referenced directly
 * in this body; the asm reaches the filter tables through fixed offsets from
 * &c->redDither (see YSCALEYUV2PACKEDX_END for the operand layout).
 * When DITHER1XBPP is defined, per-channel dither bytes read from
 * BLUE_DITHER / GREEN_DITHER / RED_DITHER(%0) are added with unsigned
 * saturation before the low bits are masked away.
 *
 * @param c    scaler context; c->uv_offx2 is passed to the asm as operand %6
 * @param dest output line
 * @param dstW number of output pixels; 8 pixels are written per iteration
 */
00418 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
00419                                     const int16_t **lumSrc, int lumFilterSize,
00420                                     const int16_t *chrFilter, const int16_t **chrUSrc,
00421                                     const int16_t **chrVSrc,
00422                                     int chrFilterSize, const int16_t **alpSrc,
00423                                     uint8_t *dest, int dstW, int dstY)
00424 {
00425     x86_reg dummy=0;
00426     x86_reg dstW_reg = dstW;
00427     x86_reg uv_off = c->uv_offx2;
00428 
00429     YSCALEYUV2PACKEDX_ACCURATE
00430     YSCALEYUV2RGBX
00431     "pxor %%mm7, %%mm7 \n\t"
00432     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00433 #ifdef DITHER1XBPP
00434     "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
00435     "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
00436     "paddusb "RED_DITHER"(%0), %%mm5\n\t"
00437 #endif
00438     WRITERGB16(%4, %5, %%REGa)
00439     YSCALEYUV2PACKEDX_END
00440 }
00441 
/**
 * Vertical filter + YUV->RGB conversion to 16-bit (5-6-5) pixels, using the
 * faster pmulhw-based filter macros (YSCALEYUV2PACKEDX).
 *
 * Apart from c, dest and dstW, none of the parameters is referenced directly
 * in this body; the asm reaches the filter tables through fixed offsets from
 * &c->redDither (see YSCALEYUV2PACKEDX_END for the operand layout).
 * When DITHER1XBPP is defined, per-channel dither bytes read from
 * BLUE_DITHER / GREEN_DITHER / RED_DITHER(%0) are added with unsigned
 * saturation before the low bits are masked away.
 *
 * @param c    scaler context; c->uv_offx2 is passed to the asm as operand %6
 * @param dest output line
 * @param dstW number of output pixels; 8 pixels are written per iteration
 */
00442 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
00443                                  const int16_t **lumSrc, int lumFilterSize,
00444                                  const int16_t *chrFilter, const int16_t **chrUSrc,
00445                                  const int16_t **chrVSrc,
00446                                  int chrFilterSize, const int16_t **alpSrc,
00447                                  uint8_t *dest, int dstW, int dstY)
00448 {
00449     x86_reg dummy=0;
00450     x86_reg dstW_reg = dstW;
00451     x86_reg uv_off = c->uv_offx2;
00452 
00453     YSCALEYUV2PACKEDX
00454     YSCALEYUV2RGBX
00455     "pxor %%mm7, %%mm7 \n\t"
00456     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00457 #ifdef DITHER1XBPP
00458     "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
00459     "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
00460     "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
00461 #endif
00462     WRITERGB16(%4, %5, %%REGa)
00463     YSCALEYUV2PACKEDX_END
00464 }
00465 
/*
 * REAL_WRITERGB15(dst, dstw, index): pack 8 pixels to 16 bits each and store
 * the 16 bytes at dst + index*2, then add 8 to index and jump back to label 1
 * while index is (unsigned) below dstw.
 *
 * Input : %mm2 = B, %mm4 = G, %mm5 = R (8 bytes each), %mm7 = 0.
 * Each output word is ((R & 0xF8) << 7) | ((G & 0xF8) << 2) | (B >> 3),
 * i.e. 5 bits per channel with the top bit clear.
 * Overwrites %mm1..%mm5.
 */
00466 #define REAL_WRITERGB15(dst, dstw, index) \
00467     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
00468     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
00469     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
00470     "psrlq           $3, %%mm2  \n\t"\
00471     "psrlq           $1, %%mm5  \n\t"\
00472 \
00473     "movq         %%mm2, %%mm1  \n\t"\
00474     "movq         %%mm4, %%mm3  \n\t"\
00475 \
00476     "punpcklbw    %%mm7, %%mm3  \n\t"\
00477     "punpcklbw    %%mm5, %%mm2  \n\t"\
00478     "punpckhbw    %%mm7, %%mm4  \n\t"\
00479     "punpckhbw    %%mm5, %%mm1  \n\t"\
00480 \
00481     "psllq           $2, %%mm3  \n\t"\
00482     "psllq           $2, %%mm4  \n\t"\
00483 \
00484     "por          %%mm3, %%mm2  \n\t"\
00485     "por          %%mm4, %%mm1  \n\t"\
00486 \
00487     MOVNTQ(%%mm2,  (dst, index, 2))\
00488     MOVNTQ(%%mm1, 8(dst, index, 2))\
00489 \
00490     "add             $8, "#index"   \n\t"\
00491     "cmp        "#dstw", "#index"   \n\t"\
00492     " jb             1b             \n\t"
/* Extra indirection so that the arguments are macro-expanded before stringification. */
00493 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
00494 
/**
 * Vertical filter + YUV->RGB conversion to 16-bit (5-5-5) pixels, using the
 * accurate (pmaddwd-based) filter macros.
 *
 * Apart from c, dest and dstW, none of the parameters is referenced directly
 * in this body; the asm reaches the filter tables through fixed offsets from
 * &c->redDither (see YSCALEYUV2PACKEDX_END for the operand layout).
 * When DITHER1XBPP is defined, per-channel dither bytes read from
 * BLUE_DITHER / GREEN_DITHER / RED_DITHER(%0) are added with unsigned
 * saturation before the low bits are masked away.
 *
 * @param c    scaler context; c->uv_offx2 is passed to the asm as operand %6
 * @param dest output line
 * @param dstW number of output pixels; 8 pixels are written per iteration
 */
00495 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
00496                                     const int16_t **lumSrc, int lumFilterSize,
00497                                     const int16_t *chrFilter, const int16_t **chrUSrc,
00498                                     const int16_t **chrVSrc,
00499                                     int chrFilterSize, const int16_t **alpSrc,
00500                                     uint8_t *dest, int dstW, int dstY)
00501 {
00502     x86_reg dummy=0;
00503     x86_reg dstW_reg = dstW;
00504     x86_reg uv_off = c->uv_offx2;
00505 
00506     YSCALEYUV2PACKEDX_ACCURATE
00507     YSCALEYUV2RGBX
00508     "pxor %%mm7, %%mm7 \n\t"
00509     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00510 #ifdef DITHER1XBPP
00511     "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
00512     "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
00513     "paddusb "RED_DITHER"(%0), %%mm5\n\t"
00514 #endif
00515     WRITERGB15(%4, %5, %%REGa)
00516     YSCALEYUV2PACKEDX_END
00517 }
00518 
/**
 * Vertical filter + YUV->RGB conversion to 16-bit (5-5-5) pixels, using the
 * faster pmulhw-based filter macros (YSCALEYUV2PACKEDX).
 *
 * Apart from c, dest and dstW, none of the parameters is referenced directly
 * in this body; the asm reaches the filter tables through fixed offsets from
 * &c->redDither (see YSCALEYUV2PACKEDX_END for the operand layout).
 * When DITHER1XBPP is defined, per-channel dither bytes read from
 * BLUE_DITHER / GREEN_DITHER / RED_DITHER(%0) are added with unsigned
 * saturation before the low bits are masked away.
 *
 * @param c    scaler context; c->uv_offx2 is passed to the asm as operand %6
 * @param dest output line
 * @param dstW number of output pixels; 8 pixels are written per iteration
 */
00519 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
00520                                  const int16_t **lumSrc, int lumFilterSize,
00521                                  const int16_t *chrFilter, const int16_t **chrUSrc,
00522                                  const int16_t **chrVSrc,
00523                                  int chrFilterSize, const int16_t **alpSrc,
00524                                  uint8_t *dest, int dstW, int dstY)
00525 {
00526     x86_reg dummy=0;
00527     x86_reg dstW_reg = dstW;
00528     x86_reg uv_off = c->uv_offx2;
00529 
00530     YSCALEYUV2PACKEDX
00531     YSCALEYUV2RGBX
00532     "pxor %%mm7, %%mm7 \n\t"
00533     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00534 #ifdef DITHER1XBPP
00535     "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
00536     "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
00537     "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
00538 #endif
00539     WRITERGB15(%4, %5, %%REGa)
00540     YSCALEYUV2PACKEDX_END
00541 }
00542 
/*
 * WRITEBGR24MMX(dst, dstw, index): plain-MMX writer for 24-bit pixels.
 *
 * Input : %mm2 = B, %mm4 = G, %mm5 = R (8 bytes each), %mm7 = 0.
 * The channels are first expanded to four qwords of "0RGB0RGB", the padding
 * bytes are then squeezed out with shifts and ORs, and the resulting 24 bytes
 * (8 pixels, byte order B, G, R in memory) are stored at dst, dst+8, dst+16.
 *
 * Unlike the 16/32-bit writers the store address is not indexed: dst itself
 * is advanced by 24 per iteration, so dst must name a register the asm is
 * allowed to modify. index is advanced by 8 and the macro jumps back to
 * label 1 while index is (unsigned) below dstw. All of %mm0..%mm7 are
 * overwritten.
 */
00543 #define WRITEBGR24MMX(dst, dstw, index) \
00544     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00545     "movq      %%mm2, %%mm1     \n\t" /* B */\
00546     "movq      %%mm5, %%mm6     \n\t" /* R */\
00547     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
00548     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
00549     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
00550     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
00551     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
00552     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
00553     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
00554     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
00555     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
00556     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
00557 \
00558     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
00559     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
00560     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
00561     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
00562 \
00563     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
00564     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
00565     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
00566     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
00567 \
00568     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
00569     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
00570     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
00571     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
00572 \
00573     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
00574     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
00575     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
00576     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
00577     MOVNTQ(%%mm0, (dst))\
00578 \
00579     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
00580     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
00581     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
00582     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
00583     MOVNTQ(%%mm6, 8(dst))\
00584 \
00585     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
00586     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
00587     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
00588     MOVNTQ(%%mm5, 16(dst))\
00589 \
00590     "add         $24, "#dst"    \n\t"\
00591 \
00592     "add          $8, "#index"  \n\t"\
00593     "cmp     "#dstw", "#index"  \n\t"\
00594     " jb          1b            \n\t"
00595 
/*
 * WRITEBGR24MMX2(dst, dstw, index): MMX2 writer for 24-bit pixels.
 *
 * Input : %mm2 = B, %mm4 = G, %mm5 = R (8 bytes each).
 * Each of the three output qwords is built by shuffling the needed words of
 * every channel into place with pshufw, masking the wanted bytes with
 * ff_M24A / ff_M24B / ff_M24C and ORing the three channels together.
 * %mm4 is shifted right by one byte part-way through so that the G bytes
 * line up for the second and third qword.
 *
 * As in WRITEBGR24MMX, dst itself is advanced by 24 per iteration and must
 * name a register the asm is allowed to modify; index is advanced by 8 and
 * the macro jumps back to label 1 while index is (unsigned) below dstw.
 * Overwrites %mm0, %mm1, %mm3, %mm4, %mm6 and %mm7.
 */
00596 #define WRITEBGR24MMX2(dst, dstw, index) \
00597     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00598     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
00599     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
00600     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
00601     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
00602     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
00603 \
00604     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
00605     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
00606     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
00607 \
00608     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
00609     "por    %%mm1, %%mm6        \n\t"\
00610     "por    %%mm3, %%mm6        \n\t"\
00611     MOVNTQ(%%mm6, (dst))\
00612 \
00613     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
00614     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
00615     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
00616     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
00617 \
00618     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
00619     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
00620     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
00621 \
00622     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
00623     "por    %%mm3, %%mm6        \n\t"\
00624     MOVNTQ(%%mm6, 8(dst))\
00625 \
00626     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
00627     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
00628     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
00629 \
00630     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
00631     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
00632     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
00633 \
00634     "por    %%mm1, %%mm3        \n\t"\
00635     "por    %%mm3, %%mm6        \n\t"\
00636     MOVNTQ(%%mm6, 16(dst))\
00637 \
00638     "add      $24, "#dst"       \n\t"\
00639 \
00640     "add       $8, "#index"     \n\t"\
00641     "cmp  "#dstw", "#index"     \n\t"\
00642     " jb       1b               \n\t"
00643 
/*
 * WRITEBGR24 dispatches to the pshufw-based writer when this template is
 * compiled with COMPILE_TEMPLATE_MMX2, and to the plain MMX writer otherwise.
 * The #undef in each branch drops any definition left by an earlier
 * inclusion of this template.
 */
00644 #if COMPILE_TEMPLATE_MMX2
00645 #undef WRITEBGR24
00646 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
00647 #else
00648 #undef WRITEBGR24
00649 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
00650 #endif
00651 
/**
 * Multi-tap vertical filter + YUV->RGB conversion with packed 24-bit output,
 * built on the YSCALEYUV2PACKEDX_ACCURATE front end.
 *
 * The body is a single inline-asm statement assembled from macros.
 * YSCALEYUV2PACKEDX_ACCURATE and YSCALEYUV2RGBX are defined outside this
 * part of the file; presumably the former opens the __asm__ statement and
 * the pixel loop (label 1) that WRITEBGR24 jumps back to.
 *
 * asm operands: %0 = &c->redDither, %1..%3 = dummy, %4 = dest,
 *               %5 = dstW_reg (memory), %6 = uv_off (memory).
 * Clobbered general registers: REG_a (pixel index), REG_c, REG_d, REG_S.
 *
 * The filter and source-line parameters are not referenced by name here;
 * presumably the asm reaches that data through the context pointer in %0.
 * NOTE(review): the asm stores to dest but lists no "memory" clobber.
 */
00652 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
00653                                    const int16_t **lumSrc, int lumFilterSize,
00654                                    const int16_t *chrFilter, const int16_t **chrUSrc,
00655                                    const int16_t **chrVSrc,
00656                                    int chrFilterSize, const int16_t **alpSrc,
00657                                    uint8_t *dest, int dstW, int dstY)
00658 {
00659     x86_reg dummy=0;
00660     x86_reg dstW_reg = dstW;
00661     x86_reg uv_off = c->uv_offx2;
00662 
00663     YSCALEYUV2PACKEDX_ACCURATE
00664     YSCALEYUV2RGBX
00665     "pxor %%mm7, %%mm7 \n\t"
    /* REG_c = dest + 3*REG_a: byte address of pixel REG_a at 3 bytes/pixel */
00666     "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
00667     "add %4, %%"REG_c"                        \n\t"
00668     WRITEBGR24(%%REGc, %5, %%REGa)
00669     :: "r" (&c->redDither),
00670        "m" (dummy), "m" (dummy), "m" (dummy),
00671        "r" (dest), "m" (dstW_reg), "m"(uv_off)
00672     : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
00673     );
00674 }
00675 
/**
 * Multi-tap vertical filter + YUV->RGB conversion with packed 24-bit output,
 * built on the YSCALEYUV2PACKEDX front end (the variant without the
 * _ACCURATE suffix; how the two differ is defined outside this part of the
 * file).
 *
 * The body is a single inline-asm statement assembled from macros;
 * presumably YSCALEYUV2PACKEDX opens the __asm__ statement and the pixel
 * loop (label 1) that WRITEBGR24 jumps back to.
 *
 * asm operands: %0 = &c->redDither, %1..%3 = dummy, %4 = dest,
 *               %5 = dstW_reg (memory), %6 = uv_off (memory).
 * Clobbered general registers: REG_a (pixel index), REG_c, REG_d, REG_S.
 *
 * The filter and source-line parameters are not referenced by name here;
 * presumably the asm reaches that data through the context pointer in %0.
 * NOTE(review): the asm stores to dest but lists no "memory" clobber.
 */
00676 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
00677                                 const int16_t **lumSrc, int lumFilterSize,
00678                                 const int16_t *chrFilter, const int16_t **chrUSrc,
00679                                 const int16_t **chrVSrc,
00680                                 int chrFilterSize, const int16_t **alpSrc,
00681                                 uint8_t *dest, int dstW, int dstY)
00682 {
00683     x86_reg dummy=0;
00684     x86_reg dstW_reg = dstW;
00685     x86_reg uv_off = c->uv_offx2;
00686 
00687     YSCALEYUV2PACKEDX
00688     YSCALEYUV2RGBX
00689     "pxor                    %%mm7, %%mm7       \n\t"
    /* REG_c = dest + 3*REG_a: byte address of pixel REG_a at 3 bytes/pixel */
00690     "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
00691     "add                        %4, %%"REG_c"   \n\t"
00692     WRITEBGR24(%%REGc, %5, %%REGa)
00693     :: "r" (&c->redDither),
00694        "m" (dummy), "m" (dummy), "m" (dummy),
00695        "r" (dest),  "m" (dstW_reg), "m"(uv_off)
00696     : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
00697     );
00698 }
00699 
/*
 * REAL_WRITEYUY2(dst, dstw, index) - asm text that stores 8 pixels (16
 * bytes) of packed 4:2:2 output and closes the pixel loop.
 *
 * Input (16-bit words): mm1 and mm7 are packed together into 8 bytes,
 * mm3 and mm4 are packed separately and interleaved byte-wise (mm3 first).
 * The two results are then interleaved so memory holds
 *     mm1/7[0] mm3[0] mm1/7[1] mm4[0] mm1/7[2] mm3[1] ...
 * i.e. mm1/mm7 supply every even byte and mm3/mm4 alternate on odd bytes.
 * Stores go to dst + 2*index and dst + 2*index + 8 via MOVNTQ.
 * Afterwards index += 8 and control jumps to local label 1 while
 * index < dstw (unsigned compare, jb).
 *
 * WRITEYUY2 is the public spelling: the extra macro level lets arguments
 * be macro-expanded before REAL_WRITEYUY2 stringifies them with '#'.
 */
00700 #define REAL_WRITEYUY2(dst, dstw, index) \
00701     "packuswb  %%mm3, %%mm3     \n\t"\
00702     "packuswb  %%mm4, %%mm4     \n\t"\
00703     "packuswb  %%mm7, %%mm1     \n\t"\
00704     "punpcklbw %%mm4, %%mm3     \n\t"\
00705     "movq      %%mm1, %%mm7     \n\t"\
00706     "punpcklbw %%mm3, %%mm1     \n\t"\
00707     "punpckhbw %%mm3, %%mm7     \n\t"\
00708 \
00709     MOVNTQ(%%mm1, (dst, index, 2))\
00710     MOVNTQ(%%mm7, 8(dst, index, 2))\
00711 \
00712     "add          $8, "#index"  \n\t"\
00713     "cmp     "#dstw", "#index"  \n\t"\
00714     " jb          1b            \n\t"
00715 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
00716 
/**
 * Multi-tap vertical filter with packed YUYV 4:2:2 output, built on the
 * YSCALEYUV2PACKEDX_ACCURATE front end.
 *
 * The body is a single inline-asm statement assembled from macros.
 * YSCALEYUV2PACKEDX_ACCURATE and YSCALEYUV2PACKEDX_END are defined outside
 * this part of the file; presumably the former opens the __asm__ statement
 * and the loop, and the latter supplies the operand/clobber lists that
 * consume dummy, dstW_reg and uv_off - confirm against their definitions.
 * WRITEYUY2 is given %4 as destination and %5 as loop bound, with REG_a as
 * the pixel index.
 */
00717 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
00718                                      const int16_t **lumSrc, int lumFilterSize,
00719                                      const int16_t *chrFilter, const int16_t **chrUSrc,
00720                                      const int16_t **chrVSrc,
00721                                      int chrFilterSize, const int16_t **alpSrc,
00722                                      uint8_t *dest, int dstW, int dstY)
00723 {
00724     x86_reg dummy=0;
00725     x86_reg dstW_reg = dstW;
00726     x86_reg uv_off = c->uv_offx2;
00727 
00728     YSCALEYUV2PACKEDX_ACCURATE
00729     /* mm1,mm7 = luma, mm3 = U, mm4 = V, as WRITEYUY2 consumes them for YUYV order */
00730     "psraw $3, %%mm3    \n\t"
00731     "psraw $3, %%mm4    \n\t"
00732     "psraw $3, %%mm1    \n\t"
00733     "psraw $3, %%mm7    \n\t"
00734     WRITEYUY2(%4, %5, %%REGa)
00735     YSCALEYUV2PACKEDX_END
00736 }
00737 
/**
 * Multi-tap vertical filter with packed YUYV 4:2:2 output, built on the
 * YSCALEYUV2PACKEDX front end (the variant without the _ACCURATE suffix).
 *
 * The body is a single inline-asm statement assembled from macros.
 * YSCALEYUV2PACKEDX and YSCALEYUV2PACKEDX_END are defined outside this part
 * of the file; presumably the former opens the __asm__ statement and the
 * loop, and the latter supplies the operand/clobber lists that consume
 * dummy, dstW_reg and uv_off - confirm against their definitions.
 * WRITEYUY2 is given %4 as destination and %5 as loop bound, with REG_a as
 * the pixel index.
 */
00738 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
00739                                   const int16_t **lumSrc, int lumFilterSize,
00740                                   const int16_t *chrFilter, const int16_t **chrUSrc,
00741                                   const int16_t **chrVSrc,
00742                                   int chrFilterSize, const int16_t **alpSrc,
00743                                   uint8_t *dest, int dstW, int dstY)
00744 {
00745     x86_reg dummy=0;
00746     x86_reg dstW_reg = dstW;
00747     x86_reg uv_off = c->uv_offx2;
00748 
00749     YSCALEYUV2PACKEDX
00750     /* mm1,mm7 = luma, mm3 = U, mm4 = V, as WRITEYUY2 consumes them for YUYV order */
00751     "psraw $3, %%mm3    \n\t"
00752     "psraw $3, %%mm4    \n\t"
00753     "psraw $3, %%mm1    \n\t"
00754     "psraw $3, %%mm7    \n\t"
00755     WRITEYUY2(%4, %5, %%REGa)
00756     YSCALEYUV2PACKEDX_END
00757 }
00758 
/*
 * REAL_YSCALEYUV2RGB_UV(index, c) - start of the two-line RGB loop: blend
 * chroma from two source lines and begin the colour conversion.
 *
 * Zeroes index and opens the 16-byte aligned loop at local label 1 (the
 * WRITE* macros jump back to it). Per iteration it loads 4 words each from
 * (%2,index) and (%3,index), and 4 more each at index + UV_OFF_BYTE(c).
 * The "+2048" in the lane comments is historical; the offset actually used
 * is the value read from UV_OFF_BYTE(c).
 *
 * Blend, per 16-bit lane, with w = the words at CHR_MMX_FILTER_OFFSET+8(c):
 *     mm3 = (((%2) - (%3)) * w >> 16) + ((%3) >> 4)     first chroma plane
 *     mm4 = the same for the plane at +UV_OFF_BYTE(c)
 * U_OFFSET(c)/V_OFFSET(c) are then subtracted, the results copied to
 * mm2/mm5, and mm3/mm4 multiplied (pmulhw) by UG_COEFF(c)/VG_COEFF(c).
 *
 * Clobbers mm0. %2 and %3 are the enclosing asm statement's operands.
 */
00759 #define REAL_YSCALEYUV2RGB_UV(index, c) \
00760     "xor            "#index", "#index"  \n\t"\
00761     ".p2align              4            \n\t"\
00762     "1:                                 \n\t"\
00763     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00764     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00765     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00766     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00767     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00768     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00769     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
00770     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
00771     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
00772     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
00773     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
00774     "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
00775     "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
00776     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
00777     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
00778     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00779     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00780     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00781     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00782     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00783     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00784     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00785 
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) - blend 8 samples from two lines.
 *
 * Loads 8 words each from b1 and b2 at byte offset 2*index and computes,
 * per 16-bit lane, with w = the words at LUM_MMX_FILTER_OFFSET+8(c):
 *     mm1 = ((b1 - b2) * w >> 16) + (b2 >> 4)     samples 0..3
 *     mm7 = the same                              samples 4..7
 * Clobbers mm0 and mm6. Despite the "buf" lane comments the base registers
 * are parameters, so the same code serves any pair of 16-bit sample lines.
 */
00786 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
00787     "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
00788     "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
00789     "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
00790     "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
00791     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
00792     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
00793     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00794     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00795     "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
00796     "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
00797     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00798     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00799 
/*
 * REAL_YSCALEYUV2RGB_COEFF(c) - finish the YUV->RGB conversion for 8 pixels.
 *
 * Input:  mm1/mm7 = luma for samples 0..3 / 4..7; mm2, mm5 = offset-removed
 *         U and V; mm3 = ug, mm4 = vg (4 chroma samples per register).
 * Steps:  scale mm2/mm5 by UB_COEFF(c)/VR_COEFF(c); subtract Y_OFFSET(c)
 *         from luma and scale by Y_COEFF(c); sum ug+vg; duplicate every
 *         chroma word (punpck[lh]wd) so one chroma sample serves two
 *         adjacent pixels; add luma; pack to bytes with unsigned saturation.
 * Output: mm2, mm4, mm5 = 8 packed bytes each, built from the UB, UG+VG and
 *         VR terms respectively (blue, green, red).
 * Clobbers mm0, mm3, mm6; mm1 and mm7 are left holding the scaled luma.
 */
00800 #define REAL_YSCALEYUV2RGB_COEFF(c) \
00801     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00802     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00803     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00804     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00805     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00806     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00807     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00808     "paddw             %%mm3, %%mm4     \n\t"\
00809     "movq              %%mm2, %%mm0     \n\t"\
00810     "movq              %%mm5, %%mm6     \n\t"\
00811     "movq              %%mm4, %%mm3     \n\t"\
00812     "punpcklwd         %%mm2, %%mm2     \n\t"\
00813     "punpcklwd         %%mm5, %%mm5     \n\t"\
00814     "punpcklwd         %%mm4, %%mm4     \n\t"\
00815     "paddw             %%mm1, %%mm2     \n\t"\
00816     "paddw             %%mm1, %%mm5     \n\t"\
00817     "paddw             %%mm1, %%mm4     \n\t"\
00818     "punpckhwd         %%mm0, %%mm0     \n\t"\
00819     "punpckhwd         %%mm6, %%mm6     \n\t"\
00820     "punpckhwd         %%mm3, %%mm3     \n\t"\
00821     "paddw             %%mm7, %%mm0     \n\t"\
00822     "paddw             %%mm7, %%mm6     \n\t"\
00823     "paddw             %%mm7, %%mm3     \n\t"\
00824     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00825     "packuswb          %%mm0, %%mm2     \n\t"\
00826     "packuswb          %%mm6, %%mm5     \n\t"\
00827     "packuswb          %%mm3, %%mm4     \n\t"\
00828 
/*
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA; the extra macro level
 * lets arguments be macro-expanded before the REAL_ macro stringifies them.
 *
 * YSCALEYUV2RGB(index, c) is the complete two-line blend + colour conversion
 * for one loop iteration: chroma blend, luma blend from asm operands %0/%1,
 * then the coefficient stage.
 */
00829 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
00830 
00831 #define YSCALEYUV2RGB(index, c) \
00832     REAL_YSCALEYUV2RGB_UV(index, c) \
00833     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
00834     REAL_YSCALEYUV2RGB_COEFF(c)
00835 
/**
 * Blend two source lines, convert to RGB and write packed 32-bit pixels.
 *
 * Only c, buf[0..1], ubuf[0..1], abuf[0..1] (alpha path) and dest are used
 * by name; vbuf, dstW, yalpha, uvalpha and y are not referenced in this
 * body. The asm reads the second chroma plane at ubufN + UV_OFF_BYTE, the
 * blend weights from the context, and the loop bound from offset 8280 of
 * %5; presumably the caller has stored matching values there - confirm.
 *
 * asm operands: %0 = buf0 (REG_c), %1 = buf1 (REG_d), %2 = ubuf0 (REG_S),
 *               %3 = ubuf1 (REG_D), %4 = dest, %5 = &c->redDither (REG_a).
 *
 * Variants:
 *  - alpha, x86-64: r8 is the pixel index; abuf0/abuf1 arrive as %6/%7 and
 *    are blended with the same macro that blends luma.
 *  - alpha, otherwise: abuf0/abuf1 are parked in c->u_temp/c->v_temp, and
 *    %0/%1 are pushed and temporarily reloaded with them for the alpha
 *    blend.
 *  - no alpha: mm7 is set to all ones and passed in the WRITEBGR32
 *    argument position where the alpha variants pass the packed alpha.
 *
 * In the variants that use REG_b (destination) and REG_BP (pixel index),
 * neither register is in the operand or clobber lists, so the asm saves
 * REG_b to ESP_OFFSET(%5), pushes REG_BP, and restores both at the end.
 * NOTE(review): the asm stores to dest but lists no "memory" clobber.
 */
00839 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
00840                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
00841                                 const int16_t *abuf[2], uint8_t *dest,
00842                                 int dstW, int yalpha, int uvalpha, int y)
00843 {
00844     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00845                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00846 
00847     if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
00848         const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
00849 #if ARCH_X86_64
00850         __asm__ volatile(
00851             YSCALEYUV2RGB(%%r8, %5)
00852             YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
00853             "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
00854             "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
00855             "packuswb            %%mm7, %%mm1       \n\t"
00856             WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
00857             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
00858                "a" (&c->redDither),
00859                "r" (abuf0), "r" (abuf1)
00860             : "%r8"
00861         );
00862 #else
        /* no spare registers here: hand the alpha line pointers over via the context */
00863         c->u_temp=(intptr_t)abuf0;
00864         c->v_temp=(intptr_t)abuf1;
00865         __asm__ volatile(
00866             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00867             "mov        %4, %%"REG_b"               \n\t"
00868             "push %%"REG_BP"                        \n\t"
00869             YSCALEYUV2RGB(%%REGBP, %5)
00870             "push                   %0              \n\t"
00871             "push                   %1              \n\t"
00872             "mov          "U_TEMP"(%5), %0          \n\t"
00873             "mov          "V_TEMP"(%5), %1          \n\t"
00874             YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
00875             "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
00876             "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
00877             "packuswb            %%mm7, %%mm1       \n\t"
00878             "pop                    %1              \n\t"
00879             "pop                    %0              \n\t"
00880             WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
00881             "pop %%"REG_BP"                         \n\t"
00882             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00883             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00884                "a" (&c->redDither)
00885         );
00886 #endif
00887     } else {
00888         __asm__ volatile(
00889             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00890             "mov        %4, %%"REG_b"               \n\t"
00891             "push %%"REG_BP"                        \n\t"
00892             YSCALEYUV2RGB(%%REGBP, %5)
00893             "pcmpeqd %%mm7, %%mm7                   \n\t"
00894             WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
00895             "pop %%"REG_BP"                         \n\t"
00896             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00897             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00898                "a" (&c->redDither)
00899         );
00900     }
00901 }
00902 
/**
 * Blend two source lines, convert to RGB and write packed 24-bit pixels.
 *
 * Only c, buf[0..1], ubuf[0..1] and dest are used by name; vbuf, abuf, dstW,
 * yalpha, uvalpha and y are not referenced in this body. The asm takes the
 * blend weights and the loop bound (offset 8280 of %5) from the context.
 *
 * asm operands: %0 = buf0 (REG_c), %1 = buf1 (REG_d), %2 = ubuf0 (REG_S),
 *               %3 = ubuf1 (REG_D), %4 = dest (memory), %5 = &c->redDither
 *               (REG_a).
 * REG_b (destination pointer) and REG_BP (pixel index) are not in the
 * operand or clobber lists, so the asm saves REG_b to ESP_OFFSET(%5),
 * pushes REG_BP, and restores both at the end.
 */
00903 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
00904                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
00905                                 const int16_t *abuf[2], uint8_t *dest,
00906                                 int dstW, int yalpha, int uvalpha, int y)
00907 {
00908     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00909                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00910 
00911     //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
00912     __asm__ volatile(
00913         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00914         "mov        %4, %%"REG_b"               \n\t"
00915         "push %%"REG_BP"                        \n\t"
00916         YSCALEYUV2RGB(%%REGBP, %5)
00917         "pxor    %%mm7, %%mm7                   \n\t"
00918         WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
00919         "pop %%"REG_BP"                         \n\t"
00920         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00921         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00922            "a" (&c->redDither)
00923     );
00924 }
00925 
/**
 * Blend two source lines, convert to RGB and write pixels via WRITERGB15.
 *
 * Only c, buf[0..1], ubuf[0..1] and dest are used by name; vbuf, abuf, dstW,
 * yalpha, uvalpha and y are not referenced in this body. The asm takes the
 * blend weights and the loop bound (offset 8280 of %5) from the context.
 * When DITHER1XBPP is defined, the per-channel dither bytes stored in the
 * context are added with unsigned saturation before the pixels are written.
 *
 * asm operands: %0 = buf0 (REG_c), %1 = buf1 (REG_d), %2 = ubuf0 (REG_S),
 *               %3 = ubuf1 (REG_D), %4 = dest (memory), %5 = &c->redDither
 *               (REG_a).
 * REG_b (destination pointer) and REG_BP (pixel index) are not in the
 * operand or clobber lists, so the asm saves REG_b to ESP_OFFSET(%5),
 * pushes REG_BP, and restores both at the end.
 */
00926 static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
00927                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
00928                                  const int16_t *abuf[2], uint8_t *dest,
00929                                  int dstW, int yalpha, int uvalpha, int y)
00930 {
00931     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00932                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00933 
00934     //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
00935     __asm__ volatile(
00936         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00937         "mov        %4, %%"REG_b"               \n\t"
00938         "push %%"REG_BP"                        \n\t"
00939         YSCALEYUV2RGB(%%REGBP, %5)
00940         "pxor    %%mm7, %%mm7                   \n\t"
00941         /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00942 #ifdef DITHER1XBPP
00943         "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
00944         "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
00945         "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
00946 #endif
00947         WRITERGB15(%%REGb, 8280(%5), %%REGBP)
00948         "pop %%"REG_BP"                         \n\t"
00949         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00950         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00951            "a" (&c->redDither)
00952     );
00953 }
00954 
/**
 * Blend two source lines, convert to RGB and write pixels via WRITERGB16.
 *
 * Only c, buf[0..1], ubuf[0..1] and dest are used by name; vbuf, abuf, dstW,
 * yalpha, uvalpha and y are not referenced in this body. The asm takes the
 * blend weights and the loop bound (offset 8280 of %5) from the context.
 * When DITHER1XBPP is defined, the per-channel dither bytes stored in the
 * context are added with unsigned saturation before the pixels are written.
 *
 * asm operands: %0 = buf0 (REG_c), %1 = buf1 (REG_d), %2 = ubuf0 (REG_S),
 *               %3 = ubuf1 (REG_D), %4 = dest (memory), %5 = &c->redDither
 *               (REG_a).
 * REG_b (destination pointer) and REG_BP (pixel index) are not in the
 * operand or clobber lists, so the asm saves REG_b to ESP_OFFSET(%5),
 * pushes REG_BP, and restores both at the end.
 */
00955 static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
00956                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
00957                                  const int16_t *abuf[2], uint8_t *dest,
00958                                  int dstW, int yalpha, int uvalpha, int y)
00959 {
00960     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
00961                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
00962 
00963     //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
00964     __asm__ volatile(
00965         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
00966         "mov        %4, %%"REG_b"               \n\t"
00967         "push %%"REG_BP"                        \n\t"
00968         YSCALEYUV2RGB(%%REGBP, %5)
00969         "pxor    %%mm7, %%mm7                   \n\t"
00970         /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
00971 #ifdef DITHER1XBPP
00972         "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
00973         "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
00974         "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
00975 #endif
00976         WRITERGB16(%%REGb, 8280(%5), %%REGBP)
00977         "pop %%"REG_BP"                         \n\t"
00978         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
00979         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
00980            "a" (&c->redDither)
00981     );
00982 }
00983 
/*
 * REAL_YSCALEYUV2PACKED(index, c) - two-line blend of luma and chroma
 * without colour conversion, for packed YUV output.
 *
 * Before the loop, the blend weights stored at CHR_MMX_FILTER_OFFSET+8(c)
 * and LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back,
 * so every execution modifies those context fields in memory.
 *
 * Loop body (local label 1, 16-byte aligned), per 16-bit lane, with wc/wl
 * the rescaled chroma/luma weights:
 *     mm3 = (((%2) - (%3)) * wc >> 16) + ((%3) >> 7)    first chroma plane
 *     mm4 = the same at index + UV_OFF_BYTE(c)          second chroma plane
 *     mm1 = (((%0) - (%1)) * wl >> 16) + ((%1) >> 7)    luma samples 0..3
 *     mm7 = the same                                    luma samples 4..7
 * The "+2048" in the lane comments is historical; the offset actually used
 * is the value read from UV_OFF_BYTE(c).
 * Clobbers mm0, mm2, mm5, mm6. The loop is closed by the WRITE* macro that
 * follows the expansion.
 *
 * YSCALEYUV2PACKED is the public spelling: the extra macro level lets
 * arguments be macro-expanded before they are stringified with '#'.
 */
00984 #define REAL_YSCALEYUV2PACKED(index, c) \
00985     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
00986     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
00987     "psraw                $3, %%mm0                           \n\t"\
00988     "psraw                $3, %%mm1                           \n\t"\
00989     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00990     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00991     "xor            "#index", "#index"                        \n\t"\
00992     ".p2align              4            \n\t"\
00993     "1:                                 \n\t"\
00994     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00995     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00996     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
00997     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00998     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00999     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01000     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
01001     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
01002     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
01003     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
01004     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
01005     "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
01006     "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
01007     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
01008     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
01009     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
01010     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
01011     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
01012     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
01013     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
01014     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
01015     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
01016     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
01017     "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
01018     "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
01019     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
01020     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
01021 
01022 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
01023 
/**
 * Blend two source lines and write packed YUYV 4:2:2 output.
 *
 * Only c, buf[0..1], ubuf[0..1] and dest are used by name; vbuf, abuf, dstW,
 * yalpha, uvalpha and y are not referenced in this body. The asm takes the
 * blend weights and the loop bound (offset 8280 of %5) from the context, and
 * YSCALEYUV2PACKED rewrites the stored weights (>>3) each time it runs.
 *
 * asm operands: %0 = buf0 (REG_c), %1 = buf1 (REG_d), %2 = ubuf0 (REG_S),
 *               %3 = ubuf1 (REG_D), %4 = dest (memory), %5 = &c->redDither
 *               (REG_a).
 * REG_b (destination pointer) and REG_BP (pixel index) are not in the
 * operand or clobber lists, so the asm saves REG_b to ESP_OFFSET(%5),
 * pushes REG_BP, and restores both at the end.
 */
01024 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
01025                                   const int16_t *ubuf[2], const int16_t *vbuf[2],
01026                                   const int16_t *abuf[2], uint8_t *dest,
01027                                   int dstW, int yalpha, int uvalpha, int y)
01028 {
01029     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
01030                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01031 
01032     //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
01033     __asm__ volatile(
01034         "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01035         "mov %4, %%"REG_b"                        \n\t"
01036         "push %%"REG_BP"                        \n\t"
01037         YSCALEYUV2PACKED(%%REGBP, %5)
01038         WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01039         "pop %%"REG_BP"                         \n\t"
01040         "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01041         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01042            "a" (&c->redDither)
01043     );
01044 }
01045 
/*
 * REAL_YSCALEYUV2RGB1(index, c) - single-line YUV->RGB for 8 pixels per
 * iteration, with no blending between lines.
 *
 * Zeroes index and opens the 16-byte aligned loop at local label 1. Chroma
 * is read only from (%2,index) and (%2,index + UV_OFF_BYTE(c)), luma only
 * from (%0,index,2); asm operands %1 and %3 are not touched. All samples
 * are shifted right by 4 before the offsets and coefficients are applied.
 * The "+2048" in the lane comments is historical; the offset actually used
 * is the value read from UV_OFF_BYTE(c).
 *
 * Output: mm2, mm4, mm5 = 8 packed bytes each, built from the UB, UG+VG and
 *         VR terms respectively (blue, green, red).
 * Clobbers mm0, mm1, mm3, mm6, mm7.
 *
 * YSCALEYUV2RGB1 is the public spelling: the extra macro level lets
 * arguments be macro-expanded before they are stringified with '#'.
 */
01046 #define REAL_YSCALEYUV2RGB1(index, c) \
01047     "xor            "#index", "#index"  \n\t"\
01048     ".p2align              4            \n\t"\
01049     "1:                                 \n\t"\
01050     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
01051     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01052     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
01053     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01054     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
01055     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] >>4*/\
01056     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
01057     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
01058     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
01059     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
01060     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
01061     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
01062     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
01063     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
01064     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
01065     "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
01066     "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
01067     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
01068     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
01069     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
01070     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
01071     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
01072     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
01073     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
01074     "paddw             %%mm3, %%mm4     \n\t"\
01075     "movq              %%mm2, %%mm0     \n\t"\
01076     "movq              %%mm5, %%mm6     \n\t"\
01077     "movq              %%mm4, %%mm3     \n\t"\
01078     "punpcklwd         %%mm2, %%mm2     \n\t"\
01079     "punpcklwd         %%mm5, %%mm5     \n\t"\
01080     "punpcklwd         %%mm4, %%mm4     \n\t"\
01081     "paddw             %%mm1, %%mm2     \n\t"\
01082     "paddw             %%mm1, %%mm5     \n\t"\
01083     "paddw             %%mm1, %%mm4     \n\t"\
01084     "punpckhwd         %%mm0, %%mm0     \n\t"\
01085     "punpckhwd         %%mm6, %%mm6     \n\t"\
01086     "punpckhwd         %%mm3, %%mm3     \n\t"\
01087     "paddw             %%mm7, %%mm0     \n\t"\
01088     "paddw             %%mm7, %%mm6     \n\t"\
01089     "paddw             %%mm7, %%mm3     \n\t"\
01090     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
01091     "packuswb          %%mm0, %%mm2     \n\t"\
01092     "packuswb          %%mm6, %%mm5     \n\t"\
01093     "packuswb          %%mm3, %%mm4     \n\t"\
01094 
01095 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
01096 
/*
 * REAL_YSCALEYUV2RGB1b(index, c) - like REAL_YSCALEYUV2RGB1, but chroma is
 * the unweighted average of two lines: the words at (%2,index) and
 * (%3,index) are added and the sum shifted right by 5 with a logical shift
 * (psrlw); the same is done for the plane at +UV_OFF_BYTE(c). Luma still
 * comes from (%0,index,2) only, shifted right by 4; operand %1 is not
 * touched. The "+2048" in the lane comments is historical; the offset
 * actually used is the value read from UV_OFF_BYTE(c).
 *
 * Output: mm2, mm4, mm5 = 8 packed bytes each, built from the UB, UG+VG and
 *         VR terms respectively (blue, green, red).
 * Clobbers mm0, mm1, mm3, mm6, mm7.
 *
 * YSCALEYUV2RGB1b is the public spelling: the extra macro level lets
 * arguments be macro-expanded before they are stringified with '#'.
 */
01097 // do vertical chrominance interpolation
01098 #define REAL_YSCALEYUV2RGB1b(index, c) \
01099     "xor            "#index", "#index"  \n\t"\
01100     ".p2align              4            \n\t"\
01101     "1:                                 \n\t"\
01102     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
01103     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
01104     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01105     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
01106     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
01107     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01108     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
01109     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
01110     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
01111     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
01112     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
01113     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
01114     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
01115     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
01116     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
01117     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
01118     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
01119     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
01120     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
01121     "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
01122     "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
01123     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
01124     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
01125     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
01126     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
01127     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
01128     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
01129     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
01130     "paddw             %%mm3, %%mm4     \n\t"\
01131     "movq              %%mm2, %%mm0     \n\t"\
01132     "movq              %%mm5, %%mm6     \n\t"\
01133     "movq              %%mm4, %%mm3     \n\t"\
01134     "punpcklwd         %%mm2, %%mm2     \n\t"\
01135     "punpcklwd         %%mm5, %%mm5     \n\t"\
01136     "punpcklwd         %%mm4, %%mm4     \n\t"\
01137     "paddw             %%mm1, %%mm2     \n\t"\
01138     "paddw             %%mm1, %%mm5     \n\t"\
01139     "paddw             %%mm1, %%mm4     \n\t"\
01140     "punpckhwd         %%mm0, %%mm0     \n\t"\
01141     "punpckhwd         %%mm6, %%mm6     \n\t"\
01142     "punpckhwd         %%mm3, %%mm3     \n\t"\
01143     "paddw             %%mm7, %%mm0     \n\t"\
01144     "paddw             %%mm7, %%mm6     \n\t"\
01145     "paddw             %%mm7, %%mm3     \n\t"\
01146     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
01147     "packuswb          %%mm0, %%mm2     \n\t"\
01148     "packuswb          %%mm6, %%mm5     \n\t"\
01149     "packuswb          %%mm3, %%mm4     \n\t"\
01150 
01151 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
01152 
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index) - load 8 alpha samples for the
 * single-line RGB32 path.
 *
 * Reads 8 words from asm operand %1 at byte offset 2*index, shifts each
 * right by 7 (arithmetic) and packs them with unsigned saturation into the
 * 8 bytes of mm7. Clobbers mm1. The enclosing asm must therefore bind the
 * alpha line to operand %1.
 *
 * YSCALEYUV2RGB1_ALPHA is the public spelling: the extra macro level lets
 * the argument be macro-expanded before it is stringified with '#'.
 */
01153 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
01154     "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
01155     "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
01156     "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
01157     "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
01158     "packuswb          %%mm1, %%mm7     \n\t"
01159 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
01160 
/**
 * Convert one luma line to packed 32-bit RGB without blending luma lines.
 *
 * uvalpha selects the chroma source: below 2048 only ubuf[0] is read
 * (YSCALEYUV2RGB1); otherwise ubuf[0] and ubuf[1] are averaged
 * (YSCALEYUV2RGB1b). Each case has an alpha and a no-alpha variant:
 *  - alpha (CONFIG_SWSCALE_ALPHA && c->alpPixBuf): abuf0 is bound to asm
 *    operand %1 (REG_d), where YSCALEYUV2RGB1_ALPHA reads it into mm7.
 *  - no alpha: buf1 (an alias of buf0) is bound to %1 and mm7 is set to
 *    all ones before WRITEBGR32.
 *
 * dstW and y are not referenced in this body; the loop bound is read from
 * offset 8280 of %5. The bguf parameter is also unused here.
 * NOTE(review): "bguf" looks like a typo for "vbuf" (compare the _2
 * functions) - confirm before renaming.
 *
 * asm operands: %0 = buf0 (REG_c), %1 = abuf0 or buf1 (REG_d),
 *               %2 = ubuf0 (REG_S), %3 = ubuf1 (REG_D), %4 = dest (memory),
 *               %5 = &c->redDither (REG_a).
 * REG_b (destination pointer) and REG_BP (pixel index) are not in the
 * operand or clobber lists, so each asm saves REG_b to ESP_OFFSET(%5),
 * pushes REG_BP, and restores both at the end.
 */
01164 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
01165                                 const int16_t *ubuf[2], const int16_t *bguf[2],
01166                                 const int16_t *abuf0, uint8_t *dest,
01167                                 int dstW, int uvalpha, int y)
01168 {
01169     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01170     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01171 
01172     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01173         if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01174             __asm__ volatile(
01175                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01176                 "mov        %4, %%"REG_b"               \n\t"
01177                 "push %%"REG_BP"                        \n\t"
01178                 YSCALEYUV2RGB1(%%REGBP, %5)
01179                 YSCALEYUV2RGB1_ALPHA(%%REGBP)
01180                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01181                 "pop %%"REG_BP"                         \n\t"
01182                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01183                 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01184                    "a" (&c->redDither)
01185             );
01186         } else {
01187             __asm__ volatile(
01188                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01189                 "mov        %4, %%"REG_b"               \n\t"
01190                 "push %%"REG_BP"                        \n\t"
01191                 YSCALEYUV2RGB1(%%REGBP, %5)
01192                 "pcmpeqd %%mm7, %%mm7                   \n\t"
01193                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01194                 "pop %%"REG_BP"                         \n\t"
01195                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01196                 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01197                    "a" (&c->redDither)
01198             );
01199         }
01200     } else {
01201         if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
01202             __asm__ volatile(
01203                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01204                 "mov        %4, %%"REG_b"               \n\t"
01205                 "push %%"REG_BP"                        \n\t"
01206                 YSCALEYUV2RGB1b(%%REGBP, %5)
01207                 YSCALEYUV2RGB1_ALPHA(%%REGBP)
01208                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01209                 "pop %%"REG_BP"                         \n\t"
01210                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01211                 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01212                    "a" (&c->redDither)
01213             );
01214         } else {
01215             __asm__ volatile(
01216                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01217                 "mov        %4, %%"REG_b"               \n\t"
01218                 "push %%"REG_BP"                        \n\t"
01219                 YSCALEYUV2RGB1b(%%REGBP, %5)
01220                 "pcmpeqd %%mm7, %%mm7                   \n\t"
01221                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01222                 "pop %%"REG_BP"                         \n\t"
01223                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01224                 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01225                    "a" (&c->redDither)
01226             );
01227         }
01228     }
01229 }
01230 
/**
 * Packed-output writer for BGR24 fed from a single luma line (buf0),
 * implemented as one inline-asm statement per uvalpha branch.
 *
 * uvalpha < 2048 expands YSCALEYUV2RGB1; otherwise YSCALEYUV2RGB1b is
 * expanded instead. Both branches then zero mm7 and expand WRITEBGR24.
 *
 * Asm operands (identical in both branches):
 *   %0 = buf0 ("c"), %1 = buf1 ("d"), %2 = ubuf0 ("S"), %3 = ubuf1 ("D"),
 *   %4 = dest (memory operand), %5 = &c->redDither ("a").
 *
 * bguf, abuf0, dstW and y are not referenced by the C code of this function.
 * NOTE(review): "bguf" looks like a misspelling of "vbuf" -- confirm against
 * the yuv2packed1 function-pointer prototype.
 * NOTE(review): 8280(%5) is presumably the offset of dstW relative to
 * c->redDither -- confirm against the SwsContext layout.
 * NOTE(review): the asm statements declare no outputs and no clobbers even
 * though they store through %5; the YSCALEYUV2RGB1* and WRITEBGR24 macro
 * bodies are outside this view, so their register/memory use is not shown.
 */
01231 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
01232                                 const int16_t *ubuf[2], const int16_t *bguf[2],
01233                                 const int16_t *abuf0, uint8_t *dest,
01234                                 int dstW, int uvalpha, int y)
01235 {
01236     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01237     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01238 
01239     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01240         __asm__ volatile(
                  /* park REG_b in the context (ESP_OFFSET from %5), then reuse it to hold dest */
01241             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01242             "mov        %4, %%"REG_b"               \n\t"
                  /* REG_BP is handed to the macros below, so it is saved on the stack first */
01243             "push %%"REG_BP"                        \n\t"
01244             YSCALEYUV2RGB1(%%REGBP, %5)
01245             "pxor    %%mm7, %%mm7                   \n\t"
01246             WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                  /* undo the two saves in reverse order */
01247             "pop %%"REG_BP"                         \n\t"
01248             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01249             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01250                "a" (&c->redDither)
01251         );
01252     } else {
              /* same sequence as above, but with the YSCALEYUV2RGB1b variant */
01253         __asm__ volatile(
01254             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01255             "mov        %4, %%"REG_b"               \n\t"
01256             "push %%"REG_BP"                        \n\t"
01257             YSCALEYUV2RGB1b(%%REGBP, %5)
01258             "pxor    %%mm7, %%mm7                   \n\t"
01259             WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01260             "pop %%"REG_BP"                         \n\t"
01261             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01262             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01263                "a" (&c->redDither)
01264         );
01265     }
01266 }
01267 
/**
 * Packed-output writer for RGB555 fed from a single luma line (buf0),
 * implemented as one inline-asm statement per uvalpha branch.
 *
 * uvalpha < 2048 expands YSCALEYUV2RGB1; otherwise YSCALEYUV2RGB1b. Both
 * branches zero mm7, optionally add the per-channel dither values (only when
 * DITHER1XBPP is defined) and then expand WRITERGB15.
 *
 * Asm operands (identical in both branches):
 *   %0 = buf0 ("c"), %1 = buf1 ("d"), %2 = ubuf0 ("S"), %3 = ubuf1 ("D"),
 *   %4 = dest (memory operand), %5 = &c->redDither ("a").
 *
 * bguf, abuf0, dstW and y are not referenced by the C code of this function.
 * NOTE(review): 8280(%5) is presumably the offset of dstW relative to
 * c->redDither -- confirm against the SwsContext layout.
 * NOTE(review): no outputs or clobbers are declared although the asm stores
 * through %5; the YSCALEYUV2RGB1* / WRITERGB15 bodies are outside this view.
 */
01268 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
01269                                  const int16_t *ubuf[2], const int16_t *bguf[2],
01270                                  const int16_t *abuf0, uint8_t *dest,
01271                                  int dstW, int uvalpha, int y)
01272 {
01273     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01274     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01275 
01276     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01277         __asm__ volatile(
                  /* park REG_b in the context, reuse it for dest, and save REG_BP */
01278             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01279             "mov        %4, %%"REG_b"               \n\t"
01280             "push %%"REG_BP"                        \n\t"
01281             YSCALEYUV2RGB1(%%REGBP, %5)
01282             "pxor    %%mm7, %%mm7                   \n\t"
01283             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01284 #ifdef DITHER1XBPP
                  /* saturating byte add of the dither values stored in the context */
01285             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01286             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01287             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01288 #endif
01289             WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                  /* restore REG_BP and REG_b */
01290             "pop %%"REG_BP"                         \n\t"
01291             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01292             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01293                "a" (&c->redDither)
01294         );
01295     } else {
              /* same sequence as above, but with the YSCALEYUV2RGB1b variant */
01296         __asm__ volatile(
01297             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01298             "mov        %4, %%"REG_b"               \n\t"
01299             "push %%"REG_BP"                        \n\t"
01300             YSCALEYUV2RGB1b(%%REGBP, %5)
01301             "pxor    %%mm7, %%mm7                   \n\t"
01302             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01303 #ifdef DITHER1XBPP
01304             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01305             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01306             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01307 #endif
01308             WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01309             "pop %%"REG_BP"                         \n\t"
01310             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01311             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01312                "a" (&c->redDither)
01313         );
01314     }
01315 }
01316 
/**
 * Packed-output writer for RGB565 fed from a single luma line (buf0),
 * implemented as one inline-asm statement per uvalpha branch.
 *
 * The structure is the same as the RGB555 writer except that WRITERGB16 is
 * expanded for the final store: uvalpha < 2048 expands YSCALEYUV2RGB1,
 * otherwise YSCALEYUV2RGB1b; mm7 is zeroed; dither values are added only
 * when DITHER1XBPP is defined.
 *
 * Asm operands (identical in both branches):
 *   %0 = buf0 ("c"), %1 = buf1 ("d"), %2 = ubuf0 ("S"), %3 = ubuf1 ("D"),
 *   %4 = dest (memory operand), %5 = &c->redDither ("a").
 *
 * bguf, abuf0, dstW and y are not referenced by the C code of this function.
 * NOTE(review): 8280(%5) is presumably the offset of dstW relative to
 * c->redDither -- confirm against the SwsContext layout.
 * NOTE(review): no outputs or clobbers are declared although the asm stores
 * through %5; the YSCALEYUV2RGB1* / WRITERGB16 bodies are outside this view.
 */
01317 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
01318                                  const int16_t *ubuf[2], const int16_t *bguf[2],
01319                                  const int16_t *abuf0, uint8_t *dest,
01320                                  int dstW, int uvalpha, int y)
01321 {
01322     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01323     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01324 
01325     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01326         __asm__ volatile(
                  /* park REG_b in the context, reuse it for dest, and save REG_BP */
01327             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01328             "mov        %4, %%"REG_b"               \n\t"
01329             "push %%"REG_BP"                        \n\t"
01330             YSCALEYUV2RGB1(%%REGBP, %5)
01331             "pxor    %%mm7, %%mm7                   \n\t"
01332             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01333 #ifdef DITHER1XBPP
                  /* saturating byte add of the dither values stored in the context */
01334             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01335             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01336             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01337 #endif
01338             WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                  /* restore REG_BP and REG_b */
01339             "pop %%"REG_BP"                         \n\t"
01340             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01341             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01342                "a" (&c->redDither)
01343         );
01344     } else {
              /* same sequence as above, but with the YSCALEYUV2RGB1b variant */
01345         __asm__ volatile(
01346             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01347             "mov        %4, %%"REG_b"               \n\t"
01348             "push %%"REG_BP"                        \n\t"
01349             YSCALEYUV2RGB1b(%%REGBP, %5)
01350             "pxor    %%mm7, %%mm7                   \n\t"
01351             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01352 #ifdef DITHER1XBPP
01353             "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01354             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01355             "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01356 #endif
01357             WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01358             "pop %%"REG_BP"                         \n\t"
01359             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01360             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01361                "a" (&c->redDither)
01362         );
01363     }
01364 }
01365 
/*
 * Loop head for the single-chroma-line packed writers.
 *
 * Emits asm text that zeroes `index`, aligns and opens local label "1:",
 * then per iteration:
 *   - loads 8 bytes from (%2, index) into mm3, and 8 bytes from the same
 *     base with index temporarily advanced by UV_OFF_BYTE(c) into mm4;
 *   - arithmetic-shifts both right by 7 (per 16-bit word);
 *   - loads 16 bytes of luma from (%0, index, 2) into mm1 and mm7 and
 *     arithmetic-shifts both right by 7.
 * The macro neither advances `index` nor branches back to "1:"; presumably
 * the WRITE* macro expanded after it closes the loop -- confirm there.
 *
 * YSCALEYUV2PACKED1 is the entry point; the extra level lets the arguments
 * be macro-expanded before REAL_YSCALEYUV2PACKED1 stringifies them with #.
 */
01366 #define REAL_YSCALEYUV2PACKED1(index, c) \
01367     "xor            "#index", "#index"  \n\t"\
01368     ".p2align              4            \n\t"\
01369     "1:                                 \n\t"\
01370     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
01371     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01372     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
01373     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01374     "psraw                $7, %%mm3     \n\t" \
01375     "psraw                $7, %%mm4     \n\t" \
01376     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
01377     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
01378     "psraw                $7, %%mm1     \n\t" \
01379     "psraw                $7, %%mm7     \n\t" \
01380 
01381 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
01382 
/*
 * Loop head for the packed writers that combine two chroma lines.
 *
 * Emits asm text that zeroes `index`, aligns and opens local label "1:",
 * then per iteration:
 *   - loads 8 bytes each from (%2, index) and (%3, index) into mm2/mm3, and
 *     from the same bases with index temporarily advanced by UV_OFF_BYTE(c)
 *     into mm5/mm4;
 *   - adds the two lines word-wise (mm3 += mm2, mm4 += mm5) and
 *     logical-shifts the sums right by 8;
 *   - loads 16 bytes of luma from (%0, index, 2) into mm1 and mm7 and
 *     arithmetic-shifts both right by 7.
 * As with the single-line variant, the loop is not closed here; presumably
 * the WRITE* macro expanded after it advances `index` and branches to "1:".
 *
 * YSCALEYUV2PACKED1b is the entry point; the extra level lets the arguments
 * be macro-expanded before REAL_YSCALEYUV2PACKED1b stringifies them with #.
 */
01383 #define REAL_YSCALEYUV2PACKED1b(index, c) \
01384     "xor "#index", "#index"             \n\t"\
01385     ".p2align              4            \n\t"\
01386     "1:                                 \n\t"\
01387     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
01388     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
01389     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01390     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
01391     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
01392     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
01393     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
01394     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
01395     "psrlw                $8, %%mm3     \n\t" \
01396     "psrlw                $8, %%mm4     \n\t" \
01397     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
01398     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
01399     "psraw                $7, %%mm1     \n\t" \
01400     "psraw                $7, %%mm7     \n\t"
01401 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
01402 
/**
 * Packed-output writer for YUYV422 fed from a single luma line (buf0),
 * implemented as one inline-asm statement per uvalpha branch.
 *
 * uvalpha < 2048 expands YSCALEYUV2PACKED1 (chroma taken from ubuf0 only);
 * otherwise YSCALEYUV2PACKED1b (ubuf0 and ubuf1 summed and shifted) is
 * expanded. Both branches then expand WRITEYUY2.
 *
 * Asm operands (identical in both branches):
 *   %0 = buf0 ("c"), %1 = buf1 ("d"), %2 = ubuf0 ("S"), %3 = ubuf1 ("D"),
 *   %4 = dest (memory operand), %5 = &c->redDither ("a").
 * REG_BP is passed as the `index` argument of the PACKED1 macros and %5 as
 * their context base.
 *
 * bguf, abuf0, dstW and y are not referenced by the C code of this function.
 * NOTE(review): 8280(%5) is presumably the offset of dstW relative to
 * c->redDither -- confirm against the SwsContext layout.
 * NOTE(review): no outputs or clobbers are declared although the asm stores
 * through %5; the WRITEYUY2 body is outside this view.
 */
01403 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
01404                                   const int16_t *ubuf[2], const int16_t *bguf[2],
01405                                   const int16_t *abuf0, uint8_t *dest,
01406                                   int dstW, int uvalpha, int y)
01407 {
01408     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
01409     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01410 
01411     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01412         __asm__ volatile(
                  /* park REG_b in the context, reuse it for dest, and save REG_BP (used as loop index) */
01413             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01414             "mov        %4, %%"REG_b"               \n\t"
01415             "push %%"REG_BP"                        \n\t"
01416             YSCALEYUV2PACKED1(%%REGBP, %5)
01417             WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                  /* restore REG_BP and REG_b */
01418             "pop %%"REG_BP"                         \n\t"
01419             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01420             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01421                "a" (&c->redDither)
01422         );
01423     } else {
              /* same sequence as above, but with the two-chroma-line loop head */
01424         __asm__ volatile(
01425             "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01426             "mov        %4, %%"REG_b"               \n\t"
01427             "push %%"REG_BP"                        \n\t"
01428             YSCALEYUV2PACKED1b(%%REGBP, %5)
01429             WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01430             "pop %%"REG_BP"                         \n\t"
01431             "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01432             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
01433                "a" (&c->redDither)
01434         );
01435     }
01436 }
01437 
/**
 * Convert packed 24-bit RGB/BGR pixels to 16-bit luma samples with MMX.
 *
 * @param dst       output; each loop iteration stores four int16_t values
 * @param src       input; each loop iteration consumes 12 bytes
 * @param width     number of output samples (the loop counter starts at
 *                  -2*width bytes and is indexed from dst+width)
 * @param srcFormat PIX_FMT_BGR24 selects the ff_bgr24toY* coefficients; any
 *                  other value selects the ff_rgb24toY* coefficients
 *
 * The loop condition is tested after the body, so one group of four samples
 * is always processed, and the last group is stored in full even when width
 * is not a multiple of 4 -- presumably callers provide padded buffers;
 * confirm.
 * NOTE(review): mm5/mm6 are loaded in a separate asm statement and consumed
 * by the next one, and no MMX register or "memory" clobber is declared.
 */
01438 static av_always_inline void RENAME(bgr24ToY_mmx)(int16_t *dst, const uint8_t *src,
01439                                                   int width, enum PixelFormat srcFormat)
01440 {
01441 
          /* select the two coefficient quadwords used by pmaddwd below */
01442     if(srcFormat == PIX_FMT_BGR24) {
01443         __asm__ volatile(
01444             "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
01445             "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
01446             :
01447         );
01448     } else {
01449         __asm__ volatile(
01450             "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
01451             "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
01452             :
01453         );
01454     }
01455 
01456     __asm__ volatile(
              /* mm4 = offset added before the shift, REG_a = negative byte index, mm7 = 0 */
01457         "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
01458         "mov                        %2, %%"REG_a"   \n\t"
01459         "pxor                    %%mm7, %%mm7       \n\t"
01460         "1:                                         \n\t"
01461         PREFETCH"               64(%0)              \n\t"
              /* four overlapping 4-byte loads covering the 12 source bytes of this group */
01462         "movd                     (%0), %%mm0       \n\t"
01463         "movd                    2(%0), %%mm1       \n\t"
01464         "movd                    6(%0), %%mm2       \n\t"
01465         "movd                    8(%0), %%mm3       \n\t"
01466         "add                       $12, %0          \n\t"
              /* widen bytes to 16-bit words (mm7 is zero) */
01467         "punpcklbw               %%mm7, %%mm0       \n\t"
01468         "punpcklbw               %%mm7, %%mm1       \n\t"
01469         "punpcklbw               %%mm7, %%mm2       \n\t"
01470         "punpcklbw               %%mm7, %%mm3       \n\t"
              /* multiply-accumulate against the coefficients, then merge partial sums */
01471         "pmaddwd                 %%mm5, %%mm0       \n\t"
01472         "pmaddwd                 %%mm6, %%mm1       \n\t"
01473         "pmaddwd                 %%mm5, %%mm2       \n\t"
01474         "pmaddwd                 %%mm6, %%mm3       \n\t"
01475         "paddd                   %%mm1, %%mm0       \n\t"
01476         "paddd                   %%mm3, %%mm2       \n\t"
              /* add the offset, scale down by 2^9 and pack to four signed words */
01477         "paddd                   %%mm4, %%mm0       \n\t"
01478         "paddd                   %%mm4, %%mm2       \n\t"
01479         "psrad                     $9, %%mm0       \n\t"
01480         "psrad                     $9, %%mm2       \n\t"
01481         "packssdw                %%mm2, %%mm0       \n\t"
              /* store 8 bytes at (dst+width) + REG_a and loop while the index is still negative */
01482         "movq                %%mm0, (%1, %%"REG_a") \n\t"
01483         "add                        $8, %%"REG_a"   \n\t"
01484         " js                        1b              \n\t"
01485     : "+r" (src)
01486     : "r" (dst+width), "g" ((x86_reg)-2*width)
01487     : "%"REG_a
01488     );
01489 }
01490 
/**
 * Luma input converter for BGR24: forwards dst, src and width to the shared
 * MMX routine with PIX_FMT_BGR24. unused1, unused2 and unused are ignored.
 */
01491 static void RENAME(bgr24ToY)(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
01492                              int width, uint32_t *unused)
01493 {
01494     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
01495 }
01496 
/**
 * Luma input converter for RGB24: forwards dst, src and width to the shared
 * MMX routine with PIX_FMT_RGB24. unused1, unused2 and unused are ignored.
 */
01497 static void RENAME(rgb24ToY)(int16_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
01498                              int width, uint32_t *unused)
01499 {
01500     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
01501 }
01502 
/**
 * Convert packed 24-bit RGB/BGR pixels to 16-bit U and V samples with MMX.
 *
 * @param dstU      U output; each loop iteration stores four int16_t values
 * @param dstV      V output; each loop iteration stores four int16_t values
 * @param src       input; each loop iteration consumes 12 bytes
 * @param width     number of output samples per plane (the loop counter
 *                  starts at -2*width bytes, indexed from dstU/dstV + width)
 * @param srcFormat selects the coefficient table:
 *                  ff_bgr24toUV[srcFormat == PIX_FMT_RGB24]
 *
 * Coefficient table use (operand %4): quadwords at offsets 0 and 8 feed the
 * values stored to dstU, quadwords at offsets 16 and 24 (the latter cached
 * in mm6) feed the values stored to dstV.
 *
 * The loop condition is tested after the body, so one group of four samples
 * is always processed, and the last group is stored in full even when width
 * is not a multiple of 4 -- presumably callers provide padded buffers;
 * confirm.
 * NOTE(review): only REG_a is declared as clobbered; MMX registers and
 * memory are not.
 */
01503 static av_always_inline void RENAME(bgr24ToUV_mmx)(int16_t *dstU, int16_t *dstV,
01504                                                    const uint8_t *src, int width,
01505                                                    enum PixelFormat srcFormat)
01506 {
01507     __asm__ volatile(
              /* mm6 = 4th coefficient quadword, REG_a = negative byte index, mm7 = 0 */
01508         "movq                    24(%4), %%mm6       \n\t"
01509         "mov                        %3, %%"REG_a"   \n\t"
01510         "pxor                    %%mm7, %%mm7       \n\t"
01511         "1:                                         \n\t"
01512         PREFETCH"               64(%0)              \n\t"
              /* first half of the group: mm0 accumulates for dstU, mm2 for dstV */
01513         "movd                     (%0), %%mm0       \n\t"
01514         "movd                    2(%0), %%mm1       \n\t"
01515         "punpcklbw               %%mm7, %%mm0       \n\t"
01516         "punpcklbw               %%mm7, %%mm1       \n\t"
01517         "movq                    %%mm0, %%mm2       \n\t"
01518         "movq                    %%mm1, %%mm3       \n\t"
01519         "pmaddwd                  (%4), %%mm0       \n\t"
01520         "pmaddwd                 8(%4), %%mm1       \n\t"
01521         "pmaddwd                16(%4), %%mm2       \n\t"
01522         "pmaddwd                 %%mm6, %%mm3       \n\t"
01523         "paddd                   %%mm1, %%mm0       \n\t"
01524         "paddd                   %%mm3, %%mm2       \n\t"
01525 
              /* second half of the group: mm1 accumulates for dstU, mm4 for dstV */
01526         "movd                    6(%0), %%mm1       \n\t"
01527         "movd                    8(%0), %%mm3       \n\t"
01528         "add                       $12, %0          \n\t"
01529         "punpcklbw               %%mm7, %%mm1       \n\t"
01530         "punpcklbw               %%mm7, %%mm3       \n\t"
01531         "movq                    %%mm1, %%mm4       \n\t"
01532         "movq                    %%mm3, %%mm5       \n\t"
01533         "pmaddwd                  (%4), %%mm1       \n\t"
01534         "pmaddwd                 8(%4), %%mm3       \n\t"
01535         "pmaddwd                16(%4), %%mm4       \n\t"
01536         "pmaddwd                 %%mm6, %%mm5       \n\t"
01537         "paddd                   %%mm3, %%mm1       \n\t"
01538         "paddd                   %%mm5, %%mm4       \n\t"
01539 
              /* add the offset, scale down by 2^9 and pack each plane to four signed words */
01540         "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
01541         "paddd                   %%mm3, %%mm0       \n\t"
01542         "paddd                   %%mm3, %%mm2       \n\t"
01543         "paddd                   %%mm3, %%mm1       \n\t"
01544         "paddd                   %%mm3, %%mm4       \n\t"
01545         "psrad                     $9, %%mm0       \n\t"
01546         "psrad                     $9, %%mm2       \n\t"
01547         "psrad                     $9, %%mm1       \n\t"
01548         "psrad                     $9, %%mm4       \n\t"
01549         "packssdw                %%mm1, %%mm0       \n\t"
01550         "packssdw                %%mm4, %%mm2       \n\t"
              /* store to (dstU+width)+REG_a and (dstV+width)+REG_a; loop while the index is negative */
01551         "movq                %%mm0, (%1, %%"REG_a") \n\t"
01552         "movq                %%mm2, (%2, %%"REG_a") \n\t"
01553         "add                        $8, %%"REG_a"   \n\t"
01554         " js                        1b              \n\t"
01555     : "+r" (src)
01556     : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-2*width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
01557     : "%"REG_a
01558     );
01559 }
01560 
/**
 * Chroma input converter for BGR24: forwards dstU, dstV, src1 and width to
 * the shared MMX routine with PIX_FMT_BGR24.
 *
 * Only src1 is read, so src1 and src2 must be the same pointer; this
 * precondition is asserted before any conversion work is done, matching
 * rgb24ToUV. unused1 and unused are ignored.
 */
01561 static void RENAME(bgr24ToUV)(int16_t *dstU, int16_t *dstV,
01562                               const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
01563                               int width, uint32_t *unused)
01564 {
01565     assert(src1 == src2);
01566     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
01567 }
01568 
/**
 * Chroma input converter for RGB24: asserts that src1 and src2 are the same
 * pointer (only src1 is read), then forwards dstU, dstV, src1 and width to
 * the shared MMX routine with PIX_FMT_RGB24. unused1 and unused are ignored.
 */
01569 static void RENAME(rgb24ToUV)(int16_t *dstU, int16_t *dstV,
01570                               const uint8_t *unused1, const uint8_t *src1, const uint8_t *src2,
01571                               int width, uint32_t *unused)
01572 {
01573     assert(src1==src2);
01574     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
01575 }
01576 
01577 #if COMPILE_TEMPLATE_MMX2
/**
 * Fast-bilinear horizontal luma scaler that calls the code block stored in
 * c->lumMmx2FilterCode.
 *
 * @param c        context; hLumFilterPos, hLumFilter and lumMmx2FilterCode
 *                 are read
 * @param dst      output line (int16_t samples)
 * @param dstWidth number of output samples
 * @param src      input line (8-bit samples)
 * @param srcW     number of input samples
 * @param xInc     16.16 fixed-point source step (used only by the tail fixup)
 *
 * Register set-up before the calls: REG_c = src, REG_D = dst,
 * REG_d = filter, REG_b = filterPos, REG_a = 0, mm7 = 0.
 * CALL_MMX2_FILTER_CODE is expanded eight times; each expansion loads esi
 * from (REG_b), calls *%4, then advances REG_c by the 32-bit value at
 * (REG_b, REG_a), advances REG_D by REG_a and zeroes REG_a. The called code
 * is not visible here; presumably it is generated at init time and leaves
 * its output byte count in REG_a -- confirm.
 *
 * Save slots: under PIC, REG_b is saved to ebxsave and restored by hand (it
 * is only listed as a clobber when PIC is not defined). On x86-64 the
 * quadword at -8(%rsp), where `call` pushes its return address, is saved to
 * retsave and written back afterwards. Both slots are declared with
 * DECLARE_ALIGNED(8, ...), consistent with hcscale_fast.
 * NOTE(review): ebxsave/retsave are written by the asm but are listed as
 * input ("m") operands.
 */
01578 static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
01579                                  int dstWidth, const uint8_t *src,
01580                                  int srcW, int xInc)
01581 {
01582     int32_t *filterPos = c->hLumFilterPos;
01583     int16_t *filter    = c->hLumFilter;
01584     void    *mmx2FilterCode= c->lumMmx2FilterCode;
01585     int i;
01586 #if defined(PIC)
01587     DECLARE_ALIGNED(8, uint64_t, ebxsave);
01588 #endif
01589 #if ARCH_X86_64
01590     DECLARE_ALIGNED(8, uint64_t, retsave);
01591 #endif
01592 
01593     __asm__ volatile(
              /* save REG_b (PIC) and the stack slot that `call` will overwrite (x86-64) */
01594 #if defined(PIC)
01595         "mov               %%"REG_b", %5        \n\t"
01596 #if ARCH_X86_64
01597         "mov               -8(%%rsp), %%"REG_a" \n\t"
01598         "mov               %%"REG_a", %6        \n\t"
01599 #endif
01600 #else
01601 #if ARCH_X86_64
01602         "mov               -8(%%rsp), %%"REG_a" \n\t"
01603         "mov               %%"REG_a", %5        \n\t"
01604 #endif
01605 #endif
01606         "pxor                  %%mm7, %%mm7     \n\t"
01607         "mov                      %0, %%"REG_c" \n\t"
01608         "mov                      %1, %%"REG_D" \n\t"
01609         "mov                      %2, %%"REG_d" \n\t"
01610         "mov                      %3, %%"REG_b" \n\t"
01611         "xor               %%"REG_a", %%"REG_a" \n\t" // i
01612         PREFETCH"        (%%"REG_c")            \n\t"
01613         PREFETCH"      32(%%"REG_c")            \n\t"
01614         PREFETCH"      64(%%"REG_c")            \n\t"
01615 
01616 #if ARCH_X86_64
01617 #define CALL_MMX2_FILTER_CODE \
01618         "movl            (%%"REG_b"), %%esi     \n\t"\
01619         "call                    *%4            \n\t"\
01620         "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
01621         "add               %%"REG_S", %%"REG_c" \n\t"\
01622         "add               %%"REG_a", %%"REG_D" \n\t"\
01623         "xor               %%"REG_a", %%"REG_a" \n\t"\
01624 
01625 #else
01626 #define CALL_MMX2_FILTER_CODE \
01627         "movl (%%"REG_b"), %%esi        \n\t"\
01628         "call         *%4                       \n\t"\
01629         "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
01630         "add               %%"REG_a", %%"REG_D" \n\t"\
01631         "xor               %%"REG_a", %%"REG_a" \n\t"\
01632 
01633 #endif /* ARCH_X86_64 */
01634 
01635         CALL_MMX2_FILTER_CODE
01636         CALL_MMX2_FILTER_CODE
01637         CALL_MMX2_FILTER_CODE
01638         CALL_MMX2_FILTER_CODE
01639         CALL_MMX2_FILTER_CODE
01640         CALL_MMX2_FILTER_CODE
01641         CALL_MMX2_FILTER_CODE
01642         CALL_MMX2_FILTER_CODE
01643 
              /* restore what was saved at the top of the statement */
01644 #if defined(PIC)
01645         "mov                      %5, %%"REG_b" \n\t"
01646 #if ARCH_X86_64
01647         "mov                      %6, %%"REG_a" \n\t"
01648         "mov               %%"REG_a", -8(%%rsp) \n\t"
01649 #endif
01650 #else
01651 #if ARCH_X86_64
01652         "mov                      %5, %%"REG_a" \n\t"
01653         "mov               %%"REG_a", -8(%%rsp) \n\t"
01654 #endif
01655 #endif
01656         :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
01657            "m" (mmx2FilterCode)
01658 #if defined(PIC)
01659           ,"m" (ebxsave)
01660 #endif
01661 #if ARCH_X86_64
01662           ,"m"(retsave)
01663 #endif
01664         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
01665 #if !defined(PIC)
01666          ,"%"REG_b
01667 #endif
01668     );
01669 
          /* outputs whose source position reaches the last input sample are
           * overwritten with that sample scaled by 128 */
01670     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
01671         dst[i] = src[srcW-1]*128;
01672 }
01673 
/**
 * Fast-bilinear horizontal chroma scaler that calls the code block stored in
 * c->chrMmx2FilterCode, once for each of the two chroma planes.
 *
 * @param c        context; hChrFilterPos, hChrFilter and chrMmx2FilterCode
 *                 are read
 * @param dst1     output line for the first plane (int16_t samples)
 * @param dst2     output line for the second plane (int16_t samples)
 * @param dstWidth number of output samples per plane
 * @param src1     input line for the first plane (8-bit samples)
 * @param src2     input line for the second plane (8-bit samples)
 * @param srcW     number of input samples per plane
 * @param xInc     16.16 fixed-point source step (used only by the tail fixup)
 *
 * Asm operands: %0 = src1, %1 = dst1, %2 = filter, %3 = filterPos,
 * %4 = mmx2FilterCode, %5 = src2, %6 = dst2; %7 is ebxsave under PIC
 * (retsave otherwise, on x86-64) and %8 is retsave under PIC on x86-64.
 * CALL_MMX2_FILTER_CODE is expanded four times per plane; REG_d (filter) and
 * REG_b (filterPos) are set once and not reloaded between the planes.
 * The called code is not visible here; presumably it is generated at init
 * time -- confirm.
 * NOTE(review): ebxsave/retsave are written by the asm but are listed as
 * input ("m") operands.
 */
01674 static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
01675                                  int dstWidth, const uint8_t *src1,
01676                                  const uint8_t *src2, int srcW, int xInc)
01677 {
01678     int32_t *filterPos = c->hChrFilterPos;
01679     int16_t *filter    = c->hChrFilter;
01680     void    *mmx2FilterCode= c->chrMmx2FilterCode;
01681     int i;
01682 #if defined(PIC)
01683     DECLARE_ALIGNED(8, uint64_t, ebxsave);
01684 #endif
01685 #if ARCH_X86_64
01686     DECLARE_ALIGNED(8, uint64_t, retsave);
01687 #endif
01688 
01689     __asm__ volatile(
              /* save REG_b (PIC) and the stack slot that `call` will overwrite (x86-64) */
01690 #if defined(PIC)
01691         "mov          %%"REG_b", %7         \n\t"
01692 #if ARCH_X86_64
01693         "mov          -8(%%rsp), %%"REG_a"  \n\t"
01694         "mov          %%"REG_a", %8         \n\t"
01695 #endif
01696 #else
01697 #if ARCH_X86_64
01698         "mov          -8(%%rsp), %%"REG_a"  \n\t"
01699         "mov          %%"REG_a", %7         \n\t"
01700 #endif
01701 #endif
              /* first plane: REG_c = src1, REG_D = dst1 */
01702         "pxor             %%mm7, %%mm7      \n\t"
01703         "mov                 %0, %%"REG_c"  \n\t"
01704         "mov                 %1, %%"REG_D"  \n\t"
01705         "mov                 %2, %%"REG_d"  \n\t"
01706         "mov                 %3, %%"REG_b"  \n\t"
01707         "xor          %%"REG_a", %%"REG_a"  \n\t" // i
01708         PREFETCH"   (%%"REG_c")             \n\t"
01709         PREFETCH" 32(%%"REG_c")             \n\t"
01710         PREFETCH" 64(%%"REG_c")             \n\t"
01711 
01712         CALL_MMX2_FILTER_CODE
01713         CALL_MMX2_FILTER_CODE
01714         CALL_MMX2_FILTER_CODE
01715         CALL_MMX2_FILTER_CODE
              /* second plane: REG_c = src2, REG_D = dst2 */
01716         "xor          %%"REG_a", %%"REG_a"  \n\t" // i
01717         "mov                 %5, %%"REG_c"  \n\t" // src
01718         "mov                 %6, %%"REG_D"  \n\t" // buf2
01719         PREFETCH"   (%%"REG_c")             \n\t"
01720         PREFETCH" 32(%%"REG_c")             \n\t"
01721         PREFETCH" 64(%%"REG_c")             \n\t"
01722 
01723         CALL_MMX2_FILTER_CODE
01724         CALL_MMX2_FILTER_CODE
01725         CALL_MMX2_FILTER_CODE
01726         CALL_MMX2_FILTER_CODE
01727 
              /* restore what was saved at the top of the statement */
01728 #if defined(PIC)
01729         "mov %7, %%"REG_b"    \n\t"
01730 #if ARCH_X86_64
01731         "mov                 %8, %%"REG_a"  \n\t"
01732         "mov          %%"REG_a", -8(%%rsp)  \n\t"
01733 #endif
01734 #else
01735 #if ARCH_X86_64
01736         "mov                 %7, %%"REG_a"  \n\t"
01737         "mov          %%"REG_a", -8(%%rsp)  \n\t"
01738 #endif
01739 #endif
01740         :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
01741            "m" (mmx2FilterCode), "m" (src2), "m"(dst2)
01742 #if defined(PIC)
01743           ,"m" (ebxsave)
01744 #endif
01745 #if ARCH_X86_64
01746           ,"m"(retsave)
01747 #endif
01748         : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
01749 #if !defined(PIC)
01750          ,"%"REG_b
01751 #endif
01752     );
01753 
          /* outputs whose source position reaches the last input sample are
           * overwritten with that sample scaled by 128, in both planes */
01754     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
01755         dst1[i] = src1[srcW-1]*128;
01756         dst2[i] = src2[srcW-1]*128;
01757     }
01758 }
01759 #endif /* COMPILE_TEMPLATE_MMX2 */
01760 
/**
 * Install this template instantiation's function pointers into the context.
 *
 * Steps, in order:
 *  1. Clear c->use_mmx_vfilter.
 *  2. If the destination is not 16-bit, not 9/10-bit, not NV12/NV21 and
 *     SWS_BITEXACT is not set:
 *       - with SWS_ACCURATE_RND, pick the *_X_ar packed writers;
 *       - otherwise set use_mmx_vfilter, install yuv2yuvX as yuv2planeX and
 *         pick the plain *_X packed writers;
 *       - then pick the *_1 / *_2 packed writers.
 *     All packed-writer selections are skipped when SWS_FULL_CHR_H_INT is
 *     set, and formats not listed in a switch leave the pointer untouched.
 *  3. If srcBpc == 8 and dstBpc <= 10: install hyscale_fast/hcscale_fast
 *     when compiled with COMPILE_TEMPLATE_MMX2 and both SWS_FAST_BILINEAR
 *     and c->canMMX2BeUsed are set; otherwise set both pointers to NULL.
 *  4. Install the BGR24/RGB24 chroma input converter when chrSrcHSubSample
 *     is zero, and the BGR24/RGB24 luma input converter unconditionally.
 */
01761 static av_cold void RENAME(sws_init_swScale)(SwsContext *c)
01762 {
01763     enum PixelFormat srcFormat = c->srcFormat,
01764                      dstFormat = c->dstFormat;
01765     c->use_mmx_vfilter= 0;
01766     if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != PIX_FMT_NV12
01767         && dstFormat != PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
                  /* multi-tap vertical scaler + packed writers */
01768             if (c->flags & SWS_ACCURATE_RND) {
01769                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
01770                     switch (c->dstFormat) {
01771                     case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
01772                     case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
01773                     case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
01774                     case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
01775                     case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
01776                     default: break;
01777                     }
01778                 }
01779             } else {
01780                 c->use_mmx_vfilter= 1;
01781                 c->yuv2planeX = RENAME(yuv2yuvX    );
01782                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
01783                     switch (c->dstFormat) {
01784                     case PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
01785                     case PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
01786                     case PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
01787                     case PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
01788                     case PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
01789                     default: break;
01790                     }
01791                 }
01792             }
              /* one-line and two-line packed writers */
01793         if (!(c->flags & SWS_FULL_CHR_H_INT)) {
01794             switch (c->dstFormat) {
01795             case PIX_FMT_RGB32:
01796                 c->yuv2packed1 = RENAME(yuv2rgb32_1);
01797                 c->yuv2packed2 = RENAME(yuv2rgb32_2);
01798                 break;
01799             case PIX_FMT_BGR24:
01800                 c->yuv2packed1 = RENAME(yuv2bgr24_1);
01801                 c->yuv2packed2 = RENAME(yuv2bgr24_2);
01802                 break;
01803             case PIX_FMT_RGB555:
01804                 c->yuv2packed1 = RENAME(yuv2rgb555_1);
01805                 c->yuv2packed2 = RENAME(yuv2rgb555_2);
01806                 break;
01807             case PIX_FMT_RGB565:
01808                 c->yuv2packed1 = RENAME(yuv2rgb565_1);
01809                 c->yuv2packed2 = RENAME(yuv2rgb565_2);
01810                 break;
01811             case PIX_FMT_YUYV422:
01812                 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
01813                 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
01814                 break;
01815             default:
01816                 break;
01817             }
01818         }
01819     }
01820 
01821     if (c->srcBpc == 8 && c->dstBpc <= 10) {
01822     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
          /* the if/else braces below only exist in MMX2 builds; otherwise the
           * two NULL assignments run unconditionally inside the outer if */
01823 #if COMPILE_TEMPLATE_MMX2
01824     if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
01825     {
01826         c->hyscale_fast = RENAME(hyscale_fast);
01827         c->hcscale_fast = RENAME(hcscale_fast);
01828     } else {
01829 #endif /* COMPILE_TEMPLATE_MMX2 */
01830         c->hyscale_fast = NULL;
01831         c->hcscale_fast = NULL;
01832 #if COMPILE_TEMPLATE_MMX2
01833     }
01834 #endif /* COMPILE_TEMPLATE_MMX2 */
01835     }
01836 
          /* input converters for packed 24-bit RGB sources */
01837     if (!c->chrSrcHSubSample) {
01838         switch(srcFormat) {
01839         case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
01840         case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
01841         default: break;
01842         }
01843     }
01844 
01845     switch (srcFormat) {
01846     case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
01847     case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
01848     default: break;
01849     }
01850 }
Generated on Fri Feb 1 2013 14:34:57 for FFmpeg by doxygen 1.7.1