libavcodec/ppc/h264_template_altivec.c

/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

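/* Template file: the including file is expected to define the PREFIX_*
 * function-name macros and OP_U8_ALTIVEC (presumably the "put" or "avg"
 * store operation) before including this file, so that each function
 * below can be instantiated for both variants. */
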
#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

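/* CHROMA_MC8_ALTIVEC_CORE: one row of the bilinear chroma filter.  It merges
 * the new source bytes up to 16 bits, accumulates vA*vsrc0 + vB*vsrc1 +
 * vC*vsrc2 + vD*vsrc3 (plus the bias), shifts right by 6, and writes the
 * packed result back through fperm/OP_U8_ALTIVEC before advancing dst/src. */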
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

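/* Two-tap variant used when one bilinear weight pair vanishes (x == 0 or
 * y == 0): only vA and the combined weight vE are applied. */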
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

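/* 8x8 chroma motion compensation: for each output pixel this computes
 * ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6, where A..D are
 * the four neighbouring source samples selected by the fractional offsets
 * x and y. */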
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
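    /* When src sits more than 7 bytes into its 16-byte block, the 9 bytes
     * needed per row straddle two aligned vectors, so a second load is done.
     * At offset 15, vec_lvsl(1, src) wraps to 0 and the permute would pick
     * the wrong vector, so the second load is used as-is instead. */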

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

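    /* fperm routes the 8 freshly computed bytes into the half of the aligned
     * 16-byte destination vector that dst actually occupies (offset 0 or 8
     * within its block; stride % 16 == 0 keeps that offset constant). */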
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
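/* No-rounding variant used for VC-1 chroma: same bilinear core, but the bias
 * added before the >> 6 is 28 (via add28) instead of 32. */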
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
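/* Horizontal 16-wide 6-tap lowpass (half-pel) filter:
 * dst[i] = clip(((src[i-2]+src[i+3]) - 5*(src[i-1]+src[i+2])
 *                + 20*(src[i]+src[i+1]) + 16) >> 5) */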
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

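        /* Build the six shifted source vectors; align == ((src - 2) % 16).
         * For align <= 10 everything fits in srcR1/srcR2, align == 11 can
         * use srcR2 directly as srcP3, and align >= 12 needs a third
         * aligned load (srcR3). */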
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
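/* Vertical 16-wide 6-tap lowpass (half-pel) filter with the same tap weights
 * (1, -5, 20, 20, -5, 1) applied down the columns, rounded with +16 >> 5. */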
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

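    /* The five rows preloaded above form a sliding window that is shifted
     * down by one row per iteration as each new row (srcP3) is read. */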
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
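/* Combined horizontal+vertical half-pel filter: a first pass writes the
 * un-rounded horizontal 6-tap results (16-bit) into tmp, a second pass runs
 * the vertical 6-tap over those intermediates and rounds with +512 >> 10. */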
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

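        /* Same six-way unaligned load scheme as in the horizontal lowpass above. */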
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

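    /* Second (vertical) pass over the 16-bit intermediates: products are
     * widened to 32 bits with vec_mule/vec_mulo, 512 is added, and the sum
     * is shifted right by 10 before packing back down to bytes. */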
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif