#ifdef DEBUG_ALIGNMENT
#include <assert.h>
/* Abort if ptr is not 16-byte aligned, as required by vec_ld()/vec_st(). */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr) & 0x0000000F))
#else
#define ASSERT_ALIGNED(ptr)
#endif

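/* Compute one row of the 8-pixel-wide chroma interpolation:
 * psum = vA*row0 + vB*row0[+1] + vC*row1 + vD*row1[+1] (+ BIAS1), optionally
 * biased again through BIAS2, then >> 6. The eight result bytes are merged
 * with the untouched half of the destination vector via fperm and written
 * back with OP_U8_ALTIVEC (put or avg, supplied by the file including this
 * template). The bottom row becomes the top row of the next iteration, so
 * only one new source row has to be loaded per output row. */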
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
    vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);\
    vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
    psum = vec_mladd(vB, vsrc1ssH, psum);\
    psum = vec_mladd(vC, vsrc2ssH, psum);\
    psum = vec_mladd(vD, vsrc3ssH, psum);\
    psum = BIAS2(psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    vsrc0ssH = vsrc2ssH;\
    vsrc1ssH = vsrc3ssH;\
\
    dst += stride;\
    src += stride;

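/* Two-tap special case of the core above, used when x == 0 or y == 0 and the
 * bilinear filter collapses to the taps vA and vE = vB + vC. */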
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);\
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);\
\
    psum = vec_mladd(vA, vsrc0ssH, v32ss);\
    psum = vec_mladd(vE, vsrc1ssH, psum);\
    psum = vec_sr(psum, v6us);\
\
    vdst = vec_ld(0, dst);\
    ppsum = (vec_u8)vec_pack(psum, psum);\
    vfdst = vec_perm(vdst, ppsum, fperm);\
\
    OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
    vec_st(fsum, 0, dst);\
\
    dst += stride;\
    src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

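/* H.264 8xh chroma motion compensation. The bilinear weights are derived from
 * the fractional position (x, y):
 *   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
 *   dst[i] = (A*s[i] + B*s[i+1] + C*s[i+stride] + D*s[i+stride+1] + 32) >> 6
 * src may be unaligned and is realigned with vec_lvsl()/vec_perm().
 * loadSecond is set when the nine bytes needed per row cross a 16-byte
 * boundary and a second vector has to be loaded; reallyBadAlign marks
 * src % 16 == 15, where lvsl(1, src) wraps around to zero and would select
 * the wrong vector, so the second load is used directly. The permute vectors
 * are reused for every row, which assumes stride % 16 == 0. */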
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

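    /* vec_st() can only write a full aligned 16-byte vector, so fperm merges
     * the eight result bytes with the untouched half of the destination
     * block: results go into the first half when dst % 16 == 0 and into the
     * second half otherwise (dst is assumed to be 8-byte aligned). */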
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

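    /* Load the first source row; the lvsl patterns for offsets 0 and 1
     * extract the pixel pairs (i, i+1) from the loaded vector(s). */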
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

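    /* ABCD[3] (= x*y) is nonzero only for a full 2-D interpolation. Otherwise
     * the filter collapses to the two taps vA and vE = vB + vC and the SIMPLE
     * core is used: ABCD[2] != 0 is the vertical-only case (x == 0, y != 0),
     * the final branch the horizontal-only / copy case (y == 0). */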
    if (ABCD[3]) {
        if (!loadSecond) {
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

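/* VC-1 "no rounding" variant of the chroma interpolation above: the rounding
 * bias added before the >> 6 is 28 instead of 32 (BIAS1 is zero, add28
 * supplies the constant after the multiply-adds). The alignment handling is
 * identical to PREFIX_h264_chroma_mc8_altivec. */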
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                                 int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1), vec_splat_u16(5)), vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

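/* Horizontal H.264 luma half-pel filter for a 16x16 block: each output pixel
 * is the 6-tap FIR (1, -5, 20, 20, -5, 1) over src[-2..+3], rounded with +16,
 * shifted right by 5 and clamped to 0..255 by vec_packsu(). The 16 pixels of
 * a row are processed as two vectors of eight 16-bit intermediates (the A and
 * B halves). dst must be 16-byte aligned. */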
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src,
                                                 int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

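        /* The 21-byte window src[-2] .. src[18] spans two or three aligned
         * 16-byte blocks depending on the alignment of src - 2; the switch
         * picks the vec_perm() combinations that extract the six shifted
         * source vectors, loading a third block only when it is needed. */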
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

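/* Vertical counterpart of the filter above: the same 6-tap FIR is applied
 * down each column with +16 rounding and >> 5. Six source rows are kept as
 * 16-bit vectors in registers and rotated each iteration, so only one new
 * row is loaded per output row. The realignment permute is computed once
 * from src, which assumes srcStride % 16 == 0; dst must be 16-byte aligned. */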
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src,
                                                 int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);

    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);

    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);

    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);

    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

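        /* Slide the six-row window down by one row for the next iteration. */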
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

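/* Combined horizontal + vertical half-pel filter for a 16x16 block, done in
 * two passes. Pass 1 applies the horizontal 6-tap filter to 16 + 5 = 21 rows
 * and stores the unrounded 16-bit intermediates to tmp. Pass 2 applies the
 * vertical 6-tap filter to tmp; since 20 * intermediate can overflow 16 bits,
 * the products are formed as 32-bit even/odd halves with vec_mule()/
 * vec_mulo(), rounded with +512 and shifted right by 10 before packing.
 * dst and tmp must be 16-byte aligned. */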
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
                                                  uint8_t * src, int dstStride,
                                                  int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

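        /* Same alignment handling as in the horizontal filter above. */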
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride;
    }

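    /* Pass 2: vertical 6-tap filter over the 16-bit intermediates in tmp,
     * keeping a six-row window of them in registers. */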
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

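        /* Sign-extend sum3 to 32 bits: shifting the vector reinterpreted as
         * vec_s32 right by 16 yields the even-indexed elements, multiplying
         * by one with vec_mulo() the odd-indexed ones. */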
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif