00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034
00035 #include "libavutil/cpu.h"
00036 #include "libavutil/common.h"
00037 #include "libavutil/lfg.h"
00038
00039 #include "simple_idct.h"
00040 #include "aandcttab.h"
00041 #include "faandct.h"
00042 #include "faanidct.h"
00043 #include "x86/idct_xvid.h"
00044 #include "dctref.h"
00045
00046 #undef printf
00047
00048 void ff_mmx_idct(DCTELEM *data);
00049 void ff_mmxext_idct(DCTELEM *data);
00050
00051 void odivx_idct_c(short *block);
00052
00053
00054 void ff_bfin_idct(DCTELEM *block);
00055 void ff_bfin_fdct(DCTELEM *block);
00056
00057
00058 void fdct_altivec(DCTELEM *block);
00059
00060
00061
00062 void ff_j_rev_dct_arm(DCTELEM *data);
00063 void ff_simple_idct_arm(DCTELEM *data);
00064 void ff_simple_idct_armv5te(DCTELEM *data);
00065 void ff_simple_idct_armv6(DCTELEM *data);
00066 void ff_simple_idct_neon(DCTELEM *data);
00067
00068 void ff_simple_idct_axp(DCTELEM *data);
00069
00070 struct algo {
00071 const char *name;
00072 enum { FDCT, IDCT } is_idct;
00073 void (* func) (DCTELEM *block);
00074 void (* ref) (DCTELEM *block);
00075 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
00076 int mm_support;
00077 };
00078
/*
 * Format tag used for the "FAAN" entry of algos[]: NO_PERM when
 * FAAN_POSTSCALE is defined, SCALE_PERM otherwise.
 */
#ifdef FAAN_POSTSCALE
#define FAAN_SCALE NO_PERM
#else
#define FAAN_SCALE SCALE_PERM
#endif

/* CPU capability flags; set once in main() from av_get_cpu_flags(). */
static int cpu_flags;
00086
00087 struct algo algos[] = {
00088 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
00089 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
00090 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
00091 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
00092 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
00093 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
00094 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
00095 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
00096
00097 #if HAVE_MMX
00098 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
00099 #if HAVE_MMX2
00100 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
00101 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
00102 #endif
00103
00104 #if CONFIG_GPL
00105 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
00106 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
00107 #endif
00108 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
00109 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
00110 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
00111 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
00112 #endif
00113
00114 #if HAVE_ALTIVEC
00115 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
00116 #endif
00117
00118 #if ARCH_BFIN
00119 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
00120 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
00121 #endif
00122
00123 #if ARCH_ARM
00124 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
00125 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
00126 #if HAVE_ARMV5TE
00127 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
00128 #endif
00129 #if HAVE_ARMV6
00130 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
00131 #endif
00132 #if HAVE_NEON
00133 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
00134 #endif
00135 #endif
00136
00137 #if ARCH_ALPHA
00138 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
00139 #endif
00140
00141 { 0 }
00142 };
00143
00144 #define AANSCALE_BITS 12
00145
00146 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
00147
/* Return the current time reported by gettimeofday(), in microseconds. */
static int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);
    usec  = (int64_t)now.tv_sec * 1000000;
    usec += now.tv_usec;
    return usec;
}
00154
/* Number of blocks used for the accuracy measurement. */
#define NB_ITS 20000
/* Number of transform calls per timing batch in the speed measurement. */
#define NB_ITS_SPEED 50000

/* Input permutation for MMX_PERM transforms; filled by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Input permutation for MMX_SIMPLE_PERM transforms. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Column order within each row of 8 for SSE2_PERM transforms. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/*
 * Build idct_mmx_perm[]: bits 3-5 of the index stay in place, bits 1-2
 * move down to bits 0-1 and bit 0 moves up to bit 2.
 */
static void idct_mmx_init(void)
{
    int pos;

    for (pos = 0; pos < 64; pos++) {
        const int row_bits  = pos & 0x38;
        const int high_cols = (pos & 6) >> 1;
        const int low_col   = (pos & 1) << 2;

        idct_mmx_perm[pos] = row_bits | high_cols | low_col;
    }
}
00183
/*
 * Static 8x8 coefficient buffers shared by dct_error() and idct248_error():
 *   block     - handed to the transform under test
 *   block1    - generated input; dct_error() also runs the reference
 *               transform on it in place
 *   block_org - copy of the generated input; only read by a disabled
 *               (#if 0) debug dump in dct_error()
 * The first DECLARE_ALIGNED argument is presumably the requested alignment
 * in bytes - TODO confirm against the macro's definition.
 */
00184 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
00185 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
00186 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
00187
/*
 * Execute the x86 "emms" instruction when the build has MMX enabled and
 * cpu_flags reports MMX at run time; otherwise do nothing.
 */
static inline void mmx_emms(void)
{
#if HAVE_MMX
    if (!(cpu_flags & AV_CPU_FLAG_MMX))
        return;
    __asm__ volatile ("emms\n\t");
#endif
}
00195
00196 static void dct_error(const char *name, int is_idct,
00197 void (*fdct_func)(DCTELEM *block),
00198 void (*fdct_ref)(DCTELEM *block), int form, int test)
00199 {
00200 int it, i, scale;
00201 int err_inf, v;
00202 int64_t err2, ti, ti1, it1;
00203 int64_t sysErr[64], sysErrMax=0;
00204 int maxout=0;
00205 int blockSumErrMax=0, blockSumErr;
00206 AVLFG prng;
00207
00208 av_lfg_init(&prng, 1);
00209
00210 err_inf = 0;
00211 err2 = 0;
00212 for(i=0; i<64; i++) sysErr[i]=0;
00213 for(it=0;it<NB_ITS;it++) {
00214 for(i=0;i<64;i++)
00215 block1[i] = 0;
00216 switch(test){
00217 case 0:
00218 for(i=0;i<64;i++)
00219 block1[i] = (av_lfg_get(&prng) % 512) -256;
00220 if (is_idct){
00221 ff_ref_fdct(block1);
00222
00223 for(i=0;i<64;i++)
00224 block1[i]>>=3;
00225 }
00226 break;
00227 case 1:{
00228 int num = av_lfg_get(&prng) % 10 + 1;
00229 for(i=0;i<num;i++)
00230 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
00231 }break;
00232 case 2:
00233 block1[0] = av_lfg_get(&prng) % 4096 - 2048;
00234 block1[63]= (block1[0]&1)^1;
00235 break;
00236 }
00237
00238 #if 0 // simulate mismatch control
00239 { int sum=0;
00240 for(i=0;i<64;i++)
00241 sum+=block1[i];
00242
00243 if((sum&1)==0) block1[63]^=1;
00244 }
00245 #endif
00246
00247 for(i=0; i<64; i++)
00248 block_org[i]= block1[i];
00249
00250 if (form == MMX_PERM) {
00251 for(i=0;i<64;i++)
00252 block[idct_mmx_perm[i]] = block1[i];
00253 } else if (form == MMX_SIMPLE_PERM) {
00254 for(i=0;i<64;i++)
00255 block[idct_simple_mmx_perm[i]] = block1[i];
00256
00257 } else if (form == SSE2_PERM) {
00258 for(i=0; i<64; i++)
00259 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
00260 } else if (form == PARTTRANS_PERM) {
00261 for(i=0; i<64; i++)
00262 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
00263 } else {
00264 for(i=0; i<64; i++)
00265 block[i]= block1[i];
00266 }
00267 #if 0 // simulate mismatch control for tested IDCT but not the ref
00268 { int sum=0;
00269 for(i=0;i<64;i++)
00270 sum+=block[i];
00271
00272 if((sum&1)==0) block[63]^=1;
00273 }
00274 #endif
00275
00276 fdct_func(block);
00277 mmx_emms();
00278
00279 if (form == SCALE_PERM) {
00280 for(i=0; i<64; i++) {
00281 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00282 block[i] = (block[i] * scale ) >> AANSCALE_BITS;
00283 }
00284 }
00285
00286 fdct_ref(block1);
00287
00288 blockSumErr=0;
00289 for(i=0;i<64;i++) {
00290 v = abs(block[i] - block1[i]);
00291 if (v > err_inf)
00292 err_inf = v;
00293 err2 += v * v;
00294 sysErr[i] += block[i] - block1[i];
00295 blockSumErr += v;
00296 if( abs(block[i])>maxout) maxout=abs(block[i]);
00297 }
00298 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
00299 #if 0 // print different matrix pairs
00300 if(blockSumErr){
00301 printf("\n");
00302 for(i=0; i<64; i++){
00303 if((i&7)==0) printf("\n");
00304 printf("%4d ", block_org[i]);
00305 }
00306 for(i=0; i<64; i++){
00307 if((i&7)==0) printf("\n");
00308 printf("%4d ", block[i] - block1[i]);
00309 }
00310 }
00311 #endif
00312 }
00313 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
00314
00315 #if 1 // dump systematic errors
00316 for(i=0; i<64; i++){
00317 if(i%8==0) printf("\n");
00318 printf("%7d ", (int)sysErr[i]);
00319 }
00320 printf("\n");
00321 #endif
00322
00323 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00324 is_idct ? "IDCT" : "DCT",
00325 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
00326 #if 1 //Speed test
00327
00328 for(i=0;i<64;i++)
00329 block1[i] = 0;
00330 switch(test){
00331 case 0:
00332 for(i=0;i<64;i++)
00333 block1[i] = av_lfg_get(&prng) % 512 -256;
00334 if (is_idct){
00335 ff_ref_fdct(block1);
00336
00337 for(i=0;i<64;i++)
00338 block1[i]>>=3;
00339 }
00340 break;
00341 case 1:{
00342 case 2:
00343 block1[0] = av_lfg_get(&prng) % 512 -256;
00344 block1[1] = av_lfg_get(&prng) % 512 -256;
00345 block1[2] = av_lfg_get(&prng) % 512 -256;
00346 block1[3] = av_lfg_get(&prng) % 512 -256;
00347 }break;
00348 }
00349
00350 if (form == MMX_PERM) {
00351 for(i=0;i<64;i++)
00352 block[idct_mmx_perm[i]] = block1[i];
00353 } else if(form == MMX_SIMPLE_PERM) {
00354 for(i=0;i<64;i++)
00355 block[idct_simple_mmx_perm[i]] = block1[i];
00356 } else {
00357 for(i=0; i<64; i++)
00358 block[i]= block1[i];
00359 }
00360
00361 ti = gettime();
00362 it1 = 0;
00363 do {
00364 for(it=0;it<NB_ITS_SPEED;it++) {
00365 for(i=0; i<64; i++)
00366 block[i]= block1[i];
00367
00368
00369 fdct_func(block);
00370 }
00371 it1 += NB_ITS_SPEED;
00372 ti1 = gettime() - ti;
00373 } while (ti1 < 1000000);
00374 mmx_emms();
00375
00376 printf("%s %s: %0.1f kdct/s\n",
00377 is_idct ? "IDCT" : "DCT",
00378 name, (double)it1 * 1000.0 / (double)ti1);
00379 #endif
00380 }
00381
/*
 * 8x8 pixel output buffers used by idct248_error():
 *   img_dest  - written by the idct248_put implementation under test
 *   img_dest1 - written by idct248_ref()
 */
00382 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
00383 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00384
00385 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00386 {
00387 static int init;
00388 static double c8[8][8];
00389 static double c4[4][4];
00390 double block1[64], block2[64], block3[64];
00391 double s, sum, v;
00392 int i, j, k;
00393
00394 if (!init) {
00395 init = 1;
00396
00397 for(i=0;i<8;i++) {
00398 sum = 0;
00399 for(j=0;j<8;j++) {
00400 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
00401 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00402 sum += c8[i][j] * c8[i][j];
00403 }
00404 }
00405
00406 for(i=0;i<4;i++) {
00407 sum = 0;
00408 for(j=0;j<4;j++) {
00409 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
00410 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00411 sum += c4[i][j] * c4[i][j];
00412 }
00413 }
00414 }
00415
00416
00417 s = 0.5 * sqrt(2.0);
00418 for(i=0;i<4;i++) {
00419 for(j=0;j<8;j++) {
00420 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
00421 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
00422 }
00423 }
00424
00425
00426 for(i=0;i<8;i++) {
00427 for(j=0;j<8;j++) {
00428 sum = 0;
00429 for(k=0;k<8;k++)
00430 sum += c8[k][j] * block1[8*i+k];
00431 block2[8*i+j] = sum;
00432 }
00433 }
00434
00435
00436 for(i=0;i<8;i++) {
00437 for(j=0;j<4;j++) {
00438
00439 sum = 0;
00440 for(k=0;k<4;k++)
00441 sum += c4[k][j] * block2[8*(2*k)+i];
00442 block3[8*(2*j)+i] = sum;
00443
00444
00445 sum = 0;
00446 for(k=0;k<4;k++)
00447 sum += c4[k][j] * block2[8*(2*k+1)+i];
00448 block3[8*(2*j+1)+i] = sum;
00449 }
00450 }
00451
00452
00453 for(i=0;i<8;i++) {
00454 for(j=0;j<8;j++) {
00455 v = block3[8*i+j];
00456 if (v < 0)
00457 v = 0;
00458 else if (v > 255)
00459 v = 255;
00460 dest[i * linesize + j] = (int)rint(v);
00461 }
00462 }
00463 }
00464
00465 static void idct248_error(const char *name,
00466 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
00467 {
00468 int it, i, it1, ti, ti1, err_max, v;
00469
00470 AVLFG prng;
00471
00472 av_lfg_init(&prng, 1);
00473
00474
00475
00476 err_max = 0;
00477 for(it=0;it<NB_ITS;it++) {
00478
00479
00480 for(i=0;i<64;i++)
00481 block1[i] = av_lfg_get(&prng) % 256 - 128;
00482 block1[0] += 1024;
00483
00484 for(i=0; i<64; i++)
00485 block[i]= block1[i];
00486 idct248_ref(img_dest1, 8, block);
00487
00488 for(i=0; i<64; i++)
00489 block[i]= block1[i];
00490 idct248_put(img_dest, 8, block);
00491
00492 for(i=0;i<64;i++) {
00493 v = abs((int)img_dest[i] - (int)img_dest1[i]);
00494 if (v == 255)
00495 printf("%d %d\n", img_dest[i], img_dest1[i]);
00496 if (v > err_max)
00497 err_max = v;
00498 }
00499 #if 0
00500 printf("ref=\n");
00501 for(i=0;i<8;i++) {
00502 int j;
00503 for(j=0;j<8;j++) {
00504 printf(" %3d", img_dest1[i*8+j]);
00505 }
00506 printf("\n");
00507 }
00508
00509 printf("out=\n");
00510 for(i=0;i<8;i++) {
00511 int j;
00512 for(j=0;j<8;j++) {
00513 printf(" %3d", img_dest[i*8+j]);
00514 }
00515 printf("\n");
00516 }
00517 #endif
00518 }
00519 printf("%s %s: err_inf=%d\n",
00520 1 ? "IDCT248" : "DCT248",
00521 name, err_max);
00522
00523 ti = gettime();
00524 it1 = 0;
00525 do {
00526 for(it=0;it<NB_ITS_SPEED;it++) {
00527 for(i=0; i<64; i++)
00528 block[i]= block1[i];
00529
00530
00531 idct248_put(img_dest, 8, block);
00532 }
00533 it1 += NB_ITS_SPEED;
00534 ti1 = gettime() - ti;
00535 } while (ti1 < 1000000);
00536 mmx_emms();
00537
00538 printf("%s %s: %0.1f kdct/s\n",
00539 1 ? "IDCT248" : "DCT248",
00540 name, (double)it1 * 1000.0 / (double)ti1);
00541 }
00542
/*
 * Print the command-line usage.  The synopsis lists every option that
 * main() passes to getopt() ("ih4") plus the optional test number.
 */
static void help(void)
{
    printf("dct-test [-i] [-4] [-h] [<test-number>]\n"
           "test-number 0 -> test with random matrixes\n"
           "            1 -> test with random sparse matrixes (default)\n"
           "            2 -> do 3. test from mpeg4 std\n"
           "-i          test IDCT implementations\n"
           "-4          test IDCT248 implementations\n"
           "-h          show this help\n");
}
00552
00553 int main(int argc, char **argv)
00554 {
00555 int test_idct = 0, test_248_dct = 0;
00556 int c,i;
00557 int test=1;
00558 cpu_flags = av_get_cpu_flags();
00559
00560 ff_ref_dct_init();
00561 idct_mmx_init();
00562
00563 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
00564 for(i=0;i<MAX_NEG_CROP;i++) {
00565 cropTbl[i] = 0;
00566 cropTbl[i + MAX_NEG_CROP + 256] = 255;
00567 }
00568
00569 for(;;) {
00570 c = getopt(argc, argv, "ih4");
00571 if (c == -1)
00572 break;
00573 switch(c) {
00574 case 'i':
00575 test_idct = 1;
00576 break;
00577 case '4':
00578 test_248_dct = 1;
00579 break;
00580 default :
00581 case 'h':
00582 help();
00583 return 0;
00584 }
00585 }
00586
00587 if(optind <argc) test= atoi(argv[optind]);
00588
00589 printf("ffmpeg DCT/IDCT test\n");
00590
00591 if (test_248_dct) {
00592 idct248_error("SIMPLE-C", ff_simple_idct248_put);
00593 } else {
00594 for (i=0;algos[i].name;i++)
00595 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
00596 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
00597 }
00598 }
00599 return 0;
00600 }