• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/ppc/dsputil_ppc.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (c) 2002 Brian Foley
00003  * Copyright (c) 2002 Dieter Shirley
00004  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00023 #include "libavutil/cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "dsputil_altivec.h"
00026 
00027 /* ***** WARNING ***** WARNING ***** WARNING ***** */
00028 /*
00029 clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with a
00030 cache line size not equal to 32 bytes.
00031 Fortunately all processor used by Apple up to at least the 7450 (aka second
00032 generation G4) use 32 bytes cache line.
00033 This is due to the use of the 'dcbz' instruction. It simply clear to zero a
00034 single cache line, so you need to know the cache line size to use it !
00035 It's absurd, but it's fast...
00036 
00037 update 24/06/2003 : Apple released yesterday the G5, with a PPC970. cache line
00038 size: 128 bytes. Oups.
00039 The semantic of dcbz was changed, it always clear 32 bytes. so the function
00040 below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl,
00041 which is defined to clear a cache line (as dcbz before). So we still can
00042 distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required.
00043 
00044 see <http://developer.apple.com/technotes/tn/tn2087.html>
00045 and <http://developer.apple.com/technotes/tn/tn2086.html>
00046 */
00047 static void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
00048 {
00049     register int misal = ((unsigned long)blocks & 0x00000010);
00050     register int i = 0;
00051     if (misal) {
00052         ((unsigned long*)blocks)[0] = 0L;
00053         ((unsigned long*)blocks)[1] = 0L;
00054         ((unsigned long*)blocks)[2] = 0L;
00055         ((unsigned long*)blocks)[3] = 0L;
00056         i += 16;
00057     }
00058     for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) {
00059         __asm__ volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
00060     }
00061     if (misal) {
00062         ((unsigned long*)blocks)[188] = 0L;
00063         ((unsigned long*)blocks)[189] = 0L;
00064         ((unsigned long*)blocks)[190] = 0L;
00065         ((unsigned long*)blocks)[191] = 0L;
00066         i += 16;
00067     }
00068 }
00069 
00070 /* same as above, when dcbzl clear a whole 128B cache line
00071    i.e. the PPC970 aka G5 */
00072 #if HAVE_DCBZL
00073 static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
00074 {
00075     register int misal = ((unsigned long)blocks & 0x0000007f);
00076     register int i = 0;
00077     if (misal) {
00078         // we could probably also optimize this case,
00079         // but there's not much point as the machines
00080         // aren't available yet (2003-06-26)
00081         memset(blocks, 0, sizeof(DCTELEM)*6*64);
00082     }
00083     else
00084         for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
00085             __asm__ volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
00086         }
00087 }
00088 #else
00089 static void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
00090 {
00091     memset(blocks, 0, sizeof(DCTELEM)*6*64);
00092 }
00093 #endif
00094 
#if HAVE_DCBZL
/**
 * Report how many bytes a single dcbzl instruction sets to zero.
 *
 * A 1024-byte scratch buffer is filled with 0xFF, one dcbzl is issued on the
 * address in the middle of it, and the zero bytes in the whole buffer are
 * then counted.
 *
 * Update 24/06/2003: dcbz was replaced by dcbzl to get the intended effect
 * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the
 * assembler knows about dcbzl.
 *
 * @return the number of zeroed bytes, or 0 if the scratch buffer could not
 *         be allocated
 */
static long check_dcbzl_effect(void)
{
    register char *fakedata = av_malloc(1024);
    register char *fakedata_middle;
    register long zero = 0;
    register long i = 0;
    long count = 0;

    if (!fakedata) {
        return 0L;
    }

    fakedata_middle = (fakedata + 512);

    memset(fakedata, 0xFF, 1024);

    /* Below, the constraint "b" seems to mean "address base register" in
       gcc-3.3 / RS/6000 speak; it seems to avoid using r0.
       The "memory" clobber tells the compiler that the instruction modifies
       the buffer, so the counting loop below must re-read it instead of
       assuming it still holds the 0xFF fill. */
    __asm__ volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");

    for (i = 0; i < 1024 ; i ++) {
        if (fakedata[i] == (char)0)
            count++;
    }

    av_free(fakedata);

    return count;
}
#else
/* Built without dcbzl support: report that nothing is cleared. */
static long check_dcbzl_effect(void)
{
  return 0;
}
#endif
00136 
/**
 * Issue one dcbt instruction per row, starting at mem and advancing by
 * stride bytes each time.
 *
 * The first dcbt is always issued; the loop then ends once the decremented
 * row counter reaches zero.
 *
 * @param mem    address of the first row
 * @param stride byte distance between consecutive rows
 * @param h      number of rows
 */
static void prefetch_ppc(void *mem, int stride, int h)
{
    const uint8_t *row = mem;

    for (;;) {
        __asm__ volatile ("dcbt 0,%0" : : "r" (row));
        row += stride;
        if (--h == 0) {
            break;
        }
    }
}
00145 
00146 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
00147 {
00148     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
00149     int mm_flags = av_get_cpu_flags();
00150 
00151     if (avctx->dsp_mask) {
00152         if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
00153             mm_flags |= (avctx->dsp_mask & 0xffff);
00154         else
00155             mm_flags &= ~(avctx->dsp_mask & 0xffff);
00156     }
00157 
00158     // Common optimizations whether AltiVec is available or not
00159     c->prefetch = prefetch_ppc;
00160     if (!high_bit_depth) {
00161     switch (check_dcbzl_effect()) {
00162         case 32:
00163             c->clear_blocks = clear_blocks_dcbz32_ppc;
00164             break;
00165         case 128:
00166             c->clear_blocks = clear_blocks_dcbz128_ppc;
00167             break;
00168         default:
00169             break;
00170     }
00171     }
00172 
00173 #if HAVE_ALTIVEC
00174     if(CONFIG_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
00175 
00176     if (mm_flags & AV_CPU_FLAG_ALTIVEC) {
00177         dsputil_init_altivec(c, avctx);
00178         float_init_altivec(c, avctx);
00179         int_init_altivec(c, avctx);
00180         c->gmc1 = gmc1_altivec;
00181 
00182 #if CONFIG_ENCODERS
00183         if (avctx->bits_per_raw_sample <= 8 &&
00184             (avctx->dct_algo == FF_DCT_AUTO ||
00185              avctx->dct_algo == FF_DCT_ALTIVEC)) {
00186             c->fdct = fdct_altivec;
00187         }
00188 #endif //CONFIG_ENCODERS
00189 
00190         if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
00191             if ((avctx->idct_algo == FF_IDCT_AUTO) ||
00192                 (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
00193                 c->idct_put = idct_put_altivec;
00194                 c->idct_add = idct_add_altivec;
00195                 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
00196             }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
00197                      avctx->idct_algo==FF_IDCT_VP3){
00198                 c->idct_put = ff_vp3_idct_put_altivec;
00199                 c->idct_add = ff_vp3_idct_add_altivec;
00200                 c->idct     = ff_vp3_idct_altivec;
00201                 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
00202             }
00203         }
00204 
00205     }
00206 #endif /* HAVE_ALTIVEC */
00207 }
Generated on Fri Feb 1 2013 14:34:41 for FFmpeg by doxygen 1.7.1