> From: "Mathew Hendry" <[EMAIL PROTECTED]>
> Date: Sun, 28 Nov 1999 05:17:06 -0000
>
> Now with Takehiro's table changes; in iteration_init:
> 

wow, that was fast!

I just committed the new routines to sourceforge.net.  They 
make a *big* difference, at least under gcc.  

I also took Takehiro's suggestion and made the new quantization used
only with "-h".  And since the old quantize_xrpow routine (renamed
quantize_xrpow_ISO) will still be used, I added Acy Stapp's full-blown
MSVC asm routine.  He reported this was also a significant speedup,
but I don't have MSVC and so I haven't tested it.  

quantize_xrpow and quantize_xrpow_ISO are below:






/*
 * Float-to-int conversion helpers used by quantize_xrpow().
 *
 *   XRPOW_FTOI(src, dest)  converts the floating-point value src to an
 *                          int and stores it in dest.
 *   QUANTFAC(rx)           looks up the adjustment added before the final
 *                          conversion, indexed by rx.
 *
 * The two x86 variants convert with the x87 fistp instruction and read the
 * adj43asm table; the portable variant converts with a C cast (truncation
 * toward zero) and reads the adj43 table.  Both tables are defined outside
 * this section.
 *
 * NOTE(review): fistp rounds according to the current FPU rounding mode
 * rather than truncating, which is presumably why the asm variants use a
 * separate table -- confirm against the code that fills adj43asm.
 */
#if defined(__GNUC__) && defined(__i386__)
/* GCC on 32-bit x86: src is passed on top of the x87 stack ("t"), the
   instruction pops it (hence the "st" clobber) and writes dest in memory. */
#  define QUANTFAC(rx)  adj43asm[rx]
#  define XRPOW_FTOI(src, dest) \
     asm ("fistpl %0 " : "=m"(dest) : "t"(src) : "st")
#elif defined (_MSC_VER)
/* MSVC: src is copied to a local FLOAT8 and the result is taken from a local
   int so that the inline asm only ever names simple variables; src and dest
   are each evaluated exactly once. */
#  define QUANTFAC(rx)  adj43asm[rx]
#  define XRPOW_FTOI(src, dest) do { \
     FLOAT8 src_ = (src); \
     int dest_; \
     { \
       __asm fld src_ \
       __asm fistp dest_ \
     } \
     (dest) = dest_; \
   } while (0)
#else
/* Portable fallback: plain C cast, which truncates toward zero. */
#  define QUANTFAC(rx)  adj43[rx]
#  define XRPOW_FTOI(src,dest) ((dest) = (int)(src))
#endif

/*********************************************************************
 * quantize_xrpow: nonlinear quantization of xr
 *
 * A more accurate rule than the ISO formula.  xr is quantized to ix,
 * but what should be matched is ix^4/3 against x^4/3.  Simply taking
 * the nearest int would make ix as close as possible to xr, which is
 * a different criterion.
 *
 *   xr        576 input values, read in order
 *   ix        576 quantized output values, written in order
 *   cod_info  supplies quantizerStepSize, block_type and, for
 *             SHORT_TYPE blocks, subblock_gain[0..2]
 *
 * Each value is scaled by the step, then converted in two stages via
 * the XRPOW_FTOI / QUANTFAC macros: the first conversion (of x - .5)
 * yields a table index, the second (of x plus the table entry) yields
 * the output.  In SHORT_TYPE blocks the values are taken in groups of
 * three and each member of a group uses its own step.
 *
 * From Segher Boessenkool <[EMAIL PROTECTED]>  11/1999
 * ASM optimization from Mathew Hendry <[EMAIL PROTECTED]> 11/1999
 * and Takehiro Tominaga <[EMAIL PROTECTED]>
 *********************************************************************/
void quantize_xrpow( FLOAT8 xr[576], int ix[576], gr_info *cod_info )
{
  /* quantize on xr^(3/4) instead of xr */
  FLOAT8 step_size, long_step, scaled;
  FLOAT8 win_step[3];
  int table_idx;
  int pos, win;

  step_size = cod_info->quantizerStepSize;
  long_step = pow(2.0, step_size * -0.1875);

  if (cod_info->block_type != SHORT_TYPE) {
    /* one common step for all 576 values */
    for (pos = 0; pos < 576; pos++) {
      scaled = long_step * xr[pos];
      /* ix[pos] = (int)( scaled + adj43[(int)scaled] ); */
      XRPOW_FTOI(scaled - .5, table_idx);
      XRPOW_FTOI(scaled + QUANTFAC(table_idx), ix[pos]);
    }
    return;
  }

  /* short blocks: the base step is scaled by each subblock gain */
  for (win = 0; win < 3; win++) {
    win_step[win] =
      long_step * pow(2.0, 1.5 * (FLOAT8) cod_info->subblock_gain[win]);
  }

  /* 192 groups of three consecutive values, one step per group member */
  for (pos = 0; pos < 576; pos += 3) {
    for (win = 0; win < 3; win++) {
      scaled = win_step[win] * xr[pos + win];
      /* ix[pos + win] = (int)( scaled + adj43[(int)scaled] ); */
      XRPOW_FTOI(scaled - .5, table_idx);
      XRPOW_FTOI(scaled + QUANTFAC(table_idx), ix[pos + win]);
    }
  }
}





/*
 * MSVC-only helpers for quantize_xrpow_ISO().
 *
 * MSVC_XRPOW_ASM is switched on unconditionally for MSVC builds; the
 * function below tests it to choose its inline-asm code paths.
 *
 * MSVC_FTOL(src, dest) converts the floating-point value src to an int with
 * the x87 fistp instruction and stores it in dest.  src and dest are each
 * evaluated exactly once; the local copies exist so the inline asm only
 * names simple variables.
 */
#ifdef _MSC_VER
#define MSVC_XRPOW_ASM
#ifdef MSVC_XRPOW_ASM
# define MSVC_FTOL(src, dest) do { \
    FLOAT8 src_ = (src); \
    int dest_; \
    { \
      __asm fld src_ \
      __asm fistp dest_ \
    } \
    (dest) = dest_; \
  } while (0)
# endif
#endif
/*********************************************************************
 * quantize_xrpow_ISO: nonlinear quantization of xr with the ISO
 * rounding rule
 *
 *     ix = (int)( istep * xr + 0.4054 )
 *
 *   xr        576 input values, read in order
 *   ix        576 quantized output values, written in order
 *   cod_info  supplies quantizerStepSize, block_type and, for
 *             SHORT_TYPE blocks, subblock_gain[0..2]
 *
 * In SHORT_TYPE blocks the values are taken in groups of three and
 * each member of a group uses its own step (istep0/istep1/istep2).
 *
 * The x86 inline-asm paths convert with fistp and subtract 0.0946
 * instead of adding 0.4054.
 * NOTE(review): that matches the C formula only while the FPU rounds
 * to nearest (0.5 - 0.0946 = 0.4054) -- confirm nothing changes the
 * rounding mode.
 *********************************************************************/
void quantize_xrpow_ISO( FLOAT8 xr[576], int ix[576], gr_info *cod_info )
{
  /* quantize on xr^(3/4) instead of xr */
  register int j;
  FLOAT8 quantizerStepSize;
  FLOAT8 istep_l,istep0,istep1,istep2;
#if defined(__GNUC__) && defined(__i386__) 
#elif defined(MSVC_XRPOW_ASM)
  /* rounding offset read by the inline-asm loop below; this declaration
     was missing, so the MSVC build could not compile */
  FLOAT8 temp0;
#else
  FLOAT8 compareval0;
#endif

  quantizerStepSize = cod_info->quantizerStepSize;
  
  istep_l = pow ( 2.0, quantizerStepSize * -0.1875 );
  
  if (cod_info->block_type==SHORT_TYPE)
    {
      /* short blocks: the base step is scaled by each subblock gain */
      istep0 = istep_l * pow(2.0,1.5* (FLOAT8) cod_info->subblock_gain[0]);
      istep1 = istep_l * pow(2.0,1.5* (FLOAT8) cod_info->subblock_gain[1]);
      istep2 = istep_l * pow(2.0,1.5* (FLOAT8) cod_info->subblock_gain[2]);
      for (j=192;j>0;j--) 
        {
#if defined(__GNUC__) && defined(__i386__)
          asm ("fistpl %0 ": "=m"(*(ix++)): "t"(istep0*(*(xr++)) - 0.0946): "st");
          asm ("fistpl %0 ": "=m"(*(ix++)): "t"(istep1*(*(xr++)) - 0.0946): "st");
          asm ("fistpl %0 ": "=m"(*(ix++)): "t"(istep2*(*(xr++)) - 0.0946): "st");
#elif defined(MSVC_XRPOW_ASM)
          MSVC_FTOL((istep0*(*(xr++)) - 0.0946), *(ix++));
          MSVC_FTOL((istep1*(*(xr++)) - 0.0946), *(ix++));
          MSVC_FTOL((istep2*(*(xr++)) - 0.0946), *(ix++));
#else
          *(ix++) = (int)( istep0*(*(xr++))  + 0.4054);
          *(ix++) = (int)( istep1*(*(xr++))  + 0.4054);
          *(ix++) = (int)( istep2*(*(xr++))  + 0.4054);
#endif
        }
    }
  else
    {
#if defined(__GNUC__) && defined(__i386__) 
      for (j=576;j>0;j--) 
          asm ("fistpl %0 ": "=m"(*(ix++)): "t"(istep_l*(*(xr++)) - 0.0946): "st");
#elif defined(MSVC_XRPOW_ASM)
      /* Hand-written loop handling four values per iteration (576/4
         iterations).  temp0 and istep_l stay on the x87 stack for the whole
         loop and are popped at the end; xr and ix are written back advanced
         past the processed data.
         NOTE(review): the qword loads and the 32-byte input stride assume
         FLOAT8 is an 8-byte double -- confirm.
         Equivalent C:
           for (j=576;j>0;j--)
             MSVC_FTOL((istep_l*(*(xr++)) - 0.0946), *(ix++));          */
      temp0 = 0.0946;
      _asm {
          mov ecx, 576/4;
          fld qword ptr [temp0];
          fld qword ptr [istep_l];
          mov eax, dword ptr [xr];
          mov ebx, dword ptr [ix];
      } loop0: _asm {
          fld qword ptr [eax];
          fld qword ptr [eax+8];
          fld qword ptr [eax+16];
          fld qword ptr [eax+24];
          add eax, 32;
          fld st(4)
          fmul st(4), st(0);
          fmul st(3), st(0);
          fmul st(2), st(0);
          fmulp st(1), st(0);
          fsub st(0), st(5);
          fistp dword ptr [ebx+12];
          fsub st(0), st(4);
          fistp dword ptr [ebx+8];
          fsub st(0), st(3);
          fistp dword ptr [ebx+4];
          fsub st(0), st(2);
          fistp dword ptr [ebx];
          add ebx, 16;
          loop loop0;
          mov dword ptr [xr], eax;
          mov dword ptr [ix], ebx;
          fstp st(0);
          fstp st(0);
      }
#else
      /* any xr below this threshold quantizes to 0 */
      compareval0 = (1.0 - 0.4054)/istep_l;
      /* Depending on architecture, it may be worth calculating a few more
         comparevals,
         eg.  compareval1 = (2.0 - 0.4054)/istep_l;
              .. and then after the first compare do this ...
              if compareval1>*xr then ix = 1;
         On a pentium166, it's only worth doing the one compare (as done
         here), as the second compare becomes more expensive than just
         calculating the value.  Architectures with slow FP operations may
         want to add some more comparevals.  Try it and send your diffs.
         Statistically speaking
         73% of all xr*istep_l values give ix=0
         16% will give 1
         4%  will give 2
      */
      for (j=576;j>0;j--) 
        {
          if (compareval0 > *xr) {
            *(ix++) = 0;
            xr++;
          } else
            *(ix++) = (int)( istep_l*(*(xr++))  + 0.4054);
        }
#endif
    }
}




--
MP3 ENCODER mailing list ( http://geek.rcc.se/mp3encoder/ )

Reply via email to