Andrew Hallendorff's SSE accelerated Equalization.

2014-01-16 17:55:35 +00:00 · 2014-01-16 17:55:35 +00:00 · d847ee7162
parent 3f59126949
commit d847ee7162
13 changed files with 3161 additions and 307 deletions
--- a/src/Experimental.h
+++ b/src/Experimental.h
@ -30,6 +30,10 @@
 #ifndef __EXPERIMENTAL__
 #define __EXPERIMENTAL__
 // ACH 08 Jan 2014
 // EQ accelerated code
 //#define EXPERIMENTAL_EQ_SSE_THREADED
 // LLL, 09 Nov 2013:
 // Allow all WASAPI devices, not just loopback
 #define EXPERIMENTAL_FULL_WASAPI
--- a/src/RealFFTf.cpp
+++ b/src/RealFFTf.cpp
@ -1,54 +1,59 @@
 /*
- *     Program: REALFFTF.C
+*     Program: REALFFTF.C
- *      Author: Philip Van Baren
+*      Author: Philip Van Baren
- *        Date: 2 September 1993
+*        Date: 2 September 1993
- *
+*
- * Description: These routines perform an FFT on real data to get a conjugate-symmetric
+* Description: These routines perform an FFT on real data to get a conjugate-symmetric
- *              output, and an inverse FFT on conjugate-symmetric input to get a real
+*              output, and an inverse FFT on conjugate-symmetric input to get a real
- *              output sequence.
+*              output sequence.
- *
+*
- *              This code is for floating point data.
+*              This code is for floating point data.
- *
+*
- *              Modified 8/19/1998 by Philip Van Baren
+*              Modified 8/19/1998 by Philip Van Baren
- *                 - made the InitializeFFT and EndFFT routines take a structure
+*                 - made the InitializeFFT and EndFFT routines take a structure
- *                   holding the length and pointers to the BitReversed and SinTable
+*                   holding the length and pointers to the BitReversed and SinTable
- *                   tables.
+*                   tables.
- *              Modified 5/23/2009 by Philip Van Baren
+*              Modified 5/23/2009 by Philip Van Baren
- *                 - Added GetFFT and ReleaseFFT routines to retain common SinTable
+*                 - Added GetFFT and ReleaseFFT routines to retain common SinTable
- *                   and BitReversed tables so they don't need to be reallocated
+*                   and BitReversed tables so they don't need to be reallocated
- *                   and recomputed on every call.
+*                   and recomputed on every call.
- *                 - Added Reorder* functions to undo the bit-reversal
+*                 - Added Reorder* functions to undo the bit-reversal
- *
+*
- *  Copyright (C) 2009  Philip VanBaren
+*  Copyright (C) 2009  Philip VanBaren
- *
+*
- *  This program is free software; you can redistribute it and/or modify
+*  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
+*  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
+*  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
+*  (at your option) any later version.
- *
+*
- *  This program is distributed in the hope that it will be useful,
+*  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
+*  GNU General Public License for more details.
- *
+*
- *  You should have received a copy of the GNU General Public License
+*  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
+*  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
+*/
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include "Experimental.h"
 #include "RealFFTf.h"
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
 #include "RealFFTf48x.h"
 #endif
 #ifndef M_PI
 #define	M_PI		3.14159265358979323846  /* pi */
 #endif
 /*
- *  Initialize the Sine table and Twiddle pointers (bit-reversed pointers)
+*  Initialize the Sine table and Twiddle pointers (bit-reversed pointers)
- *  for the FFT routine.
+*  for the FFT routine.
- */
+*/
 HFFT InitializeFFT(int fftlen)
 {
   int i;
@ -62,10 +67,10 @@ HFFT InitializeFFT(int fftlen)
      exit(8);
   }
   /*
-    *  FFT size is only half the number of data points
+   *  FFT size is only half the number of data points
-    *  The full FFT output can be reconstructed from this FFT's output.
+   *  The full FFT output can be reconstructed from this FFT's output.
-    *  (This optimization can be made since the data is real.)
+   *  (This optimization can be made since the data is real.)
-    */
+   */
   h->Points = fftlen/2;
   if((h->SinTable=(fft_type *)malloc(2*h->Points*sizeof(fft_type)))==NULL)
@ -73,6 +78,7 @@ HFFT InitializeFFT(int fftlen)
      fprintf(stderr,"Error allocating memory for Sine table.\n");
      exit(8);
   }
   if((h->BitReversed=(int *)malloc(h->Points*sizeof(int)))==NULL)
   {
      fprintf(stderr,"Error allocating memory for BitReversed.\n");
@ -86,19 +92,28 @@ HFFT InitializeFFT(int fftlen)
         temp=(temp >> 1) + (i&mask ? h->Points : 0);
      h->BitReversed[i]=temp;
-   }
+   } 
   for(i=0;i<h->Points;i++)
   {
      h->SinTable[h->BitReversed[i]  ]=(fft_type)-sin(2*M_PI*i/(2*h->Points));
      h->SinTable[h->BitReversed[i]+1]=(fft_type)-cos(2*M_PI*i/(2*h->Points));
   }
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
   // new SSE FFT routines work on live data
   for(i=0;i<32;i++)
      if((1<<i)&fftlen)
         h->pow2Bits=i;
   InitializeFFT1x(fftlen);
 #endif
   return h;
 }
 /*
- *  Free up the memory allotted for Sin table and Twiddle Pointers
+*  Free up the memory allotted for Sin table and Twiddle Pointers
- */
+*/
 void EndFFT(HFFT h)
 {
   if(h->Points>0) {
@ -157,23 +172,23 @@ void CleanupFFT()
 }
 /*
- *  Forward FFT routine.  Must call InitializeFFT(fftlen) first!
+*  Forward FFT routine.  Must call InitializeFFT(fftlen) first!
- *
+*
- *  Note: Output is BIT-REVERSED! so you must use the BitReversed to
+*  Note: Output is BIT-REVERSED! so you must use the BitReversed to
- *        get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
+*        get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
- *                                  Imag_i = buffer[ h->BitReversed[i]+1 ] )
+*                                  Imag_i = buffer[ h->BitReversed[i]+1 ] )
- *        Input is in normal order.
+*        Input is in normal order.
- *
+*
- * Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
+* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
- * - this can be done because both values will always be real only
+* - this can be done because both values will always be real only
- * - this allows us to not have to allocate an extra complex value for the Fs/2 bin
+* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
- *
+*
- *  Note: The scaling on this is done according to the standard FFT definition,
+*  Note: The scaling on this is done according to the standard FFT definition,
- *        so a unit amplitude DC signal will output an amplitude of (N)
+*        so a unit amplitude DC signal will output an amplitude of (N)
- *        (Older revisions would progressively scale the input, so the output
+*        (Older revisions would progressively scale the input, so the output
- *        values would be similar in amplitude to the input values, which is
+*        values would be similar in amplitude to the input values, which is
- *        good when using fixed point arithmetic)
+*        good when using fixed point arithmetic)
- */
+*/
 void RealFFTf(fft_type *buffer,HFFT h)
 {
   fft_type *A,*B;
@ -186,12 +201,12 @@ void RealFFTf(fft_type *buffer,HFFT h)
   int ButterfliesPerGroup=h->Points/2;
   /*
-    *  Butterfly:
+   *  Butterfly:
-    *     Ain-----Aout
+   *     Ain-----Aout
-    *         \ /
+   *         \ /
-    *         / \
+   *         / \
-    *     Bin-----Bout
+   *     Bin-----Bout
-    */
+   */
   endptr1=buffer+h->Points*2;
@ -258,24 +273,24 @@ void RealFFTf(fft_type *buffer,HFFT h)
 /* Description: This routine performs an inverse FFT to real data.
- *              This code is for floating point data.
+*              This code is for floating point data.
- *
+*
- *  Note: Output is BIT-REVERSED! so you must use the BitReversed to
+*  Note: Output is BIT-REVERSED! so you must use the BitReversed to
- *        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
+*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
- *                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
+*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
- *        Input is in normal order, interleaved (real,imaginary) complex data
+*        Input is in normal order, interleaved (real,imaginary) complex data
- *        You must call InitializeFFT(fftlen) first to initialize some buffers!
+*        You must call InitializeFFT(fftlen) first to initialize some buffers!
- *
+*
- * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
+* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
- * - this can be done because both values will always be real only
+* - this can be done because both values will always be real only
- * - this allows us to not have to allocate an extra complex value for the Fs/2 bin
+* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
- *
+*
- *  Note: The scaling on this is done according to the standard FFT definition,
+*  Note: The scaling on this is done according to the standard FFT definition,
- *        so a unit amplitude DC signal will output an amplitude of (N)
+*        so a unit amplitude DC signal will output an amplitude of (N)
- *        (Older revisions would progressively scale the input, so the output
+*        (Older revisions would progressively scale the input, so the output
- *        values would be similar in amplitude to the input values, which is
+*        values would be similar in amplitude to the input values, which is
- *        good when using fixed point arithmetic)
+*        good when using fixed point arithmetic)
- */
+*/
 void InverseRealFFTf(fft_type *buffer,HFFT h)
 {
   fft_type *A,*B;
@ -323,12 +338,12 @@ void InverseRealFFTf(fft_type *buffer,HFFT h)
   buffer[1]=v2;
   /*
-    *  Butterfly:
+   *  Butterfly:
-    *     Ain-----Aout
+   *     Ain-----Aout
-    *         \ /
+   *         \ /
-    *         / \
+   *         / \
-    *     Bin-----Bout
+   *     Bin-----Bout
-    */
+   */
   endptr1=buffer+h->Points*2;
--- a/src/RealFFTf.h
+++ b/src/RealFFTf.h
@ -6,6 +6,9 @@ typedef struct FFTParamType {
   int *BitReversed;
   fft_type *SinTable;
   int Points;
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
   int pow2Bits;
 #endif
 } FFTParam;
 #define HFFT FFTParam *
--- a/src/RealFFTf48x.cpp
+++ b/src/RealFFTf48x.cpp
@ -0,0 +1,754 @@
 /**********************************************************************
   Audacity: A Digital Audio Editor
   RealFFTf48x.cpp
   Philip Van Baren
   Andrew Hallendorff (SSE Mods)
 *******************************************************************//**
   \file RealFFTf48x.cpp
   \brief Real FFT with SSE acceleration.
 *//****************************************************************/
 /*
 *     Program: REALFFTF.C
 *      Author: Philip Van Baren
 *        Date: 2 September 1993
 *
 * Description: These routines perform an FFT on real data to get a conjugate-symmetric
 *              output, and an inverse FFT on conjugate-symmetric input to get a real
 *              output sequence.
 *
 *              This code is for floating point data.
 *
 *              Modified 8/19/1998 by Philip Van Baren
 *                 - made the InitializeFFT and EndFFT routines take a structure
 *                   holding the length and pointers to the BitReversed and SinTable
 *                   tables.
 *              Modified 5/23/2009 by Philip Van Baren
 *                 - Added GetFFT and ReleaseFFT routines to retain common SinTable
 *                   and BitReversed tables so they don't need to be reallocated
 *                   and recomputed on every call.
 *                 - Added Reorder* functions to undo the bit-reversal
 *
 *  Copyright (C) 2009  Philip VanBaren
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 #include "Experimental.h"
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
 #ifndef USE_SSE2
 #define	USE_SSE2
 #endif
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include "RealFFTf.h"
 #ifdef __WXMSW__
 #pragma warning(disable:4305)
 #else
 #endif 
 #include "SseMathFuncs.h"
 #include <xmmintrin.h>
 #ifndef M_PI
 #define	M_PI		3.14159265358979323846  /* pi */
 #endif
 // Byte-wise bit-reversal lookup table: entry i holds i with its 8 bits in
 // reverse order.  Filled by InitializeFFT1x(), read by SmallReverseBits().
 unsigned char smallReverseBitsTable[256];
 // Last mask handed to TableUsage().
 int tableMask=0;
 // True: the 4x routines take bit-reversed indices from h->BitReversed.
 // False: they compute them with SmallReverseBits().
 bool useBitReverseTable=false;
 // True: the 4x routines take twiddle factors from h->SinTable.
 // False: they compute them in batches of four with sincos_ps().
 bool useSinCosTable=false;
 /*
 *  Choose between table lookups and on-the-fly computation in the 4x routines.
 *  Bit 0 of iMask (value 1) sets useBitReverseTable, bit 1 (value 2) sets
 *  useSinCosTable; the raw mask is remembered in tableMask.
 */
 void TableUsage(int iMask)
 {
    tableMask=iMask;
    useBitReverseTable=((iMask & 1)!=0);
    useSinCosTable=((iMask&2)!=0);
 }
 /*
 *  Reverse the order of the low numberBits bits of bits, using the byte
 *  table smallReverseBitsTable (which must already have been filled in).
 *
 *  Only the two low-order bytes of bits take part.  The original author
 *  requires numberBits to be between 9 and 16; outside 8..16 one of the
 *  shift counts below becomes negative.
 *
 *  The bytes are extracted with mask/shift arithmetic rather than by reading
 *  the int through an unsigned char pointer, so the result no longer depends
 *  on the byte order of the host.
 */
 int SmallReverseBits(int bits, int numberBits)
 {
    const int lowByte=bits & 0xFF;
    const int highByte=(bits >> 8) & 0xFF;
    // The reversed low byte becomes the top of the result, the reversed
    // high byte supplies the remaining (numberBits-8) low bits.
    return (smallReverseBitsTable[lowByte]<<(numberBits-8))
         + (smallReverseBitsTable[highByte]>>(16-numberBits));
 }
 /*
 *  Build the 256-entry byte bit-reversal table used by SmallReverseBits().
 *
 *  The fftlen argument is unused and no FFT handle is created here: the
 *  function always returns NULL.  The table is rebuilt on every call
 *  (the original author notes that this work should eventually move elsewhere).
 */
 HFFT InitializeFFT1x(int WXUNUSED( fftlen ) )
 {
    for(int value=0;value<256;value++) {
       // Mirror the byte: bit k of value becomes bit (7-k) of the result.
       unsigned char mirrored=0;
       for(int bit=0;bit<8;bit++) {
          if(value&(1<<bit))
             mirrored|=(unsigned char)(128>>bit);
       }
       smallReverseBitsTable[value]=mirrored;
    }
    return NULL;
 }
 /*
 *  Release an FFT handle: free its BitReversed and SinTable arrays (when
 *  h->Points is positive) and then the handle structure itself.
 *
 *  A NULL handle is ignored.  InitializeFFT1x() in this file always returns
 *  NULL, so passing its result straight to this function must not crash.
 */
 void EndFFT1x(HFFT h)
 {
    if(h==NULL)
       return;
    if(h->Points>0) {
       free(h->BitReversed);
       free(h->SinTable);
    }
    // Mark the handle as empty before releasing it.
    h->Points=0;
    free(h);
 }
 // Number of FFT handles that GetFFT1x() can keep cached at once.
 #define MAX_HFFT 10
 // Cached handles; a NULL entry is a free slot.
 static HFFT hFFTArray[MAX_HFFT] = { NULL };
 // Per-slot count of outstanding GetFFT1x() requests not yet matched by
 // ReleaseFFT1x(); CleanupFFT1x() only frees slots whose count is <= 0.
 static int nFFTLockCount[MAX_HFFT] = { 0 };
 /* Return a handle to FFT tables for the given length.
 *  Handles are cached in hFFTArray: a cached handle whose Points equals
 *  fftlen/2 is reused, otherwise the first free slot is filled by calling
 *  InitializeFFT(fftlen).  Each cached request bumps the slot's lock count.
 *  When every slot is occupied by another size, a fresh uncached handle is
 *  returned instead. */
 HFFT GetFFT1x(int fftlen)
 {
    const int points = fftlen/2;
    int slot = 0;
    // Stop at the first free slot or the first slot already holding this size.
    while(slot<MAX_HFFT) {
       if(hFFTArray[slot] == NULL || hFFTArray[slot]->Points == points)
          break;
       slot++;
    }
    if(slot>=MAX_HFFT) {
       // All buffers used, so fall back to allocating a new set of tables
       return InitializeFFT(fftlen);
    }
    if(hFFTArray[slot] == NULL) {
       hFFTArray[slot] = InitializeFFT(fftlen);
       nFFTLockCount[slot] = 0;
    }
    nFFTLockCount[slot]++;
    return hFFTArray[slot];
 }
 /* Give back a handle obtained from GetFFT1x().
 *  A cached handle only has its lock count decremented (the tables stay
 *  allocated until CleanupFFT1x()); a handle that is not in the cache is
 *  destroyed immediately with EndFFT(). */
 void ReleaseFFT1x(HFFT hFFT)
 {
    for(int slot=0; slot<MAX_HFFT; slot++) {
       if(hFFTArray[slot] == hFFT) {
          nFFTLockCount[slot]--;
          return;
       }
    }
    EndFFT(hFFT);
 }
 /* Free every cached FFT table set that nobody currently holds,
 *  i.e. each occupied slot whose lock count is zero or less. */
 void CleanupFFT1x()
 {
    for(int slot=0; slot<MAX_HFFT; slot++) {
       if(hFFTArray[slot] == NULL)
          continue;
       if(nFFTLockCount[slot] > 0)
          continue;
       EndFFT(hFFTArray[slot]);
       hFFTArray[slot] = NULL;
    }
 }
 /*
 *  Forward FFT routine.  Must call InitializeFFT(fftlen) first!
 *
 *  buffer: 2*h->Points values, transformed in place.
 *  h:      handle supplying Points, SinTable and BitReversed.
 *
 *  Note: Output is BIT-REVERSED! so you must use the BitReversed to
 *        get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
 *                                  Imag_i = buffer[ h->BitReversed[i]+1 ] )
 *        Input is in normal order.
 *
 * Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
 * - this can be done because both values will always be real only
 * - this allows us to not have to allocate an extra complex value for the Fs/2 bin
 *
 *  Note: The scaling on this is done according to the standard FFT definition,
 *        so a unit amplitude DC signal will output an amplitude of (N)
 *        (Older revisions would progressively scale the input, so the output
 *        values would be similar in amplitude to the input values, which is
 *        good when using fixed point arithmetic)
 */
 void RealFFTf1x(fft_type *buffer,HFFT h)
 {
    fft_type *A,*B;
    fft_type *sptr;
    fft_type *endptr1,*endptr2;
    int *br1,*br2;
    fft_type HRplus,HRminus,HIplus,HIminus;
    fft_type v1,v2,sin,cos;
    int ButterfliesPerGroup=h->Points/2;
    /*
    *  Butterfly:
    *     Ain-----Aout
    *         \ /
    *         / \
    *     Bin-----Bout
    */
    // One past the last element of the working buffer.
    endptr1=buffer+h->Points*2;
    // One pass per stage; the group size halves after every pass.
    while(ButterfliesPerGroup>0)
    {
       A=buffer;
       B=buffer+ButterfliesPerGroup*2;
       // Twiddle factors are read from the start of SinTable on every pass.
       sptr=h->SinTable;
       while(A<endptr1)
       {
          // One (sin, cos) pair is shared by all butterflies of this group.
          sin=*sptr;
          cos=*(sptr+1);
          endptr2=B;
          while(A<endptr2)
          {
             // v1/v2 are computed from B before B is overwritten; each
             // statement below then updates B first and derives the new A
             // from the freshly written B, so the order must not change.
             v1=*B*cos + *(B+1)*sin;
             v2=*B*sin - *(B+1)*cos;
             *B=(*A+v1);
             *(A++)=*(B++)-2*v1;
             *B=(*A-v2);
             *(A++)=*(B++)+2*v2;
          }
          // Skip over the B half just processed to reach the next group.
          A=B;
          B+=ButterfliesPerGroup*2;
          sptr+=2;
       }
       ButterfliesPerGroup >>= 1;
    }
    /* Massage output to get the output for a real input sequence. */
    // br1 walks up from index 1 and br2 walks down from index Points-1,
    // pairing the bins that are combined with each other.
    br1=h->BitReversed+1;
    br2=h->BitReversed+h->Points-1;
    while(br1<br2)
    {
       sin=h->SinTable[*br1];
       cos=h->SinTable[*br1+1];
       A=buffer+*br1;
       B=buffer+*br2;
       // HRminus = A-B and HRplus = A+B (likewise for the imaginary parts).
       HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
       HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
       v1 = (sin*HRminus - cos*HIplus);
       v2 = (cos*HRminus + sin*HIplus);
       *A = (HRplus  + v1) * (fft_type)0.5;
       *B = *A - v1;
       *(A+1) = (HIminus + v2) * (fft_type)0.5;
       *(B+1) = *(A+1) - HIminus;
       br1++;
       br2--;
    }
    /* Handle the center bin (just need a conjugate) */
    A=buffer+*br1+1;
    *A=-*A;
    /* Handle DC bin separately - and ignore the Fs/2 bin
    buffer[0]+=buffer[1];
    buffer[1]=(fft_type)0;*/
    /* Handle DC and Fs/2 bins separately */
    /* Put the Fs/2 value into the imaginary part of the DC bin */
    v1=buffer[0]-buffer[1];
    buffer[0]+=buffer[1];
    buffer[1]=v1;
 }
 /* Description: This routine performs an inverse FFT to real data.
 *              This code is for floating point data.
 *
 *  buffer: 2*h->Points values, transformed in place.
 *  h:      handle supplying Points, SinTable and BitReversed.
 *
 *  Note: Output is BIT-REVERSED! so you must use the BitReversed to
 *        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
 *                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
 *        Input is in normal order, interleaved (real,imaginary) complex data
 *        You must call InitializeFFT(fftlen) first to initialize some buffers!
 *
 * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
 * - this can be done because both values will always be real only
 * - this allows us to not have to allocate an extra complex value for the Fs/2 bin
 *
 *  Note: The scaling on this is done according to the standard FFT definition,
 *        so a unit amplitude DC signal will output an amplitude of (N)
 *        (Older revisions would progressively scale the input, so the output
 *        values would be similar in amplitude to the input values, which is
 *        good when using fixed point arithmetic)
 */
 void InverseRealFFTf1x(fft_type *buffer,HFFT h)
 {
    fft_type *A,*B;
    fft_type *sptr;
    fft_type *endptr1,*endptr2;
    int *br1;
    fft_type HRplus,HRminus,HIplus,HIminus;
    fft_type v1,v2,sin,cos;
    int ButterfliesPerGroup=h->Points/2;
    /* Massage input to get the input for a real output sequence. */
    // A walks up from the second complex value, B walks down from the last
    // one; br1 selects the twiddle factor that belongs to A's position.
    A=buffer+2;
    B=buffer+h->Points*2-2;
    br1=h->BitReversed+1;
    while(A<B)
    {
       sin=h->SinTable[*br1];
       cos=h->SinTable[*br1+1];
       // HRminus = A-B and HRplus = A+B (likewise for the imaginary parts).
       HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
       HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
       v1 = (sin*HRminus + cos*HIplus);
       v2 = (cos*HRminus - sin*HIplus);
       *A = (HRplus  + v1) * (fft_type)0.5;
       *B = *A - v1;
       *(A+1) = (HIminus - v2) * (fft_type)0.5;
       *(B+1) = *(A+1) - HIminus;
       A+=2;
       B-=2;
       br1++;
    }
    /* Handle center bin (just need conjugate) */
    *(A+1)=-*(A+1);
    /* Handle DC bin separately - this ignores any Fs/2 component
    buffer[1]=buffer[0]=buffer[0]/2;*/
    /* Handle DC and Fs/2 bins specially */
    /* The DC bin is passed in as the real part of the DC complex value */
    /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
    /* (v1+v2) = buffer[0] == the DC component */
    /* (v1-v2) = buffer[1] == the Fs/2 component */
    v1=0.5f*(buffer[0]+buffer[1]);
    v2=0.5f*(buffer[0]-buffer[1]);
    buffer[0]=v1;
    buffer[1]=v2;
    /*
    *  Butterfly:
    *     Ain-----Aout
    *         \ /
    *         / \
    *     Bin-----Bout
    */
    // One past the last element of the working buffer.
    endptr1=buffer+h->Points*2;
    // One pass per stage; the group size halves after every pass.
    while(ButterfliesPerGroup>0)
    {
       A=buffer;
       B=buffer+ButterfliesPerGroup*2;
       // Twiddle factors are read from the start of SinTable on every pass.
       sptr=h->SinTable;
       while(A<endptr1)
       {
          // One (sin, cos) pair is shared by all butterflies of this group.
          sin=*(sptr++);
          cos=*(sptr++);
          endptr2=B;
          while(A<endptr2)
          {
             // v1/v2 are computed from B before B is overwritten; the new A
             // is derived from the freshly written B, so order matters.
             // Each butterfly output is halved here.
             v1=*B*cos - *(B+1)*sin;
             v2=*B*sin + *(B+1)*cos;
             *B=(*A+v1)*(fft_type)0.5;
             *(A++)=*(B++)-v1;
             *B=(*A+v2)*(fft_type)0.5;
             *(A++)=*(B++)-v2;
          }
          // Skip over the B half just processed to reach the next group.
          A=B;
          B+=ButterfliesPerGroup*2;
       }
       ButterfliesPerGroup >>= 1;
    }
 }
 /*
 *  Undo the bit-reversed ordering of a forward transform result.
 *  Bins 1..Points-1 are copied from buffer into RealOut/ImagOut in natural
 *  order; bin 0 receives the DC value (buffer[0]) and bin Points the Fs/2
 *  value (buffer[1]), both with a zero imaginary part.  RealOut and ImagOut
 *  are therefore written at indices 0..Points inclusive.
 */
 void ReorderToFreq1x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
 {
    const int points=hFFT->Points;
    const int *bitReversed=hFFT->BitReversed;
    for(int bin=1;bin<points;++bin) {
       const fft_type *src=buffer+bitReversed[bin];
       RealOut[bin]=src[0];
       ImagOut[bin]=src[1];
    }
    // DC component
    RealOut[0] = buffer[0];
    ImagOut[0] = 0;
    // Fs/2 component
    RealOut[points] = buffer[1];
    ImagOut[points] = 0;
 }
 /*
 *  Undo the bit-reversed ordering of an inverse transform result.
 *  For each i in 0..Points-1 the pair stored at buffer[BitReversed[i]] is
 *  copied to TimeOut[2*i] and TimeOut[2*i+1].
 */
 void ReorderToTime1x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut)
 {
    const int points=hFFT->Points;
    const int *bitReversed=hFFT->BitReversed;
    fft_type *dst=TimeOut;
    for(int pair=0;pair<points;++pair) {
       const fft_type *src=buffer+bitReversed[pair];
       dst[0]=src[0];
       dst[1]=src[1];
       dst+=2;
    }
 }
 /*
 *  Forward real FFT, four transforms at a time (SSE counterpart of RealFFTf1x).
 *
 *  buffer is reinterpreted as an array of __m128; element k plays the role
 *  of buffer[k] in the scalar routine, so each of the four float lanes carries
 *  one independent transform and every twiddle factor is broadcast to all
 *  lanes.  2*h->Points __m128 elements are transformed in place.
 *  NOTE(review): access through __m128* presumably requires buffer to be
 *  16-byte aligned - confirm at the call sites.
 *
 *  Behaviour is steered by the flags set through TableUsage():
 *   - useSinCosTable: twiddles come from h->SinTable; otherwise they are
 *     computed four at a time with sincos_ps() and consumed one per iteration.
 *   - useBitReverseTable: indices come from h->BitReversed; otherwise they
 *     are computed with SmallReverseBits() using h->pow2Bits.
 *
 *  Output layout matches the scalar routine: bit-reversed order, element 0
 *  holds the DC bin and element 1 the Fs/2 bin.
 */
 void RealFFTf4x(fft_type *buffer,HFFT h)
 {
    __m128 *localBuffer=(__m128 *)buffer;
    __m128 *A,*B;
    fft_type *sptr;
    __m128 *endptr1,*endptr2;
    int br1Index, br2Index;
    int br1Value, br2Value;
    __m128 HRplus,HRminus,HIplus,HIminus;
    __m128 v1,v2,sin,cos;
    fft_type iToRad=2*M_PI/(2*h->Points);
    int ButterfliesPerGroup=h->Points/2;
    // Batch of four computed sin/cos values.  It is filled when the batch
    // index is 0 and read on the following three iterations, so it must live
    // outside the loop bodies: declared inside, each iteration would get a
    // fresh object and read indeterminate values.
    v4sfu sin4_2, cos4_2;
    /*
    *  Butterfly:
    *     Ain-----Aout
    *         \ /
    *         / \
    *     Bin-----Bout
    */
    endptr1=&localBuffer[h->Points*2];
    while(ButterfliesPerGroup>0)
    {
       A=localBuffer;
       B=&localBuffer[ButterfliesPerGroup*2];
       sptr=h->SinTable;
       int iSinCosIndex=0;
       // Position inside the current batch; 0 means "compute a new batch".
       int iSinCosCalIndex=0;
       while(A<endptr1)
       {
          if(useSinCosTable) {
             sin=_mm_set1_ps(*(sptr++));
             cos=_mm_set1_ps(*(sptr++));
          } else {
             if(!iSinCosCalIndex)
             {
                // Angles for this group and the next three, in bit-reversed order.
                v4sfu vx;
                for(int i=0;i<4;i++)
                   vx.m128_f32[i]=((fft_type )SmallReverseBits(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
                sincos_ps(&vx, &sin4_2, &cos4_2);
                sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
                cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
                iSinCosCalIndex++;
             } else {
                sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
                cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
                if(iSinCosCalIndex==3)
                   iSinCosCalIndex=0;
                else
                   iSinCosCalIndex++;
             }
             iSinCosIndex++;
          }
          endptr2=B;
          while(A<endptr2)
          {
             // Same update order as the scalar routine: B is rewritten first
             // and the new A is derived from the freshly written B.
             v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
             v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
             *B=_mm_add_ps( *A, v1);
             __m128 temp128 = _mm_set1_ps( 2.0);
             *(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
             *B=_mm_sub_ps(*A,v2);
             *(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
          }
          A=B;
          B=&B[ButterfliesPerGroup*2];
       }
       ButterfliesPerGroup >>= 1;
    }
    /* Massage output to get the output for a real input sequence. */
    br1Index=1; // h->BitReversed+1;
    br2Index=h->Points-1;   //h->BitReversed+h->Points-1;
    int iSinCosCalIndex=0;
    while(br1Index<br2Index)
    {
       if(useBitReverseTable) {
          br1Value=h->BitReversed[br1Index];
          br2Value=h->BitReversed[br2Index];
       } else {
          br1Value=SmallReverseBits(br1Index,h->pow2Bits);
          br2Value=SmallReverseBits(br2Index,h->pow2Bits);
       }
       if(useSinCosTable) {
          sin=_mm_set1_ps(h->SinTable[br1Value]);
          cos=_mm_set1_ps(h->SinTable[br1Value+1]);
       } else {
          if(!iSinCosCalIndex)
          {
             // Angles for br1Index .. br1Index+3, used on this and the next
             // three iterations.
             v4sfu vx;
             for(int i=0;i<4;i++)
                vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
             sincos_ps(&vx, &sin4_2, &cos4_2);
             sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
             cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
             iSinCosCalIndex++;
          } else {
             sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
             cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
             if(iSinCosCalIndex==3)
                iSinCosCalIndex=0;
             else
                iSinCosCalIndex++;
          }
       }
       A=&localBuffer[br1Value];
       B=&localBuffer[br2Value];
       __m128 temp128 = _mm_set1_ps( 2.0);
       HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
       HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
       v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
       v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
       temp128 = _mm_set1_ps( 0.5);
       *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
       *B = _mm_sub_ps(*A, v1);
       *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
       *(B+1) = _mm_sub_ps(*(A+1), HIminus);
       br1Index++;
       br2Index--;
    }
    /* Handle the center bin (just need a conjugate) */
    if(useBitReverseTable)
       A=&localBuffer[h->BitReversed[br1Index]+1];
    else
       A=&localBuffer[SmallReverseBits(br1Index,h->pow2Bits)+1];
    // negate sse style: flip the sign bit of every lane
    *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
    /* Handle DC and Fs/2 bins separately */
    /* Put the Fs/2 value into the imaginary part of the DC bin */
    v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
    localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
    localBuffer[1]=v1;
 }
 /* Description: This routine performs an inverse FFT to real data on four
 *              transforms at a time (SSE counterpart of InverseRealFFTf1x).
 *              This code is for floating point data.
 *
 *  buffer is reinterpreted as an array of __m128; element k plays the role
 *  of buffer[k] in the scalar routine, so each of the four float lanes carries
 *  one independent transform.  2*h->Points __m128 elements are transformed
 *  in place.
 *  NOTE(review): access through __m128* presumably requires buffer to be
 *  16-byte aligned - confirm at the call sites.
 *
 *  The flags set through TableUsage() select table lookups (h->SinTable,
 *  h->BitReversed) or on-the-fly computation (sincos_ps(), SmallReverseBits()).
 *
 *  Note: Output is BIT-REVERSED! so you must use the BitReversed to
 *        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
 *                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
 *        Input is in normal order, interleaved (real,imaginary) complex data
 *        You must call InitializeFFT(fftlen) first to initialize some buffers!
 *
 * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
 * - this can be done because both values will always be real only
 * - this allows us to not have to allocate an extra complex value for the Fs/2 bin
 *
 *  Note: The scaling on this is done according to the standard FFT definition,
 *        so a unit amplitude DC signal will output an amplitude of (N)
 *        (Older revisions would progressively scale the input, so the output
 *        values would be similar in amplitude to the input values, which is
 *        good when using fixed point arithmetic)
 */
 void InverseRealFFTf4x(fft_type *buffer,HFFT h)
 {
    __m128 *localBuffer=(__m128 *)buffer;
    __m128 *A,*B;
    fft_type *sptr;
    __m128 *endptr1,*endptr2;
    int br1Index, br1Value;
    __m128 HRplus,HRminus,HIplus,HIminus;
    __m128 v1,v2,sin,cos;
    fft_type iToRad=2*M_PI/(2*h->Points);
    int ButterfliesPerGroup=h->Points/2;
    // Batch of four computed sin/cos values.  It is filled when the batch
    // index is 0 and read on the following three iterations, so it must live
    // outside the loop bodies: declared inside, each iteration would get a
    // fresh object and read indeterminate values.
    v4sfu sin4_2, cos4_2;
    /* Massage input to get the input for a real output sequence. */
    A=localBuffer+2;
    B=localBuffer+h->Points*2-2;
    br1Index=1; //h->BitReversed+1;
    int iSinCosCalIndex=0;
    while(A<B)
    {
       if(useBitReverseTable) {
          br1Value=h->BitReversed[br1Index];
       } else {
          br1Value=SmallReverseBits(br1Index,h->pow2Bits);
       }
       if(useSinCosTable) {
          sin=_mm_set1_ps(h->SinTable[br1Value]);
          cos=_mm_set1_ps(h->SinTable[br1Value+1]);
       } else {
          if(!iSinCosCalIndex)
          {
             // Angles for br1Index .. br1Index+3, used on this and the next
             // three iterations.
             v4sfu vx;
             for(int i=0;i<4;i++)
                vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
             sincos_ps(&vx, &sin4_2, &cos4_2);
             sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
             cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
             iSinCosCalIndex++;
          } else {
             sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
             cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
             if(iSinCosCalIndex==3)
                iSinCosCalIndex=0;
             else
                iSinCosCalIndex++;
          }
       }
       HRminus = _mm_sub_ps(*A,  *B);
       HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B,  _mm_set1_ps(2.0)));
       HIminus = _mm_sub_ps( *(A+1), *(B+1));
       HIplus = _mm_add_ps(HIminus,  _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
       v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
       v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
       *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
       *B = _mm_sub_ps(*A, v1);
       *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
       *(B+1) = _mm_sub_ps(*(A+1), HIminus);
       A=&A[2];
       B=&B[-2];
       br1Index++;
    }
    /* Handle center bin (just need conjugate) */
    // negate sse style: flip the sign bit of every lane
    *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
    /* Handle DC bin separately - this ignores any Fs/2 component
    buffer[1]=buffer[0]=buffer[0]/2;*/
    /* Handle DC and Fs/2 bins specially */
    /* The DC bin is passed in as the real part of the DC complex value */
    /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
    /* (v1+v2) = buffer[0] == the DC component */
    /* (v1-v2) = buffer[1] == the Fs/2 component */
    v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
    v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
    localBuffer[0]=v1;
    localBuffer[1]=v2;
    /*
    *  Butterfly:
    *     Ain-----Aout
    *         \ /
    *         / \
    *     Bin-----Bout
    */
    endptr1=localBuffer+h->Points*2;
    while(ButterfliesPerGroup>0)
    {
       A=localBuffer;
       B=localBuffer+ButterfliesPerGroup*2;
       sptr=h->SinTable;
       int iSinCosIndex=0;
       // Position inside the current batch; 0 means "compute a new batch".
       // (Deliberately restarts at 0 for every pass.)
       int iSinCosCalIndex=0;
       while(A<endptr1)
       {
          if(useSinCosTable) {
             sin=_mm_set1_ps(*(sptr++));
             cos=_mm_set1_ps(*(sptr++));
          } else {
             if(!iSinCosCalIndex)
             {
                // Angles for this group and the next three, in bit-reversed order.
                v4sfu vx;
                for(int i=0;i<4;i++)
                   vx.m128_f32[i]=((fft_type )SmallReverseBits(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
                sincos_ps(&vx, &sin4_2, &cos4_2);
                sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
                cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
                iSinCosCalIndex++;
             } else {
                sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
                cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
                if(iSinCosCalIndex==3)
                   iSinCosCalIndex=0;
                else
                   iSinCosCalIndex++;
             }
             iSinCosIndex++;
          }
          endptr2=B;
          while(A<endptr2)
          {
             // Same update order as the scalar routine: B is rewritten first
             // and the new A is derived from the freshly written B.
             v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
             v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
             *B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
             *(A++)=_mm_sub_ps(*(B++), v1);
             *B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
             *(A++)=_mm_sub_ps(*(B++),v2);
          }
          A=B;
          B=&B[ButterfliesPerGroup*2];
       }
       ButterfliesPerGroup >>= 1;
    }
 }
 /*
 *  Four-lane version of ReorderToFreq1x: all three buffers are treated as
 *  arrays of __m128.  Bins 1..Points-1 are copied from their bit-reversed
 *  positions (looked up in hFFT->BitReversed or computed with
 *  SmallReverseBits(), depending on useBitReverseTable); bin 0 receives the DC
 *  value and bin Points the Fs/2 value, both with zero imaginary parts.
 */
 void ReorderToFreq4x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
 {
    __m128 *vecIn=(__m128 *)buffer;
    __m128 *vecReal=(__m128 *)RealOut;
    __m128 *vecImag=(__m128 *)ImagOut;
    for(int bin=1;bin<hFFT->Points;++bin) {
       const int srcIndex=useBitReverseTable
          ? hFFT->BitReversed[bin]
          : SmallReverseBits(bin,hFFT->pow2Bits);
       vecReal[bin]=vecIn[srcIndex];
       vecImag[bin]=vecIn[srcIndex+1];
    }
    // DC component
    vecReal[0] = vecIn[0];
    vecImag[0] = _mm_set1_ps(0.0);
    // Fs/2 component
    vecReal[hFFT->Points] = vecIn[1];
    vecImag[hFFT->Points] = _mm_set1_ps(0.0);
 }
 /*
 *  Four-lane version of ReorderToTime1x: both buffers are treated as arrays
 *  of __m128.  For each i in 0..Points-1 the pair found at the bit-reversed
 *  position of i is copied to output elements 2*i and 2*i+1.  The position
 *  is looked up in hFFT->BitReversed or computed with SmallReverseBits(),
 *  depending on useBitReverseTable.
 */
 void ReorderToTime4x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut)
 {
    __m128 *vecIn=(__m128 *)buffer;
    __m128 *vecOut=(__m128 *)TimeOut;
    for(int pair=0;pair<hFFT->Points;++pair) {
       const int srcIndex=useBitReverseTable
          ? hFFT->BitReversed[pair]
          : SmallReverseBits(pair,hFFT->pow2Bits);
       vecOut[pair*2  ]=vecIn[srcIndex  ];
       vecOut[pair*2+1]=vecIn[srcIndex+1];
    }
 }
 #endif
--- a/src/RealFFTf48x.h
+++ b/src/RealFFTf48x.h
@ -0,0 +1,23 @@
 /* Declarations for the "1x" and "4x" real-FFT routines.
    NOTE: HFFT is not declared in this header; it must already be visible
    where this header is included (Equalization48x.cpp includes RealFFTf.h
    immediately before it -- presumably that is where HFFT comes from). */
 #ifndef __realfftf48x_h
 #define __realfftf48x_h
 /* Sample type used by every routine below. */
 #define fft_type float
 /* 1x routines: FFT handle management, transforms and reordering. */
 HFFT InitializeFFT1x(int);
 void EndFFT1x(HFFT);
 HFFT GetFFT1x(int);
 void ReleaseFFT1x(HFFT);
 void CleanupFFT1x();
 void RealFFTf1x(fft_type *,HFFT);
 void InverseRealFFTf1x(fft_type *,HFFT);
 void ReorderToTime1x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut);
 void ReorderToFreq1x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut);
 /* Used by the 4x reorder routines to compute a bit-reversed index when the
    bit-reverse table is not in use. */
 int SmallReverseBits(int bits, int numberBits);
 /* 4x routines: the reorder functions access their fft_type buffers as
    arrays of __m128 (four floats per element). */
 void RealFFTf4x(fft_type *,HFFT);
 void InverseRealFFTf4x(fft_type *,HFFT);
 void ReorderToTime4x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut);
 void ReorderToFreq4x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut);
 void TableUsage(int iMask);
 #endif
--- a/src/SseMathFuncs.cpp
+++ b/src/SseMathFuncs.cpp
@ -0,0 +1,698 @@
 /**********************************************************************
   Audacity: A Digital Audio Editor
   SseMathFuncs.cpp
   Stephen Moshier (wrote original C version, The Cephes Library)
   Julien Pommier (converted to use SSE)
   Andrew Hallendorff (adapted for Audacity)
 *******************************************************************//**
   \file SseMathFuncs.cpp
   \brief SSE maths functions (for FFTs)
 *//****************************************************************/
 #include "SseMathFuncs.h"
 /* JKC: The trig functions use Taylor's series on the range 0 to Pi/4,
 * computing both Sin and Cos, and using one or the other (in the
 * solo functions), or both in the sincos function, which is the more
 * useful one for our FFTs.
 * The constants minus_cephes_DP1 to minus_cephes_DP3 are used in the
 * angle reduction modulo function.
 * 4 sincos are done at a time.
 * If we wanted to do just sin or just cos, we could speed things up
 * by queuing up the Sines and Cosines into batches of 4 separately.*/
 #ifndef USE_SSE2 //sry this is all sse2 now
 #define USE_SSE2
 #endif
 /* declare some SSE constants -- why can't I figure a better way to do that? */
 /* Each macro below defines a 16-byte aligned, file-local array of four
    identical values, so it can be loaded as one SSE register by casting the
    array to __m128 / __m128i:
      _PS_CONST       -> float _ps_<Name>[4]   (Val converted to float)
      _PI32_CONST     -> int   _pi32_<Name>[4]
      _PS_CONST_TYPE  -> Type  _ps_<Name>[4]   (used for raw bit patterns) */
 #define _PS_CONST(Name, Val)                                            \
   static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { (float)Val, (float)Val, (float)Val, (float)Val }
 #define _PI32_CONST(Name, Val)                                            \
   static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
 #define _PS_CONST_TYPE(Name, Type, Val)                                 \
   static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
 _PS_CONST(1  , 1.0f);
 _PS_CONST(0p5, 0.5f);
 /* the smallest non denormalized float number */
 _PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
 /* 0x7f800000 covers bits 23..30, i.e. the exponent field of an IEEE-754
    single (despite the "mant" in the name); log_ps uses the inverted mask
    to keep everything except those bits. */
 _PS_CONST_TYPE(mant_mask, int, 0x7f800000);
 _PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
 /* bit 31 only (sign), and everything except bit 31 (used for fabs) */
 _PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
 _PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
 /* small integer constants used by the trig octant logic and exponent bias */
 _PI32_CONST(1, 1);
 _PI32_CONST(inv1, ~1);
 _PI32_CONST(2, 2);
 _PI32_CONST(4, 4);
 _PI32_CONST(0x7f, 0x7f);
 /* constants for log_ps: the x < SQRTHF threshold, the polynomial
    coefficients p0..p8, and q1/q2 which scale the exponent term */
 _PS_CONST(cephes_SQRTHF, 0.707106781186547524);
 _PS_CONST(cephes_log_p0, 7.0376836292E-2);
 _PS_CONST(cephes_log_p1, - 1.1514610310E-1);
 _PS_CONST(cephes_log_p2, 1.1676998740E-1);
 _PS_CONST(cephes_log_p3, - 1.2420140846E-1);
 _PS_CONST(cephes_log_p4, + 1.4249322787E-1);
 _PS_CONST(cephes_log_p5, - 1.6668057665E-1);
 _PS_CONST(cephes_log_p6, + 2.0000714765E-1);
 _PS_CONST(cephes_log_p7, - 2.4999993993E-1);
 _PS_CONST(cephes_log_p8, + 3.3333331174E-1);
 _PS_CONST(cephes_log_q1, -2.12194440e-4);
 _PS_CONST(cephes_log_q2, 0.693359375);
 /* MMX fallback helpers.  This section is only compiled when USE_SSE2 is
    NOT defined; since this file defines USE_SSE2 unconditionally above,
    it is currently inactive. */
 #ifndef USE_SSE2
 /* Overlay of one 128-bit SSE register with two 64-bit MMX registers. */
 typedef union xmm_mm_union {
   __m128 xmm;
   __m64 mm[2];
 } xmm_mm_union;
 /* Split xmm_ into its low (mm0_) and high (mm1_) 64-bit halves. */
 #define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
   xmm_mm_union u; u.xmm = xmm_;                   \
   mm0_ = u.mm[0];                                 \
   mm1_ = u.mm[1];                                 \
 }
 /* Join mm0_ (low half) and mm1_ (high half) into xmm_. */
 #define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
   xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
 }
 #endif // USE_SSE2
 /* Natural logarithm computed for 4 simultaneous floats.
    xPtr points at the four inputs (read as one __m128); the four results
    are returned as an __m128.
    Lanes with x <= 0 (including 0 itself) return NaN: they are flagged in
    invalid_mask up front and get all bits set at the end.
    Inputs are clamped up to the smallest normalized positive float before
    the exponent is extracted.
    The #ifndef USE_SSE2 paths are the MMX variants of the same steps.
 */
 __m128 log_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
 #ifdef USE_SSE2
   __m128i emm0;
 #else
   __m64 mm0, mm1;
 #endif
   __m128 one = *(__m128*)_ps_1;
   /* all-ones in every lane where x <= 0; OR-ed into the result at the end */
   __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
   x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);  /* cut off denormalized stuff */
 #ifndef USE_SSE2
   /* part 1: x = frexpf(x, &e); */
   COPY_XMM_TO_MM(x, mm0, mm1);
   mm0 = _mm_srli_pi32(mm0, 23);
   mm1 = _mm_srli_pi32(mm1, 23);
 #else
   /* part 1: x = frexpf(x, &e); shift the biased exponent down to bit 0 */
   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
 #endif
   /* keep only the fractional part */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
   x = _mm_or_ps(x, *(__m128*)_ps_0p5);
 #ifndef USE_SSE2
   /* now e=mm0:mm1 contain the really base-2 exponent */
   mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
   __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
   _mm_empty(); /* bye bye mmx */
 #else
   /* remove the exponent bias (0x7f) and convert the exponent to float */
   emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f);
   __m128 e = _mm_cvtepi32_ps(emm0);
 #endif
   e = _mm_add_ps(e, one);
   /* part2: 
   if( x < SQRTHF ) {
   e -= 1;
   x = x + x - 1.0;
   } else { x = x - 1.0; }
   */
   /* branch-free form of the above: tmp is x where x < SQRTHF, else 0 */
   __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
   __m128 tmp = _mm_and_ps(x, mask);
   x = _mm_sub_ps(x, one);
   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
   x = _mm_add_ps(x, tmp);
   __m128 z = _mm_mul_ps(x,x);
   /* Horner evaluation of the polynomial with coefficients p0..p8 */
   __m128 y = *(__m128*)_ps_cephes_log_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
   y = _mm_mul_ps(y, x);
   y = _mm_mul_ps(y, z);
   /* add the exponent contribution in two parts (e*q1, then e*q2) */
   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
   y = _mm_add_ps(y, tmp);
   tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
   x = _mm_add_ps(x, y);
   x = _mm_add_ps(x, tmp);
   x = _mm_or_ps(x, invalid_mask); // lanes with arg <= 0 become NaN (all bits set)
   return x;
 }
 /* Constants for exp_ps: the input clamp range, log2(e), the two-part
    split C1/C2 used when subtracting n*log(2), and the polynomial
    coefficients p0..p5. */
 _PS_CONST(exp_hi,	88.3762626647949f);
 _PS_CONST(exp_lo,	-88.3762626647949f);
 _PS_CONST(cephes_LOG2EF, 1.44269504088896341);
 _PS_CONST(cephes_exp_C1, 0.693359375);
 _PS_CONST(cephes_exp_C2, -2.12194440e-4);
 _PS_CONST(cephes_exp_p0, 1.9875691500E-4);
 _PS_CONST(cephes_exp_p1, 1.3981999507E-3);
 _PS_CONST(cephes_exp_p2, 8.3334519073E-3);
 _PS_CONST(cephes_exp_p3, 4.1665795894E-2);
 _PS_CONST(cephes_exp_p4, 1.6666665459E-1);
 _PS_CONST(cephes_exp_p5, 5.0000001201E-1);
 /* Exponential of 4 simultaneous floats.
    xPtr points at the four inputs (read as one __m128); the four results
    are returned as an __m128.  Inputs are clamped to [exp_lo, exp_hi]
    before evaluation.
    The #ifndef USE_SSE2 paths are the MMX variants of the same steps. */
 __m128 exp_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
   __m128 tmp = _mm_setzero_ps(), fx;
 #ifdef USE_SSE2
   __m128i emm0;
 #else
   __m64 mm0, mm1;
 #endif
   __m128 one = *(__m128*)_ps_1;
   x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
   x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);
   /* express exp(x) as exp(g + n*log(2)) */
   fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
   fx = _mm_add_ps(fx, *(__m128*)_ps_0p5);
   /* how to perform a floorf with SSE: just below */
 #ifndef USE_SSE2
   /* step 1 : cast to int */
   tmp = _mm_movehl_ps(tmp, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(tmp);
   /* step 2 : cast back to float */
   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
 #else
   /* truncate to int, then convert back to float */
   emm0 = _mm_cvttps_epi32(fx);
   tmp  = _mm_cvtepi32_ps(emm0);
 #endif
   /* if greater, subtract 1 (truncation rounds toward zero, floor does not) */
   __m128 mask = _mm_cmpgt_ps(tmp, fx);    
   mask = _mm_and_ps(mask, one);
   fx = _mm_sub_ps(tmp, mask);
   /* x -= fx*C1 + fx*C2, done as two separate subtractions */
   tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
   __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
   x = _mm_sub_ps(x, tmp);
   x = _mm_sub_ps(x, z);
   z = _mm_mul_ps(x,x);
   /* Horner evaluation of the polynomial with coefficients p0..p5 */
   __m128 y = *(__m128*)_ps_cephes_exp_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, x);
   y = _mm_add_ps(y, one);
   /* build 2^n */
 #ifndef USE_SSE2
   z = _mm_movehl_ps(z, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(z);
   mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
   mm0 = _mm_slli_pi32(mm0, 23); 
   mm1 = _mm_slli_pi32(mm1, 23);
   __m128 pow2n; 
   COPY_MM_TO_XMM(mm0, mm1, pow2n);
   _mm_empty();
 #else
   /* add the exponent bias (0x7f) to n and shift it into the exponent field */
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
   __m128 pow2n = _mm_castsi128_ps(emm0);
 #endif
   y = _mm_mul_ps(y, pow2n);
   return y;
 }
 /* Constants shared by sin_ps, cos_ps and sincos_ps.
    minus_cephes_DP1..DP3 are the (already negated) terms of the range
    reduction x = ((x - y * DP1) - y * DP2) - y * DP3, which is why the
    functions add the products instead of subtracting them. */
 _PS_CONST(minus_cephes_DP1, -0.78515625);
 _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
 _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
 /* coefficients of the sine polynomial */
 _PS_CONST(sincof_p0, -1.9515295891E-4);
 _PS_CONST(sincof_p1,  8.3321608736E-3);
 _PS_CONST(sincof_p2, -1.6666654611E-1);
 /* coefficients of the cosine polynomial */
 _PS_CONST(coscof_p0,  2.443315711809948E-005);
 _PS_CONST(coscof_p1, -1.388731625493765E-003);
 _PS_CONST(coscof_p2,  4.166664568298827E-002);
 _PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
 /* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
 it runs also on old athlons XPs and the pentium III of your grand
 mother.
 The code is the exact rewriting of the cephes sinf function.
 Precision is excellent as long as x < 8192 (I did not bother to
 take into account the special handling they have for greater values
 -- it does not return garbage for arguments over 8192, though, but
 the extra precision is missing).
 Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
 surprising but correct result.
 Performance is also surprisingly good, 1.33 times faster than the
 macos vsinf SSE2 function, and 1.5 times faster than the
 __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
 too bad for an SSE1 function (with no special tuning) !
 However the latter libraries probably have a much better handling of NaN,
 Inf, denormalized and other special arguments..
 On my core 1 duo, the execution of this function takes approximately 95 cycles.
 From what I have observed on the experiments with Intel AMath lib, switching to an
 SSE2 version would improve the perf by only 10%.
 Since it is based on SSE intrinsics, it has to be compiled at -O2 to
 deliver full speed.
 */
 /* Sine of 4 simultaneous floats.
    xPtr points at the four inputs (read as one __m128); the four results
    are returned as an __m128.
    Outline: take |x| and remember the sign, compute the octant index j from
    x*4/Pi, reduce x using the three DP constants, evaluate both the cosine
    and the sine polynomial, pick one per lane with poly_mask, then apply
    the sign.
    The #else paths are the MMX variants of the same steps (inactive while
    USE_SSE2 is defined). */
 __m128 sin_ps(v4sfu *xPtr) { // any x
   __m128 x=*((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
 #ifdef USE_SSE2
   __m128i emm0, emm2;
 #else
   __m64 mm0, mm1, mm2, mm3;
 #endif
   sign_bit = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit = _mm_and_ps(sign_bit, *(__m128*)_ps_sign_mask);
   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
 #ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);
   /* get the swap sign flag: bit 2 of j moved up to bit 31 */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynom selection mask 
   there is one polynom for 0 <= x <= Pi/4
   and another one for Pi/4<x<=Pi/2
   Both branches will be computed.
   */
   /* the mask is all-ones in lanes where (j & 2) == 0 */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
 #else
   /* store the integer part of y in mm2:mm3 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);
   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
   y = _mm_cvtpi32x2_ps(mm2, mm3);
   /* get the swap sign flag */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   /* get the polynom selection mask */
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 swap_sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
   _mm_empty(); /* good-bye mmx */
 #endif
   /* The magic pass: "Extended precision modular arithmetic" 
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   /* (the constants are stored negated, hence the additions below) */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);
   /* Evaluate the first polynomial (cosine form, coscof coefficients) */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x,x);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);
   /* Evaluate the second polynomial (sine form, sincof coefficients) */
   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);
   /* select the correct result from the two polynomials:
      y2 where poly_mask is set, y elsewhere */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y,y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);
   return y;
 }
 /* Cosine of 4 simultaneous floats; almost the same as sin_ps.
    xPtr points at the four inputs (read as one __m128); the four results
    are returned as an __m128.
    Differences from sin_ps visible here: the input sign is discarded
    (only |x| is used), and 2 is subtracted from the octant index j before
    the sign flag and the polynomial selection mask are derived.
    The #else paths are the MMX variants of the same steps (inactive while
    USE_SSE2 is defined). */
 __m128 cos_ps(v4sfu *xPtr) { // any x
   __m128 x=*((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
 #ifdef USE_SSE2
   __m128i emm0, emm2;
 #else
   __m64 mm0, mm1, mm2, mm3;
 #endif
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
 #ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);
   emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);
   /* get the swap sign flag: set where bit 2 of (j-2) is clear */
   emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynom selection mask */
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
 #else
   /* store the integer part of y in mm2:mm3 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);
   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
   y = _mm_cvtpi32x2_ps(mm2, mm3);
   mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);
   /* get the swap sign flag in mm0:mm1 and the 
   polynom selection mask in mm2:mm3 */
   mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   _mm_empty(); /* good-bye mmx */
 #endif
   /* The magic pass: "Extended precision modular arithmetic" 
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   /* (the constants are stored negated, hence the additions below) */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);
   /* Evaluate the first polynomial (cosine form, coscof coefficients) */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x,x);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);
   /* Evaluate the second polynomial (sine form, sincof coefficients) */
   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);
   /* select the correct result from the two polynomials:
      y2 where poly_mask is set, y elsewhere */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y,y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);
   return y;
 }
 /* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
 it is almost as fast, and gives you a free cosine with your sine */
 /* Sine and cosine of 4 simultaneous floats.
    xptr points at the four inputs (read as one __m128); the four sines are
    stored through sptr and the four cosines through cptr (both written as
    one __m128).
    Both polynomials are evaluated once; per lane, poly_mask routes one of
    them to the sine output and the other one to the cosine output.
    The #else paths are the MMX variants of the same steps (inactive while
    USE_SSE2 is defined). */
 void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
   __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
 #ifdef USE_SSE2
   __m128i emm0, emm2, emm4;
 #else
   __m64 mm0, mm1, mm2, mm3, mm4, mm5;
 #endif
   sign_bit_sin = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
 #ifdef USE_SSE2
   /* store the integer part of y in emm2 */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);
   /* keep a copy of j for the cosine sign computed further down */
   emm4 = emm2;
   /* get the swap sign flag for the sine */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
   /* get the polynom selection mask for the sine*/
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
   __m128 poly_mask = _mm_castsi128_ps(emm2);
 #else
   /* store the integer part of y in mm2:mm3 */
   xmm3 = _mm_movehl_ps(xmm3, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm3);
   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
   y = _mm_cvtpi32x2_ps(mm2, mm3);
   mm4 = mm2;
   mm5 = mm3;
   /* get the swap sign flag for the sine */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   __m128 swap_sign_bit_sin;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
   /* get the polynom selection mask for the sine */
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 poly_mask;
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
 #endif
   /* The magic pass: "Extended precision modular arithmetic" 
   x = ((x - y * DP1) - y * DP2) - y * DP3; */
   /* (the constants are stored negated, hence the additions below) */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);
 #ifdef USE_SSE2
   /* get the sign flag for the cosine: set where bit 2 of (j-2) is clear */
   emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
   emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
   emm4 = _mm_slli_epi32(emm4, 29);
   __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
 #else
   /* get the sign flag for the cosine */
   mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
   mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
   mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
   mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
   mm4 = _mm_slli_pi32(mm4, 29);
   mm5 = _mm_slli_pi32(mm5, 29);
   __m128 sign_bit_cos;
   COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
   _mm_empty(); /* good-bye mmx */
 #endif
   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
   /* Evaluate the first polynomial (cosine form, coscof coefficients) */
   __m128 z = _mm_mul_ps(x,x);
   y = *(__m128*)_ps_coscof_p0;
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);
   /* Evaluate the second polynomial (sine form, sincof coefficients) */
   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);
   /* select the correct result from the two polynomials:
      ysin1/ysin2 hold the lanes that belong to the sine; subtracting them
      from y and y2 leaves the lanes that belong to the cosine */
   xmm3 = poly_mask;
   __m128 ysin2 = _mm_and_ps(xmm3, y2);
   __m128 ysin1 = _mm_andnot_ps(xmm3, y);
   y2 = _mm_sub_ps(y2,ysin2);
   y = _mm_sub_ps(y, ysin1);
   xmm1 = _mm_add_ps(ysin1,ysin2);
   xmm2 = _mm_add_ps(y,y2);
   /* update the sign */
   *s = _mm_xor_ps(xmm1, sign_bit_sin);
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
 }
--- a/src/SseMathFuncs.h
+++ b/src/SseMathFuncs.h
@ -0,0 +1,80 @@
 /* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
 Inspired by Intel Approximate Math library, and based on the
 corresponding algorithms of the cephes math library
 The default is to use the SSE1 version. If you define USE_SSE2,
 the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
 not expect any significant performance improvement with SSE2.
 */
 /* Copyright (C) 2007  Julien Pommier
 This software is provided 'as-is', without any express or implied
 warranty.  In no event will the authors be held liable for any damages
 arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it
 freely, subject to the following restrictions:
 1. The origin of this software must not be misrepresented; you must not
 claim that you wrote the original software. If you use this software
 in a product, an acknowledgment in the product documentation would be
 appreciated but is not required.
 2. Altered source versions must be plainly marked as such, and must not be
 misrepresented as being the original software.
 3. This notice may not be removed or altered from any source distribution.
 (this is the zlib license)
 */
 #ifndef SSE_MATHFUN
 #define SSE_MATHFUN
 #include <inttypes.h>
 #include <xmmintrin.h>
 /* yes I know, the top of this file is quite ugly */
 #ifdef _MSC_VER /* visual c++ */
 # define ALIGN16_BEG __declspec(align(16))
 # define ALIGN16_END 
 #else /* gcc or icc */
 # define ALIGN16_BEG
 # define ALIGN16_END __attribute__((aligned(16)))
 #endif
 /* __m128 is ugly to write */
 //typedef __m128 _v4sfu;  // vector of 4 float (sse1)
 #ifndef USE_SSE2 //sry this is all sse2 now
 #define USE_SSE2
 #endif
 #ifdef USE_SSE2
 # include <emmintrin.h>
 #else
 typedef __m64 v2si;   // vector of 2 int (mmx)
 #endif
 // !!! Andrew Hallendorff Warning changed call structure to make compatible with gcc
 // The math functions declared below take pointers to this union instead of
 // __m128 values. It is a 16-byte aligned, 128-bit block that can be
 // addressed as floats or as signed/unsigned integers of 8, 16, 32 or 64
 // bits; the functions read and write it through an __m128 pointer cast.
 typedef ALIGN16_BEG union {
     float               m128_f32[4];
     int8_t              m128_i8[16];
     int16_t             m128_i16[8];
     int32_t             m128_i32[4];
     int64_t             m128_i64[2];
     uint8_t             m128_u8[16];
     uint16_t            m128_u16[8];
     uint32_t            m128_u32[4];
     uint64_t            m128_u64[2];
 } ALIGN16_END v4sfu;
 /* Packed single-precision math routines implemented in SseMathFuncs.cpp.
    Each one processes the four floats of a v4sfu at once.  exp_ps and
    cos_ps are defined in SseMathFuncs.cpp with external linkage as well,
    so they are declared here too, letting callers use them without
    writing their own prototypes. */
 /* natural logarithm; returns NaN for x <= 0 */
 __m128 log_ps(v4sfu *xPtr);
 /* exponential */
 __m128 exp_ps(v4sfu *xPtr);
 /* sine */
 __m128 sin_ps(v4sfu *xPtr);
 /* cosine */
 __m128 cos_ps(v4sfu *xPtr);
 /* sine and cosine together: sines are stored to *sptr, cosines to *cptr */
 void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr);
 #endif
--- a/src/effects/Equalization.cpp
+++ b/src/effects/Equalization.cpp
--- a/src/effects/Equalization.h
+++ b/src/effects/Equalization.h
@ -74,6 +74,10 @@ public:
 };
 WX_DECLARE_OBJARRAY( EQCurve, EQCurveArray );
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
 class EffectEqualization48x;
 #endif
 class EffectEqualization: public Effect {
 public:
@ -113,12 +117,15 @@ public:
   // low range of human hearing
   enum {loFreqI=20};
 private:
   bool ProcessOne(int count, WaveTrack * t,
                   sampleCount start, sampleCount len);
   void Filter(sampleCount len,
               float *buffer);
   void ReadPrefs();
   HFFT hFFT;
@ -135,6 +142,11 @@ private:
   bool mPrompting;
   bool mDrawGrid;
   bool mEditingBatchParams;
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
   bool mBench;
   EffectEqualization48x *mEffectEqualization48x;
 friend class EffectEqualization48x;
 #endif
 public:
@ -222,6 +234,9 @@ public:
   void EnvelopeUpdated(Envelope *env, bool lin);
   static const double thirdOct[];
   wxRadioButton *mFaderOrDraw[2];
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
   wxRadioButton *mMathProcessingType[5]; // default, sse, sse threaded, AVX, AVX threaded (note AVX is not implemented yet
 #endif
   wxChoice *mInterpChoice;
   wxCheckBox *mLinFreq;
   int M;
@ -276,6 +291,14 @@ private:
      ID_INVERT,
      drawRadioID,
      sliderRadioID,
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
      defaultMathRadioID, 
      sSERadioID, 
      sSEThreadedRadioID, 
      aVXRadioID, 
      aVXThreadedRadioID, 
      ID_BENCH,
 #endif
      ID_INTERP,
      ID_LIN_FREQ,
      GridOnOffID,
@ -294,6 +317,10 @@ private:
   void OnSliderDBMIN( wxCommandEvent &event );
   void OnDrawRadio(wxCommandEvent &event );
   void OnSliderRadio(wxCommandEvent &event );
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
   void OnProcessingRadio(wxCommandEvent &event );
   void OnBench( wxCommandEvent & event);
 #endif
   void OnLinFreq(wxCommandEvent &event );
   void UpdateGraphic(void);
   void EnvLogToLin(void);
@ -339,6 +366,9 @@ private:
   wxBoxSizer *szrH;
   wxBoxSizer *szrI;
   wxBoxSizer *szrL;
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
   wxBoxSizer *szrM;
 #endif
   wxFlexGridSizer *szr1;
   wxBoxSizer *szr2;
   wxBoxSizer *szr3;
--- a/src/effects/Equalization48x.cpp
+++ b/src/effects/Equalization48x.cpp
@ -0,0 +1,924 @@
 /**********************************************************************
   Audacity: A Digital Audio Editor
   Equalization48x.cpp
   Andrew Hallendorff
 *******************************************************************//**
   \file Equalization48x.cpp
   \brief Fast SSE based implementation of equalization.
 *//****************************************************************/
 #include "../Audacity.h"
 #include "../Project.h"
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
 #include "Equalization.h"
 #include "../WaveTrack.h"
 #include "float_cast.h"
 #include <vector>
 #include <wx/dcmemory.h>
 #include <wx/event.h>
 #include <wx/string.h>
 #if wxUSE_TOOLTIPS
 #include <wx/tooltip.h>
 #endif
 #include <wx/utils.h>
 #include <math.h>
 #include <wx/arrimpl.cpp>
 #include "Equalization48x.h"
 #include "../RealFFTf.h"
 #include "../RealFFTf48x.h"
 #ifndef USE_SSE2
 #define	USE_SSE2
 #endif
 #include <stdlib.h>
 #include <malloc.h>
 #include <stdio.h>
 #include <math.h>
 #include <xmmintrin.h>
 #ifdef _WIN32
 //  Windows
 #include <intrin.h>
 #define cpuid    __cpuid
 #else
 //  GCC Inline Assembly
 // Non-Windows counterpart of the __cpuid intrinsic that the Windows build
 // maps `cpuid` to: executes CPUID with EAX = InfoType and stores the
 // resulting EAX, EBX, ECX and EDX in CPUInfo[0..3].
 // Only EAX is loaded before the instruction; ECX is not set by this helper.
 void cpuid(int CPUInfo[4],int InfoType){
   int regA, regB, regC, regD;
   __asm__ __volatile__ ("cpuid"
                         : "=a" (regA), "=b" (regB), "=c" (regC), "=d" (regD)
                         : "a" (InfoType));
   CPUInfo[0]=regA;
   CPUInfo[1]=regB;
   CPUInfo[2]=regC;
   CPUInfo[3]=regD;
 }
 #endif
 // Set to true the first time GetMathCaps() runs, so the CPUID probe happens once.
 bool sMathCapsInitialized = false;
 // CPU feature flags filled in by GetMathCaps().
 MathCaps sMathCaps;
 // dirty switcher
 // Bit mask of MATH_FUNCTION_* flags choosing the processing path; a value of
 // 0 makes Process() use EffectEqualization::ProcessOne instead of the 4x code.
 int sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED;
 // Replaces the whole math-path bit mask.
 void EffectEqualization48x::SetMathPath(int mathPath)
 {
   sMathPath = mathPath;
 }
 // Returns the current math-path bit mask.
 int EffectEqualization48x::GetMathPath()
 {
   return sMathPath;
 }
 // Switches on the given option bit(s), leaving the others untouched.
 void EffectEqualization48x::AddMathPathOption(int mathPath)
 {
   sMathPath = sMathPath | mathPath;
 }
 // Switches off the given option bit(s), leaving the others untouched.
 void EffectEqualization48x::RemoveMathPathOption(int mathPath)
 {
   sMathPath = sMathPath & ~mathPath;
 }
 // Returns the shared CPU capability table, probing CPUID on the first call.
 //
 // The first call also chooses the start-up math path: SSE + threaded when
 // the CPU reports SSE, otherwise the original (non-SSE) path 0, so that a
 // CPU without SSE is never left on the SSE default.
 //
 // NOTE: the AVX flag only mirrors the CPUID feature bit; nothing here checks
 // that the operating system has enabled AVX state saving.
 // The initialisation is not guarded against concurrent first calls.
 MathCaps *EffectEqualization48x::GetMathCaps() 
 { 
   if(!sMathCapsInitialized)
   {
      sMathCapsInitialized=true;
      // Start from "nothing supported"; flags are only raised by CPUID below.
      sMathCaps.x64     = false;
      sMathCaps.MMX     = false;
      sMathCaps.SSE     = false;
      sMathCaps.SSE2    = false;
      sMathCaps.SSE3    = false;
      sMathCaps.SSSE3   = false;
      sMathCaps.SSE41   = false;
      sMathCaps.SSE42   = false;
      sMathCaps.SSE4a   = false;
      sMathCaps.AVX     = false;
      sMathCaps.XOP     = false;
      sMathCaps.FMA3    = false;
      sMathCaps.FMA4    = false;
      int info[4];
      // EAX of leaf 0 is the highest standard leaf available.
      cpuid(info, 0);
      const int nIds = info[0];
      // EAX of leaf 0x80000000 is the highest extended leaf; it has the top
      // bit set, so it is compared as an unsigned value.
      cpuid(info, 0x80000000);
      const unsigned int nExIds = (unsigned int)info[0];
      //  Detect Instruction Set
      if (nIds >= 1){
         cpuid(info,0x00000001);
         // info[3] is EDX, info[2] is ECX of the feature leaf.
         sMathCaps.MMX   = (info[3] & ((int)1 << 23)) != 0;
         sMathCaps.SSE   = (info[3] & ((int)1 << 25)) != 0;
         sMathCaps.SSE2  = (info[3] & ((int)1 << 26)) != 0;
         sMathCaps.SSE3  = (info[2] & ((int)1 <<  0)) != 0;
         sMathCaps.SSSE3 = (info[2] & ((int)1 <<  9)) != 0;
         sMathCaps.SSE41 = (info[2] & ((int)1 << 19)) != 0;
         sMathCaps.SSE42 = (info[2] & ((int)1 << 20)) != 0;
         sMathCaps.AVX   = (info[2] & ((int)1 << 28)) != 0;
         sMathCaps.FMA3  = (info[2] & ((int)1 << 12)) != 0;
      }
      if (nExIds >= 0x80000001u){
         cpuid(info,0x80000001);
         sMathCaps.x64   = (info[3] & ((int)1 << 29)) != 0;
         sMathCaps.SSE4a = (info[2] & ((int)1 <<  6)) != 0;
         sMathCaps.FMA4  = (info[2] & ((int)1 << 16)) != 0;
         sMathCaps.XOP   = (info[2] & ((int)1 << 11)) != 0;
      }
      if(sMathCaps.SSE)
         sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; // we are starting on.
      else
         sMathPath=0; // no SSE: the static default would select code this CPU cannot run
   }
   return &sMathCaps; 
 }
 // Allocates `size` bytes intended for SSE loads/stores and returns the block
 // (or NULL on failure, as reported by the underlying allocator).
 // Memory obtained here must be released with free_simd(), never with a
 // plain free()/delete, because the Windows allocator is a separate family.
 //
 // The Windows branch is selected by either _WIN32 (compiler-defined, and
 // what the cpuid selection in this file tests) or WIN32 (project-defined),
 // and free_simd() uses the very same test so the pair can never disagree.
 void * malloc_simd(const size_t size)
 {
 #if defined(_WIN32) || defined(WIN32)   // Windows
    return _aligned_malloc(size, 16);
 #elif defined __linux__     // Linux
    return memalign (16, size);
 #elif defined __MACH__      // Mac OS X
    // No explicit alignment request here; presumably the platform malloc is
    // already 16-byte aligned - TODO confirm.
    return malloc(size);
 #else                       // other (use valloc for page-aligned memory)
    return valloc(size);
 #endif
 }
 // Releases a block obtained from malloc_simd(). Passing NULL is harmless.
 void free_simd(void* mem)
 {
 #if defined(_WIN32) || defined(WIN32)   // must mirror malloc_simd()
    _aligned_free(mem);
 #else  
    free(mem);
 #endif
 }
 // Starts with no buffers and no workers; everything is sized later by
 // AllocateBuffersWorkers(). mBlocksPerBuffer is fixed at 20.
 EffectEqualization48x::EffectEqualization48x()
   : mThreadCount(0)
   , mFilterSize(0)
   , mWindowSize(0)
   , mBlockSize(0)
   , mWorkerDataCount(0)
   , mBlocksPerBuffer(20)
   , mScratchBufferSize(0)
   , mSubBufferSize(0)
   , mBigBuffer(NULL)
   , mBufferInfo(NULL)
   , mEQWorkers(NULL)
   , mThreaded(false)
   , mBenching(false)
 {
 }
 // Nothing to release here: the processing entry points free their buffers
 // and workers themselves before returning.
 EffectEqualization48x::~EffectEqualization48x()
 {
 }
 // Sizes and allocates the sample buffers for the current filter and, when
 // `threaded` is true, creates and starts one worker thread per CPU.
 //
 // Any previous allocation is released first. Each BufferInfo describes four
 // overlapping sub-buffers inside mBigBuffer plus a scratch area placed
 // directly after them. In threaded mode there are mThreadCount+2 such
 // descriptors; otherwise there is exactly one.
 //
 // If the CPU count cannot be determined (wxThread::GetCPUCount() returns -1)
 // the thread count is clamped to 0: no workers are started and
 // ProcessOne4xThreaded() then falls back to the unthreaded path.
 //
 // Always returns true.
 bool EffectEqualization48x::AllocateBuffersWorkers(bool threaded)
 {
   if(mBigBuffer)
      FreeBuffersWorkers(); 
   mFilterSize=(mEffectEqualization->mM-1)&(~15); // 4000 !!! Filter MUST BE QUAD WORD ALIGNED !!!!
   mWindowSize=mEffectEqualization->windowSize;
   mBlockSize=mWindowSize-mFilterSize; // 12,384
   mThreaded=threaded;
   // Per-buffer geometry is identical in both modes.
   mScratchBufferSize=mWindowSize*3*(sizeof(__m128)/sizeof(float)); // 3 window size blocks size of __m128 but we allocate in float
   mSubBufferSize=mBlockSize*((mBlocksPerBuffer<<2)-3); // we are going to do a full block overlap -(blockSize*3)
   if( mThreaded )
   {
      mThreadCount=wxThread::GetCPUCount();
      if(mThreadCount<0)
         mThreadCount=0; // CPU count unknown: never pass a negative size to new[]
      mWorkerDataCount=mThreadCount+2; // 2 extra slots (maybe double later)
      // we're skewing the data by one block to allow for 1/4 block intersections.
      // this will remove the disparity in data at the intersections of the runs
      // The nice magic allocation
      // megabyte - 3 windows - 4 overlapping buffers - filter 
      // 2^20 = 1,048,576 - 3 * 2^14 (16,384) - ((4 * 20) - 3) * 12,384 - 4000 
      // 1,048,576 - 49,152 - 953,568 - 4000 = 41,856 (leftover)
      mBigBuffer=(float *)malloc_simd(sizeof(float)*(mSubBufferSize+mFilterSize+mScratchBufferSize)*mWorkerDataCount); // we run over by filtersize
      // fill the bufferInfo
      mBufferInfo = new BufferInfo[mWorkerDataCount];
      for(int i=0;i<mWorkerDataCount;i++) {
         // start of this worker slot inside the big allocation
         const int slotBase=(mSubBufferSize+mScratchBufferSize)*i;
         mBufferInfo[i].mFftWindowSize=mWindowSize;
         mBufferInfo[i].mFftFilterSize=mFilterSize;
         mBufferInfo[i].mBufferLength=mBlockSize*mBlocksPerBuffer;
         mBufferInfo[i].mScratchBuffer=&mBigBuffer[slotBase+mSubBufferSize];
         // the four streams start one block short of a full buffer apart,
         // so neighbouring streams share one block
         for(int j=0;j<4;j++)
            mBufferInfo[i].mBufferDest[j]=mBufferInfo[i].mBufferSouce[j]=&mBigBuffer[j*(mBufferInfo[i].mBufferLength-mBlockSize)+slotBase];
      }
      // start the workers
      mDataMutex.IsOk(); // result is not acted upon
      mEQWorkers=new EQWorker[mThreadCount];
      for(int i=0;i<mThreadCount;i++) {
         mEQWorkers[i].SetData( mBufferInfo, mWorkerDataCount, &mDataMutex, this);
         mEQWorkers[i].Create();
         mEQWorkers[i].Run();
      }
   } else {
      mBigBuffer=(float *)malloc_simd(sizeof(float)*(mSubBufferSize+mFilterSize+mScratchBufferSize)); // we run over by filtersize
      mBufferInfo = new BufferInfo[1]; // yeah it looks odd but it keeps compatibility with threaded processing
      mBufferInfo[0].mFftWindowSize=mWindowSize;
      mBufferInfo[0].mFftFilterSize=mFilterSize;
      mBufferInfo[0].mBufferLength=mBlockSize*mBlocksPerBuffer;
      mBufferInfo[0].mScratchBuffer=&mBigBuffer[mSubBufferSize];
      for(int j=0;j<4;j++)
         mBufferInfo[0].mBufferDest[j]=mBufferInfo[0].mBufferSouce[j]=&mBigBuffer[j*(mBufferInfo[0].mBufferLength-mBlockSize)];
   }
   return true;
 }
 // Stops and destroys the worker threads (threaded mode only), then releases
 // the buffer descriptors and the big sample buffer. Always returns true.
 bool EffectEqualization48x::FreeBuffersWorkers()
 {
   if(mThreaded) {
      int worker;
      // Ask every worker to leave its loop before waiting on any of them.
      worker=0;
      while(worker<mThreadCount) {
         mEQWorkers[worker].ExitLoop();
         ++worker;
      }
      worker=0;
      while(worker<mThreadCount) {
         mEQWorkers[worker].Wait();
         ++worker;
      }
      delete[] mEQWorkers;
      mEQWorkers=NULL;
      mThreadCount=0;
      mWorkerDataCount=0;
   }
   delete [] mBufferInfo;
   mBufferInfo=NULL;
   free_simd(mBigBuffer);
   mBigBuffer=NULL;
   return true;
 }
 // Runs the equalization over every selected wave track of the owning
 // effect, using the path chosen by sMathPath:
 //   0            -> EffectEqualization::ProcessOne
 //   below 8      -> ProcessOne4x
 //   8 and above  -> ProcessOne4xThreaded
 // Processing stops at the first track that reports failure. The processed
 // tracks are committed or discarded according to the overall result, which
 // is also the return value.
 bool EffectEqualization48x::Process(EffectEqualization* effectEqualization)
 {
   mEffectEqualization=effectEqualization;
 //   return TrackCompare(); // used for debugging data
   mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
   bool bGoodResult = true;
   TableUsage(sMathPath);
   if(sMathPath) {
      // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
      mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
   }
   AllocateBuffersWorkers((sMathPath & MATH_FUNCTION_THREADED) != 0);
   SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);
   int count = 0;
   for(WaveTrack *track = (WaveTrack *) iter.First(); track; track = (WaveTrack *) iter.Next(), count++) {
      // Clip the selection to the extent of this track.
      double trackStart = track->GetStartTime();
      double trackEnd = track->GetEndTime();
      double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
      double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
      if (!(t1 > t0))
         continue; // nothing selected on this track
      sampleCount start = track->TimeToLongSamples(t0);
      sampleCount end = track->TimeToLongSamples(t1);
      sampleCount len = (sampleCount)(end - start);
      bool trackOk;
      if(!sMathPath)
         trackOk = mEffectEqualization->ProcessOne(count, track, start, len);
      else if(sMathPath<8)
         trackOk = ProcessOne4x(count, track, start, len);
      else
         trackOk = ProcessOne4xThreaded(count, track, start, len);
      if(!trackOk) {
         bGoodResult = false;
         break;
      }
   }
   FreeBuffersWorkers();
   mEffectEqualization->ReplaceProcessedTracks(bGoodResult); 
   return bGoodResult;
 }
 // Debugging aid: processes the selection twice - once with the original
 // path (math path 0) into a private copy of the tracks, once with the
 // currently selected math path into the effect's output tracks - and then
 // replaces each output track's selected region with the sample-wise
 // difference of the two results (see DeltaTrack).
 //
 // The selected math path is saved before the two passes and restored
 // afterwards. Previously the first pass zeroed sMathPath and the second
 // pass re-used that zero, so both passes ran the original path and the
 // global selection was left at 0.
 //
 // Relies on mEffectEqualization having been set by the caller.
 bool EffectEqualization48x::TrackCompare()
 {
   mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
   bool bGoodResult = true;
   TableUsage(sMathPath);
   if(sMathPath)  // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
      mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
   AllocateBuffersWorkers((sMathPath & MATH_FUNCTION_THREADED)!=0);
   // Reset map
   wxArrayPtrVoid SecondIMap;
   wxArrayPtrVoid SecondOMap;
   SecondIMap.Clear();
   SecondOMap.Clear();
   TrackList      *SecondOutputTracks = new TrackList();
   //iterate over tracks of type trackType (All types if Track::All)
   TrackListOfKindIterator aIt(mEffectEqualization->mOutputTracksType, mEffectEqualization->mTracks);
   for (Track *aTrack = aIt.First(); aTrack; aTrack = aIt.Next()) {
      // Include selected tracks, plus sync-lock selected tracks for Track::All.
      if (aTrack->GetSelected() ||
         (mEffectEqualization->mOutputTracksType == Track::All && aTrack->IsSyncLockSelected()))
      {
         Track *o = aTrack->Duplicate();
         SecondOutputTracks->Add(o);
         SecondIMap.Add(aTrack);
         SecondOMap.Add(o); // the duplicate belongs in the output map
      }
   }
   // Pass 0: reference (path 0) into the private copies.
   // Pass 1: the path under test into the effect's output tracks.
   const int requestedMathPath=sMathPath;
   for(int i=0;i<2;i++) {
      SelectedTrackListOfKindIterator iter(Track::Wave, i?mEffectEqualization->mOutputTracks:SecondOutputTracks);
      sMathPath=i?requestedMathPath:0;
      WaveTrack *track = (WaveTrack *) iter.First();
      int count = 0;
      while (track) {
         double trackStart = track->GetStartTime();
         double trackEnd = track->GetEndTime();
         double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
         double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
         if (t1 > t0) {
            sampleCount start = track->TimeToLongSamples(t0);
            sampleCount end = track->TimeToLongSamples(t1);
            sampleCount len = (sampleCount)(end - start);
            if(!sMathPath) {
               if (!mEffectEqualization->ProcessOne(count, track, start, len))
               {
                  bGoodResult = false;
                  break;
               }
            } else {
               if(sMathPath<8) {
                  if (!ProcessOne4x(count, track, start, len))
                  {
                     bGoodResult = false;
                     break;
                  }
               } else {
                  if (!ProcessOne4xThreaded(count, track, start, len))
                  {
                     bGoodResult = false;
                     break;
                  }
               }
            }
         }
         track = (WaveTrack *) iter.Next();
         count++;
      }
   }
   sMathPath=requestedMathPath; // leave the global selection as we found it
   // Walk both result lists in step and turn each output track into the delta.
   SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);
   SelectedTrackListOfKindIterator iter2(Track::Wave, SecondOutputTracks);
   WaveTrack *track =  (WaveTrack *) iter.First();
   WaveTrack *track2 = (WaveTrack *) iter2.First();
   while (track && track2) {
      double trackStart = track->GetStartTime();
      double trackEnd = track->GetEndTime();
      double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
      double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
      if (t1 > t0) {
         sampleCount start = track->TimeToLongSamples(t0);
         sampleCount end = track->TimeToLongSamples(t1);
         sampleCount len = (sampleCount)(end - start);
         DeltaTrack(track, track2, start, len);
      }
      track = (WaveTrack *) iter.Next();
      track2 = (WaveTrack *) iter2.Next();
   }
   delete SecondOutputTracks;
   FreeBuffersWorkers();
   mEffectEqualization->ReplaceProcessedTracks(bGoodResult); 
   return bGoodResult;
 }
 // Builds a temporary track holding (t - t2), sample by sample, over
 // [start, start+len) and hands it to ProcessTail() together with t.
 // Samples are read in chunks of at most t's maximum block size.
 // Always returns true.
 bool EffectEqualization48x::DeltaTrack(WaveTrack * t, WaveTrack * t2, sampleCount start, sampleCount len)
 {
   const sampleCount chunkCapacity = t->GetMaxBlockSize();
   float *minuend = new float[chunkCapacity];
   float *subtrahend = new float[chunkCapacity];
   AudacityProject *project = GetActiveProject();
   WaveTrack *output=project->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
   sampleCount position = start;
   sampleCount remaining = len;
   while(remaining) {
      const sampleCount chunk = (remaining<chunkCapacity) ? remaining : chunkCapacity;
      t->Get((samplePtr)minuend, floatSample, position, chunk);
      t2->Get((samplePtr)subtrahend, floatSample, position, chunk);
      // difference is accumulated in place in the first buffer
      for(sampleCount k=0;k<chunk;k++)
         minuend[k]-=subtrahend[k];
      output->Append((samplePtr)minuend, floatSample, chunk);
      position+=chunk;
      remaining-=chunk;
   }
   delete[] minuend;
   delete[] subtrahend;
   output->Flush();
   ProcessTail(t, output, start, len);
   delete output;
   return true;
 }
 // Times the selection with three paths - SSE threaded, SSE, and the
 // original code - and shows the elapsed times in a message box.
 // The two SSE passes are skipped (time stays 0) when sMathCaps.SSE is not set.
 // The processed audio is always discarded: the result is forced to false
 // before ReplaceProcessedTracks(), and false is what the function returns.
 bool EffectEqualization48x::Benchmark(EffectEqualization* effectEqualization)
 {
   mEffectEqualization=effectEqualization;
   mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
   bool bGoodResult = true;
   TableUsage(sMathPath);
   if(sMathPath) {
      // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
      mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
   }
   // Workers are always created so the threaded pass can run.
   AllocateBuffersWorkers((bool)MATH_FUNCTION_THREADED);
   SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);
   long times[] = { 0,0,0 };
   wxStopWatch timer;
   mBenching=true;
   // Pass order: 0 = SSE threaded, 1 = SSE, 2 = original.
   const int passPaths[3] = { MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED, MATH_FUNCTION_SSE, 0 };
   for(int pass=0;pass<3;pass++) {
      const int localMathPath = (pass<2 && !sMathCaps.SSE) ? -1 : passPaths[pass];
      if(localMathPath<0)
         continue; // this pass cannot run on the current CPU
      timer.Start();
      int count = 0;
      for(WaveTrack *track = (WaveTrack *) iter.First(); track; track = (WaveTrack *) iter.Next(), count++) {
         double trackStart = track->GetStartTime();
         double trackEnd = track->GetEndTime();
         double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
         double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
         if (!(t1 > t0))
            continue;
         sampleCount start = track->TimeToLongSamples(t0);
         sampleCount end = track->TimeToLongSamples(t1);
         sampleCount len = (sampleCount)(end - start);
         bool trackOk;
         if(!localMathPath)
            trackOk = mEffectEqualization->ProcessOne(count, track, start, len);
         else if(localMathPath<8)
            trackOk = ProcessOne4x(count, track, start, len);
         else
            trackOk = ProcessOne4xThreaded(count, track, start, len);
         if(!trackOk) {
            bGoodResult = false;
            break;
         }
      }
      times[pass]=timer.Time();
   }
   FreeBuffersWorkers();
   mBenching=false;
   bGoodResult=false; // benchmark output is never committed
   mEffectEqualization->ReplaceProcessedTracks(bGoodResult); 
   const wxString threadedText=wxTimeSpan(0, 0, 0, times[0]).Format(wxT("%M:%S.%l"));
   const wxString sseText=wxTimeSpan(0, 0, 0, times[1]).Format(wxT("%M:%S.%l"));
   const wxString defaultText=wxTimeSpan(0, 0, 0, times[2]).Format(wxT("%M:%S.%l"));
   wxMessageBox(wxString::Format(_("Benchmark times:\nDefault: %s\nSSE: %s\nSSE Threaded: %s\n"),defaultText.c_str(),sseText.c_str(),threadedText.c_str()));
   return bGoodResult;
 }
 // Filters bufferLength samples one block at a time with the owning effect's
 // Filter(), joining consecutive blocks by overlap-add.
 //
 // sourceBuffer - input samples; the memory directly after the first
 //                bufferLength samples is used as two alternating work
 //                windows of mWindowSize floats each.
 // destBuffer   - receives the filtered samples (the callers in this file
 //                pass the same pointer as sourceBuffer).
 // bufferLength - number of input samples.
 // Always returns true.
 bool EffectEqualization48x::ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, sampleCount bufferLength)
 {
   // Number of mBlockSize blocks, rounded up to include a partial last block.
   sampleCount blockCount=bufferLength/mBlockSize;
   sampleCount lastBlockSize=bufferLength%mBlockSize;
   if(lastBlockSize)
      blockCount++;
   float *workBuffer=&sourceBuffer[bufferLength];  // all scratch buffers are at the end
   for(int runx=0;runx<blockCount;runx++) 
   {
      // Even and odd runs use different windows so the previous run's tail
      // is still available for the overlap-add below.
      float *currentBuffer=&workBuffer[mWindowSize*(runx&1)]; 
      for(int i=0;i<mBlockSize;i++)
         currentBuffer[i]=sourceBuffer[i];
      sourceBuffer+=mBlockSize;
      // Zero-pad the rest of the window (mFilterSize samples) before filtering.
      float *currentFilter=&currentBuffer[mBlockSize];
      for(int i=0;i<mFilterSize;i++)
         currentFilter[i]=0;
      mEffectEqualization->Filter(mWindowSize, currentBuffer);
      float *writeEnd=currentBuffer+mBlockSize;
      // NOTE(review): runx is always < blockCount inside this loop, so this
      // condition is never true and the shortened/extended last write is
      // never taken - confirm whether blockCount-1 was intended.
      if(runx==blockCount) 
         writeEnd=currentBuffer+(lastBlockSize+mFilterSize);
      if(runx) {
         // Add the previous window's tail (the samples past mBlockSize) to
         // the first mFilterSize samples of this window.
         float *lastOverrun=&workBuffer[mWindowSize*((runx+1)&1)+mBlockSize]; 
         for(int j=0;j<mFilterSize;j++)
            *destBuffer++= *currentBuffer++ + *lastOverrun++;
      } else 
         currentBuffer+=mFilterSize>>1; // first run: skip the first half filter length of output
      // Copy the remainder of the block straight through.
      while(currentBuffer<writeEnd)
         *destBuffer++ = *currentBuffer++;
   }
   return true;
 }
 // Filters the four sample streams described by bufferInfo in parallel.
 //
 // For every block, the four streams are interleaved ("swizzled") so that
 // each __m128 holds the same sample position of all four streams, the
 // interleaved window is filtered with Filter4x(), and the result is
 // de-interleaved back into the four destination streams with overlap-add
 // against the previous block's tail.
 //
 // Returns false without processing if mBufferLength is not a whole number
 // of mBlockSize blocks; true otherwise.
 bool EffectEqualization48x::ProcessBuffer4x(BufferInfo *bufferInfo)
 {
   // buffer length must be a whole multiple of the block size for 4x processing.
   if(bufferInfo->mBufferLength%mBlockSize)
      return false;
   sampleCount blockCount=bufferInfo->mBufferLength/mBlockSize;
   __m128 *readBlocks[4]; // some temps so we dont destroy the vars in the struct
   __m128 *writeBlocks[4];
   for(int i=0;i<4;i++) {
      readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i];
      writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i];
   }
   // The scratch area holds two alternating interleaved windows followed by
   // the work space handed to Filter4x().
   __m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer;
   __m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*2];
   for(int run4x=0;run4x<blockCount;run4x++) 
   {
      // swizzle the data to the swizzle buffer
      // i counts input vectors (4 consecutive samples of one stream),
      // j counts output vectors (one sample position of all 4 streams).
      __m128 *currentSwizzledBlock=&swizzledBuffer128[mWindowSize*(run4x&1)]; 
      for(int i=0,j=0;j<mBlockSize;i++,j+=4) {
         __m128 tmp0   = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(1,0,1,0)); 
         __m128 tmp1   = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(3,2,3,2)); 
         __m128 tmp2   = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(1,0,1,0)); 
         __m128 tmp3   = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(3,2,3,2)); 
         currentSwizzledBlock[j]   = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,2,0)); 
         currentSwizzledBlock[j+1] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3,1,3,1)); 
         currentSwizzledBlock[j+2] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2,0,2,0)); 
         currentSwizzledBlock[j+3] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3,1,3,1)); 
      }
      // Zero-pad the rest of the window (mFilterSize positions) before filtering.
      __m128 *thisOverrun128=&currentSwizzledBlock[mBlockSize]; 
      for(int i=0;i<mFilterSize;i++)
         thisOverrun128[i]=_mm_set1_ps(0.0);
      Filter4x(mWindowSize, (float *)currentSwizzledBlock, (float *)scratchBuffer);
      int writeStart=0, writeToStart=0; // note readStart is where the read data is written
      int writeEnd=mBlockSize;
      if(run4x) {
         // maybe later swizzle add and write in one
         // Tail of the other window, i.e. the previous run's overrun.
         __m128 *lastOverrun128=&swizzledBuffer128[mWindowSize*((run4x+1)&1)+mBlockSize]; 
         // add and swizzle data + filter
         // First mFilterSize positions: overlap-add with the previous tail,
         // then de-interleave into the four destination streams.
         for(int i=0,j=0;j<mFilterSize;i++,j+=4) {
            __m128 tmps0 = _mm_add_ps(currentSwizzledBlock[j], lastOverrun128[j]);
            __m128 tmps1 = _mm_add_ps(currentSwizzledBlock[j+1], lastOverrun128[j+1]);
            __m128 tmps2 = _mm_add_ps(currentSwizzledBlock[j+2], lastOverrun128[j+2]);
            __m128 tmps3 = _mm_add_ps(currentSwizzledBlock[j+3], lastOverrun128[j+3]);
            __m128 tmp0   = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(0,1,0,1)); 
            __m128 tmp1   = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(2,3,2,3)); 
            __m128 tmp2   = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(0,1,0,1)); 
            __m128 tmp3   = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(2,3,2,3)); 
            writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3)); 
            writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2)); 
            writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3)); 
            writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2)); 
         } 
         // Continue after the overlapped part: position mFilterSize in the
         // interleaved window, vector mFilterSize/4 in each destination.
         writeStart=mFilterSize;
         writeToStart=mFilterSize>>2;
         // swizzle it back. 
         for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=4) {
            __m128 tmp0   = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1)); 
            __m128 tmp1   = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(2,3,2,3)); 
            __m128 tmp2   = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(0,1,0,1)); 
            __m128 tmp3   = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(2,3,2,3)); 
            writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3)); 
            writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2)); 
            writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3)); 
            writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2)); 
         }
      } else {
         // swizzle it back. We overlap one block so we only write the first block on the first run
         // Only stream 0 is written here; streams 1-3 are not stored on the first run.
         writeStart=0;
         writeToStart=0;
         for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=4) {
            __m128 tmp0   = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1)); 
            __m128 tmp2   = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(0,1,0,1)); 
            writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3)); 
         }
      }
      for(int i=0;i<4;i++) { // shift each block
         readBlocks[i]+=mBlockSize>>2; // these are 128b pointers, each window is 1/4 blockSize for those
         writeBlocks[i]+=mBlockSize>>2; 
      }
   }
   return true;
 }
 bool EffectEqualization48x::ProcessOne4x(int count, WaveTrack * t,
                                         sampleCount start, sampleCount len)
 {
   sampleCount blockCount=len/mBlockSize;
   if(blockCount<16) // it's not worth 4x processing do a regular process
      return mEffectEqualization->ProcessOne(count, t, start, len);
   sampleCount trackBlockSize = t->GetMaxBlockSize();
   AudacityProject *p = GetActiveProject();
   WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
   mEffectEqualization->TrackProgress(count, 0.0);
   int bigRuns=len/(mSubBufferSize-mBlockSize);
   int trackBlocksPerBig=mSubBufferSize/trackBlockSize;
   int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize;
   int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize));
   sampleCount currentSample=start;
   for(int bigRun=0;bigRun<bigRuns;bigRun++)
   {
      // fill the buffer
      for(int i=0;i<trackBlocksPerBig;i++) {
         t->Get((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize);
         currentSample+=trackBlockSize;
      }
      if(trackLeftovers) {
         t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
         currentSample+=trackLeftovers;
      }
      currentSample-=mBlockSize+(mFilterSize>>1);
      ProcessBuffer4x(mBufferInfo);
      if (mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns))
      {
         break;
      }
      output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1)));
   }
   if(singleProcessLength) {
      t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
      ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
      output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
   }
   output->Flush();
   ProcessTail(t, output, start, len);
   delete output;
   return true;
 }
 // Worker thread body. Until asked to exit, repeatedly: take the shared
 // mutex, claim the first buffer marked BufferReady by marking it BufferBusy,
 // release the mutex, run ProcessBuffer4x() on the claimed buffer and mark it
 // BufferDone. When no buffer is ready the loop simply tries again.
 void *EQWorker::Entry()
 {
   while(!mExitLoop) {
      int claimed=-1;
      mMutex->Lock();
      for(int slot=0;slot<mBufferInfoCount && claimed<0;slot++) {
         if(mBufferInfoList[slot].mBufferStatus==BufferReady) { // we found an unlocked ready buffer
            mBufferInfoList[slot].mBufferStatus=BufferBusy; // we own it now
            claimed=slot;
         }
      }
      mMutex->Unlock();
      if(claimed>=0) {
         // the filtering itself runs outside the mutex
         mEffectEqualization48x->ProcessBuffer4x(&mBufferInfoList[claimed]);
         mBufferInfoList[claimed].mBufferStatus=BufferDone; // we're done
      }
   }
   return NULL;
 }
 bool EffectEqualization48x::ProcessOne4xThreaded(int count, WaveTrack * t,
                                                 sampleCount start, sampleCount len)
 {
   sampleCount blockCount=len/mBlockSize;
   if(blockCount<16) // it's not worth 4x processing do a regular process
      return ProcessOne4x(count, t, start, len);
   if(mThreadCount<=0 || blockCount<256) // dont do it without cores or big data
      return ProcessOne4x(count, t, start, len);
   AudacityProject *p = GetActiveProject();
   WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
   sampleCount trackBlockSize = t->GetMaxBlockSize();
   mEffectEqualization->TrackProgress(count, 0.0);
   int bigRuns=len/(mSubBufferSize-mBlockSize);
   int trackBlocksPerBig=mSubBufferSize/trackBlockSize;
   int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize;
   int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize));
   sampleCount currentSample=start;
   int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0;
   // fill the first workerDataCount buffers we checked above and there is at least this data
   for(int i=0;i<mWorkerDataCount;i++)
   {
      // fill the buffer
      for(int j=0;j<trackBlocksPerBig;j++) {
         t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
         currentSample+=trackBlockSize;
      }
      if(trackLeftovers) {
         t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
         currentSample+=trackLeftovers;
      }
      currentSample-=mBlockSize+(mFilterSize>>1);
      mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin
   }
   int currentIndex=0;
   while(bigBlocksWritten<bigRuns) {
      mDataMutex.Lock(); // Get in line for data
      // process as many blocks as we can
      while((mBufferInfo[currentIndex].mBufferStatus==BufferDone) && (bigBlocksWritten<bigRuns)) { // data is ours
         if (mEffectEqualization->TrackProgress(count, (double)(bigBlocksWritten)/(double)bigRuns))
         {
            break;
         }
         output->Append((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)));
         bigBlocksWritten++;
         if(bigBlocksRead<bigRuns) {
            // fill the buffer
            for(int j=0;j<trackBlocksPerBig;j++) {
               t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
               currentSample+=trackBlockSize;
            }
            if(trackLeftovers) {
               t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
               currentSample+=trackLeftovers;
            }
            currentSample-=mBlockSize+(mFilterSize>>1);
            mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin
            bigBlocksRead++;
         } else mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // this is completely unecessary
         currentIndex=(currentIndex+1)%mWorkerDataCount;
      } 
      mDataMutex.Unlock(); // Get back in line for data
   }
   if(singleProcessLength) {
      t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
      ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
      output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
   }
   output->Flush();
   ProcessTail(t, output, start, len);
   delete output;
   return true;
 }
 // Splices the processed audio held in 'output' back into track 't'.
 //
 //   t      - the track being equalized; its clips are modified in place
 //   output - processed audio; its time zero corresponds to 'start' in 't'
 //   start  - first selected sample in 't'
 //   len    - number of selected samples
 //
 // 'output' is one continuous run of audio, whereas 't' may consist of several
 // clips separated by whitespace. To keep the original clip structure, only the
 // intersection of each clip with the selection is cleared and re-pasted.
 // Always returns true.
 bool EffectEqualization48x::ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len)
 {
    double lenT = t->LongSamplesToTime(len);
    // 'start' is the sample offset in 't', the passed in track
    // 'startT' is the equivalent time value
    // 'output' starts at zero
    double startT = t->LongSamplesToTime(start);
    // end of the selection, in track time
    double endT = startT + lenT;
    // Find the bits of clips that need replacing. Both vectors are filled in
    // lock-step, so index i refers to the same clip in each of them.
    std::vector<std::pair<double, double> > clipStartEndTimes;     // clip range clamped to the selection
    std::vector<std::pair<double, double> > clipRealStartEndTimes; // full clip range, needed to rejoin after the paste
    for (WaveClipList::compatibility_iterator it=t->GetClipIterator(); it; it=it->GetNext())
    {
       WaveClip *clip = it->GetData();
       double clipStartT = clip->GetStartTime();
       double clipEndT = clip->GetEndTime();
       if( clipEndT <= startT || clipStartT >= endT )
          continue;   // clip is not within selection
       // save the actual clip start/end so that we can rejoin them after we paste.
       clipRealStartEndTimes.push_back(std::pair<double,double>(clipStartT,clipEndT));
       // a partially selected clip only has its selected part replaced
       if( clipStartT < startT )
          clipStartT = startT;
       if( clipEndT > endT )
          clipEndT = endT;
       clipStartEndTimes.push_back(std::pair<double,double>(clipStartT,clipEndT));
    }
    // now go through and replace the old clips with new
    for(unsigned int i=0;i<clipStartEndTimes.size();i++)
    {
       const std::pair<double,double> &selected = clipStartEndTimes[i];
       const std::pair<double,double> &whole = clipRealStartEndTimes[i];
       // Initialised so that the test below is well defined even if Copy()
       // returns without writing to its out-parameter.
       Track *toClipOutput = NULL;
       // remove the old audio and get the new ('output' starts at zero, hence -startT)
       t->Clear(selected.first,selected.second);
       output->Copy(selected.first-startT,selected.second-startT, &toClipOutput);
       if(!toClipOutput)
          continue;   // nothing was copied for this range
       // put the processed audio in
       bool bResult = t->Paste(selected.first, toClipOutput);
       wxASSERT(bResult); // TO DO: Actually handle this.
       // If the clip was only partially selected, the Paste will have created a
       // split line and a Join is needed to take care of it. This is not true
       // when the selection is fully contained within one clip.
       bool partiallySelected = whole.first != selected.first ||
                                whole.second != selected.second;
       bool selectionInsideClip = whole.first <= startT &&
                                  whole.second >= endT;
       if( partiallySelected && !selectionInsideClip )
          t->Join(whole.first,whole.second);
       delete toClipOutput;
    }
    return true;
 }
 // Applies the equalization filter to one FFT window, four floats at a time.
 //
 //   len           - FFT length; bins 1 .. len/2-1 are treated as complex
 //   buffer        - input samples; also receives the final result
 //   scratchBuffer - holds the filtered spectrum before the inverse FFT
 //
 // Both buffers are accessed as arrays of __m128, so every spectrum entry is a
 // vector of four floats that all get the same filter coefficient.
 // NOTE(review): presumably the four lanes are four independent signals and
 // both buffers are 16-byte aligned -- confirm against the callers.
 void EffectEqualization48x::Filter4x(sampleCount len,
                                      float *buffer, float *scratchBuffer)
 {
    // Forward FFT; the spectrum is read back from 'buffer' below
    RealFFTf4x(buffer, mEffectEqualization->hFFT);
    __m128 *spectrum=(__m128 *)buffer;
    __m128 *filtered=(__m128 *)scratchBuffer;
    // DC component is purely real, so only the real filter coefficient applies
    filtered[0]=_mm_mul_ps(spectrum[0], _mm_set1_ps(mEffectEqualization->mFilterFuncR[0]));
    int halfLength=(len/2);
    // bit 0 of the math path selects the precomputed bit-reversal table
    bool useBitReverseTable=sMathPath&1;
    for(int bin=1; bin<halfLength; bin++)
    {
       // The FFT output is in bit-reversed order: find where this bin lives,
       // either from the table or by reversing the bits on the fly.
       int sourceIndex;
       if(useBitReverseTable)
          sourceIndex=mEffectEqualization->hFFT->BitReversed[bin];
       else
          sourceIndex=SmallReverseBits(bin,mEffectEqualization->hFFT->pow2Bits);
       __m128 binReal=spectrum[sourceIndex];
       __m128 binImag=spectrum[sourceIndex+1];
       __m128 coeffReal=_mm_set1_ps(mEffectEqualization->mFilterFuncR[bin]);
       __m128 coeffImag=_mm_set1_ps(mEffectEqualization->mFilterFuncI[bin]);
       // complex multiply: (binReal + j*binImag) * (coeffReal + j*coeffImag)
       filtered[2*bin  ] = _mm_sub_ps( _mm_mul_ps(binReal, coeffReal), _mm_mul_ps(binImag, coeffImag));
       filtered[2*bin+1] = _mm_add_ps( _mm_mul_ps(binReal, coeffImag), _mm_mul_ps(binImag, coeffReal));
    }
    // Fs/2 component is purely real; it is stored in slot 1
    filtered[1] = _mm_mul_ps(spectrum[1], _mm_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]));
    // Inverse FFT and normalization, then reorder from 'scratchBuffer' into 'buffer'
    InverseRealFFTf4x(scratchBuffer, mEffectEqualization->hFFT);
    ReorderToTime4x(mEffectEqualization->hFFT, scratchBuffer, buffer);
 }
 #endif
--- a/src/effects/Equalization48x.h
+++ b/src/effects/Equalization48x.h
@ -0,0 +1,146 @@
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
 /**********************************************************************
 Audacity: A Digital Audio Editor
 Equalization48x.h
 Intrinsics (SSE/AVX) and Threaded Equalization
 ***********************************************************************/
 #ifndef __AUDACITY_EFFECT_EQUALIZATION48X__
 #define __AUDACITY_EFFECT_EQUALIZATION48X__
 // Math path selection.
 // The MATH_FUNCTION_* values are bit flags that are OR-ed together to form the
 // math path; the numbered comments below list each resulting value.
 #define MATH_FUNCTION_ORIGINAL 0 // 0 original path
 #define MATH_FUNCTION_BITREVERSE_TABLE 1 // 1 SSE BitReverse Table
 #define MATH_FUNCTION_SIN_COS_TABLE 2 // 2 SSE SinCos Table
 // 3 (1|2) SSE with SinCos and BitReverse buffer
 #define MATH_FUNCTION_SSE 4 // 4 SSE no SinCos and no BitReverse buffer
 #define MATH_FUNCTION_THREADED 8 // 8 SSE threaded no SinCos and no BitReverse buffer
 // 9 (8|1) SSE threaded BitReverse Table
 // 10 (8|2) SSE threaded SinCos Table
 // 11 (8|2|1) SSE threaded with SinCos and BitReverse buffer
 //#define MATH_FUNCTION_AVX 16
 // added by Andrew Hallendorff intrinsics processing
 // State of one BufferInfo slot.
 // NOTE(review): the state names suggest a cycle
 // Empty -> Ready -> Busy -> Done; only the Empty and Ready assignments are
 // visible next to the reader loop -- confirm the rest against EQWorker::Entry.
 enum EQBufferStatus
 {
    BufferEmpty = 0,  // default state set by the BufferInfo constructor
    BufferReady = 1,  // set once the source data has been read into the slot
    BufferBusy  = 2,  // presumably claimed by a worker -- TODO confirm
    BufferDone  = 3   // presumably processed and awaiting output -- TODO confirm
 };
 class BufferInfo {
 public:
   BufferInfo() { mBufferLength=0; mBufferStatus=BufferEmpty; };
   float* mBufferSouce[4];
   float* mBufferDest[4];
   int mBufferLength;
   sampleCount mFftWindowSize;
   sampleCount mFftFilterSize;
   float* mScratchBuffer;
   EQBufferStatus mBufferStatus;
 };
 // One int per CPU feature, returned by EffectEqualization48x::GetMathCaps().
 // NOTE(review): presumably non-zero means the feature is available -- confirm
 // against the code that fills this structure in.
 // The member order is part of the layout; do not reorder.
 struct MathCaps {
    // architecture
    int x64;
    // SIMD instruction sets
    int MMX;
    int SSE;
    int SSE2;
    int SSE3;
    int SSSE3;
    int SSE41;
    int SSE42;
    int SSE4a;
    int AVX;
    int XOP;
    // fused multiply-add variants
    int FMA3;
    int FMA4;
 };
 // Forward declarations; this header only uses these classes through pointers
 // before their definitions are seen.
 class EffectEqualization;
 class EffectEqualization48x;
 // Source of EQWorker::mThreadID values: each EQWorker constructor takes the
 // current value and increments it.
 // NOTE(review): a namespace-scope 'static' defined in a header gives every
 // translation unit that includes it its own separate counter -- confirm that
 // workers are only ever constructed from a single .cpp file.
 static int EQWorkerCounter=0;
 class EQWorker : public wxThread {
 public:
   EQWorker():wxThread(wxTHREAD_JOINABLE) {   
      mBufferInfoList=NULL;
      mBufferInfoCount=0;
      mMutex=NULL;
      mEffectEqualization48x=NULL;
      mExitLoop=false;
      mThreadID=EQWorkerCounter++;
   }
   void SetData( BufferInfo* bufferInfoList, int bufferInfoCount, wxMutex *mutex, EffectEqualization48x *effectEqualization48x) {
      mBufferInfoList=bufferInfoList;
      mBufferInfoCount=bufferInfoCount;
      mMutex=mutex;
      mEffectEqualization48x=effectEqualization48x;
   }
   void ExitLoop() { // this will cause the thread to drop from the loops
      mExitLoop=true;
   }
   virtual void* Entry();
   BufferInfo* mBufferInfoList;
   int mBufferInfoCount, mThreadID;
   wxMutex *mMutex;
   EffectEqualization48x *mEffectEqualization48x;
   bool mExitLoop;
 };
 // Intrinsics (SSE/AVX) and threaded implementation of the equalization
 // effect. It works on behalf of an EffectEqualization instance, which is
 // handed to Process() or Benchmark().
 class EffectEqualization48x {
 public:
    EffectEqualization48x();
    virtual ~EffectEqualization48x();
    // CPU capability flags; see MathCaps.
    static MathCaps *GetMathCaps();
    // The math path is a combination of the MATH_FUNCTION_* bit flags.
    static void SetMathPath(int mathPath);
    static int GetMathPath();
    static void AddMathPathOption(int mathPath);
    static void RemoveMathPathOption(int mathPath);
    bool Process(EffectEqualization* effectEqualization);
    bool Benchmark(EffectEqualization* effectEqualization);
 private:
    bool TrackCompare();
    bool DeltaTrack(WaveTrack * t, WaveTrack * t2, sampleCount start, sampleCount len);
    bool AllocateBuffersWorkers(bool threaded);
    bool FreeBuffersWorkers();
    bool ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, sampleCount bufferLength);
    bool ProcessBuffer4x(BufferInfo *bufferInfo);
    bool ProcessOne4x(int count, WaveTrack * t, sampleCount start, sampleCount len);
    bool ProcessOne4xThreaded(int count, WaveTrack * t, sampleCount start, sampleCount len);
    // Pastes the processed 'output' back into the clips of 't'.
    bool ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len);
    // FFT, per-bin filter multiply and inverse FFT on four floats at a time.
    void Filter4x(sampleCount len, float *buffer, float *scratchBuffer);
    EffectEqualization* mEffectEqualization; // supplies the FFT handle and filter coefficients
    int mThreadCount;
    sampleCount mFilterSize;
    sampleCount mBlockSize;
    sampleCount mWindowSize;
    int mWorkerDataCount;   // number of entries in mBufferInfo
    int mBlocksPerBuffer;
    int mScratchBufferSize;
    int mSubBufferSize;
    float *mBigBuffer;
    BufferInfo* mBufferInfo;
    wxMutex mDataMutex;     // held by the reader loop while it fills mBufferInfo slots
    EQWorker* mEQWorkers;
    bool mThreaded;
    bool mBenching;
    // The class-key is required before C++11; a bare 'friend EQWorker;' is
    // only accepted by C++11 compilers or as a vendor extension.
    friend class EQWorker;
 };
 #endif
 #endif
--- a/src/prefs/EffectsPrefs.cpp
+++ b/src/prefs/EffectsPrefs.cpp
@ -123,6 +123,17 @@ void EffectsPrefs::PopulateOrExchange(ShuttleGui & S)
   }
   S.EndStatic();
 #endif
 #ifdef EXPERIMENTAL_EQ_SSE_THREADED
   S.StartStatic(_("Instruction Set"));
   {
      S.TieCheckBox(_("&Use SSE/SSE2/.../AVX"), 
                    wxT("/SSE/GUI"),
                    true);
   }
   S.EndStatic();
 #endif
 }
 bool EffectsPrefs::Apply()
--- a/win/Projects/Audacity/Audacity.vcproj
+++ b/win/Projects/Audacity/Audacity.vcproj
@ -634,6 +634,14 @@
 				RelativePath="..\..\..\src\RealFFTf.h"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\RealFFTf48x.cpp"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\RealFFTf48x.h"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\Resample.cpp"
 				>
@ -748,6 +756,14 @@
 				RelativePath="..\..\..\src\SplashDialog.h"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\SseMathFuncs.cpp"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\SseMathFuncs.h"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\Tags.cpp"
 				>
@ -996,6 +1012,14 @@
 				RelativePath="..\..\..\src\effects\Equalization.h"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\effects\Equalization48x.cpp"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\effects\Equalization48x.h"
 				>
 			</File>
 			<File
 				RelativePath="..\..\..\src\effects\Fade.cpp"
 				>