diff --git a/src/Experimental.h b/src/Experimental.h index 56a2d503b..083cb92b6 100644 --- a/src/Experimental.h +++ b/src/Experimental.h @@ -30,6 +30,10 @@ #ifndef __EXPERIMENTAL__ #define __EXPERIMENTAL__ +// ACH 08 Jan 2014 +// EQ accelerated code +//#define EXPERIMENTAL_EQ_SSE_THREADED + // LLL, 09 Nov 2013: // Allow all WASAPI devices, not just loopback #define EXPERIMENTAL_FULL_WASAPI diff --git a/src/RealFFTf.cpp b/src/RealFFTf.cpp index 4519a5acb..cb8438bea 100644 --- a/src/RealFFTf.cpp +++ b/src/RealFFTf.cpp @@ -1,54 +1,59 @@ /* - * Program: REALFFTF.C - * Author: Philip Van Baren - * Date: 2 September 1993 - * - * Description: These routines perform an FFT on real data to get a conjugate-symmetric - * output, and an inverse FFT on conjugate-symmetric input to get a real - * output sequence. - * - * This code is for floating point data. - * - * Modified 8/19/1998 by Philip Van Baren - * - made the InitializeFFT and EndFFT routines take a structure - * holding the length and pointers to the BitReversed and SinTable - * tables. - * Modified 5/23/2009 by Philip Van Baren - * - Added GetFFT and ReleaseFFT routines to retain common SinTable - * and BitReversed tables so they don't need to be reallocated - * and recomputed on every call. - * - Added Reorder* functions to undo the bit-reversal - * - * Copyright (C) 2009 Philip VanBaren - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ +* Program: REALFFTF.C +* Author: Philip Van Baren +* Date: 2 September 1993 +* +* Description: These routines perform an FFT on real data to get a conjugate-symmetric +* output, and an inverse FFT on conjugate-symmetric input to get a real +* output sequence. +* +* This code is for floating point data. +* +* Modified 8/19/1998 by Philip Van Baren +* - made the InitializeFFT and EndFFT routines take a structure +* holding the length and pointers to the BitReversed and SinTable +* tables. +* Modified 5/23/2009 by Philip Van Baren +* - Added GetFFT and ReleaseFFT routines to retain common SinTable +* and BitReversed tables so they don't need to be reallocated +* and recomputed on every call. +* - Added Reorder* functions to undo the bit-reversal +* +* Copyright (C) 2009 Philip VanBaren +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ #include #include #include +#include "Experimental.h" + #include "RealFFTf.h" +#ifdef EXPERIMENTAL_EQ_SSE_THREADED +#include "RealFFTf48x.h" +#endif #ifndef M_PI #define M_PI 3.14159265358979323846 /* pi */ #endif /* - * Initialize the Sine table and Twiddle pointers (bit-reversed pointers) - * for the FFT routine. - */ +* Initialize the Sine table and Twiddle pointers (bit-reversed pointers) +* for the FFT routine. +*/ HFFT InitializeFFT(int fftlen) { int i; @@ -62,10 +67,10 @@ HFFT InitializeFFT(int fftlen) exit(8); } /* - * FFT size is only half the number of data points - * The full FFT output can be reconstructed from this FFT's output. - * (This optimization can be made since the data is real.) - */ + * FFT size is only half the number of data points + * The full FFT output can be reconstructed from this FFT's output. + * (This optimization can be made since the data is real.) + */ h->Points = fftlen/2; if((h->SinTable=(fft_type *)malloc(2*h->Points*sizeof(fft_type)))==NULL) @@ -73,6 +78,7 @@ HFFT InitializeFFT(int fftlen) fprintf(stderr,"Error allocating memory for Sine table.\n"); exit(8); } + if((h->BitReversed=(int *)malloc(h->Points*sizeof(int)))==NULL) { fprintf(stderr,"Error allocating memory for BitReversed.\n"); @@ -86,19 +92,28 @@ HFFT InitializeFFT(int fftlen) temp=(temp >> 1) + (i&mask ? h->Points : 0); h->BitReversed[i]=temp; - } + } for(i=0;iPoints;i++) { h->SinTable[h->BitReversed[i] ]=(fft_type)-sin(2*M_PI*i/(2*h->Points)); h->SinTable[h->BitReversed[i]+1]=(fft_type)-cos(2*M_PI*i/(2*h->Points)); } + +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + // new SSE FFT routines work on live data + for(i=0;i<32;i++) + if((1<pow2Bits=i; + InitializeFFT1x(fftlen); +#endif + return h; } /* - * Free up the memory allotted for Sin table and Twiddle Pointers - */ +* Free up the memory allotted for Sin table and Twiddle Pointers +*/ void EndFFT(HFFT h) { if(h->Points>0) { @@ -157,23 +172,23 @@ void CleanupFFT() } /* - * Forward FFT routine. Must call InitializeFFT(fftlen) first! - * - * Note: Output is BIT-REVERSED! so you must use the BitReversed to - * get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ] - * Imag_i = buffer[ h->BitReversed[i]+1 ] ) - * Input is in normal order. - * - * Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin - * - this can be done because both values will always be real only - * - this allows us to not have to allocate an extra complex value for the Fs/2 bin - * - * Note: The scaling on this is done according to the standard FFT definition, - * so a unit amplitude DC signal will output an amplitude of (N) - * (Older revisions would progressively scale the input, so the output - * values would be similar in amplitude to the input values, which is - * good when using fixed point arithmetic) - */ +* Forward FFT routine. Must call InitializeFFT(fftlen) first! +* +* Note: Output is BIT-REVERSED! so you must use the BitReversed to +* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ] +* Imag_i = buffer[ h->BitReversed[i]+1 ] ) +* Input is in normal order. +* +* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin +* - this can be done because both values will always be real only +* - this allows us to not have to allocate an extra complex value for the Fs/2 bin +* +* Note: The scaling on this is done according to the standard FFT definition, +* so a unit amplitude DC signal will output an amplitude of (N) +* (Older revisions would progressively scale the input, so the output +* values would be similar in amplitude to the input values, which is +* good when using fixed point arithmetic) +*/ void RealFFTf(fft_type *buffer,HFFT h) { fft_type *A,*B; @@ -186,12 +201,12 @@ void RealFFTf(fft_type *buffer,HFFT h) int ButterfliesPerGroup=h->Points/2; /* - * Butterfly: - * Ain-----Aout - * \ / - * / \ - * Bin-----Bout - */ + * Butterfly: + * Ain-----Aout + * \ / + * / \ + * Bin-----Bout + */ endptr1=buffer+h->Points*2; @@ -258,24 +273,24 @@ void RealFFTf(fft_type *buffer,HFFT h) /* Description: This routine performs an inverse FFT to real data. - * This code is for floating point data. - * - * Note: Output is BIT-REVERSED! so you must use the BitReversed to - * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] - * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) - * Input is in normal order, interleaved (real,imaginary) complex data - * You must call InitializeFFT(fftlen) first to initialize some buffers! - * - * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin - * - this can be done because both values will always be real only - * - this allows us to not have to allocate an extra complex value for the Fs/2 bin - * - * Note: The scaling on this is done according to the standard FFT definition, - * so a unit amplitude DC signal will output an amplitude of (N) - * (Older revisions would progressively scale the input, so the output - * values would be similar in amplitude to the input values, which is - * good when using fixed point arithmetic) - */ +* This code is for floating point data. +* +* Note: Output is BIT-REVERSED! so you must use the BitReversed to +* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] +* wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) +* Input is in normal order, interleaved (real,imaginary) complex data +* You must call InitializeFFT(fftlen) first to initialize some buffers! +* +* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin +* - this can be done because both values will always be real only +* - this allows us to not have to allocate an extra complex value for the Fs/2 bin +* +* Note: The scaling on this is done according to the standard FFT definition, +* so a unit amplitude DC signal will output an amplitude of (N) +* (Older revisions would progressively scale the input, so the output +* values would be similar in amplitude to the input values, which is +* good when using fixed point arithmetic) +*/ void InverseRealFFTf(fft_type *buffer,HFFT h) { fft_type *A,*B; @@ -323,12 +338,12 @@ void InverseRealFFTf(fft_type *buffer,HFFT h) buffer[1]=v2; /* - * Butterfly: - * Ain-----Aout - * \ / - * / \ - * Bin-----Bout - */ + * Butterfly: + * Ain-----Aout + * \ / + * / \ + * Bin-----Bout + */ endptr1=buffer+h->Points*2; diff --git a/src/RealFFTf.h b/src/RealFFTf.h index f46119ca9..b3bacab7a 100644 --- a/src/RealFFTf.h +++ b/src/RealFFTf.h @@ -6,6 +6,9 @@ typedef struct FFTParamType { int *BitReversed; fft_type *SinTable; int Points; +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + int pow2Bits; +#endif } FFTParam; #define HFFT FFTParam * diff --git a/src/RealFFTf48x.cpp b/src/RealFFTf48x.cpp new file mode 100644 index 000000000..51e9649fe --- /dev/null +++ b/src/RealFFTf48x.cpp @@ -0,0 +1,754 @@ +/********************************************************************** + + Audacity: A Digital Audio Editor + + RealFFT48x.cpp + + Philip Van Baren + Andrew Hallendorff (SSE Mods) + +*******************************************************************//** + + \file RealFFT48x.cpp + \brief Real FFT with SSE acceleration. + +*//****************************************************************/ + +/* +* Program: REALFFTF.C +* Author: Philip Van Baren +* Date: 2 September 1993 +* +* Description: These routines perform an FFT on real data to get a conjugate-symmetric +* output, and an inverse FFT on conjugate-symmetric input to get a real +* output sequence. +* +* This code is for floating point data. +* +* Modified 8/19/1998 by Philip Van Baren +* - made the InitializeFFT and EndFFT routines take a structure +* holding the length and pointers to the BitReversed and SinTable +* tables. +* Modified 5/23/2009 by Philip Van Baren +* - Added GetFFT and ReleaseFFT routines to retain common SinTable +* and BitReversed tables so they don't need to be reallocated +* and recomputed on every call. +* - Added Reorder* functions to undo the bit-reversal +* +* Copyright (C) 2009 Philip VanBaren +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ +#include "Experimental.h" +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + + +#ifndef USE_SSE2 +#define USE_SSE2 +#endif + +#include +#include +#include +#include "RealFFTf.h" +#ifdef __WXMSW__ +#pragma warning(disable:4305) +#else + +#endif +#include "SseMathFuncs.h" +#include + + +#ifndef M_PI +#define M_PI 3.14159265358979323846 /* pi */ +#endif + +unsigned char smallReverseBitsTable[256]; + +int tableMask=0; +bool useBitReverseTable=false; +bool useSinCosTable=false; + +void TableUsage(int iMask) +{ + tableMask=iMask; + useBitReverseTable=((iMask & 1)!=0); + useSinCosTable=((iMask&2)!=0); +} + +// note !!! number of bits must be between 9-16 +int SmallReverseBits(int bits, int numberBits) +{ + return (smallReverseBitsTable[*((unsigned char *)&bits)]<<(numberBits-8))+(smallReverseBitsTable[*(((unsigned char *)&bits)+1)]>>(16-numberBits)); +} + + +/* +* Initialize the Sine table and Twiddle pointers (bit-reversed pointers) +* for the FFT routine. +*/ +HFFT InitializeFFT1x(int WXUNUSED( fftlen ) ) +{ + int i; + //int temp; + //int mask; + //HFFT h; + + + // this needs to move out but ehh... Andrew Hallendorff + for(i=0;i<256;i++) { + smallReverseBitsTable[i]=0; + for(int maskLow=1, maskHigh=128;maskLow<256;maskLow<<=1,maskHigh>>=1) + if(i&maskLow) + smallReverseBitsTable[i]|=maskHigh; + } + + return NULL; +} + +/* +* Free up the memory allotted for Sin table and Twiddle Pointers +*/ +void EndFFT1x(HFFT h) +{ + if(h->Points>0) { + free(h->BitReversed); + free(h->SinTable); + } + h->Points=0; + free(h); +} + +#define MAX_HFFT 10 +static HFFT hFFTArray[MAX_HFFT] = { NULL }; +static int nFFTLockCount[MAX_HFFT] = { 0 }; + +/* Get a handle to the FFT tables of the desired length */ +/* This version keeps common tables rather than allocating a new table every time */ +HFFT GetFFT1x(int fftlen) +{ + int h,n = fftlen/2; + for(h=0; (hPoints); h++); + if(hBitReversed[i] ] +* Imag_i = buffer[ h->BitReversed[i]+1 ] ) +* Input is in normal order. +* +* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin +* - this can be done because both values will always be real only +* - this allows us to not have to allocate an extra complex value for the Fs/2 bin +* +* Note: The scaling on this is done according to the standard FFT definition, +* so a unit amplitude DC signal will output an amplitude of (N) +* (Older revisions would progressively scale the input, so the output +* values would be similar in amplitude to the input values, which is +* good when using fixed point arithmetic) +*/ +void RealFFTf1x(fft_type *buffer,HFFT h) +{ + fft_type *A,*B; + fft_type *sptr; + fft_type *endptr1,*endptr2; + int *br1,*br2; + fft_type HRplus,HRminus,HIplus,HIminus; + fft_type v1,v2,sin,cos; + + int ButterfliesPerGroup=h->Points/2; + + /* + * Butterfly: + * Ain-----Aout + * \ / + * / \ + * Bin-----Bout + */ + + endptr1=buffer+h->Points*2; + + while(ButterfliesPerGroup>0) + { + A=buffer; + B=buffer+ButterfliesPerGroup*2; + sptr=h->SinTable; + + while(A>= 1; + } + /* Massage output to get the output for a real input sequence. */ + br1=h->BitReversed+1; + br2=h->BitReversed+h->Points-1; + + while(br1SinTable[*br1]; + cos=h->SinTable[*br1+1]; + A=buffer+*br1; + B=buffer+*br2; + HRplus = (HRminus = *A - *B ) + (*B * 2); + HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); + v1 = (sin*HRminus - cos*HIplus); + v2 = (cos*HRminus + sin*HIplus); + *A = (HRplus + v1) * (fft_type)0.5; + *B = *A - v1; + *(A+1) = (HIminus + v2) * (fft_type)0.5; + *(B+1) = *(A+1) - HIminus; + + br1++; + br2--; + } + /* Handle the center bin (just need a conjugate) */ + A=buffer+*br1+1; + *A=-*A; + /* Handle DC bin separately - and ignore the Fs/2 bin + buffer[0]+=buffer[1]; + buffer[1]=(fft_type)0;*/ + /* Handle DC and Fs/2 bins separately */ + /* Put the Fs/2 value into the imaginary part of the DC bin */ + v1=buffer[0]-buffer[1]; + buffer[0]+=buffer[1]; + buffer[1]=v1; +} + + +/* Description: This routine performs an inverse FFT to real data. +* This code is for floating point data. +* +* Note: Output is BIT-REVERSED! so you must use the BitReversed to +* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] +* wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) +* Input is in normal order, interleaved (real,imaginary) complex data +* You must call InitializeFFT(fftlen) first to initialize some buffers! +* +* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin +* - this can be done because both values will always be real only +* - this allows us to not have to allocate an extra complex value for the Fs/2 bin +* +* Note: The scaling on this is done according to the standard FFT definition, +* so a unit amplitude DC signal will output an amplitude of (N) +* (Older revisions would progressively scale the input, so the output +* values would be similar in amplitude to the input values, which is +* good when using fixed point arithmetic) +*/ +void InverseRealFFTf1x(fft_type *buffer,HFFT h) +{ + fft_type *A,*B; + fft_type *sptr; + fft_type *endptr1,*endptr2; + int *br1; + fft_type HRplus,HRminus,HIplus,HIminus; + fft_type v1,v2,sin,cos; + + int ButterfliesPerGroup=h->Points/2; + + /* Massage input to get the input for a real output sequence. */ + A=buffer+2; + B=buffer+h->Points*2-2; + br1=h->BitReversed+1; + while(ASinTable[*br1]; + cos=h->SinTable[*br1+1]; + HRplus = (HRminus = *A - *B ) + (*B * 2); + HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); + v1 = (sin*HRminus + cos*HIplus); + v2 = (cos*HRminus - sin*HIplus); + *A = (HRplus + v1) * (fft_type)0.5; + *B = *A - v1; + *(A+1) = (HIminus - v2) * (fft_type)0.5; + *(B+1) = *(A+1) - HIminus; + + A+=2; + B-=2; + br1++; + } + /* Handle center bin (just need conjugate) */ + *(A+1)=-*(A+1); + /* Handle DC bin separately - this ignores any Fs/2 component + buffer[1]=buffer[0]=buffer[0]/2;*/ + /* Handle DC and Fs/2 bins specially */ + /* The DC bin is passed in as the real part of the DC complex value */ + /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ + /* (v1+v2) = buffer[0] == the DC component */ + /* (v1-v2) = buffer[1] == the Fs/2 component */ + v1=0.5f*(buffer[0]+buffer[1]); + v2=0.5f*(buffer[0]-buffer[1]); + buffer[0]=v1; + buffer[1]=v2; + + /* + * Butterfly: + * Ain-----Aout + * \ / + * / \ + * Bin-----Bout + */ + + endptr1=buffer+h->Points*2; + + while(ButterfliesPerGroup>0) + { + A=buffer; + B=buffer+ButterfliesPerGroup*2; + sptr=h->SinTable; + + while(A>= 1; + } +} + +void ReorderToFreq1x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) +{ + // Copy the data into the real and imaginary outputs + for(int i=1;iPoints;i++) { + RealOut[i]=buffer[hFFT->BitReversed[i] ]; + ImagOut[i]=buffer[hFFT->BitReversed[i]+1]; + } + RealOut[0] = buffer[0]; // DC component + ImagOut[0] = 0; + RealOut[hFFT->Points] = buffer[1]; // Fs/2 component + ImagOut[hFFT->Points] = 0; +} + +void ReorderToTime1x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut) +{ + // Copy the data into the real outputs + for(int i=0;iPoints;i++) { + TimeOut[i*2 ]=buffer[hFFT->BitReversed[i] ]; + TimeOut[i*2+1]=buffer[hFFT->BitReversed[i]+1]; + } +} + + +// 4x processing simd + +void RealFFTf4x(fft_type *buffer,HFFT h) +{ + + __m128 *localBuffer=(__m128 *)buffer; + + __m128 *A,*B; + fft_type *sptr; + __m128 *endptr1,*endptr2; + int br1Index, br2Index; + int br1Value, br2Value; + __m128 HRplus,HRminus,HIplus,HIminus; + __m128 v1,v2,sin,cos; + fft_type iToRad=2*M_PI/(2*h->Points); + + int ButterfliesPerGroup=h->Points/2; + + /* + * Butterfly: + * Ain-----Aout + * \ / + * / \ + * Bin-----Bout + */ + + endptr1=&localBuffer[h->Points*2]; + + while(ButterfliesPerGroup>0) + { + A=localBuffer; + B=&localBuffer[ButterfliesPerGroup*2]; + sptr=h->SinTable; + int iSinCosIndex=0; + int iSinCosCalIndex=0; + while(Apow2Bits-1))*iToRad; + sincos_ps(&vx, &sin4_2, &cos4_2); + sin=_mm_set1_ps(-sin4_2.m128_f32[0]); + cos=_mm_set1_ps(-cos4_2.m128_f32[0]); + iSinCosCalIndex++; + } else { + sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]); + cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]); + if(iSinCosCalIndex==3) + iSinCosCalIndex=0; + else + iSinCosCalIndex++; + } + iSinCosIndex++; + } + endptr2=B; + while(A>= 1; + } + /* Massage output to get the output for a real input sequence. */ + + br1Index=1; // h->BitReversed+1; + br2Index=h->Points-1; //h->BitReversed+h->Points-1; + + int iSinCosCalIndex=0; + while(br1IndexBitReversed[br1Index]; + br2Value=h->BitReversed[br2Index]; + } else { + br1Value=SmallReverseBits(br1Index,h->pow2Bits); + br2Value=SmallReverseBits(br2Index,h->pow2Bits); + } + if(useSinCosTable) { + sin=_mm_set1_ps(h->SinTable[br1Value]); + cos=_mm_set1_ps(h->SinTable[br1Value+1]); + } else { + if(!iSinCosCalIndex) + { + v4sfu vx; + for(int i=0;i<4;i++) + vx.m128_f32[i]=((float)(br1Index+i))*iToRad; + sincos_ps(&vx, &sin4_2, &cos4_2); + sin=_mm_set1_ps(-sin4_2.m128_f32[0]); + cos=_mm_set1_ps(-cos4_2.m128_f32[0]); + iSinCosCalIndex++; + } else { + sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]); + cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]); + if(iSinCosCalIndex==3) + iSinCosCalIndex=0; + else + iSinCosCalIndex++; + } + } + + A=&localBuffer[br1Value]; + B=&localBuffer[br2Value]; + __m128 temp128 = _mm_set1_ps( 2.0); + HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128)); + HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128)); + v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); + v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); + temp128 = _mm_set1_ps( 0.5); + *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128); + *B = _mm_sub_ps(*A, v1); + *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128); + *(B+1) = _mm_sub_ps(*(A+1), HIminus); + + br1Index++; + br2Index--; + } + /* Handle the center bin (just need a conjugate) */ + if(useBitReverseTable) + A=&localBuffer[h->BitReversed[br1Index]+1]; + else + A=&localBuffer[SmallReverseBits(br1Index,h->pow2Bits)+1]; + // negate sse style + *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f)); + /* Handle DC and Fs/2 bins separately */ + /* Put the Fs/2 value into the imaginary part of the DC bin */ + v1=_mm_sub_ps(localBuffer[0], localBuffer[1]); + localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]); + localBuffer[1]=v1; +} + + +/* Description: This routine performs an inverse FFT to real data. +* This code is for floating point data. +* +* Note: Output is BIT-REVERSED! so you must use the BitReversed to +* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] +* wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) +* Input is in normal order, interleaved (real,imaginary) complex data +* You must call InitializeFFT(fftlen) first to initialize some buffers! +* +* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin +* - this can be done because both values will always be real only +* - this allows us to not have to allocate an extra complex value for the Fs/2 bin +* +* Note: The scaling on this is done according to the standard FFT definition, +* so a unit amplitude DC signal will output an amplitude of (N) +* (Older revisions would progressively scale the input, so the output +* values would be similar in amplitude to the input values, which is +* good when using fixed point arithmetic) +*/ +void InverseRealFFTf4x(fft_type *buffer,HFFT h) +{ + + __m128 *localBuffer=(__m128 *)buffer; + + __m128 *A,*B; + fft_type *sptr; + __m128 *endptr1,*endptr2; + int br1Index, br1Value; + __m128 HRplus,HRminus,HIplus,HIminus; + __m128 v1,v2,sin,cos; + fft_type iToRad=2*M_PI/(2*h->Points); + + + int ButterfliesPerGroup=h->Points/2; + + /* Massage input to get the input for a real output sequence. */ + A=localBuffer+2; + B=localBuffer+h->Points*2-2; + br1Index=1; //h->BitReversed+1; + int iSinCosCalIndex=0; + while(ABitReversed[br1Index]; + } else { + br1Value=SmallReverseBits(br1Index,h->pow2Bits); + } + if(useSinCosTable) { + sin=_mm_set1_ps(h->SinTable[br1Value]); + cos=_mm_set1_ps(h->SinTable[br1Value+1]); + } else { + if(!iSinCosCalIndex) + { + v4sfu vx; + for(int i=0;i<4;i++) + vx.m128_f32[i]=((float)(br1Index+i))*iToRad; + sincos_ps(&vx, &sin4_2, &cos4_2); + sin=_mm_set1_ps(-sin4_2.m128_f32[0]); + cos=_mm_set1_ps(-cos4_2.m128_f32[0]); + iSinCosCalIndex++; + } else { + sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]); + cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]); + if(iSinCosCalIndex==3) + iSinCosCalIndex=0; + else + iSinCosCalIndex++; + } + } + HRminus = _mm_sub_ps(*A, *B); + HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0))); + HIminus = _mm_sub_ps( *(A+1), *(B+1)); + HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0))); + v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); + v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); + *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5)); + *B = _mm_sub_ps(*A, v1); + *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5)); + *(B+1) = _mm_sub_ps(*(A+1), HIminus); + + A=&A[2]; + B=&B[-2]; + br1Index++; + } + /* Handle center bin (just need conjugate) */ + // negate sse style + *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f)); + + /* Handle DC bin separately - this ignores any Fs/2 component + buffer[1]=buffer[0]=buffer[0]/2;*/ + /* Handle DC and Fs/2 bins specially */ + /* The DC bin is passed in as the real part of the DC complex value */ + /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ + /* (v1+v2) = buffer[0] == the DC component */ + /* (v1-v2) = buffer[1] == the Fs/2 component */ + v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1])); + v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1])); + localBuffer[0]=v1; + localBuffer[1]=v2; + + /* + * Butterfly: + * Ain-----Aout + * \ / + * / \ + * Bin-----Bout + */ + + endptr1=localBuffer+h->Points*2; + + while(ButterfliesPerGroup>0) + { + A=localBuffer; + B=localBuffer+ButterfliesPerGroup*2; + sptr=h->SinTable; + int iSinCosIndex=0; + int iSinCosCalIndex=0; + while(Apow2Bits-1))*iToRad; + sincos_ps(&vx, &sin4_2, &cos4_2); + sin=_mm_set1_ps(-sin4_2.m128_f32[0]); + cos=_mm_set1_ps(-cos4_2.m128_f32[0]); + iSinCosCalIndex++; + } else { + sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]); + cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]); + if(iSinCosCalIndex==3) + iSinCosCalIndex=0; + else + iSinCosCalIndex++; + } + iSinCosIndex++; + } + endptr2=B; + while(A>= 1; + } +} + +void ReorderToFreq4x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) +{ + __m128 *localBuffer=(__m128 *)buffer; + __m128 *localRealOut=(__m128 *)RealOut; + __m128 *localImagOut=(__m128 *)ImagOut; + + // Copy the data into the real and imaginary outputs + for(int i=1;iPoints;i++) { + int brValue; + if(useBitReverseTable) + brValue=hFFT->BitReversed[i]; + else + brValue=SmallReverseBits(i,hFFT->pow2Bits); + localRealOut[i]=localBuffer[brValue ]; + localImagOut[i]=localBuffer[brValue+1]; + } + localRealOut[0] = localBuffer[0]; // DC component + localImagOut[0] = _mm_set1_ps(0.0); + localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component + localImagOut[hFFT->Points] = _mm_set1_ps(0.0); +} + +void ReorderToTime4x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut) +{ + __m128 *localBuffer=(__m128 *)buffer; + __m128 *localTimeOut=(__m128 *)TimeOut; + // Copy the data into the real outputs + for(int i=0;iPoints;i++) { + int brValue; + if(useBitReverseTable) + brValue=hFFT->BitReversed[i]; + else + brValue=SmallReverseBits(i,hFFT->pow2Bits); + localTimeOut[i*2 ]=localBuffer[brValue ]; + localTimeOut[i*2+1]=localBuffer[brValue+1]; + } +} + +#endif \ No newline at end of file diff --git a/src/RealFFTf48x.h b/src/RealFFTf48x.h new file mode 100644 index 000000000..2bdad1d38 --- /dev/null +++ b/src/RealFFTf48x.h @@ -0,0 +1,23 @@ +#ifndef __realfftf48x_h +#define __realfftf48x_h + +#define fft_type float + +HFFT InitializeFFT1x(int); +void EndFFT1x(HFFT); +HFFT GetFFT1x(int); +void ReleaseFFT1x(HFFT); +void CleanupFFT1x(); +void RealFFTf1x(fft_type *,HFFT); +void InverseRealFFTf1x(fft_type *,HFFT); +void ReorderToTime1x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut); +void ReorderToFreq1x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut); +int SmallReverseBits(int bits, int numberBits); +void RealFFTf4x(fft_type *,HFFT); +void InverseRealFFTf4x(fft_type *,HFFT); +void ReorderToTime4x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut); +void ReorderToFreq4x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut); +void TableUsage(int iMask); + +#endif + diff --git a/src/SseMathFuncs.cpp b/src/SseMathFuncs.cpp new file mode 100644 index 000000000..0530106f6 --- /dev/null +++ b/src/SseMathFuncs.cpp @@ -0,0 +1,698 @@ +/********************************************************************** + + Audacity: A Digital Audio Editor + + SseMathFuncs.cpp + + Stephen Moshier (wrote original C version, The Cephes Library) + Julien Pommier (converted to use SSE) + Andrew Hallendorff (adapted for Audacity) + +*******************************************************************//** + + \file SseMathfuncs.cpp + \brief SSE maths functions (for FFTs) + +*//****************************************************************/ + +#include "SseMathFuncs.h" + + + +/* JKC: The trig functions use Taylor's series, on the range 0 to Pi/4 + * computing both Sin and Cos, and using one or the other (in the + * solo functions), or both in the more useful for us for FFTs sincos + * function. + * The constants minus_cephes_DP1 to minus_cephes_DP3 are used in the + * angle reduction modulo function. + * 4 sincos are done at a time. + * If we wanted to do just sin or just cos, we could speed things up + * by queuing up the Sines and Cosines into batches of 4 separately.*/ + + + +#ifndef USE_SSE2 //sry this is all sse2 now +#define USE_SSE2 +#endif + +/* declare some SSE constants -- why can't I figure a better way to do that? */ +#define _PS_CONST(Name, Val) \ + static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { (float)Val, (float)Val, (float)Val, (float)Val } +#define _PI32_CONST(Name, Val) \ + static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val } +#define _PS_CONST_TYPE(Name, Type, Val) \ + static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val } + +_PS_CONST(1 , 1.0f); +_PS_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST(1, 1); +_PI32_CONST(inv1, ~1); +_PI32_CONST(2, 2); +_PI32_CONST(4, 4); +_PI32_CONST(0x7f, 0x7f); + +_PS_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS_CONST(cephes_log_p0, 7.0376836292E-2); +_PS_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS_CONST(cephes_log_p2, 1.1676998740E-1); +_PS_CONST(cephes_log_p3, - 1.2420140846E-1); +_PS_CONST(cephes_log_p4, + 1.4249322787E-1); +_PS_CONST(cephes_log_p5, - 1.6668057665E-1); +_PS_CONST(cephes_log_p6, + 2.0000714765E-1); +_PS_CONST(cephes_log_p7, - 2.4999993993E-1); +_PS_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS_CONST(cephes_log_q1, -2.12194440e-4); +_PS_CONST(cephes_log_q2, 0.693359375); + +#ifndef USE_SSE2 +typedef union xmm_mm_union { + __m128 xmm; + __m64 mm[2]; +} xmm_mm_union; + +#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \ + xmm_mm_union u; u.xmm = xmm_; \ + mm0_ = u.mm[0]; \ + mm1_ = u.mm[1]; \ +} + +#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \ + xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \ +} + +#endif // USE_SSE2 + +/* natural logarithm computed for 4 simultaneous float +return NaN for x <= 0 +*/ +__m128 log_ps(v4sfu *xPtr) { + __m128 x=*((__m128 *)xPtr); +#ifdef USE_SSE2 + __m128i emm0; +#else + __m64 mm0, mm1; +#endif + __m128 one = *(__m128*)_ps_1; + + __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); + + x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos); /* cut off denormalized stuff */ + +#ifndef USE_SSE2 + /* part 1: x = frexpf(x, &e); */ + COPY_XMM_TO_MM(x, mm0, mm1); + mm0 = _mm_srli_pi32(mm0, 23); + mm1 = _mm_srli_pi32(mm1, 23); +#else + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); +#endif + /* keep only the fractional part */ + x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask); + x = _mm_or_ps(x, *(__m128*)_ps_0p5); + +#ifndef USE_SSE2 + /* now e=mm0:mm1 contain the really base-2 exponent */ + mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f); + mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f); + __m128 e = _mm_cvtpi32x2_ps(mm0, mm1); + _mm_empty(); /* bye bye mmx */ +#else + emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f); + __m128 e = _mm_cvtepi32_ps(emm0); +#endif + + e = _mm_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF); + __m128 tmp = _mm_and_ps(x, mask); + x = _mm_sub_ps(x, one); + e = _mm_sub_ps(e, _mm_and_ps(one, mask)); + x = _mm_add_ps(x, tmp); + + + __m128 z = _mm_mul_ps(x,x); + + __m128 y = *(__m128*)_ps_cephes_log_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8); + y = _mm_mul_ps(y, x); + + y = _mm_mul_ps(y, z); + + + tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1); + y = _mm_add_ps(y, tmp); + + + tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); + y = _mm_sub_ps(y, tmp); + + tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2); + x = _mm_add_ps(x, y); + x = _mm_add_ps(x, tmp); + x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS_CONST(exp_hi, 88.3762626647949f); +_PS_CONST(exp_lo, -88.3762626647949f); + +_PS_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS_CONST(cephes_exp_C1, 0.693359375); +_PS_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS_CONST(cephes_exp_p5, 5.0000001201E-1); + +__m128 exp_ps(v4sfu *xPtr) { + __m128 x=*((__m128 *)xPtr); + __m128 tmp = _mm_setzero_ps(), fx; +#ifdef USE_SSE2 + __m128i emm0; +#else + __m64 mm0, mm1; +#endif + __m128 one = *(__m128*)_ps_1; + + x = _mm_min_ps(x, *(__m128*)_ps_exp_hi); + x = _mm_max_ps(x, *(__m128*)_ps_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF); + fx = _mm_add_ps(fx, *(__m128*)_ps_0p5); + + /* how to perform a floorf with SSE: just below */ +#ifndef USE_SSE2 + /* step 1 : cast to int */ + tmp = _mm_movehl_ps(tmp, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(tmp); + /* step 2 : cast back to float */ + tmp = _mm_cvtpi32x2_ps(mm0, mm1); +#else + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); +#endif + /* if greater, substract 1 */ + __m128 mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, one); + fx = _mm_sub_ps(tmp, mask); + + tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1); + __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2); + x = _mm_sub_ps(x, tmp); + x = _mm_sub_ps(x, z); + + z = _mm_mul_ps(x,x); + + __m128 y = *(__m128*)_ps_cephes_exp_p0; + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4); + y = _mm_mul_ps(y, x); + y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, x); + y = _mm_add_ps(y, one); + + /* build 2^n */ +#ifndef USE_SSE2 + z = _mm_movehl_ps(z, fx); + mm0 = _mm_cvttps_pi32(fx); + mm1 = _mm_cvttps_pi32(z); + mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f); + mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f); + mm0 = _mm_slli_pi32(mm0, 23); + mm1 = _mm_slli_pi32(mm1, 23); + + __m128 pow2n; + COPY_MM_TO_XMM(mm0, mm1, pow2n); + _mm_empty(); +#else + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + __m128 pow2n = _mm_castsi128_ps(emm0); +#endif + y = _mm_mul_ps(y, pow2n); + return y; +} + +_PS_CONST(minus_cephes_DP1, -0.78515625); +_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS_CONST(sincof_p0, -1.9515295891E-4); +_PS_CONST(sincof_p1, 8.3321608736E-3); +_PS_CONST(sincof_p2, -1.6666654611E-1); +_PS_CONST(coscof_p0, 2.443315711809948E-005); +_PS_CONST(coscof_p1, -1.388731625493765E-003); +_PS_CONST(coscof_p2, 4.166664568298827E-002); +_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so +it runs also on old athlons XPs and the pentium III of your grand +mother. + +The code is the exact rewriting of the cephes sinf function. +Precision is excellent as long as x < 8192 (I did not bother to +take into account the special handling they have for greater values +-- it does not return garbage for arguments over 8192, though, but +the extra precision is missing). + +Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the +surprising but correct result. + +Performance is also surprisingly good, 1.33 times faster than the +macos vsinf SSE2 function, and 1.5 times faster than the +__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not +too bad for an SSE1 function (with no special tuning) ! +However the latter libraries probably have a much better handling of NaN, +Inf, denormalized and other special arguments.. + +On my core 1 duo, the execution of this function takes approximately 95 cycles. + +From what I have observed on the experiments with Intel AMath lib, switching to an +SSE2 version would improve the perf by only 10%. + +Since it is based on SSE intrinsics, it has to be compiled at -O2 to +deliver full speed. +*/ +__m128 sin_ps(v4sfu *xPtr) { // any x + __m128 x=*((__m128 *)xPtr); + __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y; + +#ifdef USE_SSE2 + __m128i emm0, emm2; +#else + __m64 mm0, mm1, mm2, mm3; +#endif + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, *(__m128*)_ps_sign_mask); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); + +#ifdef USE_SSE2 + /* store the integer part of y in mm0 */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); + y = _mm_cvtepi32_ps(emm2); + + /* get the swap sign flag */ + emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); + emm0 = _mm_slli_epi32(emm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 +#include + +/* yes I know, the top of this file is quite ugly */ + +#ifdef _MSC_VER /* visual c++ */ +# define ALIGN16_BEG __declspec(align(16)) +# define ALIGN16_END +#else /* gcc or icc */ +# define ALIGN16_BEG +# define ALIGN16_END __attribute__((aligned(16))) +#endif + +/* __m128 is ugly to write */ +//typedef __m128 _v4sfu; // vector of 4 float (sse1) + +#ifndef USE_SSE2 //sry this is all sse2 now +#define USE_SSE2 +#endif + +#ifdef USE_SSE2 +# include +#else +typedef __m64 v2si; // vector of 2 int (mmx) +#endif + +// !!! Andrew Hallendorff Warning changed call structure to make compatible with gcc + +typedef ALIGN16_BEG union { + float m128_f32[4]; + int8_t m128_i8[16]; + int16_t m128_i16[8]; + int32_t m128_i32[4]; + int64_t m128_i64[2]; + uint8_t m128_u8[16]; + uint16_t m128_u16[8]; + uint32_t m128_u32[4]; + uint64_t m128_u64[2]; + } ALIGN16_END v4sfu; + +__m128 log_ps(v4sfu *xPtr); +__m128 sin_ps(v4sfu *xPtr); +void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr); + + + +#endif \ No newline at end of file diff --git a/src/effects/Equalization.cpp b/src/effects/Equalization.cpp index 1c0d0ff97..8a55f8180 100644 --- a/src/effects/Equalization.cpp +++ b/src/effects/Equalization.cpp @@ -1,62 +1,64 @@ /********************************************************************** - Audacity: A Digital Audio Editor + Audacity: A Digital Audio Editor - EffectEqualization.cpp + EffectEqualization.cpp - Mitch Golden - Vaughan Johnson (Preview) - Martyn Shaw (FIR filters, response curve, graphic EQ) + Mitch Golden + Vaughan Johnson (Preview) + Martyn Shaw (FIR filters, response curve, graphic EQ) *******************************************************************//** -\file Equalization.cpp -\brief Implements EffectEqualiztaion, EqualizationDialog, -EqualizationPanel, EQCurve and EQPoint. + \file Equalization.cpp + \brief Implements EffectEqualiztaion, EqualizationDialog, + EqualizationPanel, EQCurve and EQPoint. *//****************************************************************//** -\class EffectEqualization -\brief An Effect. + \class EffectEqualization + \brief An Effect. - Performs filtering, using an FFT to do a FIR filter. - It lets the user draw an arbitrary envelope (using the same - envelope editing code that is used to edit the track's - amplitude envelope). + Performs filtering, using an FFT to do a FIR filter. + It lets the user draw an arbitrary envelope (using the same + envelope editing code that is used to edit the track's + amplitude envelope). - Also allows the curve to be specified with a series of 'graphic EQ' - sliders. + Also allows the curve to be specified with a series of 'graphic EQ' + sliders. - The filter is applied using overlap/add of Hanning windows. + The filter is applied using overlap/add of Hanning windows. - Clone of the FFT Filter effect, no longer part of Audacity. + Clone of the FFT Filter effect, no longer part of Audacity. *//****************************************************************//** -\class EqualizationDialog -\brief Dialog used with EffectEqualization + \class EqualizationDialog + \brief Dialog used with EffectEqualization *//****************************************************************//** -\class EqualizationPanel -\brief EqualizationPanel is used with EqualizationDialog and controls -a graph for EffectEqualization. We should look at amalgamating the -various graphing code, such as provided by FreqWindow and FilterPanel. + \class EqualizationPanel + \brief EqualizationPanel is used with EqualizationDialog and controls + a graph for EffectEqualization. We should look at amalgamating the + various graphing code, such as provided by FreqWindow and FilterPanel. *//****************************************************************//** -\class EQCurve -\brief EQCurve is used with EffectEqualization. + \class EQCurve + \brief EQCurve is used with EffectEqualization. *//****************************************************************//** -\class EQPoint -\brief EQPoint is used with EQCurve and hence EffectEqualization. + \class EQPoint + \brief EQPoint is used with EQCurve and hence EffectEqualization. *//*******************************************************************/ + #include "../Audacity.h" +#include "../Experimental.h" #include "Equalization.h" #include "../AColor.h" #include "../ShuttleGui.h" @@ -97,6 +99,10 @@ various graphing code, such as provided by FreqWindow and FilterPanel. #include #include +#ifdef EXPERIMENTAL_EQ_SSE_THREADED +#include "Equalization48x.h" +#endif + #if wxUSE_TOOLTIPS #include #endif @@ -123,33 +129,33 @@ const double EqualizationDialog::thirdOct[] = // Leaving here for now...just disabling #ifdef __WXGTK__disabled /* - * wxSlider exhibits strange behaviour on wxGTK when wxSL_INVERSE and/or - * negative scale values are used. This affects at least SUSE 9.3 with - * self-compiled wxWidgets 2.6.2 and Kubuntu 6.06. This class is used - * to transparently work around those problems. Use wxSliderBugfix wherever - * you would have used a slider with wxSL_INVERSE and/or negative scale - * values. - * - * The equalization class uses wxSliderBugfix everywhere even when not - * strictly needed. This is important because we override some wxSlider - * functions which are not virtual, and so we need to cast to wxSliderBugfix - * instead of to wxSlider to allow the right function to be called statically. - */ +* wxSlider exhibits strange behaviour on wxGTK when wxSL_INVERSE and/or +* negative scale values are used. This affects at least SUSE 9.3 with +* self-compiled wxWidgets 2.6.2 and Kubuntu 6.06. This class is used +* to transparently work around those problems. Use wxSliderBugfix wherever +* you would have used a slider with wxSL_INVERSE and/or negative scale +* values. +* +* The equalization class uses wxSliderBugfix everywhere even when not +* strictly needed. This is important because we override some wxSlider +* functions which are not virtual, and so we need to cast to wxSliderBugfix +* instead of to wxSlider to allow the right function to be called statically. +*/ class wxSliderBugfix : public wxSlider { public: wxSliderBugfix(wxWindow* parent, wxWindowID id, int value, - int minValue, int maxValue, - const wxPoint& point = wxDefaultPosition, - const wxSize& size = wxDefaultSize, - long style = wxSL_HORIZONTAL, - const wxValidator& validator = wxDefaultValidator, - const wxString& name = wxT("slider")) : - wxSlider(parent, id, 0, 0, maxValue - minValue, point, - size, style & ~wxSL_INVERSE, validator, name) { - isInverse = (style & wxSL_INVERSE) != 0; - originalMinValue = minValue; - SetValue(value); + int minValue, int maxValue, + const wxPoint& point = wxDefaultPosition, + const wxSize& size = wxDefaultSize, + long style = wxSL_HORIZONTAL, + const wxValidator& validator = wxDefaultValidator, + const wxString& name = wxT("slider")) : + wxSlider(parent, id, 0, 0, maxValue - minValue, point, + size, style & ~wxSL_INVERSE, validator, name) { + isInverse = (style & wxSL_INVERSE) != 0; + originalMinValue = minValue; + SetValue(value); } int GetMin() const { @@ -182,8 +188,8 @@ private: #else /* - * On platforms other than wxGTK, the workaround is not needed - */ +* On platforms other than wxGTK, the workaround is not needed +*/ #define wxSliderBugfix wxSlider #endif // ifdef __WXGTK__ @@ -195,7 +201,7 @@ void EffectEqualization::ReadPrefs() { gPrefs->Read(wxT("/Effects/Equalization/FilterLength"), - &mM, 4001); + &mM, 4001); if ((mM < 21) || (mM > 8191)) { // corrupted Prefs? mM = 4001; //default gPrefs->Write(wxT("/Effects/Equalization/FilterLength"), mM); @@ -216,6 +222,18 @@ void EffectEqualization::ReadPrefs() gPrefs->Read(wxT("/Effects/Equalization/Interp"), &mInterp, 0); gPrefs->Read(wxT("/Effects/Equalization/DrawGrid"), &mDrawGrid, true); +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + bool useSSE; + gPrefs->Read(wxT("SSE/GUI"), &useSSE); + if(useSSE && !mEffectEqualization48x) + mEffectEqualization48x=new EffectEqualization48x; + else + if(!useSSE && mEffectEqualization48x) { + delete mEffectEqualization48x; + mEffectEqualization48x=NULL; + } + mBench=false; +#endif gPrefs->Flush(); } @@ -225,6 +243,11 @@ EffectEqualization::EffectEqualization() mFFTBuffer = new float[windowSize]; mFilterFuncR = new float[windowSize]; mFilterFuncI = new float[windowSize]; + +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + mEffectEqualization48x=NULL; +#endif + ReadPrefs(); mPrompting = false; @@ -250,6 +273,10 @@ EffectEqualization::~EffectEqualization() delete[] mFilterFuncI; mFilterFuncR = NULL; mFilterFuncI = NULL; +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + if(mEffectEqualization48x) + delete mEffectEqualization48x; +#endif } bool EffectEqualization::Init() @@ -296,7 +323,7 @@ bool EffectEqualization::PromptUser() EqualizationDialog dlog(this, ((double)loFreqI), hiFreq, mFilterFuncR, mFilterFuncI, - windowSize, mCurveName, mEditingBatchParams, mParent, -1, _("Equalization")); + windowSize, mCurveName, mEditingBatchParams, mParent, -1, _("Equalization")); dlog.M = mM; dlog.curveName = mCurveName; @@ -351,7 +378,7 @@ bool EffectEqualization::DontPromptUser() hiFreq = ((float)(GetActiveProject()->GetRate())/2.); EqualizationDialog dlog(this, ((double)loFreqI), hiFreq, mFilterFuncR, mFilterFuncI, - windowSize, mCurveName, false, NULL, -1, _("Equalization")); + windowSize, mCurveName, false, NULL, -1, _("Equalization")); dlog.M = mM; dlog.curveName = mCurveName; @@ -378,6 +405,14 @@ bool EffectEqualization::TransferParameters( Shuttle & shuttle ) bool EffectEqualization::Process() { +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + if(mEffectEqualization48x) + if(mBench) { + mBench=false; + return mEffectEqualization48x->Benchmark(this); + } else + return mEffectEqualization48x->Process(this); +#endif this->CopyInputTracks(); // Set up mOutputTracks. bool bGoodResult = true; @@ -412,7 +447,7 @@ bool EffectEqualization::Process() bool EffectEqualization::ProcessOne(int count, WaveTrack * t, - sampleCount start, sampleCount len) + sampleCount start, sampleCount len) { // create a new WaveTrack to hold all of the output, including 'tails' each end AudacityProject *p = GetActiveProject(); @@ -511,12 +546,12 @@ bool EffectEqualization::ProcessOne(int count, WaveTrack * t, // 'startT' is the equivalent time value // 'output' starts at zero double startT = t->LongSamplesToTime(start); - + //output has one waveclip for the total length, even though //t might have whitespace seperating multiple clips //we want to maintain the original clip structure, so //only paste the intersections of the new clip. - + //Find the bits of clips that need replacing std::vector > clipStartEndTimes; std::vector > clipRealStartEndTimes; //the above may be truncated due to a clip being partially selected @@ -525,7 +560,7 @@ bool EffectEqualization::ProcessOne(int count, WaveTrack * t, WaveClip *clip; double clipStartT; double clipEndT; - + clip = it->GetData(); clipStartT = clip->GetStartTime(); clipEndT = clip->GetEndTime(); @@ -533,15 +568,15 @@ bool EffectEqualization::ProcessOne(int count, WaveTrack * t, continue; // clip is not within selection if( clipStartT >= startT + lenT ) continue; // clip is not within selection - + //save the actual clip start/end so that we can rejoin them after we paste. clipRealStartEndTimes.push_back(std::pair(clipStartT,clipEndT)); - + if( clipStartT < startT ) // does selection cover the whole clip? clipStartT = startT; // don't copy all the new clip if( clipEndT > startT + lenT ) // does selection cover the whole clip? clipEndT = startT + lenT; // don't copy all the new clip - + //save them clipStartEndTimes.push_back(std::pair(clipStartT,clipEndT)); } @@ -560,9 +595,9 @@ bool EffectEqualization::ProcessOne(int count, WaveTrack * t, //if the clip was only partially selected, the Paste will have created a split line. Join is needed to take care of this //This is not true when the selection is fully contained within one clip (second half of conditional) if( (clipRealStartEndTimes[i].first != clipStartEndTimes[i].first || - clipRealStartEndTimes[i].second != clipStartEndTimes[i].second) && - !(clipRealStartEndTimes[i].first <= startT && - clipRealStartEndTimes[i].second >= startT+lenT) ) + clipRealStartEndTimes[i].second != clipStartEndTimes[i].second) && + !(clipRealStartEndTimes[i].first <= startT && + clipRealStartEndTimes[i].second >= startT+lenT) ) t->Join(clipRealStartEndTimes[i].first,clipRealStartEndTimes[i].second); delete toClipOutput; } @@ -578,7 +613,7 @@ bool EffectEqualization::ProcessOne(int count, WaveTrack * t, } void EffectEqualization::Filter(sampleCount len, - float *buffer) + float *buffer) { int i; float re,im; @@ -610,18 +645,18 @@ void EffectEqualization::Filter(sampleCount len, //---------------------------------------------------------------------------- BEGIN_EVENT_TABLE(EqualizationPanel, wxPanel) - EVT_PAINT(EqualizationPanel::OnPaint) - EVT_MOUSE_EVENTS(EqualizationPanel::OnMouseEvent) - EVT_MOUSE_CAPTURE_LOST(EqualizationPanel::OnCaptureLost) - EVT_SIZE(EqualizationPanel::OnSize) +EVT_PAINT(EqualizationPanel::OnPaint) +EVT_MOUSE_EVENTS(EqualizationPanel::OnMouseEvent) +EVT_MOUSE_CAPTURE_LOST(EqualizationPanel::OnCaptureLost) +EVT_SIZE(EqualizationPanel::OnSize) END_EVENT_TABLE() EqualizationPanel::EqualizationPanel( double loFreq, double hiFreq, - Envelope *env, - EqualizationDialog *parent, - float *filterFuncR, float *filterFuncI, long windowSize, - wxWindowID id, const wxPoint& pos, const wxSize& size): - wxPanel(parent, id, pos, size) + Envelope *env, + EqualizationDialog *parent, + float *filterFuncR, float *filterFuncI, long windowSize, + wxWindowID id, const wxPoint& pos, const wxSize& size): +wxPanel(parent, id, pos, size) { mOutr = NULL; mOuti = NULL; @@ -732,8 +767,8 @@ void EqualizationPanel::OnPaint(wxPaintEvent & WXUNUSED(event)) memDC.SetPen(wxPen(theTheme.Colour( clrGraphLines ), 1, wxSOLID)); int center = (int) (mEnvRect.height * dBMax/(dBMax-dBMin) + .5); AColor::Line(memDC, - mEnvRect.GetLeft(), mEnvRect.y + center, - mEnvRect.GetRight(), mEnvRect.y + center); + mEnvRect.GetLeft(), mEnvRect.y + center, + mEnvRect.GetRight(), mEnvRect.y + center); // Draw the grid, if asked for. Do it now so it's underneath the main plots. if( mParent->drawGrid ) @@ -767,7 +802,7 @@ void EqualizationPanel::OnPaint(wxPaintEvent & WXUNUSED(event)) if ( (i != 0) & (!off1) ) { AColor::Line(memDC, xlast, ylast, - x, mEnvRect.y + y); + x, mEnvRect.y + y); } off1 = off; xlast = x; @@ -839,7 +874,7 @@ void EqualizationPanel::OnPaint(wxPaintEvent & WXUNUSED(event)) mEnvelope->DrawPoints(memDC, mEnvRect, 0.0, mEnvRect.width-1, false, dBMin, dBMax); dc.Blit(0, 0, mWidth, mHeight, - &memDC, 0, 0, wxCOPY, FALSE); + &memDC, 0, 0, wxCOPY, FALSE); } void EqualizationPanel::OnMouseEvent(wxMouseEvent & event) @@ -850,7 +885,7 @@ void EqualizationPanel::OnMouseEvent(wxMouseEvent & event) } if (mEnvelope->MouseEvent(event, mEnvRect, 0.0, mEnvRect.width, false, - dBMin, dBMax)) + dBMin, dBMax)) { mParent->EnvelopeUpdated(); RecalcRequired = true; @@ -881,48 +916,58 @@ void EqualizationPanel::OnCaptureLost(wxMouseCaptureLostEvent & WXUNUSED(event)) // WDR: event table for EqualizationDialog BEGIN_EVENT_TABLE(EqualizationDialog,wxDialog) - EVT_SIZE( EqualizationDialog::OnSize ) - EVT_PAINT( EqualizationDialog::OnPaint ) - EVT_ERASE_BACKGROUND( EqualizationDialog::OnErase ) +EVT_SIZE( EqualizationDialog::OnSize ) +EVT_PAINT( EqualizationDialog::OnPaint ) +EVT_ERASE_BACKGROUND( EqualizationDialog::OnErase ) - EVT_SLIDER( ID_LENGTH, EqualizationDialog::OnSliderM ) - EVT_SLIDER( ID_DBMAX, EqualizationDialog::OnSliderDBMAX ) - EVT_SLIDER( ID_DBMIN, EqualizationDialog::OnSliderDBMIN ) - EVT_COMMAND_RANGE( ID_SLIDER, - ID_SLIDER + NUMBER_OF_BANDS - 1, - wxEVT_COMMAND_SLIDER_UPDATED, - EqualizationDialog::OnSlider) - EVT_CHOICE( ID_INTERP, EqualizationDialog::OnInterp ) +EVT_SLIDER( ID_LENGTH, EqualizationDialog::OnSliderM ) +EVT_SLIDER( ID_DBMAX, EqualizationDialog::OnSliderDBMAX ) +EVT_SLIDER( ID_DBMIN, EqualizationDialog::OnSliderDBMIN ) +EVT_COMMAND_RANGE( ID_SLIDER, + ID_SLIDER + NUMBER_OF_BANDS - 1, + wxEVT_COMMAND_SLIDER_UPDATED, + EqualizationDialog::OnSlider) +EVT_CHOICE( ID_INTERP, EqualizationDialog::OnInterp ) - EVT_CHOICE( ID_CURVE, EqualizationDialog::OnCurve ) - EVT_BUTTON( ID_MANAGE, EqualizationDialog::OnManage ) - EVT_BUTTON( ID_CLEAR, EqualizationDialog::OnClear ) - EVT_BUTTON( ID_INVERT, EqualizationDialog::OnInvert ) +EVT_CHOICE( ID_CURVE, EqualizationDialog::OnCurve ) +EVT_BUTTON( ID_MANAGE, EqualizationDialog::OnManage ) +EVT_BUTTON( ID_CLEAR, EqualizationDialog::OnClear ) +EVT_BUTTON( ID_INVERT, EqualizationDialog::OnInvert ) - EVT_BUTTON( ID_EFFECT_PREVIEW, EqualizationDialog::OnPreview ) - EVT_BUTTON( wxID_OK, EqualizationDialog::OnOk ) - EVT_BUTTON( wxID_CANCEL, EqualizationDialog::OnCancel ) - EVT_RADIOBUTTON(drawRadioID, EqualizationDialog::OnDrawRadio) - EVT_RADIOBUTTON(sliderRadioID, EqualizationDialog::OnSliderRadio) - EVT_CHECKBOX(ID_LIN_FREQ, EqualizationDialog::OnLinFreq) - EVT_CHECKBOX(GridOnOffID, EqualizationDialog::OnGridOnOff) +EVT_BUTTON( ID_EFFECT_PREVIEW, EqualizationDialog::OnPreview ) +EVT_BUTTON( wxID_OK, EqualizationDialog::OnOk ) +EVT_BUTTON( wxID_CANCEL, EqualizationDialog::OnCancel ) +EVT_RADIOBUTTON(drawRadioID, EqualizationDialog::OnDrawRadio) +EVT_RADIOBUTTON(sliderRadioID, EqualizationDialog::OnSliderRadio) + +#ifdef EXPERIMENTAL_EQ_SSE_THREADED +EVT_RADIOBUTTON(defaultMathRadioID, EqualizationDialog::OnProcessingRadio) +EVT_RADIOBUTTON(sSERadioID, EqualizationDialog::OnProcessingRadio) +EVT_RADIOBUTTON(sSEThreadedRadioID, EqualizationDialog::OnProcessingRadio) +EVT_RADIOBUTTON(aVXRadioID, EqualizationDialog::OnProcessingRadio) +EVT_RADIOBUTTON(aVXThreadedRadioID, EqualizationDialog::OnProcessingRadio) +EVT_BUTTON( ID_BENCH, EqualizationDialog::OnBench ) +#endif + +EVT_CHECKBOX(ID_LIN_FREQ, EqualizationDialog::OnLinFreq) +EVT_CHECKBOX(GridOnOffID, EqualizationDialog::OnGridOnOff) END_EVENT_TABLE() EqualizationDialog::EqualizationDialog(EffectEqualization * effect, - double loFreq, double hiFreq, - float *filterFuncR, - float *filterFuncI, - long windowSize, - wxString curveName, - bool disallowCustom, - wxWindow *parent, wxWindowID id, - const wxString &title, - const wxPoint &position, - const wxSize& size, - long style): - wxDialog( parent, id, title, position, size, style | wxRESIZE_BORDER | wxMAXIMIZE_BOX ), - mDisallowCustom(disallowCustom), - mCustomBackup(wxT("unnamed")) + double loFreq, double hiFreq, + float *filterFuncR, + float *filterFuncI, + long windowSize, + wxString curveName, + bool disallowCustom, + wxWindow *parent, wxWindowID id, + const wxString &title, + const wxPoint &position, + const wxSize& size, + long style): + wxDialog( parent, id, title, position, size, style | wxRESIZE_BORDER | wxMAXIMIZE_BOX ), + mDisallowCustom(disallowCustom), + mCustomBackup(wxT("unnamed")) { m_pEffect = effect; @@ -1050,8 +1095,8 @@ void EqualizationDialog::LoadCurves(wxString fileName, bool append) msg.Printf(_("Error Loading EQ Curves from file:\n%s\nError message says:\n%s"), fn.GetFullPath().c_str(), reader.GetErrorStr().c_str()); // Inform user of load failure wxMessageBox( msg, - _("Error Loading EQ Curves"), - wxOK | wxCENTRE); + _("Error Loading EQ Curves"), + wxOK | wxCENTRE); mCurves.Add( _("unnamed") ); // we always need a default curve to use return; } @@ -1144,10 +1189,10 @@ void EqualizationDialog::MakeEqualizationDialog() szr2 = new wxBoxSizer( wxVERTICAL ); dBMaxSlider = new wxSliderBugfix(this, ID_DBMAX, 30, 0, 60, - wxDefaultPosition, wxDefaultSize, wxSL_VERTICAL|wxSL_INVERSE); + wxDefaultPosition, wxDefaultSize, wxSL_VERTICAL|wxSL_INVERSE); szr2->Add( dBMaxSlider, 1, wxALIGN_LEFT|wxALIGN_CENTER_VERTICAL|wxALL, 4 ); dBMinSlider = new wxSliderBugfix(this, ID_DBMIN, -30, -120, -10, - wxDefaultPosition, wxDefaultSize, wxSL_VERTICAL|wxSL_INVERSE); + wxDefaultPosition, wxDefaultSize, wxSL_VERTICAL|wxSL_INVERSE); szr2->Add( dBMinSlider, 1, wxALIGN_LEFT|wxALIGN_CENTER_VERTICAL|wxALL, 4 ); szr1->Add( szr2, 0, wxEXPAND|wxALIGN_CENTRE|wxALL, 4 ); @@ -1177,10 +1222,10 @@ void EqualizationDialog::MakeEqualizationDialog() szr1->Add( szr4, 0, wxEXPAND|wxALIGN_LEFT|wxALL ); mPanel = new EqualizationPanel( mLoFreq, mHiFreq, - mLogEnvelope, - this, - mFilterFuncR, mFilterFuncI, mWindowSize, - ID_FILTERPANEL); + mLogEnvelope, + this, + mFilterFuncR, mFilterFuncI, mWindowSize, + ID_FILTERPANEL); szr1->Add( mPanel, 1, wxEXPAND|wxALIGN_CENTRE); szr3 = new wxBoxSizer( wxVERTICAL ); szr1->Add( szr3, 0, wxALIGN_CENTRE|wxRIGHT, 0); //spacer for last EQ @@ -1218,8 +1263,8 @@ void EqualizationDialog::MakeEqualizationDialog() for (int i = 0; (i < NUMBER_OF_BANDS) && (thirdOct[i] <= mHiFreq); ++i) { m_sliders[i] = new wxSliderBugfix(this, ID_SLIDER + i, 0, -20, +20, - wxDefaultPosition, wxSize(20, 124), wxSL_VERTICAL| - wxSL_INVERSE); + wxDefaultPosition, wxSize(20, 124), wxSL_VERTICAL| + wxSL_INVERSE); szrG->Add( m_sliders[i], 0, wxEXPAND|wxALIGN_CENTER ); szrG->Add(0, 0, 0); // horizontal spacer - used to put EQ sliders in correct position m_sliders[i]->Connect(wxEVT_ERASE_BACKGROUND,wxEraseEventHandler(EqualizationDialog::OnErase)); @@ -1248,20 +1293,20 @@ void EqualizationDialog::MakeEqualizationDialog() szrH = new wxBoxSizer( wxHORIZONTAL ); mFaderOrDraw[0] = new wxRadioButton( - this, drawRadioID, _("&Draw Curves"), - wxDefaultPosition, wxDefaultSize, wxRB_GROUP ); + this, drawRadioID, _("&Draw Curves"), + wxDefaultPosition, wxDefaultSize, wxRB_GROUP ); mFaderOrDraw[0]->SetName(_("Draw Curves")); szrH->Add( mFaderOrDraw[0], 0, wxRIGHT, 10 ); mFaderOrDraw[1] = new wxRadioButton( - this, sliderRadioID, _("&Graphic EQ"), - wxDefaultPosition, wxDefaultSize, 0 ); + this, sliderRadioID, _("&Graphic EQ"), + wxDefaultPosition, wxDefaultSize, 0 ); mFaderOrDraw[1]->SetName(_("Graphic EQ")); szrH->Add( mFaderOrDraw[1], 0, wxRIGHT, 4 ); mInterpChoice = new wxChoice(this, ID_INTERP, - wxDefaultPosition, wxDefaultSize, - NUM_INTERP_CHOICES, interpChoiceStrings); + wxDefaultPosition, wxDefaultSize, + NUM_INTERP_CHOICES, interpChoiceStrings); mInterpChoice->SetSelection(0); szrI = new wxBoxSizer( wxHORIZONTAL ); @@ -1284,7 +1329,7 @@ void EqualizationDialog::MakeEqualizationDialog() // length of filter (M) slider MSlider = new wxSliderBugfix(this, ID_LENGTH, (M -1)/2, 10, 4095, - wxDefaultPosition, wxSize(200, -1), wxSL_HORIZONTAL); + wxDefaultPosition, wxSize(200, -1), wxSL_HORIZONTAL); MSlider->SetName(_("Length of Filter")); szrH->Add( MSlider, 0, wxEXPAND ); @@ -1321,18 +1366,85 @@ void EqualizationDialog::MakeEqualizationDialog() btn = new wxButton( this, ID_INVERT, _("&Invert")); szrC->Add( btn, 0, wxALIGN_CENTRE | wxALL, 4 ); mGridOnOff = new wxCheckBox(this, GridOnOffID, _("G&rids"), - wxDefaultPosition, wxDefaultSize, + wxDefaultPosition, wxDefaultSize, #if defined(__WXGTK__) -// Fixes bug #662 - wxALIGN_LEFT); + // Fixes bug #662 + wxALIGN_LEFT); #else - wxALIGN_RIGHT); + wxALIGN_RIGHT); #endif mGridOnOff->SetName(_("Grids")); szrC->Add( mGridOnOff, 0, wxALIGN_CENTRE | wxALL, 4 ); szrV->Add( szrC, 0, wxALIGN_CENTER | wxALL, 0 ); +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + // ------------------------------------------------------------------- + // Processing routine selection + // ------------------------------------------------------------------- + if(m_pEffect->mEffectEqualization48x) { + szrM = new wxBoxSizer( wxHORIZONTAL ); + txt = new wxStaticText( this, wxID_ANY, _("&Processing: ") ); + szrM->Add( txt, 0, wxALIGN_LEFT | wxALIGN_CENTER_VERTICAL | wxLEFT, 4 ); + + mMathProcessingType[0] = new wxRadioButton( + this, defaultMathRadioID, _("D&efault"), + wxDefaultPosition, wxDefaultSize, wxRB_GROUP ); + mMathProcessingType[0]->SetName(_("Default")); + szrM->Add( mMathProcessingType[0], 0, wxRIGHT, 10 ); + + mMathProcessingType[1] = new wxRadioButton( + this, sSERadioID, _("&SSE"), + wxDefaultPosition, wxDefaultSize, 0 ); + mMathProcessingType[1]->SetName(_("SSE")); + szrM->Add( mMathProcessingType[1], 0, wxRIGHT, 4 ); + + mMathProcessingType[2] = new wxRadioButton( + this, sSEThreadedRadioID, _("SSE &Threaded"), + wxDefaultPosition, wxDefaultSize, 0 ); + mMathProcessingType[2]->SetName(_("SSE")); + szrM->Add( mMathProcessingType[2], 0, wxRIGHT, 4 ); + + mMathProcessingType[3] = new wxRadioButton( + this, aVXRadioID, _("A&VX"), + wxDefaultPosition, wxDefaultSize, 0 ); + mMathProcessingType[3]->SetName(_("AVX")); + szrM->Add( mMathProcessingType[3], 0, wxRIGHT, 4 ); + + mMathProcessingType[4] = new wxRadioButton( + this, aVXThreadedRadioID, _("AV&X Threaded"), + wxDefaultPosition, wxDefaultSize, 0 ); + mMathProcessingType[4]->SetName(_("AVX Threaded")); + szrM->Add( mMathProcessingType[4], 0, wxRIGHT, 4 ); + + if(!EffectEqualization48x::GetMathCaps()->SSE) { + mMathProcessingType[1]->Disable(); + mMathProcessingType[2]->Disable(); + } + if(true) { //!EffectEqualization48x::GetMathCaps()->AVX) { not implemented + mMathProcessingType[3]->Disable(); + mMathProcessingType[4]->Disable(); + } + // update the control state + mMathProcessingType[0]->SetValue(true); + int mathPath=EffectEqualization48x::GetMathPath(); + if(mathPath&MATH_FUNCTION_SSE) { + mMathProcessingType[1]->SetValue(true); + if(mathPath&MATH_FUNCTION_THREADED) + mMathProcessingType[2]->SetValue(true); + } + if(false) { //mathPath&MATH_FUNCTION_AVX) { not implemented + mMathProcessingType[3]->SetValue(true); + if(mathPath&MATH_FUNCTION_THREADED) + mMathProcessingType[4]->SetValue(true); + } + btn = new wxButton( this, ID_BENCH, _("&Bench")); + szrM->Add( btn, 0, wxRIGHT, 4 ); + + szrV->Add( szrM, 0, wxALIGN_CENTER | wxALL, 0 ); + } +#endif + // ------------------------------------------------------------------- // Preview, OK, & Cancel buttons // ------------------------------------------------------------------- @@ -1369,13 +1481,13 @@ void EqualizationDialog::CreateChoice() // Create an array of names for( i = 0; i < numCurves; i++ ) { - curveNames[ i ] = mCurves[ i ].Name; + curveNames[ i ] = mCurves[ i ].Name; } // Create the control choice = new wxChoice( this, ID_CURVE, - wxDefaultPosition, wxDefaultSize, - numCurves, curveNames ); + wxDefaultPosition, wxDefaultSize, + numCurves, curveNames ); // Have an existing control? if( mCurve ) @@ -1404,10 +1516,10 @@ bool EqualizationDialog::Validate() while (mDisallowCustom && curveName.IsSameAs(wxT("unnamed"))) { wxMessageBox(_("To use this EQ curve in a batch chain, please choose a new name for it.\nChoose the 'Save/Manage Curves...' button and rename the 'unnamed' curve, then use that one."), - _("EQ Curve needs a different name"), - wxOK | wxCENTRE, - this); - return false; + _("EQ Curve needs a different name"), + wxOK | wxCENTRE, + this); + return false; } return true; } @@ -1555,16 +1667,16 @@ bool EqualizationDialog::CalcFilter() mFilterFuncR[i] = val0; } else if(when > 1.0) - { - mFilterFuncR[i] = val1; - } - else - { - if( lin ) - mFilterFuncR[i] = mLinEnvelope->GetValue(when); - else - mFilterFuncR[i] = mLogEnvelope->GetValue(when); - } + { + mFilterFuncR[i] = val1; + } + else + { + if( lin ) + mFilterFuncR[i] = mLinEnvelope->GetValue(when); + else + mFilterFuncR[i] = mLogEnvelope->GetValue(when); + } freq += delta; } mFilterFuncR[mWindowSize/2] = val1; @@ -1588,8 +1700,8 @@ bool EqualizationDialog::CalcFilter() for(i=0;i<=(M-1)/2;i++) { //Windowing - could give a choice, fixed for now - MJS -// double mult=0.54-0.46*cos(2*M_PI*(i+(M-1)/2.0)/(M-1)); //Hamming -//Blackman + // double mult=0.54-0.46*cos(2*M_PI*(i+(M-1)/2.0)/(M-1)); //Hamming + //Blackman double mult=0.42-0.5*cos(2*M_PI*(i+(M-1)/2.0)/(M-1))+.08*cos(4*M_PI*(i+(M-1)/2.0)/(M-1)); outr[i]*=mult; if(i!=0){ @@ -1894,7 +2006,7 @@ bool EqualizationDialog::HandleXMLTag(const wxChar *tag, const wxChar **attrs) if( !wxStrcmp( attr, wxT("f") ) ) { if (!XMLValueChecker::IsGoodString(strValue) || - !Internat::CompatibleToDouble(strValue, &dblValue)) + !Internat::CompatibleToDouble(strValue, &dblValue)) return false; f = dblValue; } @@ -1902,7 +2014,7 @@ bool EqualizationDialog::HandleXMLTag(const wxChar *tag, const wxChar **attrs) else if( !wxStrcmp( attr, wxT("d") ) ) { if (!XMLValueChecker::IsGoodString(strValue) || - !Internat::CompatibleToDouble(strValue, &dblValue)) + !Internat::CompatibleToDouble(strValue, &dblValue)) return false; d = dblValue; } @@ -2087,7 +2199,7 @@ void EqualizationDialog::GraphicEQ(Envelope *env) int n = mInterpChoice->GetSelection(); switch( n ) { - case 0: // B-spline + case 0: // B-spline { int minF = 0; for(int i=0; iGetCurrentSelection() ); - #else +#else setCurve( mCurve->GetSelection() ); - #endif +#endif if( !drawMode ) UpdateGraphic(); } @@ -2643,14 +2755,14 @@ void EqualizationDialog::OnInvert(wxCommandEvent & WXUNUSED(event)) // Inverts a int newPosn = (int)m_EQVals[i]; m_sliders[i]->SetValue( newPosn ); m_sliders_old[i] = newPosn; - #if wxUSE_TOOLTIPS +#if wxUSE_TOOLTIPS wxString tip; if( thirdOct[i] < 1000.) tip.Printf( wxT("%dHz\n%.1fdB"), (int)thirdOct[i], m_EQVals[i] ); else tip.Printf( wxT("%gkHz\n%.1fdB"), thirdOct[i]/1000., m_EQVals[i] ); m_sliders[i]->SetToolTip(tip); - #endif +#endif } GraphicEQ(mLogEnvelope); } @@ -2766,10 +2878,10 @@ void EqualizationDialog::RevertCustom() void EqualizationDialog::Finish(bool ok) { if(mLogEnvelope) - delete mLogEnvelope; + delete mLogEnvelope; mLogEnvelope = NULL; if(mLinEnvelope) - delete mLinEnvelope; + delete mLinEnvelope; mLinEnvelope = NULL; mPanel = NULL; @@ -2804,7 +2916,7 @@ void EqualizationDialog::OnOk(wxCommandEvent & event) for(i=0,j=0;jvalue[i+1]-.05) && - (value[i+1]value[i+2]-.05) ) + (value[i+1]value[i+2]-.05) ) { // within < 0.05 dB? mLogEnvelope->Delete(j+1); numPoints--; @@ -2833,21 +2945,21 @@ void EqualizationDialog::OnOk(wxCommandEvent & event) /// Constructor BEGIN_EVENT_TABLE(EditCurvesDialog, wxDialog) - EVT_BUTTON(UpButtonID, EditCurvesDialog::OnUp) - EVT_BUTTON(DownButtonID, EditCurvesDialog::OnDown) - EVT_BUTTON(RenameButtonID, EditCurvesDialog::OnRename) - EVT_BUTTON(DeleteButtonID, EditCurvesDialog::OnDelete) - EVT_BUTTON(ImportButtonID, EditCurvesDialog::OnImport) - EVT_BUTTON(ExportButtonID, EditCurvesDialog::OnExport) - EVT_BUTTON(LibraryButtonID, EditCurvesDialog::OnLibrary) - EVT_BUTTON(DefaultsButtonID, EditCurvesDialog::OnDefaults) - EVT_BUTTON(wxID_OK, EditCurvesDialog::OnOK) +EVT_BUTTON(UpButtonID, EditCurvesDialog::OnUp) +EVT_BUTTON(DownButtonID, EditCurvesDialog::OnDown) +EVT_BUTTON(RenameButtonID, EditCurvesDialog::OnRename) +EVT_BUTTON(DeleteButtonID, EditCurvesDialog::OnDelete) +EVT_BUTTON(ImportButtonID, EditCurvesDialog::OnImport) +EVT_BUTTON(ExportButtonID, EditCurvesDialog::OnExport) +EVT_BUTTON(LibraryButtonID, EditCurvesDialog::OnLibrary) +EVT_BUTTON(DefaultsButtonID, EditCurvesDialog::OnDefaults) +EVT_BUTTON(wxID_OK, EditCurvesDialog::OnOK) END_EVENT_TABLE() EditCurvesDialog::EditCurvesDialog(EqualizationDialog * parent, int position): - wxDialog(parent, wxID_ANY, _("Manage Curves List"), - wxDefaultPosition, wxDefaultSize, - wxDEFAULT_DIALOG_STYLE | wxRESIZE_BORDER) +wxDialog(parent, wxID_ANY, _("Manage Curves List"), + wxDefaultPosition, wxDefaultSize, + wxDEFAULT_DIALOG_STYLE | wxRESIZE_BORDER) { SetLabel(_("Manage Curves")); // Provide visual label SetName(_("Manage Curves List")); // Provide audible label @@ -2905,7 +3017,7 @@ void EditCurvesDialog::PopulateOrExchange(ShuttleGui & S) S.EndHorizontalLay(); S.AddStandardButtons(); S.StartStatic(_("Help")); - S.AddConstTextBox(wxT(""), _("Rename 'unnamed' to save a new entry.\n'OK' saves all changes, 'Cancel' doesn't.")); + S.AddConstTextBox(wxT(""), _("Rename 'unnamed' to save a new entry.\n'OK' saves all changes, 'Cancel' doesn't.")); S.EndStatic(); PopulateList(mPosition); Fit(); @@ -2998,7 +3110,7 @@ long EditCurvesDialog::GetPreviousItem(long item) // wx doesn't have this { long lastItem = -1; long itemTemp = mList->GetNextItem(-1, wxLIST_NEXT_ALL, - wxLIST_STATE_SELECTED); + wxLIST_STATE_SELECTED); while( (itemTemp != -1) && (itemTemp < item) ) { lastItem = itemTemp; @@ -3034,8 +3146,8 @@ void EditCurvesDialog::OnRename(wxCommandEvent & WXUNUSED(event)) bad = false; // build the dialog wxTextEntryDialog dlg( this, - _("Rename '") + mEditCurves[ item ].Name + _("' to..."), - _("Rename...") ); + _("Rename '") + mEditCurves[ item ].Name + _("' to..."), + _("Rename...") ); dlg.SetTextValidator( wxFILTER_EXCLUDE_CHAR_LIST ); dlg.SetName( _("Rename '") + mEditCurves[ item ].Name ); wxTextValidator *tv = dlg.GetTextValidator(); @@ -3062,7 +3174,7 @@ void EditCurvesDialog::OnRename(wxCommandEvent & WXUNUSED(event)) break; } int answer = wxMessageBox( _("Overwrite existing curve '") + name +_("'?"), - _("Curve exists"), wxYES_NO ); + _("Curve exists"), wxYES_NO ); if (answer == wxYES) { bad = false; @@ -3134,7 +3246,7 @@ void EditCurvesDialog::OnDelete(wxCommandEvent & WXUNUSED(event)) if(item == mList->GetItemCount()-1) //unnamed { wxMessageBox(_("You cannot delete the 'unnamed' curve."), - _("Can't delete 'unnamed'"), wxOK | wxCENTRE, this); + _("Can't delete 'unnamed'"), wxOK | wxCENTRE, this); } else { @@ -3183,7 +3295,7 @@ void EditCurvesDialog::OnDelete(wxCommandEvent & WXUNUSED(event)) if(item == mList->GetItemCount()-1) //unnamed { wxMessageBox(_("You cannot delete the 'unnamed' curve, it is special."), - _("Can't delete 'unnamed'"), wxOK | wxCENTRE, this); + _("Can't delete 'unnamed'"), wxOK | wxCENTRE, this); } else { @@ -3274,6 +3386,35 @@ void EditCurvesDialog::OnDefaults( wxCommandEvent & WXUNUSED(event)) PopulateList(0); // update the EditCurvesDialog dialog } +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + +void EqualizationDialog::OnProcessingRadio(wxCommandEvent & event) +{ + int testEvent=event.GetId(); + switch(testEvent) + { + case defaultMathRadioID: EffectEqualization48x::SetMathPath(MATH_FUNCTION_ORIGINAL); + break; + case sSERadioID: EffectEqualization48x::SetMathPath(MATH_FUNCTION_SSE); + break; + case sSEThreadedRadioID: EffectEqualization48x::SetMathPath(MATH_FUNCTION_THREADED | MATH_FUNCTION_SSE); + break; + case aVXRadioID: testEvent=2; + break; + case aVXThreadedRadioID: testEvent=2; + break; + } + +}; + +void EqualizationDialog::OnBench( wxCommandEvent & event) +{ + m_pEffect->mBench=true; + OnOk(event); +} + +#endif + void EditCurvesDialog::OnOK(wxCommandEvent & WXUNUSED(event)) { // Make a backup of the current curves @@ -3293,8 +3434,8 @@ void EditCurvesDialog::OnOK(wxCommandEvent & WXUNUSED(event)) // Select something sensible long item = mList->GetNextItem(-1, - wxLIST_NEXT_ALL, - wxLIST_STATE_SELECTED); + wxLIST_NEXT_ALL, + wxLIST_STATE_SELECTED); if (item == -1) item = mList->GetItemCount()-1; // nothing selected, default to 'unnamed' mParent->setCurve(item); @@ -3304,7 +3445,7 @@ void EditCurvesDialog::OnOK(wxCommandEvent & WXUNUSED(event)) #if wxUSE_ACCESSIBILITY SliderAx::SliderAx( wxWindow * window, wxString fmt ): - wxWindowAccessible( window ) +wxWindowAccessible( window ) { mParent = window; mFmt = fmt; @@ -3415,17 +3556,17 @@ wxAccStatus SliderAx::GetRole(int childId, wxAccRole* role) { switch( childId ) { - case 0: - *role = wxROLE_SYSTEM_SLIDER; + case 0: + *role = wxROLE_SYSTEM_SLIDER; break; - case 1: - case 3: - *role = wxROLE_SYSTEM_PUSHBUTTON; + case 1: + case 3: + *role = wxROLE_SYSTEM_PUSHBUTTON; break; - case 2: - *role = wxROLE_SYSTEM_INDICATOR; + case 2: + *role = wxROLE_SYSTEM_INDICATOR; break; } @@ -3452,22 +3593,22 @@ wxAccStatus SliderAx::GetState(int childId, long* state) switch( childId ) { - case 0: - *state = wxACC_STATE_SYSTEM_FOCUSABLE; + case 0: + *state = wxACC_STATE_SYSTEM_FOCUSABLE; break; - case 1: - if( s->GetValue() == s->GetMin() ) - { - *state = wxACC_STATE_SYSTEM_INVISIBLE; - } + case 1: + if( s->GetValue() == s->GetMin() ) + { + *state = wxACC_STATE_SYSTEM_INVISIBLE; + } break; - case 3: - if( s->GetValue() == s->GetMax() ) - { - *state = wxACC_STATE_SYSTEM_INVISIBLE; - } + case 3: + if( s->GetValue() == s->GetMax() ) + { + *state = wxACC_STATE_SYSTEM_INVISIBLE; + } break; } @@ -3495,3 +3636,4 @@ wxAccStatus SliderAx::GetValue(int childId, wxString* strValue) } #endif + diff --git a/src/effects/Equalization.h b/src/effects/Equalization.h index 499d961dd..8d8e7e4d8 100644 --- a/src/effects/Equalization.h +++ b/src/effects/Equalization.h @@ -74,6 +74,10 @@ public: }; WX_DECLARE_OBJARRAY( EQCurve, EQCurveArray ); +#ifdef EXPERIMENTAL_EQ_SSE_THREADED +class EffectEqualization48x; +#endif + class EffectEqualization: public Effect { public: @@ -113,12 +117,15 @@ public: // low range of human hearing enum {loFreqI=20}; + + private: bool ProcessOne(int count, WaveTrack * t, sampleCount start, sampleCount len); void Filter(sampleCount len, float *buffer); + void ReadPrefs(); HFFT hFFT; @@ -135,6 +142,11 @@ private: bool mPrompting; bool mDrawGrid; bool mEditingBatchParams; +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + bool mBench; + EffectEqualization48x *mEffectEqualization48x; +friend class EffectEqualization48x; +#endif public: @@ -222,6 +234,9 @@ public: void EnvelopeUpdated(Envelope *env, bool lin); static const double thirdOct[]; wxRadioButton *mFaderOrDraw[2]; +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + wxRadioButton *mMathProcessingType[5]; // default, sse, sse threaded, AVX, AVX threaded (note AVX is not implemented yet +#endif wxChoice *mInterpChoice; wxCheckBox *mLinFreq; int M; @@ -276,6 +291,14 @@ private: ID_INVERT, drawRadioID, sliderRadioID, +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + defaultMathRadioID, + sSERadioID, + sSEThreadedRadioID, + aVXRadioID, + aVXThreadedRadioID, + ID_BENCH, +#endif ID_INTERP, ID_LIN_FREQ, GridOnOffID, @@ -294,6 +317,10 @@ private: void OnSliderDBMIN( wxCommandEvent &event ); void OnDrawRadio(wxCommandEvent &event ); void OnSliderRadio(wxCommandEvent &event ); +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + void OnProcessingRadio(wxCommandEvent &event ); + void OnBench( wxCommandEvent & event); +#endif void OnLinFreq(wxCommandEvent &event ); void UpdateGraphic(void); void EnvLogToLin(void); @@ -339,6 +366,9 @@ private: wxBoxSizer *szrH; wxBoxSizer *szrI; wxBoxSizer *szrL; +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + wxBoxSizer *szrM; +#endif wxFlexGridSizer *szr1; wxBoxSizer *szr2; wxBoxSizer *szr3; diff --git a/src/effects/Equalization48x.cpp b/src/effects/Equalization48x.cpp new file mode 100644 index 000000000..40c6ee3fc --- /dev/null +++ b/src/effects/Equalization48x.cpp @@ -0,0 +1,924 @@ +/********************************************************************** + + Audacity: A Digital Audio Editor + + EffectEqualization.cpp + + Andrew Hallendorff + +*******************************************************************//** + + \file Equalization48x.cpp + \brief Fast SSE based implementation of equalization. + +*//****************************************************************/ + +#include "../Audacity.h" +#include "../Project.h" +#ifdef EXPERIMENTAL_EQ_SSE_THREADED +#include "Equalization.h" +#include "../WaveTrack.h" +#include "float_cast.h" +#include + +#include +#include +#include + +#if wxUSE_TOOLTIPS +#include +#endif +#include + +#include + +#include + +#include "Equalization48x.h" +#include "../RealFFTf.h" +#include "../RealFFTf48x.h" + +#ifndef USE_SSE2 +#define USE_SSE2 +#endif + +#include +#include +#include +#include +#include + +#ifdef _WIN32 + +// Windows +#include +#define cpuid __cpuid + +#else + +// GCC Inline Assembly +void cpuid(int CPUInfo[4],int InfoType){ + __asm__ __volatile__ ( + "cpuid": + "=a" (CPUInfo[0]), + "=b" (CPUInfo[1]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "a" (InfoType) + ); +} + +#endif + + +bool sMathCapsInitialized = false; +MathCaps sMathCaps; + +// dirty switcher +int sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; +void EffectEqualization48x::SetMathPath(int mathPath) { sMathPath=mathPath; }; +int EffectEqualization48x::GetMathPath() { return sMathPath; }; +void EffectEqualization48x::AddMathPathOption(int mathPath) { sMathPath|=mathPath; }; +void EffectEqualization48x::RemoveMathPathOption(int mathPath) { sMathPath&=~mathPath; }; + +MathCaps *EffectEqualization48x::GetMathCaps() +{ + if(!sMathCapsInitialized) + { + sMathCapsInitialized=true; + sMathCaps.x64 = false; + sMathCaps.MMX = false; + sMathCaps.SSE = false; + sMathCaps.SSE2 = false; + sMathCaps.SSE3 = false; + sMathCaps.SSSE3 = false; + sMathCaps.SSE41 = false; + sMathCaps.SSE42 = false; + sMathCaps.SSE4a = false; + sMathCaps.AVX = false; + sMathCaps.XOP = false; + sMathCaps.FMA3 = false; + sMathCaps.FMA4 = false; + + int info[4]; + cpuid(info, 0); + int nIds = info[0]; + + cpuid(info, 0x80000000); + int nExIds = info[0]; + + // Detect Instruction Set + if (nIds >= 1){ + cpuid(info,0x00000001); + sMathCaps.MMX = (info[3] & ((int)1 << 23)) != 0; + sMathCaps.SSE = (info[3] & ((int)1 << 25)) != 0; + sMathCaps.SSE2 = (info[3] & ((int)1 << 26)) != 0; + sMathCaps.SSE3 = (info[2] & ((int)1 << 0)) != 0; + + sMathCaps.SSSE3 = (info[2] & ((int)1 << 9)) != 0; + sMathCaps.SSE41 = (info[2] & ((int)1 << 19)) != 0; + sMathCaps.SSE42 = (info[2] & ((int)1 << 20)) != 0; + + sMathCaps.AVX = (info[2] & ((int)1 << 28)) != 0; + sMathCaps.FMA3 = (info[2] & ((int)1 << 12)) != 0; + } + + if (nExIds >= 0x80000001){ + cpuid(info,0x80000001); + sMathCaps.x64 = (info[3] & ((int)1 << 29)) != 0; + sMathCaps.SSE4a = (info[2] & ((int)1 << 6)) != 0; + sMathCaps.FMA4 = (info[2] & ((int)1 << 16)) != 0; + sMathCaps.XOP = (info[2] & ((int)1 << 11)) != 0; + } + if(sMathCaps.SSE) + sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; // we are starting on. + } + return &sMathCaps; +}; + +void * malloc_simd(const size_t size) +{ +#if defined WIN32 // WIN32 + return _aligned_malloc(size, 16); +#elif defined __linux__ // Linux + return memalign (16, size); +#elif defined __MACH__ // Mac OS X + return malloc(size); +#else // other (use valloc for page-aligned memory) + return valloc(size); +#endif +} + +void free_simd(void* mem) +{ +#if defined WIN32 // WIN32 + _aligned_free(mem); +#else + free(mem); +#endif +} + +EffectEqualization48x::EffectEqualization48x(): + mThreadCount(0),mFilterSize(0),mWindowSize(0),mBlockSize(0),mWorkerDataCount(0),mBlocksPerBuffer(20), + mScratchBufferSize(0),mSubBufferSize(0),mBigBuffer(NULL),mBufferInfo(NULL),mEQWorkers(0),mThreaded(false), + mBenching(false) +{ +} + +EffectEqualization48x::~EffectEqualization48x() +{ +} + + +bool EffectEqualization48x::AllocateBuffersWorkers(bool threaded) +{ + if(mBigBuffer) + FreeBuffersWorkers(); + mFilterSize=(mEffectEqualization->mM-1)&(~15); // 4000 !!! Filter MUST BE QUAD WORD ALIGNED !!!! + mWindowSize=mEffectEqualization->windowSize; + mBlockSize=mWindowSize-mFilterSize; // 12,384 + mThreaded=threaded; + if( mThreaded ) + { + mThreadCount=wxThread::GetCPUCount(); + mWorkerDataCount=mThreadCount+2; // 2 extra slots (maybe double later) + + // we're skewing the data by one block to allow for 1/4 block intersections. + // this will remove the disparity in data at the intersections of the runs + + // The nice magic allocation + // megabyte - 3 windows - 4 overlaping buffers - filter + // 2^20 = 1,048,576 - 3 * 2^14 (16,384) - ((4 * 20) - 3) * 12,384 - 4000 + // 1,048,576 - 49,152 - 953,568 - 4000 = 41,856 (leftover) + + mScratchBufferSize=mWindowSize*3*(sizeof(__m128)/sizeof(float)); // 3 window size blocks size of __m128 but we allocate in float + mSubBufferSize=mBlockSize*((mBlocksPerBuffer<<2)-3); // we are going to do a full block overlap -(blockSize*3) + mBigBuffer=(float *)malloc_simd(sizeof(float)*(mSubBufferSize+mFilterSize+mScratchBufferSize)*mWorkerDataCount); // we run over by filtersize + // fill the bufferInfo + mBufferInfo = new BufferInfo[mWorkerDataCount]; + for(int i=0;iCopyInputTracks(); // Set up mOutputTracks. + bool bGoodResult = true; + + TableUsage(sMathPath); + if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!! + mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1; + AllocateBuffersWorkers((sMathPath & MATH_FUNCTION_THREADED) != 0); + SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks); + WaveTrack *track = (WaveTrack *) iter.First(); + int count = 0; + while (track) { + double trackStart = track->GetStartTime(); + double trackEnd = track->GetEndTime(); + double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0; + double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1; + + if (t1 > t0) { + sampleCount start = track->TimeToLongSamples(t0); + sampleCount end = track->TimeToLongSamples(t1); + sampleCount len = (sampleCount)(end - start); + + if(!sMathPath) { + if (!mEffectEqualization->ProcessOne(count, track, start, len)) + { + bGoodResult = false; + break; + } + } else { + if(sMathPath<8) { + if (!ProcessOne4x(count, track, start, len)) + { + bGoodResult = false; + break; + } + } else { + if (!ProcessOne4xThreaded(count, track, start, len)) + { + bGoodResult = false; + break; + } + } + } + + + } + + track = (WaveTrack *) iter.Next(); + count++; + } + FreeBuffersWorkers(); + + mEffectEqualization->ReplaceProcessedTracks(bGoodResult); + return bGoodResult; +} + +bool EffectEqualization48x::TrackCompare() +{ + mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks. + bool bGoodResult = true; + + TableUsage(sMathPath); + if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!! + mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1; + AllocateBuffersWorkers((sMathPath & MATH_FUNCTION_THREADED)!=0); + // Reset map + wxArrayPtrVoid SecondIMap; + wxArrayPtrVoid SecondOMap; + SecondIMap.Clear(); + SecondOMap.Clear(); + + TrackList *SecondOutputTracks = new TrackList(); + + //iterate over tracks of type trackType (All types if Track::All) + TrackListOfKindIterator aIt(mEffectEqualization->mOutputTracksType, mEffectEqualization->mTracks); + + for (Track *aTrack = aIt.First(); aTrack; aTrack = aIt.Next()) { + + // Include selected tracks, plus sync-lock selected tracks for Track::All. + if (aTrack->GetSelected() || + (mEffectEqualization->mOutputTracksType == Track::All && aTrack->IsSyncLockSelected())) + { + Track *o = aTrack->Duplicate(); + SecondOutputTracks->Add(o); + SecondIMap.Add(aTrack); + SecondIMap.Add(o); + } + } + + for(int i=0;i<2;i++) { + SelectedTrackListOfKindIterator iter(Track::Wave, i?mEffectEqualization->mOutputTracks:SecondOutputTracks); + i?sMathPath=sMathPath:sMathPath=0; + WaveTrack *track = (WaveTrack *) iter.First(); + int count = 0; + while (track) { + double trackStart = track->GetStartTime(); + double trackEnd = track->GetEndTime(); + double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0; + double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1; + + if (t1 > t0) { + sampleCount start = track->TimeToLongSamples(t0); + sampleCount end = track->TimeToLongSamples(t1); + sampleCount len = (sampleCount)(end - start); + + if(!sMathPath) { + if (!mEffectEqualization->ProcessOne(count, track, start, len)) + { + bGoodResult = false; + break; + } + } else { + if(sMathPath<8) { + if (!ProcessOne4x(count, track, start, len)) + { + bGoodResult = false; + break; + } + } else { + if (!ProcessOne4xThreaded(count, track, start, len)) + { + bGoodResult = false; + break; + } + } + } + } + track = (WaveTrack *) iter.Next(); + count++; + } + } + SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks); + SelectedTrackListOfKindIterator iter2(Track::Wave, SecondOutputTracks); + WaveTrack *track = (WaveTrack *) iter.First(); + WaveTrack *track2 = (WaveTrack *) iter2.First(); + while (track) { + double trackStart = track->GetStartTime(); + double trackEnd = track->GetEndTime(); + double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0; + double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1; + + if (t1 > t0) { + sampleCount start = track->TimeToLongSamples(t0); + sampleCount end = track->TimeToLongSamples(t1); + sampleCount len = (sampleCount)(end - start); + DeltaTrack(track, track2, start, len); + } + track = (WaveTrack *) iter.Next(); + track2 = (WaveTrack *) iter2.Next(); + } + delete SecondOutputTracks; + FreeBuffersWorkers(); + mEffectEqualization->ReplaceProcessedTracks(bGoodResult); + return bGoodResult; +} + + +bool EffectEqualization48x::DeltaTrack(WaveTrack * t, WaveTrack * t2, sampleCount start, sampleCount len) +{ + + sampleCount trackBlockSize = t->GetMaxBlockSize(); + + float *buffer1 = new float[trackBlockSize]; + float *buffer2 = new float[trackBlockSize]; + + AudacityProject *p = GetActiveProject(); + WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); + sampleCount originalLen = len; + sampleCount currentSample = start; + + while(len) { + sampleCount curretLength=(trackBlockSize>len)?len:trackBlockSize; + t->Get((samplePtr)buffer1, floatSample, currentSample, curretLength); + t2->Get((samplePtr)buffer2, floatSample, currentSample, curretLength); + for(int i=0;iAppend((samplePtr)buffer1, floatSample, curretLength); + currentSample+=curretLength; + len-=curretLength; + } + delete[] buffer1; + delete[] buffer2; + output->Flush(); + len=originalLen; + ProcessTail(t, output, start, len); + delete output; + return true; +} + +bool EffectEqualization48x::Benchmark(EffectEqualization* effectEqualization) +{ + mEffectEqualization=effectEqualization; + mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks. + bool bGoodResult = true; + + TableUsage(sMathPath); + if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!! + mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1; + AllocateBuffersWorkers((bool)MATH_FUNCTION_THREADED); + SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks); + long times[] = { 0,0,0 }; + wxStopWatch timer; + mBenching=true; + for(int i=0;i<3;i++) { + int localMathPath; + switch(i) { + case 0: localMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; + if(!sMathCaps.SSE) + localMathPath=-1; + break; + case 1: localMathPath=MATH_FUNCTION_SSE; + if(!sMathCaps.SSE) + localMathPath=-1; + break; + case 2: localMathPath=0; + break; + default: localMathPath=-1; + } + if(localMathPath>=0) { + timer.Start(); + WaveTrack *track = (WaveTrack *) iter.First(); + int count = 0; + while (track) { + double trackStart = track->GetStartTime(); + double trackEnd = track->GetEndTime(); + double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0; + double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1; + + if (t1 > t0) { + sampleCount start = track->TimeToLongSamples(t0); + sampleCount end = track->TimeToLongSamples(t1); + sampleCount len = (sampleCount)(end - start); + + if(!localMathPath) { + if (!mEffectEqualization->ProcessOne(count, track, start, len)) + { + bGoodResult = false; + break; + } + } else { + if(localMathPath<8) { + if (!ProcessOne4x(count, track, start, len)) + { + bGoodResult = false; + break; + } + } else { + if (!ProcessOne4xThreaded(count, track, start, len)) + { + bGoodResult = false; + break; + } + } + } + } + track = (WaveTrack *) iter.Next(); + count++; + } + times[i]=timer.Time(); + } + } + FreeBuffersWorkers(); + mBenching=false; + bGoodResult=false; + mEffectEqualization->ReplaceProcessedTracks(bGoodResult); + + wxTimeSpan tsSSEThreaded(0, 0, 0, times[0]); + wxTimeSpan tsSSE(0, 0, 0, times[1]); + wxTimeSpan tsDefault(0, 0, 0, times[2]); + wxMessageBox(wxString::Format(_("Benchmark times:\nDefault: %s\nSSE: %s\nSSE Threaded: %s\n"),tsDefault.Format(wxT("%M:%S.%l")).c_str(),tsSSE.Format(wxT("%M:%S.%l")).c_str(),tsSSEThreaded.Format(wxT("%M:%S.%l")).c_str())); +/* wxTimeSpan tsSSEThreaded(0, 0, 0, times[0]); + wxTimeSpan tsSSE(0, 0, 0, times[1]); + wxTimeSpan tsDefault(0, 0, 0, times[2]); + wxString outputString; + outputString.Format(_("Benchmark times:\nDefault: %s\nSSE: %s\nSSE Threaded: %s\n"),tsDefault.Format(wxT("%M:%S.%l")),tsSSE.Format(wxT("%M:%S.%l")),tsSSEThreaded.Format(wxT("%M:%S.%l"))); + wxMessageBox(outputString); */ + + + return bGoodResult; +} + + +bool EffectEqualization48x::ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, sampleCount bufferLength) + +{ + sampleCount blockCount=bufferLength/mBlockSize; + sampleCount lastBlockSize=bufferLength%mBlockSize; + if(lastBlockSize) + blockCount++; + + float *workBuffer=&sourceBuffer[bufferLength]; // all scratch buffers are at the end + + for(int runx=0;runxFilter(mWindowSize, currentBuffer); + float *writeEnd=currentBuffer+mBlockSize; + if(runx==blockCount) + writeEnd=currentBuffer+(lastBlockSize+mFilterSize); + if(runx) { + float *lastOverrun=&workBuffer[mWindowSize*((runx+1)&1)+mBlockSize]; + for(int j=0;j>1; // this will skip the first filterSize on the first run + while(currentBuffermBufferLength%mBlockSize) + return false; + + sampleCount blockCount=bufferInfo->mBufferLength/mBlockSize; + + __m128 *readBlocks[4]; // some temps so we dont destroy the vars in the struct + __m128 *writeBlocks[4]; + for(int i=0;i<4;i++) { + readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i]; + writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i]; + } + + __m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer; + __m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*2]; + + for(int run4x=0;run4x>2; + // swizzle it back. + for(int i=writeToStart,j=writeStart;j>2; // these are 128b pointers, each window is 1/4 blockSize for those + writeBlocks[i]+=mBlockSize>>2; + } + } + return true; +} + +bool EffectEqualization48x::ProcessOne4x(int count, WaveTrack * t, + sampleCount start, sampleCount len) +{ + sampleCount blockCount=len/mBlockSize; + + if(blockCount<16) // it's not worth 4x processing do a regular process + return mEffectEqualization->ProcessOne(count, t, start, len); + + sampleCount trackBlockSize = t->GetMaxBlockSize(); + + AudacityProject *p = GetActiveProject(); + WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); + + mEffectEqualization->TrackProgress(count, 0.0); + int bigRuns=len/(mSubBufferSize-mBlockSize); + int trackBlocksPerBig=mSubBufferSize/trackBlockSize; + int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize; + int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize)); + sampleCount currentSample=start; + + for(int bigRun=0;bigRunGet((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize); + currentSample+=trackBlockSize; + } + if(trackLeftovers) { + t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); + currentSample+=trackLeftovers; + } + currentSample-=mBlockSize+(mFilterSize>>1); + + ProcessBuffer4x(mBufferInfo); + if (mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns)) + { + break; + } + output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1))); + } + if(singleProcessLength) { + t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); + ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1)); + output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); + } + + output->Flush(); + ProcessTail(t, output, start, len); + delete output; + return true; +} + +void *EQWorker::Entry() +{ + while(!mExitLoop) { + mMutex->Lock(); + bool bufferAquired=false; + for(int i=0;iUnlock(); + mEffectEqualization48x->ProcessBuffer4x(&mBufferInfoList[i]); + mBufferInfoList[i].mBufferStatus=BufferDone; // we're done + break; + } + if(!bufferAquired) + mMutex->Unlock(); + } + return NULL; +} + +bool EffectEqualization48x::ProcessOne4xThreaded(int count, WaveTrack * t, + sampleCount start, sampleCount len) +{ + sampleCount blockCount=len/mBlockSize; + + if(blockCount<16) // it's not worth 4x processing do a regular process + return ProcessOne4x(count, t, start, len); + if(mThreadCount<=0 || blockCount<256) // dont do it without cores or big data + return ProcessOne4x(count, t, start, len); + + AudacityProject *p = GetActiveProject(); + WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate()); + + sampleCount trackBlockSize = t->GetMaxBlockSize(); + mEffectEqualization->TrackProgress(count, 0.0); + int bigRuns=len/(mSubBufferSize-mBlockSize); + int trackBlocksPerBig=mSubBufferSize/trackBlockSize; + int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize; + int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize)); + sampleCount currentSample=start; + + int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0; + + // fill the first workerDataCount buffers we checked above and there is at least this data + for(int i=0;iGet((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize); + currentSample+=trackBlockSize; + } + if(trackLeftovers) { + t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); + currentSample+=trackLeftovers; + } + currentSample-=mBlockSize+(mFilterSize>>1); + mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin + } + int currentIndex=0; + while(bigBlocksWrittenTrackProgress(count, (double)(bigBlocksWritten)/(double)bigRuns)) + { + break; + } + output->Append((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1))); + bigBlocksWritten++; + if(bigBlocksReadGet((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize); + currentSample+=trackBlockSize; + } + if(trackLeftovers) { + t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers); + currentSample+=trackLeftovers; + } + currentSample-=mBlockSize+(mFilterSize>>1); + mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin + bigBlocksRead++; + } else mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // this is completely unecessary + currentIndex=(currentIndex+1)%mWorkerDataCount; + } + mDataMutex.Unlock(); // Get back in line for data + } + if(singleProcessLength) { + t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); + ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1)); + output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1)); + } + output->Flush(); + ProcessTail(t, output, start, len); + delete output; + return true; +} + +bool EffectEqualization48x::ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len) +{ + // double offsetT0 = t->LongSamplesToTime((sampleCount)offset); + double lenT = t->LongSamplesToTime(len); + // 'start' is the sample offset in 't', the passed in track + // 'startT' is the equivalent time value + // 'output' starts at zero + double startT = t->LongSamplesToTime(start); + + //output has one waveclip for the total length, even though + //t might have whitespace seperating multiple clips + //we want to maintain the original clip structure, so + //only paste the intersections of the new clip. + + //Find the bits of clips that need replacing + std::vector > clipStartEndTimes; + std::vector > clipRealStartEndTimes; //the above may be truncated due to a clip being partially selected + for (WaveClipList::compatibility_iterator it=t->GetClipIterator(); it; it=it->GetNext()) + { + WaveClip *clip; + double clipStartT; + double clipEndT; + + clip = it->GetData(); + clipStartT = clip->GetStartTime(); + clipEndT = clip->GetEndTime(); + if( clipEndT <= startT ) + continue; // clip is not within selection + if( clipStartT >= startT + lenT ) + continue; // clip is not within selection + + //save the actual clip start/end so that we can rejoin them after we paste. + clipRealStartEndTimes.push_back(std::pair(clipStartT,clipEndT)); + + if( clipStartT < startT ) // does selection cover the whole clip? + clipStartT = startT; // don't copy all the new clip + if( clipEndT > startT + lenT ) // does selection cover the whole clip? + clipEndT = startT + lenT; // don't copy all the new clip + + //save them + clipStartEndTimes.push_back(std::pair(clipStartT,clipEndT)); + } + //now go thru and replace the old clips with new + for(unsigned int i=0;iClear(clipStartEndTimes[i].first,clipStartEndTimes[i].second); + // output->Copy(clipStartEndTimes[i].first-startT+offsetT0,clipStartEndTimes[i].second-startT+offsetT0, &toClipOutput); + output->Copy(clipStartEndTimes[i].first-startT,clipStartEndTimes[i].second-startT, &toClipOutput); + if(toClipOutput) + { + //put the processed audio in + bool bResult = t->Paste(clipStartEndTimes[i].first, toClipOutput); + wxASSERT(bResult); // TO DO: Actually handle this. + //if the clip was only partially selected, the Paste will have created a split line. Join is needed to take care of this + //This is not true when the selection is fully contained within one clip (second half of conditional) + if( (clipRealStartEndTimes[i].first != clipStartEndTimes[i].first || + clipRealStartEndTimes[i].second != clipStartEndTimes[i].second) && + !(clipRealStartEndTimes[i].first <= startT && + clipRealStartEndTimes[i].second >= startT+lenT) ) + t->Join(clipRealStartEndTimes[i].first,clipRealStartEndTimes[i].second); + delete toClipOutput; + } + } + return true; +} + + + + +void EffectEqualization48x::Filter4x(sampleCount len, + float *buffer, float *scratchBuffer) +{ + int i; + __m128 real128, imag128; + // Apply FFT + RealFFTf4x(buffer, mEffectEqualization->hFFT); + + // Apply filter + // DC component is purely real + __m128 *localFFTBuffer=(__m128 *)scratchBuffer; + __m128 *localBuffer=(__m128 *)buffer; + + __m128 filterFuncR, filterFuncI; + filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[0]); + localFFTBuffer[0]=_mm_mul_ps(localBuffer[0], filterFuncR); + int halfLength=(len/2); + + bool useBitReverseTable=sMathPath&1; + + for(i=1; ihFFT->BitReversed[i] ]; + imag128=localBuffer[mEffectEqualization->hFFT->BitReversed[i]+1]; + } else { + int bitReversed=SmallReverseBits(i,mEffectEqualization->hFFT->pow2Bits); + real128=localBuffer[bitReversed]; + imag128=localBuffer[bitReversed+1]; + } + filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[i]); + filterFuncI=_mm_set1_ps(mEffectEqualization->mFilterFuncI[i]); + localFFTBuffer[2*i ] = _mm_sub_ps( _mm_mul_ps(real128, filterFuncR), _mm_mul_ps(imag128, filterFuncI)); + localFFTBuffer[2*i+1] = _mm_add_ps( _mm_mul_ps(real128, filterFuncI), _mm_mul_ps(imag128, filterFuncR)); + } + // Fs/2 component is purely real + filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]); + localFFTBuffer[1] = _mm_mul_ps(localBuffer[1], filterFuncR); + + // Inverse FFT and normalization + InverseRealFFTf4x(scratchBuffer, mEffectEqualization->hFFT); + ReorderToTime4x(mEffectEqualization->hFFT, scratchBuffer, buffer); +} + +#endif diff --git a/src/effects/Equalization48x.h b/src/effects/Equalization48x.h new file mode 100644 index 000000000..518d39c2d --- /dev/null +++ b/src/effects/Equalization48x.h @@ -0,0 +1,146 @@ +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + +/********************************************************************** + +Audacity: A Digital Audio Editor + +Equalization48x.h + +Intrinsics (SSE/AVX) and Threaded Equalization + +***********************************************************************/ + +#ifndef __AUDACITY_EFFECT_EQUALIZATION48X__ +#define __AUDACITY_EFFECT_EQUALIZATION48X__ + +// bitwise function selection +// options are +#define MATH_FUNCTION_ORIGINAL 0 // 0 original path +#define MATH_FUNCTION_BITREVERSE_TABLE 1 // 1 SSE BitReverse Table +#define MATH_FUNCTION_SIN_COS_TABLE 2 // 2 SSE SinCos Table +// 3 SSE with SinCos and BitReverse buffer +#define MATH_FUNCTION_SSE 4 // 4 SSE no SinCos and no BitReverse buffer +#define MATH_FUNCTION_THREADED 8 // 8 SSE threaded no SinCos and no BitReverse buffer +// 9 SSE threaded BitReverse Table +// 10 SSE threaded SinCos Table +// 11 SSE threaded with SinCos and BitReverse buffer +//#define MATH_FUNCTION_AVX 16 + +// added by Andrew Hallendorff intrinsics processing +enum EQBufferStatus +{ + BufferEmpty=0, + BufferReady, + BufferBusy, + BufferDone +}; + + +class BufferInfo { +public: + BufferInfo() { mBufferLength=0; mBufferStatus=BufferEmpty; }; + float* mBufferSouce[4]; + float* mBufferDest[4]; + int mBufferLength; + sampleCount mFftWindowSize; + sampleCount mFftFilterSize; + float* mScratchBuffer; + EQBufferStatus mBufferStatus; +}; + +typedef struct { + int x64; + int MMX; + int SSE; + int SSE2; + int SSE3; + int SSSE3; + int SSE41; + int SSE42; + int SSE4a; + int AVX; + int XOP; + int FMA3; + int FMA4; +} MathCaps; + + +class EffectEqualization; +class EffectEqualization48x; +static int EQWorkerCounter=0; + +class EQWorker : public wxThread { +public: + EQWorker():wxThread(wxTHREAD_JOINABLE) { + mBufferInfoList=NULL; + mBufferInfoCount=0; + mMutex=NULL; + mEffectEqualization48x=NULL; + mExitLoop=false; + mThreadID=EQWorkerCounter++; + } + void SetData( BufferInfo* bufferInfoList, int bufferInfoCount, wxMutex *mutex, EffectEqualization48x *effectEqualization48x) { + mBufferInfoList=bufferInfoList; + mBufferInfoCount=bufferInfoCount; + mMutex=mutex; + mEffectEqualization48x=effectEqualization48x; + } + void ExitLoop() { // this will cause the thread to drop from the loops + mExitLoop=true; + } + virtual void* Entry(); + BufferInfo* mBufferInfoList; + int mBufferInfoCount, mThreadID; + wxMutex *mMutex; + EffectEqualization48x *mEffectEqualization48x; + bool mExitLoop; +}; + +class EffectEqualization48x { + +public: + + EffectEqualization48x(); + virtual ~EffectEqualization48x(); + + static MathCaps *GetMathCaps(); + static void SetMathPath(int mathPath); + static int GetMathPath(); + static void AddMathPathOption(int mathPath); + static void RemoveMathPathOption(int mathPath); + + bool Process(EffectEqualization* effectEqualization); + bool Benchmark(EffectEqualization* effectEqualization); +private: + bool TrackCompare(); + bool DeltaTrack(WaveTrack * t, WaveTrack * t2, sampleCount start, sampleCount len); + bool AllocateBuffersWorkers(bool threaded); + bool FreeBuffersWorkers(); + bool ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, sampleCount bufferLength); + bool ProcessBuffer4x(BufferInfo *bufferInfo); + bool ProcessOne4x(int count, WaveTrack * t, sampleCount start, sampleCount len); + bool ProcessOne4xThreaded(int count, WaveTrack * t, sampleCount start, sampleCount len); + bool ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len); + void Filter4x(sampleCount len, float *buffer, float *scratchBuffer); + + EffectEqualization* mEffectEqualization; + int mThreadCount; + sampleCount mFilterSize; + sampleCount mBlockSize; + sampleCount mWindowSize; + int mWorkerDataCount; + int mBlocksPerBuffer; + int mScratchBufferSize; + int mSubBufferSize; + float *mBigBuffer; + BufferInfo* mBufferInfo; + wxMutex mDataMutex; + EQWorker* mEQWorkers; + bool mThreaded; + bool mBenching; + friend EQWorker; +}; + +#endif + +#endif \ No newline at end of file diff --git a/src/prefs/EffectsPrefs.cpp b/src/prefs/EffectsPrefs.cpp index 020a01b6e..02a5be482 100644 --- a/src/prefs/EffectsPrefs.cpp +++ b/src/prefs/EffectsPrefs.cpp @@ -123,6 +123,17 @@ void EffectsPrefs::PopulateOrExchange(ShuttleGui & S) } S.EndStatic(); #endif + +#ifdef EXPERIMENTAL_EQ_SSE_THREADED + S.StartStatic(_("Instruction Set")); + { + S.TieCheckBox(_("&Use SSE/SSE2/.../AVX"), + wxT("/SSE/GUI"), + true); + } + S.EndStatic(); +#endif + } bool EffectsPrefs::Apply() diff --git a/win/Projects/Audacity/Audacity.vcproj b/win/Projects/Audacity/Audacity.vcproj index 1889da939..efa97a9ab 100644 --- a/win/Projects/Audacity/Audacity.vcproj +++ b/win/Projects/Audacity/Audacity.vcproj @@ -634,6 +634,14 @@ RelativePath="..\..\..\src\RealFFTf.h" > + + + + @@ -748,6 +756,14 @@ RelativePath="..\..\..\src\SplashDialog.h" > + + + + @@ -996,6 +1012,14 @@ RelativePath="..\..\..\src\effects\Equalization.h" > + + + +