Andrew Hallendorff's SSE accelerated Equalization.

This commit is contained in:
james.k.crook@gmail.com 2014-01-16 17:55:35 +00:00
parent 3f59126949
commit d847ee7162
13 changed files with 3161 additions and 307 deletions

View File

@ -30,6 +30,10 @@
#ifndef __EXPERIMENTAL__
#define __EXPERIMENTAL__
// ACH 08 Jan 2014
// EQ accelerated code
//#define EXPERIMENTAL_EQ_SSE_THREADED
// LLL, 09 Nov 2013:
// Allow all WASAPI devices, not just loopback
#define EXPERIMENTAL_FULL_WASAPI

View File

@ -1,54 +1,59 @@
/*
* Program: REALFFTF.C
* Author: Philip Van Baren
* Date: 2 September 1993
*
* Description: These routines perform an FFT on real data to get a conjugate-symmetric
* output, and an inverse FFT on conjugate-symmetric input to get a real
* output sequence.
*
* This code is for floating point data.
*
* Modified 8/19/1998 by Philip Van Baren
* - made the InitializeFFT and EndFFT routines take a structure
* holding the length and pointers to the BitReversed and SinTable
* tables.
* Modified 5/23/2009 by Philip Van Baren
* - Added GetFFT and ReleaseFFT routines to retain common SinTable
* and BitReversed tables so they don't need to be reallocated
* and recomputed on every call.
* - Added Reorder* functions to undo the bit-reversal
*
* Copyright (C) 2009 Philip VanBaren
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
* Program: REALFFTF.C
* Author: Philip Van Baren
* Date: 2 September 1993
*
* Description: These routines perform an FFT on real data to get a conjugate-symmetric
* output, and an inverse FFT on conjugate-symmetric input to get a real
* output sequence.
*
* This code is for floating point data.
*
* Modified 8/19/1998 by Philip Van Baren
* - made the InitializeFFT and EndFFT routines take a structure
* holding the length and pointers to the BitReversed and SinTable
* tables.
* Modified 5/23/2009 by Philip Van Baren
* - Added GetFFT and ReleaseFFT routines to retain common SinTable
* and BitReversed tables so they don't need to be reallocated
* and recomputed on every call.
* - Added Reorder* functions to undo the bit-reversal
*
* Copyright (C) 2009 Philip VanBaren
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "Experimental.h"
#include "RealFFTf.h"
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
#include "RealFFTf48x.h"
#endif
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
/*
* Initialize the Sine table and Twiddle pointers (bit-reversed pointers)
* for the FFT routine.
*/
* Initialize the Sine table and Twiddle pointers (bit-reversed pointers)
* for the FFT routine.
*/
HFFT InitializeFFT(int fftlen)
{
int i;
@ -62,10 +67,10 @@ HFFT InitializeFFT(int fftlen)
exit(8);
}
/*
* FFT size is only half the number of data points
* The full FFT output can be reconstructed from this FFT's output.
* (This optimization can be made since the data is real.)
*/
* FFT size is only half the number of data points
* The full FFT output can be reconstructed from this FFT's output.
* (This optimization can be made since the data is real.)
*/
h->Points = fftlen/2;
if((h->SinTable=(fft_type *)malloc(2*h->Points*sizeof(fft_type)))==NULL)
@ -73,6 +78,7 @@ HFFT InitializeFFT(int fftlen)
fprintf(stderr,"Error allocating memory for Sine table.\n");
exit(8);
}
if((h->BitReversed=(int *)malloc(h->Points*sizeof(int)))==NULL)
{
fprintf(stderr,"Error allocating memory for BitReversed.\n");
@ -86,19 +92,28 @@ HFFT InitializeFFT(int fftlen)
temp=(temp >> 1) + (i&mask ? h->Points : 0);
h->BitReversed[i]=temp;
}
}
for(i=0;i<h->Points;i++)
{
h->SinTable[h->BitReversed[i] ]=(fft_type)-sin(2*M_PI*i/(2*h->Points));
h->SinTable[h->BitReversed[i]+1]=(fft_type)-cos(2*M_PI*i/(2*h->Points));
}
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
// new SSE FFT routines work on live data
for(i=0;i<32;i++)
if((1<<i)&fftlen)
h->pow2Bits=i;
InitializeFFT1x(fftlen);
#endif
return h;
}
/*
* Free up the memory allotted for Sin table and Twiddle Pointers
*/
* Free up the memory allotted for Sin table and Twiddle Pointers
*/
void EndFFT(HFFT h)
{
if(h->Points>0) {
@ -157,23 +172,23 @@ void CleanupFFT()
}
/*
* Forward FFT routine. Must call InitializeFFT(fftlen) first!
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
* Imag_i = buffer[ h->BitReversed[i]+1 ] )
* Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
* Forward FFT routine. Must call InitializeFFT(fftlen) first!
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
* Imag_i = buffer[ h->BitReversed[i]+1 ] )
* Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void RealFFTf(fft_type *buffer,HFFT h)
{
fft_type *A,*B;
@ -186,12 +201,12 @@ void RealFFTf(fft_type *buffer,HFFT h)
int ButterfliesPerGroup=h->Points/2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1=buffer+h->Points*2;
@ -258,24 +273,24 @@ void RealFFTf(fft_type *buffer,HFFT h)
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call InitializeFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call InitializeFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf(fft_type *buffer,HFFT h)
{
fft_type *A,*B;
@ -323,12 +338,12 @@ void InverseRealFFTf(fft_type *buffer,HFFT h)
buffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1=buffer+h->Points*2;

View File

@ -6,6 +6,9 @@ typedef struct FFTParamType {
int *BitReversed;
fft_type *SinTable;
int Points;
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
int pow2Bits;
#endif
} FFTParam;
#define HFFT FFTParam *

754
src/RealFFTf48x.cpp Normal file
View File

@ -0,0 +1,754 @@
/**********************************************************************
Audacity: A Digital Audio Editor
RealFFT48x.cpp
Philip Van Baren
Andrew Hallendorff (SSE Mods)
*******************************************************************//**
\file RealFFT48x.cpp
\brief Real FFT with SSE acceleration.
*//****************************************************************/
/*
* Program: REALFFTF.C
* Author: Philip Van Baren
* Date: 2 September 1993
*
* Description: These routines perform an FFT on real data to get a conjugate-symmetric
* output, and an inverse FFT on conjugate-symmetric input to get a real
* output sequence.
*
* This code is for floating point data.
*
* Modified 8/19/1998 by Philip Van Baren
* - made the InitializeFFT and EndFFT routines take a structure
* holding the length and pointers to the BitReversed and SinTable
* tables.
* Modified 5/23/2009 by Philip Van Baren
* - Added GetFFT and ReleaseFFT routines to retain common SinTable
* and BitReversed tables so they don't need to be reallocated
* and recomputed on every call.
* - Added Reorder* functions to undo the bit-reversal
*
* Copyright (C) 2009 Philip VanBaren
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "Experimental.h"
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
#ifndef USE_SSE2
#define USE_SSE2
#endif
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "RealFFTf.h"
#ifdef __WXMSW__
#pragma warning(disable:4305)
#else
#endif
#include "SseMathFuncs.h"
#include <xmmintrin.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
unsigned char smallReverseBitsTable[256];
int tableMask=0;
bool useBitReverseTable=false;
bool useSinCosTable=false;
void TableUsage(int iMask)
{
tableMask=iMask;
useBitReverseTable=((iMask & 1)!=0);
useSinCosTable=((iMask&2)!=0);
}
// note !!! number of bits must be between 9-16
int SmallReverseBits(int bits, int numberBits)
{
return (smallReverseBitsTable[*((unsigned char *)&bits)]<<(numberBits-8))+(smallReverseBitsTable[*(((unsigned char *)&bits)+1)]>>(16-numberBits));
}
/*
* Initialize the Sine table and Twiddle pointers (bit-reversed pointers)
* for the FFT routine.
*/
HFFT InitializeFFT1x(int WXUNUSED( fftlen ) )
{
int i;
//int temp;
//int mask;
//HFFT h;
// this needs to move out but ehh... Andrew Hallendorff
for(i=0;i<256;i++) {
smallReverseBitsTable[i]=0;
for(int maskLow=1, maskHigh=128;maskLow<256;maskLow<<=1,maskHigh>>=1)
if(i&maskLow)
smallReverseBitsTable[i]|=maskHigh;
}
return NULL;
}
/*
* Free up the memory allotted for Sin table and Twiddle Pointers
*/
void EndFFT1x(HFFT h)
{
if(h->Points>0) {
free(h->BitReversed);
free(h->SinTable);
}
h->Points=0;
free(h);
}
#define MAX_HFFT 10
static HFFT hFFTArray[MAX_HFFT] = { NULL };
static int nFFTLockCount[MAX_HFFT] = { 0 };
/* Get a handle to the FFT tables of the desired length */
/* This version keeps common tables rather than allocating a new table every time */
HFFT GetFFT1x(int fftlen)
{
int h,n = fftlen/2;
for(h=0; (h<MAX_HFFT) && (hFFTArray[h] != NULL) && (n != hFFTArray[h]->Points); h++);
if(h<MAX_HFFT) {
if(hFFTArray[h] == NULL) {
hFFTArray[h] = InitializeFFT(fftlen);
nFFTLockCount[h] = 0;
}
nFFTLockCount[h]++;
return hFFTArray[h];
} else {
// All buffers used, so fall back to allocating a new set of tables
return InitializeFFT(fftlen);;
}
}
/* Release a previously requested handle to the FFT tables */
void ReleaseFFT1x(HFFT hFFT)
{
int h;
for(h=0; (h<MAX_HFFT) && (hFFTArray[h] != hFFT); h++);
if(h<MAX_HFFT) {
nFFTLockCount[h]--;
} else {
EndFFT(hFFT);
}
}
/* Deallocate any unused FFT tables */
void CleanupFFT1x()
{
int h;
for(h=0; (h<MAX_HFFT); h++) {
if((nFFTLockCount[h] <= 0) && (hFFTArray[h] != NULL)) {
EndFFT(hFFTArray[h]);
hFFTArray[h] = NULL;
}
}
}
/*
* Forward FFT routine. Must call InitializeFFT(fftlen) first!
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
* Imag_i = buffer[ h->BitReversed[i]+1 ] )
* Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void RealFFTf1x(fft_type *buffer,HFFT h)
{
fft_type *A,*B;
fft_type *sptr;
fft_type *endptr1,*endptr2;
int *br1,*br2;
fft_type HRplus,HRminus,HIplus,HIminus;
fft_type v1,v2,sin,cos;
int ButterfliesPerGroup=h->Points/2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1=buffer+h->Points*2;
while(ButterfliesPerGroup>0)
{
A=buffer;
B=buffer+ButterfliesPerGroup*2;
sptr=h->SinTable;
while(A<endptr1)
{
sin=*sptr;
cos=*(sptr+1);
endptr2=B;
while(A<endptr2)
{
v1=*B*cos + *(B+1)*sin;
v2=*B*sin - *(B+1)*cos;
*B=(*A+v1);
*(A++)=*(B++)-2*v1;
*B=(*A-v2);
*(A++)=*(B++)+2*v2;
}
A=B;
B+=ButterfliesPerGroup*2;
sptr+=2;
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1=h->BitReversed+1;
br2=h->BitReversed+h->Points-1;
while(br1<br2)
{
sin=h->SinTable[*br1];
cos=h->SinTable[*br1+1];
A=buffer+*br1;
B=buffer+*br2;
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus - cos*HIplus);
v2 = (cos*HRminus + sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus + v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
br1++;
br2--;
}
/* Handle the center bin (just need a conjugate) */
A=buffer+*br1+1;
*A=-*A;
/* Handle DC bin separately - and ignore the Fs/2 bin
buffer[0]+=buffer[1];
buffer[1]=(fft_type)0;*/
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=buffer[0]-buffer[1];
buffer[0]+=buffer[1];
buffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call InitializeFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf1x(fft_type *buffer,HFFT h)
{
fft_type *A,*B;
fft_type *sptr;
fft_type *endptr1,*endptr2;
int *br1;
fft_type HRplus,HRminus,HIplus,HIminus;
fft_type v1,v2,sin,cos;
int ButterfliesPerGroup=h->Points/2;
/* Massage input to get the input for a real output sequence. */
A=buffer+2;
B=buffer+h->Points*2-2;
br1=h->BitReversed+1;
while(A<B)
{
sin=h->SinTable[*br1];
cos=h->SinTable[*br1+1];
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus + cos*HIplus);
v2 = (cos*HRminus - sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus - v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
A+=2;
B-=2;
br1++;
}
/* Handle center bin (just need conjugate) */
*(A+1)=-*(A+1);
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=0.5f*(buffer[0]+buffer[1]);
v2=0.5f*(buffer[0]-buffer[1]);
buffer[0]=v1;
buffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1=buffer+h->Points*2;
while(ButterfliesPerGroup>0)
{
A=buffer;
B=buffer+ButterfliesPerGroup*2;
sptr=h->SinTable;
while(A<endptr1)
{
sin=*(sptr++);
cos=*(sptr++);
endptr2=B;
while(A<endptr2)
{
v1=*B*cos - *(B+1)*sin;
v2=*B*sin + *(B+1)*cos;
*B=(*A+v1)*(fft_type)0.5;
*(A++)=*(B++)-v1;
*B=(*A+v2)*(fft_type)0.5;
*(A++)=*(B++)-v2;
}
A=B;
B+=ButterfliesPerGroup*2;
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq1x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
// Copy the data into the real and imaginary outputs
for(int i=1;i<hFFT->Points;i++) {
RealOut[i]=buffer[hFFT->BitReversed[i] ];
ImagOut[i]=buffer[hFFT->BitReversed[i]+1];
}
RealOut[0] = buffer[0]; // DC component
ImagOut[0] = 0;
RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
ImagOut[hFFT->Points] = 0;
}
void ReorderToTime1x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut)
{
// Copy the data into the real outputs
for(int i=0;i<hFFT->Points;i++) {
TimeOut[i*2 ]=buffer[hFFT->BitReversed[i] ];
TimeOut[i*2+1]=buffer[hFFT->BitReversed[i]+1];
}
}
// 4x processing simd
void RealFFTf4x(fft_type *buffer,HFFT h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A,*B;
fft_type *sptr;
__m128 *endptr1,*endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
__m128 HRplus,HRminus,HIplus,HIminus;
__m128 v1,v2,sin,cos;
fft_type iToRad=2*M_PI/(2*h->Points);
int ButterfliesPerGroup=h->Points/2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1=&localBuffer[h->Points*2];
while(ButterfliesPerGroup>0)
{
A=localBuffer;
B=&localBuffer[ButterfliesPerGroup*2];
sptr=h->SinTable;
int iSinCosIndex=0;
int iSinCosCalIndex=0;
while(A<endptr1)
{
v4sfu sin4_2, cos4_2;
if(useSinCosTable) {
sin=_mm_set1_ps(*(sptr++));
cos=_mm_set1_ps(*(sptr++));
} else {
if(!iSinCosCalIndex)
{
v4sfu vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((fft_type )SmallReverseBits(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
sincos_ps(&vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
iSinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
if(iSinCosCalIndex==3)
iSinCosCalIndex=0;
else
iSinCosCalIndex++;
}
iSinCosIndex++;
}
endptr2=B;
while(A<endptr2)
{
v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B=_mm_add_ps( *A, v1);
__m128 temp128 = _mm_set1_ps( 2.0);
*(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
*B=_mm_sub_ps(*A,v2);
*(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
}
A=B;
B=&B[ButterfliesPerGroup*2];
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index=1; // h->BitReversed+1;
br2Index=h->Points-1; //h->BitReversed+h->Points-1;
int iSinCosCalIndex=0;
while(br1Index<br2Index)
{
v4sfu sin4_2, cos4_2;
if(useBitReverseTable) {
br1Value=h->BitReversed[br1Index];
br2Value=h->BitReversed[br2Index];
} else {
br1Value=SmallReverseBits(br1Index,h->pow2Bits);
br2Value=SmallReverseBits(br2Index,h->pow2Bits);
}
if(useSinCosTable) {
sin=_mm_set1_ps(h->SinTable[br1Value]);
cos=_mm_set1_ps(h->SinTable[br1Value+1]);
} else {
if(!iSinCosCalIndex)
{
v4sfu vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(&vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
iSinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
if(iSinCosCalIndex==3)
iSinCosCalIndex=0;
else
iSinCosCalIndex++;
}
}
A=&localBuffer[br1Value];
B=&localBuffer[br2Value];
__m128 temp128 = _mm_set1_ps( 2.0);
HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
temp128 = _mm_set1_ps( 0.5);
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
if(useBitReverseTable)
A=&localBuffer[h->BitReversed[br1Index]+1];
else
A=&localBuffer[SmallReverseBits(br1Index,h->pow2Bits)+1];
// negate sse style
*A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
localBuffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call InitializeFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf4x(fft_type *buffer,HFFT h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A,*B;
fft_type *sptr;
__m128 *endptr1,*endptr2;
int br1Index, br1Value;
__m128 HRplus,HRminus,HIplus,HIminus;
__m128 v1,v2,sin,cos;
fft_type iToRad=2*M_PI/(2*h->Points);
int ButterfliesPerGroup=h->Points/2;
/* Massage input to get the input for a real output sequence. */
A=localBuffer+2;
B=localBuffer+h->Points*2-2;
br1Index=1; //h->BitReversed+1;
int iSinCosCalIndex=0;
while(A<B)
{
v4sfu sin4_2, cos4_2;
if(useBitReverseTable) {
br1Value=h->BitReversed[br1Index];
} else {
br1Value=SmallReverseBits(br1Index,h->pow2Bits);
}
if(useSinCosTable) {
sin=_mm_set1_ps(h->SinTable[br1Value]);
cos=_mm_set1_ps(h->SinTable[br1Value+1]);
} else {
if(!iSinCosCalIndex)
{
v4sfu vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(&vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
iSinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
if(iSinCosCalIndex==3)
iSinCosCalIndex=0;
else
iSinCosCalIndex++;
}
}
HRminus = _mm_sub_ps(*A, *B);
HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
HIminus = _mm_sub_ps( *(A+1), *(B+1));
HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
// negate sse style
*(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
localBuffer[0]=v1;
localBuffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1=localBuffer+h->Points*2;
while(ButterfliesPerGroup>0)
{
A=localBuffer;
B=localBuffer+ButterfliesPerGroup*2;
sptr=h->SinTable;
int iSinCosIndex=0;
int iSinCosCalIndex=0;
while(A<endptr1)
{
v4sfu sin4_2, cos4_2;
if(useSinCosTable) {
sin=_mm_set1_ps(*(sptr++));
cos=_mm_set1_ps(*(sptr++));
} else {
if(!iSinCosCalIndex)
{
v4sfu vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((fft_type )SmallReverseBits(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
sincos_ps(&vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
iSinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[iSinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[iSinCosCalIndex]);
if(iSinCosCalIndex==3)
iSinCosCalIndex=0;
else
iSinCosCalIndex++;
}
iSinCosIndex++;
}
endptr2=B;
while(A<endptr2)
{
v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
*(A++)=_mm_sub_ps(*(B++), v1);
*B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
*(A++)=_mm_sub_ps(*(B++),v2);
}
A=B;
B=&B[ButterfliesPerGroup*2];
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq4x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *localRealOut=(__m128 *)RealOut;
__m128 *localImagOut=(__m128 *)ImagOut;
// Copy the data into the real and imaginary outputs
for(int i=1;i<hFFT->Points;i++) {
int brValue;
if(useBitReverseTable)
brValue=hFFT->BitReversed[i];
else
brValue=SmallReverseBits(i,hFFT->pow2Bits);
localRealOut[i]=localBuffer[brValue ];
localImagOut[i]=localBuffer[brValue+1];
}
localRealOut[0] = localBuffer[0]; // DC component
localImagOut[0] = _mm_set1_ps(0.0);
localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
}
void ReorderToTime4x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *localTimeOut=(__m128 *)TimeOut;
// Copy the data into the real outputs
for(int i=0;i<hFFT->Points;i++) {
int brValue;
if(useBitReverseTable)
brValue=hFFT->BitReversed[i];
else
brValue=SmallReverseBits(i,hFFT->pow2Bits);
localTimeOut[i*2 ]=localBuffer[brValue ];
localTimeOut[i*2+1]=localBuffer[brValue+1];
}
}
#endif

23
src/RealFFTf48x.h Normal file
View File

@ -0,0 +1,23 @@
#ifndef __realfftf48x_h
#define __realfftf48x_h
#define fft_type float
HFFT InitializeFFT1x(int);
void EndFFT1x(HFFT);
HFFT GetFFT1x(int);
void ReleaseFFT1x(HFFT);
void CleanupFFT1x();
void RealFFTf1x(fft_type *,HFFT);
void InverseRealFFTf1x(fft_type *,HFFT);
void ReorderToTime1x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut);
void ReorderToFreq1x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut);
int SmallReverseBits(int bits, int numberBits);
void RealFFTf4x(fft_type *,HFFT);
void InverseRealFFTf4x(fft_type *,HFFT);
void ReorderToTime4x(HFFT hFFT, fft_type *buffer, fft_type *TimeOut);
void ReorderToFreq4x(HFFT hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut);
void TableUsage(int iMask);
#endif

698
src/SseMathFuncs.cpp Normal file
View File

@ -0,0 +1,698 @@
/**********************************************************************
Audacity: A Digital Audio Editor
SseMathFuncs.cpp
Stephen Moshier (wrote original C version, The Cephes Library)
Julien Pommier (converted to use SSE)
Andrew Hallendorff (adapted for Audacity)
*******************************************************************//**
\file SseMathfuncs.cpp
\brief SSE maths functions (for FFTs)
*//****************************************************************/
#include "SseMathFuncs.h"
/* JKC: The trig functions use Taylor's series, on the range 0 to Pi/4
* computing both Sin and Cos, and using one or the other (in the
* solo functions), or both in the more useful for us for FFTs sincos
* function.
* The constants minus_cephes_DP1 to minus_cephes_DP3 are used in the
* angle reduction modulo function.
* 4 sincos are done at a time.
* If we wanted to do just sin or just cos, we could speed things up
* by queuing up the Sines and Cosines into batches of 4 separately.*/
#ifndef USE_SSE2 //sry this is all sse2 now
#define USE_SSE2
#endif
/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { (float)Val, (float)Val, (float)Val, (float)Val }
#define _PI32_CONST(Name, Val) \
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
#ifndef USE_SSE2
typedef union xmm_mm_union {
__m128 xmm;
__m64 mm[2];
} xmm_mm_union;
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
xmm_mm_union u; u.xmm = xmm_; \
mm0_ = u.mm[0]; \
mm1_ = u.mm[1]; \
}
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
#endif // USE_SSE2
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
__m128 log_ps(v4sfu *xPtr) {
__m128 x=*((__m128 *)xPtr);
#ifdef USE_SSE2
__m128i emm0;
#else
__m64 mm0, mm1;
#endif
__m128 one = *(__m128*)_ps_1;
__m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos); /* cut off denormalized stuff */
#ifndef USE_SSE2
/* part 1: x = frexpf(x, &e); */
COPY_XMM_TO_MM(x, mm0, mm1);
mm0 = _mm_srli_pi32(mm0, 23);
mm1 = _mm_srli_pi32(mm1, 23);
#else
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
/* keep only the fractional part */
x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
x = _mm_or_ps(x, *(__m128*)_ps_0p5);
#ifndef USE_SSE2
/* now e=mm0:mm1 contain the really base-2 exponent */
mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
__m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
_mm_empty(); /* bye bye mmx */
#else
emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f);
__m128 e = _mm_cvtepi32_ps(emm0);
#endif
e = _mm_add_ps(e, one);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
__m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
__m128 tmp = _mm_and_ps(x, mask);
x = _mm_sub_ps(x, one);
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
x = _mm_add_ps(x, tmp);
__m128 z = _mm_mul_ps(x,x);
__m128 y = *(__m128*)_ps_cephes_log_p0;
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
y = _mm_mul_ps(y, x);
y = _mm_mul_ps(y, z);
tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
y = _mm_add_ps(y, tmp);
tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
__m128 exp_ps(v4sfu *xPtr) {
__m128 x=*((__m128 *)xPtr);
__m128 tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
__m128i emm0;
#else
__m64 mm0, mm1;
#endif
__m128 one = *(__m128*)_ps_1;
x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
fx = _mm_add_ps(fx, *(__m128*)_ps_0p5);
/* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
/* step 1 : cast to int */
tmp = _mm_movehl_ps(tmp, fx);
mm0 = _mm_cvttps_pi32(fx);
mm1 = _mm_cvttps_pi32(tmp);
/* step 2 : cast back to float */
tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
emm0 = _mm_cvttps_epi32(fx);
tmp = _mm_cvtepi32_ps(emm0);
#endif
/* if greater, substract 1 */
__m128 mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, one);
fx = _mm_sub_ps(tmp, mask);
tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
__m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
x = _mm_sub_ps(x, tmp);
x = _mm_sub_ps(x, z);
z = _mm_mul_ps(x,x);
__m128 y = *(__m128*)_ps_cephes_exp_p0;
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, x);
y = _mm_add_ps(y, one);
/* build 2^n */
#ifndef USE_SSE2
z = _mm_movehl_ps(z, fx);
mm0 = _mm_cvttps_pi32(fx);
mm1 = _mm_cvttps_pi32(z);
mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
mm0 = _mm_slli_pi32(mm0, 23);
mm1 = _mm_slli_pi32(mm1, 23);
__m128 pow2n;
COPY_MM_TO_XMM(mm0, mm1, pow2n);
_mm_empty();
#else
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
__m128 pow2n = _mm_castsi128_ps(emm0);
#endif
y = _mm_mul_ps(y, pow2n);
return y;
}
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
it runs also on old athlons XPs and the pentium III of your grand
mother.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.
Performance is also surprisingly good, 1.33 times faster than the
macos vsinf SSE2 function, and 1.5 times faster than the
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
too bad for an SSE1 function (with no special tuning) !
However the latter libraries probably have a much better handling of NaN,
Inf, denormalized and other special arguments..
On my core 1 duo, the execution of this function takes approximately 95 cycles.
From what I have observed on the experiments with Intel AMath lib, switching to an
SSE2 version would improve the perf by only 10%.
Since it is based on SSE intrinsics, it has to be compiled at -O2 to
deliver full speed.
*/
__m128 sin_ps(v4sfu *xPtr) { // any x
__m128 x=*((__m128 *)xPtr);
__m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
__m128i emm0, emm2;
#else
__m64 mm0, mm1, mm2, mm3;
#endif
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps(sign_bit, *(__m128*)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
/* get the swap sign flag */
emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
and another one for Pi/4<x<=Pi/2
Both branches will be computed.
*/
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
__m128 swap_sign_bit = _mm_castsi128_ps(emm0);
__m128 poly_mask = _mm_castsi128_ps(emm2);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
/* store the integer part of y in mm0:mm1 */
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
/* get the swap sign flag */
mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
/* get the polynom selection mask */
mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
__m128 swap_sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
_mm_empty(); /* good-bye mmx */
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m128*)_ps_minus_cephes_DP1;
xmm2 = *(__m128*)_ps_minus_cephes_DP2;
xmm3 = *(__m128*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(__m128*)_ps_coscof_p0;
__m128 z = _mm_mul_ps(x,x);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(__m128*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
__m128 y2 = *(__m128*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y,y2);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
return y;
}
/* almost the same as sin_ps */
__m128 cos_ps(v4sfu *xPtr) { // any x
__m128 x=*((__m128 *)xPtr);
__m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
__m128i emm0, emm2;
#else
__m64 mm0, mm1, mm2, mm3;
#endif
/* take the absolute value */
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);
/* get the swap sign flag */
emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynom selection mask */
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
__m128 sign_bit = _mm_castsi128_ps(emm0);
__m128 poly_mask = _mm_castsi128_ps(emm2);
#else
/* store the integer part of y in mm0:mm1 */
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);
/* get the swap sign flag in mm0:mm1 and the
polynom selection mask in mm2:mm3 */
mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
__m128 sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
_mm_empty(); /* good-bye mmx */
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m128*)_ps_minus_cephes_DP1;
xmm2 = *(__m128*)_ps_minus_cephes_DP2;
xmm3 = *(__m128*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(__m128*)_ps_coscof_p0;
__m128 z = _mm_mul_ps(x,x);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(__m128*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
__m128 y2 = *(__m128*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y,y2);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
return y;
}
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
__m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
__m128i emm0, emm2, emm4;
#else
__m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
sign_bit_sin = x;
/* take the absolute value */
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm4 = emm2;
/* get the swap sign flag for the sine */
emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
/* get the polynom selection mask for the sine*/
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
__m128 poly_mask = _mm_castsi128_ps(emm2);
#else
/* store the integer part of y in mm2:mm3 */
xmm3 = _mm_movehl_ps(xmm3, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm3);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm4 = mm2;
mm5 = mm3;
/* get the swap sign flag for the sine */
mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
__m128 swap_sign_bit_sin;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
/* get the polynom selection mask for the sine */
mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
__m128 poly_mask;
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m128*)_ps_minus_cephes_DP1;
xmm2 = *(__m128*)_ps_minus_cephes_DP2;
xmm3 = *(__m128*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
#ifdef USE_SSE2
emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
emm4 = _mm_slli_epi32(emm4, 29);
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
/* get the sign flag for the cosine */
mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
mm4 = _mm_slli_pi32(mm4, 29);
mm5 = _mm_slli_pi32(mm5, 29);
__m128 sign_bit_cos;
COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
_mm_empty(); /* good-bye mmx */
#endif
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x,x);
y = *(__m128*)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(__m128*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
__m128 y2 = *(__m128*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2);
xmm2 = _mm_add_ps(y,y2);
/* update the sign */
*s = _mm_xor_ps(xmm1, sign_bit_sin);
*c = _mm_xor_ps(xmm2, sign_bit_cos);
}

80
src/SseMathFuncs.h Normal file
View File

@ -0,0 +1,80 @@
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
The default is to use the SSE1 version. If you define USE_SSE2 the
the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
not expect any significant performance improvement with SSE2.
*/
/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#ifndef SSE_MATHFUN
#define SSE_MATHFUN
#include <inttypes.h>
#include <xmmintrin.h>
/* yes I know, the top of this file is quite ugly */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
/* __m128 is ugly to write */
//typedef __m128 _v4sfu; // vector of 4 float (sse1)
#ifndef USE_SSE2 //sry this is all sse2 now
#define USE_SSE2
#endif
#ifdef USE_SSE2
# include <emmintrin.h>
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif
// !!! Andrew Hallendorff Warning changed call structure to make compatible with gcc
typedef ALIGN16_BEG union {
float m128_f32[4];
int8_t m128_i8[16];
int16_t m128_i16[8];
int32_t m128_i32[4];
int64_t m128_i64[2];
uint8_t m128_u8[16];
uint16_t m128_u16[8];
uint32_t m128_u32[4];
uint64_t m128_u64[2];
} ALIGN16_END v4sfu;
__m128 log_ps(v4sfu *xPtr);
__m128 sin_ps(v4sfu *xPtr);
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr);
#endif

File diff suppressed because it is too large Load Diff

View File

@ -74,6 +74,10 @@ public:
};
WX_DECLARE_OBJARRAY( EQCurve, EQCurveArray );
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
class EffectEqualization48x;
#endif
class EffectEqualization: public Effect {
public:
@ -113,12 +117,15 @@ public:
// low range of human hearing
enum {loFreqI=20};
private:
bool ProcessOne(int count, WaveTrack * t,
sampleCount start, sampleCount len);
void Filter(sampleCount len,
float *buffer);
void ReadPrefs();
HFFT hFFT;
@ -135,6 +142,11 @@ private:
bool mPrompting;
bool mDrawGrid;
bool mEditingBatchParams;
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
bool mBench;
EffectEqualization48x *mEffectEqualization48x;
friend class EffectEqualization48x;
#endif
public:
@ -222,6 +234,9 @@ public:
void EnvelopeUpdated(Envelope *env, bool lin);
static const double thirdOct[];
wxRadioButton *mFaderOrDraw[2];
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
wxRadioButton *mMathProcessingType[5]; // default, sse, sse threaded, AVX, AVX threaded (note AVX is not implemented yet
#endif
wxChoice *mInterpChoice;
wxCheckBox *mLinFreq;
int M;
@ -276,6 +291,14 @@ private:
ID_INVERT,
drawRadioID,
sliderRadioID,
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
defaultMathRadioID,
sSERadioID,
sSEThreadedRadioID,
aVXRadioID,
aVXThreadedRadioID,
ID_BENCH,
#endif
ID_INTERP,
ID_LIN_FREQ,
GridOnOffID,
@ -294,6 +317,10 @@ private:
void OnSliderDBMIN( wxCommandEvent &event );
void OnDrawRadio(wxCommandEvent &event );
void OnSliderRadio(wxCommandEvent &event );
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
void OnProcessingRadio(wxCommandEvent &event );
void OnBench( wxCommandEvent & event);
#endif
void OnLinFreq(wxCommandEvent &event );
void UpdateGraphic(void);
void EnvLogToLin(void);
@ -339,6 +366,9 @@ private:
wxBoxSizer *szrH;
wxBoxSizer *szrI;
wxBoxSizer *szrL;
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
wxBoxSizer *szrM;
#endif
wxFlexGridSizer *szr1;
wxBoxSizer *szr2;
wxBoxSizer *szr3;

View File

@ -0,0 +1,924 @@
/**********************************************************************
Audacity: A Digital Audio Editor
EffectEqualization.cpp
Andrew Hallendorff
*******************************************************************//**
\file Equalization48x.cpp
\brief Fast SSE based implementation of equalization.
*//****************************************************************/
#include "../Audacity.h"
#include "../Project.h"
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
#include "Equalization.h"
#include "../WaveTrack.h"
#include "float_cast.h"
#include <vector>
#include <wx/dcmemory.h>
#include <wx/event.h>
#include <wx/string.h>
#if wxUSE_TOOLTIPS
#include <wx/tooltip.h>
#endif
#include <wx/utils.h>
#include <math.h>
#include <wx/arrimpl.cpp>
#include "Equalization48x.h"
#include "../RealFFTf.h"
#include "../RealFFTf48x.h"
#ifndef USE_SSE2
#define USE_SSE2
#endif
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <math.h>
#include <xmmintrin.h>
#ifdef _WIN32
// Windows
#include <intrin.h>
#define cpuid __cpuid
#else
// GCC Inline Assembly
void cpuid(int CPUInfo[4],int InfoType){
__asm__ __volatile__ (
"cpuid":
"=a" (CPUInfo[0]),
"=b" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
"a" (InfoType)
);
}
#endif
bool sMathCapsInitialized = false;
MathCaps sMathCaps;
// dirty switcher
int sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED;
void EffectEqualization48x::SetMathPath(int mathPath) { sMathPath=mathPath; };
int EffectEqualization48x::GetMathPath() { return sMathPath; };
void EffectEqualization48x::AddMathPathOption(int mathPath) { sMathPath|=mathPath; };
void EffectEqualization48x::RemoveMathPathOption(int mathPath) { sMathPath&=~mathPath; };
MathCaps *EffectEqualization48x::GetMathCaps()
{
if(!sMathCapsInitialized)
{
sMathCapsInitialized=true;
sMathCaps.x64 = false;
sMathCaps.MMX = false;
sMathCaps.SSE = false;
sMathCaps.SSE2 = false;
sMathCaps.SSE3 = false;
sMathCaps.SSSE3 = false;
sMathCaps.SSE41 = false;
sMathCaps.SSE42 = false;
sMathCaps.SSE4a = false;
sMathCaps.AVX = false;
sMathCaps.XOP = false;
sMathCaps.FMA3 = false;
sMathCaps.FMA4 = false;
int info[4];
cpuid(info, 0);
int nIds = info[0];
cpuid(info, 0x80000000);
int nExIds = info[0];
// Detect Instruction Set
if (nIds >= 1){
cpuid(info,0x00000001);
sMathCaps.MMX = (info[3] & ((int)1 << 23)) != 0;
sMathCaps.SSE = (info[3] & ((int)1 << 25)) != 0;
sMathCaps.SSE2 = (info[3] & ((int)1 << 26)) != 0;
sMathCaps.SSE3 = (info[2] & ((int)1 << 0)) != 0;
sMathCaps.SSSE3 = (info[2] & ((int)1 << 9)) != 0;
sMathCaps.SSE41 = (info[2] & ((int)1 << 19)) != 0;
sMathCaps.SSE42 = (info[2] & ((int)1 << 20)) != 0;
sMathCaps.AVX = (info[2] & ((int)1 << 28)) != 0;
sMathCaps.FMA3 = (info[2] & ((int)1 << 12)) != 0;
}
if (nExIds >= 0x80000001){
cpuid(info,0x80000001);
sMathCaps.x64 = (info[3] & ((int)1 << 29)) != 0;
sMathCaps.SSE4a = (info[2] & ((int)1 << 6)) != 0;
sMathCaps.FMA4 = (info[2] & ((int)1 << 16)) != 0;
sMathCaps.XOP = (info[2] & ((int)1 << 11)) != 0;
}
if(sMathCaps.SSE)
sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; // we are starting on.
}
return &sMathCaps;
};
void * malloc_simd(const size_t size)
{
#if defined WIN32 // WIN32
return _aligned_malloc(size, 16);
#elif defined __linux__ // Linux
return memalign (16, size);
#elif defined __MACH__ // Mac OS X
return malloc(size);
#else // other (use valloc for page-aligned memory)
return valloc(size);
#endif
}
void free_simd(void* mem)
{
#if defined WIN32 // WIN32
_aligned_free(mem);
#else
free(mem);
#endif
}
EffectEqualization48x::EffectEqualization48x():
mThreadCount(0),mFilterSize(0),mWindowSize(0),mBlockSize(0),mWorkerDataCount(0),mBlocksPerBuffer(20),
mScratchBufferSize(0),mSubBufferSize(0),mBigBuffer(NULL),mBufferInfo(NULL),mEQWorkers(0),mThreaded(false),
mBenching(false)
{
}
EffectEqualization48x::~EffectEqualization48x()
{
}
bool EffectEqualization48x::AllocateBuffersWorkers(bool threaded)
{
if(mBigBuffer)
FreeBuffersWorkers();
mFilterSize=(mEffectEqualization->mM-1)&(~15); // 4000 !!! Filter MUST BE QUAD WORD ALIGNED !!!!
mWindowSize=mEffectEqualization->windowSize;
mBlockSize=mWindowSize-mFilterSize; // 12,384
mThreaded=threaded;
if( mThreaded )
{
mThreadCount=wxThread::GetCPUCount();
mWorkerDataCount=mThreadCount+2; // 2 extra slots (maybe double later)
// we're skewing the data by one block to allow for 1/4 block intersections.
// this will remove the disparity in data at the intersections of the runs
// The nice magic allocation
// megabyte - 3 windows - 4 overlaping buffers - filter
// 2^20 = 1,048,576 - 3 * 2^14 (16,384) - ((4 * 20) - 3) * 12,384 - 4000
// 1,048,576 - 49,152 - 953,568 - 4000 = 41,856 (leftover)
mScratchBufferSize=mWindowSize*3*(sizeof(__m128)/sizeof(float)); // 3 window size blocks size of __m128 but we allocate in float
mSubBufferSize=mBlockSize*((mBlocksPerBuffer<<2)-3); // we are going to do a full block overlap -(blockSize*3)
mBigBuffer=(float *)malloc_simd(sizeof(float)*(mSubBufferSize+mFilterSize+mScratchBufferSize)*mWorkerDataCount); // we run over by filtersize
// fill the bufferInfo
mBufferInfo = new BufferInfo[mWorkerDataCount];
for(int i=0;i<mWorkerDataCount;i++) {
mBufferInfo[i].mFftWindowSize=mWindowSize;
mBufferInfo[i].mFftFilterSize=mFilterSize;
mBufferInfo[i].mBufferLength=mBlockSize*mBlocksPerBuffer;
mBufferInfo[i].mScratchBuffer=&mBigBuffer[(mSubBufferSize+mScratchBufferSize)*i+mSubBufferSize];
for(int j=0;j<4;j++)
mBufferInfo[i].mBufferDest[j]=mBufferInfo[i].mBufferSouce[j]=&mBigBuffer[j*(mBufferInfo[i].mBufferLength-mBlockSize)+(mSubBufferSize+mScratchBufferSize)*i];
}
// start the workers
mDataMutex.IsOk();
mEQWorkers=new EQWorker[mThreadCount];
for(int i=0;i<mThreadCount;i++) {
mEQWorkers[i].SetData( mBufferInfo, mWorkerDataCount, &mDataMutex, this);
mEQWorkers[i].Create();
mEQWorkers[i].Run();
}
} else {
mScratchBufferSize=mWindowSize*3*(sizeof(__m128)/sizeof(float)); // 3 window size blocks size of __m128
mSubBufferSize=mBlockSize*((mBlocksPerBuffer<<2)-3); // we are going to do a full block overlap -(blockSize*3)
mBigBuffer=(float *)malloc_simd(sizeof(float)*(mSubBufferSize+mFilterSize+mScratchBufferSize)); // we run over by filtersize
mBufferInfo = new BufferInfo[1]; // yeah it looks odd but it keeps compatibility with threaded processing
mBufferInfo[0].mFftWindowSize=mWindowSize;
mBufferInfo[0].mFftFilterSize=mFilterSize;
mBufferInfo[0].mBufferLength=mBlockSize*mBlocksPerBuffer;
mBufferInfo[0].mScratchBuffer=&mBigBuffer[mSubBufferSize];
for(int j=0;j<4;j++)
mBufferInfo[0].mBufferDest[j]=mBufferInfo[0].mBufferSouce[j]=&mBigBuffer[j*(mBufferInfo[0].mBufferLength-mBlockSize)];
}
return true;
}
bool EffectEqualization48x::FreeBuffersWorkers()
{
if(mThreaded) {
for(int i=0;i<mThreadCount;i++) { // tell all the workers to exit
mEQWorkers[i].ExitLoop();
}
for(int i=0;i<mThreadCount;i++) {
mEQWorkers[i].Wait();
}
delete[] mEQWorkers; // kill the workers ( go directly to jail)
mEQWorkers= NULL;
mThreadCount=0;
mWorkerDataCount=0;
}
delete [] mBufferInfo;
mBufferInfo = NULL;
free_simd(mBigBuffer);
mBigBuffer=NULL;
return true;
}
bool EffectEqualization48x::Process(EffectEqualization* effectEqualization)
{
mEffectEqualization=effectEqualization;
// return TrackCompare(); // used for debugging data
mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
bool bGoodResult = true;
TableUsage(sMathPath);
if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
AllocateBuffersWorkers((sMathPath & MATH_FUNCTION_THREADED) != 0);
SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);
WaveTrack *track = (WaveTrack *) iter.First();
int count = 0;
while (track) {
double trackStart = track->GetStartTime();
double trackEnd = track->GetEndTime();
double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
if (t1 > t0) {
sampleCount start = track->TimeToLongSamples(t0);
sampleCount end = track->TimeToLongSamples(t1);
sampleCount len = (sampleCount)(end - start);
if(!sMathPath) {
if (!mEffectEqualization->ProcessOne(count, track, start, len))
{
bGoodResult = false;
break;
}
} else {
if(sMathPath<8) {
if (!ProcessOne4x(count, track, start, len))
{
bGoodResult = false;
break;
}
} else {
if (!ProcessOne4xThreaded(count, track, start, len))
{
bGoodResult = false;
break;
}
}
}
}
track = (WaveTrack *) iter.Next();
count++;
}
FreeBuffersWorkers();
mEffectEqualization->ReplaceProcessedTracks(bGoodResult);
return bGoodResult;
}
bool EffectEqualization48x::TrackCompare()
{
mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
bool bGoodResult = true;
TableUsage(sMathPath);
if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
AllocateBuffersWorkers((sMathPath & MATH_FUNCTION_THREADED)!=0);
// Reset map
wxArrayPtrVoid SecondIMap;
wxArrayPtrVoid SecondOMap;
SecondIMap.Clear();
SecondOMap.Clear();
TrackList *SecondOutputTracks = new TrackList();
//iterate over tracks of type trackType (All types if Track::All)
TrackListOfKindIterator aIt(mEffectEqualization->mOutputTracksType, mEffectEqualization->mTracks);
for (Track *aTrack = aIt.First(); aTrack; aTrack = aIt.Next()) {
// Include selected tracks, plus sync-lock selected tracks for Track::All.
if (aTrack->GetSelected() ||
(mEffectEqualization->mOutputTracksType == Track::All && aTrack->IsSyncLockSelected()))
{
Track *o = aTrack->Duplicate();
SecondOutputTracks->Add(o);
SecondIMap.Add(aTrack);
SecondIMap.Add(o);
}
}
for(int i=0;i<2;i++) {
SelectedTrackListOfKindIterator iter(Track::Wave, i?mEffectEqualization->mOutputTracks:SecondOutputTracks);
i?sMathPath=sMathPath:sMathPath=0;
WaveTrack *track = (WaveTrack *) iter.First();
int count = 0;
while (track) {
double trackStart = track->GetStartTime();
double trackEnd = track->GetEndTime();
double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
if (t1 > t0) {
sampleCount start = track->TimeToLongSamples(t0);
sampleCount end = track->TimeToLongSamples(t1);
sampleCount len = (sampleCount)(end - start);
if(!sMathPath) {
if (!mEffectEqualization->ProcessOne(count, track, start, len))
{
bGoodResult = false;
break;
}
} else {
if(sMathPath<8) {
if (!ProcessOne4x(count, track, start, len))
{
bGoodResult = false;
break;
}
} else {
if (!ProcessOne4xThreaded(count, track, start, len))
{
bGoodResult = false;
break;
}
}
}
}
track = (WaveTrack *) iter.Next();
count++;
}
}
SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);
SelectedTrackListOfKindIterator iter2(Track::Wave, SecondOutputTracks);
WaveTrack *track = (WaveTrack *) iter.First();
WaveTrack *track2 = (WaveTrack *) iter2.First();
while (track) {
double trackStart = track->GetStartTime();
double trackEnd = track->GetEndTime();
double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
if (t1 > t0) {
sampleCount start = track->TimeToLongSamples(t0);
sampleCount end = track->TimeToLongSamples(t1);
sampleCount len = (sampleCount)(end - start);
DeltaTrack(track, track2, start, len);
}
track = (WaveTrack *) iter.Next();
track2 = (WaveTrack *) iter2.Next();
}
delete SecondOutputTracks;
FreeBuffersWorkers();
mEffectEqualization->ReplaceProcessedTracks(bGoodResult);
return bGoodResult;
}
bool EffectEqualization48x::DeltaTrack(WaveTrack * t, WaveTrack * t2, sampleCount start, sampleCount len)
{
sampleCount trackBlockSize = t->GetMaxBlockSize();
float *buffer1 = new float[trackBlockSize];
float *buffer2 = new float[trackBlockSize];
AudacityProject *p = GetActiveProject();
WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
sampleCount originalLen = len;
sampleCount currentSample = start;
while(len) {
sampleCount curretLength=(trackBlockSize>len)?len:trackBlockSize;
t->Get((samplePtr)buffer1, floatSample, currentSample, curretLength);
t2->Get((samplePtr)buffer2, floatSample, currentSample, curretLength);
for(int i=0;i<curretLength;i++)
buffer1[i]-=buffer2[i];
output->Append((samplePtr)buffer1, floatSample, curretLength);
currentSample+=curretLength;
len-=curretLength;
}
delete[] buffer1;
delete[] buffer2;
output->Flush();
len=originalLen;
ProcessTail(t, output, start, len);
delete output;
return true;
}
bool EffectEqualization48x::Benchmark(EffectEqualization* effectEqualization)
{
mEffectEqualization=effectEqualization;
mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
bool bGoodResult = true;
TableUsage(sMathPath);
if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
AllocateBuffersWorkers((bool)MATH_FUNCTION_THREADED);
SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);
long times[] = { 0,0,0 };
wxStopWatch timer;
mBenching=true;
for(int i=0;i<3;i++) {
int localMathPath;
switch(i) {
case 0: localMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED;
if(!sMathCaps.SSE)
localMathPath=-1;
break;
case 1: localMathPath=MATH_FUNCTION_SSE;
if(!sMathCaps.SSE)
localMathPath=-1;
break;
case 2: localMathPath=0;
break;
default: localMathPath=-1;
}
if(localMathPath>=0) {
timer.Start();
WaveTrack *track = (WaveTrack *) iter.First();
int count = 0;
while (track) {
double trackStart = track->GetStartTime();
double trackEnd = track->GetEndTime();
double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
if (t1 > t0) {
sampleCount start = track->TimeToLongSamples(t0);
sampleCount end = track->TimeToLongSamples(t1);
sampleCount len = (sampleCount)(end - start);
if(!localMathPath) {
if (!mEffectEqualization->ProcessOne(count, track, start, len))
{
bGoodResult = false;
break;
}
} else {
if(localMathPath<8) {
if (!ProcessOne4x(count, track, start, len))
{
bGoodResult = false;
break;
}
} else {
if (!ProcessOne4xThreaded(count, track, start, len))
{
bGoodResult = false;
break;
}
}
}
}
track = (WaveTrack *) iter.Next();
count++;
}
times[i]=timer.Time();
}
}
FreeBuffersWorkers();
mBenching=false;
bGoodResult=false;
mEffectEqualization->ReplaceProcessedTracks(bGoodResult);
wxTimeSpan tsSSEThreaded(0, 0, 0, times[0]);
wxTimeSpan tsSSE(0, 0, 0, times[1]);
wxTimeSpan tsDefault(0, 0, 0, times[2]);
wxMessageBox(wxString::Format(_("Benchmark times:\nDefault: %s\nSSE: %s\nSSE Threaded: %s\n"),tsDefault.Format(wxT("%M:%S.%l")).c_str(),tsSSE.Format(wxT("%M:%S.%l")).c_str(),tsSSEThreaded.Format(wxT("%M:%S.%l")).c_str()));
/* wxTimeSpan tsSSEThreaded(0, 0, 0, times[0]);
wxTimeSpan tsSSE(0, 0, 0, times[1]);
wxTimeSpan tsDefault(0, 0, 0, times[2]);
wxString outputString;
outputString.Format(_("Benchmark times:\nDefault: %s\nSSE: %s\nSSE Threaded: %s\n"),tsDefault.Format(wxT("%M:%S.%l")),tsSSE.Format(wxT("%M:%S.%l")),tsSSEThreaded.Format(wxT("%M:%S.%l")));
wxMessageBox(outputString); */
return bGoodResult;
}
bool EffectEqualization48x::ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, sampleCount bufferLength)
{
sampleCount blockCount=bufferLength/mBlockSize;
sampleCount lastBlockSize=bufferLength%mBlockSize;
if(lastBlockSize)
blockCount++;
float *workBuffer=&sourceBuffer[bufferLength]; // all scratch buffers are at the end
for(int runx=0;runx<blockCount;runx++)
{
float *currentBuffer=&workBuffer[mWindowSize*(runx&1)];
for(int i=0;i<mBlockSize;i++)
currentBuffer[i]=sourceBuffer[i];
sourceBuffer+=mBlockSize;
float *currentFilter=&currentBuffer[mBlockSize];
for(int i=0;i<mFilterSize;i++)
currentFilter[i]=0;
mEffectEqualization->Filter(mWindowSize, currentBuffer);
float *writeEnd=currentBuffer+mBlockSize;
if(runx==blockCount)
writeEnd=currentBuffer+(lastBlockSize+mFilterSize);
if(runx) {
float *lastOverrun=&workBuffer[mWindowSize*((runx+1)&1)+mBlockSize];
for(int j=0;j<mFilterSize;j++)
*destBuffer++= *currentBuffer++ + *lastOverrun++;
} else
currentBuffer+=mFilterSize>>1; // this will skip the first filterSize on the first run
while(currentBuffer<writeEnd)
*destBuffer++ = *currentBuffer++;
}
return true;
}
bool EffectEqualization48x::ProcessBuffer4x(BufferInfo *bufferInfo)
{
// length must be a factor of window size for 4x processing.
if(bufferInfo->mBufferLength%mBlockSize)
return false;
sampleCount blockCount=bufferInfo->mBufferLength/mBlockSize;
__m128 *readBlocks[4]; // some temps so we dont destroy the vars in the struct
__m128 *writeBlocks[4];
for(int i=0;i<4;i++) {
readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i];
writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i];
}
__m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer;
__m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*2];
for(int run4x=0;run4x<blockCount;run4x++)
{
// swizzle the data to the swizzle buffer
__m128 *currentSwizzledBlock=&swizzledBuffer128[mWindowSize*(run4x&1)];
for(int i=0,j=0;j<mBlockSize;i++,j+=4) {
__m128 tmp0 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(1,0,1,0));
__m128 tmp1 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(3,2,3,2));
__m128 tmp2 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(1,0,1,0));
__m128 tmp3 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(3,2,3,2));
currentSwizzledBlock[j] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,2,0));
currentSwizzledBlock[j+1] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3,1,3,1));
currentSwizzledBlock[j+2] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2,0,2,0));
currentSwizzledBlock[j+3] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3,1,3,1));
}
__m128 *thisOverrun128=&currentSwizzledBlock[mBlockSize];
for(int i=0;i<mFilterSize;i++)
thisOverrun128[i]=_mm_set1_ps(0.0);
Filter4x(mWindowSize, (float *)currentSwizzledBlock, (float *)scratchBuffer);
int writeStart=0, writeToStart=0; // note readStart is where the read data is written
int writeEnd=mBlockSize;
if(run4x) {
// maybe later swizzle add and write in one
__m128 *lastOverrun128=&swizzledBuffer128[mWindowSize*((run4x+1)&1)+mBlockSize];
// add and swizzle data + filter
for(int i=0,j=0;j<mFilterSize;i++,j+=4) {
__m128 tmps0 = _mm_add_ps(currentSwizzledBlock[j], lastOverrun128[j]);
__m128 tmps1 = _mm_add_ps(currentSwizzledBlock[j+1], lastOverrun128[j+1]);
__m128 tmps2 = _mm_add_ps(currentSwizzledBlock[j+2], lastOverrun128[j+2]);
__m128 tmps3 = _mm_add_ps(currentSwizzledBlock[j+3], lastOverrun128[j+3]);
__m128 tmp0 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(0,1,0,1));
__m128 tmp1 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(2,3,2,3));
__m128 tmp2 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(0,1,0,1));
__m128 tmp3 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(2,3,2,3));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
}
writeStart=mFilterSize;
writeToStart=mFilterSize>>2;
// swizzle it back.
for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=4) {
__m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
__m128 tmp1 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(2,3,2,3));
__m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(0,1,0,1));
__m128 tmp3 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(2,3,2,3));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
}
} else {
// swizzle it back. We overlap one block so we only write the first block on the first run
writeStart=0;
writeToStart=0;
for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=4) {
__m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
__m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(0,1,0,1));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
}
}
for(int i=0;i<4;i++) { // shift each block
readBlocks[i]+=mBlockSize>>2; // these are 128b pointers, each window is 1/4 blockSize for those
writeBlocks[i]+=mBlockSize>>2;
}
}
return true;
}
bool EffectEqualization48x::ProcessOne4x(int count, WaveTrack * t,
sampleCount start, sampleCount len)
{
sampleCount blockCount=len/mBlockSize;
if(blockCount<16) // it's not worth 4x processing do a regular process
return mEffectEqualization->ProcessOne(count, t, start, len);
sampleCount trackBlockSize = t->GetMaxBlockSize();
AudacityProject *p = GetActiveProject();
WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
mEffectEqualization->TrackProgress(count, 0.0);
int bigRuns=len/(mSubBufferSize-mBlockSize);
int trackBlocksPerBig=mSubBufferSize/trackBlockSize;
int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize;
int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize));
sampleCount currentSample=start;
for(int bigRun=0;bigRun<bigRuns;bigRun++)
{
// fill the buffer
for(int i=0;i<trackBlocksPerBig;i++) {
t->Get((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize);
currentSample+=trackBlockSize;
}
if(trackLeftovers) {
t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
currentSample+=trackLeftovers;
}
currentSample-=mBlockSize+(mFilterSize>>1);
ProcessBuffer4x(mBufferInfo);
if (mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns))
{
break;
}
output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1)));
}
if(singleProcessLength) {
t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
}
output->Flush();
ProcessTail(t, output, start, len);
delete output;
return true;
}
void *EQWorker::Entry()
{
while(!mExitLoop) {
mMutex->Lock();
bool bufferAquired=false;
for(int i=0;i<mBufferInfoCount;i++)
if(mBufferInfoList[i].mBufferStatus==BufferReady) { // we found an unlocked ready buffer
bufferAquired=true;
mBufferInfoList[i].mBufferStatus=BufferBusy; // we own it now
mMutex->Unlock();
mEffectEqualization48x->ProcessBuffer4x(&mBufferInfoList[i]);
mBufferInfoList[i].mBufferStatus=BufferDone; // we're done
break;
}
if(!bufferAquired)
mMutex->Unlock();
}
return NULL;
}
bool EffectEqualization48x::ProcessOne4xThreaded(int count, WaveTrack * t,
sampleCount start, sampleCount len)
{
sampleCount blockCount=len/mBlockSize;
if(blockCount<16) // it's not worth 4x processing do a regular process
return ProcessOne4x(count, t, start, len);
if(mThreadCount<=0 || blockCount<256) // dont do it without cores or big data
return ProcessOne4x(count, t, start, len);
AudacityProject *p = GetActiveProject();
WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
sampleCount trackBlockSize = t->GetMaxBlockSize();
mEffectEqualization->TrackProgress(count, 0.0);
int bigRuns=len/(mSubBufferSize-mBlockSize);
int trackBlocksPerBig=mSubBufferSize/trackBlockSize;
int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize;
int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize));
sampleCount currentSample=start;
int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0;
// fill the first workerDataCount buffers we checked above and there is at least this data
for(int i=0;i<mWorkerDataCount;i++)
{
// fill the buffer
for(int j=0;j<trackBlocksPerBig;j++) {
t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
currentSample+=trackBlockSize;
}
if(trackLeftovers) {
t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
currentSample+=trackLeftovers;
}
currentSample-=mBlockSize+(mFilterSize>>1);
mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin
}
int currentIndex=0;
while(bigBlocksWritten<bigRuns) {
mDataMutex.Lock(); // Get in line for data
// process as many blocks as we can
while((mBufferInfo[currentIndex].mBufferStatus==BufferDone) && (bigBlocksWritten<bigRuns)) { // data is ours
if (mEffectEqualization->TrackProgress(count, (double)(bigBlocksWritten)/(double)bigRuns))
{
break;
}
output->Append((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)));
bigBlocksWritten++;
if(bigBlocksRead<bigRuns) {
// fill the buffer
for(int j=0;j<trackBlocksPerBig;j++) {
t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
currentSample+=trackBlockSize;
}
if(trackLeftovers) {
t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
currentSample+=trackLeftovers;
}
currentSample-=mBlockSize+(mFilterSize>>1);
mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin
bigBlocksRead++;
} else mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // this is completely unecessary
currentIndex=(currentIndex+1)%mWorkerDataCount;
}
mDataMutex.Unlock(); // Get back in line for data
}
if(singleProcessLength) {
t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
}
output->Flush();
ProcessTail(t, output, start, len);
delete output;
return true;
}
bool EffectEqualization48x::ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len)
{
// double offsetT0 = t->LongSamplesToTime((sampleCount)offset);
double lenT = t->LongSamplesToTime(len);
// 'start' is the sample offset in 't', the passed in track
// 'startT' is the equivalent time value
// 'output' starts at zero
double startT = t->LongSamplesToTime(start);
//output has one waveclip for the total length, even though
//t might have whitespace seperating multiple clips
//we want to maintain the original clip structure, so
//only paste the intersections of the new clip.
//Find the bits of clips that need replacing
std::vector<std::pair<double, double> > clipStartEndTimes;
std::vector<std::pair<double, double> > clipRealStartEndTimes; //the above may be truncated due to a clip being partially selected
for (WaveClipList::compatibility_iterator it=t->GetClipIterator(); it; it=it->GetNext())
{
WaveClip *clip;
double clipStartT;
double clipEndT;
clip = it->GetData();
clipStartT = clip->GetStartTime();
clipEndT = clip->GetEndTime();
if( clipEndT <= startT )
continue; // clip is not within selection
if( clipStartT >= startT + lenT )
continue; // clip is not within selection
//save the actual clip start/end so that we can rejoin them after we paste.
clipRealStartEndTimes.push_back(std::pair<double,double>(clipStartT,clipEndT));
if( clipStartT < startT ) // does selection cover the whole clip?
clipStartT = startT; // don't copy all the new clip
if( clipEndT > startT + lenT ) // does selection cover the whole clip?
clipEndT = startT + lenT; // don't copy all the new clip
//save them
clipStartEndTimes.push_back(std::pair<double,double>(clipStartT,clipEndT));
}
//now go thru and replace the old clips with new
for(unsigned int i=0;i<clipStartEndTimes.size();i++)
{
Track *toClipOutput;
//remove the old audio and get the new
t->Clear(clipStartEndTimes[i].first,clipStartEndTimes[i].second);
// output->Copy(clipStartEndTimes[i].first-startT+offsetT0,clipStartEndTimes[i].second-startT+offsetT0, &toClipOutput);
output->Copy(clipStartEndTimes[i].first-startT,clipStartEndTimes[i].second-startT, &toClipOutput);
if(toClipOutput)
{
//put the processed audio in
bool bResult = t->Paste(clipStartEndTimes[i].first, toClipOutput);
wxASSERT(bResult); // TO DO: Actually handle this.
//if the clip was only partially selected, the Paste will have created a split line. Join is needed to take care of this
//This is not true when the selection is fully contained within one clip (second half of conditional)
if( (clipRealStartEndTimes[i].first != clipStartEndTimes[i].first ||
clipRealStartEndTimes[i].second != clipStartEndTimes[i].second) &&
!(clipRealStartEndTimes[i].first <= startT &&
clipRealStartEndTimes[i].second >= startT+lenT) )
t->Join(clipRealStartEndTimes[i].first,clipRealStartEndTimes[i].second);
delete toClipOutput;
}
}
return true;
}
void EffectEqualization48x::Filter4x(sampleCount len,
float *buffer, float *scratchBuffer)
{
int i;
__m128 real128, imag128;
// Apply FFT
RealFFTf4x(buffer, mEffectEqualization->hFFT);
// Apply filter
// DC component is purely real
__m128 *localFFTBuffer=(__m128 *)scratchBuffer;
__m128 *localBuffer=(__m128 *)buffer;
__m128 filterFuncR, filterFuncI;
filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[0]);
localFFTBuffer[0]=_mm_mul_ps(localBuffer[0], filterFuncR);
int halfLength=(len/2);
bool useBitReverseTable=sMathPath&1;
for(i=1; i<halfLength; i++)
{
if(useBitReverseTable) {
real128=localBuffer[mEffectEqualization->hFFT->BitReversed[i] ];
imag128=localBuffer[mEffectEqualization->hFFT->BitReversed[i]+1];
} else {
int bitReversed=SmallReverseBits(i,mEffectEqualization->hFFT->pow2Bits);
real128=localBuffer[bitReversed];
imag128=localBuffer[bitReversed+1];
}
filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[i]);
filterFuncI=_mm_set1_ps(mEffectEqualization->mFilterFuncI[i]);
localFFTBuffer[2*i ] = _mm_sub_ps( _mm_mul_ps(real128, filterFuncR), _mm_mul_ps(imag128, filterFuncI));
localFFTBuffer[2*i+1] = _mm_add_ps( _mm_mul_ps(real128, filterFuncI), _mm_mul_ps(imag128, filterFuncR));
}
// Fs/2 component is purely real
filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]);
localFFTBuffer[1] = _mm_mul_ps(localBuffer[1], filterFuncR);
// Inverse FFT and normalization
InverseRealFFTf4x(scratchBuffer, mEffectEqualization->hFFT);
ReorderToTime4x(mEffectEqualization->hFFT, scratchBuffer, buffer);
}
#endif

View File

@ -0,0 +1,146 @@
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
/**********************************************************************
Audacity: A Digital Audio Editor
Equalization48x.h
Intrinsics (SSE/AVX) and Threaded Equalization
***********************************************************************/
#ifndef __AUDACITY_EFFECT_EQUALIZATION48X__
#define __AUDACITY_EFFECT_EQUALIZATION48X__
// bitwise function selection
// options are
#define MATH_FUNCTION_ORIGINAL 0 // 0 original path
#define MATH_FUNCTION_BITREVERSE_TABLE 1 // 1 SSE BitReverse Table
#define MATH_FUNCTION_SIN_COS_TABLE 2 // 2 SSE SinCos Table
// 3 SSE with SinCos and BitReverse buffer
#define MATH_FUNCTION_SSE 4 // 4 SSE no SinCos and no BitReverse buffer
#define MATH_FUNCTION_THREADED 8 // 8 SSE threaded no SinCos and no BitReverse buffer
// 9 SSE threaded BitReverse Table
// 10 SSE threaded SinCos Table
// 11 SSE threaded with SinCos and BitReverse buffer
//#define MATH_FUNCTION_AVX 16
// added by Andrew Hallendorff intrinsics processing
enum EQBufferStatus
{
BufferEmpty=0,
BufferReady,
BufferBusy,
BufferDone
};
class BufferInfo {
public:
BufferInfo() { mBufferLength=0; mBufferStatus=BufferEmpty; };
float* mBufferSouce[4];
float* mBufferDest[4];
int mBufferLength;
sampleCount mFftWindowSize;
sampleCount mFftFilterSize;
float* mScratchBuffer;
EQBufferStatus mBufferStatus;
};
typedef struct {
int x64;
int MMX;
int SSE;
int SSE2;
int SSE3;
int SSSE3;
int SSE41;
int SSE42;
int SSE4a;
int AVX;
int XOP;
int FMA3;
int FMA4;
} MathCaps;
class EffectEqualization;
class EffectEqualization48x;
static int EQWorkerCounter=0;
class EQWorker : public wxThread {
public:
EQWorker():wxThread(wxTHREAD_JOINABLE) {
mBufferInfoList=NULL;
mBufferInfoCount=0;
mMutex=NULL;
mEffectEqualization48x=NULL;
mExitLoop=false;
mThreadID=EQWorkerCounter++;
}
void SetData( BufferInfo* bufferInfoList, int bufferInfoCount, wxMutex *mutex, EffectEqualization48x *effectEqualization48x) {
mBufferInfoList=bufferInfoList;
mBufferInfoCount=bufferInfoCount;
mMutex=mutex;
mEffectEqualization48x=effectEqualization48x;
}
void ExitLoop() { // this will cause the thread to drop from the loops
mExitLoop=true;
}
virtual void* Entry();
BufferInfo* mBufferInfoList;
int mBufferInfoCount, mThreadID;
wxMutex *mMutex;
EffectEqualization48x *mEffectEqualization48x;
bool mExitLoop;
};
class EffectEqualization48x {
public:
EffectEqualization48x();
virtual ~EffectEqualization48x();
static MathCaps *GetMathCaps();
static void SetMathPath(int mathPath);
static int GetMathPath();
static void AddMathPathOption(int mathPath);
static void RemoveMathPathOption(int mathPath);
bool Process(EffectEqualization* effectEqualization);
bool Benchmark(EffectEqualization* effectEqualization);
private:
bool TrackCompare();
bool DeltaTrack(WaveTrack * t, WaveTrack * t2, sampleCount start, sampleCount len);
bool AllocateBuffersWorkers(bool threaded);
bool FreeBuffersWorkers();
bool ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, sampleCount bufferLength);
bool ProcessBuffer4x(BufferInfo *bufferInfo);
bool ProcessOne4x(int count, WaveTrack * t, sampleCount start, sampleCount len);
bool ProcessOne4xThreaded(int count, WaveTrack * t, sampleCount start, sampleCount len);
bool ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len);
void Filter4x(sampleCount len, float *buffer, float *scratchBuffer);
EffectEqualization* mEffectEqualization;
int mThreadCount;
sampleCount mFilterSize;
sampleCount mBlockSize;
sampleCount mWindowSize;
int mWorkerDataCount;
int mBlocksPerBuffer;
int mScratchBufferSize;
int mSubBufferSize;
float *mBigBuffer;
BufferInfo* mBufferInfo;
wxMutex mDataMutex;
EQWorker* mEQWorkers;
bool mThreaded;
bool mBenching;
friend EQWorker;
};
#endif
#endif

View File

@ -123,6 +123,17 @@ void EffectsPrefs::PopulateOrExchange(ShuttleGui & S)
}
S.EndStatic();
#endif
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
S.StartStatic(_("Instruction Set"));
{
S.TieCheckBox(_("&Use SSE/SSE2/.../AVX"),
wxT("/SSE/GUI"),
true);
}
S.EndStatic();
#endif
}
bool EffectsPrefs::Apply()

View File

@ -634,6 +634,14 @@
RelativePath="..\..\..\src\RealFFTf.h"
>
</File>
<File
RelativePath="..\..\..\src\RealFFTf48x.cpp"
>
</File>
<File
RelativePath="..\..\..\src\RealFFTf48x.h"
>
</File>
<File
RelativePath="..\..\..\src\Resample.cpp"
>
@ -748,6 +756,14 @@
RelativePath="..\..\..\src\SplashDialog.h"
>
</File>
<File
RelativePath="..\..\..\src\SseMathFuncs.cpp"
>
</File>
<File
RelativePath="..\..\..\src\SseMathFuncs.h"
>
</File>
<File
RelativePath="..\..\..\src\Tags.cpp"
>
@ -996,6 +1012,14 @@
RelativePath="..\..\..\src\effects\Equalization.h"
>
</File>
<File
RelativePath="..\..\..\src\effects\Equalization48x.cpp"
>
</File>
<File
RelativePath="..\..\..\src\effects\Equalization48x.h"
>
</File>
<File
RelativePath="..\..\..\src\effects\Fade.cpp"
>