audacia/src/RealFFTf48x.cpp
Paul Licameli 5e7d41ec07 Each .cpp/.mm file includes corresponding header before any other...
... except Audacity.h

This forces us to make each header contain all forward declarations or nested
headers that it requires, rather than depend on context.
2019-03-17 22:54:52 -04:00

3003 lines
106 KiB
C++

/**********************************************************************
Audacity: A Digital Audio Editor
RealFFT48x.cpp
Philip Van Baren
Andrew Hallendorff (SSE Mods)
*******************************************************************//**
\file RealFFT48x.cpp
\brief Real FFT with SSE acceleration.
*//****************************************************************/
/*
* Program: REALFFTF.C
* Author: Philip Van Baren
* Date: 2 September 1993
*
* Description: These routines perform an FFT on real data to get a conjugate-symmetric
* output, and an inverse FFT on conjugate-symmetric input to get a real
* output sequence.
*
* This code is for floating point data.
*
* Modified 8/19/1998 by Philip Van Baren
* - made the InitializeFFT and EndFFT routines take a structure
* holding the length and pointers to the BitReversed and SinTable
* tables.
* Modified 5/23/2009 by Philip Van Baren
* - Added GetFFT and ReleaseFFT routines to retain common SinTable
* and BitReversed tables so they don't need to be reallocated
* and recomputed on every call.
* - Added Reorder* functions to undo the bit-reversal
* Modified 15 April 2016 Paul Licameli
* - C++11 smart pointers
*
* Copyright (C) 2009 Philip VanBaren
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include "Audacity.h"
#include "RealFFTf48x.h"
#include "Experimental.h"
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
// at the moment these two are mutually exclusive
//#define USE_SSEMATHFUNC
//#define TEST_COSSINBIT_TABLES
#ifndef USE_SSE2
#define USE_SSE2
#endif
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include "RealFFTf.h"
#ifdef __WXMSW__
#pragma warning(disable:4305)
#else
#endif
#include "SseMathFuncs.h"
#include <intrin.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
int tableMask=0;
bool useBitReverseTable=false;
bool useSinCosTable=false;
void TableUsage(int iMask)
{
tableMask=iMask;
useBitReverseTable=(iMask&1)!=0;
useSinCosTable=(iMask&2)!=0;
}
SinCosTable::SinCosTable() :
mSinCosTablePow(13)
{
size_t tableSize=1<<mSinCosTablePow;
mSinCosTable.reinit(tableSize);
for(size_t i=0;i<tableSize;i++) {
mSinCosTable[i].mSin=(float)-sin(((float)i)*M_PI/tableSize);
mSinCosTable[i].mCos=(float)-cos(((float)i)*M_PI/tableSize);
}
};
static SinCosTable sSinCosTable;
static unsigned char sSmallRBTable[256];
class BitReverser {
public:
BitReverser()
{
for(int i=0;i<256;i++) {
sSmallRBTable[i]=0;
for(int maskLow=1, maskHigh=128;maskLow<256;maskLow<<=1,maskHigh>>=1)
if(i&maskLow)
sSmallRBTable[i]|=maskHigh;
}
};
};
static BitReverser sBitReverser;
/* array of functions there prob is a better way to do this */
int SmallVRB0(int bits) { return bits; }; int SmallVRB1(int bits) { return sSmallRBTable[bits]>>7; };
int SmallVRB2(int bits) { return sSmallRBTable[bits]>>6; }; int SmallVRB3(int bits) { return sSmallRBTable[bits]>>5; };
int SmallVRB4(int bits) { return sSmallRBTable[bits]>>4; }; int SmallVRB5(int bits) { return sSmallRBTable[bits]>>3; };
int SmallVRB6(int bits) { return sSmallRBTable[bits]>>2; }; int SmallVRB7(int bits) { return sSmallRBTable[bits]>>1; };
int SmallVRB8(int bits) { return sSmallRBTable[bits]; };
int SmallVRB9(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<1)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>7); };
int SmallVRB10(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<2)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>6); };
int SmallVRB11(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<3)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>5); };
int SmallVRB12(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<4)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>4); };
int SmallVRB13(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<5)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>3); };
int SmallVRB14(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<6)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>2); };
int SmallVRB15(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<7)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>1); };
int SmallVRB16(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<8)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]); };
int (*SmallVRB[])(int bits) = { SmallVRB0, SmallVRB1, SmallVRB2, SmallVRB3, SmallVRB4,
SmallVRB5, SmallVRB6, SmallVRB7, SmallVRB8, SmallVRB9, SmallVRB10,
SmallVRB11, SmallVRB12, SmallVRB13, SmallVRB14,SmallVRB15, SmallVRB16 };
int SmallRB(int bits, int numberBits)
{
// return ( (sSmallRBTable[*((unsigned char *)&bits)]<<16) + (sSmallRBTable[*(((unsigned char *)&bits)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&bits)+2)] ))>>(24-numberBits);
return ( (sSmallRBTable[*((unsigned char *)&bits)]<<8) + (sSmallRBTable[*(((unsigned char *)&bits)+1)]) )>>(16-numberBits);
};
/* wrapper funcitons. If passed -1 function choice will be made locally */
void RealFFTf1x(fft_type *buffer, FFTParam *h, int functionType)
{
switch(functionType) {
case FFT_SinCosTableVBR16:
RealFFTf1xSinCosTableVBR16( buffer, h);
break;
case FFT_SinCosTableBR16:
RealFFTf1xSinCosTableBR16( buffer, h);
break;
case FFT_FastMathBR16:
RealFFTf1xFastMathBR16( buffer, h);
break;
case FFT_FastMathBR24:
RealFFTf1xFastMathBR24( buffer, h);
break;
case FFT_SinCosBRTable:
default:
RealFFTf1xSinCosBRTable( buffer, h);
};
}
void InverseRealFFTf1x(fft_type *buffer, FFTParam *h, int functionType)
{
switch(functionType) {
case FFT_SinCosTableVBR16:
InverseRealFFTf1xSinCosTableVBR16( buffer, h);
break;
case FFT_SinCosTableBR16:
InverseRealFFTf1xSinCosTableBR16( buffer, h);
break;
case FFT_FastMathBR16:
InverseRealFFTf1xFastMathBR16( buffer, h);
break;
case FFT_FastMathBR24:
InverseRealFFTf1xFastMathBR24( buffer, h);
break;
case FFT_SinCosBRTable:
default:
InverseRealFFTf1xSinCosBRTable( buffer, h);
};
}
void ReorderToTime1x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType)
{
switch(functionType) {
case FFT_SinCosTableVBR16:
ReorderToTime1xSinCosTableVBR16( hFFT, buffer, TimeOut);
break;
case FFT_SinCosTableBR16:
ReorderToTime1xSinCosTableBR16( hFFT, buffer, TimeOut);
break;
case FFT_FastMathBR16:
ReorderToTime1xFastMathBR16( hFFT, buffer, TimeOut);
break;
case FFT_FastMathBR24:
ReorderToTime1xFastMathBR24( hFFT, buffer, TimeOut);
break;
case FFT_SinCosBRTable:
default:
ReorderToTime1xSinCosBRTable( hFFT, buffer, TimeOut);
};
}
void ReorderToFreq1x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType)
{
switch(functionType) {
case FFT_SinCosTableVBR16:
ReorderToFreq1xSinCosTableVBR16(hFFT, buffer, RealOut, ImagOut);
break;
case FFT_SinCosTableBR16:
ReorderToFreq1xSinCosTableBR16(hFFT, buffer, RealOut, ImagOut);
break;
case FFT_FastMathBR16:
ReorderToFreq1xFastMathBR16(hFFT, buffer, RealOut, ImagOut);
break;
case FFT_FastMathBR24:
ReorderToFreq1xFastMathBR24(hFFT, buffer, RealOut, ImagOut);
break;
case FFT_SinCosBRTable:
default:
ReorderToFreq1xSinCosBRTable(hFFT, buffer, RealOut, ImagOut);
};
}
void RealFFTf4x( fft_type *buffer, FFTParam *h, int functionType)
{
switch(functionType) {
case FFT_SinCosTableVBR16:
RealFFTf4xSinCosTableVBR16( buffer, h);
break;
case FFT_SinCosTableBR16:
RealFFTf4xSinCosTableBR16( buffer, h);
break;
case FFT_FastMathBR16:
RealFFTf4xFastMathBR16( buffer, h);
break;
case FFT_FastMathBR24:
RealFFTf4xFastMathBR24( buffer, h);
break;
case FFT_SinCosBRTable:
default:
RealFFTf4xSinCosBRTable( buffer, h);
};
}
void InverseRealFFTf4x( fft_type *buffer, FFTParam *h, int functionType)
{
switch(functionType) {
case FFT_SinCosTableVBR16:
InverseRealFFTf4xSinCosTableVBR16( buffer, h);
break;
case FFT_SinCosTableBR16:
InverseRealFFTf4xSinCosTableBR16( buffer, h);
break;
case FFT_FastMathBR16:
InverseRealFFTf4xFastMathBR16( buffer, h);
break;
case FFT_FastMathBR24:
InverseRealFFTf4xFastMathBR24( buffer, h);
break;
case FFT_SinCosBRTable:
default:
InverseRealFFTf4xSinCosBRTable( buffer, h);
};
}
void ReorderToTime4x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType)
{
switch(functionType) {
case FFT_SinCosTableVBR16:
ReorderToTime4xSinCosTableVBR16( hFFT, buffer, TimeOut);
break;
case FFT_SinCosTableBR16:
ReorderToTime4xSinCosTableBR16( hFFT, buffer, TimeOut);
break;
case FFT_FastMathBR16:
ReorderToTime4xFastMathBR16( hFFT, buffer, TimeOut);
break;
case FFT_FastMathBR24:
ReorderToTime4xFastMathBR24( hFFT, buffer, TimeOut);
break;
case FFT_SinCosBRTable:
default:
ReorderToTime4xSinCosBRTable( hFFT, buffer, TimeOut);
};
}
void ReorderToFreq4x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType)
{
switch(functionType) {
case FFT_SinCosTableVBR16:
ReorderToFreq4xSinCosTableVBR16(hFFT, buffer, RealOut, ImagOut);
break;
case FFT_SinCosTableBR16:
ReorderToFreq4xSinCosTableBR16(hFFT, buffer, RealOut, ImagOut);
break;
case FFT_FastMathBR16:
ReorderToFreq4xFastMathBR16(hFFT, buffer, RealOut, ImagOut);
break;
case FFT_FastMathBR24:
ReorderToFreq4xFastMathBR24(hFFT, buffer, RealOut, ImagOut);
break;
case FFT_SinCosBRTable:
default:
ReorderToFreq4xSinCosBRTable(hFFT, buffer, RealOut, ImagOut);
};
}
#define REAL_SINCOSBRTABLE
#ifdef REAL_SINCOSBRTABLE
/*
* Forward FFT routine. Must call GetFFT(fftlen) first!
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
* Imag_i = buffer[ h->BitReversed[i]+1 ] )
* Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void RealFFTf1xSinCosBRTable(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *sptr;
fft_type *endptr1,*endptr2;
int *br1,*br2;
fft_type HRplus,HRminus,HIplus,HIminus;
fft_type v1,v2,sin,cos;
auto ButterfliesPerGroup = h->Points / 2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
sptr = h->SinTable.get();
while(A < endptr1)
{
sin = *sptr;
cos = *(sptr + 1);
endptr2 = B;
while(A < endptr2)
{
v1 = *B * cos + *(B+1) * sin;
v2 = *B * sin - *(B+1) * cos;
*B = (*A + v1);
*(A++) = *(B++) - 2 * v1;
*B = (*A - v2);
*(A++) = *(B++) + 2 * v2;
}
A = B;
B += ButterfliesPerGroup * 2;
sptr += 2;
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1 = h->BitReversed.get() + 1;
br2 = h->BitReversed.get() + h->Points - 1;
while(br1 < br2)
{
sin=h->SinTable[*br1];
cos=h->SinTable[*br1+1];
A=buffer+*br1;
B=buffer+*br2;
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus - cos*HIplus);
v2 = (cos*HRminus + sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus + v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
br1++;
br2--;
}
/* Handle the center bin (just need a conjugate) */
A=buffer+*br1+1;
*A=-*A;
/* Handle DC bin separately - and ignore the Fs/2 bin
buffer[0]+=buffer[1];
buffer[1]=(fft_type)0;*/
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=buffer[0]-buffer[1];
buffer[0]+=buffer[1];
buffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf1xSinCosBRTable(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *sptr;
fft_type *endptr1,*endptr2;
int *br1;
fft_type HRplus,HRminus,HIplus,HIminus;
fft_type v1,v2,sin,cos;
auto ButterfliesPerGroup = h->Points / 2;
/* Massage input to get the input for a real output sequence. */
A = buffer + 2;
B = buffer + h->Points * 2 - 2;
br1 = h->BitReversed.get() + 1;
while(A < B)
{
sin=h->SinTable[*br1];
cos=h->SinTable[*br1+1];
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus + cos*HIplus);
v2 = (cos*HRminus - sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus - v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
A+=2;
B-=2;
br1++;
}
/* Handle center bin (just need conjugate) */
*(A+1)=-*(A+1);
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=0.5f*(buffer[0]+buffer[1]);
v2=0.5f*(buffer[0]-buffer[1]);
buffer[0]=v1;
buffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
sptr = h->SinTable.get();
while(A < endptr1)
{
sin = *(sptr++);
cos = *(sptr++);
endptr2 = B;
while(A < endptr2)
{
v1 = *B * cos - *(B + 1) * sin;
v2 = *B * sin + *(B + 1) * cos;
*B = (*A + v1) * (fft_type)0.5;
*(A++) = *(B++) - v1;
*B = (*A + v2) * (fft_type)0.5;
*(A++) = *(B++) - v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
RealOut[i]=buffer[hFFT->BitReversed[i] ];
ImagOut[i]=buffer[hFFT->BitReversed[i]+1];
}
RealOut[0] = buffer[0]; // DC component
ImagOut[0] = 0;
RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
ImagOut[hFFT->Points] = 0;
}
void ReorderToTime1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
TimeOut[i*2 ]=buffer[hFFT->BitReversed[i] ];
TimeOut[i*2+1]=buffer[hFFT->BitReversed[i]+1];
}
}
// 4x processing simd
void RealFFTf4xSinCosBRTable(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A,*B;
fft_type *sptr;
__m128 *endptr1,*endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
__m128 HRplus,HRminus,HIplus,HIminus;
__m128 v1,v2,sin,cos;
auto ButterfliesPerGroup = h->Points / 2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = &localBuffer[h->Points * 2];
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = &localBuffer[ButterfliesPerGroup * 2];
sptr = h->SinTable.get();
while(A < endptr1)
{
sin = _mm_set1_ps(*(sptr++));
cos = _mm_set1_ps(*(sptr++));
endptr2 = B;
while(A < endptr2)
{
v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B = _mm_add_ps( *A, v1);
__m128 temp128 = _mm_set1_ps( 2.0);
*(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
*B = _mm_sub_ps(*A,v2);
*(A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1; // h->BitReversed + 1;
br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
while(br1Index<br2Index)
{
br1Value=h->BitReversed[br1Index];
br2Value=h->BitReversed[br2Index];
sin=_mm_set1_ps(h->SinTable[br1Value]);
cos=_mm_set1_ps(h->SinTable[br1Value+1]);
A=&localBuffer[br1Value];
B=&localBuffer[br2Value];
__m128 temp128 = _mm_set1_ps( 2.0);
HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
temp128 = _mm_set1_ps( 0.5);
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
A=&localBuffer[h->BitReversed[br1Index]+1];
// negate sse style
*A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
localBuffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf4xSinCosBRTable(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A,*B;
fft_type *sptr;
__m128 *endptr1,*endptr2;
int br1Index, br1Value;
__m128 HRplus,HRminus,HIplus,HIminus;
__m128 v1,v2,sin,cos;
auto ButterfliesPerGroup = h->Points / 2;
/* Massage input to get the input for a real output sequence. */
A = localBuffer + 2;
B = localBuffer + h->Points * 2 - 2;
br1Index = 1; //h->BitReversed + 1;
while(A < B)
{
br1Value = h->BitReversed[br1Index];
sin = _mm_set1_ps(h->SinTable[br1Value]);
cos = _mm_set1_ps(h->SinTable[br1Value + 1]);
HRminus = _mm_sub_ps(*A, *B);
HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
HIminus = _mm_sub_ps( *(A+1), *(B+1));
HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
// negate sse style
*(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
localBuffer[0]=v1;
localBuffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = localBuffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = localBuffer + ButterfliesPerGroup * 2;
sptr = h->SinTable.get();
while(A < endptr1)
{
sin = _mm_set1_ps(*(sptr++));
cos = _mm_set1_ps(*(sptr++));
endptr2 = B;
while(A < endptr2)
{
v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B + 1), sin));
v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B + 1), cos));
*B = _mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
*(A++) = _mm_sub_ps(*(B++), v1);
*B = _mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
*(A++) = _mm_sub_ps(*(B++), v2);
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *localRealOut=(__m128 *)RealOut;
__m128 *localImagOut=(__m128 *)ImagOut;
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
brValue=hFFT->BitReversed[i];
localRealOut[i]=localBuffer[brValue ];
localImagOut[i]=localBuffer[brValue+1];
}
localRealOut[0] = localBuffer[0]; // DC component
localImagOut[0] = _mm_set1_ps(0.0);
localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
}
void ReorderToTime4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *localTimeOut=(__m128 *)TimeOut;
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
int brValue;
brValue = hFFT->BitReversed[i];
localTimeOut[i*2 ] = localBuffer[brValue ];
localTimeOut[i*2+1] = localBuffer[brValue+1];
}
}
#endif
#define REAL_SINCOSTABLE_VBR16
#ifdef REAL_SINCOSTABLE_VBR16
/*
* Forward FFT routine. Must call GetFFT(fftlen) first!
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
* Imag_i = buffer[ h->BitReversed[i]+1 ] )
* Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void RealFFTf1xSinCosTableVBR16(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *endptr1,*endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
fft_type HRplus,HRminus,HIplus,HIminus;
fft_type v1,v2,sin,cos;
auto ButterfliesPerGroup = h->Points / 2;
int pow2BitsMinus1 = h->pow2Bits - 1;
int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
int iSinCosIndex = 0;
while(A < endptr1)
{
int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
iSinCosIndex++;
endptr2 = B;
while(A < endptr2)
{
v1 = *B*cos + *(B+1)*sin;
v2 = *B*sin - *(B+1)*cos;
*B = (*A+v1);
*(A++) = *(B++) - 2 * v1;
*B = (*A - v2);
*(A++) = *(B++) + 2 * v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1;
br2Index = h->Points - 1;
while(br1Index < br2Index)
{
br1Value=(*SmallVRB[h->pow2Bits])(br1Index);
br2Value=(*SmallVRB[h->pow2Bits])(br2Index);
int sinCosIndex=br1Index<<sinCosShift;
sin=sSinCosTable.mSinCosTable[sinCosIndex].mSin;
cos=sSinCosTable.mSinCosTable[sinCosIndex].mCos;
A=&buffer[br1Value];
B=&buffer[br2Value];
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus - cos*HIplus);
v2 = (cos*HRminus + sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus + v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
/* Handle DC bin separately - and ignore the Fs/2 bin
buffer[0]+=buffer[1];
buffer[1]=(fft_type)0;*/
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=buffer[0]-buffer[1];
buffer[0]+=buffer[1];
buffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf1xSinCosTableVBR16(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *endptr1,*endptr2;
int br1Index, br1Value;
int *br1;
fft_type HRplus,HRminus,HIplus,HIminus;
fft_type v1,v2,sin,cos;
auto ButterfliesPerGroup = h->Points / 2;
int pow2BitsMinus1 = h->pow2Bits - 1;
int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
/* Massage input to get the input for a real output sequence. */
A = buffer + 2;
B = buffer + h->Points * 2 - 2;
br1 = h->BitReversed.get() + 1;
br1Index = 1; //h->BitReversed + 1;
while(A < B)
{
br1Value = (*SmallVRB[h->pow2Bits])(br1Index);
int sinCosIndex = br1Index << sinCosShift;
sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus + cos*HIplus);
v2 = (cos*HRminus - sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus - v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
*(A+1)=-*(A+1);
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=0.5f*(buffer[0]+buffer[1]);
v2=0.5f*(buffer[0]-buffer[1]);
buffer[0]=v1;
buffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
int iSinCosIndex = 0;
while(A < endptr1)
{
int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
iSinCosIndex++;
endptr2 = B;
while(A < endptr2)
{
v1 = *B * cos - *(B + 1) * sin;
v2 = *B * sin + *(B + 1) * cos;
*B = (*A + v1) * (fft_type)0.5;
*(A++) = *(B++) - v1;
*B = (*A + v2) * (fft_type)0.5;
*(A++) = *(B++) - v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToTime1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
// Copy the data into the real outputs
for(size_t i = 0;i < hFFT->Points; i++) {
int brValue;
brValue=(*SmallVRB[hFFT->pow2Bits])(i);
TimeOut[i*2 ] = buffer[brValue ];
TimeOut[i*2+1] = buffer[brValue+1];
}
}
void ReorderToFreq1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
brValue = (*SmallVRB[hFFT->pow2Bits])(i);
RealOut[i] = buffer[brValue ];
ImagOut[i] = buffer[brValue+1];
}
RealOut[0] = buffer[0]; // DC component
ImagOut[0] = 0;
RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
ImagOut[hFFT->Points] = 0;
}
// 4x processing simd
void RealFFTf4xSinCosTableVBR16(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A,*B;
__m128 *endptr1,*endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
__m128 HRplus,HRminus,HIplus,HIminus;
__m128 v1,v2,sin,cos;
auto ButterfliesPerGroup = h->Points / 2;
int pow2BitsMinus1 = h->pow2Bits - 1;
int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = &localBuffer[h->Points * 2];
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = &localBuffer[ButterfliesPerGroup * 2];
int iSinCosIndex = 0;
while(A < endptr1)
{
int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
iSinCosIndex++;
endptr2 = B;
while(A < endptr2)
{
v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B = _mm_add_ps( *A, v1);
__m128 temp128 = _mm_set1_ps( 2.0);
*(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
*B = _mm_sub_ps(*A,v2);
*(A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1; // h->BitReversed + 1;
br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
while(br1Index < br2Index)
{
br1Value=(*SmallVRB[h->pow2Bits])(br1Index);
br2Value=(*SmallVRB[h->pow2Bits])(br2Index);
int sinCosIndex=br1Index<<sinCosShift;
sin=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
cos=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
A=&localBuffer[br1Value];
B=&localBuffer[br2Value];
__m128 temp128 = _mm_set1_ps( 2.0);
HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
temp128 = _mm_set1_ps( 0.5);
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
// negate sse style
*A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
localBuffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf4xSinCosTableVBR16(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A, *B;
__m128 *endptr1, *endptr2;
int br1Index, br1Value;
__m128 HRplus, HRminus, HIplus, HIminus;
__m128 v1, v2, sin, cos;
auto ButterfliesPerGroup = h->Points / 2;
int pow2BitsMinus1 = h->pow2Bits - 1;
int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
/* Massage input to get the input for a real output sequence. */
A = localBuffer + 2;
B = localBuffer + h->Points * 2 - 2;
br1Index = 1; //h->BitReversed + 1;
while(A < B)
{
br1Value = (*SmallVRB[h->pow2Bits])(br1Index);
int sinCosIndex = br1Index << sinCosShift;
sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
HRminus = _mm_sub_ps(*A, *B);
HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
HIminus = _mm_sub_ps( *(A+1), *(B+1));
HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
A = &A[2];
B = &B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
// negate sse style
*(A+1) = _mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
v2 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
localBuffer[0] = v1;
localBuffer[1] = v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = localBuffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = localBuffer + ButterfliesPerGroup * 2;
int iSinCosIndex = 0;
while(A < endptr1)
{
int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
iSinCosIndex++;
endptr2 = B;
while(A < endptr2)
{
v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B = _mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
*(A++) = _mm_sub_ps(*(B++), v1);
*B = _mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
*(A++) = _mm_sub_ps(*(B++),v2);
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToTime4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
__m128 *localBuffer = (__m128 *)buffer;
__m128 *localTimeOut = (__m128 *)TimeOut;
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
int brValue;
brValue = (*SmallVRB[hFFT->pow2Bits])(i);
localTimeOut[i*2 ] = localBuffer[brValue ];
localTimeOut[i*2+1] = localBuffer[brValue+1];
}
}
void ReorderToFreq4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
__m128 *localBuffer = (__m128 *)buffer;
__m128 *localRealOut = (__m128 *)RealOut;
__m128 *localImagOut = (__m128 *)ImagOut;
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
brValue = (*SmallVRB[hFFT->pow2Bits])(i);
localRealOut[i] = localBuffer[brValue ];
localImagOut[i] = localBuffer[brValue+1];
}
localRealOut[0] = localBuffer[0]; // DC component
localImagOut[0] = _mm_set1_ps(0.0);
localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
}
#endif
#define REAL_SINCOSTABLE_BR16
#ifdef REAL_SINCOSTABLE_BR16
/*
* Forward FFT routine. Must call GetFFT(fftlen) first!
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
* Imag_i = buffer[ h->BitReversed[i]+1 ] )
* Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void RealFFTf1xSinCosTableBR16(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *endptr1, *endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
fft_type HRplus, HRminus, HIplus, HIminus;
fft_type v1, v2, sin, cos;
auto ButterfliesPerGroup = h->Points / 2;
int pow2BitsMinus1 = h->pow2Bits - 1;
int bitReverseShiftM1 = 17 - h->pow2Bits;
int bitReverseShift = bitReverseShiftM1 - 1;
int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
int iSinCosIndex = 0;
while(A < endptr1)
{
// int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
int sinCosLookup = ( ((sSmallRBTable[*((unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
iSinCosIndex++;
endptr2 = B;
while(A < endptr2)
{
v1=*B*cos + *(B+1)*sin;
v2=*B*sin - *(B+1)*cos;
*B=(*A+v1);
*(A++)=*(B++)-2*v1;
*B=(*A-v2);
*(A++)=*(B++)+2*v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1;
br2Index = h->Points - 1;
while(br1Index < br2Index)
{
br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
int sinCosIndex = br1Index << sinCosShift;
sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
A = &buffer[br1Value];
B = &buffer[br2Value];
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus - cos*HIplus);
v2 = (cos*HRminus + sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus + v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
// A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
/* Handle DC bin separately - and ignore the Fs/2 bin
buffer[0]+=buffer[1];
buffer[1]=(fft_type)0;*/
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=buffer[0]-buffer[1];
buffer[0]+=buffer[1];
buffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf1xSinCosTableBR16(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *endptr1,*endptr2;
int br1Index;
fft_type HRplus, HRminus, HIplus, HIminus;
fft_type v1, v2, sin, cos;
auto ButterfliesPerGroup = h->Points / 2;
int pow2BitsMinus1 = h->pow2Bits - 1;
int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1);
int bitReverseShiftM1 = 17 - h->pow2Bits;
/* Massage input to get the input for a real output sequence. */
A = buffer + 2;
B = buffer + h->Points * 2 - 2;
br1Index = 1; //h->BitReversed + 1;
while(A < B)
{
int sinCosIndex = br1Index << sinCosShift;
sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus + cos*HIplus);
v2 = (cos*HRminus - sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus - v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
*(A+1)=-*(A+1);
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=0.5f*(buffer[0]+buffer[1]);
v2=0.5f*(buffer[0]-buffer[1]);
buffer[0]=v1;
buffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
int iSinCosIndex = 0;
while(A < endptr1)
{
// int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
int sinCosLookup=( ((sSmallRBTable[*((unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
sin=sSinCosTable.mSinCosTable[sinCosLookup].mSin;
cos=sSinCosTable.mSinCosTable[sinCosLookup].mCos;
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1=*B*cos - *(B+1)*sin;
v2=*B*sin + *(B+1)*cos;
*B=(*A+v1)*(fft_type)0.5;
*(A++)=*(B++)-v1;
*B=(*A+v2)*(fft_type)0.5;
*(A++)=*(B++)-v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
int bitReverseShift=16-hFFT->pow2Bits;
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
brValue = ( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
RealOut[i] = buffer[brValue ];
ImagOut[i] = buffer[brValue+1];
}
RealOut[0] = buffer[0]; // DC component
ImagOut[0] = 0;
RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
ImagOut[hFFT->Points] = 0;
}
void ReorderToTime1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
int bitReverseShift=16-hFFT->pow2Bits;
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
int brValue;
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
TimeOut[i*2 ] = buffer[brValue ];
TimeOut[i*2+1] = buffer[brValue+1];
}
}
// 4x processing simd
void RealFFTf4xSinCosTableBR16(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A, *B;
__m128 *endptr1, *endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
__m128 HRplus, HRminus, HIplus, HIminus;
__m128 v1, v2, sin, cos;
auto ButterfliesPerGroup = h->Points / 2;
int pow2BitsMinus1 = h->pow2Bits - 1;
int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1);
int bitReverseShiftM1 = 17 - h->pow2Bits;
int bitReverseShift = bitReverseShiftM1 - 1;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = &localBuffer[h->Points * 2];
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = &localBuffer[ButterfliesPerGroup * 2];
int iSinCosIndex = 0;
while(A < endptr1)
{
// int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
int sinCosLookup=( ((sSmallRBTable[*((unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
sin=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
cos=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B=_mm_add_ps( *A, v1);
__m128 temp128 = _mm_set1_ps( 2.0);
*(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
*B=_mm_sub_ps(*A,v2);
*(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1; // h->BitReversed + 1;
br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
while(br1Index < br2Index)
{
br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
int sinCosIndex=br1Index<<sinCosShift;
sin=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
cos=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
A=&localBuffer[br1Value];
B=&localBuffer[br2Value];
__m128 temp128 = _mm_set1_ps( 2.0);
HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
temp128 = _mm_set1_ps( 0.5);
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
// A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
// negate sse style
*A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
localBuffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf4xSinCosTableBR16(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A, *B;
__m128 *endptr1, *endptr2;
int br1Index;
__m128 HRplus, HRminus, HIplus, HIminus;
__m128 v1, v2, sin, cos;
auto ButterfliesPerGroup = h->Points / 2;
int pow2BitsMinus1 = h->pow2Bits - 1;
int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1);
int bitReverseShiftM1 = 17 - h->pow2Bits;
/* Massage input to get the input for a real output sequence. */
A = localBuffer + 2;
B = localBuffer + h->Points * 2 - 2;
br1Index = 1; //h->BitReversed + 1;
while(A < B)
{
int sinCosIndex = br1Index << sinCosShift;
sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
HRminus = _mm_sub_ps(*A, *B);
HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
HIminus = _mm_sub_ps( *(A+1), *(B+1));
HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
// negate sse style
*(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
localBuffer[0] = v1;
localBuffer[1] = v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = localBuffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = localBuffer + ButterfliesPerGroup * 2;
int iSinCosIndex = 0;
while(A < endptr1)
{
// int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
int sinCosLookup=( ((sSmallRBTable[*((unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
sin=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
cos=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
*(A++)=_mm_sub_ps(*(B++), v1);
*B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
*(A++)=_mm_sub_ps(*(B++),v2);
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToTime4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
__m128 *localBuffer = (__m128 *)buffer;
__m128 *localTimeOut = (__m128 *)TimeOut;
int bitReverseShift = 16 - hFFT->pow2Bits;
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
int brValue;
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
localTimeOut[i*2 ] = localBuffer[brValue ];
localTimeOut[i*2+1] = localBuffer[brValue+1];
}
}
void ReorderToFreq4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
__m128 *localBuffer = (__m128 *)buffer;
__m128 *localRealOut = (__m128 *)RealOut;
__m128 *localImagOut = (__m128 *)ImagOut;
int bitReverseShift = 16 - hFFT->pow2Bits;
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
localRealOut[i] = localBuffer[brValue ];
localImagOut[i] = localBuffer[brValue+1];
}
localRealOut[0] = localBuffer[0]; // DC component
localImagOut[0] = _mm_set1_ps(0.0);
localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
}
#endif
#define FAST_MATH_BR24
#ifdef FAST_MATH_BR24
/*
* Forward FFT routine. Must call GetFFT(fftlen) first!
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
* Imag_i = buffer[ h->BitReversed[i]+1 ] )
* Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void RealFFTf1xFastMathBR24(fft_type *buffer, FFTParam *h)
{
fft_type *A, *B;
fft_type *endptr1, *endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
fft_type HRplus, HRminus, HIplus, HIminus;
fft_type v1, v2, sin, cos;
fft_type iToRad = 2 * M_PI/(2 * h->Points);
int bitReverseShift = 24 - h->pow2Bits;
int bitReverseShiftM1 = bitReverseShift + 1;
auto ButterfliesPerGroup = h->Points / 2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = buffer + h->Points * 2;
const v4sf zeroes = {0.0,0.0,0.0,0.0};
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
int sinCosCalIndex = 0;
int iSinCosIndex = 0;
while(A < endptr1)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
//v4sf vx=zeroes; // <-- If we want to suppress the C4701 warning later.
v4sf vx;
for(int i=0;i<4;i++) {
int brTemp=iSinCosIndex+i;
vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
}
//"Warning C4701 potentially uninitialized local variable 'vx' " is OK.
//vx is initilased component by component, and MSVC doesn't realize.
sincos_ps(vx, &sin4_2, &cos4_2);
sin=-sin4_2.m128_f32[0];
cos=-cos4_2.m128_f32[0];
sinCosCalIndex++;
} else {
sin=-sin4_2.m128_f32[sinCosCalIndex];
cos=-cos4_2.m128_f32[sinCosCalIndex];
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1=*B*cos + *(B+1)*sin;
v2=*B*sin - *(B+1)*cos;
*B=(*A+v1);
*(A++)=*(B++)-2*v1;
*B=(*A-v2);
*(A++)=*(B++)+2*v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1; // h->BitReversed + 1;
br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
int sinCosCalIndex = 0;
while(br1Index < br2Index)
{
v4sf sin4_2, cos4_2;
br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br2Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(vx, &sin4_2, &cos4_2);
sin=-sin4_2.m128_f32[0];
cos=-cos4_2.m128_f32[0];
sinCosCalIndex++;
} else {
sin=-sin4_2.m128_f32[sinCosCalIndex];
cos=-cos4_2.m128_f32[sinCosCalIndex];
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
A=&buffer[br1Value];
B=&buffer[br2Value];
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus - cos*HIplus);
v2 = (cos*HRminus + sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus + v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
// A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift)+1];
/* Handle DC bin separately - and ignore the Fs/2 bin
buffer[0]+=buffer[1];
buffer[1]=(fft_type)0;*/
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=buffer[0]-buffer[1];
buffer[0]+=buffer[1];
buffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf1xFastMathBR24(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *endptr1,*endptr2;
int br1Index;
fft_type HRplus, HRminus, HIplus, HIminus;
fft_type v1, v2, sin, cos;
fft_type iToRad = 2 * M_PI / (2 * h->Points);
int bitReverseShiftM1 = 25 - h->pow2Bits;
auto ButterfliesPerGroup = h->Points / 2;
/* Massage input to get the input for a real output sequence. */
A = buffer + 2;
B = buffer + h->Points * 2 - 2;
br1Index = 1; //h->BitReversed + 1;
int sinCosCalIndex = 0;
while(A < B)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(vx, &sin4_2, &cos4_2);
sin=-sin4_2.m128_f32[0];
cos=-cos4_2.m128_f32[0];
sinCosCalIndex++;
} else {
sin=-sin4_2.m128_f32[sinCosCalIndex];
cos=-cos4_2.m128_f32[sinCosCalIndex];
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus + cos*HIplus);
v2 = (cos*HRminus - sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus - v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
*(A+1)=-*(A+1);
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=0.5f*(buffer[0]+buffer[1]);
v2=0.5f*(buffer[0]-buffer[1]);
buffer[0]=v1;
buffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
int sinCosCalIndex = 0;
int iSinCosIndex = 0;
while(A < endptr1)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++) {
int brTemp=iSinCosIndex+i;
vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
}
sincos_ps(vx, &sin4_2, &cos4_2);
sin=-sin4_2.m128_f32[0];
cos=-cos4_2.m128_f32[0];
sinCosCalIndex++;
} else {
sin=-sin4_2.m128_f32[sinCosCalIndex];
cos=-cos4_2.m128_f32[sinCosCalIndex];
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1=*B*cos - *(B+1)*sin;
v2=*B*sin + *(B+1)*cos;
*B=(*A+v1)*(fft_type)0.5;
*(A++)=*(B++)-v1;
*B=(*A+v2)*(fft_type)0.5;
*(A++)=*(B++)-v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
int bitReverseShift = 24 - hFFT->pow2Bits;
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift);
RealOut[i] = buffer[brValue ];
ImagOut[i] = buffer[brValue+1];
}
RealOut[0] = buffer[0]; // DC component
ImagOut[0] = 0;
RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
ImagOut[hFFT->Points] = 0;
}
void ReorderToTime1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
int bitReverseShift = 24 - hFFT->pow2Bits;
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
int brValue;
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift);
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
TimeOut[i*2 ] = buffer[brValue ];
TimeOut[i*2+1] = buffer[brValue+1];
}
}
void RealFFTf4xFastMathBR24(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A,*B;
__m128 *endptr1,*endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
__m128 HRplus,HRminus,HIplus,HIminus;
__m128 v1,v2,sin,cos;
fft_type iToRad = 2 * M_PI/(2 * h->Points);
auto ButterfliesPerGroup = h->Points / 2;
int bitReverseShift = 24 - h->pow2Bits;
int bitReverseShiftM1 = bitReverseShift + 1;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = &localBuffer[h->Points * 2];
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = &localBuffer[ButterfliesPerGroup * 2];
int sinCosCalIndex = 0;
int iSinCosIndex = 0;
while(A < endptr1)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++) {
int brTemp=iSinCosIndex+i;
vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
}
sincos_ps(vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
sinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B=_mm_add_ps( *A, v1);
__m128 temp128 = _mm_set1_ps( 2.0);
*(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
*B=_mm_sub_ps(*A,v2);
*(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1; // h->BitReversed+1;
br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
int sinCosCalIndex = 0;
while(br1Index < br2Index)
{
v4sf sin4_2, cos4_2;
br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br2Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
sinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
A=&localBuffer[br1Value];
B=&localBuffer[br2Value];
__m128 temp128 = _mm_set1_ps( 2.0);
HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
temp128 = _mm_set1_ps( 0.5);
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
// A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift)+1];
// negate sse style
*A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
localBuffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf4xFastMathBR24(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A,*B;
__m128 *endptr1,*endptr2;
int br1Index;
__m128 HRplus,HRminus,HIplus,HIminus;
__m128 v1,v2,sin,cos;
fft_type iToRad = 2 * M_PI/(2 * h->Points);
int bitReverseShiftM1 = 25 - h->pow2Bits;
auto ButterfliesPerGroup = h->Points / 2;
/* Massage input to get the input for a real output sequence. */
A = localBuffer + 2;
B = localBuffer + h->Points * 2 - 2;
br1Index = 1; //h->BitReversed + 1;
int sinCosCalIndex = 0;
while(A < B)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
sinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
HRminus = _mm_sub_ps(*A, *B);
HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
HIminus = _mm_sub_ps( *(A+1), *(B+1));
HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
// negate sse style
*(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
localBuffer[0]=v1;
localBuffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = localBuffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = localBuffer + ButterfliesPerGroup * 2;
int sinCosCalIndex = 0;
int iSinCosIndex = 0;
while(A < endptr1)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++) {
int brTemp=iSinCosIndex+i;
vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
}
sincos_ps(vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
sinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
*(A++)=_mm_sub_ps(*(B++), v1);
*B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
*(A++)=_mm_sub_ps(*(B++),v2);
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
__m128 *localBuffer = (__m128 *)buffer;
__m128 *localRealOut = (__m128 *)RealOut;
__m128 *localImagOut = (__m128 *)ImagOut;
int bitReverseShift = 24-hFFT->pow2Bits;
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift);
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
localRealOut[i]=localBuffer[brValue ];
localImagOut[i]=localBuffer[brValue+1];
}
localRealOut[0] = localBuffer[0]; // DC component
localImagOut[0] = _mm_set1_ps(0.0);
localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
}
void ReorderToTime4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
__m128 *localBuffer = (__m128 *)buffer;
__m128 *localTimeOut = (__m128 *)TimeOut;
int bitReverseShift = 24-hFFT->pow2Bits;
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
int brValue;
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift);
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
localTimeOut[i*2 ] = localBuffer[brValue ];
localTimeOut[i*2+1] = localBuffer[brValue+1];
}
}
#endif
#define FAST_MATH_BR16
#ifdef FAST_MATH_BR16
/*
* Forward FFT routine. Must call GetFFT(fftlen) first!
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
* Imag_i = buffer[ h->BitReversed[i]+1 ] )
* Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void RealFFTf1xFastMathBR16(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *endptr1,*endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
fft_type HRplus,HRminus,HIplus,HIminus;
fft_type v1,v2,sin,cos;
fft_type iToRad = 2 * M_PI / (2 * h->Points);
int bitReverseShiftM1 = 17 - h->pow2Bits;
int bitReverseShift = bitReverseShiftM1 - 1;
auto ButterfliesPerGroup = h->Points / 2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
int sinCosCalIndex = 0;
int iSinCosIndex = 0;
while(A < endptr1)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++) {
int brTemp=iSinCosIndex+i;
vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
}
sincos_ps(vx, &sin4_2, &cos4_2);
sin=-sin4_2.m128_f32[0];
cos=-cos4_2.m128_f32[0];
sinCosCalIndex++;
} else {
sin=-sin4_2.m128_f32[sinCosCalIndex];
cos=-cos4_2.m128_f32[sinCosCalIndex];
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1=*B*cos + *(B+1)*sin;
v2=*B*sin - *(B+1)*cos;
*B=(*A+v1);
*(A++)=*(B++)-2*v1;
*B=(*A-v2);
*(A++)=*(B++)+2*v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1; // h->BitReversed+1;
br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
int sinCosCalIndex = 0;
while(br1Index < br2Index)
{
v4sf sin4_2, cos4_2;
br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
if(!sinCosCalIndex)
{
v4sf vx;
for(int i = 0; i < 4; i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(vx, &sin4_2, &cos4_2);
sin = -sin4_2.m128_f32[0];
cos=-cos4_2.m128_f32[0];
sinCosCalIndex++;
} else {
sin=-sin4_2.m128_f32[sinCosCalIndex];
cos=-cos4_2.m128_f32[sinCosCalIndex];
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
A=&buffer[br1Value];
B=&buffer[br2Value];
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus - cos*HIplus);
v2 = (cos*HRminus + sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus + v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
// A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
/* Handle DC bin separately - and ignore the Fs/2 bin
buffer[0]+=buffer[1];
buffer[1]=(fft_type)0;*/
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=buffer[0]-buffer[1];
buffer[0]+=buffer[1];
buffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf1xFastMathBR16(fft_type *buffer, FFTParam *h)
{
fft_type *A,*B;
fft_type *endptr1,*endptr2;
int br1Index;
fft_type HRplus,HRminus,HIplus,HIminus;
fft_type v1,v2,sin,cos;
fft_type iToRad=2 * M_PI / (2 * h->Points);
int bitReverseShiftM1=17-h->pow2Bits;
auto ButterfliesPerGroup = h->Points / 2;
/* Massage input to get the input for a real output sequence. */
A = buffer + 2;
B = buffer + h->Points * 2 - 2;
br1Index = 1; //h->BitReversed + 1;
int sinCosCalIndex = 0;
while(A < B)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(vx, &sin4_2, &cos4_2);
sin=-sin4_2.m128_f32[0];
cos=-cos4_2.m128_f32[0];
sinCosCalIndex++;
} else {
sin=-sin4_2.m128_f32[sinCosCalIndex];
cos=-cos4_2.m128_f32[sinCosCalIndex];
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
HRplus = (HRminus = *A - *B ) + (*B * 2);
HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
v1 = (sin*HRminus + cos*HIplus);
v2 = (cos*HRminus - sin*HIplus);
*A = (HRplus + v1) * (fft_type)0.5;
*B = *A - v1;
*(A+1) = (HIminus - v2) * (fft_type)0.5;
*(B+1) = *(A+1) - HIminus;
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
*(A+1)=-*(A+1);
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=0.5f*(buffer[0]+buffer[1]);
v2=0.5f*(buffer[0]-buffer[1]);
buffer[0]=v1;
buffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = buffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = buffer;
B = buffer + ButterfliesPerGroup * 2;
int sinCosCalIndex = 0;
int iSinCosIndex = 0;
while(A < endptr1)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i = 0; i < 4; i++) {
int brTemp = iSinCosIndex + i;
vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
}
sincos_ps(vx, &sin4_2, &cos4_2);
sin=-sin4_2.m128_f32[0];
cos=-cos4_2.m128_f32[0];
sinCosCalIndex++;
} else {
sin=-sin4_2.m128_f32[sinCosCalIndex];
cos=-cos4_2.m128_f32[sinCosCalIndex];
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1=*B*cos - *(B+1)*sin;
v2=*B*sin + *(B+1)*cos;
*B=(*A+v1)*(fft_type)0.5;
*(A++)=*(B++)-v1;
*B=(*A+v2)*(fft_type)0.5;
*(A++)=*(B++)-v2;
}
A = B;
B += ButterfliesPerGroup * 2;
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
int bitReverseShift = 16 - hFFT->pow2Bits;
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
RealOut[i] = buffer[brValue ];
ImagOut[i] = buffer[brValue+1];
}
RealOut[0] = buffer[0]; // DC component
ImagOut[0] = 0;
RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
ImagOut[hFFT->Points] = 0;
}
void ReorderToTime1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
int bitReverseShift=16-hFFT->pow2Bits;
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
int brValue;
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
TimeOut[i*2 ] = buffer[brValue ];
TimeOut[i*2+1] = buffer[brValue+1];
}
}
void RealFFTf4xFastMathBR16(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer = (__m128 *)buffer;
__m128 *A,*B;
__m128 *endptr1, *endptr2;
int br1Index, br2Index;
int br1Value, br2Value;
__m128 HRplus, HRminus, HIplus, HIminus;
__m128 v1, v2, sin, cos;
fft_type iToRad = 2 * M_PI/(2 * h->Points);
auto ButterfliesPerGroup = h->Points / 2;
int bitReverseShiftM1 = 17 - h->pow2Bits;
int bitReverseShift = bitReverseShiftM1 - 1;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = &localBuffer[h->Points * 2];
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = &localBuffer[ButterfliesPerGroup * 2];
int sinCosCalIndex = 0;
int iSinCosIndex = 0;
while(A < endptr1)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++) {
int brTemp=iSinCosIndex+i;
vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
}
sincos_ps(vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
sinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B=_mm_add_ps( *A, v1);
__m128 temp128 = _mm_set1_ps( 2.0);
*(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
*B=_mm_sub_ps(*A,v2);
*(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
/* Massage output to get the output for a real input sequence. */
br1Index = 1; // h->BitReversed + 1;
br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
int sinCosCalIndex = 0;
while(br1Index < br2Index)
{
v4sf sin4_2, cos4_2;
br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
if(!sinCosCalIndex)
{
v4sf vx;
for(int i = 0; i < 4; i++)
vx.m128_f32[i] = ((float)(br1Index+i)) * iToRad;
sincos_ps(vx, &sin4_2, &cos4_2);
sin = _mm_set1_ps(-sin4_2.m128_f32[0]);
cos = _mm_set1_ps(-cos4_2.m128_f32[0]);
sinCosCalIndex++;
} else {
sin = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
cos = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
if(sinCosCalIndex == 3)
sinCosCalIndex = 0;
else
sinCosCalIndex++;
}
A = &localBuffer[br1Value];
B = &localBuffer[br2Value];
__m128 temp128 = _mm_set1_ps( 2.0);
HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
temp128 = _mm_set1_ps( 0.5);
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
br1Index++;
br2Index--;
}
/* Handle the center bin (just need a conjugate) */
// A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
// negate sse style
*A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
/* Handle DC and Fs/2 bins separately */
/* Put the Fs/2 value into the imaginary part of the DC bin */
v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
localBuffer[1]=v1;
}
/* Description: This routine performs an inverse FFT to real data.
* This code is for floating point data.
*
* Note: Output is BIT-REVERSED! so you must use the BitReversed to
* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
* Input is in normal order, interleaved (real,imaginary) complex data
* You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
* Note: The scaling on this is done according to the standard FFT definition,
* so a unit amplitude DC signal will output an amplitude of (N)
* (Older revisions would progressively scale the input, so the output
* values would be similar in amplitude to the input values, which is
* good when using fixed point arithmetic)
*/
void InverseRealFFTf4xFastMathBR16(fft_type *buffer, FFTParam *h)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *A, *B;
__m128 *endptr1, *endptr2;
int br1Index;
__m128 HRplus, HRminus, HIplus, HIminus;
__m128 v1, v2, sin, cos;
fft_type iToRad = 2 * M_PI/(2 * h->Points);
int bitReverseShiftM1 = 17 - h->pow2Bits;
auto ButterfliesPerGroup = h->Points / 2;
/* Massage input to get the input for a real output sequence. */
A = localBuffer + 2;
B = localBuffer + h->Points * 2 - 2;
br1Index=1; //h->BitReversed+1;
int sinCosCalIndex=0;
while(A<B)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++)
vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
sincos_ps(vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
sinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
HRminus = _mm_sub_ps(*A, *B);
HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
HIminus = _mm_sub_ps( *(A+1), *(B+1));
HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
*A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
*B = _mm_sub_ps(*A, v1);
*(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
*(B+1) = _mm_sub_ps(*(A+1), HIminus);
A=&A[2];
B=&B[-2];
br1Index++;
}
/* Handle center bin (just need conjugate) */
// negate sse style
*(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
/* Handle DC bin separately - this ignores any Fs/2 component
buffer[1]=buffer[0]=buffer[0]/2;*/
/* Handle DC and Fs/2 bins specially */
/* The DC bin is passed in as the real part of the DC complex value */
/* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
/* (v1+v2) = buffer[0] == the DC component */
/* (v1-v2) = buffer[1] == the Fs/2 component */
v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
localBuffer[0]=v1;
localBuffer[1]=v2;
/*
* Butterfly:
* Ain-----Aout
* \ /
* / \
* Bin-----Bout
*/
endptr1 = localBuffer + h->Points * 2;
while(ButterfliesPerGroup > 0)
{
A = localBuffer;
B = localBuffer + ButterfliesPerGroup * 2;
int sinCosCalIndex = 0;
int iSinCosIndex = 0;
while(A < endptr1)
{
v4sf sin4_2, cos4_2;
if(!sinCosCalIndex)
{
v4sf vx;
for(int i=0;i<4;i++) {
int brTemp=iSinCosIndex+i;
vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
}
sincos_ps(vx, &sin4_2, &cos4_2);
sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
sinCosCalIndex++;
} else {
sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
if(sinCosCalIndex==3)
sinCosCalIndex=0;
else
sinCosCalIndex++;
}
iSinCosIndex++;
endptr2=B;
while(A<endptr2)
{
v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
*B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
*(A++)=_mm_sub_ps(*(B++), v1);
*B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
*(A++)=_mm_sub_ps(*(B++),v2);
}
A = B;
B = &B[ButterfliesPerGroup * 2];
}
ButterfliesPerGroup >>= 1;
}
}
void ReorderToFreq4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *localRealOut=(__m128 *)RealOut;
__m128 *localImagOut=(__m128 *)ImagOut;
int bitReverseShift=16-hFFT->pow2Bits;
// Copy the data into the real and imaginary outputs
for(size_t i = 1; i < hFFT->Points; i++) {
int brValue;
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
localRealOut[i]=localBuffer[brValue ];
localImagOut[i]=localBuffer[brValue+1];
}
localRealOut[0] = localBuffer[0]; // DC component
localImagOut[0] = _mm_set1_ps(0.0);
localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
}
void ReorderToTime4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
__m128 *localBuffer=(__m128 *)buffer;
__m128 *localTimeOut=(__m128 *)TimeOut;
int bitReverseShift=16-hFFT->pow2Bits;
// Copy the data into the real outputs
for(size_t i = 0; i < hFFT->Points; i++) {
int brValue;
brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
localTimeOut[i*2 ] = localBuffer[brValue ];
localTimeOut[i*2+1] = localBuffer[brValue+1];
}
}
#endif
#endif