audacia/src/VoiceKey.cpp

1060 lines
40 KiB
C++

/**********************************************************************
Audacity: A Digital Audio Editor
VoiceKey.cpp
?? Dominic Mazzoni
?? Shane Muller
*******************************************************************//*!
\class VoiceKey
\brief
This implements a voice key, detecting either the next "ON"
or "OFF" point
*//*******************************************************************/
#include "Audacity.h"
#include "VoiceKey.h"
#include <wx/string.h>
#include <wx/msgdlg.h>
#include <math.h>
#include <stdio.h>
#include <wx/filedlg.h>
#include <wx/msgdlg.h>
#include <wx/textfile.h>
#include <wx/intl.h>
#include <iostream>
using std::cout;
using std::endl;
VoiceKey::VoiceKey(){
mWindowSize = 0.01; //size of analysis window in seconds
mEnergyMean = .0006; // reasonable initial levels assuming sampling rate of
mEnergySD = .0002; // 44100 hertz
mSignChangesMean = .08;
mSignChangesSD= .02;
mDirectionChangesMean = .25;
mDirectionChangesSD = .2;
AdjustThreshold(2);
mSilentWindowSize = .05; //Amount of time (in seconds) below threshold to call it silence
mSignalWindowSize = .05; //Amount of time (in seconds) above threshold to call it signal
mUseEnergy = true;
mUseSignChangesLow = false;
mUseSignChangesHigh = false;
mUseDirectionChangesLow = false;
mUseDirectionChangesHigh = false;
};
VoiceKey::~VoiceKey(){
};
//---------------------------------------------------------------------------
// VoiceKey::On/Off Forward/Backward
// This operates in two phases:
// First, you take chunks of samples that are WindowSize big.
// If you have a run of them where something passes the threshold for SignalWindowSize seconds,
// you return to the last empty block and scan forward one sample at a time until you find the
// starting point of the speech.
//Move forward to find an ON region.
sampleCount VoiceKey::OnForward (WaveTrack & t, sampleCount start, sampleCount len) {
if((mWindowSize) >= len+10){
/* i18n-hint: Voice key is an experiemental/incomplete feature that
is used to navigate in vocal recordings, to move forwards and
backwards by words. So 'key' is being used in the sense of an index.
This error message means that you've selected too short
a region of audio to be able to use this feature.*/
wxMessageBox(_("Selection is too small to use voice key."));
return start;
}
else{
sampleCount lastsubthresholdsample; // keeps track of the sample number of the last sample to not exceed the threshold
//Change the millisecond-based parameters into sample-based parameters
double rate = t.GetRate(); //Translates seconds to samples
unsigned int WindowSizeInt = (unsigned int)(rate * mWindowSize); //Size of window to examine
unsigned int SignalWindowSizeInt = (unsigned int)(rate * mSignalWindowSize); //This much signal is necessary to trip key
int samplesleft = len - WindowSizeInt; //Indexes the number of samples remaining in the selection
lastsubthresholdsample = start; //start this off at the selection start
unsigned int i; //iterates through waveblock
int blockruns=0; //keeps track of the number of consecutive above-threshold blocks
int blocksize; //The final block may be smaller than WindowSizeInt, so use this
//This loop goes through the selection a block at a time. If a long enough run
//of above-threshold blocks occur, we return to the last sub-threshold block and
//go through one sample at a time.
//If there are fewer than 10 samples leftover, don't bother.
for(i = start; samplesleft >=10; i+=(WindowSizeInt-1) , samplesleft -= (WindowSizeInt -1)){
//Set blocksize so that it is the right size
if((unsigned int)samplesleft < WindowSizeInt){
blocksize = samplesleft;
}
else{
blocksize = WindowSizeInt;
}
//Test whether we are above threshold (the number of stats)
if(AboveThreshold(t,i,blocksize))
{
blockruns++; //Hit
} else{
blockruns=0; //Miss--start over
lastsubthresholdsample = i;
}
//If the blockrun is long enough, break out of the loop early:
if(blockruns > mSignalWindowSize/mWindowSize)
break;
}
//Now, if we broke out early (samplesleft > 10), go back to the lastsubthresholdsample and look more carefully
if(samplesleft > 10){
//Calculate how many to scan through--we only have to go through (at most)
//the first window + 1 samples--but we need another window samples to draw from.
samplesleft = 2*WindowSizeInt+1;
//To speed things up, create a local buffer to store things in, to avoid the costly t.Get();
//Only go through the first SignalWindowSizeInt samples, and choose the first that trips the key.
sampleFormat *buffer = new sampleFormat[samplesleft];
t.Get((samplePtr)buffer, floatSample,lastsubthresholdsample,samplesleft);
//Initialize these trend markers atrend and ztrend. They keep track of the
//up/down trends at the start and end of the evaluation window.
int atrend = sgn(buffer[1]-buffer[0]);
int ztrend = sgn(buffer[WindowSizeInt+1]-buffer[WindowSizeInt]);
double erg=0;
double sc=0;
double dc=0;
//Get initial test statistic values.
if(mUseEnergy)
erg = TestEnergy(t, lastsubthresholdsample, WindowSizeInt);
if(mUseSignChangesLow || mUseSignChangesHigh)
sc = TestSignChanges(t,lastsubthresholdsample, WindowSizeInt);
if(mUseDirectionChangesLow || mUseDirectionChangesHigh)
dc = TestDirectionChanges(t,lastsubthresholdsample,WindowSizeInt);
//Now, go through the sound again, sample by sample.
for(i=0; i<SignalWindowSizeInt-WindowSizeInt;i++){
int tests = 0;
int testThreshold = 0;
//Update the test statistics
if(mUseEnergy)
{
TestEnergyUpdate(erg, WindowSizeInt,buffer[i],buffer[i+WindowSizeInt+1]);
tests += (int)(erg>mThresholdEnergy);
testThreshold++;
}
if(mUseSignChangesLow)
{
TestSignChangesUpdate(sc,WindowSizeInt,buffer[i],buffer[i+1],buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(sc < mThresholdSignChangesLower);
testThreshold++;
}
if(mUseSignChangesHigh)
{
TestSignChangesUpdate(sc,WindowSizeInt,buffer[i],buffer[i+1],buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(sc > mThresholdSignChangesUpper);
testThreshold++;
}
if(mUseDirectionChangesLow)
{
TestDirectionChangesUpdate(dc,WindowSizeInt,atrend,buffer[i],buffer[i+1],ztrend,buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(dc < mThresholdDirectionChangesLower);
testThreshold++;
}
if(mUseDirectionChangesHigh)
{
TestDirectionChangesUpdate(dc,WindowSizeInt,atrend,buffer[i],buffer[i+1],ztrend,buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(dc > mThresholdDirectionChangesUpper);
testThreshold++;
}
if(tests >= testThreshold)
{ //Finish off on the first hit
break;
}
}
//When we get here, i+lastsubthresholdsample is the best guess for where the word starts
delete [] buffer;
return i+lastsubthresholdsample;
}
else{
//If we failed to find anything, return the start position
return start ;
}
}
}
//Move backward from end to find an ON region.
sampleCount VoiceKey::OnBackward (WaveTrack & t, sampleCount end, sampleCount len) {
if((mWindowSize) >= len+10){
wxMessageBox(_("Selection is too small to use voice key."));
return end;
}
else{
sampleCount lastsubthresholdsample; // keeps track of the sample number of the last sample to not exceed the threshold
//Change the millisecond-based parameters into sample-based parameters
double rate = t.GetRate(); //Translates seconds to samples
unsigned int WindowSizeInt = (unsigned int)(rate * mWindowSize); //Size of window to examine
//unsigned int SilentWindowSizeInt = (unsigned int)(rate * mSilentWindowSize); //This much signal is necessary to trip key
int samplesleft = len - WindowSizeInt; //Indexes the number of samples remaining in the selection
lastsubthresholdsample = end; //start this off at the end
unsigned int i; //iterates through waveblock
int blockruns=0; //keeps track of the number of consecutive above-threshold blocks
int blocksize; //The final block may be smaller than WindowSizeInt, so use this
//This loop goes through the selection a block at a time in reverse order. If a long enough run
//of above-threshold blocks occur, we return to the last sub-threshold block and
//go through one sample at a time.
//If there are fewer than 10 samples leftover, don't bother.
for(i = end - WindowSizeInt; samplesleft >=10; i-=(WindowSizeInt-1) , samplesleft -= (WindowSizeInt -1)){
//Set blocksize so that it is the right size
if(samplesleft < (int)WindowSizeInt){
blocksize = samplesleft;
}
else{
blocksize = WindowSizeInt;
}
//Test whether we are above threshold
if(AboveThreshold(t,i,blocksize))
{
blockruns++; //Hit
}
else
{
blockruns=0; //Miss--start over
lastsubthresholdsample = i+WindowSizeInt;
}
//If the blockrun is long enough, break out of the loop early:
if(blockruns > mSilentWindowSize/mWindowSize)
break;
}
//Now, if we broke out early (samplesleft > 10), go back to the lastsubthresholdsample and look more carefully
if(samplesleft > 10){
//Calculate how many to scan through--we only have to go through (at most)
//the first window + 1 samples--but we need another window samples to draw from.
samplesleft = 2*WindowSizeInt+1;
//To speed things up, create a local buffer to store things in, to avoid the costly t.Get();
//Only go through the first mSilentWindowSizeInt samples, and choose the first that trips the key.
sampleFormat *buffer = new sampleFormat[samplesleft];
t.Get((samplePtr)buffer, floatSample, lastsubthresholdsample-samplesleft,samplesleft);
//Initialize these trend markers atrend and ztrend. They keep track of the
//up/down trends at the start and end of the evaluation window.
int atrend = sgn(buffer[samplesleft - 2]-buffer[samplesleft - 1]);
int ztrend = sgn(buffer[samplesleft - WindowSizeInt-2]-buffer[samplesleft - WindowSizeInt-2]);
double erg=0;
double sc = 0;
double dc = 0;
//Get initial test statistic values.
if(mUseEnergy)
erg = TestEnergy(t, lastsubthresholdsample, WindowSizeInt);
if(mUseSignChangesLow || mUseSignChangesHigh)
sc = TestSignChanges(t,lastsubthresholdsample, WindowSizeInt);
if(mUseDirectionChangesLow || mUseDirectionChangesHigh)
dc = TestDirectionChanges(t,lastsubthresholdsample,WindowSizeInt);
//Now, go through the sound again, sample by sample.
for(i=samplesleft-1; i>WindowSizeInt; i--){
int tests = 0;
int testThreshold = 0;
//Update the test statistics
if(mUseEnergy)
{
TestEnergyUpdate(erg, WindowSizeInt,buffer[i],buffer[i+WindowSizeInt+1]);
tests += (int)(erg>mThresholdEnergy);
testThreshold++;
}
if(mUseSignChangesLow)
{
TestSignChangesUpdate(sc,WindowSizeInt,buffer[i],buffer[i+1],buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(sc < mThresholdSignChangesLower);
testThreshold++;
}
if(mUseSignChangesHigh)
{
TestSignChangesUpdate(sc,WindowSizeInt,buffer[i],buffer[i+1],buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(sc > mThresholdSignChangesUpper);
testThreshold++;
}
if(mUseDirectionChangesLow)
{
TestDirectionChangesUpdate(dc,WindowSizeInt,atrend,buffer[i],buffer[i+1],ztrend,buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(dc < mThresholdDirectionChangesLower);
testThreshold++;
}
if(mUseDirectionChangesHigh)
{
TestDirectionChangesUpdate(dc,WindowSizeInt,atrend,buffer[i],buffer[i+1],ztrend,buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(dc > mThresholdDirectionChangesUpper);
testThreshold++;
}
if(tests >= testThreshold)
{ //Finish off on the first hit
break;
}
}
//When we get here, i+lastsubthresholdsample is the best guess for where the word starts
delete [] buffer;
return lastsubthresholdsample - samplesleft + i;
}
else{
//If we failed to find anything, return the start position
return end ;
}
}
}
//Move froward from the start to find an OFF region.
sampleCount VoiceKey::OffForward (WaveTrack & t, sampleCount start, sampleCount len) {
if((mWindowSize) >= len+10){
wxMessageBox(_("Selection is too small to use voice key."));
return start;
}
else{
sampleCount lastsubthresholdsample; // keeps track of the sample number of the last sample to not exceed the threshold
//Change the millisecond-based parameters into sample-based parameters
double rate = t.GetRate(); //Translates seconds to samples
unsigned int WindowSizeInt = (unsigned int)(rate * mWindowSize); //Size of window to examine
unsigned int SilentWindowSizeInt = (unsigned int)(rate * mSilentWindowSize); //This much signal is necessary to trip key
int samplesleft = len - WindowSizeInt; //Indexes the number of samples remaining in the selection
lastsubthresholdsample = start; //start this off at the selection start
unsigned int i; //iterates through waveblock
int blockruns=0; //keeps track of the number of consecutive above-threshold blocks
int blocksize; //The final block may be smaller than WindowSizeInt, so use this
//This loop goes through the selection a block at a time. If a long enough run
//of above-threshold blocks occur, we return to the last sub-threshold block and
//go through one sample at a time.
//If there are fewer than 10 samples leftover, don't bother.
for(i = start; samplesleft >=10; i+=(WindowSizeInt-1) , samplesleft -= (WindowSizeInt -1)){
//Set blocksize so that it is the right size
if(samplesleft < (int)WindowSizeInt){
blocksize = samplesleft;
}
else{
blocksize = WindowSizeInt;
}
if(!AboveThreshold(t,i,blocksize))
{
blockruns++; //Hit
}
else
{
blockruns=0; //Above threshold--start over
lastsubthresholdsample = i;
}
//If the blockrun is long enough, break out of the loop early:
if(blockruns > mSilentWindowSize/mWindowSize)
break;
}
//Now, if we broke out early (samplesleft > 10), go back to the lastsubthresholdsample and look more carefully
if(samplesleft > 10){
//Calculate how many to scan through--we only have to go through (at most)
//the first window + 1 samples--but we need another window samples to draw from.
samplesleft = 2*WindowSizeInt+1;
//To speed things up, create a local buffer to store things in, to avoid the costly t.Get();
//Only go through the first SilentWindowSizeInt samples, and choose the first that trips the key.
sampleFormat *buffer = new sampleFormat[samplesleft];
t.Get((samplePtr)buffer, floatSample, lastsubthresholdsample,samplesleft);
//Initialize these trend markers atrend and ztrend. They keep track of the
//up/down trends at the start and end of the evaluation window.
int atrend = sgn(buffer[1]-buffer[0]);
int ztrend = sgn(buffer[WindowSizeInt+1]-buffer[WindowSizeInt]);
double erg=0;
double sc=0;
double dc=0;
//Get initial test statistic values.
if(mUseEnergy)
erg = TestEnergy(t, lastsubthresholdsample, WindowSizeInt);
if(mUseSignChangesLow || mUseSignChangesHigh)
sc = TestSignChanges(t,lastsubthresholdsample, WindowSizeInt);
if(mUseDirectionChangesLow || mUseDirectionChangesHigh)
dc = TestDirectionChanges(t,lastsubthresholdsample,WindowSizeInt);
//Now, go through the sound again, sample by sample.
for(i=0; i<SilentWindowSizeInt-WindowSizeInt;i++){
int tests = 0;
int testThreshold = 0;
//Update the test statistics
if(mUseEnergy)
{
TestEnergyUpdate(erg, WindowSizeInt,buffer[i],buffer[i+WindowSizeInt+1]);
tests += (int)(erg>mThresholdEnergy);
testThreshold++;
}
if(mUseSignChangesLow)
{
TestSignChangesUpdate(sc,WindowSizeInt,buffer[i],buffer[i+1],buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(sc < mThresholdSignChangesLower);
testThreshold++;
}
if(mUseSignChangesHigh)
{
TestSignChangesUpdate(sc,WindowSizeInt,buffer[i],buffer[i+1],buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(sc > mThresholdSignChangesUpper);
testThreshold++;
}
if(mUseDirectionChangesLow)
{
TestDirectionChangesUpdate(dc,WindowSizeInt,atrend,buffer[i],buffer[i+1],ztrend,buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(dc < mThresholdDirectionChangesLower);
testThreshold++;
}
if(mUseDirectionChangesHigh)
{
TestDirectionChangesUpdate(dc,WindowSizeInt,atrend,buffer[i],buffer[i+1],ztrend,buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(dc > mThresholdDirectionChangesUpper);
testThreshold++;
}
if(tests < testThreshold)
{ //Finish off on the first below-threshold block
break;
}
}
//When we get here, i+lastsubthresholdsample is the best guess for where the word starts
delete [] buffer;
return i+lastsubthresholdsample;
}
else{
//If we failed to find anything, return the start position
return start ;
}
}
}
//Move backward from the end to find an OFF region
sampleCount VoiceKey::OffBackward (WaveTrack & t, sampleCount end, sampleCount len) {
if((mWindowSize) >= len+10){
wxMessageBox(_("Selection is too small to use voice key."));
return end;
}
else{
sampleCount lastsubthresholdsample; // keeps track of the sample number of the last sample to not exceed the threshold
//Change the millisecond-based parameters into sample-based parameters
double rate = t.GetRate(); //Translates seconds to samples
unsigned int WindowSizeInt = (unsigned int)(rate * mWindowSize); //Size of window to examine
//unsigned int SilentWindowSizeInt = (unsigned int)(rate * mSilentWindowSize); //This much signal is necessary to trip key
int samplesleft = len - WindowSizeInt; //Indexes the number of samples remaining in the selection
lastsubthresholdsample = end; //start this off at the end
unsigned int i; //iterates through waveblock
int blockruns=0; //keeps track of the number of consecutive above-threshold blocks
int blocksize; //The final block may be smaller than WindowSizeInt, so use this
//This loop goes through the selection a block at a time in reverse order. If a long enough run
//of above-threshold blocks occur, we return to the last sub-threshold block and
//go through one sample at a time.
//If there are fewer than 10 samples leftover, don't bother.
for(i = end - WindowSizeInt; samplesleft >=10; i-=(WindowSizeInt-1) , samplesleft -= (WindowSizeInt -1)){
//Set blocksize so that it is the right size
if(samplesleft < (int)WindowSizeInt){
blocksize = samplesleft;
}
else{
blocksize = WindowSizeInt;
}
if(!AboveThreshold(t,i,blocksize))
{
blockruns++; //Hit
}
else
{
blockruns=0; //Miss--start over
lastsubthresholdsample = i+WindowSizeInt;
}
//If the blockrun is long enough, break out of the loop early:
if(blockruns > mSilentWindowSize/mWindowSize)
break;
}
//Now, if we broke out early (samplesleft > 10), go back to the lastsubthresholdsample and look more carefully
if(samplesleft > 10){
//Calculate how many to scan through--we only have to go through (at most)
//the first window + 1 samples--but we need another window samples to draw from.
samplesleft = 2*WindowSizeInt+1;
//To speed things up, create a local buffer to store things in, to avoid the costly t.Get();
//Only go through the first SilentWindowSizeInt samples, and choose the first that trips the key.
sampleFormat *buffer = new sampleFormat[samplesleft];
t.Get((samplePtr)buffer, floatSample, lastsubthresholdsample-samplesleft,samplesleft);
//Initialize these trend markers atrend and ztrend. They keep track of the
//up/down trends at the start and end of the evaluation window.
int atrend = sgn(buffer[samplesleft - 2]-buffer[samplesleft - 1]);
int ztrend = sgn(buffer[samplesleft - WindowSizeInt-2]-buffer[samplesleft - WindowSizeInt-2]);
double erg=0;
double sc=0;
double dc=0;
//Get initial test statistic values.
if(mUseEnergy)
erg = TestEnergy(t, lastsubthresholdsample, WindowSizeInt);
if(mUseSignChangesLow || mUseSignChangesHigh)
sc = TestSignChanges(t,lastsubthresholdsample, WindowSizeInt);
if(mUseDirectionChangesLow || mUseDirectionChangesHigh)
dc = TestDirectionChanges(t,lastsubthresholdsample,WindowSizeInt);
//Now, go through the sound again, sample by sample.
for(i=samplesleft-1; i>WindowSizeInt; i--){
int tests = 0;
int testThreshold = 0;
//Update the test statistics
if(mUseEnergy)
{
TestEnergyUpdate(erg, WindowSizeInt,buffer[i],buffer[i+WindowSizeInt+1]);
tests += (int)(erg>mThresholdEnergy);
testThreshold++;
}
if(mUseSignChangesLow)
{
TestSignChangesUpdate(sc,WindowSizeInt,buffer[i],buffer[i+1],buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(sc < mThresholdSignChangesLower);
testThreshold++;
}
if(mUseSignChangesHigh)
{
TestSignChangesUpdate(sc,WindowSizeInt,buffer[i],buffer[i+1],buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(sc > mThresholdSignChangesUpper);
testThreshold++;
}
if(mUseDirectionChangesLow)
{
TestDirectionChangesUpdate(dc,WindowSizeInt,atrend,buffer[i],buffer[i+1],ztrend,buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(dc < mThresholdDirectionChangesLower);
testThreshold++;
}
if(mUseDirectionChangesHigh)
{
TestDirectionChangesUpdate(dc,WindowSizeInt,atrend,buffer[i],buffer[i+1],ztrend,buffer[i+WindowSizeInt],buffer[i+WindowSizeInt+1]);
tests += (int)(dc > mThresholdDirectionChangesUpper);
testThreshold++;
}
if(tests < testThreshold)
{ //Finish off on the first hit
break;
}
}
//When we get here, i+lastsubthresholdsample is the best guess for where the word starts
delete [] buffer;
return lastsubthresholdsample - samplesleft + i;
}
else{
//If we failed to find anything, return the start position
return end ;
}
}
}
//This tests whether a specified block region is above or below threshold.
bool VoiceKey::AboveThreshold(WaveTrack & t, sampleCount start, sampleCount len)
{
double erg=0;
double sc=0;
double dc=0; //These store three statistics: energy, signchanges, and directionchanges
int tests =0; //Keeps track of how many statistics surpass the threshold.
int testThreshold=0; //Keeps track of the threshold.
//Calculate the test statistics
if(mUseEnergy)
{
testThreshold++;
erg = TestEnergy(t, start,len);
tests +=(int)(erg > mThresholdEnergy);
#if 0
std::cout << "Energy: " << erg << " " <<mThresholdEnergy << std::endl;
#endif
}
if(mUseSignChangesLow)
{
testThreshold++;
sc = TestSignChanges(t,start,len);
tests += (int)(sc < mThresholdSignChangesLower);
#if 0
std::cout << "SignChanges: " << sc << " " <<mThresholdSignChangesLower<< " < " << mThresholdSignChangesUpper << std::endl;
#endif
}
if(mUseSignChangesHigh)
{
testThreshold++;
sc = TestSignChanges(t,start,len);
tests += (int)(sc > mThresholdSignChangesUpper);
#if 0
std::cout << "SignChanges: " << sc << " " <<mThresholdSignChangesLower<< " < " << mThresholdSignChangesUpper << std::endl;
#endif
}
if(mUseDirectionChangesLow)
{
testThreshold++;
dc = TestDirectionChanges(t,start,len);
tests += (int)(dc < mThresholdDirectionChangesLower);
#if 0
std::cout << "DirectionChanges: " << dc << " " <<mThresholdDirectionChangesLower<< " < " << mThresholdDirectionChangesUpper << std::endl;
#endif
}
if(mUseDirectionChangesHigh)
{
testThreshold++;
dc = TestDirectionChanges(t,start,len);
tests += (int)(dc > mThresholdDirectionChangesUpper);
#if 0
std::cout << "DirectionChanges: " << dc << " " <<mThresholdDirectionChangesLower<< " < " << mThresholdDirectionChangesUpper << std::endl;
#endif
}
//Test whether we are above threshold (the number of stats)
return (tests >= testThreshold);
}
//This adjusts the threshold. Larger values of t expand the noise region,
//making more things be classified as noise (and requiring a stronger signal).
void VoiceKey::AdjustThreshold(double t){
mThresholdAdjustment = t;
mThresholdEnergy = mEnergyMean + mEnergySD * t;
mThresholdSignChangesUpper = mSignChangesMean + mSignChangesSD * t;
mThresholdSignChangesLower = mSignChangesMean - mSignChangesSD * t;
mThresholdDirectionChangesUpper = mDirectionChangesMean + mDirectionChangesSD * t;
mThresholdDirectionChangesLower = mDirectionChangesMean - mDirectionChangesSD * t;
};
//This 'calibrates' the voicekey to noise
void VoiceKey::CalibrateNoise(WaveTrack & t, sampleCount start, sampleCount len){
//To calibrate the noise, we need to scan the sample block just like in the voicekey and
//calculate the mean and standard deviation of the test statistics.
//Then, we set the BaselineThreshold to be one
wxBusyCursor busy;
//initialize some sample statistics: sums of X and X^2
double sumerg, sumerg2;
double sumsc, sumsc2;
double sumdc, sumdc2;
double erg, sc, dc;
// sampleFormat a1=0;
// sampleFormat a2=0;
// sampleFormat z1=0;
// sampleFormat z2=0; // keeps track of initial and final samples of window to enable updating formulae
int atrend=0; // equals sgn(a2-a1); Keeps track of trend at start of window
int ztrend=0; // equals sgn(z2-z1); Keeps track of trend at end of window
//Now, change the millisecond-based parameters into sample-based parameters
//(This depends on WaveTrack t)
double rate = t.GetRate();
unsigned int WindowSizeInt = (unsigned int)(rate * mWindowSize);
// unsigned int SignalWindowSizeInt = (unsigned int)(rate * mSignalWindowSize);
//Get the first test statistics
//Calibrate all of the statistic, because they might be
//changed later.
// if(mUseEnergy)
erg = TestEnergy(t, start, WindowSizeInt);
// if(mUseSignChanges)
sc = TestSignChanges(t,start, WindowSizeInt);
// if(mUseDirectionChanges)
dc = TestDirectionChanges(t,start,WindowSizeInt);
//Calculate initial values for the trend trackers--------------------//
//Get((samplePtr)a1, floatSample, start, 1);
// t.Get((samplePtr)a2, floatSample, start+1, 1);
// t.Get((samplePtr)z1, floatSample, start+WindowSizeInt-1, 1);
// t.Get((samplePtr)z2, floatSample, start+WindowSizeInt, 1);
atrend = -1;//sgn(a2-a1);
ztrend = -1;// sgn(z2-z1);
//-------------------------------------------------------------------//
sumerg =0.0;
sumerg2 = 0.0;
sumsc =0.0;
sumsc2 = 0.0;
sumdc =0.0;
sumdc2 =0.0;
// int n = len - WindowSizeInt; //This is how many samples we have
int samplesleft = len - WindowSizeInt;
int i;
int blocksize;
int samples=0;
for(i = start; samplesleft >=10 ; i += (WindowSizeInt -1), samplesleft -= (WindowSizeInt -1) )
{
//Take samples chunk-by-chunk.
//Normally, this should be in WindowSizeInt chunks, but at the end (if there are more than 10
//samples left) take a chunk that eats the rest of the samples.
samples++; //Increment the number of samples we have
if(samplesleft < (int)WindowSizeInt)
{
blocksize = samplesleft;
}
else
{
blocksize = WindowSizeInt;
}
erg = TestEnergy(t, i, blocksize);
sumerg +=(double)erg;
sumerg2 += pow((double)erg,2);
sc = TestSignChanges(t,i, blocksize);
sumsc += (double)sc;
sumsc2 += pow((double)sc,2);
dc = TestDirectionChanges(t,i,blocksize);
sumdc += (double)dc;
sumdc2 += pow((double)dc,2);
}
mEnergyMean = sumerg / samples;
mEnergySD = sqrt(sumerg2/samples - mEnergyMean*mEnergyMean);
mSignChangesMean = sumsc / samples;
mSignChangesSD = sqrt(sumsc2 / samples - mSignChangesMean * mSignChangesMean);
mDirectionChangesMean = sumdc / samples;
mDirectionChangesSD =sqrt(sumdc2 / samples - mDirectionChangesMean * mDirectionChangesMean) ;
wxString text = wxString::Format(_("Calibration Results\n"));
/* i18n-hint: %1.4f is replaced by a number. sd stands for 'Standard Deviations'*/
text += wxString::Format(_("Energy -- mean: %1.4f sd: (%1.4f)\n"),mEnergyMean,mEnergySD);
text+= wxString::Format(_("Sign Changes -- mean: %1.4f sd: (%1.4f)\n"),mSignChangesMean,mSignChangesSD);
text+= wxString::Format(_("Direction Changes -- mean: %1.4f sd: (%1.4f)\n"),mDirectionChangesMean,mDirectionChangesSD);
wxMessageDialog * stats = new wxMessageDialog(NULL, text,
wxT("Calibration Complete"),
wxOK | wxICON_INFORMATION,
wxPoint(-1,-1));
stats->ShowModal();
delete stats;
AdjustThreshold(mThresholdAdjustment);
}
void VoiceKey::SetKeyType(bool erg, bool scLow , bool scHigh,
bool dcLow, bool dcHigh)
{
mUseEnergy = erg;
mUseSignChangesLow = scLow;
mUseSignChangesHigh = scHigh;
mUseDirectionChangesLow = dcLow;
mUseDirectionChangesHigh = dcHigh;
}
//This might continue over a number of blocks.
double VoiceKey::TestEnergy (WaveTrack & t, sampleCount start, sampleCount len)
{
double sum = 1;
sampleCount s = start; //Keep track of start
sampleCount originalLen = len; //Keep track of the length of block to process (its not the length of t)
sampleCount blockSize = t.GetMaxBlockSize(); //Determine size of sampling buffer
if( blockSize > len)
blockSize = len;
float *buffer = new float[blockSize]; //Get a sampling buffer
while(len > 0)
{
sampleCount block = t.GetBestBlockSize(s); //Figure out how much to grab
if(block > len) block = len; //Don't grab too much!
t.Get((samplePtr)buffer,floatSample, s,block); //grab the block;
//Now, go through the block and calculate energy
for(int i = 0; i< block; i++)
{
sum += buffer[i]*buffer[i];
}
len -= block;
s += block;
}
delete [] buffer;
return sum / originalLen;
}
//This will update RMSE by adding one element and subtracting another
void VoiceKey::TestEnergyUpdate (double & prevErg, int len, const sampleFormat & drop, const sampleFormat & add)
{
//This is an updating formula for RMSE. It will only recalculate what's changed.
prevErg = prevErg + (double)(abs(add) - abs(drop))/len;
}
double VoiceKey::TestSignChanges(WaveTrack & t, sampleCount start, sampleCount len)
{
sampleCount s = start; //Keep track of start
sampleCount originalLen = len; //Keep track of the length of block to process (its not the length of t)
sampleCount blockSize = t.GetMaxBlockSize(); //Determine size of sampling buffer
unsigned long signchanges = 1;
int currentsign=0;
if( blockSize > len)
blockSize = len;
sampleFormat *buffer = new sampleFormat[blockSize]; //Get a sampling buffer
while(len > 0) {
sampleCount block = t.GetBestBlockSize(s); //Figure out how much to grab
if(block > len) block = len; //Don't grab too much!
t.Get((samplePtr)buffer,floatSample, s,block); //grab the block;
if (len == originalLen)
{
//The first time through, set stuff up special.
currentsign = sgn(buffer[0]);
}
//Now, go through the block and calculate zero crossings
for(int i = 0; i< block; i++)
{
if( sgn(buffer[i]) != currentsign)
{
currentsign = sgn(buffer[i]);
signchanges++;
}
}
len -= block;
s += block;
}
delete [] buffer;
return (double)signchanges/originalLen;
}
void VoiceKey::TestSignChangesUpdate(double & currentsignchanges, int len,
const sampleFormat & a1,
const sampleFormat & a2,
const sampleFormat & z1,
const sampleFormat & z2)
{
if(sgn(a1)!=sgn(a2)) currentsignchanges -= 1.0/len;
if(sgn(z1)!=sgn(z2)) currentsignchanges += 1.0/len;
}
double VoiceKey::TestDirectionChanges(WaveTrack & t, sampleCount start, sampleCount len)
{
sampleCount s = start; //Keep track of start
sampleCount originalLen = len; //Keep track of the length of block to process (its not the length of t)
sampleCount blockSize = t.GetMaxBlockSize(); //Determine size of sampling buffer
unsigned long directionchanges = 1;
sampleFormat lastval=sampleFormat(0);
int lastdirection=1;
if( blockSize > len)
blockSize = len;
sampleFormat *buffer = new sampleFormat[blockSize]; //Get a sampling buffer
while(len > 0) {
sampleCount block = t.GetBestBlockSize(s); //Figure out how much to grab
if(block > len) block = len; //Don't grab too much!
t.Get((samplePtr)buffer,floatSample, s,block); //grab the block;
if (len == originalLen) {
//The first time through, set stuff up special.
lastval = buffer[0];
}
//Now, go through the block and calculate zero crossings
for(int i = 0; i< block; i++){
if( sgn(buffer[i]-lastval) != lastdirection) {
directionchanges++;
lastdirection = sgn(buffer[i] - lastval);
}
lastval = buffer[i];
}
len -= block;
s += block;
}
delete [] buffer;
return (double)directionchanges/originalLen;
}
// This method does an updating by looking at the trends
// This will change currentdirections and atrend/trend, so be warned.
void VoiceKey::TestDirectionChangesUpdate(double & currentdirectionchanges, int len,
int & atrend, const sampleFormat & a1, const sampleFormat & a2,
int & ztrend, const sampleFormat & z1, const sampleFormat & z2)
{
if(sgn(a2 - a1)!= atrend ) {
//Here, the direction shifted for the item we're dropping.
currentdirectionchanges -= 1.0/len;
atrend = sgn(a2-a1);
}
if(sgn(z2 - z1)!= ztrend){
//Here, the direction shifts when we add an item
currentdirectionchanges += 1.0/len;
ztrend = sgn(z2-z1);
}
}
// Indentation settings for Vim and Emacs and unique identifier for Arch, a
// version control system. Please do not modify past this point.
//
// Local Variables:
// c-basic-offset: 3
// indent-tabs-mode: nil
// End:
//
// vim: et sts=3 sw=3
// arch-tag: