audacia/lib-src/libnyquist/nyquist/nyqsrc/convolve.c

497 lines
18 KiB
C

/* convolve.c -- implements (non-"fast") convolution */
/*
* Note: this code is mostly generated by translate.lsp (see convolve.tran
* in the tran directory), but it has been modified by hand to extend the
* stop time to include the "tail" of the convolution beyond the length
* of the first parameter.
*/
/* Original convolve.c modified to do fast convolution. Here are some
* notes:
* The first arg is arbitrary length. The second arg is the impulse
* response, which is converted into a table. The FFT size will be
* limited to 64K, which allows convolution with up to 32K samples.
* For longer impulse responses, we'll have to do convolutions one
* 32K block at a time. I considered just limiting the convolution
* size and handling longer impulse responses in Nyquist XLISP code,
* but that would require taking FFT's of each input block multiple
* times. Here, we save the FFT's and reuse them, which should gain
* a factor of 2 in speed (we still have to inverse FFT each block
* after multiplication, which should take 1/2 the time of doing
* FFT/inverse-FFT on each block).
*
* The fast convolution works like this:
* inputs are x_snd and h_snd.
* Compute the length of h_snd in samples.
* Set fft_size = MAX_FFT_SIZE
* If length <= MAX_FFT_SIZE / 4 then
* set fft_size = (round length to power of 2) * 2
* set N = fft_size/2
* Set h_len = (length rounded up to multiple of fft_size/2) * 2
* Let L = h_len/ fft_size
* Allocate H of h_len floats
* Iterate over i from 0 to L-1:
* Copy ht with zero fill into H[i] of size fft_size,
* where each H[i] of size fft_size is filled with
* fft_size/2 samples (except for the last H[i])
* Compute FFT of H[i] in place (FFT size is fft_size)
* Allocate X of h_len floats. This represents the history
* of x_snd, which is initially all zero, so the FFT, X is all zero
* Allocate output buffers Y and R, each of size fft_size
* Iterate over j (i.e. run this to generate MAX_CONVOLVE_LEN
* samples; then j = (j + 1) mod L):
* Copy 2nd half of R to first half and zero the 2nd half.
* Note: the first time does nothing because R is initially
* filled with zeros
* Copy fft_size/2 samples of x_snd into X[j],
* where X[j] is of size fft_size and filled with
* N samples (except when x_snd terminates)
* Zero fill X[j]
* Compute FFT of X[j] in place.
* Iterate k = 0 to L-1
* Multiply X[(j-k) mod L] by H[k] (result goes into Y).
* Compute IFFT of Y in place. Y is now time domain convolution
* of two blocks of samples.
* Add Y to R.
* Now N samples of R can be output.
* For simplicity, we'll keep processing x_snd input even after x_snd
* terminates. This will avoid special cases where we do not need all
* of X[j] at the end of the convolution.
*
* Length of output is length of x input + length of h
*/
// D is a statement prefix for debug output: "D stmt;" expands to
// "if (0) stmt;", so the statement is compiled but never executed.
// You can turn on debugging output with: #define D if (1)
#define D if (0)
/* impulse responses longer than this many samples are rejected by
 * snd_make_convolve */
#define MAX_IR_LEN 4000000 /* maximum impulse response length */
/* log2 of the largest FFT size used, i.e. FFTs are at most 1 << 16 samples */
#define MAX_LOG_FFT_SIZE 16 /* maximum fft size for convolution */
//#define MAX_LOG_FFT_SIZE 4 /* maximum fft size for convolution */
/* defined ahead of the <math.h> include that follows */
#define _USE_MATH_DEFINES 1 /* for Visual C++ to get M_LN2 */
#include <assert.h>
#include <math.h>
#include "stdio.h"
#ifndef mips
#include "stdlib.h"
#endif
#include "xlisp.h"
#include "sound.h"
#include "samples.h"
#include "falloc.h"
#include "cext.h"
#include "fftlib.h"
#include "fftext.h"
#include "convolve.h"
/* forward declaration; the definition appears later in this file */
void convolve_free(snd_susp_type a_susp);
/* State of one convolve suspension. The FFT-related buffers (X, H, Y, R)
 * are allocated with calloc in snd_make_convolve and released with free in
 * convolve_free.
 */
typedef struct convolve_susp_struct {
snd_susp_node susp; // generic suspension header (fetch, free, mark, t0, sr, current, ...)
int64_t terminate_cnt; // output sample count at which to terminate; UNKNOWN until the end of x_snd is seen
boolean know_end_of_x; // set once terminate_cnt has been computed, so it is computed only once
boolean logically_stopped; // set when susp.current reaches susp.log_stop_cnt
sound_type x_snd; // the input sound being convolved
int x_snd_cnt; // number of unread samples remaining at x_snd_ptr
sample_block_values_type x_snd_ptr; // next unread sample of x_snd
sample_type *X; // the FFTs of x_snd: L blocks of 2*N samples each
int j; // which block are we processing? 0 <= j < L
sample_type *H; // the FFTs of h_snd: L blocks of 2*N samples each
sample_type *Y; // product of X*H where we inverse FFT (2*N samples)
int h_snd_len; // true length of h_snd in samples
int N; // length of convolution, FFTs are of size 2*N
int M; // log2 of 2*N, the FFT size
int L; // number of blocks: h_len / (2*N)
sample_type *R; // result buffer where output is summed (2*N samples)
sample_type *R_current; // pointer to next sample to output
} convolve_susp_node, *convolve_susp_type;
/*
void h_reverse(sample_type *h, long len)
{
sample_type temp;
int i;
for (i = 0; i < len; i++) {
temp = h[i];
h[i] = h[len - 1];
h[len - 1] = temp;
len--;
}
}
*/
/* convolve_s_fetch -- fetch routine: compute the next block of output samples.
 *
 * Output samples are copied from the first half of the result buffer R.
 * Whenever R_current has reached R + N, the next N output samples are
 * produced first: R is shifted down by N samples, N samples of x_snd are
 * read into block X[j] and transformed in place, and for every impulse
 * response block H[k] the product with the matching X block is inverse
 * transformed and added into R.
 *
 * a_susp:   the convolve suspension (cast to convolve_susp_type)
 * snd_list: list node that receives the new sample block; it is terminated
 *           instead when no samples could be produced
 */
void convolve_s_fetch(snd_susp_type a_susp, snd_list_type snd_list)
{
convolve_susp_type susp = (convolve_susp_type) a_susp;
int cnt = 0; /* how many samples computed */
int togo;
int n;
sample_block_type out;
register sample_block_values_type out_ptr;
register sample_block_values_type out_ptr_reg;
sample_type *R = susp->R;
sample_type *R_current;
int N = susp->N;
falloc_sample_block(out, "convolve_s_fetch");
out_ptr = out->samples;
snd_list->block = out;
while (cnt < max_sample_block_len) { /* outer loop */
/* first compute how many samples to generate in inner loop: */
/* don't overflow the output sample block: */
togo = max_sample_block_len - cnt;
/* if we need output samples, generate them here */
D printf("test R_current at offset %td\n", susp->R_current - R);
if (susp->R_current >= R + N) { // true when we output half of R
int i = 0;
int k;
/* Xj is block j of X; each block holds 2*N samples */
sample_type *Xj = susp->X + susp->j * N * 2;
sample_type *H = susp->H;
sample_type *Y = susp->Y;
int to_copy;
/* Shift R, zero fill: */
memcpy(R, R + N, N * sizeof(*R));
memset(R + N, 0, N * sizeof(*R));
/* Copy N samples of x_snd into Xj and zero fill to size 2N */
D printf("Copying N samples of x_snd into Xj at offset %td\n", Xj - susp->X);
while (i < N) {
/* refill from x_snd when the current input block is used up */
if (susp->x_snd_cnt == 0) {
susp_get_samples(x_snd, x_snd_ptr, x_snd_cnt);
if (susp->x_snd->logical_stop_cnt ==
susp->x_snd->current - susp->x_snd_cnt) {
min_cnt(&susp->susp.log_stop_cnt, susp->x_snd,
(snd_susp_type) susp, susp->x_snd_cnt);
}
}
/* This code is not standard. Since we extend the terminate
* count by susp->h_snd_len, the "standard" call to min_cnt()
* results in extending the terminate time forever. Instead,
* we make this code run once only by setting know_end_of_x.
*/
if (!susp->know_end_of_x &&
susp->x_snd_ptr == zero_block->samples) {
susp->terminate_cnt = susp->x_snd->current - susp->x_snd_cnt;
/* extend the output to include impulse response */
susp->terminate_cnt += susp->h_snd_len;
susp->know_end_of_x = TRUE;
}
/* copy no more than the remaining space and no more than
* the amount remaining in the block
*/
to_copy = min(N - i, susp->x_snd_cnt);
memcpy(Xj + i, susp->x_snd_ptr,
to_copy * sizeof(*susp->x_snd_ptr));
susp->x_snd_ptr += to_copy;
susp->x_snd_cnt -= to_copy;
i += to_copy;
}
/* zero fill to size 2N */
memset(Xj + N, 0, N * sizeof(Xj[0]));
D {
printf("Xj at offset %td: ", Xj - susp->X);
printf(" %d samples ", susp->N * 2);
float big = 0.0;
for (i = 0; i < susp->N * 2; i++) {
// printf("%g ", Xj[i]);
big = max(big, fabs(Xj[i]));
}
printf("MAX: %g\n", big);
}
/* Compute FFT of Xj in place */
fftInit(susp->M);
rffts(Xj, susp->M, 1);
/* convolve pairs of blocks and sum into Y */
/* NOTE(review): this clears only the first N of Y's 2*N samples, and Y
 * is passed as the output argument of rspectprod below -- presumably it
 * is overwritten there; confirm whether this memset is needed at all */
memset(Y, 0, N * sizeof(*Y)); /* initialize sum to zero */
for (k = 0; k < susp->L; k++) {
/* Multiply the X block read k iterations ago by H[k]
 * (result goes into Y) */
sample_type *X = susp->X + ((susp->L + susp->j - k) % susp->L) * N * 2;
rspectprod(X, H + k * N * 2, Y, N * 2);
/* Compute IFFT of Y in place */
riffts(Y, susp->M, 1);
/* R += Y */
D { printf("Output block %d, X offset %td: ", k, X - susp->X);
printf(" %d samples ", 2 * N);
float big = 0.0;
for (i = 0; i < 2 * N; i++) {
big = max(big, fabs(Y[i]));
}
printf("MAX: %g\n", big);
}
for (i = 0; i < 2 * N; i++) {
R[i] += Y[i];
}
}
/* now N samples of R can be output */
susp->R_current = R;
D printf("R: %d samples ", susp->N);
D { float big = 0.0;
for (i = 0; i < susp->N; i++) {
// printf("%g ", R[i]);
big = max(big, fabs(R[i]));
}
printf("MAX: %g\n", big);
}
/* advance to the next X block, wrapping around after L blocks */
susp->j = (susp->j + 1) % susp->L;
}
/* compute togo, the number of samples to "compute" */
/* can't use more than what's left in R. R_current is
the next sample of R, so what's left is N - (R_current - R) */
R_current = susp->R_current;
togo = (int) min(togo, N - (R_current - R));
/* don't run past terminate time */
if (susp->terminate_cnt != UNKNOWN &&
susp->terminate_cnt <= susp->susp.current + cnt + togo) {
togo = (int) (susp->terminate_cnt - (susp->susp.current + cnt));
if (togo == 0) break;
}
/* don't run past logical stop time */
if (!susp->logically_stopped &&
susp->susp.log_stop_cnt != UNKNOWN &&
susp->susp.log_stop_cnt <= susp->susp.current + cnt + togo) {
togo = (int) (susp->susp.log_stop_cnt - (susp->susp.current + cnt));
D printf("susp->susp.log_stop_cnt is set to %" PRId64 "\n",
susp->susp.log_stop_cnt);
if (togo == 0) break;
}
n = togo;
out_ptr_reg = out_ptr;
/* copy togo samples from R to the output block */
if (n) do { /* the inner sample computation loop */
*out_ptr_reg++ = (sample_type) *R_current++;
} while (--n); /* inner loop */
/* using R_current is a bad idea on RS/6000: */
susp->R_current += togo;
out_ptr += togo;
cnt += togo;
} /* outer loop */
/* test for termination */
/* nothing was produced and nothing more may be produced: end the sound */
if (togo == 0 && cnt == 0) {
snd_list_terminate(snd_list);
} else {
snd_list->block_len = cnt;
susp->susp.current += cnt;
}
/* test for logical stop */
if (susp->logically_stopped) {
snd_list->logically_stopped = true;
} else if (susp->susp.log_stop_cnt == susp->susp.current) {
susp->logically_stopped = true;
}
} /* convolve_s_fetch */
/* convolve_toss_fetch -- one-shot fetch routine used when input samples
 * must be discarded before the start time.
 *
 * Reads x_snd forward until its read position has passed susp.t0, skips the
 * samples of the current input block that lie before susp.t0, then restores
 * the saved fetch routine (keep_fetch) and lets it service this request.
 */
void convolve_toss_fetch(snd_susp_type a_susp, snd_list_type snd_list)
{
    convolve_susp_type susp = (convolve_susp_type) a_susp;
    time_type start_time = susp->susp.t0;
    long skip;

    /* pull blocks from x_snd until the block containing start_time is in hand */
    for (;;) {
        if ((ROUNDBIG((start_time - susp->x_snd->t0) * susp->x_snd->sr)) <
            susp->x_snd->current) {
            break;
        }
        susp_get_samples(x_snd, x_snd_ptr, x_snd_cnt);
    }

    /* number of samples in the current block that precede start_time */
    skip = (long) ROUNDBIG((start_time - susp->x_snd->t0) * susp->x_snd->sr -
                           (susp->x_snd->current - susp->x_snd_cnt));
    susp->x_snd_ptr += skip;
    susp_took(x_snd_cnt, skip);

    /* from now on use the normal fetch routine, starting with this call */
    susp->susp.fetch = susp->susp.keep_fetch;
    susp->susp.fetch(a_susp, snd_list);
}
/* convolve_mark -- mark callback: marks the input sound held by the
 * suspension by passing it to sound_xlmark.
 */
void convolve_mark(snd_susp_type a_susp)
{
    sound_xlmark(((convolve_susp_type) a_susp)->x_snd);
}
/* convolve_free -- free callback: releases the four work buffers, drops
 * the reference to the input sound, and returns the suspension node itself
 * to the generic allocator.
 */
void convolve_free(snd_susp_type a_susp)
{
    convolve_susp_type cs = (convolve_susp_type) a_susp;

    /* work buffers obtained from calloc in snd_make_convolve */
    free(cs->H);
    free(cs->X);
    free(cs->Y);
    free(cs->R);

    sound_unref(cs->x_snd);
    ffree_generic(cs, sizeof(convolve_susp_node), "convolve_free");
}
/* convolve_print_tree -- print_tree callback: prints the label "x_snd:"
 * at indentation level n, followed by the tree of the input sound.
 */
void convolve_print_tree(snd_susp_type a_susp, int n)
{
    sound_type input = ((convolve_susp_type) a_susp)->x_snd;

    indent(n);
    stdputstr("x_snd:");
    sound_print_tree_1(input, n);
}
/* fill_with_samples -- copy the next n samples of sound s into x[0..n-1],
 * each multiplied by s->scale.
 *
 * The read position is stored in s->extra (extra[0] = size in bytes of the
 * extra area, CNT = sample count of the current block, INDEX = number of
 * samples already taken from it), so consecutive calls on the same sound
 * continue where the previous call stopped.
 *
 * x: destination, must have room for n samples
 * s: sound to read from; its extra field is allocated here on first use
 * n: number of samples to store
 *
 * Calls xlfail if the iterator state cannot be allocated.
 */
void fill_with_samples(sample_type *x, sound_type s, long n)
{
    /* this is based on snd_fetch in samples.c */
#define CNT extra[1]
#define INDEX extra[2]
#define FIELDS 3
#define SAMPLES list->block->samples
    long i;

    /* First use of s: set up the iterator state once, before the copy loop.
     * Nothing is allocated when no samples are requested. */
    if (n > 0 && !s->extra) {
        s->extra = (int64_t *) malloc(sizeof(s->extra[0]) * FIELDS);
        if (!s->extra) {
            xlfail("memory allocation failure in convolve");
        }
        s->extra[0] = sizeof(s->extra[0]) * FIELDS;
        s->CNT = s->INDEX = 0;
    }

    for (i = 0; i < n; i++) {
        int icnt = (int) s->CNT; /* need this to be int type */
        assert(icnt >= 0);
        if (icnt == s->INDEX) { /* current block used up: get the next one */
            sound_get_next(s, &icnt);
            assert(icnt >= 0);
            s->CNT = icnt; /* save the count back into s->extra */
            s->INDEX = 0;
        }
        /* No range check on the value: a sample of magnitude 2 or more
         * (e.g. from a scaled sound) is valid data and must not abort. */
        x[i] = s->SAMPLES[s->INDEX++] * s->scale;
    }

    D { float big = 0.0;
        for (i = 0; i < n; i++) {
            big = max(big, fabs(x[i]));
            assert(big < 2);
        }
        printf("fill_with_samples n %ld scale %g max %g\n", n, s->scale, big);
    }
}
/* snd_make_convolve -- build the sound that is x_snd convolved with h_snd.
 *
 * h_snd (the impulse response) is read completely, split into L blocks of
 * N samples, each zero-padded to the FFT size 2*N and transformed; the
 * transforms are kept in susp->H. x_snd is read incrementally by the fetch
 * routine.
 *
 * Ownership: this function takes over both sound references. h_snd is
 * released here; x_snd is stored in the suspension and released by
 * convolve_free. On every failure path the references and all memory
 * allocated so far are released before xlfail is called.
 *
 * x_snd: input sound
 * h_snd: impulse response, at most MAX_IR_LEN samples, same sample rate
 * returns: the new sound, with the start time and sample rate of x_snd
 *
 * Errors (reported through xlfail): sample rates differ, impulse response
 * too long, memory allocation failure, FFT initialization failure.
 */
sound_type snd_make_convolve(sound_type x_snd, sound_type h_snd)
{
    convolve_susp_type susp;
    rate_type sr = x_snd->sr;
    time_type t0 = x_snd->t0;
    sample_type scale_factor = 1.0F;
    time_type t0_min = t0;
    int64_t h_len;      /* impulse response length, later the padded table size */
    int64_t n_blocks;   /* number of N-sample blocks covering the impulse response */
    int log_fft_size;   /* log2 of the FFT size */
    long fft_size;
    int i;

    if (sr != h_snd->sr) {
        sound_unref(x_snd);
        sound_unref(h_snd);
        xlfail("convolve requires both inputs to have the same sample rates");
    }

    /* compute the length of h_snd in samples */
    h_len = snd_length(h_snd, MAX_IR_LEN + 1);
    if (h_len > MAX_IR_LEN) {
        char emsg[100];
        snprintf(emsg, sizeof(emsg),
                 "convolve maximum impulse length is %d", MAX_IR_LEN);
        sound_unref(x_snd);
        sound_unref(h_snd);
        xlfail(emsg);
    }

    /* h_len is the impulse response length; the FFT size is at least
     * double that, limited to 1 << MAX_LOG_FFT_SIZE. The power of two is
     * found with integer arithmetic, so exact powers of two are not bumped
     * up by floating-point rounding and h_len == 0 is well defined. */
    if (h_len <= (1L << MAX_LOG_FFT_SIZE) / 4) {
        int log_len = 0;
        while (((int64_t) 1 << log_len) < h_len) {
            log_len++; /* round up to power of 2 */
        }
        log_fft_size = log_len + 1;
    } else {
        log_fft_size = MAX_LOG_FFT_SIZE;
    }
    fft_size = 1L << log_fft_size;
    D printf("fft_size %ld\n", fft_size);

    /* all argument checks passed: now allocate the suspension */
    falloc_generic(susp, convolve_susp_node, "snd_make_convolve");
    susp->M = log_fft_size;
    susp->N = (int) (fft_size / 2);
    susp->h_snd_len = (int) h_len;

    /* round h_len up to a multiple of N and multiply by 2. At least one
     * block is always used: the fetch routine computes indices modulo L,
     * so L must never be zero, even for an empty impulse response. */
    n_blocks = (h_len + susp->N - 1) / susp->N;
    if (n_blocks < 1) {
        n_blocks = 1;
    }
    h_len = n_blocks * fft_size;
    susp->L = (int) n_blocks;

    // allocate memory
    susp->H = (sample_type *) calloc((size_t) h_len, sizeof(susp->H[0]));
    if (!susp->H) {
        ffree_generic(susp, sizeof(convolve_susp_node), "snd_make_convolve");
        sound_unref(x_snd);
        sound_unref(h_snd);
        xlfail("memory allocation failure in convolve");
    }
    for (i = 0; i < susp->L; i++) {
        /* copy fft_size/2 samples into each H[i];
         * the remaining N samples are already zero-filled */
        fill_with_samples(susp->H + i * susp->N * 2, h_snd, susp->N);
    }
    D {
        for (i = 0; i < susp->L; i++) {
            int j;
            float *H = susp->H + i * susp->N * 2;
            float big = 0.0;
            printf("H_%d at %td: ", i, H - susp->H);
            printf("%d samples ", susp->N * 2);
            for (j = 0; j < susp->N * 2; j++) {
                big = max(big, fabs(H[j]));
                assert(big < 2);
                // printf("%g ", H[j]);
            }
            printf("big %g\n", big);
        }
    }
    sound_unref(h_snd);
    h_snd = NULL;

    if (fftInit(susp->M)) {
        free(susp->H);
        ffree_generic(susp, sizeof(convolve_susp_node), "snd_make_convolve");
        sound_unref(x_snd);
        xlfail("fft initialization error in convolve");
    }
    /* take the FFT of each block of the impulse response */
    for (i = 0; i < susp->L; i++) {
        rffts(susp->H + i * susp->N * 2, susp->M, 1);
    }

    susp->X = (sample_type *) calloc((size_t) h_len, sizeof(susp->X[0]));
    susp->R = (sample_type *) calloc(fft_size, sizeof(susp->R[0]));
    susp->Y = (sample_type *) calloc(fft_size, sizeof(susp->Y[0]));
    if (!susp->X || !susp->R || !susp->Y) {
        free(susp->H);
        free(susp->X); /* free(NULL) is a no-op */
        free(susp->R);
        free(susp->Y);
        ffree_generic(susp, sizeof(convolve_susp_node), "snd_make_convolve");
        sound_unref(x_snd);
        xlfail("memory allocation failed in convolve");
    }
    /* R_current == R + N makes the first fetch compute a block of output */
    susp->R_current = susp->R + susp->N;
    susp->susp.fetch = &convolve_s_fetch;
    susp->terminate_cnt = UNKNOWN;
    susp->know_end_of_x = FALSE;

    /* handle unequal start times, if any */
    if (t0 < x_snd->t0) sound_prepend_zeros(x_snd, t0);
    /* minimum start time over all inputs: */
    t0_min = min(x_snd->t0, t0);
    /* how many samples to toss before t0: */
    susp->susp.toss_cnt = (long) ((t0 - t0_min) * sr + 0.5);
    if (susp->susp.toss_cnt > 0) {
        susp->susp.keep_fetch = susp->susp.fetch;
        susp->susp.fetch = convolve_toss_fetch;
    }

    /* initialize susp state */
    susp->susp.free = convolve_free;
    susp->susp.sr = sr;
    susp->susp.t0 = t0;
    susp->susp.mark = convolve_mark;
    susp->susp.print_tree = convolve_print_tree;
    susp->susp.name = "convolve";
    susp->logically_stopped = false;
    susp->susp.log_stop_cnt = logical_stop_cnt_cvt(x_snd);
    susp->susp.current = 0;
    susp->x_snd = x_snd;
    susp->x_snd_cnt = 0;
    susp->j = 0;
    return sound_create((snd_susp_type)susp, t0, sr, scale_factor);
}
/* snd_convolve -- public entry point: convolve x_snd with h_snd.
 *
 * Both arguments are copied with sound_copy (x_snd first, then h_snd) and
 * the copies are handed to snd_make_convolve, whose result is returned.
 */
sound_type snd_convolve(sound_type x_snd, sound_type h_snd)
{
    sound_type input_copy;
    sound_type response_copy;

    input_copy = sound_copy(x_snd);
    response_copy = sound_copy(h_snd);
    return snd_make_convolve(input_copy, response_copy);
}