summaryrefslogtreecommitdiff
path: root/ffts/src
diff options
context:
space:
mode:
Diffstat (limited to 'ffts/src')
-rw-r--r--ffts/src/dxpose.c79
-rw-r--r--ffts/src/dxpose.h16
-rw-r--r--ffts/src/fft2d.c402
-rw-r--r--ffts/src/fft2d.h114
-rw-r--r--ffts/src/fftext.c156
-rw-r--r--ffts/src/fftext.h106
-rw-r--r--ffts/src/fftlib.c3174
-rw-r--r--ffts/src/fftlib.h76
-rw-r--r--ffts/src/files22
-rw-r--r--ffts/src/matlib.c297
-rw-r--r--ffts/src/matlib.h33
11 files changed, 4475 insertions, 0 deletions
diff --git a/ffts/src/dxpose.c b/ffts/src/dxpose.c
new file mode 100644
index 0000000..9ee8836
--- /dev/null
+++ b/ffts/src/dxpose.c
@@ -0,0 +1,79 @@
+/*********************
+This matrix transpose is in a seperate file because it should always be double precision.
+*********************/
+#include "dxpose.h"
+void dxpose(xdouble *indata, long iRsiz, xdouble *outdata, long oRsiz, long Nrows, long Ncols){
+/* not in-place double precision matrix transpose */
+/* INPUTS */
+/* *indata = input data array */
+/* iRsiz = offset to between rows of input data array */
+/* oRsiz = offset to between rows of output data array */
+/* Nrows = number of rows in input data array */
+/* Ncols = number of columns in input data array */
+/* OUTPUTS */
+/* *outdata = output data array */
+
+xdouble *irow; /* pointer to input row start */
+xdouble *ocol; /* pointer to output col start */
+xdouble *idata; /* pointer to input data */
+xdouble *odata; /* pointer to output data */
+long RowCnt; /* row counter */
+long ColCnt; /* col counter */
+xdouble T0; /* data storage */
+xdouble T1; /* data storage */
+xdouble T2; /* data storage */
+xdouble T3; /* data storage */
+xdouble T4; /* data storage */
+xdouble T5; /* data storage */
+xdouble T6; /* data storage */
+xdouble T7; /* data storage */
+const long inRsizd1 = iRsiz;
+const long inRsizd2 = 2*iRsiz;
+const long inRsizd3 = inRsizd2+iRsiz;
+const long inRsizd4 = 4*iRsiz;
+const long inRsizd5 = inRsizd3+inRsizd2;
+const long inRsizd6 = inRsizd4+inRsizd2;
+const long inRsizd7 = inRsizd4+inRsizd3;
+const long inRsizd8 = 8*iRsiz;
+
+ocol = outdata;
+irow = indata;
+for (RowCnt=Nrows/8; RowCnt>0; RowCnt--){
+ idata = irow;
+ odata = ocol;
+ for (ColCnt=Ncols; ColCnt>0; ColCnt--){
+ T0 = *idata;
+ T1 = *(idata+inRsizd1);
+ T2 = *(idata+inRsizd2);
+ T3 = *(idata+inRsizd3);
+ T4 = *(idata+inRsizd4);
+ T5 = *(idata+inRsizd5);
+ T6 = *(idata+inRsizd6);
+ T7 = *(idata+inRsizd7);
+ *odata = T0;
+ *(odata+1) = T1;
+ *(odata+2) = T2;
+ *(odata+3) = T3;
+ *(odata+4) = T4;
+ *(odata+5) = T5;
+ *(odata+6) = T6;
+ *(odata+7) = T7;
+ idata++;
+ odata += oRsiz;
+ }
+ irow += inRsizd8;
+ ocol += 8;
+}
+if (Nrows%8 != 0){
+ for (ColCnt=Ncols; ColCnt>0; ColCnt--){
+ idata = irow++;
+ odata = ocol;
+ ocol += oRsiz;
+ for (RowCnt=Nrows%8; RowCnt>0; RowCnt--){
+ T0 = *idata;
+ *odata++ = T0;
+ idata += iRsiz;
+ }
+ }
+}
+}
diff --git a/ffts/src/dxpose.h b/ffts/src/dxpose.h
new file mode 100644
index 0000000..730f81f
--- /dev/null
+++ b/ffts/src/dxpose.h
@@ -0,0 +1,16 @@
+/*********************
+This matrix transpose is in a seperate file because it should always be double precision.
+*********************/
+typedef double_t xdouble; // I use double_t so that global search and replace on double won't
+ // change this to float accidentally.
+
+void dxpose(xdouble *indata, long iRsiz, xdouble *outdata, long oRsiz, long Nrows, long Ncols);
+/* not in-place double precision matrix transpose */
+/* INPUTS */
+/* *indata = input data array */
+/* iRsiz = offset to between rows of input data array */
+/* oRsiz = offset to between rows of output data array */
+/* Nrows = number of rows in input data array */
+/* Ncols = number of columns in input data array */
+/* OUTPUTS */
+/* *outdata = output data array */
diff --git a/ffts/src/fft2d.c b/ffts/src/fft2d.c
new file mode 100644
index 0000000..0d8c3d8
--- /dev/null
+++ b/ffts/src/fft2d.c
@@ -0,0 +1,402 @@
+/*******************************************************************
+ This file extends the fftlib with 2d and 3d complex fft's and
+ 2d real fft's. All fft's return results in-place. Temporary buffers
+ for transposing columns are maintained privately via calls to
+ fft2dInit, fft2dFree, fft3dInit, and fft3dFree.
+ Note that you can call fft2dInit and fft3dInit repeatedly
+ with the same sizes, the extra calls will be ignored.
+ So, you could make a macro to call fft2dInit every time you call fft2d.
+ *** Warning *** fft2dFree and fft3dFree also call fftFree
+ so you must re-init all 1d fft sizes you are going to continue using
+*******************************************************************/
+#include <stdlib.h>
+#include <fp.h>
+#include "fftlib.h"
+#include "fftext.h"
+#include "matlib.h"
+#include "dxpose.h"
+#include "fft2d.h"
+ // use trick of using a real double transpose in place of a complex transpose if it fits
+#define cxpose(a,b,c,d,e,f) (2*sizeof(float)==sizeof(xdouble)) ? dxpose((xdouble *)(a), b, (xdouble *)(c), d, e, f) : cxpose(a,b,c,d,e,f);
+ // for this trick to work you must NOT replace the xdouble declarations in
+ // dxpose with float declarations.
+
+
+ // pointers for temporary storage for four columns
+static float *Array2d[8*sizeof(long)] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0};
+int fft2dInit(long M2, long M){
+ // init for fft2d, ifft2d, rfft2d, and rifft2d
+ // malloc storage for 4 columns of 2d ffts then call fftinit for both row and column ffts sizes
+/* INPUTS */
+/* M = log2 of number of columns */
+/* M2 = log2 of number of rows */
+/* of 2d matrix to be fourier transformed */
+/* OUTPUTS */
+/* private storage for columns of 2d ffts */
+/* calls fftInit for cosine and bit reversed tables */
+int theError = 1;
+if ((M2 >= 0) && (M2 < 8*sizeof(long))){
+ theError = 0;
+ if (Array2d[M2] == 0){
+ Array2d[M2] = (float *) malloc( 4*2*POW2(M2)*sizeof(float) );
+ if (Array2d[M2] == 0)
+ theError = 2;
+ else{
+ theError = fftInit(M2);
+ }
+ }
+ if (theError == 0)
+ theError = fftInit(M);
+}
+return theError;
+}
+
+void fft2dFree(){
+// free storage for columns of 2d ffts and call fftFree to free all BRLow and Utbl storage
+long i1;
+for (i1=8*sizeof(long)-1; i1>=0; i1--){
+ if (Array2d[i1] != 0){
+ free(Array2d[i1]);
+ Array2d[i1] = 0;
+ };
+};
+fftFree();
+}
+
+void fft2d(float *data, long M2, long M){
+/* Compute 2D complex fft and return results in-place */
+/* INPUTS */
+/* *data = input data array */
+/* M2 = log2 of fft size number of rows */
+/* M = log2 of fft size number of columns */
+/* OUTPUTS */
+/* *data = output data array */
+long i1;
+if((M2>0)&&(M>0)){
+ ffts(data, M, POW2(M2));
+ if (M>2)
+ for (i1=0; i1<POW2(M); i1+=4){
+ cxpose(data + i1*2, POW2(M), Array2d[M2], POW2(M2), POW2(M2), 4);
+ ffts(Array2d[M2], M2, 4);
+ cxpose(Array2d[M2], POW2(M2), data + i1*2, POW2(M), 4, POW2(M2));
+ }
+ else{
+ cxpose(data, POW2(M), Array2d[M2], POW2(M2), POW2(M2), POW2(M));
+ ffts(Array2d[M2], M2, POW2(M));
+ cxpose(Array2d[M2], POW2(M2), data, POW2(M), POW2(M), POW2(M2));
+ }
+}
+else
+ ffts(data, M2+M, 1);
+}
+
+void ifft2d(float *data, long M2, long M){
+/* Compute 2D complex ifft and return results in-place */
+/* INPUTS */
+/* *data = input data array */
+/* M2 = log2 of fft size number of rows */
+/* M = log2 of fft size number of columns */
+/* OUTPUTS */
+/* *data = output data array */
+long i1;
+if((M2>0)&&(M>0)){
+ iffts(data, M, POW2(M2));
+ if (M>2)
+ for (i1=0; i1<POW2(M); i1+=4){
+ cxpose(data + i1*2, POW2(M), Array2d[M2], POW2(M2), POW2(M2), 4);
+ iffts(Array2d[M2], M2, 4);
+ cxpose(Array2d[M2], POW2(M2), data + i1*2, POW2(M), 4, POW2(M2));
+ }
+ else{
+ cxpose(data, POW2(M), Array2d[M2], POW2(M2), POW2(M2), POW2(M));
+ iffts(Array2d[M2], M2, POW2(M));
+ cxpose(Array2d[M2], POW2(M2), data, POW2(M), POW2(M), POW2(M2));
+ }
+}
+else
+ iffts(data, M2+M, 1);
+}
+
+int fft3dInit(long L, long M2, long M){
+ // init for fft3d, ifft3d
+ // malloc storage for 4 columns and 4 pages of 3d ffts
+ // then call fftinit for page, row and column ffts sizes
+//* M = log2 of number of columns */
+/* M2 = log2 of number of rows */
+/* L = log2 of number of pages */
+/* of 3d matrix to be fourier transformed */
+/* OUTPUTS */
+/* private storage for columns and pages of 3d ffts */
+/* calls fftInit for cosine and bit reversed tables */
+int theError = 1;
+if ((L >= 0) && (L < 8*sizeof(long))){
+ theError = 0;
+ if (Array2d[L] == 0){
+ Array2d[L] = (float *) malloc( 4*2*POW2(L)*sizeof(float) );
+ if (Array2d[L] == 0)
+ theError = 2;
+ else{
+ theError = fftInit(L);
+ }
+ }
+ if (theError == 0){
+ if (Array2d[M2] == 0){
+ Array2d[M2] = (float *) malloc( 4*2*POW2(M2)*sizeof(float) );
+ if (Array2d[M2] == 0)
+ theError = 2;
+ else{
+ theError = fftInit(M2);
+ }
+ }
+ }
+ if (theError == 0)
+ theError = fftInit(M);
+}
+return theError;
+}
+
+void fft3dFree(){
+// free storage for columns of all 2d&3d ffts and call fftFree to free all BRLow and Utbl storage
+fft2dFree();
+}
+
+void fft3d(float *data, long M3, long M2, long M){
+/* Compute 2D complex fft and return results in-place */
+/* INPUTS */
+/* *data = input data array */
+/* M3 = log2 of fft size number of pages */
+/* M2 = log2 of fft size number of rows */
+/* M = log2 of fft size number of columns */
+/* OUTPUTS */
+/* *data = output data array */
+long i1;
+long i2;
+const long N = POW2(M);
+const long N2 = POW2(M2);
+const long N3 = POW2(M3);
+if((M3>0)&&(M2>0)&&(M>0)){
+ ffts(data, M, N3*N2);
+ if (M>2)
+ for (i2=0; i2<N3; i2++){
+ for (i1=0; i1<N; i1+=4){
+ cxpose(data + i2*2*POW2(M2+M) + i1*2, N, Array2d[M2], N2, N2, 4);
+ ffts(Array2d[M2], M2, 4);
+ cxpose(Array2d[M2], N2, data + i2*2*POW2(M2+M) + i1*2, N, 4, N2);
+ }
+ }
+ else{
+ for (i2=0; i2<N3; i2++){
+ cxpose(data + i2*2*POW2(M2+M), N, Array2d[M2], N2, N2, N);
+ ffts(Array2d[M2], M2, N);
+ cxpose(Array2d[M2], N2, data + i2*2*POW2(M2+M), N, N, N2);
+ }
+ }
+ if ((M2+M)>2)
+ for (i1=0; i1<POW2(M2+M); i1+=4){
+ cxpose(data + i1*2, POW2(M2+M), Array2d[M3], N3, N3, 4);
+ ffts(Array2d[M3], M3, 4);
+ cxpose(Array2d[M3], N3, data + i1*2, POW2(M2+M), 4, N3);
+ }
+ else{
+ cxpose(data, POW2(M2+M), Array2d[M3], N3, N3, POW2(M2+M));
+ ffts(Array2d[M3], M3, POW2(M2+M));
+ cxpose(Array2d[M3], N3, data, POW2(M2+M), POW2(M2+M), N3);
+ }
+}
+else
+ if(M3==0) fft2d(data, M2, M);
+ else
+ if(M2==0) fft2d(data, M3, M);
+ else
+ if(M==0) fft2d(data, M3, M2);
+}
+
+void ifft3d(float *data, long M3, long M2, long M){
+/* Compute 2D complex ifft and return results in-place */
+/* INPUTS */
+/* *data = input data array */
+/* M3 = log2 of fft size number of pages */
+/* M2 = log2 of fft size number of rows */
+/* M = log2 of fft size number of columns */
+/* OUTPUTS */
+/* *data = output data array */
+long i1;
+long i2;
+const long N = POW2(M);
+const long N2 = POW2(M2);
+const long N3 = POW2(M3);
+if((M3>0)&&(M2>0)&&(M>0)){
+ iffts(data, M, N3*N2);
+ if (M>2)
+ for (i2=0; i2<N3; i2++){
+ for (i1=0; i1<N; i1+=4){
+ cxpose(data + i2*2*POW2(M2+M) + i1*2, N, Array2d[M2], N2, N2, 4);
+ iffts(Array2d[M2], M2, 4);
+ cxpose(Array2d[M2], N2, data + i2*2*POW2(M2+M) + i1*2, N, 4, N2);
+ }
+ }
+ else{
+ for (i2=0; i2<N3; i2++){
+ cxpose(data + i2*2*POW2(M2+M), N, Array2d[M2], N2, N2, N);
+ iffts(Array2d[M2], M2, N);
+ cxpose(Array2d[M2], N2, data + i2*2*POW2(M2+M), N, N, N2);
+ }
+ }
+ if ((M2+M)>2)
+ for (i1=0; i1<POW2(M2+M); i1+=4){
+ cxpose(data + i1*2, POW2(M2+M), Array2d[M3], N3, N3, 4);
+ iffts(Array2d[M3], M3, 4);
+ cxpose(Array2d[M3], N3, data + i1*2, POW2(M2+M), 4, N3);
+ }
+ else{
+ cxpose(data, POW2(M2+M), Array2d[M3], N3, N3, POW2(M2+M));
+ iffts(Array2d[M3], M3, POW2(M2+M));
+ cxpose(Array2d[M3], N3, data, POW2(M2+M), POW2(M2+M), N3);
+ }
+}
+else
+ if(M3==0) ifft2d(data, M2, M);
+ else
+ if(M2==0) ifft2d(data, M3, M);
+ else
+ if(M==0) ifft2d(data, M3, M2);
+}
+
+void rfft2d(float *data, long M2, long M){
+/* Compute 2D real fft and return results in-place */
+/* First performs real fft on rows using size from M to compute positive frequencies */
+/* then performs transform on columns using size from M2 to compute wavenumbers */
+/* If you think of the result as a complex pow(2,M2) by pow(2,M-1) matrix */
+/* then the first column contains the positive wavenumber spectra of DC frequency */
+/* followed by the positive wavenumber spectra of the nyquest frequency */
+/* since these are two positive wavenumber spectra the first complex value */
+/* of each is really the real values for the zero and nyquest wavenumber packed together */
+/* All other columns contain the positive and negative wavenumber spectra of a positive frequency */
+/* See rspect2dprod for multiplying two of these spectra together- ex. for fast convolution */
+/* INPUTS */
+/* *data = input data array */
+/* M2 = log2 of fft size number of rows in */
+/* M = log2 of fft size number of columns in */
+/* OUTPUTS */
+/* *data = output data array */
+long i1;
+if((M2>0)&&(M>0)){
+ rffts(data, M, POW2(M2));
+ if (M==1){
+ cxpose(data, POW2(M)/2, Array2d[M2]+POW2(M2)*2, POW2(M2), POW2(M2), 1);
+ xpose(Array2d[M2]+POW2(M2)*2, 2, Array2d[M2], POW2(M2), POW2(M2), 2);
+ rffts(Array2d[M2], M2, 2);
+ cxpose(Array2d[M2], POW2(M2), data, POW2(M)/2, 1, POW2(M2));
+ }
+ else if (M==2){
+ cxpose(data, POW2(M)/2, Array2d[M2]+POW2(M2)*2, POW2(M2), POW2(M2), 1);
+ xpose(Array2d[M2]+POW2(M2)*2, 2, Array2d[M2], POW2(M2), POW2(M2), 2);
+ rffts(Array2d[M2], M2, 2);
+ cxpose(Array2d[M2], POW2(M2), data, POW2(M)/2, 1, POW2(M2));
+
+ cxpose(data + 2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1);
+ ffts(Array2d[M2], M2, 1);
+ cxpose(Array2d[M2], POW2(M2), data + 2, POW2(M)/2, 1, POW2(M2));
+ }
+ else{
+ cxpose(data, POW2(M)/2, Array2d[M2]+POW2(M2)*2, POW2(M2), POW2(M2), 1);
+ xpose(Array2d[M2]+POW2(M2)*2, 2, Array2d[M2], POW2(M2), POW2(M2), 2);
+ rffts(Array2d[M2], M2, 2);
+ cxpose(Array2d[M2], POW2(M2), data, POW2(M)/2, 1, POW2(M2));
+
+ cxpose(data + 2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 3);
+ ffts(Array2d[M2], M2, 3);
+ cxpose(Array2d[M2], POW2(M2), data + 2, POW2(M)/2, 3, POW2(M2));
+ for (i1=4; i1<POW2(M)/2; i1+=4){
+ cxpose(data + i1*2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 4);
+ ffts(Array2d[M2], M2, 4);
+ cxpose(Array2d[M2], POW2(M2), data + i1*2, POW2(M)/2, 4, POW2(M2));
+ }
+ }
+}
+else
+ rffts(data, M2+M, 1);
+}
+
+void rifft2d(float *data, long M2, long M){
+/* Compute 2D real ifft and return results in-place */
+/* The input must be in the order as outout from rfft2d */
+/* INPUTS */
+/* *data = input data array */
+/* M2 = log2 of fft size number of rows out */
+/* M = log2 of fft size number of columns out */
+/* OUTPUTS */
+/* *data = output data array */
+long i1;
+if((M2>0)&&(M>0)){
+ if (M==1){
+ cxpose(data, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1);
+ riffts(Array2d[M2], M2, 2);
+ xpose(Array2d[M2], POW2(M2), Array2d[M2]+POW2(M2)*2, 2, 2, POW2(M2));
+ cxpose(Array2d[M2]+POW2(M2)*2, POW2(M2), data, POW2(M)/2, 1, POW2(M2));
+ }
+ else if (M==2){
+ cxpose(data, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1);
+ riffts(Array2d[M2], M2, 2);
+ xpose(Array2d[M2], POW2(M2), Array2d[M2]+POW2(M2)*2, 2, 2, POW2(M2));
+ cxpose(Array2d[M2]+POW2(M2)*2, POW2(M2), data, POW2(M)/2, 1, POW2(M2));
+
+ cxpose(data + 2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1);
+ iffts(Array2d[M2], M2, 1);
+ cxpose(Array2d[M2], POW2(M2), data + 2, POW2(M)/2, 1, POW2(M2));
+ }
+ else{
+ cxpose(data, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1);
+ riffts(Array2d[M2], M2, 2);
+ xpose(Array2d[M2], POW2(M2), Array2d[M2]+POW2(M2)*2, 2, 2, POW2(M2));
+ cxpose(Array2d[M2]+POW2(M2)*2, POW2(M2), data, POW2(M)/2, 1, POW2(M2));
+
+ cxpose(data + 2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 3);
+ iffts(Array2d[M2], M2, 3);
+ cxpose(Array2d[M2], POW2(M2), data + 2, POW2(M)/2, 3, POW2(M2));
+ for (i1=4; i1<POW2(M)/2; i1+=4){
+ cxpose(data + i1*2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 4);
+ iffts(Array2d[M2], M2, 4);
+ cxpose(Array2d[M2], POW2(M2), data + i1*2, POW2(M)/2, 4, POW2(M2));
+ }
+ }
+ riffts(data, M, POW2(M2));
+}
+else
+ riffts(data, M2+M, 1);
+}
+
+void rspect2dprod(float *data1, float *data2, float *outdata, long N2, long N1){
+// When multiplying a pair of 2d spectra from rfft2d care must be taken to multiply the
+// four real values seperately from the complex ones. This routine does it correctly.
+// the result can be stored in-place over one of the inputs
+/* *data1 = input data array first spectra */
+/* *data2 = input data array second spectra */
+/* N2 = fft size number of rows into rfft2d for both data1 and data2 */
+/* N1 = fft size number of columns into rfft2d for both data1 and data2 */
+long N = N2 * N1/2; // number of "complex" numbers in spectra
+
+if( (N2 > 1) && (N1>1)){
+ outdata[0] = data1[0] * data2[0]; // multiply the zero freq, zero wavenumber values
+ outdata[1] = data1[1] * data2[1]; // multiply the zero freq, nyquest wavenumber values
+
+ cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1);
+
+ outdata[N] = data1[N] * data2[N]; // multiply the nyquest freq, zero wavenumber values
+ outdata[N+1] = data1[N+1] * data2[N+1]; // multiply the nyquest freq, nyquest wavenumber values
+
+ cvprod(data1 + N+2, data2 + N+2, outdata + N+2, N/2-1);
+}
+else{ // really 1D rfft spectra
+ N = N2 * N1; // one of these is a 1
+ if(N>1){
+ outdata[0] = data1[0] * data2[0]; // multiply the zero freq values
+ outdata[1] = data1[1] * data2[1]; // multiply the nyquest freq values
+ cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1); // multiply the other positive freq values
+ }
+ else{
+ outdata[0] = data1[0] * data2[0];
+ }
+}
+} \ No newline at end of file
diff --git a/ffts/src/fft2d.h b/ffts/src/fft2d.h
new file mode 100644
index 0000000..fd074ff
--- /dev/null
+++ b/ffts/src/fft2d.h
@@ -0,0 +1,114 @@
+/*******************************************************************
+ This file extends the fftlib with 2d and 3d complex fft's and
+ 2d real fft's. All fft's return results in-place. Temporary buffers
+ for transposing columns are maintained privately via calls to
+ fft2dInit, fft2dFree, fft3dInit, and fft3dFree.
+ Note that you can call fft2dInit and fft3dInit repeatedly
+ with the same sizes, the extra calls will be ignored.
+ So, you could make a macro to call fft2dInit every time you call fft2d.
+ *** Warning *** fft2dFree and fft3dFree also call fftFree
+ so you must re-init all 1d fft sizes you are going to continue using
+*******************************************************************/
+int fft2dInit(long M2, long M);
+ // init for fft2d, ifft2d, rfft2d, and rifft2d
+ // malloc storage for columns of 2d ffts then call fftinit for both row and column ffts sizes
+/* INPUTS */
+/* M = log2 of number of columns */
+/* M2 = log2 of number of rows */
+/* of 2d matrix to be fourier transformed */
+/* OUTPUTS */
+/* private storage for columns of 2d ffts */
+/* calls fftInit for cosine and bit reversed tables */
+
+void fft2dFree();
+// free storage for columns of all 2d&3d ffts and call fftFree to free all BRLow and Utbl storage
+
+void fft2d(float *data, long M2, long M);
+/* Compute 2D complex fft and return results in-place */
+/* INPUTS */
+/* *data = input data array */
+/* M2 = log2 of fft size number of rows */
+/* M = log2 of fft size number of columns */
+/* OUTPUTS */
+/* *data = output data array */
+
+void ifft2d(float *data, long M2, long M);
+/* Compute 2D complex ifft and return results in-place */
+/* INPUTS */
+/* *data = input data array */
+/* M2 = log2 of fft size number of rows */
+/* M = log2 of fft size number of columns */
+/* OUTPUTS */
+/* *data = output data array */
+
+int fft3dInit(long L, long M2, long M);
+ // init for fft3d, ifft3d
+ // malloc storage for 4 columns and 4 pages of 3d ffts
+ // then call fftinit for page, row and column ffts sizes
+/* M = log2 of number of columns */
+/* M2 = log2 of number of rows */
+/* L = log2 of number of pages */
+/* of 3d matrix to be fourier transformed */
+/* OUTPUTS */
+/* private storage for columns and pages of 3d ffts */
+/* calls fftInit for cosine and bit reversed tables */
+
+void fft3dFree();
+// free storage for columns of all 2d&3d ffts and call fftFree to free all BRLow and Utbl storage
+
+void fft3d(float *data, long M3, long M2, long M);
+/* Compute 2D complex fft and return results in-place */
+/* INPUTS */
+/* *data = input data array */
+/* M3 = log2 of fft size number of pages */
+/* M2 = log2 of fft size number of rows */
+/* M = log2 of fft size number of columns */
+/* OUTPUTS */
+/* *data = output data array */
+
+void ifft3d(float *data, long M3, long M2, long M);
+/* Compute 2D complex ifft and return results in-place */
+/* INPUTS */
+/* *data = input data array */
+/* M3 = log2 of fft size number of pages */
+/* M2 = log2 of fft size number of rows */
+/* M = log2 of fft size number of columns */
+/* OUTPUTS */
+/* *data = output data array */
+
+void rfft2d(float *data, long M2, long M);
+/* Compute 2D real fft and return results in-place */
+/* First performs real fft on rows using size from M to compute positive frequencies */
+/* then performs transform on columns using size from M2 to compute wavenumbers */
+/* If you think of the result as a complex pow(2,M2) by pow(2,M-1) matrix */
+/* then the first column contains the positive wavenumber spectra of DC frequency */
+/* followed by the positive wavenumber spectra of the nyquest frequency */
+/* since these are two positive wavenumber spectra the first complex value */
+/* of each is really the real values for the zero and nyquest wavenumber packed together */
+/* All other columns contain the positive and negative wavenumber spectra of a positive frequency */
+/* See rspect2dprod for multiplying two of these spectra together- ex. for fast convolution */
+/* INPUTS */
+/* *data = input data array */
+/* M2 = log2 of fft size number of rows in */
+/* M = log2 of fft size number of columns in */
+/* OUTPUTS */
+/* *data = output data array */
+
+void rifft2d(float *data, long M2, long M);
+/* Compute 2D real ifft and return results in-place */
+/* The input must be in the order as outout from rfft2d */
+/* INPUTS */
+/* *data = input data array */
+/* M2 = log2 of fft size number of rows out */
+/* M = log2 of fft size number of columns out */
+/* OUTPUTS */
+/* *data = output data array */
+
+void rspect2dprod(float *data1, float *data2, float *outdata, long N2, long N1);
+// When multiplying a pair of 2d spectra from rfft2d care must be taken to multiply the
+// four real values seperately from the complex ones. This routine does it correctly.
+// the result can be stored in-place over one of the inputs
+/* *data1 = input data array first spectra */
+/* *data2 = input data array second spectra */
+/* N2 = fft size number of rows into rfft2d for both data1 and data2 */
+/* N1 = fft size number of columns into rfft2d for both data1 and data2 */
diff --git a/ffts/src/fftext.c b/ffts/src/fftext.c
new file mode 100644
index 0000000..a39911c
--- /dev/null
+++ b/ffts/src/fftext.c
@@ -0,0 +1,156 @@
+/*******************************************************************
+ This file extends the fftlib with calls to maintain the cosine and bit reversed tables
+ for you (including mallocs and free's). Call the init routine for each fft size you will
+ be using. Then you can call the fft routines below which will make the fftlib library
+ call with the appropriate tables passed. When you are done with all fft's you can call
+ fftfree to release the storage for the tables. Note that you can call fftinit repeatedly
+ with the same size, the extra calls will be ignored. So, you could make a macro to call
+ fftInit every time you call ffts. For example you could have someting like:
+ #define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n");
+*******************************************************************/
+#include <stdlib.h>
+#include "fftlib.h"
+#include "matlib.h"
+#include "fftext.h"
+
+// pointers to storage of Utbl's and BRLow's
+static float *UtblArray[8*sizeof(long)] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0};
+static short *BRLowArray[8*sizeof(long)/2] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0};
+
+int fftInit(long M){
+// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft
+/* INPUTS */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* OUTPUTS */
+/* private cosine and bit reversed tables */
+
+int theError = 1;
+/*** I did NOT test cases with M>27 ***/
+if ((M >= 0) && (M < 8*sizeof(long))){
+ theError = 0;
+ if (UtblArray[M] == 0){ // have we not inited this size fft yet?
+ // init cos table
+ UtblArray[M] = (float *) malloc( (POW2(M)/4+1)*sizeof(float) );
+ if (UtblArray[M] == 0)
+ theError = 2;
+ else{
+ fftCosInit(M, UtblArray[M]);
+ }
+ if (M > 1){
+ if (BRLowArray[M/2] == 0){ // init bit reversed table for cmplx fft
+ BRLowArray[M/2] = (short *) malloc( POW2(M/2-1)*sizeof(short) );
+ if (BRLowArray[M/2] == 0)
+ theError = 2;
+ else{
+ fftBRInit(M, BRLowArray[M/2]);
+ }
+ }
+ }
+ if (M > 2){
+ if (BRLowArray[(M-1)/2] == 0){ // init bit reversed table for real fft
+ BRLowArray[(M-1)/2] = (short *) malloc( POW2((M-1)/2-1)*sizeof(short) );
+ if (BRLowArray[(M-1)/2] == 0)
+ theError = 2;
+ else{
+ fftBRInit(M-1, BRLowArray[(M-1)/2]);
+ }
+ }
+ }
+ }
+};
+return theError;
+}
+
+void fftFree(){
+// release storage for all private cosine and bit reversed tables
+long i1;
+for (i1=8*sizeof(long)/2-1; i1>=0; i1--){
+ if (BRLowArray[i1] != 0){
+ free(BRLowArray[i1]);
+ BRLowArray[i1] = 0;
+ };
+};
+for (i1=8*sizeof(long)-1; i1>=0; i1--){
+ if (UtblArray[i1] != 0){
+ free(UtblArray[i1]);
+ UtblArray[i1] = 0;
+ };
+};
+}
+
+/*************************************************
+ The following calls are easier than calling to fftlib directly.
+ Just make sure fftlib has been called for each M first.
+**************************************************/
+
+void ffts(float *data, long M, long Rows){
+/* Compute in-place complex fft on the rows of the input array */
+/* INPUTS */
+/* *ioptr = input data array */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */
+/* OUTPUTS */
+/* *ioptr = output data array */
+ ffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]);
+}
+
+void iffts(float *data, long M, long Rows){
+/* Compute in-place inverse complex fft on the rows of the input array */
+/* INPUTS */
+/* *ioptr = input data array */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */
+/* OUTPUTS */
+/* *ioptr = output data array */
+ iffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]);
+}
+
+void rffts(float *data, long M, long Rows){
+/* Compute in-place real fft on the rows of the input array */
+/* The result is the complex spectra of the positive frequencies */
+/* except the location for the first complex number contains the real */
+/* values for DC and Nyquest */
+/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */
+/* INPUTS */
+/* *ioptr = real input data array */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */
+/* OUTPUTS */
+/* *ioptr = output data array in the following order */
+/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
+ rffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]);
+}
+
+void riffts(float *data, long M, long Rows){
+/* Compute in-place real ifft on the rows of the input array */
+/* data order as from rffts */
+/* INPUTS */
+/* *ioptr = input data array in the following order */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
+/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */
+/* OUTPUTS */
+/* *ioptr = real output data array */
+ riffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]);
+}
+
+void rspectprod(float *data1, float *data2, float *outdata, long N){
+// When multiplying a pair of spectra from rfft care must be taken to multiply the
+// two real values seperately from the complex ones. This routine does it correctly.
+// the result can be stored in-place over one of the inputs
+/* INPUTS */
+/* *data1 = input data array first spectra */
+/* *data2 = input data array second spectra */
+/* N = fft input size for both data1 and data2 */
+/* OUTPUTS */
+/* *outdata = output data array spectra */
+if(N>1){
+ outdata[0] = data1[0] * data2[0]; // multiply the zero freq values
+ outdata[1] = data1[1] * data2[1]; // multiply the nyquest freq values
+ cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1); // multiply the other positive freq values
+}
+else{
+ outdata[0] = data1[0] * data2[0];
+}
+}
diff --git a/ffts/src/fftext.h b/ffts/src/fftext.h
new file mode 100644
index 0000000..15d8a6b
--- /dev/null
+++ b/ffts/src/fftext.h
@@ -0,0 +1,106 @@
+/*******************************************************************
+ This file extends the fftlib with calls to maintain the cosine and bit reversed tables
+ for you (including mallocs and free's). Call the init routine for each fft size you will
+ be using. Then you can call the fft routines below which will make the fftlib library
+ call with the appropriate tables passed. When you are done with all fft's you can call
+ fftfree to release the storage for the tables. Note that you can call fftinit repeatedly
+ with the same size, the extra calls will be ignored. So, you could make a macro to call
+ fftInit every time you call ffts. For example you could have someting like:
+ #define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n");
+*******************************************************************/
+
+int fftInit(long M);
+// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft
+/* INPUTS */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* OUTPUTS */
+/* private cosine and bit reversed tables */
+
+void fftFree();
+// release storage for all private cosine and bit reversed tables
+
+void ffts(float *data, long M, long Rows);
+/* Compute in-place complex fft on the rows of the input array */
+/* INPUTS */
+/* *ioptr = input data array */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */
+/* OUTPUTS */
+/* *ioptr = output data array */
+
+void iffts(float *data, long M, long Rows);
+/* Compute in-place inverse complex fft on the rows of the input array */
+/* INPUTS */
+/* *ioptr = input data array */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */
+/* OUTPUTS */
+/* *ioptr = output data array */
+
+void rffts(float *data, long M, long Rows);
+/* Compute in-place real fft on the rows of the input array */
+/* The result is the complex spectra of the positive frequencies */
+/* except the location for the first complex number contains the real */
+/* values for DC and Nyquest */
+/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */
+/* INPUTS */
+/* *ioptr = real input data array */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */
+/* OUTPUTS */
+/* *ioptr = output data array in the following order */
+/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
+
+void riffts(float *data, long M, long Rows);
+/* Compute in-place real ifft on the rows of the input array */
+/* data order as from rffts */
+/* INPUTS */
+/* *ioptr = input data array in the following order */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
+/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */
+/* OUTPUTS */
+/* *ioptr = real output data array */
+
+void rspectprod(float *data1, float *data2, float *outdata, long N);
+// When multiplying a pair of spectra from rfft care must be taken to multiply the
+// two real values seperately from the complex ones. This routine does it correctly.
+// the result can be stored in-place over one of the inputs
+/* INPUTS */
+/* *data1 = input data array first spectra */
+/* *data2 = input data array second spectra */
+/* N = fft input size for both data1 and data2 */
+/* OUTPUTS */
+/* *outdata = output data array spectra */
+
+
+// The following is FYI
+
+
+//Note that most of the fft routines require full matrices, ie Rsiz==Ncols
+//This is how I like to define a real matrix:
+//struct matrix { // real matrix
+// float *d; // pointer to data
+// long Nrows; // number of rows in the matrix
+// long Ncols; // number of columns in the matrix (can be less than Rsiz)
+// long Rsiz; // number of floats from one row to the next
+//};
+//typedef struct matrix matrix;
+
+
+
+// CACHEFILLMALLOC and CEILCACHELINE can be used instead of malloc to make
+// arrays that start exactly on a cache line start.
+// First we CACHEFILLMALLOC a void * (use this void * when free'ing),
+// then we set our array pointer equal to the properly cast CEILCACHELINE of this void *
+// example:
+// aInit = CACHEFILLMALLOC( NUMFLOATS*sizeof(float) );
+// a = (float *) CEILCACHELINE(ainit);
+// ... main body of code ...
+// free(aInit);
+//
+// To disable this alignment, set CACHELINESIZE to 1
+//#define CACHELINESIZE 32 // Bytes per cache line
+//#define CACHELINEFILL (CACHELINESIZE-1)
+//#define CEILCACHELINE(p) ((((unsigned long)p+CACHELINEFILL)/CACHELINESIZE)*CACHELINESIZE)
+//#define CACHEFILLMALLOC(n) malloc((n)+CACHELINEFILL)
diff --git a/ffts/src/fftlib.c b/ffts/src/fftlib.c
new file mode 100644
index 0000000..d832f89
--- /dev/null
+++ b/ffts/src/fftlib.c
@@ -0,0 +1,3174 @@
+/*******************************************************************
+lower level fft stuff including routines called in fftext.c and fft2d.c
+*******************************************************************/
+#include "fftlib.h"
+#include <math.h>
+#define MCACHE (11-(sizeof(float)/8)) // fft's with M bigger than this bust primary cache
+#ifdef WIN32
+#define inline __inline
+#endif
+
+// some math constants to 40 decimal places
+#define MYPI 3.141592653589793238462643383279502884197 // pi
+#define MYROOT2 1.414213562373095048801688724209698078569 // sqrt(2)
+#define MYCOSPID8 0.9238795325112867561281831893967882868224 // cos(pi/8)
+#define MYSINPID8 0.3826834323650897717284599840303988667613 // sin(pi/8)
+
+
+/*************************************************
+routines to initialize tables used by fft routines
+**************************************************/
+
+void fftCosInit(long M, float *Utbl){
+/* Compute Utbl, the cosine table for ffts */
+/* of size (pow(2,M)/4 +1) */
+/* INPUTS */
+/* M = log2 of fft size */
+/* OUTPUTS */
+/* *Utbl = cosine table */
+unsigned long fftN = POW2(M);
+unsigned long i1;
+ Utbl[0] = 1.0;
+ for (i1 = 1; i1 < fftN/4; i1++)
+ Utbl[i1] = cos( (2.0 * MYPI * i1) / fftN );
+ Utbl[fftN/4] = 0.0;
+}
+
+void fftBRInit(long M, short *BRLow){
+/* Compute BRLow, the bit reversed table for ffts */
+/* of size pow(2,M/2 -1) */
+/* INPUTS */
+/* M = log2 of fft size */
+/* OUTPUTS */
+/* *BRLow = bit reversed counter table */
+long Mroot_1 = M / 2 - 1;
+long Nroot_1 = POW2(Mroot_1);
+long i1;
+long bitsum;
+long bitmask;
+long bit;
+for (i1 = 0; i1 < Nroot_1; i1++){
+ bitsum = 0;
+ bitmask = 1;
+ for (bit=1; bit <= Mroot_1; bitmask<<=1, bit++)
+ if (i1 & bitmask)
+ bitsum = bitsum + (Nroot_1 >> bit);
+ BRLow[i1] = bitsum;
+};
+}
+
+/************************************************
+parts of ffts1
+*************************************************/
+
+inline void bitrevR2(float *ioptr, long M, short *BRLow);
+inline void bitrevR2(float *ioptr, long M, short *BRLow){
+/*** bit reverse and first radix 2 stage of forward or inverse fft ***/
+float f0r;
+float f0i;
+float f1r;
+float f1i;
+float f2r;
+float f2i;
+float f3r;
+float f3i;
+float f4r;
+float f4i;
+float f5r;
+float f5i;
+float f6r;
+float f6i;
+float f7r;
+float f7i;
+float t0r;
+float t0i;
+float t1r;
+float t1i;
+float *p0r;
+float *p1r;
+float *IOP;
+float *iolimit;
+long Colstart;
+long iCol;
+unsigned long posA;
+unsigned long posAi;
+unsigned long posB;
+unsigned long posBi;
+
+const unsigned long Nrems2 = POW2((M+3)/2);
+const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2;
+const unsigned long Nroot_1 = POW2(M/2-1)-1;
+const unsigned long ColstartShift = (M+1)/2 +1;
+posA = POW2(M); // 1/2 of POW2(M) complexes
+posAi = posA + 1;
+posB = posA + 2;
+posBi = posB + 1;
+
+iolimit = ioptr + Nrems2;
+for (; ioptr < iolimit; ioptr += POW2(M/2+1)){
+ for (Colstart = Nroot_1; Colstart >= 0; Colstart--){
+ iCol = Nroot_1;
+ p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4;
+ IOP = ioptr + (Colstart << ColstartShift);
+ p1r = IOP + BRLow[iCol]*4;
+ f0r = *(p0r);
+ f0i = *(p0r+1);
+ f1r = *(p0r+posA);
+ f1i = *(p0r+posAi);
+ for (; iCol > Colstart;){
+ f2r = *(p0r+2);
+ f2i = *(p0r+(2+1));
+ f3r = *(p0r+posB);
+ f3i = *(p0r+posBi);
+ f4r = *(p1r);
+ f4i = *(p1r+1);
+ f5r = *(p1r+posA);
+ f5i = *(p1r+posAi);
+ f6r = *(p1r+2);
+ f6i = *(p1r+(2+1));
+ f7r = *(p1r+posB);
+ f7i = *(p1r+posBi);
+
+ t0r = f0r + f1r;
+ t0i = f0i + f1i;
+ f1r = f0r - f1r;
+ f1i = f0i - f1i;
+ t1r = f2r + f3r;
+ t1i = f2i + f3i;
+ f3r = f2r - f3r;
+ f3i = f2i - f3i;
+ f0r = f4r + f5r;
+ f0i = f4i + f5i;
+ f5r = f4r - f5r;
+ f5i = f4i - f5i;
+ f2r = f6r + f7r;
+ f2i = f6i + f7i;
+ f7r = f6r - f7r;
+ f7i = f6i - f7i;
+
+ *(p1r) = t0r;
+ *(p1r+1) = t0i;
+ *(p1r+2) = f1r;
+ *(p1r+(2+1)) = f1i;
+ *(p1r+posA) = t1r;
+ *(p1r+posAi) = t1i;
+ *(p1r+posB) = f3r;
+ *(p1r+posBi) = f3i;
+ *(p0r) = f0r;
+ *(p0r+1) = f0i;
+ *(p0r+2) = f5r;
+ *(p0r+(2+1)) = f5i;
+ *(p0r+posA) = f2r;
+ *(p0r+posAi) = f2i;
+ *(p0r+posB) = f7r;
+ *(p0r+posBi) = f7i;
+
+ p0r -= Nrems2;
+ f0r = *(p0r);
+ f0i = *(p0r+1);
+ f1r = *(p0r+posA);
+ f1i = *(p0r+posAi);
+ iCol -= 1;
+ p1r = IOP + BRLow[iCol]*4;
+ };
+ f2r = *(p0r+2);
+ f2i = *(p0r+(2+1));
+ f3r = *(p0r+posB);
+ f3i = *(p0r+posBi);
+
+ t0r = f0r + f1r;
+ t0i = f0i + f1i;
+ f1r = f0r - f1r;
+ f1i = f0i - f1i;
+ t1r = f2r + f3r;
+ t1i = f2i + f3i;
+ f3r = f2r - f3r;
+ f3i = f2i - f3i;
+
+ *(p0r) = t0r;
+ *(p0r+1) = t0i;
+ *(p0r+2) = f1r;
+ *(p0r+(2+1)) = f1i;
+ *(p0r+posA) = t1r;
+ *(p0r+posAi) = t1i;
+ *(p0r+posB) = f3r;
+ *(p0r+posBi) = f3i;
+
+ };
+};
+}
+
+inline void fft2pt(float *ioptr);
+inline void fft2pt(float *ioptr){
+/*** RADIX 2 fft ***/
+float f0r, f0i, f1r, f1i;
+float t0r, t0i;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[2];
+f1i = ioptr[3];
+
+ /* Butterflys */
+ /*
+ f0 - - t0
+ f1 - 1 - f1
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f0i - f1i;
+
+ /* store result */
+ioptr[0] = t0r;
+ioptr[1] = t0i;
+ioptr[2] = f1r;
+ioptr[3] = f1i;
+}
+
+
+inline void fft4pt(float *ioptr);
+inline void fft4pt(float *ioptr){
+/*** RADIX 4 fft ***/
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float t0r, t0i, t1r, t1i;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[4];
+f1i = ioptr[5];
+f2r = ioptr[2];
+f2i = ioptr[3];
+f3r = ioptr[6];
+f3i = ioptr[7];
+
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0
+ f1 - 1 - f1 - - f1
+ f2 - - f2 - 1 - f2
+ f3 - 1 - t1 - -i - f3
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f0i - f1i;
+
+t1r = f2r - f3r;
+t1i = f2i - f3i;
+f2r = f2r + f3r;
+f2i = f2i + f3i;
+
+f0r = t0r + f2r;
+f0i = t0i + f2i;
+f2r = t0r - f2r;
+f2i = t0i - f2i;
+
+f3r = f1r - t1i;
+f3i = f1i + t1r;
+f1r = f1r + t1i;
+f1i = f1i - t1r;
+
+ /* store result */
+ioptr[0] = f0r;
+ioptr[1] = f0i;
+ioptr[2] = f1r;
+ioptr[3] = f1i;
+ioptr[4] = f2r;
+ioptr[5] = f2i;
+ioptr[6] = f3r;
+ioptr[7] = f3i;
+}
+
+inline void fft8pt(float *ioptr);
+inline void fft8pt(float *ioptr){
+/*** RADIX 8 fft ***/
+float w0r = 1.0/MYROOT2; /* cos(pi/4) */
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+float t0r, t0i, t1r, t1i;
+const float Two = 2.0;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[8];
+f1i = ioptr[9];
+f2r = ioptr[4];
+f2i = ioptr[5];
+f3r = ioptr[12];
+f3i = ioptr[13];
+f4r = ioptr[2];
+f4i = ioptr[3];
+f5r = ioptr[10];
+f5i = ioptr[11];
+f6r = ioptr[6];
+f6i = ioptr[7];
+f7r = ioptr[14];
+f7i = ioptr[15];
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0 - - f0
+ f1 - 1 - f1 - - f1 - - f1
+ f2 - - f2 - 1 - f2 - - f2
+ f3 - 1 - t1 - -i - f3 - - f3
+ f4 - - t0 - - f4 - 1 - t0
+ f5 - 1 - f5 - - f5 - w3 - f4
+ f6 - - f6 - 1 - f6 - -i - t1
+ f7 - 1 - t1 - -i - f7 - iw3- f6
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f0i - f1i;
+
+t1r = f2r - f3r;
+t1i = f2i - f3i;
+f2r = f2r + f3r;
+f2i = f2i + f3i;
+
+f0r = t0r + f2r;
+f0i = t0i + f2i;
+f2r = t0r - f2r;
+f2i = t0i - f2i;
+
+f3r = f1r - t1i;
+f3i = f1i + t1r;
+f1r = f1r + t1i;
+f1i = f1i - t1r;
+
+
+t0r = f4r + f5r;
+t0i = f4i + f5i;
+f5r = f4r - f5r;
+f5i = f4i - f5i;
+
+t1r = f6r - f7r;
+t1i = f6i - f7i;
+f6r = f6r + f7r;
+f6i = f6i + f7i;
+
+f4r = t0r + f6r;
+f4i = t0i + f6i;
+f6r = t0r - f6r;
+f6i = t0i - f6i;
+
+f7r = f5r - t1i;
+f7i = f5i + t1r;
+f5r = f5r + t1i;
+f5i = f5i - t1r;
+
+
+t0r = f0r - f4r;
+t0i = f0i - f4i;
+f0r = f0r + f4r;
+f0i = f0i + f4i;
+
+t1r = f2r - f6i;
+t1i = f2i + f6r;
+f2r = f2r + f6i;
+f2i = f2i - f6r;
+
+f4r = f1r - f5r * w0r - f5i * w0r;
+f4i = f1i + f5r * w0r - f5i * w0r;
+f1r = f1r * Two - f4r;
+f1i = f1i * Two - f4i;
+
+f6r = f3r + f7r * w0r - f7i * w0r;
+f6i = f3i + f7r * w0r + f7i * w0r;
+f3r = f3r * Two - f6r;
+f3i = f3i * Two - f6i;
+
+ /* store result */
+ioptr[0] = f0r;
+ioptr[1] = f0i;
+ioptr[2] = f1r;
+ioptr[3] = f1i;
+ioptr[4] = f2r;
+ioptr[5] = f2i;
+ioptr[6] = f3r;
+ioptr[7] = f3i;
+ioptr[8] = t0r;
+ioptr[9] = t0i;
+ioptr[10] = f4r;
+ioptr[11] = f4i;
+ioptr[12] = t1r;
+ioptr[13] = t1i;
+ioptr[14] = f6r;
+ioptr[15] = f6i;
+}
+
+inline void bfR2(float *ioptr, long M, long NDiffU);
+inline void bfR2(float *ioptr, long M, long NDiffU){
+/*** 2nd radix 2 stage ***/
+unsigned long pos;
+unsigned long posi;
+unsigned long pinc;
+unsigned long pnext;
+unsigned long NSameU;
+unsigned long SameUCnt;
+
+float *pstrt;
+float *p0r, *p1r, *p2r, *p3r;
+
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+const float Two = 2.0;
+
+pinc = NDiffU * 2; // 2 floats per complex
+pnext = pinc * 4;
+pos = 2;
+posi = pos+1;
+NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time
+pstrt = ioptr;
+p0r = pstrt;
+p1r = pstrt+pinc;
+p2r = p1r+pinc;
+p3r = p2r+pinc;
+
+ /* Butterflys */
+ /*
+ f0 - - f4
+ f1 - 1 - f5
+ f2 - - f6
+ f3 - 1 - f7
+ */
+ /* Butterflys */
+ /*
+ f0 - - f4
+ f1 - 1 - f5
+ f2 - - f6
+ f3 - 1 - f7
+ */
+
+for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){
+
+ f0r = *p0r;
+ f1r = *p1r;
+ f0i = *(p0r + 1);
+ f1i = *(p1r + 1);
+ f2r = *p2r;
+ f3r = *p3r;
+ f2i = *(p2r + 1);
+ f3i = *(p3r + 1);
+
+ f4r = f0r + f1r;
+ f4i = f0i + f1i;
+ f5r = f0r - f1r;
+ f5i = f0i - f1i;
+
+ f6r = f2r + f3r;
+ f6i = f2i + f3i;
+ f7r = f2r - f3r;
+ f7i = f2i - f3i;
+
+ *p0r = f4r;
+ *(p0r + 1) = f4i;
+ *p1r = f5r;
+ *(p1r + 1) = f5i;
+ *p2r = f6r;
+ *(p2r + 1) = f6i;
+ *p3r = f7r;
+ *(p3r + 1) = f7i;
+
+ f0r = *(p0r + pos);
+ f1i = *(p1r + posi);
+ f0i = *(p0r + posi);
+ f1r = *(p1r + pos);
+ f2r = *(p2r + pos);
+ f3i = *(p3r + posi);
+ f2i = *(p2r + posi);
+ f3r = *(p3r + pos);
+
+ f4r = f0r + f1i;
+ f4i = f0i - f1r;
+ f5r = f0r - f1i;
+ f5i = f0i + f1r;
+
+ f6r = f2r + f3i;
+ f6i = f2i - f3r;
+ f7r = f2r - f3i;
+ f7i = f2i + f3r;
+
+ *(p0r + pos) = f4r;
+ *(p0r + posi) = f4i;
+ *(p1r + pos) = f5r;
+ *(p1r + posi) = f5i;
+ *(p2r + pos) = f6r;
+ *(p2r + posi) = f6i;
+ *(p3r + pos) = f7r;
+ *(p3r + posi) = f7i;
+
+ p0r += pnext;
+ p1r += pnext;
+ p2r += pnext;
+ p3r += pnext;
+
+}
+}
+
+inline void bfR4(float *ioptr, long M, long NDiffU);
+inline void bfR4(float *ioptr, long M, long NDiffU){
+/*** 1 radix 4 stage ***/
+unsigned long pos;
+unsigned long posi;
+unsigned long pinc;
+unsigned long pnext;
+unsigned long pnexti;
+unsigned long NSameU;
+unsigned long SameUCnt;
+
+float *pstrt;
+float *p0r, *p1r, *p2r, *p3r;
+
+float w1r = 1.0/MYROOT2; /* cos(pi/4) */
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+float t1r, t1i;
+const float Two = 2.0;
+
+pinc = NDiffU * 2; // 2 floats per complex
+pnext = pinc * 4;
+pnexti = pnext + 1;
+pos = 2;
+posi = pos+1;
+NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly
+pstrt = ioptr;
+p0r = pstrt;
+p1r = pstrt+pinc;
+p2r = p1r+pinc;
+p3r = p2r+pinc;
+
+ /* Butterflys */
+ /*
+ f0 - - f0 - - f4
+ f1 - 1 - f5 - - f5
+ f2 - - f6 - 1 - f6
+ f3 - 1 - f3 - -i - f7
+ */
+ /* Butterflys */
+ /*
+ f0 - - f4 - - f4
+ f1 - -i - t1 - - f5
+ f2 - - f2 - w1 - f6
+ f3 - -i - f7 - iw1- f7
+ */
+
+f0r = *p0r;
+f1r = *p1r;
+f2r = *p2r;
+f3r = *p3r;
+f0i = *(p0r + 1);
+f1i = *(p1r + 1);
+f2i = *(p2r + 1);
+f3i = *(p3r + 1);
+
+f5r = f0r - f1r;
+f5i = f0i - f1i;
+f0r = f0r + f1r;
+f0i = f0i + f1i;
+
+f6r = f2r + f3r;
+f6i = f2i + f3i;
+f3r = f2r - f3r;
+f3i = f2i - f3i;
+
+for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
+
+ f7r = f5r - f3i;
+ f7i = f5i + f3r;
+ f5r = f5r + f3i;
+ f5i = f5i - f3r;
+
+ f4r = f0r + f6r;
+ f4i = f0i + f6i;
+ f6r = f0r - f6r;
+ f6i = f0i - f6i;
+
+ f2r = *(p2r + pos);
+ f2i = *(p2r + posi);
+ f1r = *(p1r + pos);
+ f1i = *(p1r + posi);
+ f3i = *(p3r + posi);
+ f0r = *(p0r + pos);
+ f3r = *(p3r + pos);
+ f0i = *(p0r + posi);
+
+ *p3r = f7r;
+ *p0r = f4r;
+ *(p3r + 1) = f7i;
+ *(p0r + 1) = f4i;
+ *p1r = f5r;
+ *p2r = f6r;
+ *(p1r + 1) = f5i;
+ *(p2r + 1) = f6i;
+
+ f7r = f2r - f3i;
+ f7i = f2i + f3r;
+ f2r = f2r + f3i;
+ f2i = f2i - f3r;
+
+ f4r = f0r + f1i;
+ f4i = f0i - f1r;
+ t1r = f0r - f1i;
+ t1i = f0i + f1r;
+
+ f5r = t1r - f7r * w1r + f7i * w1r;
+ f5i = t1i - f7r * w1r - f7i * w1r;
+ f7r = t1r * Two - f5r;
+ f7i = t1i * Two - f5i;
+
+ f6r = f4r - f2r * w1r - f2i * w1r;
+ f6i = f4i + f2r * w1r - f2i * w1r;
+ f4r = f4r * Two - f6r;
+ f4i = f4i * Two - f6i;
+
+ f3r = *(p3r + pnext);
+ f0r = *(p0r + pnext);
+ f3i = *(p3r + pnexti);
+ f0i = *(p0r + pnexti);
+ f2r = *(p2r + pnext);
+ f2i = *(p2r + pnexti);
+ f1r = *(p1r + pnext);
+ f1i = *(p1r + pnexti);
+
+ *(p2r + pos) = f6r;
+ *(p1r + pos) = f5r;
+ *(p2r + posi) = f6i;
+ *(p1r + posi) = f5i;
+ *(p3r + pos) = f7r;
+ *(p0r + pos) = f4r;
+ *(p3r + posi) = f7i;
+ *(p0r + posi) = f4i;
+
+ f6r = f2r + f3r;
+ f6i = f2i + f3i;
+ f3r = f2r - f3r;
+ f3i = f2i - f3i;
+
+ f5r = f0r - f1r;
+ f5i = f0i - f1i;
+ f0r = f0r + f1r;
+ f0i = f0i + f1i;
+
+ p3r += pnext;
+ p0r += pnext;
+ p1r += pnext;
+ p2r += pnext;
+
+}
+f7r = f5r - f3i;
+f7i = f5i + f3r;
+f5r = f5r + f3i;
+f5i = f5i - f3r;
+
+f4r = f0r + f6r;
+f4i = f0i + f6i;
+f6r = f0r - f6r;
+f6i = f0i - f6i;
+
+f2r = *(p2r + pos);
+f2i = *(p2r + posi);
+f1r = *(p1r + pos);
+f1i = *(p1r + posi);
+f3i = *(p3r + posi);
+f0r = *(p0r + pos);
+f3r = *(p3r + pos);
+f0i = *(p0r + posi);
+
+*p3r = f7r;
+*p0r = f4r;
+*(p3r + 1) = f7i;
+*(p0r + 1) = f4i;
+*p1r = f5r;
+*p2r = f6r;
+*(p1r + 1) = f5i;
+*(p2r + 1) = f6i;
+
+f7r = f2r - f3i;
+f7i = f2i + f3r;
+f2r = f2r + f3i;
+f2i = f2i - f3r;
+
+f4r = f0r + f1i;
+f4i = f0i - f1r;
+t1r = f0r - f1i;
+t1i = f0i + f1r;
+
+f5r = t1r - f7r * w1r + f7i * w1r;
+f5i = t1i - f7r * w1r - f7i * w1r;
+f7r = t1r * Two - f5r;
+f7i = t1i * Two - f5i;
+
+f6r = f4r - f2r * w1r - f2i * w1r;
+f6i = f4i + f2r * w1r - f2i * w1r;
+f4r = f4r * Two - f6r;
+f4i = f4i * Two - f6i;
+
+*(p2r + pos) = f6r;
+*(p1r + pos) = f5r;
+*(p2r + posi) = f6i;
+*(p1r + posi) = f5i;
+*(p3r + pos) = f7r;
+*(p0r + pos) = f4r;
+*(p3r + posi) = f7i;
+*(p0r + posi) = f4i;
+
+}
+
+inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
+inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
+/*** RADIX 8 Stages ***/
+unsigned long pos;
+unsigned long posi;
+unsigned long pinc;
+unsigned long pnext;
+unsigned long NSameU;
+unsigned long Uinc;
+unsigned long Uinc2;
+unsigned long Uinc4;
+unsigned long DiffUCnt;
+unsigned long SameUCnt;
+unsigned long U2toU3;
+
+float *pstrt;
+float *p0r, *p1r, *p2r, *p3r;
+float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
+
+float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+float t0r, t0i, t1r, t1i;
+const float Two = 2.0;
+
+pinc = NDiffU * 2; // 2 floats per complex
+pnext = pinc * 8;
+pos = pinc * 4;
+posi = pos + 1;
+NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly
+Uinc = NSameU * Ustride;
+Uinc2 = Uinc * 2;
+Uinc4 = Uinc * 4;
+U2toU3 = (POW2(M) / 8)*Ustride;
+for (; StageCnt > 0 ; StageCnt--){
+
+ u0r = &Utbl[0];
+ u0i = &Utbl[POW2(M-2)*Ustride];
+ u1r = u0r;
+ u1i = u0i;
+ u2r = u0r;
+ u2i = u0i;
+
+ w0r = *u0r;
+ w0i = *u0i;
+ w1r = *u1r;
+ w1i = *u1i;
+ w2r = *u2r;
+ w2i = *u2i;
+ w3r = *(u2r+U2toU3);
+ w3i = *(u2i-U2toU3);
+
+ pstrt = ioptr;
+
+ p0r = pstrt;
+ p1r = pstrt+pinc;
+ p2r = p1r+pinc;
+ p3r = p2r+pinc;
+
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0 - - f0
+ f1 - w0- f1 - - f1 - - f1
+ f2 - - f2 - w1- f2 - - f4
+ f3 - w0- t1 - iw1- f3 - - f5
+
+ f4 - - t0 - - f4 - w2- t0
+ f5 - w0- f5 - - f5 - w3- t1
+ f6 - - f6 - w1- f6 - iw2- f6
+ f7 - w0- t1 - iw1- f7 - iw3- f7
+ */
+
+ for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){
+ f0r = *p0r;
+ f0i = *(p0r + 1);
+ f1r = *p1r;
+ f1i = *(p1r + 1);
+ for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
+ f2r = *p2r;
+ f2i = *(p2r + 1);
+ f3r = *p3r;
+ f3i = *(p3r + 1);
+
+ t0r = f0r + f1r * w0r + f1i * w0i;
+ t0i = f0i - f1r * w0i + f1i * w0r;
+ f1r = f0r * Two - t0r;
+ f1i = f0i * Two - t0i;
+
+ f4r = *(p0r + pos);
+ f4i = *(p0r + posi);
+ f5r = *(p1r + pos);
+ f5i = *(p1r + posi);
+
+ f6r = *(p2r + pos);
+ f6i = *(p2r + posi);
+ f7r = *(p3r + pos);
+ f7i = *(p3r + posi);
+
+ t1r = f2r - f3r * w0r - f3i * w0i;
+ t1i = f2i + f3r * w0i - f3i * w0r;
+ f2r = f2r * Two - t1r;
+ f2i = f2i * Two - t1i;
+
+ f0r = t0r + f2r * w1r + f2i * w1i;
+ f0i = t0i - f2r * w1i + f2i * w1r;
+ f2r = t0r * Two - f0r;
+ f2i = t0i * Two - f0i;
+
+ f3r = f1r + t1r * w1i - t1i * w1r;
+ f3i = f1i + t1r * w1r + t1i * w1i;
+ f1r = f1r * Two - f3r;
+ f1i = f1i * Two - f3i;
+
+
+ t0r = f4r + f5r * w0r + f5i * w0i;
+ t0i = f4i - f5r * w0i + f5i * w0r;
+ f5r = f4r * Two - t0r;
+ f5i = f4i * Two - t0i;
+
+ t1r = f6r - f7r * w0r - f7i * w0i;
+ t1i = f6i + f7r * w0i - f7i * w0r;
+ f6r = f6r * Two - t1r;
+ f6i = f6i * Two - t1i;
+
+ f4r = t0r + f6r * w1r + f6i * w1i;
+ f4i = t0i - f6r * w1i + f6i * w1r;
+ f6r = t0r * Two - f4r;
+ f6i = t0i * Two - f4i;
+
+ f7r = f5r + t1r * w1i - t1i * w1r;
+ f7i = f5i + t1r * w1r + t1i * w1i;
+ f5r = f5r * Two - f7r;
+ f5i = f5i * Two - f7i;
+
+ t0r = f0r - f4r * w2r - f4i * w2i;
+ t0i = f0i + f4r * w2i - f4i * w2r;
+ f0r = f0r * Two - t0r;
+ f0i = f0i * Two - t0i;
+
+ t1r = f1r - f5r * w3r - f5i * w3i;
+ t1i = f1i + f5r * w3i - f5i * w3r;
+ f1r = f1r * Two - t1r;
+ f1i = f1i * Two - t1i;
+
+ *(p0r + pos) = t0r;
+ *(p1r + pos) = t1r;
+ *(p0r + posi) = t0i;
+ *(p1r + posi) = t1i;
+ *p0r = f0r;
+ *p1r = f1r;
+ *(p0r + 1) = f0i;
+ *(p1r + 1) = f1i;
+
+ p0r += pnext;
+ f0r = *p0r;
+ f0i = *(p0r + 1);
+
+ p1r += pnext;
+
+ f1r = *p1r;
+ f1i = *(p1r + 1);
+
+ f4r = f2r - f6r * w2i + f6i * w2r;
+ f4i = f2i - f6r * w2r - f6i * w2i;
+ f6r = f2r * Two - f4r;
+ f6i = f2i * Two - f4i;
+
+ f5r = f3r - f7r * w3i + f7i * w3r;
+ f5i = f3i - f7r * w3r - f7i * w3i;
+ f7r = f3r * Two - f5r;
+ f7i = f3i * Two - f5i;
+
+ *p2r = f4r;
+ *p3r = f5r;
+ *(p2r + 1) = f4i;
+ *(p3r + 1) = f5i;
+ *(p2r + pos) = f6r;
+ *(p3r + pos) = f7r;
+ *(p2r + posi) = f6i;
+ *(p3r + posi) = f7i;
+
+ p2r += pnext;
+ p3r += pnext;
+
+ }
+
+ f2r = *p2r;
+ f2i = *(p2r + 1);
+ f3r = *p3r;
+ f3i = *(p3r + 1);
+
+ t0r = f0r + f1r * w0r + f1i * w0i;
+ t0i = f0i - f1r * w0i + f1i * w0r;
+ f1r = f0r * Two - t0r;
+ f1i = f0i * Two - t0i;
+
+ f4r = *(p0r + pos);
+ f4i = *(p0r + posi);
+ f5r = *(p1r + pos);
+ f5i = *(p1r + posi);
+
+ f6r = *(p2r + pos);
+ f6i = *(p2r + posi);
+ f7r = *(p3r + pos);
+ f7i = *(p3r + posi);
+
+ t1r = f2r - f3r * w0r - f3i * w0i;
+ t1i = f2i + f3r * w0i - f3i * w0r;
+ f2r = f2r * Two - t1r;
+ f2i = f2i * Two - t1i;
+
+ f0r = t0r + f2r * w1r + f2i * w1i;
+ f0i = t0i - f2r * w1i + f2i * w1r;
+ f2r = t0r * Two - f0r;
+ f2i = t0i * Two - f0i;
+
+ f3r = f1r + t1r * w1i - t1i * w1r;
+ f3i = f1i + t1r * w1r + t1i * w1i;
+ f1r = f1r * Two - f3r;
+ f1i = f1i * Two - f3i;
+
+ if (DiffUCnt == NDiffU/2)
+ Uinc4 = -Uinc4;
+
+ u0r += Uinc4;
+ u0i -= Uinc4;
+ u1r += Uinc2;
+ u1i -= Uinc2;
+ u2r += Uinc;
+ u2i -= Uinc;
+
+ pstrt += 2;
+
+ t0r = f4r + f5r * w0r + f5i * w0i;
+ t0i = f4i - f5r * w0i + f5i * w0r;
+ f5r = f4r * Two - t0r;
+ f5i = f4i * Two - t0i;
+
+ t1r = f6r - f7r * w0r - f7i * w0i;
+ t1i = f6i + f7r * w0i - f7i * w0r;
+ f6r = f6r * Two - t1r;
+ f6i = f6i * Two - t1i;
+
+ f4r = t0r + f6r * w1r + f6i * w1i;
+ f4i = t0i - f6r * w1i + f6i * w1r;
+ f6r = t0r * Two - f4r;
+ f6i = t0i * Two - f4i;
+
+ f7r = f5r + t1r * w1i - t1i * w1r;
+ f7i = f5i + t1r * w1r + t1i * w1i;
+ f5r = f5r * Two - f7r;
+ f5i = f5i * Two - f7i;
+
+ w0r = *u0r;
+ w0i = *u0i;
+ w1r = *u1r;
+ w1i = *u1i;
+
+ if (DiffUCnt <= NDiffU/2)
+ w0r = -w0r;
+
+ t0r = f0r - f4r * w2r - f4i * w2i;
+ t0i = f0i + f4r * w2i - f4i * w2r;
+ f0r = f0r * Two - t0r;
+ f0i = f0i * Two - t0i;
+
+ f4r = f2r - f6r * w2i + f6i * w2r;
+ f4i = f2i - f6r * w2r - f6i * w2i;
+ f6r = f2r * Two - f4r;
+ f6i = f2i * Two - f4i;
+
+ *(p0r + pos) = t0r;
+ *p2r = f4r;
+ *(p0r + posi) = t0i;
+ *(p2r + 1) = f4i;
+ w2r = *u2r;
+ w2i = *u2i;
+ *p0r = f0r;
+ *(p2r + pos) = f6r;
+ *(p0r + 1) = f0i;
+ *(p2r + posi) = f6i;
+
+ p0r = pstrt;
+ p2r = pstrt + pinc + pinc;
+
+ t1r = f1r - f5r * w3r - f5i * w3i;
+ t1i = f1i + f5r * w3i - f5i * w3r;
+ f1r = f1r * Two - t1r;
+ f1i = f1i * Two - t1i;
+
+ f5r = f3r - f7r * w3i + f7i * w3r;
+ f5i = f3i - f7r * w3r - f7i * w3i;
+ f7r = f3r * Two - f5r;
+ f7i = f3i * Two - f5i;
+
+ *(p1r + pos) = t1r;
+ *p3r = f5r;
+ *(p1r + posi) = t1i;
+ *(p3r + 1) = f5i;
+ w3r = *(u2r+U2toU3);
+ w3i = *(u2i-U2toU3);
+ *p1r = f1r;
+ *(p3r + pos) = f7r;
+ *(p1r + 1) = f1i;
+ *(p3r + posi) = f7i;
+
+ p1r = pstrt + pinc;
+ p3r = p2r + pinc;
+ }
+ NSameU /= 8;
+ Uinc /= 8;
+ Uinc2 /= 8;
+ Uinc4 = Uinc * 4;
+ NDiffU *= 8;
+ pinc *= 8;
+ pnext *= 8;
+ pos *= 8;
+ posi = pos + 1;
+}
+}
+
+void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
+void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
+/* recursive bfstages calls to maximize on chip cache efficiency */
+long i1;
+if (M <= MCACHE) // fits on chip ?
+ bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
+else{
+ for (i1=0; i1<8; i1++){
+ fftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */
+ }
+ bfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */
+}
+}
+
+void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
+/* Compute in-place complex fft on the rows of the input array */
+/* INPUTS */
+/* *ioptr = input data array */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
+/* *Utbl = cosine table */
+/* *BRLow = bit reversed counter table */
+/* OUTPUTS */
+/* *ioptr = output data array */
+
+long StageCnt;
+long NDiffU;
+
+switch (M){
+case 0:
+ break;
+case 1:
+ for (;Rows>0;Rows--){
+ fft2pt(ioptr); /* a 2 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+case 2:
+ for (;Rows>0;Rows--){
+ fft4pt(ioptr); /* a 4 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+case 3:
+ for (;Rows>0;Rows--){
+ fft8pt(ioptr); /* an 8 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+default:
+ for (;Rows>0;Rows--){
+
+ bitrevR2(ioptr, M, BRLow); /* bit reverse and first radix 2 stage */
+
+ StageCnt = (M-1) / 3; // number of radix 8 stages
+ NDiffU = 2; // one radix 2 stage already complete
+
+ if ( (M-1-(StageCnt * 3)) == 1 ){
+ bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
+ NDiffU *= 2;
+ }
+
+ if ( (M-1-(StageCnt * 3)) == 2 ){
+ bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
+ NDiffU *= 4;
+ }
+
+ if (M <= MCACHE)
+ bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
+
+ else{
+ fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
+ }
+
+ ioptr += 2*POW2(M);
+ }
+}
+}
+
+/************************************************
+parts of iffts1
+*************************************************/
+
+inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale);
+inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale){
+/*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/
+float f0r;
+float f0i;
+float f1r;
+float f1i;
+float f2r;
+float f2i;
+float f3r;
+float f3i;
+float f4r;
+float f4i;
+float f5r;
+float f5i;
+float f6r;
+float f6i;
+float f7r;
+float f7i;
+float t0r;
+float t0i;
+float t1r;
+float t1i;
+float *p0r;
+float *p1r;
+float *IOP;
+float *iolimit;
+long Colstart;
+long iCol;
+unsigned long posA;
+unsigned long posAi;
+unsigned long posB;
+unsigned long posBi;
+
+const unsigned long Nrems2 = POW2((M+3)/2);
+const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2;
+const unsigned long Nroot_1 = POW2(M/2-1)-1;
+const unsigned long ColstartShift = (M+1)/2 +1;
+posA = POW2(M); // 1/2 of POW2(M) complexes
+posAi = posA + 1;
+posB = posA + 2;
+posBi = posB + 1;
+
+iolimit = ioptr + Nrems2;
+for (; ioptr < iolimit; ioptr += POW2(M/2+1)){
+ for (Colstart = Nroot_1; Colstart >= 0; Colstart--){
+ iCol = Nroot_1;
+ p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4;
+ IOP = ioptr + (Colstart << ColstartShift);
+ p1r = IOP + BRLow[iCol]*4;
+ f0r = *(p0r);
+ f0i = *(p0r+1);
+ f1r = *(p0r+posA);
+ f1i = *(p0r+posAi);
+ for (; iCol > Colstart;){
+ f2r = *(p0r+2);
+ f2i = *(p0r+(2+1));
+ f3r = *(p0r+posB);
+ f3i = *(p0r+posBi);
+ f4r = *(p1r);
+ f4i = *(p1r+1);
+ f5r = *(p1r+posA);
+ f5i = *(p1r+posAi);
+ f6r = *(p1r+2);
+ f6i = *(p1r+(2+1));
+ f7r = *(p1r+posB);
+ f7i = *(p1r+posBi);
+
+ t0r = f0r + f1r;
+ t0i = f0i + f1i;
+ f1r = f0r - f1r;
+ f1i = f0i - f1i;
+ t1r = f2r + f3r;
+ t1i = f2i + f3i;
+ f3r = f2r - f3r;
+ f3i = f2i - f3i;
+ f0r = f4r + f5r;
+ f0i = f4i + f5i;
+ f5r = f4r - f5r;
+ f5i = f4i - f5i;
+ f2r = f6r + f7r;
+ f2i = f6i + f7i;
+ f7r = f6r - f7r;
+ f7i = f6i - f7i;
+
+ *(p1r) = scale*t0r;
+ *(p1r+1) = scale*t0i;
+ *(p1r+2) = scale*f1r;
+ *(p1r+(2+1)) = scale*f1i;
+ *(p1r+posA) = scale*t1r;
+ *(p1r+posAi) = scale*t1i;
+ *(p1r+posB) = scale*f3r;
+ *(p1r+posBi) = scale*f3i;
+ *(p0r) = scale*f0r;
+ *(p0r+1) = scale*f0i;
+ *(p0r+2) = scale*f5r;
+ *(p0r+(2+1)) = scale*f5i;
+ *(p0r+posA) = scale*f2r;
+ *(p0r+posAi) = scale*f2i;
+ *(p0r+posB) = scale*f7r;
+ *(p0r+posBi) = scale*f7i;
+
+ p0r -= Nrems2;
+ f0r = *(p0r);
+ f0i = *(p0r+1);
+ f1r = *(p0r+posA);
+ f1i = *(p0r+posAi);
+ iCol -= 1;
+ p1r = IOP + BRLow[iCol]*4;
+ };
+ f2r = *(p0r+2);
+ f2i = *(p0r+(2+1));
+ f3r = *(p0r+posB);
+ f3i = *(p0r+posBi);
+
+ t0r = f0r + f1r;
+ t0i = f0i + f1i;
+ f1r = f0r - f1r;
+ f1i = f0i - f1i;
+ t1r = f2r + f3r;
+ t1i = f2i + f3i;
+ f3r = f2r - f3r;
+ f3i = f2i - f3i;
+
+ *(p0r) = scale*t0r;
+ *(p0r+1) = scale*t0i;
+ *(p0r+2) = scale*f1r;
+ *(p0r+(2+1)) = scale*f1i;
+ *(p0r+posA) = scale*t1r;
+ *(p0r+posAi) = scale*t1i;
+ *(p0r+posB) = scale*f3r;
+ *(p0r+posBi) = scale*f3i;
+
+ };
+};
+}
+
+inline void ifft2pt(float *ioptr, float scale);
+inline void ifft2pt(float *ioptr, float scale){
+/*** RADIX 2 ifft ***/
+float f0r, f0i, f1r, f1i;
+float t0r, t0i;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[2];
+f1i = ioptr[3];
+
+ /* Butterflys */
+ /*
+ f0 - - t0
+ f1 - 1 - f1
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f0i - f1i;
+
+ /* store result */
+ioptr[0] = scale*t0r;
+ioptr[1] = scale*t0i;
+ioptr[2] = scale*f1r;
+ioptr[3] = scale*f1i;
+}
+
+inline void ifft4pt(float *ioptr, float scale);
+inline void ifft4pt(float *ioptr, float scale){
+/*** RADIX 4 ifft ***/
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float t0r, t0i, t1r, t1i;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[4];
+f1i = ioptr[5];
+f2r = ioptr[2];
+f2i = ioptr[3];
+f3r = ioptr[6];
+f3i = ioptr[7];
+
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0
+ f1 - 1 - f1 - - f1
+ f2 - - f2 - 1 - f2
+ f3 - 1 - t1 - i - f3
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f0i - f1i;
+
+t1r = f2r - f3r;
+t1i = f2i - f3i;
+f2r = f2r + f3r;
+f2i = f2i + f3i;
+
+f0r = t0r + f2r;
+f0i = t0i + f2i;
+f2r = t0r - f2r;
+f2i = t0i - f2i;
+
+f3r = f1r + t1i;
+f3i = f1i - t1r;
+f1r = f1r - t1i;
+f1i = f1i + t1r;
+
+ /* store result */
+ioptr[0] = scale*f0r;
+ioptr[1] = scale*f0i;
+ioptr[2] = scale*f1r;
+ioptr[3] = scale*f1i;
+ioptr[4] = scale*f2r;
+ioptr[5] = scale*f2i;
+ioptr[6] = scale*f3r;
+ioptr[7] = scale*f3i;
+}
+
+inline void ifft8pt(float *ioptr, float scale);
+inline void ifft8pt(float *ioptr, float scale){
+/*** RADIX 8 ifft ***/
+float w0r = 1.0/MYROOT2; /* cos(pi/4) */
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+float t0r, t0i, t1r, t1i;
+const float Two = 2.0;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[8];
+f1i = ioptr[9];
+f2r = ioptr[4];
+f2i = ioptr[5];
+f3r = ioptr[12];
+f3i = ioptr[13];
+f4r = ioptr[2];
+f4i = ioptr[3];
+f5r = ioptr[10];
+f5i = ioptr[11];
+f6r = ioptr[6];
+f6i = ioptr[7];
+f7r = ioptr[14];
+f7i = ioptr[15];
+
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0 - - f0
+ f1 - 1 - f1 - - f1 - - f1
+ f2 - - f2 - 1 - f2 - - f2
+ f3 - 1 - t1 - i - f3 - - f3
+ f4 - - t0 - - f4 - 1 - t0
+ f5 - 1 - f5 - - f5 - w3 - f4
+ f6 - - f6 - 1 - f6 - i - t1
+ f7 - 1 - t1 - i - f7 - iw3- f6
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f0i - f1i;
+
+t1r = f2r - f3r;
+t1i = f2i - f3i;
+f2r = f2r + f3r;
+f2i = f2i + f3i;
+
+f0r = t0r + f2r;
+f0i = t0i + f2i;
+f2r = t0r - f2r;
+f2i = t0i - f2i;
+
+f3r = f1r + t1i;
+f3i = f1i - t1r;
+f1r = f1r - t1i;
+f1i = f1i + t1r;
+
+
+t0r = f4r + f5r;
+t0i = f4i + f5i;
+f5r = f4r - f5r;
+f5i = f4i - f5i;
+
+t1r = f6r - f7r;
+t1i = f6i - f7i;
+f6r = f6r + f7r;
+f6i = f6i + f7i;
+
+f4r = t0r + f6r;
+f4i = t0i + f6i;
+f6r = t0r - f6r;
+f6i = t0i - f6i;
+
+f7r = f5r + t1i;
+f7i = f5i - t1r;
+f5r = f5r - t1i;
+f5i = f5i + t1r;
+
+
+t0r = f0r - f4r;
+t0i = f0i - f4i;
+f0r = f0r + f4r;
+f0i = f0i + f4i;
+
+t1r = f2r + f6i;
+t1i = f2i - f6r;
+f2r = f2r - f6i;
+f2i = f2i + f6r;
+
+f4r = f1r - f5r * w0r + f5i * w0r;
+f4i = f1i - f5r * w0r - f5i * w0r;
+f1r = f1r * Two - f4r;
+f1i = f1i * Two - f4i;
+
+f6r = f3r + f7r * w0r + f7i * w0r;
+f6i = f3i - f7r * w0r + f7i * w0r;
+f3r = f3r * Two - f6r;
+f3i = f3i * Two - f6i;
+
+ /* store result */
+ioptr[0] = scale*f0r;
+ioptr[1] = scale*f0i;
+ioptr[2] = scale*f1r;
+ioptr[3] = scale*f1i;
+ioptr[4] = scale*f2r;
+ioptr[5] = scale*f2i;
+ioptr[6] = scale*f3r;
+ioptr[7] = scale*f3i;
+ioptr[8] = scale*t0r;
+ioptr[9] = scale*t0i;
+ioptr[10] = scale*f4r;
+ioptr[11] = scale*f4i;
+ioptr[12] = scale*t1r;
+ioptr[13] = scale*t1i;
+ioptr[14] = scale*f6r;
+ioptr[15] = scale*f6i;
+}
+
+inline void ibfR2(float *ioptr, long M, long NDiffU);
+inline void ibfR2(float *ioptr, long M, long NDiffU){
+/*** 2nd radix 2 stage ***/
+unsigned long pos;
+unsigned long posi;
+unsigned long pinc;
+unsigned long pnext;
+unsigned long NSameU;
+unsigned long SameUCnt;
+
+float *pstrt;
+float *p0r, *p1r, *p2r, *p3r;
+
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+const float Two = 2.0;
+
+pinc = NDiffU * 2; // 2 floats per complex
+pnext = pinc * 4;
+pos = 2;
+posi = pos+1;
+NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time
+pstrt = ioptr;
+p0r = pstrt;
+p1r = pstrt+pinc;
+p2r = p1r+pinc;
+p3r = p2r+pinc;
+
+ /* Butterflys */
+ /*
+ f0 - - f4
+ f1 - 1 - f5
+ f2 - - f6
+ f3 - 1 - f7
+ */
+ /* Butterflys */
+ /*
+ f0 - - f4
+ f1 - 1 - f5
+ f2 - - f6
+ f3 - 1 - f7
+ */
+
+for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){
+
+ f0r = *p0r;
+ f1r = *p1r;
+ f0i = *(p0r + 1);
+ f1i = *(p1r + 1);
+ f2r = *p2r;
+ f3r = *p3r;
+ f2i = *(p2r + 1);
+ f3i = *(p3r + 1);
+
+ f4r = f0r + f1r;
+ f4i = f0i + f1i;
+ f5r = f0r - f1r;
+ f5i = f0i - f1i;
+
+ f6r = f2r + f3r;
+ f6i = f2i + f3i;
+ f7r = f2r - f3r;
+ f7i = f2i - f3i;
+
+ *p0r = f4r;
+ *(p0r + 1) = f4i;
+ *p1r = f5r;
+ *(p1r + 1) = f5i;
+ *p2r = f6r;
+ *(p2r + 1) = f6i;
+ *p3r = f7r;
+ *(p3r + 1) = f7i;
+
+ f0r = *(p0r + pos);
+ f1i = *(p1r + posi);
+ f0i = *(p0r + posi);
+ f1r = *(p1r + pos);
+ f2r = *(p2r + pos);
+ f3i = *(p3r + posi);
+ f2i = *(p2r + posi);
+ f3r = *(p3r + pos);
+
+ f4r = f0r - f1i;
+ f4i = f0i + f1r;
+ f5r = f0r + f1i;
+ f5i = f0i - f1r;
+
+ f6r = f2r - f3i;
+ f6i = f2i + f3r;
+ f7r = f2r + f3i;
+ f7i = f2i - f3r;
+
+ *(p0r + pos) = f4r;
+ *(p0r + posi) = f4i;
+ *(p1r + pos) = f5r;
+ *(p1r + posi) = f5i;
+ *(p2r + pos) = f6r;
+ *(p2r + posi) = f6i;
+ *(p3r + pos) = f7r;
+ *(p3r + posi) = f7i;
+
+ p0r += pnext;
+ p1r += pnext;
+ p2r += pnext;
+ p3r += pnext;
+
+}
+}
+
+inline void ibfR4(float *ioptr, long M, long NDiffU);
+inline void ibfR4(float *ioptr, long M, long NDiffU){
+/*** 1 radix 4 stage ***/
+unsigned long pos;
+unsigned long posi;
+unsigned long pinc;
+unsigned long pnext;
+unsigned long pnexti;
+unsigned long NSameU;
+unsigned long SameUCnt;
+
+float *pstrt;
+float *p0r, *p1r, *p2r, *p3r;
+
+float w1r = 1.0/MYROOT2; /* cos(pi/4) */
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+float t1r, t1i;
+const float Two = 2.0;
+
+pinc = NDiffU * 2; // 2 floats per complex
+pnext = pinc * 4;
+pnexti = pnext + 1;
+pos = 2;
+posi = pos+1;
+NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly
+pstrt = ioptr;
+p0r = pstrt;
+p1r = pstrt+pinc;
+p2r = p1r+pinc;
+p3r = p2r+pinc;
+
+ /* Butterflys */
+ /*
+ f0 - - f0 - - f4
+ f1 - 1 - f5 - - f5
+ f2 - - f6 - 1 - f6
+ f3 - 1 - f3 - -i - f7
+ */
+ /* Butterflys */
+ /*
+ f0 - - f4 - - f4
+ f1 - -i - t1 - - f5
+ f2 - - f2 - w1 - f6
+ f3 - -i - f7 - iw1- f7
+ */
+
+f0r = *p0r;
+f1r = *p1r;
+f2r = *p2r;
+f3r = *p3r;
+f0i = *(p0r + 1);
+f1i = *(p1r + 1);
+f2i = *(p2r + 1);
+f3i = *(p3r + 1);
+
+f5r = f0r - f1r;
+f5i = f0i - f1i;
+f0r = f0r + f1r;
+f0i = f0i + f1i;
+
+f6r = f2r + f3r;
+f6i = f2i + f3i;
+f3r = f2r - f3r;
+f3i = f2i - f3i;
+
+for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
+
+ f7r = f5r + f3i;
+ f7i = f5i - f3r;
+ f5r = f5r - f3i;
+ f5i = f5i + f3r;
+
+ f4r = f0r + f6r;
+ f4i = f0i + f6i;
+ f6r = f0r - f6r;
+ f6i = f0i - f6i;
+
+ f2r = *(p2r + pos);
+ f2i = *(p2r + posi);
+ f1r = *(p1r + pos);
+ f1i = *(p1r + posi);
+ f3i = *(p3r + posi);
+ f0r = *(p0r + pos);
+ f3r = *(p3r + pos);
+ f0i = *(p0r + posi);
+
+ *p3r = f7r;
+ *p0r = f4r;
+ *(p3r + 1) = f7i;
+ *(p0r + 1) = f4i;
+ *p1r = f5r;
+ *p2r = f6r;
+ *(p1r + 1) = f5i;
+ *(p2r + 1) = f6i;
+
+ f7r = f2r + f3i;
+ f7i = f2i - f3r;
+ f2r = f2r - f3i;
+ f2i = f2i + f3r;
+
+ f4r = f0r - f1i;
+ f4i = f0i + f1r;
+ t1r = f0r + f1i;
+ t1i = f0i - f1r;
+
+ f5r = t1r - f7r * w1r - f7i * w1r;
+ f5i = t1i + f7r * w1r - f7i * w1r;
+ f7r = t1r * Two - f5r;
+ f7i = t1i * Two - f5i;
+
+ f6r = f4r - f2r * w1r + f2i * w1r;
+ f6i = f4i - f2r * w1r - f2i * w1r;
+ f4r = f4r * Two - f6r;
+ f4i = f4i * Two - f6i;
+
+ f3r = *(p3r + pnext);
+ f0r = *(p0r + pnext);
+ f3i = *(p3r + pnexti);
+ f0i = *(p0r + pnexti);
+ f2r = *(p2r + pnext);
+ f2i = *(p2r + pnexti);
+ f1r = *(p1r + pnext);
+ f1i = *(p1r + pnexti);
+
+ *(p2r + pos) = f6r;
+ *(p1r + pos) = f5r;
+ *(p2r + posi) = f6i;
+ *(p1r + posi) = f5i;
+ *(p3r + pos) = f7r;
+ *(p0r + pos) = f4r;
+ *(p3r + posi) = f7i;
+ *(p0r + posi) = f4i;
+
+ f6r = f2r + f3r;
+ f6i = f2i + f3i;
+ f3r = f2r - f3r;
+ f3i = f2i - f3i;
+
+ f5r = f0r - f1r;
+ f5i = f0i - f1i;
+ f0r = f0r + f1r;
+ f0i = f0i + f1i;
+
+ p3r += pnext;
+ p0r += pnext;
+ p1r += pnext;
+ p2r += pnext;
+
+}
+f7r = f5r + f3i;
+f7i = f5i - f3r;
+f5r = f5r - f3i;
+f5i = f5i + f3r;
+
+f4r = f0r + f6r;
+f4i = f0i + f6i;
+f6r = f0r - f6r;
+f6i = f0i - f6i;
+
+f2r = *(p2r + pos);
+f2i = *(p2r + posi);
+f1r = *(p1r + pos);
+f1i = *(p1r + posi);
+f3i = *(p3r + posi);
+f0r = *(p0r + pos);
+f3r = *(p3r + pos);
+f0i = *(p0r + posi);
+
+*p3r = f7r;
+*p0r = f4r;
+*(p3r + 1) = f7i;
+*(p0r + 1) = f4i;
+*p1r = f5r;
+*p2r = f6r;
+*(p1r + 1) = f5i;
+*(p2r + 1) = f6i;
+
+f7r = f2r + f3i;
+f7i = f2i - f3r;
+f2r = f2r - f3i;
+f2i = f2i + f3r;
+
+f4r = f0r - f1i;
+f4i = f0i + f1r;
+t1r = f0r + f1i;
+t1i = f0i - f1r;
+
+f5r = t1r - f7r * w1r - f7i * w1r;
+f5i = t1i + f7r * w1r - f7i * w1r;
+f7r = t1r * Two - f5r;
+f7i = t1i * Two - f5i;
+
+f6r = f4r - f2r * w1r + f2i * w1r;
+f6i = f4i - f2r * w1r - f2i * w1r;
+f4r = f4r * Two - f6r;
+f4i = f4i * Two - f6i;
+
+*(p2r + pos) = f6r;
+*(p1r + pos) = f5r;
+*(p2r + posi) = f6i;
+*(p1r + posi) = f5i;
+*(p3r + pos) = f7r;
+*(p0r + pos) = f4r;
+*(p3r + posi) = f7i;
+*(p0r + posi) = f4i;
+
+}
+
+inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
+inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
+/*** RADIX 8 Stages ***/
+unsigned long pos;
+unsigned long posi;
+unsigned long pinc;
+unsigned long pnext;
+unsigned long NSameU;
+unsigned long Uinc;
+unsigned long Uinc2;
+unsigned long Uinc4;
+unsigned long DiffUCnt;
+unsigned long SameUCnt;
+unsigned long U2toU3;
+
+float *pstrt;
+float *p0r, *p1r, *p2r, *p3r;
+float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
+
+float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+float t0r, t0i, t1r, t1i;
+const float Two = 2.0;
+
+pinc = NDiffU * 2; // 2 floats per complex
+pnext = pinc * 8;
+pos = pinc * 4;
+posi = pos + 1;
+NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly
+Uinc = NSameU * Ustride;
+Uinc2 = Uinc * 2;
+Uinc4 = Uinc * 4;
+U2toU3 = (POW2(M) / 8)*Ustride;
+for (; StageCnt > 0 ; StageCnt--){
+
+ u0r = &Utbl[0];
+ u0i = &Utbl[POW2(M-2)*Ustride];
+ u1r = u0r;
+ u1i = u0i;
+ u2r = u0r;
+ u2i = u0i;
+
+ w0r = *u0r;
+ w0i = *u0i;
+ w1r = *u1r;
+ w1i = *u1i;
+ w2r = *u2r;
+ w2i = *u2i;
+ w3r = *(u2r+U2toU3);
+ w3i = *(u2i-U2toU3);
+
+ pstrt = ioptr;
+
+ p0r = pstrt;
+ p1r = pstrt+pinc;
+ p2r = p1r+pinc;
+ p3r = p2r+pinc;
+
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0 - - f0
+ f1 - w0- f1 - - f1 - - f1
+ f2 - - f2 - w1- f2 - - f4
+ f3 - w0- t1 - iw1- f3 - - f5
+
+ f4 - - t0 - - f4 - w2- t0
+ f5 - w0- f5 - - f5 - w3- t1
+ f6 - - f6 - w1- f6 - iw2- f6
+ f7 - w0- t1 - iw1- f7 - iw3- f7
+ */
+
+ for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){
+ f0r = *p0r;
+ f0i = *(p0r + 1);
+ f1r = *p1r;
+ f1i = *(p1r + 1);
+ for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){
+ f2r = *p2r;
+ f2i = *(p2r + 1);
+ f3r = *p3r;
+ f3i = *(p3r + 1);
+
+ t0r = f0r + f1r * w0r - f1i * w0i;
+ t0i = f0i + f1r * w0i + f1i * w0r;
+ f1r = f0r * Two - t0r;
+ f1i = f0i * Two - t0i;
+
+ f4r = *(p0r + pos);
+ f4i = *(p0r + posi);
+ f5r = *(p1r + pos);
+ f5i = *(p1r + posi);
+
+ f6r = *(p2r + pos);
+ f6i = *(p2r + posi);
+ f7r = *(p3r + pos);
+ f7i = *(p3r + posi);
+
+ t1r = f2r - f3r * w0r + f3i * w0i;
+ t1i = f2i - f3r * w0i - f3i * w0r;
+ f2r = f2r * Two - t1r;
+ f2i = f2i * Two - t1i;
+
+ f0r = t0r + f2r * w1r - f2i * w1i;
+ f0i = t0i + f2r * w1i + f2i * w1r;
+ f2r = t0r * Two - f0r;
+ f2i = t0i * Two - f0i;
+
+ f3r = f1r + t1r * w1i + t1i * w1r;
+ f3i = f1i - t1r * w1r + t1i * w1i;
+ f1r = f1r * Two - f3r;
+ f1i = f1i * Two - f3i;
+
+
+ t0r = f4r + f5r * w0r - f5i * w0i;
+ t0i = f4i + f5r * w0i + f5i * w0r;
+ f5r = f4r * Two - t0r;
+ f5i = f4i * Two - t0i;
+
+ t1r = f6r - f7r * w0r + f7i * w0i;
+ t1i = f6i - f7r * w0i - f7i * w0r;
+ f6r = f6r * Two - t1r;
+ f6i = f6i * Two - t1i;
+
+ f4r = t0r + f6r * w1r - f6i * w1i;
+ f4i = t0i + f6r * w1i + f6i * w1r;
+ f6r = t0r * Two - f4r;
+ f6i = t0i * Two - f4i;
+
+ f7r = f5r + t1r * w1i + t1i * w1r;
+ f7i = f5i - t1r * w1r + t1i * w1i;
+ f5r = f5r * Two - f7r;
+ f5i = f5i * Two - f7i;
+
+ t0r = f0r - f4r * w2r + f4i * w2i;
+ t0i = f0i - f4r * w2i - f4i * w2r;
+ f0r = f0r * Two - t0r;
+ f0i = f0i * Two - t0i;
+
+ t1r = f1r - f5r * w3r + f5i * w3i;
+ t1i = f1i - f5r * w3i - f5i * w3r;
+ f1r = f1r * Two - t1r;
+ f1i = f1i * Two - t1i;
+
+ *(p0r + pos) = t0r;
+ *(p0r + posi) = t0i;
+ *p0r = f0r;
+ *(p0r + 1) = f0i;
+
+ p0r += pnext;
+ f0r = *p0r;
+ f0i = *(p0r + 1);
+
+ *(p1r + pos) = t1r;
+ *(p1r + posi) = t1i;
+ *p1r = f1r;
+ *(p1r + 1) = f1i;
+
+ p1r += pnext;
+
+ f1r = *p1r;
+ f1i = *(p1r + 1);
+
+ f4r = f2r - f6r * w2i - f6i * w2r;
+ f4i = f2i + f6r * w2r - f6i * w2i;
+ f6r = f2r * Two - f4r;
+ f6i = f2i * Two - f4i;
+
+ f5r = f3r - f7r * w3i - f7i * w3r;
+ f5i = f3i + f7r * w3r - f7i * w3i;
+ f7r = f3r * Two - f5r;
+ f7i = f3i * Two - f5i;
+
+ *p2r = f4r;
+ *(p2r + 1) = f4i;
+ *(p2r + pos) = f6r;
+ *(p2r + posi) = f6i;
+
+ p2r += pnext;
+
+ *p3r = f5r;
+ *(p3r + 1) = f5i;
+ *(p3r + pos) = f7r;
+ *(p3r + posi) = f7i;
+
+ p3r += pnext;
+
+ }
+
+ f2r = *p2r;
+ f2i = *(p2r + 1);
+ f3r = *p3r;
+ f3i = *(p3r + 1);
+
+ t0r = f0r + f1r * w0r - f1i * w0i;
+ t0i = f0i + f1r * w0i + f1i * w0r;
+ f1r = f0r * Two - t0r;
+ f1i = f0i * Two - t0i;
+
+ f4r = *(p0r + pos);
+ f4i = *(p0r + posi);
+ f5r = *(p1r + pos);
+ f5i = *(p1r + posi);
+
+ f6r = *(p2r + pos);
+ f6i = *(p2r + posi);
+ f7r = *(p3r + pos);
+ f7i = *(p3r + posi);
+
+ t1r = f2r - f3r * w0r + f3i * w0i;
+ t1i = f2i - f3r * w0i - f3i * w0r;
+ f2r = f2r * Two - t1r;
+ f2i = f2i * Two - t1i;
+
+ f0r = t0r + f2r * w1r - f2i * w1i;
+ f0i = t0i + f2r * w1i + f2i * w1r;
+ f2r = t0r * Two - f0r;
+ f2i = t0i * Two - f0i;
+
+ f3r = f1r + t1r * w1i + t1i * w1r;
+ f3i = f1i - t1r * w1r + t1i * w1i;
+ f1r = f1r * Two - f3r;
+ f1i = f1i * Two - f3i;
+
+ if (DiffUCnt == NDiffU/2)
+ Uinc4 = -Uinc4;
+
+ u0r += Uinc4;
+ u0i -= Uinc4;
+ u1r += Uinc2;
+ u1i -= Uinc2;
+ u2r += Uinc;
+ u2i -= Uinc;
+
+ pstrt += 2;
+
+ t0r = f4r + f5r * w0r - f5i * w0i;
+ t0i = f4i + f5r * w0i + f5i * w0r;
+ f5r = f4r * Two - t0r;
+ f5i = f4i * Two - t0i;
+
+ t1r = f6r - f7r * w0r + f7i * w0i;
+ t1i = f6i - f7r * w0i - f7i * w0r;
+ f6r = f6r * Two - t1r;
+ f6i = f6i * Two - t1i;
+
+ f4r = t0r + f6r * w1r - f6i * w1i;
+ f4i = t0i + f6r * w1i + f6i * w1r;
+ f6r = t0r * Two - f4r;
+ f6i = t0i * Two - f4i;
+
+ f7r = f5r + t1r * w1i + t1i * w1r;
+ f7i = f5i - t1r * w1r + t1i * w1i;
+ f5r = f5r * Two - f7r;
+ f5i = f5i * Two - f7i;
+
+ w0r = *u0r;
+ w0i = *u0i;
+ w1r = *u1r;
+ w1i = *u1i;
+
+ if (DiffUCnt <= NDiffU/2)
+ w0r = -w0r;
+
+ t0r = f0r - f4r * w2r + f4i * w2i;
+ t0i = f0i - f4r * w2i - f4i * w2r;
+ f0r = f0r * Two - t0r;
+ f0i = f0i * Two - t0i;
+
+ f4r = f2r - f6r * w2i - f6i * w2r;
+ f4i = f2i + f6r * w2r - f6i * w2i;
+ f6r = f2r * Two - f4r;
+ f6i = f2i * Two - f4i;
+
+ *(p0r + pos) = t0r;
+ *p2r = f4r;
+ *(p0r + posi) = t0i;
+ *(p2r + 1) = f4i;
+ w2r = *u2r;
+ w2i = *u2i;
+ *p0r = f0r;
+ *(p2r + pos) = f6r;
+ *(p0r + 1) = f0i;
+ *(p2r + posi) = f6i;
+
+ p0r = pstrt;
+ p2r = pstrt + pinc + pinc;
+
+ t1r = f1r - f5r * w3r + f5i * w3i;
+ t1i = f1i - f5r * w3i - f5i * w3r;
+ f1r = f1r * Two - t1r;
+ f1i = f1i * Two - t1i;
+
+ f5r = f3r - f7r * w3i - f7i * w3r;
+ f5i = f3i + f7r * w3r - f7i * w3i;
+ f7r = f3r * Two - f5r;
+ f7i = f3i * Two - f5i;
+
+ *(p1r + pos) = t1r;
+ *p3r = f5r;
+ *(p1r + posi) = t1i;
+ *(p3r + 1) = f5i;
+ w3r = *(u2r+U2toU3);
+ w3i = *(u2i-U2toU3);
+ *p1r = f1r;
+ *(p3r + pos) = f7r;
+ *(p1r + 1) = f1i;
+ *(p3r + posi) = f7i;
+
+ p1r = pstrt + pinc;
+ p3r = p2r + pinc;
+ }
+ NSameU /= 8;
+ Uinc /= 8;
+ Uinc2 /= 8;
+ Uinc4 = Uinc * 4;
+ NDiffU *= 8;
+ pinc *= 8;
+ pnext *= 8;
+ pos *= 8;
+ posi = pos + 1;
+}
+}
+
+void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt);
+void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){
+/* recursive bfstages calls to maximize on chip cache efficiency */
+long i1;
+if (M <= MCACHE)
+ ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */
+else{
+ for (i1=0; i1<8; i1++){
+ ifftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */
+ }
+ ibfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */
+}
+}
+
+void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
+/* Compute in-place inverse complex fft on the rows of the input array */
+/* INPUTS */
+/* *ioptr = input data array */
+/* M = log2 of fft size */
+/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
+/* *Utbl = cosine table */
+/* *BRLow = bit reversed counter table */
+/* OUTPUTS */
+/* *ioptr = output data array */
+
+long StageCnt;
+long NDiffU;
+const float scale = 1.0/POW2(M);
+
+switch (M){
+case 0:
+ break;
+case 1:
+ for (;Rows>0;Rows--){
+ ifft2pt(ioptr, scale); /* a 2 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+case 2:
+ for (;Rows>0;Rows--){
+ ifft4pt(ioptr, scale); /* a 4 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+case 3:
+ for (;Rows>0;Rows--){
+ ifft8pt(ioptr, scale); /* an 8 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+default:
+ for (;Rows>0;Rows--){
+
+ scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */
+
+ StageCnt = (M-1) / 3; // number of radix 8 stages
+ NDiffU = 2; // one radix 2 stage already complete
+
+ if ( (M-1-(StageCnt * 3)) == 1 ){
+ ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
+ NDiffU *= 2;
+ }
+
+ if ( (M-1-(StageCnt * 3)) == 2 ){
+ ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
+ NDiffU *= 4;
+ }
+
+ if (M <= MCACHE)
+ ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
+
+ else{
+ ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */
+ }
+
+ ioptr += 2*POW2(M);
+ }
+}
+}
+
+/************************************************
+parts of rffts1
+*************************************************/
+
+inline void rfft1pt(float *ioptr);
+inline void rfft1pt(float *ioptr){
+/*** RADIX 2 rfft ***/
+float f0r, f0i;
+float t0r, t0i;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+
+ /* finish rfft */
+t0r = f0r + f0i;
+t0i = f0r - f0i;
+
+ /* store result */
+ioptr[0] = t0r;
+ioptr[1] = t0i;
+}
+
+inline void rfft2pt(float *ioptr);
+inline void rfft2pt(float *ioptr){
+/*** RADIX 4 rfft ***/
+float f0r, f0i, f1r, f1i;
+float t0r, t0i;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[2];
+f1i = ioptr[3];
+
+ /* Butterflys */
+ /*
+ f0 - - t0
+ f1 - 1 - f1
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f1i - f0i;
+ /* finish rfft */
+f0r = t0r + t0i;
+f0i = t0r - t0i;
+
+ /* store result */
+ioptr[0] = f0r;
+ioptr[1] = f0i;
+ioptr[2] = f1r;
+ioptr[3] = f1i;
+}
+
+inline void rfft4pt(float *ioptr);
+inline void rfft4pt(float *ioptr){
+/*** RADIX 8 rfft ***/
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float t0r, t0i, t1r, t1i;
+float w0r = 1.0/MYROOT2; /* cos(pi/4) */
+const float Two = 2.0;
+const float scale = 0.5;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[4];
+f1i = ioptr[5];
+f2r = ioptr[2];
+f2i = ioptr[3];
+f3r = ioptr[6];
+f3i = ioptr[7];
+
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0
+ f1 - 1 - f1 - - f1
+ f2 - - f2 - 1 - f2
+ f3 - 1 - t1 - -i - f3
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f0i - f1i;
+
+t1r = f2r - f3r;
+t1i = f2i - f3i;
+f2r = f2r + f3r;
+f2i = f2i + f3i;
+
+f0r = t0r + f2r;
+f0i = t0i + f2i;
+f2r = t0r - f2r;
+f2i = f2i - t0i; // neg for rfft
+
+f3r = f1r - t1i;
+f3i = f1i + t1r;
+f1r = f1r + t1i;
+f1i = f1i - t1r;
+
+ /* finish rfft */
+t0r = f0r + f0i; /* compute Re(x[0]) */
+t0i = f0r - f0i; /* compute Re(x[N/2]) */
+
+t1r = f1r + f3r;
+t1i = f1i - f3i;
+f0r = f1i + f3i;
+f0i = f3r - f1r;
+
+f1r = t1r + w0r * f0r + w0r * f0i;
+f1i = t1i - w0r * f0r + w0r * f0i;
+f3r = Two * t1r - f1r;
+f3i = f1i - Two * t1i;
+
+ /* store result */
+ioptr[4] = f2r;
+ioptr[5] = f2i;
+ioptr[0] = t0r;
+ioptr[1] = t0i;
+
+ioptr[2] = scale*f1r;
+ioptr[3] = scale*f1i;
+ioptr[6] = scale*f3r;
+ioptr[7] = scale*f3i;
+}
+
+inline void rfft8pt(float *ioptr);
+inline void rfft8pt(float *ioptr){
+/*** RADIX 16 rfft ***/
+float w0r = 1.0/MYROOT2; /* cos(pi/4) */
+float w1r = MYCOSPID8; /* cos(pi/8) */
+float w1i = MYSINPID8; /* sin(pi/8) */
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+float t0r, t0i, t1r, t1i;
+const float Two = 2.0;
+const float scale = 0.5;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+f1r = ioptr[8];
+f1i = ioptr[9];
+f2r = ioptr[4];
+f2i = ioptr[5];
+f3r = ioptr[12];
+f3i = ioptr[13];
+f4r = ioptr[2];
+f4i = ioptr[3];
+f5r = ioptr[10];
+f5i = ioptr[11];
+f6r = ioptr[6];
+f6i = ioptr[7];
+f7r = ioptr[14];
+f7i = ioptr[15];
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0 - - f0
+ f1 - 1 - f1 - - f1 - - f1
+ f2 - - f2 - 1 - f2 - - f2
+ f3 - 1 - t1 - -i - f3 - - f3
+ f4 - - t0 - - f4 - 1 - t0
+ f5 - 1 - f5 - - f5 - w3 - f4
+ f6 - - f6 - 1 - f6 - -i - t1
+ f7 - 1 - t1 - -i - f7 - iw3- f6
+ */
+
+t0r = f0r + f1r;
+t0i = f0i + f1i;
+f1r = f0r - f1r;
+f1i = f0i - f1i;
+
+t1r = f2r - f3r;
+t1i = f2i - f3i;
+f2r = f2r + f3r;
+f2i = f2i + f3i;
+
+f0r = t0r + f2r;
+f0i = t0i + f2i;
+f2r = t0r - f2r;
+f2i = t0i - f2i;
+
+f3r = f1r - t1i;
+f3i = f1i + t1r;
+f1r = f1r + t1i;
+f1i = f1i - t1r;
+
+
+t0r = f4r + f5r;
+t0i = f4i + f5i;
+f5r = f4r - f5r;
+f5i = f4i - f5i;
+
+t1r = f6r - f7r;
+t1i = f6i - f7i;
+f6r = f6r + f7r;
+f6i = f6i + f7i;
+
+f4r = t0r + f6r;
+f4i = t0i + f6i;
+f6r = t0r - f6r;
+f6i = t0i - f6i;
+
+f7r = f5r - t1i;
+f7i = f5i + t1r;
+f5r = f5r + t1i;
+f5i = f5i - t1r;
+
+
+t0r = f0r - f4r;
+t0i = f4i - f0i; // neg for rfft
+f0r = f0r + f4r;
+f0i = f0i + f4i;
+
+t1r = f2r - f6i;
+t1i = f2i + f6r;
+f2r = f2r + f6i;
+f2i = f2i - f6r;
+
+f4r = f1r - f5r * w0r - f5i * w0r;
+f4i = f1i + f5r * w0r - f5i * w0r;
+f1r = f1r * Two - f4r;
+f1i = f1i * Two - f4i;
+
+f6r = f3r + f7r * w0r - f7i * w0r;
+f6i = f3i + f7r * w0r + f7i * w0r;
+f3r = f3r * Two - f6r;
+f3i = f3i * Two - f6i;
+
+ /* finish rfft */
+f5r = f0r + f0i; /* compute Re(x[0]) */
+f5i = f0r - f0i; /* compute Re(x[N/2]) */
+
+f0r = f2r + t1r;
+f0i = f2i - t1i;
+f7r = f2i + t1i;
+f7i = t1r - f2r;
+
+f2r = f0r + w0r * f7r + w0r * f7i;
+f2i = f0i - w0r * f7r + w0r * f7i;
+t1r = Two * f0r - f2r;
+t1i = f2i - Two * f0i;
+
+
+f0r = f1r + f6r;
+f0i = f1i - f6i;
+f7r = f1i + f6i;
+f7i = f6r - f1r;
+
+f1r = f0r + w1r * f7r + w1i * f7i;
+f1i = f0i - w1i * f7r + w1r * f7i;
+f6r = Two * f0r - f1r;
+f6i = f1i - Two * f0i;
+
+f0r = f3r + f4r;
+f0i = f3i - f4i;
+f7r = f3i + f4i;
+f7i = f4r - f3r;
+
+f3r = f0r + w1i * f7r + w1r * f7i;
+f3i = f0i - w1r * f7r + w1i * f7i;
+f4r = Two * f0r - f3r;
+f4i = f3i - Two * f0i;
+
+ /* store result */
+ioptr[8] = t0r;
+ioptr[9] = t0i;
+ioptr[0] = f5r;
+ioptr[1] = f5i;
+
+ioptr[4] = scale*f2r;
+ioptr[5] = scale*f2i;
+ioptr[12] = scale*t1r;
+ioptr[13] = scale*t1i;
+
+ioptr[2] = scale*f1r;
+ioptr[3] = scale*f1i;
+ioptr[6] = scale*f3r;
+ioptr[7] = scale*f3i;
+ioptr[10] = scale*f4r;
+ioptr[11] = scale*f4i;
+ioptr[14] = scale*f6r;
+ioptr[15] = scale*f6i;
+}
+
+inline void frstage(float *ioptr, long M, float *Utbl);
+inline void frstage(float *ioptr, long M, float *Utbl){
+/* Finish RFFT */
+
+unsigned long pos;
+unsigned long posi;
+unsigned long diffUcnt;
+
+float *p0r, *p1r;
+float *u0r, *u0i;
+
+float w0r, w0i;
+float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
+float t0r, t0i, t1r, t1i;
+const float Two = 2.0;
+
+pos = POW2(M-1);
+posi = pos + 1;
+
+p0r = ioptr;
+p1r = ioptr + pos/2;
+
+u0r = Utbl + POW2(M-3);
+
+w0r = *u0r,
+
+f0r = *(p0r);
+f0i = *(p0r + 1);
+f4r = *(p0r + pos);
+f4i = *(p0r + posi);
+f1r = *(p1r);
+f1i = *(p1r + 1);
+f5r = *(p1r + pos);
+f5i = *(p1r + posi);
+
+ t0r = Two * f0r + Two * f0i; /* compute Re(x[0]) */
+ t0i = Two * f0r - Two * f0i; /* compute Re(x[N/2]) */
+ t1r = f4r + f4r;
+ t1i = -f4i - f4i;
+
+ f0r = f1r + f5r;
+ f0i = f1i - f5i;
+ f4r = f1i + f5i;
+ f4i = f5r - f1r;
+
+ f1r = f0r + w0r * f4r + w0r * f4i;
+ f1i = f0i - w0r * f4r + w0r * f4i;
+ f5r = Two * f0r - f1r;
+ f5i = f1i - Two * f0i;
+
+*(p0r) = t0r;
+*(p0r + 1) = t0i;
+*(p0r + pos) = t1r;
+*(p0r + posi) = t1i;
+*(p1r) = f1r;
+*(p1r + 1) = f1i;
+*(p1r + pos) = f5r;
+*(p1r + posi) = f5i;
+
+u0r = Utbl + 1;
+u0i = Utbl + (POW2(M-2)-1);
+
+w0r = *u0r,
+w0i = *u0i;
+
+p0r = (ioptr + 2);
+p1r = (ioptr + (POW2(M-2)-1)*2);
+
+ /* Butterflys */
+ /*
+ f0 - t0 - - f0
+ f5 - t1 - w0 - f5
+
+ f1 - t0 - - f1
+ f4 - t1 -iw0 - f4
+ */
+
+for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){
+
+ f0r = *(p0r);
+ f0i = *(p0r + 1);
+ f5r = *(p1r + pos);
+ f5i = *(p1r + posi);
+ f1r = *(p1r);
+ f1i = *(p1r + 1);
+ f4r = *(p0r + pos);
+ f4i = *(p0r + posi);
+
+ t0r = f0r + f5r;
+ t0i = f0i - f5i;
+ t1r = f0i + f5i;
+ t1i = f5r - f0r;
+
+ f0r = t0r + w0r * t1r + w0i * t1i;
+ f0i = t0i - w0i * t1r + w0r * t1i;
+ f5r = Two * t0r - f0r;
+ f5i = f0i - Two * t0i;
+
+ t0r = f1r + f4r;
+ t0i = f1i - f4i;
+ t1r = f1i + f4i;
+ t1i = f4r - f1r;
+
+ f1r = t0r + w0i * t1r + w0r * t1i;
+ f1i = t0i - w0r * t1r + w0i * t1i;
+ f4r = Two * t0r - f1r;
+ f4i = f1i - Two * t0i;
+
+ *(p0r) = f0r;
+ *(p0r + 1) = f0i;
+ *(p1r + pos) = f5r;
+ *(p1r + posi) = f5i;
+
+ w0r = *++u0r;
+ w0i = *--u0i;
+
+ *(p1r) = f1r;
+ *(p1r + 1) = f1i;
+ *(p0r + pos) = f4r;
+ *(p0r + posi) = f4i;
+
+ p0r += 2;
+ p1r -= 2;
+};
+}
+
+void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
+/* Compute in-place real fft on the rows of the input array */
+/* The result is the complex spectra of the positive frequencies */
+/* except the location for the first complex number contains the real */
+/* values for DC and Nyquest */
+/* INPUTS */
+/* *ioptr = real input data array */
+/* M = log2 of fft size */
+/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
+/* *Utbl = cosine table */
+/* *BRLow = bit reversed counter table */
+/* OUTPUTS */
+/* *ioptr = output data array in the following order */
+/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
+
+float scale;
+long StageCnt;
+long NDiffU;
+
+M=M-1;
+switch (M){
+case -1:
+ break;
+case 0:
+ for (;Rows>0;Rows--){
+ rfft1pt(ioptr); /* a 2 pt fft */
+ ioptr += 2*POW2(M);
+ }
+case 1:
+ for (;Rows>0;Rows--){
+ rfft2pt(ioptr); /* a 4 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+case 2:
+ for (;Rows>0;Rows--){
+ rfft4pt(ioptr); /* an 8 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+case 3:
+ for (;Rows>0;Rows--){
+ rfft8pt(ioptr); /* a 16 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+default:
+ scale = 0.5;
+ for (;Rows>0;Rows--){
+
+ scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */
+
+ StageCnt = (M-1) / 3; // number of radix 8 stages
+ NDiffU = 2; // one radix 2 stage already complete
+
+ if ( (M-1-(StageCnt * 3)) == 1 ){
+ bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
+ NDiffU *= 2;
+ }
+
+ if ( (M-1-(StageCnt * 3)) == 2 ){
+ bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
+ NDiffU *= 4;
+ }
+
+ if (M <= MCACHE){
+ bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
+ frstage(ioptr, M+1, Utbl);
+ }
+
+ else{
+ fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
+ frstage(ioptr, M+1, Utbl);
+ }
+
+ ioptr += 2*POW2(M);
+ }
+}
+}
+
+/************************************************
+parts of riffts1
+*************************************************/
+
+inline void rifft1pt(float *ioptr, float scale);
+inline void rifft1pt(float *ioptr, float scale){
+/*** RADIX 2 rifft ***/
+float f0r, f0i;
+float t0r, t0i;
+
+ /* bit reversed load */
+f0r = ioptr[0];
+f0i = ioptr[1];
+
+ /* finish rfft */
+t0r = f0r + f0i;
+t0i = f0r - f0i;
+
+ /* store result */
+ioptr[0] = scale*t0r;
+ioptr[1] = scale*t0i;
+}
+
+inline void rifft2pt(float *ioptr, float scale);
+inline void rifft2pt(float *ioptr, float scale){
+/*** RADIX 4 rifft ***/
+float f0r, f0i, f1r, f1i;
+float t0r, t0i;
+const float Two = 2.0;
+
+ /* bit reversed load */
+t0r = ioptr[0];
+t0i = ioptr[1];
+f1r = Two * ioptr[2];
+f1i = Two * ioptr[3];
+
+ /* start rifft */
+f0r = t0r + t0i;
+f0i = t0r - t0i;
+ /* Butterflys */
+ /*
+ f0 - - t0
+ f1 - 1 - f1
+ */
+
+t0r = f0r + f1r;
+t0i = f0i - f1i;
+f1r = f0r - f1r;
+f1i = f0i + f1i;
+
+ /* store result */
+ioptr[0] = scale*t0r;
+ioptr[1] = scale*t0i;
+ioptr[2] = scale*f1r;
+ioptr[3] = scale*f1i;
+}
+
+inline void rifft4pt(float *ioptr, float scale);
+inline void rifft4pt(float *ioptr, float scale){
+/*** RADIX 8 rifft ***/
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float t0r, t0i, t1r, t1i;
+float w0r = 1.0/MYROOT2; /* cos(pi/4) */
+const float Two = 2.0;
+
+ /* bit reversed load */
+t0r = ioptr[0];
+t0i = ioptr[1];
+f2r = ioptr[2];
+f2i = ioptr[3];
+f1r = Two * ioptr[4];
+f1i = Two * ioptr[5];
+f3r = ioptr[6];
+f3i = ioptr[7];
+
+ /* start rfft */
+f0r = t0r + t0i; /* compute Re(x[0]) */
+f0i = t0r - t0i; /* compute Re(x[N/2]) */
+
+t1r = f2r + f3r;
+t1i = f2i - f3i;
+t0r = f2r - f3r;
+t0i = f2i + f3i;
+
+f2r = t1r - w0r * t0r - w0r * t0i;
+f2i = t1i + w0r * t0r - w0r * t0i;
+f3r = Two * t1r - f2r;
+f3i = f2i - Two * t1i;
+
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0
+ f1 - 1 - f1 - - f1
+ f2 - - f2 - 1 - f2
+ f3 - 1 - t1 - i - f3
+ */
+
+t0r = f0r + f1r;
+t0i = f0i - f1i;
+f1r = f0r - f1r;
+f1i = f0i + f1i;
+
+t1r = f2r - f3r;
+t1i = f2i - f3i;
+f2r = f2r + f3r;
+f2i = f2i + f3i;
+
+f0r = t0r + f2r;
+f0i = t0i + f2i;
+f2r = t0r - f2r;
+f2i = t0i - f2i;
+
+f3r = f1r + t1i;
+f3i = f1i - t1r;
+f1r = f1r - t1i;
+f1i = f1i + t1r;
+
+ /* store result */
+ioptr[0] = scale*f0r;
+ioptr[1] = scale*f0i;
+ioptr[2] = scale*f1r;
+ioptr[3] = scale*f1i;
+ioptr[4] = scale*f2r;
+ioptr[5] = scale*f2i;
+ioptr[6] = scale*f3r;
+ioptr[7] = scale*f3i;
+}
+
+inline void rifft8pt(float *ioptr, float scale);
+inline void rifft8pt(float *ioptr, float scale){
+/*** RADIX 16 rifft ***/
+float w0r = 1.0/MYROOT2; /* cos(pi/4) */
+float w1r = MYCOSPID8; /* cos(pi/8) */
+float w1i = MYSINPID8; /* sin(pi/8) */
+float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
+float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
+float t0r, t0i, t1r, t1i;
+const float Two = 2.0;
+
+ /* bit reversed load */
+t0r = ioptr[0];
+t0i = ioptr[1];
+f4r = ioptr[2];
+f4i = ioptr[3];
+f2r = ioptr[4];
+f2i = ioptr[5];
+f6r = ioptr[6];
+f6i = ioptr[7];
+f1r = Two * ioptr[8];
+f1i = Two * ioptr[9];
+f5r = ioptr[10];
+f5i = ioptr[11];
+f3r = ioptr[12];
+f3i = ioptr[13];
+f7r = ioptr[14];
+f7i = ioptr[15];
+
+
+ /* start rfft */
+f0r = t0r + t0i; /* compute Re(x[0]) */
+f0i = t0r - t0i; /* compute Re(x[N/2]) */
+
+t0r = f2r + f3r;
+t0i = f2i - f3i;
+t1r = f2r - f3r;
+t1i = f2i + f3i;
+
+f2r = t0r - w0r * t1r - w0r * t1i;
+f2i = t0i + w0r * t1r - w0r * t1i;
+f3r = Two * t0r - f2r;
+f3i = f2i - Two * t0i;
+
+t0r = f4r + f7r;
+t0i = f4i - f7i;
+t1r = f4r - f7r;
+t1i = f4i + f7i;
+
+f4r = t0r - w1i * t1r - w1r * t1i;
+f4i = t0i + w1r * t1r - w1i * t1i;
+f7r = Two * t0r - f4r;
+f7i = f4i - Two * t0i;
+
+t0r = f6r + f5r;
+t0i = f6i - f5i;
+t1r = f6r - f5r;
+t1i = f6i + f5i;
+
+f6r = t0r - w1r * t1r - w1i * t1i;
+f6i = t0i + w1i * t1r - w1r * t1i;
+f5r = Two * t0r - f6r;
+f5i = f6i - Two * t0i;
+
+ /* Butterflys */
+ /*
+ f0 - - t0 - - f0 - - f0
+ f1* - 1 - f1 - - f1 - - f1
+ f2 - - f2 - 1 - f2 - - f2
+ f3 - 1 - t1 - i - f3 - - f3
+ f4 - - t0 - - f4 - 1 - t0
+ f5 - 1 - f5 - - f5 - w3 - f4
+ f6 - - f6 - 1 - f6 - i - t1
+ f7 - 1 - t1 - i - f7 - iw3- f6
+ */
+
+t0r = f0r + f1r;
+t0i = f0i - f1i;
+f1r = f0r - f1r;
+f1i = f0i + f1i;
+
+t1r = f2r - f3r;
+t1i = f2i - f3i;
+f2r = f2r + f3r;
+f2i = f2i + f3i;
+
+f0r = t0r + f2r;
+f0i = t0i + f2i;
+f2r = t0r - f2r;
+f2i = t0i - f2i;
+
+f3r = f1r + t1i;
+f3i = f1i - t1r;
+f1r = f1r - t1i;
+f1i = f1i + t1r;
+
+
+t0r = f4r + f5r;
+t0i = f4i + f5i;
+f5r = f4r - f5r;
+f5i = f4i - f5i;
+
+t1r = f6r - f7r;
+t1i = f6i - f7i;
+f6r = f6r + f7r;
+f6i = f6i + f7i;
+
+f4r = t0r + f6r;
+f4i = t0i + f6i;
+f6r = t0r - f6r;
+f6i = t0i - f6i;
+
+f7r = f5r + t1i;
+f7i = f5i - t1r;
+f5r = f5r - t1i;
+f5i = f5i + t1r;
+
+
+t0r = f0r - f4r;
+t0i = f0i - f4i;
+f0r = f0r + f4r;
+f0i = f0i + f4i;
+
+t1r = f2r + f6i;
+t1i = f2i - f6r;
+f2r = f2r - f6i;
+f2i = f2i + f6r;
+
+f4r = f1r - f5r * w0r + f5i * w0r;
+f4i = f1i - f5r * w0r - f5i * w0r;
+f1r = f1r * Two - f4r;
+f1i = f1i * Two - f4i;
+
+f6r = f3r + f7r * w0r + f7i * w0r;
+f6i = f3i - f7r * w0r + f7i * w0r;
+f3r = f3r * Two - f6r;
+f3i = f3i * Two - f6i;
+
+ /* store result */
+ioptr[0] = scale*f0r;
+ioptr[1] = scale*f0i;
+ioptr[2] = scale*f1r;
+ioptr[3] = scale*f1i;
+ioptr[4] = scale*f2r;
+ioptr[5] = scale*f2i;
+ioptr[6] = scale*f3r;
+ioptr[7] = scale*f3i;
+ioptr[8] = scale*t0r;
+ioptr[9] = scale*t0i;
+ioptr[10] = scale*f4r;
+ioptr[11] = scale*f4i;
+ioptr[12] = scale*t1r;
+ioptr[13] = scale*t1i;
+ioptr[14] = scale*f6r;
+ioptr[15] = scale*f6i;
+}
+
+inline void ifrstage(float *ioptr, long M, float *Utbl);
+inline void ifrstage(float *ioptr, long M, float *Utbl){
+/* Start RIFFT */
+
+unsigned long pos;
+unsigned long posi;
+unsigned long diffUcnt;
+
+float *p0r, *p1r;
+float *u0r, *u0i;
+
+float w0r, w0i;
+float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
+float t0r, t0i, t1r, t1i;
+const float Two = 2.0;
+
+pos = POW2(M-1);
+posi = pos + 1;
+
+p0r = ioptr;
+p1r = ioptr + pos/2;
+
+u0r = Utbl + POW2(M-3);
+
+w0r = *u0r,
+
+f0r = *(p0r);
+f0i = *(p0r + 1);
+f4r = *(p0r + pos);
+f4i = *(p0r + posi);
+f1r = *(p1r);
+f1i = *(p1r + 1);
+f5r = *(p1r + pos);
+f5i = *(p1r + posi);
+
+ t0r = f0r + f0i;
+ t0i = f0r - f0i;
+ t1r = f4r + f4r;
+ t1i = -f4i - f4i;
+
+ f0r = f1r + f5r;
+ f0i = f1i - f5i;
+ f4r = f1r - f5r;
+ f4i = f1i + f5i;
+
+ f1r = f0r - w0r * f4r - w0r * f4i;
+ f1i = f0i + w0r * f4r - w0r * f4i;
+ f5r = Two * f0r - f1r;
+ f5i = f1i - Two * f0i;
+
+*(p0r) = t0r;
+*(p0r + 1) = t0i;
+*(p0r + pos) = t1r;
+*(p0r + posi) = t1i;
+*(p1r) = f1r;
+*(p1r + 1) = f1i;
+*(p1r + pos) = f5r;
+*(p1r + posi) = f5i;
+
+u0r = Utbl + 1;
+u0i = Utbl + (POW2(M-2)-1);
+
+w0r = *u0r,
+w0i = *u0i;
+
+p0r = (ioptr + 2);
+p1r = (ioptr + (POW2(M-2)-1)*2);
+
+ /* Butterflys */
+ /*
+ f0 - t0 - f0
+ f1 - t1 -w0- f1
+
+ f2 - t0 - f2
+ f3 - t1 -iw0- f3
+ */
+
+for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){
+
+ f0r = *(p0r);
+ f0i = *(p0r + 1);
+ f5r = *(p1r + pos);
+ f5i = *(p1r + posi);
+ f1r = *(p1r);
+ f1i = *(p1r + 1);
+ f4r = *(p0r + pos);
+ f4i = *(p0r + posi);
+
+ t0r = f0r + f5r;
+ t0i = f0i - f5i;
+ t1r = f0r - f5r;
+ t1i = f0i + f5i;
+
+ f0r = t0r - w0i * t1r - w0r * t1i;
+ f0i = t0i + w0r * t1r - w0i * t1i;
+ f5r = Two * t0r - f0r;
+ f5i = f0i - Two * t0i;
+
+ t0r = f1r + f4r;
+ t0i = f1i - f4i;
+ t1r = f1r - f4r;
+ t1i = f1i + f4i;
+
+ f1r = t0r - w0r * t1r - w0i * t1i;
+ f1i = t0i + w0i * t1r - w0r * t1i;
+ f4r = Two * t0r - f1r;
+ f4i = f1i - Two * t0i;
+
+ *(p0r) = f0r;
+ *(p0r + 1) = f0i;
+ *(p1r + pos) = f5r;
+ *(p1r + posi) = f5i;
+
+ w0r = *++u0r;
+ w0i = *--u0i;
+
+ *(p1r) = f1r;
+ *(p1r + 1) = f1i;
+ *(p0r + pos) = f4r;
+ *(p0r + posi) = f4i;
+
+ p0r += 2;
+ p1r -= 2;
+};
+}
+
+void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){
+/* Compute in-place real ifft on the rows of the input array */
+/* data order as from rffts1 */
+/* INPUTS */
+/* *ioptr = input data array in the following order */
+/* M = log2 of fft size */
+/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
+/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
+/* *Utbl = cosine table */
+/* *BRLow = bit reversed counter table */
+/* OUTPUTS */
+/* *ioptr = real output data array */
+
+float scale;
+long StageCnt;
+long NDiffU;
+
+scale = 1.0/POW2(M);
+M=M-1;
+switch (M){
+case -1:
+ break;
+case 0:
+ for (;Rows>0;Rows--){
+ rifft1pt(ioptr, scale); /* a 2 pt fft */
+ ioptr += 2*POW2(M);
+ }
+case 1:
+ for (;Rows>0;Rows--){
+ rifft2pt(ioptr, scale); /* a 4 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+case 2:
+ for (;Rows>0;Rows--){
+ rifft4pt(ioptr, scale); /* an 8 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+case 3:
+ for (;Rows>0;Rows--){
+ rifft8pt(ioptr, scale); /* a 16 pt fft */
+ ioptr += 2*POW2(M);
+ }
+ break;
+default:
+ for (;Rows>0;Rows--){
+
+ ifrstage(ioptr, M+1, Utbl);
+
+ scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */
+
+ StageCnt = (M-1) / 3; // number of radix 8 stages
+ NDiffU = 2; // one radix 2 stage already complete
+
+ if ( (M-1-(StageCnt * 3)) == 1 ){
+ ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */
+ NDiffU *= 2;
+ }
+
+ if ( (M-1-(StageCnt * 3)) == 2 ){
+ ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */
+ NDiffU *= 4;
+ }
+
+ if (M <= MCACHE){
+ ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
+ }
+
+ else{
+ ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */
+ }
+
+ ioptr += 2*POW2(M);
+ }
+}
+}
+
diff --git a/ffts/src/fftlib.h b/ffts/src/fftlib.h
new file mode 100644
index 0000000..86de4ac
--- /dev/null
+++ b/ffts/src/fftlib.h
@@ -0,0 +1,76 @@
+#define MYRECIPLN2 1.442695040888963407359924681001892137426 // 1.0/log(2)
+
+/* some useful conversions between a number and its power of 2 */
+#define LOG2(a) (MYRECIPLN2*log(a)) // floating point logarithm base 2
+#define POW2(m) ((unsigned long) 1 << (m)) // integer power of 2 for m<32
+
+/*******************************************************************
+lower level fft stuff called by routines in fftext.c and fft2d.c
+*******************************************************************/
+
+void fftCosInit(long M, float *Utbl);
+/* Compute Utbl, the cosine table for ffts */
+/* of size (pow(2,M)/4 +1) */
+/* INPUTS */
+/* M = log2 of fft size */
+/* OUTPUTS */
+/* *Utbl = cosine table */
+
+void fftBRInit(long M, short *BRLow);
+/* Compute BRLow, the bit reversed table for ffts */
+/* of size pow(2,M/2 -1) */
+/* INPUTS */
+/* M = log2 of fft size */
+/* OUTPUTS */
+/* *BRLow = bit reversed counter table */
+
+void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow);
+/* Compute in-place complex fft on the rows of the input array */
+/* INPUTS */
+/* *ioptr = input data array */
+/* M = log2 of fft size (ex M=10 for 1024 point fft) */
+/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
+/* *Utbl = cosine table */
+/* *BRLow = bit reversed counter table */
+/* OUTPUTS */
+/* *ioptr = output data array */
+
+void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow);
+/* Compute in-place inverse complex fft on the rows of the input array */
+/* INPUTS */
+/* *ioptr = input data array */
+/* M = log2 of fft size */
+/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
+/* *Utbl = cosine table */
+/* *BRLow = bit reversed counter table */
+/* OUTPUTS */
+/* *ioptr = output data array */
+
+void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow);
+/* Compute in-place real fft on the rows of the input array */
+/* The result is the complex spectra of the positive frequencies */
+/* except the location for the first complex number contains the real */
+/* values for DC and Nyquest */
+/* INPUTS */
+/* *ioptr = real input data array */
+/* M = log2 of fft size */
+/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
+/* *Utbl = cosine table */
+/* *BRLow = bit reversed counter table */
+/* OUTPUTS */
+/* *ioptr = output data array in the following order */
+/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
+
+
+void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow);
+/* Compute in-place real ifft on the rows of the input array */
+/* data order as from rffts1 */
+/* INPUTS */
+/* *ioptr = input data array in the following order */
+/* M = log2 of fft size */
+/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */
+/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */
+/* *Utbl = cosine table */
+/* *BRLow = bit reversed counter table */
+/* OUTPUTS */
+/* *ioptr = real output data array */
diff --git a/ffts/src/files b/ffts/src/files
new file mode 100644
index 0000000..2833fee
--- /dev/null
+++ b/ffts/src/files
@@ -0,0 +1,22 @@
+tr "\r" "\n" < dxpose.c > tmp.c
+mv tmp.c dxpose.c
+tr "\r" "\n" < dxpose.h > tmp.c
+mv tmp.c dxpose.h
+tr "\r" "\n" < fft2d.c > tmp.c
+mv tmp.c fft2d.c
+tr "\r" "\n" < fft2d.h > tmp.c
+mv tmp.c fft2d.h
+tr "\r" "\n" < fftext.c > tmp.c
+mv tmp.c fftext.c
+tr "\r" "\n" < fftext.h > tmp.c
+mv tmp.c fftext.h
+tr "\r" "\n" < fftlib.c > tmp.c
+mv tmp.c fftlib.c
+tr "\r" "\n" < fftlib.h > tmp.c
+mv tmp.c fftlib.h
+tr "\r" "\n" < files > tmp.c
+mv tmp.c n" < files
+tr "\r" "\n" < matlib.c > tmp.c
+mv tmp.c matlib.c
+tr "\r" "\n" < matlib.h > tmp.c
+mv tmp.c matlib.h
diff --git a/ffts/src/matlib.c b/ffts/src/matlib.c
new file mode 100644
index 0000000..4e8b682
--- /dev/null
+++ b/ffts/src/matlib.c
@@ -0,0 +1,297 @@
+/* a few routines from a vector/matrix library */
+#include "matlib.h"
+
+void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){
+/* not in-place matrix transpose */
+/* INPUTS */
+/* *indata = input data array */
+/* iRsiz = offset to between rows of input data array */
+/* oRsiz = offset to between rows of output data array */
+/* Nrows = number of rows in input data array */
+/* Ncols = number of columns in input data array */
+/* OUTPUTS */
+/* *outdata = output data array */
+
+float *irow; /* pointer to input row start */
+float *ocol; /* pointer to output col start */
+float *idata; /* pointer to input data */
+float *odata; /* pointer to output data */
+long RowCnt; /* row counter */
+long ColCnt; /* col counter */
+float T0; /* data storage */
+float T1; /* data storage */
+float T2; /* data storage */
+float T3; /* data storage */
+float T4; /* data storage */
+float T5; /* data storage */
+float T6; /* data storage */
+float T7; /* data storage */
+const long inRsizd1 = iRsiz;
+const long inRsizd2 = 2*iRsiz;
+const long inRsizd3 = inRsizd2+iRsiz;
+const long inRsizd4 = 4*iRsiz;
+const long inRsizd5 = inRsizd3+inRsizd2;
+const long inRsizd6 = inRsizd4+inRsizd2;
+const long inRsizd7 = inRsizd4+inRsizd3;
+const long inRsizd8 = 8*iRsiz;
+
+ocol = outdata;
+irow = indata;
+for (RowCnt=Nrows/8; RowCnt>0; RowCnt--){
+ idata = irow;
+ odata = ocol;
+ for (ColCnt=Ncols; ColCnt>0; ColCnt--){
+ T0 = *idata;
+ T1 = *(idata+inRsizd1);
+ T2 = *(idata+inRsizd2);
+ T3 = *(idata+inRsizd3);
+ T4 = *(idata+inRsizd4);
+ T5 = *(idata+inRsizd5);
+ T6 = *(idata+inRsizd6);
+ T7 = *(idata+inRsizd7);
+ *odata = T0;
+ *(odata+1) = T1;
+ *(odata+2) = T2;
+ *(odata+3) = T3;
+ *(odata+4) = T4;
+ *(odata+5) = T5;
+ *(odata+6) = T6;
+ *(odata+7) = T7;
+ idata++;
+ odata += oRsiz;
+ }
+ irow += inRsizd8;
+ ocol += 8;
+}
+if (Nrows%8 != 0){
+ for (ColCnt=Ncols; ColCnt>0; ColCnt--){
+ idata = irow++;
+ odata = ocol;
+ ocol += oRsiz;
+ for (RowCnt=Nrows%8; RowCnt>0; RowCnt--){
+ T0 = *idata;
+ *odata++ = T0;
+ idata += iRsiz;
+ }
+ }
+}
+}
+
+void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){
+/* not in-place complex float matrix transpose */
+/* INPUTS */
+/* *indata = input data array */
+/* iRsiz = offset to between rows of input data array */
+/* oRsiz = offset to between rows of output data array */
+/* Nrows = number of rows in input data array */
+/* Ncols = number of columns in input data array */
+/* OUTPUTS */
+/* *outdata = output data array */
+
+float *irow; /* pointer to input row start */
+float *ocol; /* pointer to output col start */
+float *idata; /* pointer to input data */
+float *odata; /* pointer to output data */
+long RowCnt; /* row counter */
+long ColCnt; /* col counter */
+float T0r; /* data storage */
+float T0i; /* data storage */
+float T1r; /* data storage */
+float T1i; /* data storage */
+float T2r; /* data storage */
+float T2i; /* data storage */
+float T3r; /* data storage */
+float T3i; /* data storage */
+const long inRsizd1 = 2*iRsiz;
+const long inRsizd1i = 2*iRsiz + 1;
+const long inRsizd2 = 4*iRsiz;
+const long inRsizd2i = 4*iRsiz + 1;
+const long inRsizd3 = inRsizd2+inRsizd1;
+const long inRsizd3i = inRsizd2+inRsizd1 + 1;
+const long inRsizd4 = 8*iRsiz;
+
+ocol = outdata;
+irow = indata;
+for (RowCnt=Nrows/4; RowCnt>0; RowCnt--){
+ idata = irow;
+ odata = ocol;
+ for (ColCnt=Ncols; ColCnt>0; ColCnt--){
+ T0r = *idata;
+ T0i = *(idata +1);
+ T1r = *(idata+inRsizd1);
+ T1i = *(idata+inRsizd1i);
+ T2r = *(idata+inRsizd2);
+ T2i = *(idata+inRsizd2i);
+ T3r = *(idata+inRsizd3);
+ T3i = *(idata+inRsizd3i);
+ *odata = T0r;
+ *(odata+1) = T0i;
+ *(odata+2) = T1r;
+ *(odata+3) = T1i;
+ *(odata+4) = T2r;
+ *(odata+5) = T2i;
+ *(odata+6) = T3r;
+ *(odata+7) = T3i;
+ idata+=2;
+ odata += 2*oRsiz;
+ }
+ irow += inRsizd4;
+ ocol += 8;
+}
+if (Nrows%4 != 0){
+ for (ColCnt=Ncols; ColCnt>0; ColCnt--){
+ idata = irow;
+ odata = ocol;
+ for (RowCnt=Nrows%4; RowCnt>0; RowCnt--){
+ T0r = *idata;
+ T0i = *(idata+1);
+ *odata = T0r;
+ *(odata+1) = T0i;
+ odata+=2;
+ idata += 2*iRsiz;
+ }
+ irow+=2;
+ ocol += 2*oRsiz;
+ }
+}
+}
+
+void cvprod(float *a, float *b, float *out, long N){
+/* complex vector product, can be in-place */
+/* product of complex vector *a times complex vector *b */
+/* INPUTS */
+/* N vector length */
+/* *a complex vector length N complex numbers */
+/* *b complex vector length N complex numbers */
+/* OUTPUTS */
+/* *out complex vector length N */
+
+long OutCnt; /* output counter */
+float A0R; /* A storage */
+float A0I; /* A storage */
+float A1R; /* A storage */
+float A1I; /* A storage */
+float A2R; /* A storage */
+float A2I; /* A storage */
+float A3R; /* A storage */
+float A3I; /* A storage */
+float B0R; /* B storage */
+float B0I; /* B storage */
+float B1R; /* B storage */
+float B1I; /* B storage */
+float B2R; /* B storage */
+float B2I; /* B storage */
+float B3R; /* B storage */
+float B3I; /* B storage */
+float T0R; /* TMP storage */
+float T0I; /* TMP storage */
+float T1R; /* TMP storage */
+float T1I; /* TMP storage */
+float T2R; /* TMP storage */
+float T2I; /* TMP storage */
+float T3R; /* TMP storage */
+float T3I; /* TMP storage */
+
+if (N>=4){
+ A0R = *a;
+ B0R = *b;
+ A0I = *(a +1);
+ B0I = *(b +1);
+ A1R = *(a +2);
+ B1R = *(b +2);
+ A1I = *(a +3);
+ B1I = *(b +3);
+ A2R = *(a +4);
+ B2R = *(b +4);
+ A2I = *(a +5);
+ B2I = *(b +5);
+ A3R = *(a +6);
+ B3R = *(b +6);
+ A3I = *(a +7);
+ B3I = *(b +7);
+ T0R = A0R * B0R;
+ T0I = (A0R * B0I);
+ T1R = A1R * B1R;
+ T1I = (A1R * B1I);
+ T2R = A2R * B2R;
+ T2I = (A2R * B2I);
+ T3R = A3R * B3R;
+ T3I = (A3R * B3I);
+ T0R -= (A0I * B0I);
+ T0I = A0I * B0R + T0I;
+ T1R -= (A1I * B1I);
+ T1I = A1I * B1R + T1I;
+ T2R -= (A2I * B2I);
+ T2I = A2I * B2R + T2I;
+ T3R -= (A3I * B3I);
+ T3I = A3I * B3R + T3I;
+ for (OutCnt=N/4-1; OutCnt > 0; OutCnt--){
+ a += 8;
+ b += 8;
+ A0R = *a;
+ B0R = *b;
+ A0I = *(a +1);
+ B0I = *(b +1);
+ A1R = *(a +2);
+ B1R = *(b +2);
+ A1I = *(a +3);
+ B1I = *(b +3);
+ A2R = *(a +4);
+ B2R = *(b +4);
+ A2I = *(a +5);
+ B2I = *(b +5);
+ A3R = *(a +6);
+ B3R = *(b +6);
+ A3I = *(a +7);
+ B3I = *(b +7);
+ *out = T0R;
+ *(out +1) = T0I;
+ *(out +2) = T1R;
+ *(out +3) = T1I;
+ *(out +4) = T2R;
+ *(out +5) = T2I;
+ *(out +6) = T3R;
+ *(out +7) = T3I;
+ T0R = A0R * B0R;
+ T0I = (A0R * B0I);
+ T1R = A1R * B1R;
+ T1I = (A1R * B1I);
+ T2R = A2R * B2R;
+ T2I = (A2R * B2I);
+ T3R = A3R * B3R;
+ T3I = (A3R * B3I);
+ T0R -= (A0I * B0I);
+ T0I = A0I * B0R + T0I;
+ T1R -= (A1I * B1I);
+ T1I = A1I * B1R + T1I;
+ T2R -= (A2I * B2I);
+ T2I = A2I * B2R + T2I;
+ T3R -= (A3I * B3I);
+ T3I = A3I * B3R + T3I;
+ out += 8;
+ }
+ a += 8;
+ b += 8;
+ *out = T0R;
+ *(out +1) = T0I;
+ *(out +2) = T1R;
+ *(out +3) = T1I;
+ *(out +4) = T2R;
+ *(out +5) = T2I;
+ *(out +6) = T3R;
+ *(out +7) = T3I;
+ out += 8;
+}
+for (OutCnt=N%4; OutCnt > 0; OutCnt--){
+ A0R = *a++;
+ B0R = *b++;
+ A0I = *a++;
+ B0I = *b++;
+ T0R = A0R * B0R;
+ T0I = (A0R * B0I);
+ T0R -= (A0I * B0I);
+ T0I = A0I * B0R + T0I;
+ *out++ = T0R;
+ *out++ = T0I;
+}
+}
diff --git a/ffts/src/matlib.h b/ffts/src/matlib.h
new file mode 100644
index 0000000..baf9e6e
--- /dev/null
+++ b/ffts/src/matlib.h
@@ -0,0 +1,33 @@
+/* a few routines from a vector/matrix library */
+
+void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols);
+/* not in-place matrix transpose */
+/* INPUTS */
+/* *indata = input data array */
+/* iRsiz = offset to between rows of input data array */
+/* oRsiz = offset to between rows of output data array */
+/* Nrows = number of rows in input data array */
+/* Ncols = number of columns in input data array */
+/* OUTPUTS */
+/* *outdata = output data array */
+
+void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols);
+/* not in-place complex matrix transpose */
+/* INPUTS */
+/* *indata = input data array */
+/* iRsiz = offset to between rows of input data array */
+/* oRsiz = offset to between rows of output data array */
+/* Nrows = number of rows in input data array */
+/* Ncols = number of columns in input data array */
+/* OUTPUTS */
+/* *outdata = output data array */
+
+void cvprod(float *a, float *b, float *out, long N);
+/* complex vector product, can be in-place */
+/* product of complex vector *a times complex vector *b */
+/* INPUTS */
+/* N vector length */
+/* *a complex vector length N complex numbers */
+/* *b complex vector length N complex numbers */
+/* OUTPUTS */
+/* *out complex vector length N */