diff options
author | Steve M. Robbins <smr@debian.org> | 2011-10-22 04:54:51 +0200 |
---|---|---|
committer | Steve M. Robbins <smr@debian.org> | 2011-10-22 04:54:51 +0200 |
commit | dd657ad3f1428b026486db3ec36691df17ddf515 (patch) | |
tree | 6ffb465595479fb5a76c1a6ea3ec992abaa8c1c1 /ffts |
Import nyquist_3.05.orig.tar.gz
[dgit import orig nyquist_3.05.orig.tar.gz]
Diffstat (limited to 'ffts')
26 files changed, 5856 insertions, 0 deletions
diff --git a/ffts/Matlab-testing/conv2dTest.c b/ffts/Matlab-testing/conv2dTest.c new file mode 100644 index 0000000..37213e6 --- /dev/null +++ b/ffts/Matlab-testing/conv2dTest.c @@ -0,0 +1,116 @@ +/* A program to test fast 2d real convolution */ + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" +#include "fft2d.h" + +#if macintosh +#include <timer.h> +#endif + +#define BIPRAND(a) (2.0/(RAND_MAX+1.0)*a-1.0) +//#define BIPRAND(a) round(100*(2.0/(RAND_MAX+1.0)*a-1.0)) + +void main(){ +long N = 256; /* the number of cols in 2d ffts, must be power of 2 */ +long N2 = 64; /* the number of rows in 2d ffts, must be power of 2 */ +long kernSize = 53; /* kernal cols must be less than N */ +long kernSize2 = 29; /* kernal rows must be less than N2*/ +long dataSize = N-kernSize+1; /* data cols */ +long dataSize2 = N2-kernSize2+1; /* data rows */ +float *a; +float *b; +long i1; +long i2; +long TheErr; +long M; +long M2; + +FILE *fdataout; /* output file */ + +unsigned int randseed = 777; +int rannum; +#if macintosh + UnsignedWide TheTime1; + Microseconds(&TheTime1); + randseed = TheTime1.lo; +#endif + +printf(" %6d Byte Floats \n", sizeof(a[0])); +printf(" randseed = %10u\n", randseed); + +srand(randseed); +M = roundtol(LOG2(N)); +N = POW2(M); +M2 = roundtol(LOG2(N2)); +N2 = POW2(M2); + +printf("fft size = %6d X%6d, ", N2, N); + +if ((dataSize <= 0)||(dataSize2 <= 0)) TheErr = 22; +else TheErr = 0; + +if(!TheErr){ + TheErr = fft2dInit(M2, M); +} + +a = (float *) calloc(N2*N,sizeof(float) ); // calloc to zero pad data to fill N to N2 +if (a == 0) TheErr = 2; +if(!TheErr){ + b = (float *) calloc(N2*N,sizeof(float) ); // calloc to zero pad data to fill N to N2 + if (b == 0) TheErr = 2; +} +if(!TheErr){ + fdataout = fopen("conv2ddat.c2d", "wb"); + if (fdataout == NULL) TheErr = -50; +} +if(!TheErr){ + + /* write sizes to fdataout */ + fwrite(&dataSize, sizeof(dataSize), 1, fdataout); + fwrite(&dataSize2, sizeof(dataSize2), 1, fdataout); + fwrite(&kernSize, sizeof(kernSize), 1, fdataout); + fwrite(&kernSize2, sizeof(kernSize2), 1, fdataout); + /* set up a simple test case and write to fdataout */ + for (i2=0; i2<dataSize2; i2++){ + for (i1=0; i1<dataSize; i1++){ + rannum = rand(); + a[i2*N+i1] = BIPRAND(rannum); + } + fwrite(&a[i2*N], dataSize*sizeof(float), 1, fdataout); + } + for (i2=0; i2<kernSize2; i2++){ + for (i1=0; i1<kernSize; i1++){ + rannum = rand(); + b[i2*N+i1] = BIPRAND(rannum); + } + fwrite(&b[i2*N], kernSize*sizeof(float), 1, fdataout); + } + + /* fast 2d convolution of zero padded sequences */ + rfft2d(a, M2, M); + rfft2d(b, M2, M); + rspect2dprod(a, b, a, N2, N); + rifft2d(a, M2, M); + + /* write out answer */ + fwrite(a, N2*N*sizeof(float), 1, fdataout); + + fclose(fdataout); + + free(b); + free(a); + fft2dFree(); +} +else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fft2dFree(); +} +printf(" Done. \n"); +return; +} diff --git a/ffts/Matlab-testing/conv2dtest.m b/ffts/Matlab-testing/conv2dtest.m new file mode 100644 index 0000000..5439ba1 --- /dev/null +++ b/ffts/Matlab-testing/conv2dtest.m @@ -0,0 +1,25 @@ +% program to test 2d real fast conv + +% let user select file then open it +[fname, pname] = uigetfile('*.c2d', 'select conv file'); +cd(pname); +fidout=fopen(fname,'r'); + +% read header info +aN=fread(fidout,1,'long'); +aM=fread(fidout,1,'long'); +bN=fread(fidout,1,'long'); +bM=fread(fidout,1,'long'); +% read in data +%status=fseek(fidout,Nheader,'bof'); +a=fread(fidout,aN*aM,'float'); +a=reshape(a,aN,aM); +b=fread(fidout,bN*bM,'float'); +b=reshape(b,bN,bM); +c=fread(fidout,(aN+bN-1)*(aM+bM-1),'float'); +c=reshape(c,(aN+bN-1),(aM+bM-1)); +fclose(fidout); + +c2=conv2(a,b); + +max(max(abs(c2-c))) diff --git a/ffts/Matlab-testing/convTest.c b/ffts/Matlab-testing/convTest.c new file mode 100644 index 0000000..ee34475 --- /dev/null +++ b/ffts/Matlab-testing/convTest.c @@ -0,0 +1,113 @@ +/* A program to test fast 1d real convolution */ + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" + +#if macintosh +#include <timer.h> +#endif + + +#define BIPRAND(a) (2.0/(RAND_MAX+1.0)*a-1.0) +//#define BIPRAND(a) round(100*(2.0/(RAND_MAX+1.0)*a-1.0)) + +void main(){ +const long N2 = 2; /* the number ffts to test */ +long N = 2048; /* size of FFTs, must be power of 2 */ +long kernSize = 1003; /* kernal size must be less than N */ +long dataSize = N-kernSize+1; /* data size */ +float *a; +float *b; +long i1; +long i2; +long TheErr; +long M; + +FILE *fdataout; /* output file */ + +unsigned int randseed = 777; +int rannum; +#if macintosh + UnsignedWide TheTime1; + Microseconds(&TheTime1); + randseed = TheTime1.lo; +#endif + +printf(" %6d Byte Floats \n", sizeof(a[0])); +printf(" randseed = %10u\n", randseed); + +srand(randseed); +M = roundtol(LOG2(N)); +N = POW2(M); + +printf("fft size = %6d, ", N); + +if (dataSize <= 0) TheErr = 22; +else TheErr = 0; + +if(!TheErr){ + TheErr = fftInit(M); +} + +a = (float *) calloc(N2*N,sizeof(float) ); // calloc to zero pad data to fill N +if (a == 0) TheErr = 2; +if(!TheErr){ + b = (float *) calloc(N2*N,sizeof(float) ); // calloc to zero pad data to fill N + if (b == 0) TheErr = 2; +} +if(!TheErr){ + fdataout = fopen("convdat.cnv", "wb"); + if (fdataout == NULL) TheErr = -50; +} +if(!TheErr){ + + /* write sizes to fdataout */ + fwrite(&dataSize, sizeof(dataSize), 1, fdataout); + fwrite(&kernSize, sizeof(kernSize), 1, fdataout); + fwrite(&N2, sizeof(N2), 1, fdataout); + /* set up a simple test case and write to fdataout */ + for (i2=0; i2<N2; i2++){ + for (i1=0; i1<dataSize; i1++){ + rannum = rand(); + a[i2*N+i1] = BIPRAND(rannum); + } + fwrite(&a[i2*N], dataSize*sizeof(float), 1, fdataout); + } + for (i2=0; i2<N2; i2++){ + for (i1=0; i1<kernSize; i1++){ + rannum = rand(); + b[i2*N+i1] = BIPRAND(rannum); + } + fwrite(&b[i2*N], kernSize*sizeof(float), 1, fdataout); + } + + + /* fast convolution of zero padded sequences */ + rffts(a, M, N2); + rffts(b, M, N2); + for (i2=0; i2<N2*N; i2+=N){ + rspectprod(&a[i2], &b[i2], &a[i2], N); + } + riffts(a, M, N2); + + /* write out answer */ + fwrite(a, N2*N*sizeof(float), 1, fdataout); + + fclose(fdataout); + + free(b); + free(a); + fftFree(); +} +else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fftFree(); +} +printf(" Done. \n"); +return; +} diff --git a/ffts/Matlab-testing/convtest.m b/ffts/Matlab-testing/convtest.m new file mode 100644 index 0000000..b899911 --- /dev/null +++ b/ffts/Matlab-testing/convtest.m @@ -0,0 +1,26 @@ +% program to test 1d real fast conv +clear c2 + +% let user select file then open it +[fname, pname] = uigetfile('*.cnv', 'select conv file'); +cd(pname); +fidout=fopen(fname,'r'); + +% read header info +aN=fread(fidout,1,'long'); +bN=fread(fidout,1,'long'); +M=fread(fidout,1,'long'); +% read in data +%status=fseek(fidout,Nheader,'bof'); +a=fread(fidout,aN*M,'float'); +a=reshape(a,aN,M); +b=fread(fidout,bN*M,'float'); +b=reshape(b,bN,M); +c=fread(fidout,(aN+bN-1)*M,'float'); +c=reshape(c,(aN+bN-1),M); +fclose(fidout); + +for i1=1:M; + c2(:,i1)=conv(a(:,i1),b(:,i1)); +end; +max(max(abs(c2-c))) diff --git a/ffts/Matlab-testing/rfft2dTestML.c b/ffts/Matlab-testing/rfft2dTestML.c new file mode 100644 index 0000000..2cb8d0f --- /dev/null +++ b/ffts/Matlab-testing/rfft2dTestML.c @@ -0,0 +1,102 @@ +/* A program to test real 2d forward and inverse fast fourier transform routines */ + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" +#include "fft2d.h" + +#if macintosh +#include <timer.h> +#endif + + +#define BIPRAND(a) (2.0/(RAND_MAX+1.0)*a-1.0) + +void main(){ +long N2 = 64; /* the number of rows in 2d ffts, must be power of 2 */ +long N = 256; /* the number of cols in 2d ffts, must be power of 2 */ +float *a; +float maxerrfft; +long i1; +long TheErr; +long M; +long M2; + +FILE *fdataout; /* output file */ + +unsigned int randseed = 777; +int rannum; +#if macintosh + UnsignedWide TheTime1; + Microseconds(&TheTime1); + randseed = TheTime1.lo; +#endif + +printf(" %6d Byte Floats \n", sizeof(a[0])); +printf(" randseed = %10u\n", randseed); + +srand(randseed); +M = roundtol(LOG2(N)); +N = POW2(M); +M2 = roundtol(LOG2(N2)); +N2 = POW2(M2); + +printf("fft size = %6d X%6d, ", N2, N); + +TheErr = 0; + +if(!TheErr){ + TheErr = fft2dInit(M2, M); +} + +a = (float *) malloc(N2*N*sizeof(float) ); +if (a == 0) TheErr = 2; +if(!TheErr){ + fdataout = fopen("fftdat.dr2", "wb"); + if (fdataout == NULL) TheErr = -50; +} +if(!TheErr){ + + /* write sizes to fdataout */ + fwrite(&N, sizeof(N), 1, fdataout); + fwrite(&N2, sizeof(N2), 1, fdataout); + /* set up a simple test case and write to fdataout */ + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + a[i1] = BIPRAND(rannum); + } + fwrite(a, N2*N*sizeof(float), 1, fdataout); + + /* real, 2d fast fourier transform */ + rfft2d(a, M2, M); + + /* write out answer */ + fwrite(a, N2*N*sizeof(float), 1, fdataout); + fclose(fdataout); + + /* compute and check inverse transform */ + rifft2d(a, M2, M); + + maxerrfft = 0; + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a[i1])); + } + + printf("maxerr rfft = %6.4e\n", maxerrfft); + + free(a); + fft2dFree(); +} +else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fft2dFree(); +} +printf(" Done. \n"); +return; +} diff --git a/ffts/Matlab-testing/rfft2dTestML.m b/ffts/Matlab-testing/rfft2dTestML.m new file mode 100644 index 0000000..03ba5a2 --- /dev/null +++ b/ffts/Matlab-testing/rfft2dTestML.m @@ -0,0 +1,39 @@ +% program to test 2d real fft + +% let user select file then open it +[fname, pname] = uigetfile('*.dr2', 'select conv file'); +cd(pname); +fidout=fopen(fname,'r'); + +% read header info +N=fread(fidout,1,'long'); +M=fread(fidout,1,'long'); +% read in data +%status=fseek(fidout,Nheader,'bof'); +a=fread(fidout,N*M,'float'); +a=reshape(a,N,M); +c=fread(fidout,N*M,'float'); +c=reshape(c,N,M); +c=c(1:2:N,:)+j*c(2:2:N,:); +fclose(fidout); + +c2=fft2(a); + +% Remember Matlab is column major order, C is row major order +% Matlab starts with index of 1, f and k start at 0 + +%%%% check the real transforms of DC and Nyquest frequencies +% check the four real values +maxerr=abs(real(c(1,1))-c2(1,1)); % 0 f, 0 k +maxerr=max(maxerr,abs(imag(c(1,1))-c2(1,M/2+1))); % 0 f, M/2 k +maxerr=max(maxerr,abs(real(c(1,M/2+1))-c2(N/2+1,1)));% N/2 f, 0 k +maxerr=max(maxerr,abs(imag(c(1,M/2+1))-c2(N/2+1,M/2+1)));% N/2 f, M/2 k +%check the rest of the pos wavenumbers of DC and Nyquest frequencies +maxerr=max(maxerr,max(abs(c(1,2:M/2)-c2(1,2:M/2)))); %f = 0, k=1 to M/2-1 +maxerr=max(maxerr,max(abs(c(1,M/2+2:M)-c2(N/2+1,2:M/2))));%f = N/2, k=1 to M/2-1 +%%%% + +% check all the other positive frequencies at all wavenumbers + % f from 1 to N/2-1, k from 0 to M-1 (wraps around through negative k) +maxerr=max(maxerr,max(max(abs(c(2:N/2,:)-c2(2:N/2,:))))) + diff --git a/ffts/Numerical-Recipes-testing/fftTest.c b/ffts/Numerical-Recipes-testing/fftTest.c new file mode 100644 index 0000000..0cdca37 --- /dev/null +++ b/ffts/Numerical-Recipes-testing/fftTest.c @@ -0,0 +1,121 @@ +/* A program to test complex forward and inverse fast fourier transform routines */ + +#include <NR.H> /* uses four1 from numerical recipes in C to verify iffts */ + /*change fmin in numerical recipes to fminnr to avoid conflict with fp.h */ +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" + + +#if macintosh +#include <timer.h> +#endif + +#define NSIZES 24 /* the number of different fft sizes to test */ + +#define BIPRAND(a) (2.0/(RAND_MAX+1.0)*a-1.0) + +typedef struct{ + float Re; + float Im; + } Complex; + +void main(){ +long fftSize[NSIZES] = /* size of FFTs, must be powers of 2 */ + {2, 4, 8, 16, 32, 64, 128, 256, + 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216}; +Complex *a1; +const long N2 = 2; /* the number ffts to test at each size */ +long isize; +long i1; +long TheErr; +long N; +long M; +float maxerrifft; +float maxerrfft; + +unsigned int randseed = 777; +int rannum; +#if macintosh + UnsignedWide TheTime1; + Microseconds(&TheTime1); + randseed = TheTime1.lo; +#endif + +printf(" %6d Byte Floats \n", sizeof(a1[0].Re)); +printf(" randseed = %10u\n", randseed); +for (isize = 0; isize < NSIZES; isize++){ + + srand(randseed); + N = fftSize[isize]; + printf("ffts size = %8d, ", N); + M = roundtol(LOG2(N)); + + TheErr = fftInit(M); + + if(!TheErr){ + a1 = (Complex *) malloc( N2*N*sizeof(Complex) ); + if (a1 == 0) TheErr = 2; + } + + if(!TheErr){ + + /* set up a1 simple test case */ + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + a1[i1].Re = BIPRAND(rannum); + rannum = rand(); + a1[i1].Im = BIPRAND(rannum); + } + + /* first use four1 from numerical recipes in C to verify iffts */ + /* Note their inverse fft is really the conventional forward fft */ + for (i1=0; i1<N2; i1++){ + four1((float *)(a1+i1*N)-1, N, -1); + } + iffts((float *)a1, M, N2); + + maxerrifft = 0; + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a1[i1].Re)); + a1[i1].Re = BIPRAND(rannum); + rannum = rand(); + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a1[i1].Im)); + a1[i1].Im = BIPRAND(rannum); + } + + printf("maxerrifft = %6.4e, ", maxerrifft); + + /* now use iffts to verify ffts */ + iffts((float *)a1, M, N2); + ffts((float *)a1, M, N2); + + maxerrfft = 0; + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a1[i1].Re)); + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a1[i1].Im)); + } + + printf("maxerrfft = %6.4e\n", maxerrfft); + + free(a1); + fftFree(); + } + else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fftFree(); + } +} +printf(" Done. \n"); +return; +} diff --git a/ffts/Numerical-Recipes-testing/fftTest2d.c b/ffts/Numerical-Recipes-testing/fftTest2d.c new file mode 100644 index 0000000..8d22bdd --- /dev/null +++ b/ffts/Numerical-Recipes-testing/fftTest2d.c @@ -0,0 +1,130 @@ +/* A program to test 2d complex forward and inverse fast fourier transform routines */ + +#include <NR.H> /* uses fourn from numerical recipes in C to verify ifft2d */ + /*change fmin in numerical recipes to fminnr to avoid conflict with fp.h */ + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" +#include "fft2d.h" + +#if macintosh +#include <timer.h> +#endif + +#define NSIZES 24 /* the number of different ffts col sizes to test */ + +#define BIPRAND(a) (2.0/(RAND_MAX+1.0)*a-1.0) +//#define BIPRAND(a) round(100*(2.0/(RAND_MAX+1.0)*a-1.0)) +typedef struct{ + float Re; + float Im; + } Complex; + +void main(){ +long fftSize[NSIZES] = /* size of FFTs cols, must be powers of 2 */ + {2, 4, 8, 16, 32, 64, 128, 256, + 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216}; +Complex *a1; +long N2 = 64; /* the number of rows in the 2d fft */ +long isize; +long i1; +long TheErr; +long N; +long M; +long M2; +float maxerrifft; +float maxerrfft; +unsigned long nn[2]; + +unsigned int randseed = 777; +int rannum; +#if macintosh + UnsignedWide TheTime1; + Microseconds(&TheTime1); + randseed = TheTime1.lo; +#endif + +printf(" %6d Byte Floats \n", sizeof(a1[0].Re)); +printf(" randseed = %10u\n", randseed); +for (isize = 0; isize < NSIZES; isize++){ + + srand(randseed); + N = fftSize[isize]; + M = roundtol(LOG2(N)); + N = POW2(M); + M2 = roundtol(LOG2(N2)); + N2 = POW2(M2); + + printf("ffts size = %6d X%6d, ", N2, N); + + nn[0] = N2; + nn[1] = N; + + TheErr = fft2dInit(M2, M); + + if(!TheErr){ + a1 = (Complex *) malloc(N2*N*sizeof(Complex) ); + if (a1 == 0) TheErr = 2; + } + + if(!TheErr){ + + /* set up a simple test case */ + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + a1[i1].Re = BIPRAND(rannum); + rannum = rand(); + a1[i1].Im = BIPRAND(rannum); + } + + /* first use fourn from numerical recipes in C to verify ifft2d */ + /* Note their inverse fft is really the conventional forward fft */ + fourn((float *)a1-1, nn-1, 2, -1); + + ifft2d((float *)a1, M2, M); + + maxerrifft = 0; + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a1[i1].Re)); + a1[i1].Re = BIPRAND(rannum); + rannum = rand(); + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a1[i1].Im)); + a1[i1].Im = BIPRAND(rannum); + } + + printf("maxerrifft = %6.4e, ", maxerrifft); + + /* now use iffts to verify ffts */ + ifft2d((float *)a1, M2, M); + fft2d((float *)a1, M2, M); + + maxerrfft = 0; + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a1[i1].Re)); + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a1[i1].Im)); + } + + printf("maxerrfft = %6.4e\n", maxerrfft); + + free(a1); + fft2dFree(); + } + else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fft2dFree(); + } +} +printf(" Done. \n"); +return; +} diff --git a/ffts/Numerical-Recipes-testing/fftTest3d.c b/ffts/Numerical-Recipes-testing/fftTest3d.c new file mode 100644 index 0000000..0624ebe --- /dev/null +++ b/ffts/Numerical-Recipes-testing/fftTest3d.c @@ -0,0 +1,135 @@ +/* A program to test 3d complex forward and inverse fast fourier transform routines */ + +#include <NR.H> /* uses fourn from numerical recipes in C to verify ifft3d */ + /*change fmin in numerical recipes to fminnr to avoid conflict with fp.h */ + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" +#include "fft2d.h" + +#if macintosh +#include <timer.h> +#endif + +#define NSIZES 24 /* the number of different ffts col sizes to test */ + +#define BIPRAND(a) (2.0/(RAND_MAX+1.0)*a-1.0) + +typedef struct{ + float Re; + float Im; + } Complex; + +void main(){ +long fftSize[NSIZES] = /* size of FFTs cols, must be powers of 2 */ + {2, 4, 8, 16, 32, 64, 128, 256, + 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216}; +Complex *a1; +long isize; +long i1; +long TheErr; +long N; +long M; +long N2 = 16; /* the number of rows in the 3d fft */ +long M2; +long N3 = 32; /* the number of pages in the 3d fft */ +long M3; +float maxerrifft; +float maxerrfft; +unsigned long nn[3]; + +unsigned int randseed = 777; +int rannum; +#if macintosh + UnsignedWide TheTime1; + Microseconds(&TheTime1); + randseed = TheTime1.lo; +#endif + +printf(" %6d Byte Floats \n", sizeof(a1[0].Re)); +printf(" randseed = %10u\n", randseed); +for (isize = 0; isize < NSIZES; isize++){ + + srand(randseed); + N = fftSize[isize]; + M = roundtol(LOG2(N)); + N = POW2(M); + M2 = roundtol(LOG2(N2)); + N2 = POW2(M2); + M3 = roundtol(LOG2(N3)); + N3 = POW2(M3); + + printf("ffts size = %5d X%5d X%6d, ", N3, N2, N); + + nn[0] = N3; + nn[1] = N2; + nn[2] = N; + + TheErr = fft3dInit(M3, M2, M); + + if(!TheErr){ + a1 = (Complex *) malloc(N3*N2*N*sizeof(Complex) ); + if (a1 == 0) TheErr = 2; + } + + if(!TheErr){ + + /* set up a simple test case */ + for (i1=0; i1<N3*N2*N; i1++){ + rannum = rand(); + a1[i1].Re = BIPRAND(rannum); + rannum = rand(); + a1[i1].Im = BIPRAND(rannum); + } + + /* first use fourn from numerical recipes in C to verify ifft3d */ + /* Note their inverse fft is really the conventional forward fft */ + fourn((float *)a1-1, nn-1, 3, -1); + + ifft3d((float *)a1, M3, M2, M); + + maxerrifft = 0; + srand(randseed); + for (i1=0; i1<N3*N2*N; i1++){ + rannum = rand(); + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a1[i1].Re)); + a1[i1].Re = BIPRAND(rannum); + rannum = rand(); + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a1[i1].Im)); + a1[i1].Im = BIPRAND(rannum); + } + + printf("errifft = %4.3e, ", maxerrifft); + + /* now use iffts to verify ffts */ + ifft3d((float *)a1, M3, M2, M); + fft3d((float *)a1, M3, M2, M); + + maxerrfft = 0; + srand(randseed); + for (i1=0; i1<N3*N2*N; i1++){ + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a1[i1].Re)); + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a1[i1].Im)); + } + + printf("errfft = %4.3e\n", maxerrfft); + + free(a1); + fft3dFree(); + } + else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fft3dFree(); + } +} +printf(" Done. \n"); +return; +} diff --git a/ffts/Numerical-Recipes-testing/rfftTest.c b/ffts/Numerical-Recipes-testing/rfftTest.c new file mode 100644 index 0000000..b5e6c92 --- /dev/null +++ b/ffts/Numerical-Recipes-testing/rfftTest.c @@ -0,0 +1,121 @@ +/* A program to test real forward and inverse fast fourier transform routines */ + +#include <NR.H> /* uses realft from numerical recipes in C to verify riffts */ + /*change fmin in numerical recipes to fminnr to avoid conflict with fp.h */ + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" + +#if macintosh +#include <timer.h> +#endif + +#define NSIZES 24 /* the number of different fft sizes to test */ + +#define BIPRAND(a) (2.0/(RAND_MAX+1.0)*a-1.0) + +void main(){ +long fftSize[NSIZES] = /* size of FFTs, must be powers of 2 */ + {2, 4, 8, 16, 32, 64, 128, 256, + 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216}; +float *a; +const long N2 = 2; /* the number ffts to test at each size */ +long isize; +long i1; +long i2; +long TheErr; +long N; +long M; +float maxerrifft; +float maxerrfft; + +unsigned int randseed = 777; +int rannum; +#if macintosh + UnsignedWide TheTime1; + Microseconds(&TheTime1); + randseed = TheTime1.lo; +#endif + +printf(" %6d Byte Floats \n", sizeof(a[0])); +printf(" randseed = %10u\n", randseed); +for (isize = 0; isize < NSIZES; isize++){ + + srand(randseed); + N = fftSize[isize]; + printf("rffts size = %8d, ", N); + M = roundtol(LOG2(N)); + + TheErr = 0; + TheErr = fftInit(M); + + if(!TheErr){ + a = (float *) malloc(N2*N*sizeof(float) ); + if (a == 0) TheErr = 2; + } + + if(!TheErr){ + + /* set up a simple test case */ + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + a[i1] = BIPRAND(rannum); + } + + /* first use realft from numerical recipes in C to verify riffts */ + /* unfortunately numerical recipes in C uses backwards time */ + /* forward fft, so our answer comes out time reversed */ + for (i2=0; i2<N2; i2++){ + realft((a+i2*N)-1, N, 1); + } + riffts(a, M, N2); + + srand(randseed); + for (i2=0; i2<N2; i2++){ + rannum = rand(); + maxerrifft = fabs(BIPRAND(rannum)-a[i2*N]); + for (i1=1; i1<N; i1++){ + rannum = rand(); + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a[i2*N+N-i1])); + } + } + + printf("maxerrifft = %6.4e, ", maxerrifft); + + /* now use iffts to verify ffts */ + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + a[i1] = BIPRAND(rannum); + } + + riffts(a, M, N2); + rffts(a, M, N2); + + maxerrfft = 0; + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a[i1])); + } + + printf("maxerrfft = %6.4e\n", maxerrfft); + + fftFree(); + free(a); + a = 0; + } + else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fftFree(); + } +} +printf(" Done. \n"); +return; +} diff --git a/ffts/Numerical-Recipes-testing/rfftTest2d.c b/ffts/Numerical-Recipes-testing/rfftTest2d.c new file mode 100644 index 0000000..123746c --- /dev/null +++ b/ffts/Numerical-Recipes-testing/rfftTest2d.c @@ -0,0 +1,158 @@ +/* A program to test real 2d forward and inverse fast fourier transform routines */ + +#include <NR.H> /* uses rlft3 from numerical recipes in C to verify rifft2d */ + /*change fmin in numerical recipes to fminnr to avoid conflict with fp.h */ + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" +#include "fft2d.h" +#include <NRUTIL.H> // uses ugly tensors from numerical recipes; so can call rlft3 + +#if macintosh +#include <timer.h> +#endif + +#define NSIZES 24 /* the number of different ffts sizes to test */ + +#define BIPRAND(a) (2.0/(RAND_MAX+1.0)*a-1.0) + +void main(){ +long fftSize[NSIZES] = /* size of FFTs, must be powers of 2 */ + {2, 4, 8, 16, 32, 64, 128, 256, + 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, + 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216}; +float *a; +long N2 = 64; /* the number of rows in 2d ffts, must be power of 2 */ +long isize; +long i1; +long i2; +long TheErr; +long N; +long M; +long M2; +float maxerrifft; +float maxerrfft; + +float ***NRtensdata; /* needed for rlft3 */ +float **NRmatdata; /* needed for rlft3 */ +float *specdata; /* needed for rlft3 */ +float t1,t2; + +unsigned int randseed = 777; +int rannum; +#if macintosh + UnsignedWide TheTime1; + Microseconds(&TheTime1); + randseed = TheTime1.lo; +#endif + +printf(" %6d Byte Floats \n", sizeof(a[0])); +printf(" randseed = %10u\n", randseed); +for (isize = 0; isize < NSIZES; isize++){ + + srand(randseed); + N = fftSize[isize]; + M = roundtol(LOG2(N)); + N = POW2(M); + M2 = roundtol(LOG2(N2)); + N2 = POW2(M2); + + printf("rffts size = %6d X%6d, ", N2, N); + + TheErr = 0; + TheErr = fft2dInit(M2, M); + + if(!TheErr){ + NRmatdata=matrix(1,1,1,2*N2); + specdata = &NRmatdata[1][1]; + NRtensdata=f3tensor(1,1,1,N2,1,N); // uses ugly tensors from NRUTIL; so can call rlft3 + a = &NRtensdata[1][1][1]; + if ((a == 0)||(specdata == 0)) TheErr = 2; + } + + if(!TheErr){ + + /* set up a simple test case */ + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + a[i1] = BIPRAND(rannum); + } + + /* first use rlft3 from numerical recipes in C to verify rifft2d */ + /* unfortunately numerical recipes in C uses backwards time and space */ + /* forward fft, so our answer comes out time and space reversed */ + + rlft3(NRtensdata, NRmatdata, 1, N2, N, 1); + + /* move data to my in-place order */ + a[1] = a[N2/2*N]; // pack in nyquest wavenumber point for DC freq transform + for (i2=0; i2<N2/2; i2++){ // move transform of nyquest frequency + a[(N2/2+i2)*N] = specdata[i2*2]; + a[(N2/2+i2)*N+1] = specdata[i2*2+1]; + } + a[N2/2*N+1] = specdata[N2]; // pack in nyquest wavenumber point for nyquest freq transform + + + rifft2d(a, M2, M); + + srand(randseed); + rannum = rand(); + maxerrifft = fabs(BIPRAND(rannum)-a[0]); + for (i1=1; i1<N; i1++){ + rannum = rand(); + t1=BIPRAND(rannum); + t2=a[N-i1]; + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a[N-i1])); + } + for (i2=1; i2<N2; i2++){ + rannum = rand(); + t1=BIPRAND(rannum); + t2=a[(N2-i2)*N]; + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a[(N2-i2)*N])); + for (i1=1; i1<N; i1++){ + rannum = rand(); + t1=BIPRAND(rannum); + t2=a[(N2-i2)*N+N-i1]; + maxerrifft = fmax(maxerrifft, fabs(BIPRAND(rannum)-a[(N2-i2)*N+N-i1])); + } + } + + printf("maxerrifft = %6.4e, ", maxerrifft); + + /* now use rifft2d to verify rfft2d */ + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + a[i1] = BIPRAND(rannum); + } + + rifft2d(a, M2, M); + rfft2d(a, M2, M); + + maxerrfft = 0; + srand(randseed); + for (i1=0; i1<N2*N; i1++){ + rannum = rand(); + maxerrfft = fmax(maxerrfft, fabs(BIPRAND(rannum)-a[i1])); + } + + printf("maxerrfft = %6.4e\n", maxerrfft); + + fft2dFree(); + free_f3tensor(NRtensdata,1,1,1,N2,1,N); + free_matrix(NRmatdata,1,1,1,2*N2); + a = 0; + } + else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fft2dFree(); + } +} +printf(" Done. \n"); +return; +} diff --git a/ffts/README.txt b/ffts/README.txt new file mode 100644 index 0000000..d3d8192 --- /dev/null +++ b/ffts/README.txt @@ -0,0 +1,70 @@ +This directory contains a public domain FFT library which was optimized +for speed on RISC processors such as the PowerPC. All ffts +use single precision floats, for double precision just use a +global search and replace to change float to double in all +source files. +Codewarrier Pro 1.0 project files are also supplied. + +** Warning ** Perform rigorous testing to +your own standards before using this code. + + (John Green) green_jt@vsdec.npt.nuwc.navy.mil + +files: + fftTiming +Application to time complex ffts + + rfftTiming +Application to time real ffts + +// Directory: fft libraries + +files: + + fftext.c +Library of in-place fast fourier transforms. Contains forward +and inverse complex and real transforms. The real fft's expect the +frequency domain data to have the real part of the fsamp/2 bin (which +has a 0 imaginary part) to be stored in the location for the imaginary +part of the DC bin (the DC bin of real data is also strictly real.) +You must first call an initialization routine fftInit before calling +the fft computation routines ffts, iffts, rffts and riffts. +The init routines malloc the memory to store the cosine and +bit reversed counter tables as well as initializing their values. + + fftlib.c +Lower level library of in-place fast fourier transforms. Same as fftext.c but you +need to manage the mallocs for the cosine and bit reversed tables yourself. + + + fft2d.c +Library of 2d and 3d complex and 2d real in-place fast fourier transforms. +The init routine fft2dInit must be called before using the 2d routines and +fft3dInit must be called before using the 3d routines. These init routines +will also call the appropriate 1d init routines in fftext.c + + matlib.c +Matrix transpose routines used by fft2d.c and complex vector multiply +for forming the product of two spectra. + + dxpose.c +Double precision matrix transpose for quick single precision complex transposing + +// Directory: timing code +This directory contains the source to fftTiming and rfftTiming + +// Directory: Numerical Recipes testing +This directory contains files used to test the various fft routines using +the Numerical Recipes in C routines as a baseline. These routines can be purchased +in PeeCee (after expanding you can move them to a Mac) format from: +http://cfata2.harvard.edu/numerical-recipes/ +Unfortunately Numerical Recipes defines its forward and inverse fft's backwards. +For complex fft's I just use their inverse fft as a forward one, but for real ffts +their forward fft followed by my inverse fft reverses the data. They also have ugly matrix +and tensor data types and start their indices with one, Fortran style, but these are +minor annoyances. + +// Directory: Matlab testing +This directory contains files to test fast 1d and 2d convolution with Matlab used to +verify the results. An example of using Matlab to test the fft library routines is +also given for the 2d real fft. diff --git a/ffts/Timing-code/fftTiming.c b/ffts/Timing-code/fftTiming.c new file mode 100644 index 0000000..ffa4bcb --- /dev/null +++ b/ffts/Timing-code/fftTiming.c @@ -0,0 +1,98 @@ +/* A program to time complex forward and inverse fast fourier transform routines */ +void four1(float data[], unsigned long nn, int isign); + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" + +#if macintosh +#include <timer.h> +#endif + +#define NSIZES 3 /* the number of different ffts sizes to time */ + +typedef struct{ + float Re; + float Im; + } Complex; + +void main(){ +long fftSize[NSIZES] = {1024, 16384, 262144}; /* size of FFTs, must be powers of 2 */ +long fftRepeats[NSIZES] = {2000, 50, 1}; /* number of timing loops */ +Complex *a; +long isize; +long i1; +long TheErr; +long N; +long M; + +#if macintosh +UnsignedWide TheTime1; +UnsignedWide TheTime2; +double TheTime; +#endif + +printf(" %6d Byte Floats \n", sizeof(a[0].Re)); +for (isize = 0; isize < NSIZES; isize++){ + + N = fftSize[isize]; + printf("ffts size = %7d, ", N); + M = roundtol(LOG2(N)); + + TheErr = fftInit(M); + + if(!TheErr){ + a = (Complex *) malloc(N*sizeof(Complex) ); + if (a == 0) TheErr = 2; + } + + if(!TheErr){ + + /* set up a simple test case */ + for (i1=0; i1<N; i1++){ + a[i1].Re = sqrt(i1+.77777); + a[i1].Im = sqrt(i1+.22222); + } + + #if macintosh + /* make sure routines are in physical (not virtual) memory */ + Microseconds(&TheTime1); + ffts((float *)a, M, 1); + iffts((float *)a, M, 1); + + Microseconds(&TheTime1); + + for (i1=0;i1<fftRepeats[isize];i1++){ /* do many times for timing */ + ffts((float *)a, M, 1); + iffts((float *)a, M, 1); + } + + Microseconds(&TheTime2); + + TheTime = (double)(TheTime2.hi - TheTime1.hi) * 65536.0 * 65536.0; + TheTime = (TheTime + (double)(TheTime2.lo - TheTime1.lo)); + printf("ffts time = %12.1f µs, a[0].Re=%6e\n", TheTime/fftRepeats[isize]/2, a[0].Re); + + #else + printf("start timing %12d real ffts's\n", fftRepeats[isize]*2); + for (i1=0;i1<fftRepeats[isize];i1++){ /* do many times for timing */ + ffts((float *)a, M, 1); + iffts((float *)a, M, 1); + } + printf("end timing \n"); + #endif + free(a); + fftFree(); + } + else{ + if(TheErr==2) printf(" out of memory \n"); + else printf(" error \n"); + fftFree(); + } +} +printf(" Done. \n"); +return; +} diff --git a/ffts/Timing-code/rfftTiming.c b/ffts/Timing-code/rfftTiming.c new file mode 100644 index 0000000..ac8498a --- /dev/null +++ b/ffts/Timing-code/rfftTiming.c @@ -0,0 +1,90 @@ +/* A program to time real input fast fourier transform routine */ + +#include <stdio.h> +#include <stdlib.h> +#include <fp.h> +#include <math.h> +#include "fftlib.h" +#include "fftext.h" + +#if macintosh +#include <timer.h> +#endif + +#define NSIZES 3 /* the number of different fft sizes to time */ + +void main(){ +float *a; +long fftSize[NSIZES] = {2048, 32768, 524288}; /* size of FFTs, must be powers of 2 */ +long fftRepeats[NSIZES] = {2000, 50, 1}; /* number of timing loops */ +long isize; +long i1; +long TheErr; +long N; +long M; + +#if macintosh +UnsignedWide TheTime1; +UnsignedWide TheTime2; +double TheTime; +#endif + +printf(" %6d Byte Floats \n", sizeof(a[0])); +for (isize = 0; isize < NSIZES; isize++){ + + N = fftSize[isize]; + printf("rffts size = %9d ", N); + M = roundtol(LOG2(N)); + TheErr = fftInit(M); + + if(!TheErr){ + a = (float *) malloc(N*sizeof(float)); + if (a == 0) TheErr = 2; + } + + if(!TheErr){ + /* set up a simple test case */ + for (i1=0; i1<N; i1++){ + a[i1] = sqrt(i1+.77777); + } + + #if macintosh + /* make sure routines are in physical (not virtual) memory */ + Microseconds(&TheTime1); + rffts(a, M, 1); + riffts(a, M, 1); + + Microseconds(&TheTime1); + + for (i1 = 0; i1 < fftRepeats[isize]; i1++){ /* do many times for timing */ + rffts(a, M, 1); + riffts(a, M, 1); + } + + Microseconds(&TheTime2); + + + TheTime = (double)(TheTime2.hi - TheTime1.hi) * 65536.0 * 65536.0; + TheTime = (TheTime + (double)(TheTime2.lo - TheTime1.lo)); + printf("Ave of rffts & riffts Times = %14.1f µs.\n", TheTime/fftRepeats[isize]/2); + + #else + printf("start timing %12d real fft's\n", fftRepeats[isize]*2); + for (i1=0;i1<fftRepeats[isize];i1++){ /* do many times for timing */ + rffts(a, M, 1); + riffts(a, M, 1); + } + printf("end timing \n"); + #endif + free (a); + fftFree(); + } + else{ + if(TheErr==2) printf(" out of memory "); + else printf(" error "); + fftFree(); + } +} +printf(" Done. \n"); +return; +} diff --git a/ffts/abstract b/ffts/abstract new file mode 100644 index 0000000..45f3aa8 --- /dev/null +++ b/ffts/abstract @@ -0,0 +1,37 @@ +Subject: FFT for RISC 2.0 +To: macgifts@sumex-aim.stanford.edu +Enclosure: FFTs-for-RISC-2.sit + +Enclosed is a stuffit archive of version 2.0 of my 'C' source code fft library. + + Very-Fast Fourier Transform routines. Routines are provided for real and complex +forward and inverse 1d and 2d fourier transforms and 3d complex forward and inverse ffts. +I coded these to optimize execution speed on Risc processors like the PowerPC. +All fft sizes must still be a power of two. +Test programs based on the Numerical Recipes in C routines are provided. +Also included are some simple applications with source code which time the FFTs. +See the enclosed read me file for more information. + +Revision version 2.0: + Rewrote code to rely more on compiler optimization (and be less ugly.) + Removed restrictions on too small or too large ffts. + Provided a library extension that manages memory for cosine and bit +reversed counter tables. + Added 2d and 3d complex and 2d real ffts. + Speeded routines for data too large to fit in primary cache. + Changed most testing from Matlab to Numerical Recipes based (because its cheaper.) + Changed call parameters (watch out.) +Revision version 1.21: + line 126 of rfftTest.c corrected. +Revisions version 1.2: + I now store the Nyquest point of the real transform where the 0 for the DC term's +imaginary part used to be. !! WATCH OUT FOR THIS IF YOU USE rfft !! + Added the real inverse Fourier transform. + +Revisions version 1.1: + Re-arranged to put fft routines in a shared library and changed source file name to fftlib.c. + Removed some ugly optimizations that are no longer needed for CodeWarrier. + +This code is public domain, do anything you want to with it. + +[Moderators- This file should replace ffts-for-risc-121-c.hqx and can be included on any CD] diff --git a/ffts/src/dxpose.c b/ffts/src/dxpose.c new file mode 100644 index 0000000..9ee8836 --- /dev/null +++ b/ffts/src/dxpose.c @@ -0,0 +1,79 @@ +/********************* +This matrix transpose is in a seperate file because it should always be double precision. +*********************/ +#include "dxpose.h" +void dxpose(xdouble *indata, long iRsiz, xdouble *outdata, long oRsiz, long Nrows, long Ncols){ +/* not in-place double precision matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +xdouble *irow; /* pointer to input row start */ +xdouble *ocol; /* pointer to output col start */ +xdouble *idata; /* pointer to input data */ +xdouble *odata; /* pointer to output data */ +long RowCnt; /* row counter */ +long ColCnt; /* col counter */ +xdouble T0; /* data storage */ +xdouble T1; /* data storage */ +xdouble T2; /* data storage */ +xdouble T3; /* data storage */ +xdouble T4; /* data storage */ +xdouble T5; /* data storage */ +xdouble T6; /* data storage */ +xdouble T7; /* data storage */ +const long inRsizd1 = iRsiz; +const long inRsizd2 = 2*iRsiz; +const long inRsizd3 = inRsizd2+iRsiz; +const long inRsizd4 = 4*iRsiz; +const long inRsizd5 = inRsizd3+inRsizd2; +const long inRsizd6 = inRsizd4+inRsizd2; +const long inRsizd7 = inRsizd4+inRsizd3; +const long inRsizd8 = 8*iRsiz; + +ocol = outdata; +irow = indata; +for (RowCnt=Nrows/8; RowCnt>0; RowCnt--){ + idata = irow; + odata = ocol; + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + T0 = *idata; + T1 = *(idata+inRsizd1); + T2 = *(idata+inRsizd2); + T3 = *(idata+inRsizd3); + T4 = *(idata+inRsizd4); + T5 = *(idata+inRsizd5); + T6 = *(idata+inRsizd6); + T7 = *(idata+inRsizd7); + *odata = T0; + *(odata+1) = T1; + *(odata+2) = T2; + *(odata+3) = T3; + *(odata+4) = T4; + *(odata+5) = T5; + *(odata+6) = T6; + *(odata+7) = T7; + idata++; + odata += oRsiz; + } + irow += inRsizd8; + ocol += 8; +} +if (Nrows%8 != 0){ + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + idata = irow++; + odata = ocol; + ocol += oRsiz; + for (RowCnt=Nrows%8; RowCnt>0; RowCnt--){ + T0 = *idata; + *odata++ = T0; + idata += iRsiz; + } + } +} +} diff --git a/ffts/src/dxpose.h b/ffts/src/dxpose.h new file mode 100644 index 0000000..730f81f --- /dev/null +++ b/ffts/src/dxpose.h @@ -0,0 +1,16 @@ +/********************* +This matrix transpose is in a seperate file because it should always be double precision. +*********************/ +typedef double_t xdouble; // I use double_t so that global search and replace on double won't + // change this to float accidentally. + +void dxpose(xdouble *indata, long iRsiz, xdouble *outdata, long oRsiz, long Nrows, long Ncols); +/* not in-place double precision matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ diff --git a/ffts/src/fft2d.c b/ffts/src/fft2d.c new file mode 100644 index 0000000..0d8c3d8 --- /dev/null +++ b/ffts/src/fft2d.c @@ -0,0 +1,402 @@ +/******************************************************************* + This file extends the fftlib with 2d and 3d complex fft's and + 2d real fft's. All fft's return results in-place. Temporary buffers + for transposing columns are maintained privately via calls to + fft2dInit, fft2dFree, fft3dInit, and fft3dFree. + Note that you can call fft2dInit and fft3dInit repeatedly + with the same sizes, the extra calls will be ignored. + So, you could make a macro to call fft2dInit every time you call fft2d. + *** Warning *** fft2dFree and fft3dFree also call fftFree + so you must re-init all 1d fft sizes you are going to continue using +*******************************************************************/ +#include <stdlib.h> +#include <fp.h> +#include "fftlib.h" +#include "fftext.h" +#include "matlib.h" +#include "dxpose.h" +#include "fft2d.h" + // use trick of using a real double transpose in place of a complex transpose if it fits +#define cxpose(a,b,c,d,e,f) (2*sizeof(float)==sizeof(xdouble)) ? dxpose((xdouble *)(a), b, (xdouble *)(c), d, e, f) : cxpose(a,b,c,d,e,f); + // for this trick to work you must NOT replace the xdouble declarations in + // dxpose with float declarations. + + + // pointers for temporary storage for four columns +static float *Array2d[8*sizeof(long)] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; +int fft2dInit(long M2, long M){ + // init for fft2d, ifft2d, rfft2d, and rifft2d + // malloc storage for 4 columns of 2d ffts then call fftinit for both row and column ffts sizes +/* INPUTS */ +/* M = log2 of number of columns */ +/* M2 = log2 of number of rows */ +/* of 2d matrix to be fourier transformed */ +/* OUTPUTS */ +/* private storage for columns of 2d ffts */ +/* calls fftInit for cosine and bit reversed tables */ +int theError = 1; +if ((M2 >= 0) && (M2 < 8*sizeof(long))){ + theError = 0; + if (Array2d[M2] == 0){ + Array2d[M2] = (float *) malloc( 4*2*POW2(M2)*sizeof(float) ); + if (Array2d[M2] == 0) + theError = 2; + else{ + theError = fftInit(M2); + } + } + if (theError == 0) + theError = fftInit(M); +} +return theError; +} + +void fft2dFree(){ +// free storage for columns of 2d ffts and call fftFree to free all BRLow and Utbl storage +long i1; +for (i1=8*sizeof(long)-1; i1>=0; i1--){ + if (Array2d[i1] != 0){ + free(Array2d[i1]); + Array2d[i1] = 0; + }; +}; +fftFree(); +} + +void fft2d(float *data, long M2, long M){ +/* Compute 2D complex fft and return results in-place */ +/* INPUTS */ +/* *data = input data array */ +/* M2 = log2 of fft size number of rows */ +/* M = log2 of fft size number of columns */ +/* OUTPUTS */ +/* *data = output data array */ +long i1; +if((M2>0)&&(M>0)){ + ffts(data, M, POW2(M2)); + if (M>2) + for (i1=0; i1<POW2(M); i1+=4){ + cxpose(data + i1*2, POW2(M), Array2d[M2], POW2(M2), POW2(M2), 4); + ffts(Array2d[M2], M2, 4); + cxpose(Array2d[M2], POW2(M2), data + i1*2, POW2(M), 4, POW2(M2)); + } + else{ + cxpose(data, POW2(M), Array2d[M2], POW2(M2), POW2(M2), POW2(M)); + ffts(Array2d[M2], M2, POW2(M)); + cxpose(Array2d[M2], POW2(M2), data, POW2(M), POW2(M), POW2(M2)); + } +} +else + ffts(data, M2+M, 1); +} + +void ifft2d(float *data, long M2, long M){ +/* Compute 2D complex ifft and return results in-place */ +/* INPUTS */ +/* *data = input data array */ +/* M2 = log2 of fft size number of rows */ +/* M = log2 of fft size number of columns */ +/* OUTPUTS */ +/* *data = output data array */ +long i1; +if((M2>0)&&(M>0)){ + iffts(data, M, POW2(M2)); + if (M>2) + for (i1=0; i1<POW2(M); i1+=4){ + cxpose(data + i1*2, POW2(M), Array2d[M2], POW2(M2), POW2(M2), 4); + iffts(Array2d[M2], M2, 4); + cxpose(Array2d[M2], POW2(M2), data + i1*2, POW2(M), 4, POW2(M2)); + } + else{ + cxpose(data, POW2(M), Array2d[M2], POW2(M2), POW2(M2), POW2(M)); + iffts(Array2d[M2], M2, POW2(M)); + cxpose(Array2d[M2], POW2(M2), data, POW2(M), POW2(M), POW2(M2)); + } +} +else + iffts(data, M2+M, 1); +} + +int fft3dInit(long L, long M2, long M){ + // init for fft3d, ifft3d + // malloc storage for 4 columns and 4 pages of 3d ffts + // then call fftinit for page, row and column ffts sizes +//* M = log2 of number of columns */ +/* M2 = log2 of number of rows */ +/* L = log2 of number of pages */ +/* of 3d matrix to be fourier transformed */ +/* OUTPUTS */ +/* private storage for columns and pages of 3d ffts */ +/* calls fftInit for cosine and bit reversed tables */ +int theError = 1; +if ((L >= 0) && (L < 8*sizeof(long))){ + theError = 0; + if (Array2d[L] == 0){ + Array2d[L] = (float *) malloc( 4*2*POW2(L)*sizeof(float) ); + if (Array2d[L] == 0) + theError = 2; + else{ + theError = fftInit(L); + } + } + if (theError == 0){ + if (Array2d[M2] == 0){ + Array2d[M2] = (float *) malloc( 4*2*POW2(M2)*sizeof(float) ); + if (Array2d[M2] == 0) + theError = 2; + else{ + theError = fftInit(M2); + } + } + } + if (theError == 0) + theError = fftInit(M); +} +return theError; +} + +void fft3dFree(){ +// free storage for columns of all 2d&3d ffts and call fftFree to free all BRLow and Utbl storage +fft2dFree(); +} + +void fft3d(float *data, long M3, long M2, long M){ +/* Compute 2D complex fft and return results in-place */ +/* INPUTS */ +/* *data = input data array */ +/* M3 = log2 of fft size number of pages */ +/* M2 = log2 of fft size number of rows */ +/* M = log2 of fft size number of columns */ +/* OUTPUTS */ +/* *data = output data array */ +long i1; +long i2; +const long N = POW2(M); +const long N2 = POW2(M2); +const long N3 = POW2(M3); +if((M3>0)&&(M2>0)&&(M>0)){ + ffts(data, M, N3*N2); + if (M>2) + for (i2=0; i2<N3; i2++){ + for (i1=0; i1<N; i1+=4){ + cxpose(data + i2*2*POW2(M2+M) + i1*2, N, Array2d[M2], N2, N2, 4); + ffts(Array2d[M2], M2, 4); + cxpose(Array2d[M2], N2, data + i2*2*POW2(M2+M) + i1*2, N, 4, N2); + } + } + else{ + for (i2=0; i2<N3; i2++){ + cxpose(data + i2*2*POW2(M2+M), N, Array2d[M2], N2, N2, N); + ffts(Array2d[M2], M2, N); + cxpose(Array2d[M2], N2, data + i2*2*POW2(M2+M), N, N, N2); + } + } + if ((M2+M)>2) + for (i1=0; i1<POW2(M2+M); i1+=4){ + cxpose(data + i1*2, POW2(M2+M), Array2d[M3], N3, N3, 4); + ffts(Array2d[M3], M3, 4); + cxpose(Array2d[M3], N3, data + i1*2, POW2(M2+M), 4, N3); + } + else{ + cxpose(data, POW2(M2+M), Array2d[M3], N3, N3, POW2(M2+M)); + ffts(Array2d[M3], M3, POW2(M2+M)); + cxpose(Array2d[M3], N3, data, POW2(M2+M), POW2(M2+M), N3); + } +} +else + if(M3==0) fft2d(data, M2, M); + else + if(M2==0) fft2d(data, M3, M); + else + if(M==0) fft2d(data, M3, M2); +} + +void ifft3d(float *data, long M3, long M2, long M){ +/* Compute 2D complex ifft and return results in-place */ +/* INPUTS */ +/* *data = input data array */ +/* M3 = log2 of fft size number of pages */ +/* M2 = log2 of fft size number of rows */ +/* M = log2 of fft size number of columns */ +/* OUTPUTS */ +/* *data = output data array */ +long i1; +long i2; +const long N = POW2(M); +const long N2 = POW2(M2); +const long N3 = POW2(M3); +if((M3>0)&&(M2>0)&&(M>0)){ + iffts(data, M, N3*N2); + if (M>2) + for (i2=0; i2<N3; i2++){ + for (i1=0; i1<N; i1+=4){ + cxpose(data + i2*2*POW2(M2+M) + i1*2, N, Array2d[M2], N2, N2, 4); + iffts(Array2d[M2], M2, 4); + cxpose(Array2d[M2], N2, data + i2*2*POW2(M2+M) + i1*2, N, 4, N2); + } + } + else{ + for (i2=0; i2<N3; i2++){ + cxpose(data + i2*2*POW2(M2+M), N, Array2d[M2], N2, N2, N); + iffts(Array2d[M2], M2, N); + cxpose(Array2d[M2], N2, data + i2*2*POW2(M2+M), N, N, N2); + } + } + if ((M2+M)>2) + for (i1=0; i1<POW2(M2+M); i1+=4){ + cxpose(data + i1*2, POW2(M2+M), Array2d[M3], N3, N3, 4); + iffts(Array2d[M3], M3, 4); + cxpose(Array2d[M3], N3, data + i1*2, POW2(M2+M), 4, N3); + } + else{ + cxpose(data, POW2(M2+M), Array2d[M3], N3, N3, POW2(M2+M)); + iffts(Array2d[M3], M3, POW2(M2+M)); + cxpose(Array2d[M3], N3, data, POW2(M2+M), POW2(M2+M), N3); + } +} +else + if(M3==0) ifft2d(data, M2, M); + else + if(M2==0) ifft2d(data, M3, M); + else + if(M==0) ifft2d(data, M3, M2); +} + +void rfft2d(float *data, long M2, long M){ +/* Compute 2D real fft and return results in-place */ +/* First performs real fft on rows using size from M to compute positive frequencies */ +/* then performs transform on columns using size from M2 to compute wavenumbers */ +/* If you think of the result as a complex pow(2,M2) by pow(2,M-1) matrix */ +/* then the first column contains the positive wavenumber spectra of DC frequency */ +/* followed by the positive wavenumber spectra of the nyquest frequency */ +/* since these are two positive wavenumber spectra the first complex value */ +/* of each is really the real values for the zero and nyquest wavenumber packed together */ +/* All other columns contain the positive and negative wavenumber spectra of a positive frequency */ +/* See rspect2dprod for multiplying two of these spectra together- ex. for fast convolution */ +/* INPUTS */ +/* *data = input data array */ +/* M2 = log2 of fft size number of rows in */ +/* M = log2 of fft size number of columns in */ +/* OUTPUTS */ +/* *data = output data array */ +long i1; +if((M2>0)&&(M>0)){ + rffts(data, M, POW2(M2)); + if (M==1){ + cxpose(data, POW2(M)/2, Array2d[M2]+POW2(M2)*2, POW2(M2), POW2(M2), 1); + xpose(Array2d[M2]+POW2(M2)*2, 2, Array2d[M2], POW2(M2), POW2(M2), 2); + rffts(Array2d[M2], M2, 2); + cxpose(Array2d[M2], POW2(M2), data, POW2(M)/2, 1, POW2(M2)); + } + else if (M==2){ + cxpose(data, POW2(M)/2, Array2d[M2]+POW2(M2)*2, POW2(M2), POW2(M2), 1); + xpose(Array2d[M2]+POW2(M2)*2, 2, Array2d[M2], POW2(M2), POW2(M2), 2); + rffts(Array2d[M2], M2, 2); + cxpose(Array2d[M2], POW2(M2), data, POW2(M)/2, 1, POW2(M2)); + + cxpose(data + 2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1); + ffts(Array2d[M2], M2, 1); + cxpose(Array2d[M2], POW2(M2), data + 2, POW2(M)/2, 1, POW2(M2)); + } + else{ + cxpose(data, POW2(M)/2, Array2d[M2]+POW2(M2)*2, POW2(M2), POW2(M2), 1); + xpose(Array2d[M2]+POW2(M2)*2, 2, Array2d[M2], POW2(M2), POW2(M2), 2); + rffts(Array2d[M2], M2, 2); + cxpose(Array2d[M2], POW2(M2), data, POW2(M)/2, 1, POW2(M2)); + + cxpose(data + 2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 3); + ffts(Array2d[M2], M2, 3); + cxpose(Array2d[M2], POW2(M2), data + 2, POW2(M)/2, 3, POW2(M2)); + for (i1=4; i1<POW2(M)/2; i1+=4){ + cxpose(data + i1*2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 4); + ffts(Array2d[M2], M2, 4); + cxpose(Array2d[M2], POW2(M2), data + i1*2, POW2(M)/2, 4, POW2(M2)); + } + } +} +else + rffts(data, M2+M, 1); +} + +void rifft2d(float *data, long M2, long M){ +/* Compute 2D real ifft and return results in-place */ +/* The input must be in the order as outout from rfft2d */ +/* INPUTS */ +/* *data = input data array */ +/* M2 = log2 of fft size number of rows out */ +/* M = log2 of fft size number of columns out */ +/* OUTPUTS */ +/* *data = output data array */ +long i1; +if((M2>0)&&(M>0)){ + if (M==1){ + cxpose(data, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1); + riffts(Array2d[M2], M2, 2); + xpose(Array2d[M2], POW2(M2), Array2d[M2]+POW2(M2)*2, 2, 2, POW2(M2)); + cxpose(Array2d[M2]+POW2(M2)*2, POW2(M2), data, POW2(M)/2, 1, POW2(M2)); + } + else if (M==2){ + cxpose(data, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1); + riffts(Array2d[M2], M2, 2); + xpose(Array2d[M2], POW2(M2), Array2d[M2]+POW2(M2)*2, 2, 2, POW2(M2)); + cxpose(Array2d[M2]+POW2(M2)*2, POW2(M2), data, POW2(M)/2, 1, POW2(M2)); + + cxpose(data + 2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1); + iffts(Array2d[M2], M2, 1); + cxpose(Array2d[M2], POW2(M2), data + 2, POW2(M)/2, 1, POW2(M2)); + } + else{ + cxpose(data, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 1); + riffts(Array2d[M2], M2, 2); + xpose(Array2d[M2], POW2(M2), Array2d[M2]+POW2(M2)*2, 2, 2, POW2(M2)); + cxpose(Array2d[M2]+POW2(M2)*2, POW2(M2), data, POW2(M)/2, 1, POW2(M2)); + + cxpose(data + 2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 3); + iffts(Array2d[M2], M2, 3); + cxpose(Array2d[M2], POW2(M2), data + 2, POW2(M)/2, 3, POW2(M2)); + for (i1=4; i1<POW2(M)/2; i1+=4){ + cxpose(data + i1*2, POW2(M)/2, Array2d[M2], POW2(M2), POW2(M2), 4); + iffts(Array2d[M2], M2, 4); + cxpose(Array2d[M2], POW2(M2), data + i1*2, POW2(M)/2, 4, POW2(M2)); + } + } + riffts(data, M, POW2(M2)); +} +else + riffts(data, M2+M, 1); +} + +void rspect2dprod(float *data1, float *data2, float *outdata, long N2, long N1){ +// When multiplying a pair of 2d spectra from rfft2d care must be taken to multiply the +// four real values seperately from the complex ones. This routine does it correctly. +// the result can be stored in-place over one of the inputs +/* *data1 = input data array first spectra */ +/* *data2 = input data array second spectra */ +/* N2 = fft size number of rows into rfft2d for both data1 and data2 */ +/* N1 = fft size number of columns into rfft2d for both data1 and data2 */ +long N = N2 * N1/2; // number of "complex" numbers in spectra + +if( (N2 > 1) && (N1>1)){ + outdata[0] = data1[0] * data2[0]; // multiply the zero freq, zero wavenumber values + outdata[1] = data1[1] * data2[1]; // multiply the zero freq, nyquest wavenumber values + + cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1); + + outdata[N] = data1[N] * data2[N]; // multiply the nyquest freq, zero wavenumber values + outdata[N+1] = data1[N+1] * data2[N+1]; // multiply the nyquest freq, nyquest wavenumber values + + cvprod(data1 + N+2, data2 + N+2, outdata + N+2, N/2-1); +} +else{ // really 1D rfft spectra + N = N2 * N1; // one of these is a 1 + if(N>1){ + outdata[0] = data1[0] * data2[0]; // multiply the zero freq values + outdata[1] = data1[1] * data2[1]; // multiply the nyquest freq values + cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1); // multiply the other positive freq values + } + else{ + outdata[0] = data1[0] * data2[0]; + } +} +}
\ No newline at end of file diff --git a/ffts/src/fft2d.h b/ffts/src/fft2d.h new file mode 100644 index 0000000..fd074ff --- /dev/null +++ b/ffts/src/fft2d.h @@ -0,0 +1,114 @@ +/******************************************************************* + This file extends the fftlib with 2d and 3d complex fft's and + 2d real fft's. All fft's return results in-place. Temporary buffers + for transposing columns are maintained privately via calls to + fft2dInit, fft2dFree, fft3dInit, and fft3dFree. + Note that you can call fft2dInit and fft3dInit repeatedly + with the same sizes, the extra calls will be ignored. + So, you could make a macro to call fft2dInit every time you call fft2d. + *** Warning *** fft2dFree and fft3dFree also call fftFree + so you must re-init all 1d fft sizes you are going to continue using +*******************************************************************/ +int fft2dInit(long M2, long M); + // init for fft2d, ifft2d, rfft2d, and rifft2d + // malloc storage for columns of 2d ffts then call fftinit for both row and column ffts sizes +/* INPUTS */ +/* M = log2 of number of columns */ +/* M2 = log2 of number of rows */ +/* of 2d matrix to be fourier transformed */ +/* OUTPUTS */ +/* private storage for columns of 2d ffts */ +/* calls fftInit for cosine and bit reversed tables */ + +void fft2dFree(); +// free storage for columns of all 2d&3d ffts and call fftFree to free all BRLow and Utbl storage + +void fft2d(float *data, long M2, long M); +/* Compute 2D complex fft and return results in-place */ +/* INPUTS */ +/* *data = input data array */ +/* M2 = log2 of fft size number of rows */ +/* M = log2 of fft size number of columns */ +/* OUTPUTS */ +/* *data = output data array */ + +void ifft2d(float *data, long M2, long M); +/* Compute 2D complex ifft and return results in-place */ +/* INPUTS */ +/* *data = input data array */ +/* M2 = log2 of fft size number of rows */ +/* M = log2 of fft size number of columns */ +/* OUTPUTS */ +/* *data = output data array */ + +int fft3dInit(long L, long M2, long M); + // init for fft3d, ifft3d + // malloc storage for 4 columns and 4 pages of 3d ffts + // then call fftinit for page, row and column ffts sizes +/* M = log2 of number of columns */ +/* M2 = log2 of number of rows */ +/* L = log2 of number of pages */ +/* of 3d matrix to be fourier transformed */ +/* OUTPUTS */ +/* private storage for columns and pages of 3d ffts */ +/* calls fftInit for cosine and bit reversed tables */ + +void fft3dFree(); +// free storage for columns of all 2d&3d ffts and call fftFree to free all BRLow and Utbl storage + +void fft3d(float *data, long M3, long M2, long M); +/* Compute 2D complex fft and return results in-place */ +/* INPUTS */ +/* *data = input data array */ +/* M3 = log2 of fft size number of pages */ +/* M2 = log2 of fft size number of rows */ +/* M = log2 of fft size number of columns */ +/* OUTPUTS */ +/* *data = output data array */ + +void ifft3d(float *data, long M3, long M2, long M); +/* Compute 2D complex ifft and return results in-place */ +/* INPUTS */ +/* *data = input data array */ +/* M3 = log2 of fft size number of pages */ +/* M2 = log2 of fft size number of rows */ +/* M = log2 of fft size number of columns */ +/* OUTPUTS */ +/* *data = output data array */ + +void rfft2d(float *data, long M2, long M); +/* Compute 2D real fft and return results in-place */ +/* First performs real fft on rows using size from M to compute positive frequencies */ +/* then performs transform on columns using size from M2 to compute wavenumbers */ +/* If you think of the result as a complex pow(2,M2) by pow(2,M-1) matrix */ +/* then the first column contains the positive wavenumber spectra of DC frequency */ +/* followed by the positive wavenumber spectra of the nyquest frequency */ +/* since these are two positive wavenumber spectra the first complex value */ +/* of each is really the real values for the zero and nyquest wavenumber packed together */ +/* All other columns contain the positive and negative wavenumber spectra of a positive frequency */ +/* See rspect2dprod for multiplying two of these spectra together- ex. for fast convolution */ +/* INPUTS */ +/* *data = input data array */ +/* M2 = log2 of fft size number of rows in */ +/* M = log2 of fft size number of columns in */ +/* OUTPUTS */ +/* *data = output data array */ + +void rifft2d(float *data, long M2, long M); +/* Compute 2D real ifft and return results in-place */ +/* The input must be in the order as outout from rfft2d */ +/* INPUTS */ +/* *data = input data array */ +/* M2 = log2 of fft size number of rows out */ +/* M = log2 of fft size number of columns out */ +/* OUTPUTS */ +/* *data = output data array */ + +void rspect2dprod(float *data1, float *data2, float *outdata, long N2, long N1); +// When multiplying a pair of 2d spectra from rfft2d care must be taken to multiply the +// four real values seperately from the complex ones. This routine does it correctly. +// the result can be stored in-place over one of the inputs +/* *data1 = input data array first spectra */ +/* *data2 = input data array second spectra */ +/* N2 = fft size number of rows into rfft2d for both data1 and data2 */ +/* N1 = fft size number of columns into rfft2d for both data1 and data2 */ diff --git a/ffts/src/fftext.c b/ffts/src/fftext.c new file mode 100644 index 0000000..a39911c --- /dev/null +++ b/ffts/src/fftext.c @@ -0,0 +1,156 @@ +/******************************************************************* + This file extends the fftlib with calls to maintain the cosine and bit reversed tables + for you (including mallocs and free's). Call the init routine for each fft size you will + be using. Then you can call the fft routines below which will make the fftlib library + call with the appropriate tables passed. When you are done with all fft's you can call + fftfree to release the storage for the tables. Note that you can call fftinit repeatedly + with the same size, the extra calls will be ignored. So, you could make a macro to call + fftInit every time you call ffts. For example you could have someting like: + #define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); +*******************************************************************/ +#include <stdlib.h> +#include "fftlib.h" +#include "matlib.h" +#include "fftext.h" + +// pointers to storage of Utbl's and BRLow's +static float *UtblArray[8*sizeof(long)] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; +static short *BRLowArray[8*sizeof(long)/2] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; + +int fftInit(long M){ +// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft +/* INPUTS */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* OUTPUTS */ +/* private cosine and bit reversed tables */ + +int theError = 1; +/*** I did NOT test cases with M>27 ***/ +if ((M >= 0) && (M < 8*sizeof(long))){ + theError = 0; + if (UtblArray[M] == 0){ // have we not inited this size fft yet? + // init cos table + UtblArray[M] = (float *) malloc( (POW2(M)/4+1)*sizeof(float) ); + if (UtblArray[M] == 0) + theError = 2; + else{ + fftCosInit(M, UtblArray[M]); + } + if (M > 1){ + if (BRLowArray[M/2] == 0){ // init bit reversed table for cmplx fft + BRLowArray[M/2] = (short *) malloc( POW2(M/2-1)*sizeof(short) ); + if (BRLowArray[M/2] == 0) + theError = 2; + else{ + fftBRInit(M, BRLowArray[M/2]); + } + } + } + if (M > 2){ + if (BRLowArray[(M-1)/2] == 0){ // init bit reversed table for real fft + BRLowArray[(M-1)/2] = (short *) malloc( POW2((M-1)/2-1)*sizeof(short) ); + if (BRLowArray[(M-1)/2] == 0) + theError = 2; + else{ + fftBRInit(M-1, BRLowArray[(M-1)/2]); + } + } + } + } +}; +return theError; +} + +void fftFree(){ +// release storage for all private cosine and bit reversed tables +long i1; +for (i1=8*sizeof(long)/2-1; i1>=0; i1--){ + if (BRLowArray[i1] != 0){ + free(BRLowArray[i1]); + BRLowArray[i1] = 0; + }; +}; +for (i1=8*sizeof(long)-1; i1>=0; i1--){ + if (UtblArray[i1] != 0){ + free(UtblArray[i1]); + UtblArray[i1] = 0; + }; +}; +} + +/************************************************* + The following calls are easier than calling to fftlib directly. + Just make sure fftlib has been called for each M first. +**************************************************/ + +void ffts(float *data, long M, long Rows){ +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + ffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]); +} + +void iffts(float *data, long M, long Rows){ +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + iffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]); +} + +void rffts(float *data, long M, long Rows){ +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ + rffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]); +} + +void riffts(float *data, long M, long Rows){ +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = real output data array */ + riffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]); +} + +void rspectprod(float *data1, float *data2, float *outdata, long N){ +// When multiplying a pair of spectra from rfft care must be taken to multiply the +// two real values seperately from the complex ones. This routine does it correctly. +// the result can be stored in-place over one of the inputs +/* INPUTS */ +/* *data1 = input data array first spectra */ +/* *data2 = input data array second spectra */ +/* N = fft input size for both data1 and data2 */ +/* OUTPUTS */ +/* *outdata = output data array spectra */ +if(N>1){ + outdata[0] = data1[0] * data2[0]; // multiply the zero freq values + outdata[1] = data1[1] * data2[1]; // multiply the nyquest freq values + cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1); // multiply the other positive freq values +} +else{ + outdata[0] = data1[0] * data2[0]; +} +} diff --git a/ffts/src/fftext.h b/ffts/src/fftext.h new file mode 100644 index 0000000..15d8a6b --- /dev/null +++ b/ffts/src/fftext.h @@ -0,0 +1,106 @@ +/******************************************************************* + This file extends the fftlib with calls to maintain the cosine and bit reversed tables + for you (including mallocs and free's). Call the init routine for each fft size you will + be using. Then you can call the fft routines below which will make the fftlib library + call with the appropriate tables passed. When you are done with all fft's you can call + fftfree to release the storage for the tables. Note that you can call fftinit repeatedly + with the same size, the extra calls will be ignored. So, you could make a macro to call + fftInit every time you call ffts. For example you could have someting like: + #define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); +*******************************************************************/ + +int fftInit(long M); +// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft +/* INPUTS */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* OUTPUTS */ +/* private cosine and bit reversed tables */ + +void fftFree(); +// release storage for all private cosine and bit reversed tables + +void ffts(float *data, long M, long Rows); +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void iffts(float *data, long M, long Rows); +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void rffts(float *data, long M, long Rows); +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ + +void riffts(float *data, long M, long Rows); +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = real output data array */ + +void rspectprod(float *data1, float *data2, float *outdata, long N); +// When multiplying a pair of spectra from rfft care must be taken to multiply the +// two real values seperately from the complex ones. This routine does it correctly. +// the result can be stored in-place over one of the inputs +/* INPUTS */ +/* *data1 = input data array first spectra */ +/* *data2 = input data array second spectra */ +/* N = fft input size for both data1 and data2 */ +/* OUTPUTS */ +/* *outdata = output data array spectra */ + + +// The following is FYI + + +//Note that most of the fft routines require full matrices, ie Rsiz==Ncols +//This is how I like to define a real matrix: +//struct matrix { // real matrix +// float *d; // pointer to data +// long Nrows; // number of rows in the matrix +// long Ncols; // number of columns in the matrix (can be less than Rsiz) +// long Rsiz; // number of floats from one row to the next +//}; +//typedef struct matrix matrix; + + + +// CACHEFILLMALLOC and CEILCACHELINE can be used instead of malloc to make +// arrays that start exactly on a cache line start. +// First we CACHEFILLMALLOC a void * (use this void * when free'ing), +// then we set our array pointer equal to the properly cast CEILCACHELINE of this void * +// example: +// aInit = CACHEFILLMALLOC( NUMFLOATS*sizeof(float) ); +// a = (float *) CEILCACHELINE(ainit); +// ... main body of code ... +// free(aInit); +// +// To disable this alignment, set CACHELINESIZE to 1 +//#define CACHELINESIZE 32 // Bytes per cache line +//#define CACHELINEFILL (CACHELINESIZE-1) +//#define CEILCACHELINE(p) ((((unsigned long)p+CACHELINEFILL)/CACHELINESIZE)*CACHELINESIZE) +//#define CACHEFILLMALLOC(n) malloc((n)+CACHELINEFILL) diff --git a/ffts/src/fftlib.c b/ffts/src/fftlib.c new file mode 100644 index 0000000..d832f89 --- /dev/null +++ b/ffts/src/fftlib.c @@ -0,0 +1,3174 @@ +/******************************************************************* +lower level fft stuff including routines called in fftext.c and fft2d.c +*******************************************************************/ +#include "fftlib.h" +#include <math.h> +#define MCACHE (11-(sizeof(float)/8)) // fft's with M bigger than this bust primary cache +#ifdef WIN32 +#define inline __inline +#endif + +// some math constants to 40 decimal places +#define MYPI 3.141592653589793238462643383279502884197 // pi +#define MYROOT2 1.414213562373095048801688724209698078569 // sqrt(2) +#define MYCOSPID8 0.9238795325112867561281831893967882868224 // cos(pi/8) +#define MYSINPID8 0.3826834323650897717284599840303988667613 // sin(pi/8) + + +/************************************************* +routines to initialize tables used by fft routines +**************************************************/ + +void fftCosInit(long M, float *Utbl){ +/* Compute Utbl, the cosine table for ffts */ +/* of size (pow(2,M)/4 +1) */ +/* INPUTS */ +/* M = log2 of fft size */ +/* OUTPUTS */ +/* *Utbl = cosine table */ +unsigned long fftN = POW2(M); +unsigned long i1; + Utbl[0] = 1.0; + for (i1 = 1; i1 < fftN/4; i1++) + Utbl[i1] = cos( (2.0 * MYPI * i1) / fftN ); + Utbl[fftN/4] = 0.0; +} + +void fftBRInit(long M, short *BRLow){ +/* Compute BRLow, the bit reversed table for ffts */ +/* of size pow(2,M/2 -1) */ +/* INPUTS */ +/* M = log2 of fft size */ +/* OUTPUTS */ +/* *BRLow = bit reversed counter table */ +long Mroot_1 = M / 2 - 1; +long Nroot_1 = POW2(Mroot_1); +long i1; +long bitsum; +long bitmask; +long bit; +for (i1 = 0; i1 < Nroot_1; i1++){ + bitsum = 0; + bitmask = 1; + for (bit=1; bit <= Mroot_1; bitmask<<=1, bit++) + if (i1 & bitmask) + bitsum = bitsum + (Nroot_1 >> bit); + BRLow[i1] = bitsum; +}; +} + +/************************************************ +parts of ffts1 +*************************************************/ + +inline void bitrevR2(float *ioptr, long M, short *BRLow); +inline void bitrevR2(float *ioptr, long M, short *BRLow){ +/*** bit reverse and first radix 2 stage of forward or inverse fft ***/ +float f0r; +float f0i; +float f1r; +float f1i; +float f2r; +float f2i; +float f3r; +float f3i; +float f4r; +float f4i; +float f5r; +float f5i; +float f6r; +float f6i; +float f7r; +float f7i; +float t0r; +float t0i; +float t1r; +float t1i; +float *p0r; +float *p1r; +float *IOP; +float *iolimit; +long Colstart; +long iCol; +unsigned long posA; +unsigned long posAi; +unsigned long posB; +unsigned long posBi; + +const unsigned long Nrems2 = POW2((M+3)/2); +const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2; +const unsigned long Nroot_1 = POW2(M/2-1)-1; +const unsigned long ColstartShift = (M+1)/2 +1; +posA = POW2(M); // 1/2 of POW2(M) complexes +posAi = posA + 1; +posB = posA + 2; +posBi = posB + 1; + +iolimit = ioptr + Nrems2; +for (; ioptr < iolimit; ioptr += POW2(M/2+1)){ + for (Colstart = Nroot_1; Colstart >= 0; Colstart--){ + iCol = Nroot_1; + p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4; + IOP = ioptr + (Colstart << ColstartShift); + p1r = IOP + BRLow[iCol]*4; + f0r = *(p0r); + f0i = *(p0r+1); + f1r = *(p0r+posA); + f1i = *(p0r+posAi); + for (; iCol > Colstart;){ + f2r = *(p0r+2); + f2i = *(p0r+(2+1)); + f3r = *(p0r+posB); + f3i = *(p0r+posBi); + f4r = *(p1r); + f4i = *(p1r+1); + f5r = *(p1r+posA); + f5i = *(p1r+posAi); + f6r = *(p1r+2); + f6i = *(p1r+(2+1)); + f7r = *(p1r+posB); + f7i = *(p1r+posBi); + + t0r = f0r + f1r; + t0i = f0i + f1i; + f1r = f0r - f1r; + f1i = f0i - f1i; + t1r = f2r + f3r; + t1i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + f0r = f4r + f5r; + f0i = f4i + f5i; + f5r = f4r - f5r; + f5i = f4i - f5i; + f2r = f6r + f7r; + f2i = f6i + f7i; + f7r = f6r - f7r; + f7i = f6i - f7i; + + *(p1r) = t0r; + *(p1r+1) = t0i; + *(p1r+2) = f1r; + *(p1r+(2+1)) = f1i; + *(p1r+posA) = t1r; + *(p1r+posAi) = t1i; + *(p1r+posB) = f3r; + *(p1r+posBi) = f3i; + *(p0r) = f0r; + *(p0r+1) = f0i; + *(p0r+2) = f5r; + *(p0r+(2+1)) = f5i; + *(p0r+posA) = f2r; + *(p0r+posAi) = f2i; + *(p0r+posB) = f7r; + *(p0r+posBi) = f7i; + + p0r -= Nrems2; + f0r = *(p0r); + f0i = *(p0r+1); + f1r = *(p0r+posA); + f1i = *(p0r+posAi); + iCol -= 1; + p1r = IOP + BRLow[iCol]*4; + }; + f2r = *(p0r+2); + f2i = *(p0r+(2+1)); + f3r = *(p0r+posB); + f3i = *(p0r+posBi); + + t0r = f0r + f1r; + t0i = f0i + f1i; + f1r = f0r - f1r; + f1i = f0i - f1i; + t1r = f2r + f3r; + t1i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + + *(p0r) = t0r; + *(p0r+1) = t0i; + *(p0r+2) = f1r; + *(p0r+(2+1)) = f1i; + *(p0r+posA) = t1r; + *(p0r+posAi) = t1i; + *(p0r+posB) = f3r; + *(p0r+posBi) = f3i; + + }; +}; +} + +inline void fft2pt(float *ioptr); +inline void fft2pt(float *ioptr){ +/*** RADIX 2 fft ***/ +float f0r, f0i, f1r, f1i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[2]; +f1i = ioptr[3]; + + /* Butterflys */ + /* + f0 - - t0 + f1 - 1 - f1 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + + /* store result */ +ioptr[0] = t0r; +ioptr[1] = t0i; +ioptr[2] = f1r; +ioptr[3] = f1i; +} + + +inline void fft4pt(float *ioptr); +inline void fft4pt(float *ioptr){ +/*** RADIX 4 fft ***/ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float t0r, t0i, t1r, t1i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[4]; +f1i = ioptr[5]; +f2r = ioptr[2]; +f2i = ioptr[3]; +f3r = ioptr[6]; +f3i = ioptr[7]; + + /* Butterflys */ + /* + f0 - - t0 - - f0 + f1 - 1 - f1 - - f1 + f2 - - f2 - 1 - f2 + f3 - 1 - t1 - -i - f3 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r - t1i; +f3i = f1i + t1r; +f1r = f1r + t1i; +f1i = f1i - t1r; + + /* store result */ +ioptr[0] = f0r; +ioptr[1] = f0i; +ioptr[2] = f1r; +ioptr[3] = f1i; +ioptr[4] = f2r; +ioptr[5] = f2i; +ioptr[6] = f3r; +ioptr[7] = f3i; +} + +inline void fft8pt(float *ioptr); +inline void fft8pt(float *ioptr){ +/*** RADIX 8 fft ***/ +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[8]; +f1i = ioptr[9]; +f2r = ioptr[4]; +f2i = ioptr[5]; +f3r = ioptr[12]; +f3i = ioptr[13]; +f4r = ioptr[2]; +f4i = ioptr[3]; +f5r = ioptr[10]; +f5i = ioptr[11]; +f6r = ioptr[6]; +f6i = ioptr[7]; +f7r = ioptr[14]; +f7i = ioptr[15]; + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - 1 - f1 - - f1 - - f1 + f2 - - f2 - 1 - f2 - - f2 + f3 - 1 - t1 - -i - f3 - - f3 + f4 - - t0 - - f4 - 1 - t0 + f5 - 1 - f5 - - f5 - w3 - f4 + f6 - - f6 - 1 - f6 - -i - t1 + f7 - 1 - t1 - -i - f7 - iw3- f6 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r - t1i; +f3i = f1i + t1r; +f1r = f1r + t1i; +f1i = f1i - t1r; + + +t0r = f4r + f5r; +t0i = f4i + f5i; +f5r = f4r - f5r; +f5i = f4i - f5i; + +t1r = f6r - f7r; +t1i = f6i - f7i; +f6r = f6r + f7r; +f6i = f6i + f7i; + +f4r = t0r + f6r; +f4i = t0i + f6i; +f6r = t0r - f6r; +f6i = t0i - f6i; + +f7r = f5r - t1i; +f7i = f5i + t1r; +f5r = f5r + t1i; +f5i = f5i - t1r; + + +t0r = f0r - f4r; +t0i = f0i - f4i; +f0r = f0r + f4r; +f0i = f0i + f4i; + +t1r = f2r - f6i; +t1i = f2i + f6r; +f2r = f2r + f6i; +f2i = f2i - f6r; + +f4r = f1r - f5r * w0r - f5i * w0r; +f4i = f1i + f5r * w0r - f5i * w0r; +f1r = f1r * Two - f4r; +f1i = f1i * Two - f4i; + +f6r = f3r + f7r * w0r - f7i * w0r; +f6i = f3i + f7r * w0r + f7i * w0r; +f3r = f3r * Two - f6r; +f3i = f3i * Two - f6i; + + /* store result */ +ioptr[0] = f0r; +ioptr[1] = f0i; +ioptr[2] = f1r; +ioptr[3] = f1i; +ioptr[4] = f2r; +ioptr[5] = f2i; +ioptr[6] = f3r; +ioptr[7] = f3i; +ioptr[8] = t0r; +ioptr[9] = t0i; +ioptr[10] = f4r; +ioptr[11] = f4i; +ioptr[12] = t1r; +ioptr[13] = t1i; +ioptr[14] = f6r; +ioptr[15] = f6i; +} + +inline void bfR2(float *ioptr, long M, long NDiffU); +inline void bfR2(float *ioptr, long M, long NDiffU){ +/*** 2nd radix 2 stage ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long NSameU; +unsigned long SameUCnt; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; + +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 4; +pos = 2; +posi = pos+1; +NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time +pstrt = ioptr; +p0r = pstrt; +p1r = pstrt+pinc; +p2r = p1r+pinc; +p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - f4 + f1 - 1 - f5 + f2 - - f6 + f3 - 1 - f7 + */ + /* Butterflys */ + /* + f0 - - f4 + f1 - 1 - f5 + f2 - - f6 + f3 - 1 - f7 + */ + +for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){ + + f0r = *p0r; + f1r = *p1r; + f0i = *(p0r + 1); + f1i = *(p1r + 1); + f2r = *p2r; + f3r = *p3r; + f2i = *(p2r + 1); + f3i = *(p3r + 1); + + f4r = f0r + f1r; + f4i = f0i + f1i; + f5r = f0r - f1r; + f5i = f0i - f1i; + + f6r = f2r + f3r; + f6i = f2i + f3i; + f7r = f2r - f3r; + f7i = f2i - f3i; + + *p0r = f4r; + *(p0r + 1) = f4i; + *p1r = f5r; + *(p1r + 1) = f5i; + *p2r = f6r; + *(p2r + 1) = f6i; + *p3r = f7r; + *(p3r + 1) = f7i; + + f0r = *(p0r + pos); + f1i = *(p1r + posi); + f0i = *(p0r + posi); + f1r = *(p1r + pos); + f2r = *(p2r + pos); + f3i = *(p3r + posi); + f2i = *(p2r + posi); + f3r = *(p3r + pos); + + f4r = f0r + f1i; + f4i = f0i - f1r; + f5r = f0r - f1i; + f5i = f0i + f1r; + + f6r = f2r + f3i; + f6i = f2i - f3r; + f7r = f2r - f3i; + f7i = f2i + f3r; + + *(p0r + pos) = f4r; + *(p0r + posi) = f4i; + *(p1r + pos) = f5r; + *(p1r + posi) = f5i; + *(p2r + pos) = f6r; + *(p2r + posi) = f6i; + *(p3r + pos) = f7r; + *(p3r + posi) = f7i; + + p0r += pnext; + p1r += pnext; + p2r += pnext; + p3r += pnext; + +} +} + +inline void bfR4(float *ioptr, long M, long NDiffU); +inline void bfR4(float *ioptr, long M, long NDiffU){ +/*** 1 radix 4 stage ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long pnexti; +unsigned long NSameU; +unsigned long SameUCnt; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; + +float w1r = 1.0/MYROOT2; /* cos(pi/4) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t1r, t1i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 4; +pnexti = pnext + 1; +pos = 2; +posi = pos+1; +NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly +pstrt = ioptr; +p0r = pstrt; +p1r = pstrt+pinc; +p2r = p1r+pinc; +p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - f0 - - f4 + f1 - 1 - f5 - - f5 + f2 - - f6 - 1 - f6 + f3 - 1 - f3 - -i - f7 + */ + /* Butterflys */ + /* + f0 - - f4 - - f4 + f1 - -i - t1 - - f5 + f2 - - f2 - w1 - f6 + f3 - -i - f7 - iw1- f7 + */ + +f0r = *p0r; +f1r = *p1r; +f2r = *p2r; +f3r = *p3r; +f0i = *(p0r + 1); +f1i = *(p1r + 1); +f2i = *(p2r + 1); +f3i = *(p3r + 1); + +f5r = f0r - f1r; +f5i = f0i - f1i; +f0r = f0r + f1r; +f0i = f0i + f1i; + +f6r = f2r + f3r; +f6i = f2i + f3i; +f3r = f2r - f3r; +f3i = f2i - f3i; + +for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ + + f7r = f5r - f3i; + f7i = f5i + f3r; + f5r = f5r + f3i; + f5i = f5i - f3r; + + f4r = f0r + f6r; + f4i = f0i + f6i; + f6r = f0r - f6r; + f6i = f0i - f6i; + + f2r = *(p2r + pos); + f2i = *(p2r + posi); + f1r = *(p1r + pos); + f1i = *(p1r + posi); + f3i = *(p3r + posi); + f0r = *(p0r + pos); + f3r = *(p3r + pos); + f0i = *(p0r + posi); + + *p3r = f7r; + *p0r = f4r; + *(p3r + 1) = f7i; + *(p0r + 1) = f4i; + *p1r = f5r; + *p2r = f6r; + *(p1r + 1) = f5i; + *(p2r + 1) = f6i; + + f7r = f2r - f3i; + f7i = f2i + f3r; + f2r = f2r + f3i; + f2i = f2i - f3r; + + f4r = f0r + f1i; + f4i = f0i - f1r; + t1r = f0r - f1i; + t1i = f0i + f1r; + + f5r = t1r - f7r * w1r + f7i * w1r; + f5i = t1i - f7r * w1r - f7i * w1r; + f7r = t1r * Two - f5r; + f7i = t1i * Two - f5i; + + f6r = f4r - f2r * w1r - f2i * w1r; + f6i = f4i + f2r * w1r - f2i * w1r; + f4r = f4r * Two - f6r; + f4i = f4i * Two - f6i; + + f3r = *(p3r + pnext); + f0r = *(p0r + pnext); + f3i = *(p3r + pnexti); + f0i = *(p0r + pnexti); + f2r = *(p2r + pnext); + f2i = *(p2r + pnexti); + f1r = *(p1r + pnext); + f1i = *(p1r + pnexti); + + *(p2r + pos) = f6r; + *(p1r + pos) = f5r; + *(p2r + posi) = f6i; + *(p1r + posi) = f5i; + *(p3r + pos) = f7r; + *(p0r + pos) = f4r; + *(p3r + posi) = f7i; + *(p0r + posi) = f4i; + + f6r = f2r + f3r; + f6i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + + f5r = f0r - f1r; + f5i = f0i - f1i; + f0r = f0r + f1r; + f0i = f0i + f1i; + + p3r += pnext; + p0r += pnext; + p1r += pnext; + p2r += pnext; + +} +f7r = f5r - f3i; +f7i = f5i + f3r; +f5r = f5r + f3i; +f5i = f5i - f3r; + +f4r = f0r + f6r; +f4i = f0i + f6i; +f6r = f0r - f6r; +f6i = f0i - f6i; + +f2r = *(p2r + pos); +f2i = *(p2r + posi); +f1r = *(p1r + pos); +f1i = *(p1r + posi); +f3i = *(p3r + posi); +f0r = *(p0r + pos); +f3r = *(p3r + pos); +f0i = *(p0r + posi); + +*p3r = f7r; +*p0r = f4r; +*(p3r + 1) = f7i; +*(p0r + 1) = f4i; +*p1r = f5r; +*p2r = f6r; +*(p1r + 1) = f5i; +*(p2r + 1) = f6i; + +f7r = f2r - f3i; +f7i = f2i + f3r; +f2r = f2r + f3i; +f2i = f2i - f3r; + +f4r = f0r + f1i; +f4i = f0i - f1r; +t1r = f0r - f1i; +t1i = f0i + f1r; + +f5r = t1r - f7r * w1r + f7i * w1r; +f5i = t1i - f7r * w1r - f7i * w1r; +f7r = t1r * Two - f5r; +f7i = t1i * Two - f5i; + +f6r = f4r - f2r * w1r - f2i * w1r; +f6i = f4i + f2r * w1r - f2i * w1r; +f4r = f4r * Two - f6r; +f4i = f4i * Two - f6i; + +*(p2r + pos) = f6r; +*(p1r + pos) = f5r; +*(p2r + posi) = f6i; +*(p1r + posi) = f5i; +*(p3r + pos) = f7r; +*(p0r + pos) = f4r; +*(p3r + posi) = f7i; +*(p0r + posi) = f4i; + +} + +inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); +inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ +/*** RADIX 8 Stages ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long NSameU; +unsigned long Uinc; +unsigned long Uinc2; +unsigned long Uinc4; +unsigned long DiffUCnt; +unsigned long SameUCnt; +unsigned long U2toU3; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; +float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i; + +float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 8; +pos = pinc * 4; +posi = pos + 1; +NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly +Uinc = NSameU * Ustride; +Uinc2 = Uinc * 2; +Uinc4 = Uinc * 4; +U2toU3 = (POW2(M) / 8)*Ustride; +for (; StageCnt > 0 ; StageCnt--){ + + u0r = &Utbl[0]; + u0i = &Utbl[POW2(M-2)*Ustride]; + u1r = u0r; + u1i = u0i; + u2r = u0r; + u2i = u0i; + + w0r = *u0r; + w0i = *u0i; + w1r = *u1r; + w1i = *u1i; + w2r = *u2r; + w2i = *u2i; + w3r = *(u2r+U2toU3); + w3i = *(u2i-U2toU3); + + pstrt = ioptr; + + p0r = pstrt; + p1r = pstrt+pinc; + p2r = p1r+pinc; + p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - w0- f1 - - f1 - - f1 + f2 - - f2 - w1- f2 - - f4 + f3 - w0- t1 - iw1- f3 - - f5 + + f4 - - t0 - - f4 - w2- t0 + f5 - w0- f5 - - f5 - w3- t1 + f6 - - f6 - w1- f6 - iw2- f6 + f7 - w0- t1 - iw1- f7 - iw3- f7 + */ + + for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){ + f0r = *p0r; + f0i = *(p0r + 1); + f1r = *p1r; + f1i = *(p1r + 1); + for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ + f2r = *p2r; + f2i = *(p2r + 1); + f3r = *p3r; + f3i = *(p3r + 1); + + t0r = f0r + f1r * w0r + f1i * w0i; + t0i = f0i - f1r * w0i + f1i * w0r; + f1r = f0r * Two - t0r; + f1i = f0i * Two - t0i; + + f4r = *(p0r + pos); + f4i = *(p0r + posi); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + + f6r = *(p2r + pos); + f6i = *(p2r + posi); + f7r = *(p3r + pos); + f7i = *(p3r + posi); + + t1r = f2r - f3r * w0r - f3i * w0i; + t1i = f2i + f3r * w0i - f3i * w0r; + f2r = f2r * Two - t1r; + f2i = f2i * Two - t1i; + + f0r = t0r + f2r * w1r + f2i * w1i; + f0i = t0i - f2r * w1i + f2i * w1r; + f2r = t0r * Two - f0r; + f2i = t0i * Two - f0i; + + f3r = f1r + t1r * w1i - t1i * w1r; + f3i = f1i + t1r * w1r + t1i * w1i; + f1r = f1r * Two - f3r; + f1i = f1i * Two - f3i; + + + t0r = f4r + f5r * w0r + f5i * w0i; + t0i = f4i - f5r * w0i + f5i * w0r; + f5r = f4r * Two - t0r; + f5i = f4i * Two - t0i; + + t1r = f6r - f7r * w0r - f7i * w0i; + t1i = f6i + f7r * w0i - f7i * w0r; + f6r = f6r * Two - t1r; + f6i = f6i * Two - t1i; + + f4r = t0r + f6r * w1r + f6i * w1i; + f4i = t0i - f6r * w1i + f6i * w1r; + f6r = t0r * Two - f4r; + f6i = t0i * Two - f4i; + + f7r = f5r + t1r * w1i - t1i * w1r; + f7i = f5i + t1r * w1r + t1i * w1i; + f5r = f5r * Two - f7r; + f5i = f5i * Two - f7i; + + t0r = f0r - f4r * w2r - f4i * w2i; + t0i = f0i + f4r * w2i - f4i * w2r; + f0r = f0r * Two - t0r; + f0i = f0i * Two - t0i; + + t1r = f1r - f5r * w3r - f5i * w3i; + t1i = f1i + f5r * w3i - f5i * w3r; + f1r = f1r * Two - t1r; + f1i = f1i * Two - t1i; + + *(p0r + pos) = t0r; + *(p1r + pos) = t1r; + *(p0r + posi) = t0i; + *(p1r + posi) = t1i; + *p0r = f0r; + *p1r = f1r; + *(p0r + 1) = f0i; + *(p1r + 1) = f1i; + + p0r += pnext; + f0r = *p0r; + f0i = *(p0r + 1); + + p1r += pnext; + + f1r = *p1r; + f1i = *(p1r + 1); + + f4r = f2r - f6r * w2i + f6i * w2r; + f4i = f2i - f6r * w2r - f6i * w2i; + f6r = f2r * Two - f4r; + f6i = f2i * Two - f4i; + + f5r = f3r - f7r * w3i + f7i * w3r; + f5i = f3i - f7r * w3r - f7i * w3i; + f7r = f3r * Two - f5r; + f7i = f3i * Two - f5i; + + *p2r = f4r; + *p3r = f5r; + *(p2r + 1) = f4i; + *(p3r + 1) = f5i; + *(p2r + pos) = f6r; + *(p3r + pos) = f7r; + *(p2r + posi) = f6i; + *(p3r + posi) = f7i; + + p2r += pnext; + p3r += pnext; + + } + + f2r = *p2r; + f2i = *(p2r + 1); + f3r = *p3r; + f3i = *(p3r + 1); + + t0r = f0r + f1r * w0r + f1i * w0i; + t0i = f0i - f1r * w0i + f1i * w0r; + f1r = f0r * Two - t0r; + f1i = f0i * Two - t0i; + + f4r = *(p0r + pos); + f4i = *(p0r + posi); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + + f6r = *(p2r + pos); + f6i = *(p2r + posi); + f7r = *(p3r + pos); + f7i = *(p3r + posi); + + t1r = f2r - f3r * w0r - f3i * w0i; + t1i = f2i + f3r * w0i - f3i * w0r; + f2r = f2r * Two - t1r; + f2i = f2i * Two - t1i; + + f0r = t0r + f2r * w1r + f2i * w1i; + f0i = t0i - f2r * w1i + f2i * w1r; + f2r = t0r * Two - f0r; + f2i = t0i * Two - f0i; + + f3r = f1r + t1r * w1i - t1i * w1r; + f3i = f1i + t1r * w1r + t1i * w1i; + f1r = f1r * Two - f3r; + f1i = f1i * Two - f3i; + + if (DiffUCnt == NDiffU/2) + Uinc4 = -Uinc4; + + u0r += Uinc4; + u0i -= Uinc4; + u1r += Uinc2; + u1i -= Uinc2; + u2r += Uinc; + u2i -= Uinc; + + pstrt += 2; + + t0r = f4r + f5r * w0r + f5i * w0i; + t0i = f4i - f5r * w0i + f5i * w0r; + f5r = f4r * Two - t0r; + f5i = f4i * Two - t0i; + + t1r = f6r - f7r * w0r - f7i * w0i; + t1i = f6i + f7r * w0i - f7i * w0r; + f6r = f6r * Two - t1r; + f6i = f6i * Two - t1i; + + f4r = t0r + f6r * w1r + f6i * w1i; + f4i = t0i - f6r * w1i + f6i * w1r; + f6r = t0r * Two - f4r; + f6i = t0i * Two - f4i; + + f7r = f5r + t1r * w1i - t1i * w1r; + f7i = f5i + t1r * w1r + t1i * w1i; + f5r = f5r * Two - f7r; + f5i = f5i * Two - f7i; + + w0r = *u0r; + w0i = *u0i; + w1r = *u1r; + w1i = *u1i; + + if (DiffUCnt <= NDiffU/2) + w0r = -w0r; + + t0r = f0r - f4r * w2r - f4i * w2i; + t0i = f0i + f4r * w2i - f4i * w2r; + f0r = f0r * Two - t0r; + f0i = f0i * Two - t0i; + + f4r = f2r - f6r * w2i + f6i * w2r; + f4i = f2i - f6r * w2r - f6i * w2i; + f6r = f2r * Two - f4r; + f6i = f2i * Two - f4i; + + *(p0r + pos) = t0r; + *p2r = f4r; + *(p0r + posi) = t0i; + *(p2r + 1) = f4i; + w2r = *u2r; + w2i = *u2i; + *p0r = f0r; + *(p2r + pos) = f6r; + *(p0r + 1) = f0i; + *(p2r + posi) = f6i; + + p0r = pstrt; + p2r = pstrt + pinc + pinc; + + t1r = f1r - f5r * w3r - f5i * w3i; + t1i = f1i + f5r * w3i - f5i * w3r; + f1r = f1r * Two - t1r; + f1i = f1i * Two - t1i; + + f5r = f3r - f7r * w3i + f7i * w3r; + f5i = f3i - f7r * w3r - f7i * w3i; + f7r = f3r * Two - f5r; + f7i = f3i * Two - f5i; + + *(p1r + pos) = t1r; + *p3r = f5r; + *(p1r + posi) = t1i; + *(p3r + 1) = f5i; + w3r = *(u2r+U2toU3); + w3i = *(u2i-U2toU3); + *p1r = f1r; + *(p3r + pos) = f7r; + *(p1r + 1) = f1i; + *(p3r + posi) = f7i; + + p1r = pstrt + pinc; + p3r = p2r + pinc; + } + NSameU /= 8; + Uinc /= 8; + Uinc2 /= 8; + Uinc4 = Uinc * 4; + NDiffU *= 8; + pinc *= 8; + pnext *= 8; + pos *= 8; + posi = pos + 1; +} +} + +void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); +void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ +/* recursive bfstages calls to maximize on chip cache efficiency */ +long i1; +if (M <= MCACHE) // fits on chip ? + bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */ +else{ + for (i1=0; i1<8; i1++){ + fftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */ + } + bfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */ +} +} + +void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +long StageCnt; +long NDiffU; + +switch (M){ +case 0: + break; +case 1: + for (;Rows>0;Rows--){ + fft2pt(ioptr); /* a 2 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 2: + for (;Rows>0;Rows--){ + fft4pt(ioptr); /* a 4 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 3: + for (;Rows>0;Rows--){ + fft8pt(ioptr); /* an 8 pt fft */ + ioptr += 2*POW2(M); + } + break; +default: + for (;Rows>0;Rows--){ + + bitrevR2(ioptr, M, BRLow); /* bit reverse and first radix 2 stage */ + + StageCnt = (M-1) / 3; // number of radix 8 stages + NDiffU = 2; // one radix 2 stage already complete + + if ( (M-1-(StageCnt * 3)) == 1 ){ + bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ + NDiffU *= 2; + } + + if ( (M-1-(StageCnt * 3)) == 2 ){ + bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ + NDiffU *= 4; + } + + if (M <= MCACHE) + bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ + + else{ + fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ + } + + ioptr += 2*POW2(M); + } +} +} + +/************************************************ +parts of iffts1 +*************************************************/ + +inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale); +inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale){ +/*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/ +float f0r; +float f0i; +float f1r; +float f1i; +float f2r; +float f2i; +float f3r; +float f3i; +float f4r; +float f4i; +float f5r; +float f5i; +float f6r; +float f6i; +float f7r; +float f7i; +float t0r; +float t0i; +float t1r; +float t1i; +float *p0r; +float *p1r; +float *IOP; +float *iolimit; +long Colstart; +long iCol; +unsigned long posA; +unsigned long posAi; +unsigned long posB; +unsigned long posBi; + +const unsigned long Nrems2 = POW2((M+3)/2); +const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2; +const unsigned long Nroot_1 = POW2(M/2-1)-1; +const unsigned long ColstartShift = (M+1)/2 +1; +posA = POW2(M); // 1/2 of POW2(M) complexes +posAi = posA + 1; +posB = posA + 2; +posBi = posB + 1; + +iolimit = ioptr + Nrems2; +for (; ioptr < iolimit; ioptr += POW2(M/2+1)){ + for (Colstart = Nroot_1; Colstart >= 0; Colstart--){ + iCol = Nroot_1; + p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4; + IOP = ioptr + (Colstart << ColstartShift); + p1r = IOP + BRLow[iCol]*4; + f0r = *(p0r); + f0i = *(p0r+1); + f1r = *(p0r+posA); + f1i = *(p0r+posAi); + for (; iCol > Colstart;){ + f2r = *(p0r+2); + f2i = *(p0r+(2+1)); + f3r = *(p0r+posB); + f3i = *(p0r+posBi); + f4r = *(p1r); + f4i = *(p1r+1); + f5r = *(p1r+posA); + f5i = *(p1r+posAi); + f6r = *(p1r+2); + f6i = *(p1r+(2+1)); + f7r = *(p1r+posB); + f7i = *(p1r+posBi); + + t0r = f0r + f1r; + t0i = f0i + f1i; + f1r = f0r - f1r; + f1i = f0i - f1i; + t1r = f2r + f3r; + t1i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + f0r = f4r + f5r; + f0i = f4i + f5i; + f5r = f4r - f5r; + f5i = f4i - f5i; + f2r = f6r + f7r; + f2i = f6i + f7i; + f7r = f6r - f7r; + f7i = f6i - f7i; + + *(p1r) = scale*t0r; + *(p1r+1) = scale*t0i; + *(p1r+2) = scale*f1r; + *(p1r+(2+1)) = scale*f1i; + *(p1r+posA) = scale*t1r; + *(p1r+posAi) = scale*t1i; + *(p1r+posB) = scale*f3r; + *(p1r+posBi) = scale*f3i; + *(p0r) = scale*f0r; + *(p0r+1) = scale*f0i; + *(p0r+2) = scale*f5r; + *(p0r+(2+1)) = scale*f5i; + *(p0r+posA) = scale*f2r; + *(p0r+posAi) = scale*f2i; + *(p0r+posB) = scale*f7r; + *(p0r+posBi) = scale*f7i; + + p0r -= Nrems2; + f0r = *(p0r); + f0i = *(p0r+1); + f1r = *(p0r+posA); + f1i = *(p0r+posAi); + iCol -= 1; + p1r = IOP + BRLow[iCol]*4; + }; + f2r = *(p0r+2); + f2i = *(p0r+(2+1)); + f3r = *(p0r+posB); + f3i = *(p0r+posBi); + + t0r = f0r + f1r; + t0i = f0i + f1i; + f1r = f0r - f1r; + f1i = f0i - f1i; + t1r = f2r + f3r; + t1i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + + *(p0r) = scale*t0r; + *(p0r+1) = scale*t0i; + *(p0r+2) = scale*f1r; + *(p0r+(2+1)) = scale*f1i; + *(p0r+posA) = scale*t1r; + *(p0r+posAi) = scale*t1i; + *(p0r+posB) = scale*f3r; + *(p0r+posBi) = scale*f3i; + + }; +}; +} + +inline void ifft2pt(float *ioptr, float scale); +inline void ifft2pt(float *ioptr, float scale){ +/*** RADIX 2 ifft ***/ +float f0r, f0i, f1r, f1i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[2]; +f1i = ioptr[3]; + + /* Butterflys */ + /* + f0 - - t0 + f1 - 1 - f1 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + + /* store result */ +ioptr[0] = scale*t0r; +ioptr[1] = scale*t0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +} + +inline void ifft4pt(float *ioptr, float scale); +inline void ifft4pt(float *ioptr, float scale){ +/*** RADIX 4 ifft ***/ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float t0r, t0i, t1r, t1i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[4]; +f1i = ioptr[5]; +f2r = ioptr[2]; +f2i = ioptr[3]; +f3r = ioptr[6]; +f3i = ioptr[7]; + + /* Butterflys */ + /* + f0 - - t0 - - f0 + f1 - 1 - f1 - - f1 + f2 - - f2 - 1 - f2 + f3 - 1 - t1 - i - f3 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r + t1i; +f3i = f1i - t1r; +f1r = f1r - t1i; +f1i = f1i + t1r; + + /* store result */ +ioptr[0] = scale*f0r; +ioptr[1] = scale*f0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +} + +inline void ifft8pt(float *ioptr, float scale); +inline void ifft8pt(float *ioptr, float scale){ +/*** RADIX 8 ifft ***/ +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[8]; +f1i = ioptr[9]; +f2r = ioptr[4]; +f2i = ioptr[5]; +f3r = ioptr[12]; +f3i = ioptr[13]; +f4r = ioptr[2]; +f4i = ioptr[3]; +f5r = ioptr[10]; +f5i = ioptr[11]; +f6r = ioptr[6]; +f6i = ioptr[7]; +f7r = ioptr[14]; +f7i = ioptr[15]; + + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - 1 - f1 - - f1 - - f1 + f2 - - f2 - 1 - f2 - - f2 + f3 - 1 - t1 - i - f3 - - f3 + f4 - - t0 - - f4 - 1 - t0 + f5 - 1 - f5 - - f5 - w3 - f4 + f6 - - f6 - 1 - f6 - i - t1 + f7 - 1 - t1 - i - f7 - iw3- f6 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r + t1i; +f3i = f1i - t1r; +f1r = f1r - t1i; +f1i = f1i + t1r; + + +t0r = f4r + f5r; +t0i = f4i + f5i; +f5r = f4r - f5r; +f5i = f4i - f5i; + +t1r = f6r - f7r; +t1i = f6i - f7i; +f6r = f6r + f7r; +f6i = f6i + f7i; + +f4r = t0r + f6r; +f4i = t0i + f6i; +f6r = t0r - f6r; +f6i = t0i - f6i; + +f7r = f5r + t1i; +f7i = f5i - t1r; +f5r = f5r - t1i; +f5i = f5i + t1r; + + +t0r = f0r - f4r; +t0i = f0i - f4i; +f0r = f0r + f4r; +f0i = f0i + f4i; + +t1r = f2r + f6i; +t1i = f2i - f6r; +f2r = f2r - f6i; +f2i = f2i + f6r; + +f4r = f1r - f5r * w0r + f5i * w0r; +f4i = f1i - f5r * w0r - f5i * w0r; +f1r = f1r * Two - f4r; +f1i = f1i * Two - f4i; + +f6r = f3r + f7r * w0r + f7i * w0r; +f6i = f3i - f7r * w0r + f7i * w0r; +f3r = f3r * Two - f6r; +f3i = f3i * Two - f6i; + + /* store result */ +ioptr[0] = scale*f0r; +ioptr[1] = scale*f0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +ioptr[8] = scale*t0r; +ioptr[9] = scale*t0i; +ioptr[10] = scale*f4r; +ioptr[11] = scale*f4i; +ioptr[12] = scale*t1r; +ioptr[13] = scale*t1i; +ioptr[14] = scale*f6r; +ioptr[15] = scale*f6i; +} + +inline void ibfR2(float *ioptr, long M, long NDiffU); +inline void ibfR2(float *ioptr, long M, long NDiffU){ +/*** 2nd radix 2 stage ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long NSameU; +unsigned long SameUCnt; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; + +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 4; +pos = 2; +posi = pos+1; +NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time +pstrt = ioptr; +p0r = pstrt; +p1r = pstrt+pinc; +p2r = p1r+pinc; +p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - f4 + f1 - 1 - f5 + f2 - - f6 + f3 - 1 - f7 + */ + /* Butterflys */ + /* + f0 - - f4 + f1 - 1 - f5 + f2 - - f6 + f3 - 1 - f7 + */ + +for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){ + + f0r = *p0r; + f1r = *p1r; + f0i = *(p0r + 1); + f1i = *(p1r + 1); + f2r = *p2r; + f3r = *p3r; + f2i = *(p2r + 1); + f3i = *(p3r + 1); + + f4r = f0r + f1r; + f4i = f0i + f1i; + f5r = f0r - f1r; + f5i = f0i - f1i; + + f6r = f2r + f3r; + f6i = f2i + f3i; + f7r = f2r - f3r; + f7i = f2i - f3i; + + *p0r = f4r; + *(p0r + 1) = f4i; + *p1r = f5r; + *(p1r + 1) = f5i; + *p2r = f6r; + *(p2r + 1) = f6i; + *p3r = f7r; + *(p3r + 1) = f7i; + + f0r = *(p0r + pos); + f1i = *(p1r + posi); + f0i = *(p0r + posi); + f1r = *(p1r + pos); + f2r = *(p2r + pos); + f3i = *(p3r + posi); + f2i = *(p2r + posi); + f3r = *(p3r + pos); + + f4r = f0r - f1i; + f4i = f0i + f1r; + f5r = f0r + f1i; + f5i = f0i - f1r; + + f6r = f2r - f3i; + f6i = f2i + f3r; + f7r = f2r + f3i; + f7i = f2i - f3r; + + *(p0r + pos) = f4r; + *(p0r + posi) = f4i; + *(p1r + pos) = f5r; + *(p1r + posi) = f5i; + *(p2r + pos) = f6r; + *(p2r + posi) = f6i; + *(p3r + pos) = f7r; + *(p3r + posi) = f7i; + + p0r += pnext; + p1r += pnext; + p2r += pnext; + p3r += pnext; + +} +} + +inline void ibfR4(float *ioptr, long M, long NDiffU); +inline void ibfR4(float *ioptr, long M, long NDiffU){ +/*** 1 radix 4 stage ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long pnexti; +unsigned long NSameU; +unsigned long SameUCnt; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; + +float w1r = 1.0/MYROOT2; /* cos(pi/4) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t1r, t1i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 4; +pnexti = pnext + 1; +pos = 2; +posi = pos+1; +NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly +pstrt = ioptr; +p0r = pstrt; +p1r = pstrt+pinc; +p2r = p1r+pinc; +p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - f0 - - f4 + f1 - 1 - f5 - - f5 + f2 - - f6 - 1 - f6 + f3 - 1 - f3 - -i - f7 + */ + /* Butterflys */ + /* + f0 - - f4 - - f4 + f1 - -i - t1 - - f5 + f2 - - f2 - w1 - f6 + f3 - -i - f7 - iw1- f7 + */ + +f0r = *p0r; +f1r = *p1r; +f2r = *p2r; +f3r = *p3r; +f0i = *(p0r + 1); +f1i = *(p1r + 1); +f2i = *(p2r + 1); +f3i = *(p3r + 1); + +f5r = f0r - f1r; +f5i = f0i - f1i; +f0r = f0r + f1r; +f0i = f0i + f1i; + +f6r = f2r + f3r; +f6i = f2i + f3i; +f3r = f2r - f3r; +f3i = f2i - f3i; + +for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ + + f7r = f5r + f3i; + f7i = f5i - f3r; + f5r = f5r - f3i; + f5i = f5i + f3r; + + f4r = f0r + f6r; + f4i = f0i + f6i; + f6r = f0r - f6r; + f6i = f0i - f6i; + + f2r = *(p2r + pos); + f2i = *(p2r + posi); + f1r = *(p1r + pos); + f1i = *(p1r + posi); + f3i = *(p3r + posi); + f0r = *(p0r + pos); + f3r = *(p3r + pos); + f0i = *(p0r + posi); + + *p3r = f7r; + *p0r = f4r; + *(p3r + 1) = f7i; + *(p0r + 1) = f4i; + *p1r = f5r; + *p2r = f6r; + *(p1r + 1) = f5i; + *(p2r + 1) = f6i; + + f7r = f2r + f3i; + f7i = f2i - f3r; + f2r = f2r - f3i; + f2i = f2i + f3r; + + f4r = f0r - f1i; + f4i = f0i + f1r; + t1r = f0r + f1i; + t1i = f0i - f1r; + + f5r = t1r - f7r * w1r - f7i * w1r; + f5i = t1i + f7r * w1r - f7i * w1r; + f7r = t1r * Two - f5r; + f7i = t1i * Two - f5i; + + f6r = f4r - f2r * w1r + f2i * w1r; + f6i = f4i - f2r * w1r - f2i * w1r; + f4r = f4r * Two - f6r; + f4i = f4i * Two - f6i; + + f3r = *(p3r + pnext); + f0r = *(p0r + pnext); + f3i = *(p3r + pnexti); + f0i = *(p0r + pnexti); + f2r = *(p2r + pnext); + f2i = *(p2r + pnexti); + f1r = *(p1r + pnext); + f1i = *(p1r + pnexti); + + *(p2r + pos) = f6r; + *(p1r + pos) = f5r; + *(p2r + posi) = f6i; + *(p1r + posi) = f5i; + *(p3r + pos) = f7r; + *(p0r + pos) = f4r; + *(p3r + posi) = f7i; + *(p0r + posi) = f4i; + + f6r = f2r + f3r; + f6i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + + f5r = f0r - f1r; + f5i = f0i - f1i; + f0r = f0r + f1r; + f0i = f0i + f1i; + + p3r += pnext; + p0r += pnext; + p1r += pnext; + p2r += pnext; + +} +f7r = f5r + f3i; +f7i = f5i - f3r; +f5r = f5r - f3i; +f5i = f5i + f3r; + +f4r = f0r + f6r; +f4i = f0i + f6i; +f6r = f0r - f6r; +f6i = f0i - f6i; + +f2r = *(p2r + pos); +f2i = *(p2r + posi); +f1r = *(p1r + pos); +f1i = *(p1r + posi); +f3i = *(p3r + posi); +f0r = *(p0r + pos); +f3r = *(p3r + pos); +f0i = *(p0r + posi); + +*p3r = f7r; +*p0r = f4r; +*(p3r + 1) = f7i; +*(p0r + 1) = f4i; +*p1r = f5r; +*p2r = f6r; +*(p1r + 1) = f5i; +*(p2r + 1) = f6i; + +f7r = f2r + f3i; +f7i = f2i - f3r; +f2r = f2r - f3i; +f2i = f2i + f3r; + +f4r = f0r - f1i; +f4i = f0i + f1r; +t1r = f0r + f1i; +t1i = f0i - f1r; + +f5r = t1r - f7r * w1r - f7i * w1r; +f5i = t1i + f7r * w1r - f7i * w1r; +f7r = t1r * Two - f5r; +f7i = t1i * Two - f5i; + +f6r = f4r - f2r * w1r + f2i * w1r; +f6i = f4i - f2r * w1r - f2i * w1r; +f4r = f4r * Two - f6r; +f4i = f4i * Two - f6i; + +*(p2r + pos) = f6r; +*(p1r + pos) = f5r; +*(p2r + posi) = f6i; +*(p1r + posi) = f5i; +*(p3r + pos) = f7r; +*(p0r + pos) = f4r; +*(p3r + posi) = f7i; +*(p0r + posi) = f4i; + +} + +inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); +inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ +/*** RADIX 8 Stages ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long NSameU; +unsigned long Uinc; +unsigned long Uinc2; +unsigned long Uinc4; +unsigned long DiffUCnt; +unsigned long SameUCnt; +unsigned long U2toU3; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; +float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i; + +float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 8; +pos = pinc * 4; +posi = pos + 1; +NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly +Uinc = NSameU * Ustride; +Uinc2 = Uinc * 2; +Uinc4 = Uinc * 4; +U2toU3 = (POW2(M) / 8)*Ustride; +for (; StageCnt > 0 ; StageCnt--){ + + u0r = &Utbl[0]; + u0i = &Utbl[POW2(M-2)*Ustride]; + u1r = u0r; + u1i = u0i; + u2r = u0r; + u2i = u0i; + + w0r = *u0r; + w0i = *u0i; + w1r = *u1r; + w1i = *u1i; + w2r = *u2r; + w2i = *u2i; + w3r = *(u2r+U2toU3); + w3i = *(u2i-U2toU3); + + pstrt = ioptr; + + p0r = pstrt; + p1r = pstrt+pinc; + p2r = p1r+pinc; + p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - w0- f1 - - f1 - - f1 + f2 - - f2 - w1- f2 - - f4 + f3 - w0- t1 - iw1- f3 - - f5 + + f4 - - t0 - - f4 - w2- t0 + f5 - w0- f5 - - f5 - w3- t1 + f6 - - f6 - w1- f6 - iw2- f6 + f7 - w0- t1 - iw1- f7 - iw3- f7 + */ + + for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){ + f0r = *p0r; + f0i = *(p0r + 1); + f1r = *p1r; + f1i = *(p1r + 1); + for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ + f2r = *p2r; + f2i = *(p2r + 1); + f3r = *p3r; + f3i = *(p3r + 1); + + t0r = f0r + f1r * w0r - f1i * w0i; + t0i = f0i + f1r * w0i + f1i * w0r; + f1r = f0r * Two - t0r; + f1i = f0i * Two - t0i; + + f4r = *(p0r + pos); + f4i = *(p0r + posi); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + + f6r = *(p2r + pos); + f6i = *(p2r + posi); + f7r = *(p3r + pos); + f7i = *(p3r + posi); + + t1r = f2r - f3r * w0r + f3i * w0i; + t1i = f2i - f3r * w0i - f3i * w0r; + f2r = f2r * Two - t1r; + f2i = f2i * Two - t1i; + + f0r = t0r + f2r * w1r - f2i * w1i; + f0i = t0i + f2r * w1i + f2i * w1r; + f2r = t0r * Two - f0r; + f2i = t0i * Two - f0i; + + f3r = f1r + t1r * w1i + t1i * w1r; + f3i = f1i - t1r * w1r + t1i * w1i; + f1r = f1r * Two - f3r; + f1i = f1i * Two - f3i; + + + t0r = f4r + f5r * w0r - f5i * w0i; + t0i = f4i + f5r * w0i + f5i * w0r; + f5r = f4r * Two - t0r; + f5i = f4i * Two - t0i; + + t1r = f6r - f7r * w0r + f7i * w0i; + t1i = f6i - f7r * w0i - f7i * w0r; + f6r = f6r * Two - t1r; + f6i = f6i * Two - t1i; + + f4r = t0r + f6r * w1r - f6i * w1i; + f4i = t0i + f6r * w1i + f6i * w1r; + f6r = t0r * Two - f4r; + f6i = t0i * Two - f4i; + + f7r = f5r + t1r * w1i + t1i * w1r; + f7i = f5i - t1r * w1r + t1i * w1i; + f5r = f5r * Two - f7r; + f5i = f5i * Two - f7i; + + t0r = f0r - f4r * w2r + f4i * w2i; + t0i = f0i - f4r * w2i - f4i * w2r; + f0r = f0r * Two - t0r; + f0i = f0i * Two - t0i; + + t1r = f1r - f5r * w3r + f5i * w3i; + t1i = f1i - f5r * w3i - f5i * w3r; + f1r = f1r * Two - t1r; + f1i = f1i * Two - t1i; + + *(p0r + pos) = t0r; + *(p0r + posi) = t0i; + *p0r = f0r; + *(p0r + 1) = f0i; + + p0r += pnext; + f0r = *p0r; + f0i = *(p0r + 1); + + *(p1r + pos) = t1r; + *(p1r + posi) = t1i; + *p1r = f1r; + *(p1r + 1) = f1i; + + p1r += pnext; + + f1r = *p1r; + f1i = *(p1r + 1); + + f4r = f2r - f6r * w2i - f6i * w2r; + f4i = f2i + f6r * w2r - f6i * w2i; + f6r = f2r * Two - f4r; + f6i = f2i * Two - f4i; + + f5r = f3r - f7r * w3i - f7i * w3r; + f5i = f3i + f7r * w3r - f7i * w3i; + f7r = f3r * Two - f5r; + f7i = f3i * Two - f5i; + + *p2r = f4r; + *(p2r + 1) = f4i; + *(p2r + pos) = f6r; + *(p2r + posi) = f6i; + + p2r += pnext; + + *p3r = f5r; + *(p3r + 1) = f5i; + *(p3r + pos) = f7r; + *(p3r + posi) = f7i; + + p3r += pnext; + + } + + f2r = *p2r; + f2i = *(p2r + 1); + f3r = *p3r; + f3i = *(p3r + 1); + + t0r = f0r + f1r * w0r - f1i * w0i; + t0i = f0i + f1r * w0i + f1i * w0r; + f1r = f0r * Two - t0r; + f1i = f0i * Two - t0i; + + f4r = *(p0r + pos); + f4i = *(p0r + posi); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + + f6r = *(p2r + pos); + f6i = *(p2r + posi); + f7r = *(p3r + pos); + f7i = *(p3r + posi); + + t1r = f2r - f3r * w0r + f3i * w0i; + t1i = f2i - f3r * w0i - f3i * w0r; + f2r = f2r * Two - t1r; + f2i = f2i * Two - t1i; + + f0r = t0r + f2r * w1r - f2i * w1i; + f0i = t0i + f2r * w1i + f2i * w1r; + f2r = t0r * Two - f0r; + f2i = t0i * Two - f0i; + + f3r = f1r + t1r * w1i + t1i * w1r; + f3i = f1i - t1r * w1r + t1i * w1i; + f1r = f1r * Two - f3r; + f1i = f1i * Two - f3i; + + if (DiffUCnt == NDiffU/2) + Uinc4 = -Uinc4; + + u0r += Uinc4; + u0i -= Uinc4; + u1r += Uinc2; + u1i -= Uinc2; + u2r += Uinc; + u2i -= Uinc; + + pstrt += 2; + + t0r = f4r + f5r * w0r - f5i * w0i; + t0i = f4i + f5r * w0i + f5i * w0r; + f5r = f4r * Two - t0r; + f5i = f4i * Two - t0i; + + t1r = f6r - f7r * w0r + f7i * w0i; + t1i = f6i - f7r * w0i - f7i * w0r; + f6r = f6r * Two - t1r; + f6i = f6i * Two - t1i; + + f4r = t0r + f6r * w1r - f6i * w1i; + f4i = t0i + f6r * w1i + f6i * w1r; + f6r = t0r * Two - f4r; + f6i = t0i * Two - f4i; + + f7r = f5r + t1r * w1i + t1i * w1r; + f7i = f5i - t1r * w1r + t1i * w1i; + f5r = f5r * Two - f7r; + f5i = f5i * Two - f7i; + + w0r = *u0r; + w0i = *u0i; + w1r = *u1r; + w1i = *u1i; + + if (DiffUCnt <= NDiffU/2) + w0r = -w0r; + + t0r = f0r - f4r * w2r + f4i * w2i; + t0i = f0i - f4r * w2i - f4i * w2r; + f0r = f0r * Two - t0r; + f0i = f0i * Two - t0i; + + f4r = f2r - f6r * w2i - f6i * w2r; + f4i = f2i + f6r * w2r - f6i * w2i; + f6r = f2r * Two - f4r; + f6i = f2i * Two - f4i; + + *(p0r + pos) = t0r; + *p2r = f4r; + *(p0r + posi) = t0i; + *(p2r + 1) = f4i; + w2r = *u2r; + w2i = *u2i; + *p0r = f0r; + *(p2r + pos) = f6r; + *(p0r + 1) = f0i; + *(p2r + posi) = f6i; + + p0r = pstrt; + p2r = pstrt + pinc + pinc; + + t1r = f1r - f5r * w3r + f5i * w3i; + t1i = f1i - f5r * w3i - f5i * w3r; + f1r = f1r * Two - t1r; + f1i = f1i * Two - t1i; + + f5r = f3r - f7r * w3i - f7i * w3r; + f5i = f3i + f7r * w3r - f7i * w3i; + f7r = f3r * Two - f5r; + f7i = f3i * Two - f5i; + + *(p1r + pos) = t1r; + *p3r = f5r; + *(p1r + posi) = t1i; + *(p3r + 1) = f5i; + w3r = *(u2r+U2toU3); + w3i = *(u2i-U2toU3); + *p1r = f1r; + *(p3r + pos) = f7r; + *(p1r + 1) = f1i; + *(p3r + posi) = f7i; + + p1r = pstrt + pinc; + p3r = p2r + pinc; + } + NSameU /= 8; + Uinc /= 8; + Uinc2 /= 8; + Uinc4 = Uinc * 4; + NDiffU *= 8; + pinc *= 8; + pnext *= 8; + pos *= 8; + posi = pos + 1; +} +} + +void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); +void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ +/* recursive bfstages calls to maximize on chip cache efficiency */ +long i1; +if (M <= MCACHE) + ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */ +else{ + for (i1=0; i1<8; i1++){ + ifftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */ + } + ibfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */ +} +} + +void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +long StageCnt; +long NDiffU; +const float scale = 1.0/POW2(M); + +switch (M){ +case 0: + break; +case 1: + for (;Rows>0;Rows--){ + ifft2pt(ioptr, scale); /* a 2 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 2: + for (;Rows>0;Rows--){ + ifft4pt(ioptr, scale); /* a 4 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 3: + for (;Rows>0;Rows--){ + ifft8pt(ioptr, scale); /* an 8 pt fft */ + ioptr += 2*POW2(M); + } + break; +default: + for (;Rows>0;Rows--){ + + scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ + + StageCnt = (M-1) / 3; // number of radix 8 stages + NDiffU = 2; // one radix 2 stage already complete + + if ( (M-1-(StageCnt * 3)) == 1 ){ + ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ + NDiffU *= 2; + } + + if ( (M-1-(StageCnt * 3)) == 2 ){ + ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ + NDiffU *= 4; + } + + if (M <= MCACHE) + ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ + + else{ + ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ + } + + ioptr += 2*POW2(M); + } +} +} + +/************************************************ +parts of rffts1 +*************************************************/ + +inline void rfft1pt(float *ioptr); +inline void rfft1pt(float *ioptr){ +/*** RADIX 2 rfft ***/ +float f0r, f0i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; + + /* finish rfft */ +t0r = f0r + f0i; +t0i = f0r - f0i; + + /* store result */ +ioptr[0] = t0r; +ioptr[1] = t0i; +} + +inline void rfft2pt(float *ioptr); +inline void rfft2pt(float *ioptr){ +/*** RADIX 4 rfft ***/ +float f0r, f0i, f1r, f1i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[2]; +f1i = ioptr[3]; + + /* Butterflys */ + /* + f0 - - t0 + f1 - 1 - f1 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f1i - f0i; + /* finish rfft */ +f0r = t0r + t0i; +f0i = t0r - t0i; + + /* store result */ +ioptr[0] = f0r; +ioptr[1] = f0i; +ioptr[2] = f1r; +ioptr[3] = f1i; +} + +inline void rfft4pt(float *ioptr); +inline void rfft4pt(float *ioptr){ +/*** RADIX 8 rfft ***/ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float t0r, t0i, t1r, t1i; +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +const float Two = 2.0; +const float scale = 0.5; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[4]; +f1i = ioptr[5]; +f2r = ioptr[2]; +f2i = ioptr[3]; +f3r = ioptr[6]; +f3i = ioptr[7]; + + /* Butterflys */ + /* + f0 - - t0 - - f0 + f1 - 1 - f1 - - f1 + f2 - - f2 - 1 - f2 + f3 - 1 - t1 - -i - f3 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = f2i - t0i; // neg for rfft + +f3r = f1r - t1i; +f3i = f1i + t1r; +f1r = f1r + t1i; +f1i = f1i - t1r; + + /* finish rfft */ +t0r = f0r + f0i; /* compute Re(x[0]) */ +t0i = f0r - f0i; /* compute Re(x[N/2]) */ + +t1r = f1r + f3r; +t1i = f1i - f3i; +f0r = f1i + f3i; +f0i = f3r - f1r; + +f1r = t1r + w0r * f0r + w0r * f0i; +f1i = t1i - w0r * f0r + w0r * f0i; +f3r = Two * t1r - f1r; +f3i = f1i - Two * t1i; + + /* store result */ +ioptr[4] = f2r; +ioptr[5] = f2i; +ioptr[0] = t0r; +ioptr[1] = t0i; + +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +} + +inline void rfft8pt(float *ioptr); +inline void rfft8pt(float *ioptr){ +/*** RADIX 16 rfft ***/ +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +float w1r = MYCOSPID8; /* cos(pi/8) */ +float w1i = MYSINPID8; /* sin(pi/8) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; +const float scale = 0.5; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[8]; +f1i = ioptr[9]; +f2r = ioptr[4]; +f2i = ioptr[5]; +f3r = ioptr[12]; +f3i = ioptr[13]; +f4r = ioptr[2]; +f4i = ioptr[3]; +f5r = ioptr[10]; +f5i = ioptr[11]; +f6r = ioptr[6]; +f6i = ioptr[7]; +f7r = ioptr[14]; +f7i = ioptr[15]; + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - 1 - f1 - - f1 - - f1 + f2 - - f2 - 1 - f2 - - f2 + f3 - 1 - t1 - -i - f3 - - f3 + f4 - - t0 - - f4 - 1 - t0 + f5 - 1 - f5 - - f5 - w3 - f4 + f6 - - f6 - 1 - f6 - -i - t1 + f7 - 1 - t1 - -i - f7 - iw3- f6 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r - t1i; +f3i = f1i + t1r; +f1r = f1r + t1i; +f1i = f1i - t1r; + + +t0r = f4r + f5r; +t0i = f4i + f5i; +f5r = f4r - f5r; +f5i = f4i - f5i; + +t1r = f6r - f7r; +t1i = f6i - f7i; +f6r = f6r + f7r; +f6i = f6i + f7i; + +f4r = t0r + f6r; +f4i = t0i + f6i; +f6r = t0r - f6r; +f6i = t0i - f6i; + +f7r = f5r - t1i; +f7i = f5i + t1r; +f5r = f5r + t1i; +f5i = f5i - t1r; + + +t0r = f0r - f4r; +t0i = f4i - f0i; // neg for rfft +f0r = f0r + f4r; +f0i = f0i + f4i; + +t1r = f2r - f6i; +t1i = f2i + f6r; +f2r = f2r + f6i; +f2i = f2i - f6r; + +f4r = f1r - f5r * w0r - f5i * w0r; +f4i = f1i + f5r * w0r - f5i * w0r; +f1r = f1r * Two - f4r; +f1i = f1i * Two - f4i; + +f6r = f3r + f7r * w0r - f7i * w0r; +f6i = f3i + f7r * w0r + f7i * w0r; +f3r = f3r * Two - f6r; +f3i = f3i * Two - f6i; + + /* finish rfft */ +f5r = f0r + f0i; /* compute Re(x[0]) */ +f5i = f0r - f0i; /* compute Re(x[N/2]) */ + +f0r = f2r + t1r; +f0i = f2i - t1i; +f7r = f2i + t1i; +f7i = t1r - f2r; + +f2r = f0r + w0r * f7r + w0r * f7i; +f2i = f0i - w0r * f7r + w0r * f7i; +t1r = Two * f0r - f2r; +t1i = f2i - Two * f0i; + + +f0r = f1r + f6r; +f0i = f1i - f6i; +f7r = f1i + f6i; +f7i = f6r - f1r; + +f1r = f0r + w1r * f7r + w1i * f7i; +f1i = f0i - w1i * f7r + w1r * f7i; +f6r = Two * f0r - f1r; +f6i = f1i - Two * f0i; + +f0r = f3r + f4r; +f0i = f3i - f4i; +f7r = f3i + f4i; +f7i = f4r - f3r; + +f3r = f0r + w1i * f7r + w1r * f7i; +f3i = f0i - w1r * f7r + w1i * f7i; +f4r = Two * f0r - f3r; +f4i = f3i - Two * f0i; + + /* store result */ +ioptr[8] = t0r; +ioptr[9] = t0i; +ioptr[0] = f5r; +ioptr[1] = f5i; + +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[12] = scale*t1r; +ioptr[13] = scale*t1i; + +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +ioptr[10] = scale*f4r; +ioptr[11] = scale*f4i; +ioptr[14] = scale*f6r; +ioptr[15] = scale*f6i; +} + +inline void frstage(float *ioptr, long M, float *Utbl); +inline void frstage(float *ioptr, long M, float *Utbl){ +/* Finish RFFT */ + +unsigned long pos; +unsigned long posi; +unsigned long diffUcnt; + +float *p0r, *p1r; +float *u0r, *u0i; + +float w0r, w0i; +float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + +pos = POW2(M-1); +posi = pos + 1; + +p0r = ioptr; +p1r = ioptr + pos/2; + +u0r = Utbl + POW2(M-3); + +w0r = *u0r, + +f0r = *(p0r); +f0i = *(p0r + 1); +f4r = *(p0r + pos); +f4i = *(p0r + posi); +f1r = *(p1r); +f1i = *(p1r + 1); +f5r = *(p1r + pos); +f5i = *(p1r + posi); + + t0r = Two * f0r + Two * f0i; /* compute Re(x[0]) */ + t0i = Two * f0r - Two * f0i; /* compute Re(x[N/2]) */ + t1r = f4r + f4r; + t1i = -f4i - f4i; + + f0r = f1r + f5r; + f0i = f1i - f5i; + f4r = f1i + f5i; + f4i = f5r - f1r; + + f1r = f0r + w0r * f4r + w0r * f4i; + f1i = f0i - w0r * f4r + w0r * f4i; + f5r = Two * f0r - f1r; + f5i = f1i - Two * f0i; + +*(p0r) = t0r; +*(p0r + 1) = t0i; +*(p0r + pos) = t1r; +*(p0r + posi) = t1i; +*(p1r) = f1r; +*(p1r + 1) = f1i; +*(p1r + pos) = f5r; +*(p1r + posi) = f5i; + +u0r = Utbl + 1; +u0i = Utbl + (POW2(M-2)-1); + +w0r = *u0r, +w0i = *u0i; + +p0r = (ioptr + 2); +p1r = (ioptr + (POW2(M-2)-1)*2); + + /* Butterflys */ + /* + f0 - t0 - - f0 + f5 - t1 - w0 - f5 + + f1 - t0 - - f1 + f4 - t1 -iw0 - f4 + */ + +for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){ + + f0r = *(p0r); + f0i = *(p0r + 1); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + f1r = *(p1r); + f1i = *(p1r + 1); + f4r = *(p0r + pos); + f4i = *(p0r + posi); + + t0r = f0r + f5r; + t0i = f0i - f5i; + t1r = f0i + f5i; + t1i = f5r - f0r; + + f0r = t0r + w0r * t1r + w0i * t1i; + f0i = t0i - w0i * t1r + w0r * t1i; + f5r = Two * t0r - f0r; + f5i = f0i - Two * t0i; + + t0r = f1r + f4r; + t0i = f1i - f4i; + t1r = f1i + f4i; + t1i = f4r - f1r; + + f1r = t0r + w0i * t1r + w0r * t1i; + f1i = t0i - w0r * t1r + w0i * t1i; + f4r = Two * t0r - f1r; + f4i = f1i - Two * t0i; + + *(p0r) = f0r; + *(p0r + 1) = f0i; + *(p1r + pos) = f5r; + *(p1r + posi) = f5i; + + w0r = *++u0r; + w0i = *--u0i; + + *(p1r) = f1r; + *(p1r + 1) = f1i; + *(p0r + pos) = f4r; + *(p0r + posi) = f4i; + + p0r += 2; + p1r -= 2; +}; +} + +void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ + +float scale; +long StageCnt; +long NDiffU; + +M=M-1; +switch (M){ +case -1: + break; +case 0: + for (;Rows>0;Rows--){ + rfft1pt(ioptr); /* a 2 pt fft */ + ioptr += 2*POW2(M); + } +case 1: + for (;Rows>0;Rows--){ + rfft2pt(ioptr); /* a 4 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 2: + for (;Rows>0;Rows--){ + rfft4pt(ioptr); /* an 8 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 3: + for (;Rows>0;Rows--){ + rfft8pt(ioptr); /* a 16 pt fft */ + ioptr += 2*POW2(M); + } + break; +default: + scale = 0.5; + for (;Rows>0;Rows--){ + + scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ + + StageCnt = (M-1) / 3; // number of radix 8 stages + NDiffU = 2; // one radix 2 stage already complete + + if ( (M-1-(StageCnt * 3)) == 1 ){ + bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ + NDiffU *= 2; + } + + if ( (M-1-(StageCnt * 3)) == 2 ){ + bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ + NDiffU *= 4; + } + + if (M <= MCACHE){ + bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ + frstage(ioptr, M+1, Utbl); + } + + else{ + fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ + frstage(ioptr, M+1, Utbl); + } + + ioptr += 2*POW2(M); + } +} +} + +/************************************************ +parts of riffts1 +*************************************************/ + +inline void rifft1pt(float *ioptr, float scale); +inline void rifft1pt(float *ioptr, float scale){ +/*** RADIX 2 rifft ***/ +float f0r, f0i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; + + /* finish rfft */ +t0r = f0r + f0i; +t0i = f0r - f0i; + + /* store result */ +ioptr[0] = scale*t0r; +ioptr[1] = scale*t0i; +} + +inline void rifft2pt(float *ioptr, float scale); +inline void rifft2pt(float *ioptr, float scale){ +/*** RADIX 4 rifft ***/ +float f0r, f0i, f1r, f1i; +float t0r, t0i; +const float Two = 2.0; + + /* bit reversed load */ +t0r = ioptr[0]; +t0i = ioptr[1]; +f1r = Two * ioptr[2]; +f1i = Two * ioptr[3]; + + /* start rifft */ +f0r = t0r + t0i; +f0i = t0r - t0i; + /* Butterflys */ + /* + f0 - - t0 + f1 - 1 - f1 + */ + +t0r = f0r + f1r; +t0i = f0i - f1i; +f1r = f0r - f1r; +f1i = f0i + f1i; + + /* store result */ +ioptr[0] = scale*t0r; +ioptr[1] = scale*t0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +} + +inline void rifft4pt(float *ioptr, float scale); +inline void rifft4pt(float *ioptr, float scale){ +/*** RADIX 8 rifft ***/ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float t0r, t0i, t1r, t1i; +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +const float Two = 2.0; + + /* bit reversed load */ +t0r = ioptr[0]; +t0i = ioptr[1]; +f2r = ioptr[2]; +f2i = ioptr[3]; +f1r = Two * ioptr[4]; +f1i = Two * ioptr[5]; +f3r = ioptr[6]; +f3i = ioptr[7]; + + /* start rfft */ +f0r = t0r + t0i; /* compute Re(x[0]) */ +f0i = t0r - t0i; /* compute Re(x[N/2]) */ + +t1r = f2r + f3r; +t1i = f2i - f3i; +t0r = f2r - f3r; +t0i = f2i + f3i; + +f2r = t1r - w0r * t0r - w0r * t0i; +f2i = t1i + w0r * t0r - w0r * t0i; +f3r = Two * t1r - f2r; +f3i = f2i - Two * t1i; + + /* Butterflys */ + /* + f0 - - t0 - - f0 + f1 - 1 - f1 - - f1 + f2 - - f2 - 1 - f2 + f3 - 1 - t1 - i - f3 + */ + +t0r = f0r + f1r; +t0i = f0i - f1i; +f1r = f0r - f1r; +f1i = f0i + f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r + t1i; +f3i = f1i - t1r; +f1r = f1r - t1i; +f1i = f1i + t1r; + + /* store result */ +ioptr[0] = scale*f0r; +ioptr[1] = scale*f0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +} + +inline void rifft8pt(float *ioptr, float scale); +inline void rifft8pt(float *ioptr, float scale){ +/*** RADIX 16 rifft ***/ +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +float w1r = MYCOSPID8; /* cos(pi/8) */ +float w1i = MYSINPID8; /* sin(pi/8) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + + /* bit reversed load */ +t0r = ioptr[0]; +t0i = ioptr[1]; +f4r = ioptr[2]; +f4i = ioptr[3]; +f2r = ioptr[4]; +f2i = ioptr[5]; +f6r = ioptr[6]; +f6i = ioptr[7]; +f1r = Two * ioptr[8]; +f1i = Two * ioptr[9]; +f5r = ioptr[10]; +f5i = ioptr[11]; +f3r = ioptr[12]; +f3i = ioptr[13]; +f7r = ioptr[14]; +f7i = ioptr[15]; + + + /* start rfft */ +f0r = t0r + t0i; /* compute Re(x[0]) */ +f0i = t0r - t0i; /* compute Re(x[N/2]) */ + +t0r = f2r + f3r; +t0i = f2i - f3i; +t1r = f2r - f3r; +t1i = f2i + f3i; + +f2r = t0r - w0r * t1r - w0r * t1i; +f2i = t0i + w0r * t1r - w0r * t1i; +f3r = Two * t0r - f2r; +f3i = f2i - Two * t0i; + +t0r = f4r + f7r; +t0i = f4i - f7i; +t1r = f4r - f7r; +t1i = f4i + f7i; + +f4r = t0r - w1i * t1r - w1r * t1i; +f4i = t0i + w1r * t1r - w1i * t1i; +f7r = Two * t0r - f4r; +f7i = f4i - Two * t0i; + +t0r = f6r + f5r; +t0i = f6i - f5i; +t1r = f6r - f5r; +t1i = f6i + f5i; + +f6r = t0r - w1r * t1r - w1i * t1i; +f6i = t0i + w1i * t1r - w1r * t1i; +f5r = Two * t0r - f6r; +f5i = f6i - Two * t0i; + + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1* - 1 - f1 - - f1 - - f1 + f2 - - f2 - 1 - f2 - - f2 + f3 - 1 - t1 - i - f3 - - f3 + f4 - - t0 - - f4 - 1 - t0 + f5 - 1 - f5 - - f5 - w3 - f4 + f6 - - f6 - 1 - f6 - i - t1 + f7 - 1 - t1 - i - f7 - iw3- f6 + */ + +t0r = f0r + f1r; +t0i = f0i - f1i; +f1r = f0r - f1r; +f1i = f0i + f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r + t1i; +f3i = f1i - t1r; +f1r = f1r - t1i; +f1i = f1i + t1r; + + +t0r = f4r + f5r; +t0i = f4i + f5i; +f5r = f4r - f5r; +f5i = f4i - f5i; + +t1r = f6r - f7r; +t1i = f6i - f7i; +f6r = f6r + f7r; +f6i = f6i + f7i; + +f4r = t0r + f6r; +f4i = t0i + f6i; +f6r = t0r - f6r; +f6i = t0i - f6i; + +f7r = f5r + t1i; +f7i = f5i - t1r; +f5r = f5r - t1i; +f5i = f5i + t1r; + + +t0r = f0r - f4r; +t0i = f0i - f4i; +f0r = f0r + f4r; +f0i = f0i + f4i; + +t1r = f2r + f6i; +t1i = f2i - f6r; +f2r = f2r - f6i; +f2i = f2i + f6r; + +f4r = f1r - f5r * w0r + f5i * w0r; +f4i = f1i - f5r * w0r - f5i * w0r; +f1r = f1r * Two - f4r; +f1i = f1i * Two - f4i; + +f6r = f3r + f7r * w0r + f7i * w0r; +f6i = f3i - f7r * w0r + f7i * w0r; +f3r = f3r * Two - f6r; +f3i = f3i * Two - f6i; + + /* store result */ +ioptr[0] = scale*f0r; +ioptr[1] = scale*f0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +ioptr[8] = scale*t0r; +ioptr[9] = scale*t0i; +ioptr[10] = scale*f4r; +ioptr[11] = scale*f4i; +ioptr[12] = scale*t1r; +ioptr[13] = scale*t1i; +ioptr[14] = scale*f6r; +ioptr[15] = scale*f6i; +} + +inline void ifrstage(float *ioptr, long M, float *Utbl); +inline void ifrstage(float *ioptr, long M, float *Utbl){ +/* Start RIFFT */ + +unsigned long pos; +unsigned long posi; +unsigned long diffUcnt; + +float *p0r, *p1r; +float *u0r, *u0i; + +float w0r, w0i; +float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + +pos = POW2(M-1); +posi = pos + 1; + +p0r = ioptr; +p1r = ioptr + pos/2; + +u0r = Utbl + POW2(M-3); + +w0r = *u0r, + +f0r = *(p0r); +f0i = *(p0r + 1); +f4r = *(p0r + pos); +f4i = *(p0r + posi); +f1r = *(p1r); +f1i = *(p1r + 1); +f5r = *(p1r + pos); +f5i = *(p1r + posi); + + t0r = f0r + f0i; + t0i = f0r - f0i; + t1r = f4r + f4r; + t1i = -f4i - f4i; + + f0r = f1r + f5r; + f0i = f1i - f5i; + f4r = f1r - f5r; + f4i = f1i + f5i; + + f1r = f0r - w0r * f4r - w0r * f4i; + f1i = f0i + w0r * f4r - w0r * f4i; + f5r = Two * f0r - f1r; + f5i = f1i - Two * f0i; + +*(p0r) = t0r; +*(p0r + 1) = t0i; +*(p0r + pos) = t1r; +*(p0r + posi) = t1i; +*(p1r) = f1r; +*(p1r + 1) = f1i; +*(p1r + pos) = f5r; +*(p1r + posi) = f5i; + +u0r = Utbl + 1; +u0i = Utbl + (POW2(M-2)-1); + +w0r = *u0r, +w0i = *u0i; + +p0r = (ioptr + 2); +p1r = (ioptr + (POW2(M-2)-1)*2); + + /* Butterflys */ + /* + f0 - t0 - f0 + f1 - t1 -w0- f1 + + f2 - t0 - f2 + f3 - t1 -iw0- f3 + */ + +for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){ + + f0r = *(p0r); + f0i = *(p0r + 1); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + f1r = *(p1r); + f1i = *(p1r + 1); + f4r = *(p0r + pos); + f4i = *(p0r + posi); + + t0r = f0r + f5r; + t0i = f0i - f5i; + t1r = f0r - f5r; + t1i = f0i + f5i; + + f0r = t0r - w0i * t1r - w0r * t1i; + f0i = t0i + w0r * t1r - w0i * t1i; + f5r = Two * t0r - f0r; + f5i = f0i - Two * t0i; + + t0r = f1r + f4r; + t0i = f1i - f4i; + t1r = f1r - f4r; + t1i = f1i + f4i; + + f1r = t0r - w0r * t1r - w0i * t1i; + f1i = t0i + w0i * t1r - w0r * t1i; + f4r = Two * t0r - f1r; + f4i = f1i - Two * t0i; + + *(p0r) = f0r; + *(p0r + 1) = f0i; + *(p1r + pos) = f5r; + *(p1r + posi) = f5i; + + w0r = *++u0r; + w0i = *--u0i; + + *(p1r) = f1r; + *(p1r + 1) = f1i; + *(p0r + pos) = f4r; + *(p0r + posi) = f4i; + + p0r += 2; + p1r -= 2; +}; +} + +void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts1 */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = real output data array */ + +float scale; +long StageCnt; +long NDiffU; + +scale = 1.0/POW2(M); +M=M-1; +switch (M){ +case -1: + break; +case 0: + for (;Rows>0;Rows--){ + rifft1pt(ioptr, scale); /* a 2 pt fft */ + ioptr += 2*POW2(M); + } +case 1: + for (;Rows>0;Rows--){ + rifft2pt(ioptr, scale); /* a 4 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 2: + for (;Rows>0;Rows--){ + rifft4pt(ioptr, scale); /* an 8 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 3: + for (;Rows>0;Rows--){ + rifft8pt(ioptr, scale); /* a 16 pt fft */ + ioptr += 2*POW2(M); + } + break; +default: + for (;Rows>0;Rows--){ + + ifrstage(ioptr, M+1, Utbl); + + scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ + + StageCnt = (M-1) / 3; // number of radix 8 stages + NDiffU = 2; // one radix 2 stage already complete + + if ( (M-1-(StageCnt * 3)) == 1 ){ + ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ + NDiffU *= 2; + } + + if ( (M-1-(StageCnt * 3)) == 2 ){ + ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ + NDiffU *= 4; + } + + if (M <= MCACHE){ + ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ + } + + else{ + ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ + } + + ioptr += 2*POW2(M); + } +} +} + diff --git a/ffts/src/fftlib.h b/ffts/src/fftlib.h new file mode 100644 index 0000000..86de4ac --- /dev/null +++ b/ffts/src/fftlib.h @@ -0,0 +1,76 @@ +#define MYRECIPLN2 1.442695040888963407359924681001892137426 // 1.0/log(2) + +/* some useful conversions between a number and its power of 2 */ +#define LOG2(a) (MYRECIPLN2*log(a)) // floating point logarithm base 2 +#define POW2(m) ((unsigned long) 1 << (m)) // integer power of 2 for m<32 + +/******************************************************************* +lower level fft stuff called by routines in fftext.c and fft2d.c +*******************************************************************/ + +void fftCosInit(long M, float *Utbl); +/* Compute Utbl, the cosine table for ffts */ +/* of size (pow(2,M)/4 +1) */ +/* INPUTS */ +/* M = log2 of fft size */ +/* OUTPUTS */ +/* *Utbl = cosine table */ + +void fftBRInit(long M, short *BRLow); +/* Compute BRLow, the bit reversed table for ffts */ +/* of size pow(2,M/2 -1) */ +/* INPUTS */ +/* M = log2 of fft size */ +/* OUTPUTS */ +/* *BRLow = bit reversed counter table */ + +void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ + + +void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts1 */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = real output data array */ diff --git a/ffts/src/files b/ffts/src/files new file mode 100644 index 0000000..2833fee --- /dev/null +++ b/ffts/src/files @@ -0,0 +1,22 @@ +tr "\r" "\n" < dxpose.c > tmp.c +mv tmp.c dxpose.c +tr "\r" "\n" < dxpose.h > tmp.c +mv tmp.c dxpose.h +tr "\r" "\n" < fft2d.c > tmp.c +mv tmp.c fft2d.c +tr "\r" "\n" < fft2d.h > tmp.c +mv tmp.c fft2d.h +tr "\r" "\n" < fftext.c > tmp.c +mv tmp.c fftext.c +tr "\r" "\n" < fftext.h > tmp.c +mv tmp.c fftext.h +tr "\r" "\n" < fftlib.c > tmp.c +mv tmp.c fftlib.c +tr "\r" "\n" < fftlib.h > tmp.c +mv tmp.c fftlib.h +tr "\r" "\n" < files > tmp.c +mv tmp.c n" < files +tr "\r" "\n" < matlib.c > tmp.c +mv tmp.c matlib.c +tr "\r" "\n" < matlib.h > tmp.c +mv tmp.c matlib.h diff --git a/ffts/src/matlib.c b/ffts/src/matlib.c new file mode 100644 index 0000000..4e8b682 --- /dev/null +++ b/ffts/src/matlib.c @@ -0,0 +1,297 @@ +/* a few routines from a vector/matrix library */ +#include "matlib.h" + +void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){ +/* not in-place matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +float *irow; /* pointer to input row start */ +float *ocol; /* pointer to output col start */ +float *idata; /* pointer to input data */ +float *odata; /* pointer to output data */ +long RowCnt; /* row counter */ +long ColCnt; /* col counter */ +float T0; /* data storage */ +float T1; /* data storage */ +float T2; /* data storage */ +float T3; /* data storage */ +float T4; /* data storage */ +float T5; /* data storage */ +float T6; /* data storage */ +float T7; /* data storage */ +const long inRsizd1 = iRsiz; +const long inRsizd2 = 2*iRsiz; +const long inRsizd3 = inRsizd2+iRsiz; +const long inRsizd4 = 4*iRsiz; +const long inRsizd5 = inRsizd3+inRsizd2; +const long inRsizd6 = inRsizd4+inRsizd2; +const long inRsizd7 = inRsizd4+inRsizd3; +const long inRsizd8 = 8*iRsiz; + +ocol = outdata; +irow = indata; +for (RowCnt=Nrows/8; RowCnt>0; RowCnt--){ + idata = irow; + odata = ocol; + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + T0 = *idata; + T1 = *(idata+inRsizd1); + T2 = *(idata+inRsizd2); + T3 = *(idata+inRsizd3); + T4 = *(idata+inRsizd4); + T5 = *(idata+inRsizd5); + T6 = *(idata+inRsizd6); + T7 = *(idata+inRsizd7); + *odata = T0; + *(odata+1) = T1; + *(odata+2) = T2; + *(odata+3) = T3; + *(odata+4) = T4; + *(odata+5) = T5; + *(odata+6) = T6; + *(odata+7) = T7; + idata++; + odata += oRsiz; + } + irow += inRsizd8; + ocol += 8; +} +if (Nrows%8 != 0){ + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + idata = irow++; + odata = ocol; + ocol += oRsiz; + for (RowCnt=Nrows%8; RowCnt>0; RowCnt--){ + T0 = *idata; + *odata++ = T0; + idata += iRsiz; + } + } +} +} + +void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){ +/* not in-place complex float matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +float *irow; /* pointer to input row start */ +float *ocol; /* pointer to output col start */ +float *idata; /* pointer to input data */ +float *odata; /* pointer to output data */ +long RowCnt; /* row counter */ +long ColCnt; /* col counter */ +float T0r; /* data storage */ +float T0i; /* data storage */ +float T1r; /* data storage */ +float T1i; /* data storage */ +float T2r; /* data storage */ +float T2i; /* data storage */ +float T3r; /* data storage */ +float T3i; /* data storage */ +const long inRsizd1 = 2*iRsiz; +const long inRsizd1i = 2*iRsiz + 1; +const long inRsizd2 = 4*iRsiz; +const long inRsizd2i = 4*iRsiz + 1; +const long inRsizd3 = inRsizd2+inRsizd1; +const long inRsizd3i = inRsizd2+inRsizd1 + 1; +const long inRsizd4 = 8*iRsiz; + +ocol = outdata; +irow = indata; +for (RowCnt=Nrows/4; RowCnt>0; RowCnt--){ + idata = irow; + odata = ocol; + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + T0r = *idata; + T0i = *(idata +1); + T1r = *(idata+inRsizd1); + T1i = *(idata+inRsizd1i); + T2r = *(idata+inRsizd2); + T2i = *(idata+inRsizd2i); + T3r = *(idata+inRsizd3); + T3i = *(idata+inRsizd3i); + *odata = T0r; + *(odata+1) = T0i; + *(odata+2) = T1r; + *(odata+3) = T1i; + *(odata+4) = T2r; + *(odata+5) = T2i; + *(odata+6) = T3r; + *(odata+7) = T3i; + idata+=2; + odata += 2*oRsiz; + } + irow += inRsizd4; + ocol += 8; +} +if (Nrows%4 != 0){ + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + idata = irow; + odata = ocol; + for (RowCnt=Nrows%4; RowCnt>0; RowCnt--){ + T0r = *idata; + T0i = *(idata+1); + *odata = T0r; + *(odata+1) = T0i; + odata+=2; + idata += 2*iRsiz; + } + irow+=2; + ocol += 2*oRsiz; + } +} +} + +void cvprod(float *a, float *b, float *out, long N){ +/* complex vector product, can be in-place */ +/* product of complex vector *a times complex vector *b */ +/* INPUTS */ +/* N vector length */ +/* *a complex vector length N complex numbers */ +/* *b complex vector length N complex numbers */ +/* OUTPUTS */ +/* *out complex vector length N */ + +long OutCnt; /* output counter */ +float A0R; /* A storage */ +float A0I; /* A storage */ +float A1R; /* A storage */ +float A1I; /* A storage */ +float A2R; /* A storage */ +float A2I; /* A storage */ +float A3R; /* A storage */ +float A3I; /* A storage */ +float B0R; /* B storage */ +float B0I; /* B storage */ +float B1R; /* B storage */ +float B1I; /* B storage */ +float B2R; /* B storage */ +float B2I; /* B storage */ +float B3R; /* B storage */ +float B3I; /* B storage */ +float T0R; /* TMP storage */ +float T0I; /* TMP storage */ +float T1R; /* TMP storage */ +float T1I; /* TMP storage */ +float T2R; /* TMP storage */ +float T2I; /* TMP storage */ +float T3R; /* TMP storage */ +float T3I; /* TMP storage */ + +if (N>=4){ + A0R = *a; + B0R = *b; + A0I = *(a +1); + B0I = *(b +1); + A1R = *(a +2); + B1R = *(b +2); + A1I = *(a +3); + B1I = *(b +3); + A2R = *(a +4); + B2R = *(b +4); + A2I = *(a +5); + B2I = *(b +5); + A3R = *(a +6); + B3R = *(b +6); + A3I = *(a +7); + B3I = *(b +7); + T0R = A0R * B0R; + T0I = (A0R * B0I); + T1R = A1R * B1R; + T1I = (A1R * B1I); + T2R = A2R * B2R; + T2I = (A2R * B2I); + T3R = A3R * B3R; + T3I = (A3R * B3I); + T0R -= (A0I * B0I); + T0I = A0I * B0R + T0I; + T1R -= (A1I * B1I); + T1I = A1I * B1R + T1I; + T2R -= (A2I * B2I); + T2I = A2I * B2R + T2I; + T3R -= (A3I * B3I); + T3I = A3I * B3R + T3I; + for (OutCnt=N/4-1; OutCnt > 0; OutCnt--){ + a += 8; + b += 8; + A0R = *a; + B0R = *b; + A0I = *(a +1); + B0I = *(b +1); + A1R = *(a +2); + B1R = *(b +2); + A1I = *(a +3); + B1I = *(b +3); + A2R = *(a +4); + B2R = *(b +4); + A2I = *(a +5); + B2I = *(b +5); + A3R = *(a +6); + B3R = *(b +6); + A3I = *(a +7); + B3I = *(b +7); + *out = T0R; + *(out +1) = T0I; + *(out +2) = T1R; + *(out +3) = T1I; + *(out +4) = T2R; + *(out +5) = T2I; + *(out +6) = T3R; + *(out +7) = T3I; + T0R = A0R * B0R; + T0I = (A0R * B0I); + T1R = A1R * B1R; + T1I = (A1R * B1I); + T2R = A2R * B2R; + T2I = (A2R * B2I); + T3R = A3R * B3R; + T3I = (A3R * B3I); + T0R -= (A0I * B0I); + T0I = A0I * B0R + T0I; + T1R -= (A1I * B1I); + T1I = A1I * B1R + T1I; + T2R -= (A2I * B2I); + T2I = A2I * B2R + T2I; + T3R -= (A3I * B3I); + T3I = A3I * B3R + T3I; + out += 8; + } + a += 8; + b += 8; + *out = T0R; + *(out +1) = T0I; + *(out +2) = T1R; + *(out +3) = T1I; + *(out +4) = T2R; + *(out +5) = T2I; + *(out +6) = T3R; + *(out +7) = T3I; + out += 8; +} +for (OutCnt=N%4; OutCnt > 0; OutCnt--){ + A0R = *a++; + B0R = *b++; + A0I = *a++; + B0I = *b++; + T0R = A0R * B0R; + T0I = (A0R * B0I); + T0R -= (A0I * B0I); + T0I = A0I * B0R + T0I; + *out++ = T0R; + *out++ = T0I; +} +} diff --git a/ffts/src/matlib.h b/ffts/src/matlib.h new file mode 100644 index 0000000..baf9e6e --- /dev/null +++ b/ffts/src/matlib.h @@ -0,0 +1,33 @@ +/* a few routines from a vector/matrix library */ + +void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols); +/* not in-place matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols); +/* not in-place complex matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +void cvprod(float *a, float *b, float *out, long N); +/* complex vector product, can be in-place */ +/* product of complex vector *a times complex vector *b */ +/* INPUTS */ +/* N vector length */ +/* *a complex vector length N complex numbers */ +/* *b complex vector length N complex numbers */ +/* OUTPUTS */ +/* *out complex vector length N */ |