/******************************************************************* lower level fft stuff including routines called in fftext.c and fft2d.c *******************************************************************/ #include "fftlib.h" #include #define MCACHE (11-(sizeof(float)/8)) // fft's with M bigger than this bust primary cache #ifdef WIN32 #define inline __inline #endif // some math constants to 40 decimal places #define MYPI 3.141592653589793238462643383279502884197 // pi #define MYROOT2 1.414213562373095048801688724209698078569 // sqrt(2) #define MYCOSPID8 0.9238795325112867561281831893967882868224 // cos(pi/8) #define MYSINPID8 0.3826834323650897717284599840303988667613 // sin(pi/8) /************************************************* routines to initialize tables used by fft routines **************************************************/ void fftCosInit(long M, float *Utbl){ /* Compute Utbl, the cosine table for ffts */ /* of size (pow(2,M)/4 +1) */ /* INPUTS */ /* M = log2 of fft size */ /* OUTPUTS */ /* *Utbl = cosine table */ unsigned long fftN = POW2(M); unsigned long i1; Utbl[0] = 1.0; for (i1 = 1; i1 < fftN/4; i1++) Utbl[i1] = cos( (2.0 * MYPI * i1) / fftN ); Utbl[fftN/4] = 0.0; } void fftBRInit(long M, short *BRLow){ /* Compute BRLow, the bit reversed table for ffts */ /* of size pow(2,M/2 -1) */ /* INPUTS */ /* M = log2 of fft size */ /* OUTPUTS */ /* *BRLow = bit reversed counter table */ long Mroot_1 = M / 2 - 1; long Nroot_1 = POW2(Mroot_1); long i1; long bitsum; long bitmask; long bit; for (i1 = 0; i1 < Nroot_1; i1++){ bitsum = 0; bitmask = 1; for (bit=1; bit <= Mroot_1; bitmask<<=1, bit++) if (i1 & bitmask) bitsum = bitsum + (Nroot_1 >> bit); BRLow[i1] = bitsum; }; } /************************************************ parts of ffts1 *************************************************/ inline void bitrevR2(float *ioptr, long M, short *BRLow); inline void bitrevR2(float *ioptr, long M, short *BRLow){ /*** bit reverse and first radix 2 stage of forward or inverse fft ***/ float f0r; float f0i; float f1r; float f1i; float f2r; float f2i; float f3r; float f3i; float f4r; float f4i; float f5r; float f5i; float f6r; float f6i; float f7r; float f7i; float t0r; float t0i; float t1r; float t1i; float *p0r; float *p1r; float *IOP; float *iolimit; long Colstart; long iCol; unsigned long posA; unsigned long posAi; unsigned long posB; unsigned long posBi; const unsigned long Nrems2 = POW2((M+3)/2); const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2; const unsigned long Nroot_1 = POW2(M/2-1)-1; const unsigned long ColstartShift = (M+1)/2 +1; posA = POW2(M); // 1/2 of POW2(M) complexes posAi = posA + 1; posB = posA + 2; posBi = posB + 1; iolimit = ioptr + Nrems2; for (; ioptr < iolimit; ioptr += POW2(M/2+1)){ for (Colstart = Nroot_1; Colstart >= 0; Colstart--){ iCol = Nroot_1; p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4; IOP = ioptr + (Colstart << ColstartShift); p1r = IOP + BRLow[iCol]*4; f0r = *(p0r); f0i = *(p0r+1); f1r = *(p0r+posA); f1i = *(p0r+posAi); for (; iCol > Colstart;){ f2r = *(p0r+2); f2i = *(p0r+(2+1)); f3r = *(p0r+posB); f3i = *(p0r+posBi); f4r = *(p1r); f4i = *(p1r+1); f5r = *(p1r+posA); f5i = *(p1r+posAi); f6r = *(p1r+2); f6i = *(p1r+(2+1)); f7r = *(p1r+posB); f7i = *(p1r+posBi); t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r + f3r; t1i = f2i + f3i; f3r = f2r - f3r; f3i = f2i - f3i; f0r = f4r + f5r; f0i = f4i + f5i; f5r = f4r - f5r; f5i = f4i - f5i; f2r = f6r + f7r; f2i = f6i + f7i; f7r = f6r - f7r; f7i = f6i - f7i; *(p1r) = t0r; *(p1r+1) = t0i; *(p1r+2) = f1r; *(p1r+(2+1)) = f1i; *(p1r+posA) = t1r; *(p1r+posAi) = t1i; *(p1r+posB) = f3r; *(p1r+posBi) = f3i; *(p0r) = f0r; *(p0r+1) = f0i; *(p0r+2) = f5r; *(p0r+(2+1)) = f5i; *(p0r+posA) = f2r; *(p0r+posAi) = f2i; *(p0r+posB) = f7r; *(p0r+posBi) = f7i; p0r -= Nrems2; f0r = *(p0r); f0i = *(p0r+1); f1r = *(p0r+posA); f1i = *(p0r+posAi); iCol -= 1; p1r = IOP + BRLow[iCol]*4; }; f2r = *(p0r+2); f2i = *(p0r+(2+1)); f3r = *(p0r+posB); f3i = *(p0r+posBi); t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r + f3r; t1i = f2i + f3i; f3r = f2r - f3r; f3i = f2i - f3i; *(p0r) = t0r; *(p0r+1) = t0i; *(p0r+2) = f1r; *(p0r+(2+1)) = f1i; *(p0r+posA) = t1r; *(p0r+posAi) = t1i; *(p0r+posB) = f3r; *(p0r+posBi) = f3i; }; }; } inline void fft2pt(float *ioptr); inline void fft2pt(float *ioptr){ /*** RADIX 2 fft ***/ float f0r, f0i, f1r, f1i; float t0r, t0i; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[2]; f1i = ioptr[3]; /* Butterflys */ /* f0 - - t0 f1 - 1 - f1 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; /* store result */ ioptr[0] = t0r; ioptr[1] = t0i; ioptr[2] = f1r; ioptr[3] = f1i; } inline void fft4pt(float *ioptr); inline void fft4pt(float *ioptr){ /*** RADIX 4 fft ***/ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float t0r, t0i, t1r, t1i; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[4]; f1i = ioptr[5]; f2r = ioptr[2]; f2i = ioptr[3]; f3r = ioptr[6]; f3i = ioptr[7]; /* Butterflys */ /* f0 - - t0 - - f0 f1 - 1 - f1 - - f1 f2 - - f2 - 1 - f2 f3 - 1 - t1 - -i - f3 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r - f3r; t1i = f2i - f3i; f2r = f2r + f3r; f2i = f2i + f3i; f0r = t0r + f2r; f0i = t0i + f2i; f2r = t0r - f2r; f2i = t0i - f2i; f3r = f1r - t1i; f3i = f1i + t1r; f1r = f1r + t1i; f1i = f1i - t1r; /* store result */ ioptr[0] = f0r; ioptr[1] = f0i; ioptr[2] = f1r; ioptr[3] = f1i; ioptr[4] = f2r; ioptr[5] = f2i; ioptr[6] = f3r; ioptr[7] = f3i; } inline void fft8pt(float *ioptr); inline void fft8pt(float *ioptr){ /*** RADIX 8 fft ***/ float w0r = 1.0/MYROOT2; /* cos(pi/4) */ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; float t0r, t0i, t1r, t1i; const float Two = 2.0; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[8]; f1i = ioptr[9]; f2r = ioptr[4]; f2i = ioptr[5]; f3r = ioptr[12]; f3i = ioptr[13]; f4r = ioptr[2]; f4i = ioptr[3]; f5r = ioptr[10]; f5i = ioptr[11]; f6r = ioptr[6]; f6i = ioptr[7]; f7r = ioptr[14]; f7i = ioptr[15]; /* Butterflys */ /* f0 - - t0 - - f0 - - f0 f1 - 1 - f1 - - f1 - - f1 f2 - - f2 - 1 - f2 - - f2 f3 - 1 - t1 - -i - f3 - - f3 f4 - - t0 - - f4 - 1 - t0 f5 - 1 - f5 - - f5 - w3 - f4 f6 - - f6 - 1 - f6 - -i - t1 f7 - 1 - t1 - -i - f7 - iw3- f6 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r - f3r; t1i = f2i - f3i; f2r = f2r + f3r; f2i = f2i + f3i; f0r = t0r + f2r; f0i = t0i + f2i; f2r = t0r - f2r; f2i = t0i - f2i; f3r = f1r - t1i; f3i = f1i + t1r; f1r = f1r + t1i; f1i = f1i - t1r; t0r = f4r + f5r; t0i = f4i + f5i; f5r = f4r - f5r; f5i = f4i - f5i; t1r = f6r - f7r; t1i = f6i - f7i; f6r = f6r + f7r; f6i = f6i + f7i; f4r = t0r + f6r; f4i = t0i + f6i; f6r = t0r - f6r; f6i = t0i - f6i; f7r = f5r - t1i; f7i = f5i + t1r; f5r = f5r + t1i; f5i = f5i - t1r; t0r = f0r - f4r; t0i = f0i - f4i; f0r = f0r + f4r; f0i = f0i + f4i; t1r = f2r - f6i; t1i = f2i + f6r; f2r = f2r + f6i; f2i = f2i - f6r; f4r = f1r - f5r * w0r - f5i * w0r; f4i = f1i + f5r * w0r - f5i * w0r; f1r = f1r * Two - f4r; f1i = f1i * Two - f4i; f6r = f3r + f7r * w0r - f7i * w0r; f6i = f3i + f7r * w0r + f7i * w0r; f3r = f3r * Two - f6r; f3i = f3i * Two - f6i; /* store result */ ioptr[0] = f0r; ioptr[1] = f0i; ioptr[2] = f1r; ioptr[3] = f1i; ioptr[4] = f2r; ioptr[5] = f2i; ioptr[6] = f3r; ioptr[7] = f3i; ioptr[8] = t0r; ioptr[9] = t0i; ioptr[10] = f4r; ioptr[11] = f4i; ioptr[12] = t1r; ioptr[13] = t1i; ioptr[14] = f6r; ioptr[15] = f6i; } inline void bfR2(float *ioptr, long M, long NDiffU); inline void bfR2(float *ioptr, long M, long NDiffU){ /*** 2nd radix 2 stage ***/ unsigned long pos; unsigned long posi; unsigned long pinc; unsigned long pnext; unsigned long NSameU; unsigned long SameUCnt; float *pstrt; float *p0r, *p1r, *p2r, *p3r; float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; const float Two = 2.0; pinc = NDiffU * 2; // 2 floats per complex pnext = pinc * 4; pos = 2; posi = pos+1; NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time pstrt = ioptr; p0r = pstrt; p1r = pstrt+pinc; p2r = p1r+pinc; p3r = p2r+pinc; /* Butterflys */ /* f0 - - f4 f1 - 1 - f5 f2 - - f6 f3 - 1 - f7 */ /* Butterflys */ /* f0 - - f4 f1 - 1 - f5 f2 - - f6 f3 - 1 - f7 */ for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){ f0r = *p0r; f1r = *p1r; f0i = *(p0r + 1); f1i = *(p1r + 1); f2r = *p2r; f3r = *p3r; f2i = *(p2r + 1); f3i = *(p3r + 1); f4r = f0r + f1r; f4i = f0i + f1i; f5r = f0r - f1r; f5i = f0i - f1i; f6r = f2r + f3r; f6i = f2i + f3i; f7r = f2r - f3r; f7i = f2i - f3i; *p0r = f4r; *(p0r + 1) = f4i; *p1r = f5r; *(p1r + 1) = f5i; *p2r = f6r; *(p2r + 1) = f6i; *p3r = f7r; *(p3r + 1) = f7i; f0r = *(p0r + pos); f1i = *(p1r + posi); f0i = *(p0r + posi); f1r = *(p1r + pos); f2r = *(p2r + pos); f3i = *(p3r + posi); f2i = *(p2r + posi); f3r = *(p3r + pos); f4r = f0r + f1i; f4i = f0i - f1r; f5r = f0r - f1i; f5i = f0i + f1r; f6r = f2r + f3i; f6i = f2i - f3r; f7r = f2r - f3i; f7i = f2i + f3r; *(p0r + pos) = f4r; *(p0r + posi) = f4i; *(p1r + pos) = f5r; *(p1r + posi) = f5i; *(p2r + pos) = f6r; *(p2r + posi) = f6i; *(p3r + pos) = f7r; *(p3r + posi) = f7i; p0r += pnext; p1r += pnext; p2r += pnext; p3r += pnext; } } inline void bfR4(float *ioptr, long M, long NDiffU); inline void bfR4(float *ioptr, long M, long NDiffU){ /*** 1 radix 4 stage ***/ unsigned long pos; unsigned long posi; unsigned long pinc; unsigned long pnext; unsigned long pnexti; unsigned long NSameU; unsigned long SameUCnt; float *pstrt; float *p0r, *p1r, *p2r, *p3r; float w1r = 1.0/MYROOT2; /* cos(pi/4) */ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; float t1r, t1i; const float Two = 2.0; pinc = NDiffU * 2; // 2 floats per complex pnext = pinc * 4; pnexti = pnext + 1; pos = 2; posi = pos+1; NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly pstrt = ioptr; p0r = pstrt; p1r = pstrt+pinc; p2r = p1r+pinc; p3r = p2r+pinc; /* Butterflys */ /* f0 - - f0 - - f4 f1 - 1 - f5 - - f5 f2 - - f6 - 1 - f6 f3 - 1 - f3 - -i - f7 */ /* Butterflys */ /* f0 - - f4 - - f4 f1 - -i - t1 - - f5 f2 - - f2 - w1 - f6 f3 - -i - f7 - iw1- f7 */ f0r = *p0r; f1r = *p1r; f2r = *p2r; f3r = *p3r; f0i = *(p0r + 1); f1i = *(p1r + 1); f2i = *(p2r + 1); f3i = *(p3r + 1); f5r = f0r - f1r; f5i = f0i - f1i; f0r = f0r + f1r; f0i = f0i + f1i; f6r = f2r + f3r; f6i = f2i + f3i; f3r = f2r - f3r; f3i = f2i - f3i; for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ f7r = f5r - f3i; f7i = f5i + f3r; f5r = f5r + f3i; f5i = f5i - f3r; f4r = f0r + f6r; f4i = f0i + f6i; f6r = f0r - f6r; f6i = f0i - f6i; f2r = *(p2r + pos); f2i = *(p2r + posi); f1r = *(p1r + pos); f1i = *(p1r + posi); f3i = *(p3r + posi); f0r = *(p0r + pos); f3r = *(p3r + pos); f0i = *(p0r + posi); *p3r = f7r; *p0r = f4r; *(p3r + 1) = f7i; *(p0r + 1) = f4i; *p1r = f5r; *p2r = f6r; *(p1r + 1) = f5i; *(p2r + 1) = f6i; f7r = f2r - f3i; f7i = f2i + f3r; f2r = f2r + f3i; f2i = f2i - f3r; f4r = f0r + f1i; f4i = f0i - f1r; t1r = f0r - f1i; t1i = f0i + f1r; f5r = t1r - f7r * w1r + f7i * w1r; f5i = t1i - f7r * w1r - f7i * w1r; f7r = t1r * Two - f5r; f7i = t1i * Two - f5i; f6r = f4r - f2r * w1r - f2i * w1r; f6i = f4i + f2r * w1r - f2i * w1r; f4r = f4r * Two - f6r; f4i = f4i * Two - f6i; f3r = *(p3r + pnext); f0r = *(p0r + pnext); f3i = *(p3r + pnexti); f0i = *(p0r + pnexti); f2r = *(p2r + pnext); f2i = *(p2r + pnexti); f1r = *(p1r + pnext); f1i = *(p1r + pnexti); *(p2r + pos) = f6r; *(p1r + pos) = f5r; *(p2r + posi) = f6i; *(p1r + posi) = f5i; *(p3r + pos) = f7r; *(p0r + pos) = f4r; *(p3r + posi) = f7i; *(p0r + posi) = f4i; f6r = f2r + f3r; f6i = f2i + f3i; f3r = f2r - f3r; f3i = f2i - f3i; f5r = f0r - f1r; f5i = f0i - f1i; f0r = f0r + f1r; f0i = f0i + f1i; p3r += pnext; p0r += pnext; p1r += pnext; p2r += pnext; } f7r = f5r - f3i; f7i = f5i + f3r; f5r = f5r + f3i; f5i = f5i - f3r; f4r = f0r + f6r; f4i = f0i + f6i; f6r = f0r - f6r; f6i = f0i - f6i; f2r = *(p2r + pos); f2i = *(p2r + posi); f1r = *(p1r + pos); f1i = *(p1r + posi); f3i = *(p3r + posi); f0r = *(p0r + pos); f3r = *(p3r + pos); f0i = *(p0r + posi); *p3r = f7r; *p0r = f4r; *(p3r + 1) = f7i; *(p0r + 1) = f4i; *p1r = f5r; *p2r = f6r; *(p1r + 1) = f5i; *(p2r + 1) = f6i; f7r = f2r - f3i; f7i = f2i + f3r; f2r = f2r + f3i; f2i = f2i - f3r; f4r = f0r + f1i; f4i = f0i - f1r; t1r = f0r - f1i; t1i = f0i + f1r; f5r = t1r - f7r * w1r + f7i * w1r; f5i = t1i - f7r * w1r - f7i * w1r; f7r = t1r * Two - f5r; f7i = t1i * Two - f5i; f6r = f4r - f2r * w1r - f2i * w1r; f6i = f4i + f2r * w1r - f2i * w1r; f4r = f4r * Two - f6r; f4i = f4i * Two - f6i; *(p2r + pos) = f6r; *(p1r + pos) = f5r; *(p2r + posi) = f6i; *(p1r + posi) = f5i; *(p3r + pos) = f7r; *(p0r + pos) = f4r; *(p3r + posi) = f7i; *(p0r + posi) = f4i; } inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ /*** RADIX 8 Stages ***/ unsigned long pos; unsigned long posi; unsigned long pinc; unsigned long pnext; unsigned long NSameU; unsigned long Uinc; unsigned long Uinc2; unsigned long Uinc4; unsigned long DiffUCnt; unsigned long SameUCnt; unsigned long U2toU3; float *pstrt; float *p0r, *p1r, *p2r, *p3r; float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i; float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; float t0r, t0i, t1r, t1i; const float Two = 2.0; pinc = NDiffU * 2; // 2 floats per complex pnext = pinc * 8; pos = pinc * 4; posi = pos + 1; NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly Uinc = NSameU * Ustride; Uinc2 = Uinc * 2; Uinc4 = Uinc * 4; U2toU3 = (POW2(M) / 8)*Ustride; for (; StageCnt > 0 ; StageCnt--){ u0r = &Utbl[0]; u0i = &Utbl[POW2(M-2)*Ustride]; u1r = u0r; u1i = u0i; u2r = u0r; u2i = u0i; w0r = *u0r; w0i = *u0i; w1r = *u1r; w1i = *u1i; w2r = *u2r; w2i = *u2i; w3r = *(u2r+U2toU3); w3i = *(u2i-U2toU3); pstrt = ioptr; p0r = pstrt; p1r = pstrt+pinc; p2r = p1r+pinc; p3r = p2r+pinc; /* Butterflys */ /* f0 - - t0 - - f0 - - f0 f1 - w0- f1 - - f1 - - f1 f2 - - f2 - w1- f2 - - f4 f3 - w0- t1 - iw1- f3 - - f5 f4 - - t0 - - f4 - w2- t0 f5 - w0- f5 - - f5 - w3- t1 f6 - - f6 - w1- f6 - iw2- f6 f7 - w0- t1 - iw1- f7 - iw3- f7 */ for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){ f0r = *p0r; f0i = *(p0r + 1); f1r = *p1r; f1i = *(p1r + 1); for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ f2r = *p2r; f2i = *(p2r + 1); f3r = *p3r; f3i = *(p3r + 1); t0r = f0r + f1r * w0r + f1i * w0i; t0i = f0i - f1r * w0i + f1i * w0r; f1r = f0r * Two - t0r; f1i = f0i * Two - t0i; f4r = *(p0r + pos); f4i = *(p0r + posi); f5r = *(p1r + pos); f5i = *(p1r + posi); f6r = *(p2r + pos); f6i = *(p2r + posi); f7r = *(p3r + pos); f7i = *(p3r + posi); t1r = f2r - f3r * w0r - f3i * w0i; t1i = f2i + f3r * w0i - f3i * w0r; f2r = f2r * Two - t1r; f2i = f2i * Two - t1i; f0r = t0r + f2r * w1r + f2i * w1i; f0i = t0i - f2r * w1i + f2i * w1r; f2r = t0r * Two - f0r; f2i = t0i * Two - f0i; f3r = f1r + t1r * w1i - t1i * w1r; f3i = f1i + t1r * w1r + t1i * w1i; f1r = f1r * Two - f3r; f1i = f1i * Two - f3i; t0r = f4r + f5r * w0r + f5i * w0i; t0i = f4i - f5r * w0i + f5i * w0r; f5r = f4r * Two - t0r; f5i = f4i * Two - t0i; t1r = f6r - f7r * w0r - f7i * w0i; t1i = f6i + f7r * w0i - f7i * w0r; f6r = f6r * Two - t1r; f6i = f6i * Two - t1i; f4r = t0r + f6r * w1r + f6i * w1i; f4i = t0i - f6r * w1i + f6i * w1r; f6r = t0r * Two - f4r; f6i = t0i * Two - f4i; f7r = f5r + t1r * w1i - t1i * w1r; f7i = f5i + t1r * w1r + t1i * w1i; f5r = f5r * Two - f7r; f5i = f5i * Two - f7i; t0r = f0r - f4r * w2r - f4i * w2i; t0i = f0i + f4r * w2i - f4i * w2r; f0r = f0r * Two - t0r; f0i = f0i * Two - t0i; t1r = f1r - f5r * w3r - f5i * w3i; t1i = f1i + f5r * w3i - f5i * w3r; f1r = f1r * Two - t1r; f1i = f1i * Two - t1i; *(p0r + pos) = t0r; *(p1r + pos) = t1r; *(p0r + posi) = t0i; *(p1r + posi) = t1i; *p0r = f0r; *p1r = f1r; *(p0r + 1) = f0i; *(p1r + 1) = f1i; p0r += pnext; f0r = *p0r; f0i = *(p0r + 1); p1r += pnext; f1r = *p1r; f1i = *(p1r + 1); f4r = f2r - f6r * w2i + f6i * w2r; f4i = f2i - f6r * w2r - f6i * w2i; f6r = f2r * Two - f4r; f6i = f2i * Two - f4i; f5r = f3r - f7r * w3i + f7i * w3r; f5i = f3i - f7r * w3r - f7i * w3i; f7r = f3r * Two - f5r; f7i = f3i * Two - f5i; *p2r = f4r; *p3r = f5r; *(p2r + 1) = f4i; *(p3r + 1) = f5i; *(p2r + pos) = f6r; *(p3r + pos) = f7r; *(p2r + posi) = f6i; *(p3r + posi) = f7i; p2r += pnext; p3r += pnext; } f2r = *p2r; f2i = *(p2r + 1); f3r = *p3r; f3i = *(p3r + 1); t0r = f0r + f1r * w0r + f1i * w0i; t0i = f0i - f1r * w0i + f1i * w0r; f1r = f0r * Two - t0r; f1i = f0i * Two - t0i; f4r = *(p0r + pos); f4i = *(p0r + posi); f5r = *(p1r + pos); f5i = *(p1r + posi); f6r = *(p2r + pos); f6i = *(p2r + posi); f7r = *(p3r + pos); f7i = *(p3r + posi); t1r = f2r - f3r * w0r - f3i * w0i; t1i = f2i + f3r * w0i - f3i * w0r; f2r = f2r * Two - t1r; f2i = f2i * Two - t1i; f0r = t0r + f2r * w1r + f2i * w1i; f0i = t0i - f2r * w1i + f2i * w1r; f2r = t0r * Two - f0r; f2i = t0i * Two - f0i; f3r = f1r + t1r * w1i - t1i * w1r; f3i = f1i + t1r * w1r + t1i * w1i; f1r = f1r * Two - f3r; f1i = f1i * Two - f3i; if (DiffUCnt == NDiffU/2) Uinc4 = -Uinc4; u0r += Uinc4; u0i -= Uinc4; u1r += Uinc2; u1i -= Uinc2; u2r += Uinc; u2i -= Uinc; pstrt += 2; t0r = f4r + f5r * w0r + f5i * w0i; t0i = f4i - f5r * w0i + f5i * w0r; f5r = f4r * Two - t0r; f5i = f4i * Two - t0i; t1r = f6r - f7r * w0r - f7i * w0i; t1i = f6i + f7r * w0i - f7i * w0r; f6r = f6r * Two - t1r; f6i = f6i * Two - t1i; f4r = t0r + f6r * w1r + f6i * w1i; f4i = t0i - f6r * w1i + f6i * w1r; f6r = t0r * Two - f4r; f6i = t0i * Two - f4i; f7r = f5r + t1r * w1i - t1i * w1r; f7i = f5i + t1r * w1r + t1i * w1i; f5r = f5r * Two - f7r; f5i = f5i * Two - f7i; w0r = *u0r; w0i = *u0i; w1r = *u1r; w1i = *u1i; if (DiffUCnt <= NDiffU/2) w0r = -w0r; t0r = f0r - f4r * w2r - f4i * w2i; t0i = f0i + f4r * w2i - f4i * w2r; f0r = f0r * Two - t0r; f0i = f0i * Two - t0i; f4r = f2r - f6r * w2i + f6i * w2r; f4i = f2i - f6r * w2r - f6i * w2i; f6r = f2r * Two - f4r; f6i = f2i * Two - f4i; *(p0r + pos) = t0r; *p2r = f4r; *(p0r + posi) = t0i; *(p2r + 1) = f4i; w2r = *u2r; w2i = *u2i; *p0r = f0r; *(p2r + pos) = f6r; *(p0r + 1) = f0i; *(p2r + posi) = f6i; p0r = pstrt; p2r = pstrt + pinc + pinc; t1r = f1r - f5r * w3r - f5i * w3i; t1i = f1i + f5r * w3i - f5i * w3r; f1r = f1r * Two - t1r; f1i = f1i * Two - t1i; f5r = f3r - f7r * w3i + f7i * w3r; f5i = f3i - f7r * w3r - f7i * w3i; f7r = f3r * Two - f5r; f7i = f3i * Two - f5i; *(p1r + pos) = t1r; *p3r = f5r; *(p1r + posi) = t1i; *(p3r + 1) = f5i; w3r = *(u2r+U2toU3); w3i = *(u2i-U2toU3); *p1r = f1r; *(p3r + pos) = f7r; *(p1r + 1) = f1i; *(p3r + posi) = f7i; p1r = pstrt + pinc; p3r = p2r + pinc; } NSameU /= 8; Uinc /= 8; Uinc2 /= 8; Uinc4 = Uinc * 4; NDiffU *= 8; pinc *= 8; pnext *= 8; pos *= 8; posi = pos + 1; } } void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ /* recursive bfstages calls to maximize on chip cache efficiency */ long i1; if (M <= MCACHE) // fits on chip ? bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */ else{ for (i1=0; i1<8; i1++){ fftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */ } bfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */ } } void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ /* Compute in-place complex fft on the rows of the input array */ /* INPUTS */ /* *ioptr = input data array */ /* M = log2 of fft size (ex M=10 for 1024 point fft) */ /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ /* *Utbl = cosine table */ /* *BRLow = bit reversed counter table */ /* OUTPUTS */ /* *ioptr = output data array */ long StageCnt; long NDiffU; switch (M){ case 0: break; case 1: for (;Rows>0;Rows--){ fft2pt(ioptr); /* a 2 pt fft */ ioptr += 2*POW2(M); } break; case 2: for (;Rows>0;Rows--){ fft4pt(ioptr); /* a 4 pt fft */ ioptr += 2*POW2(M); } break; case 3: for (;Rows>0;Rows--){ fft8pt(ioptr); /* an 8 pt fft */ ioptr += 2*POW2(M); } break; default: for (;Rows>0;Rows--){ bitrevR2(ioptr, M, BRLow); /* bit reverse and first radix 2 stage */ StageCnt = (M-1) / 3; // number of radix 8 stages NDiffU = 2; // one radix 2 stage already complete if ( (M-1-(StageCnt * 3)) == 1 ){ bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ NDiffU *= 2; } if ( (M-1-(StageCnt * 3)) == 2 ){ bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ NDiffU *= 4; } if (M <= MCACHE) bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ else{ fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ } ioptr += 2*POW2(M); } } } /************************************************ parts of iffts1 *************************************************/ inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale); inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale){ /*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/ float f0r; float f0i; float f1r; float f1i; float f2r; float f2i; float f3r; float f3i; float f4r; float f4i; float f5r; float f5i; float f6r; float f6i; float f7r; float f7i; float t0r; float t0i; float t1r; float t1i; float *p0r; float *p1r; float *IOP; float *iolimit; long Colstart; long iCol; unsigned long posA; unsigned long posAi; unsigned long posB; unsigned long posBi; const unsigned long Nrems2 = POW2((M+3)/2); const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2; const unsigned long Nroot_1 = POW2(M/2-1)-1; const unsigned long ColstartShift = (M+1)/2 +1; posA = POW2(M); // 1/2 of POW2(M) complexes posAi = posA + 1; posB = posA + 2; posBi = posB + 1; iolimit = ioptr + Nrems2; for (; ioptr < iolimit; ioptr += POW2(M/2+1)){ for (Colstart = Nroot_1; Colstart >= 0; Colstart--){ iCol = Nroot_1; p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4; IOP = ioptr + (Colstart << ColstartShift); p1r = IOP + BRLow[iCol]*4; f0r = *(p0r); f0i = *(p0r+1); f1r = *(p0r+posA); f1i = *(p0r+posAi); for (; iCol > Colstart;){ f2r = *(p0r+2); f2i = *(p0r+(2+1)); f3r = *(p0r+posB); f3i = *(p0r+posBi); f4r = *(p1r); f4i = *(p1r+1); f5r = *(p1r+posA); f5i = *(p1r+posAi); f6r = *(p1r+2); f6i = *(p1r+(2+1)); f7r = *(p1r+posB); f7i = *(p1r+posBi); t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r + f3r; t1i = f2i + f3i; f3r = f2r - f3r; f3i = f2i - f3i; f0r = f4r + f5r; f0i = f4i + f5i; f5r = f4r - f5r; f5i = f4i - f5i; f2r = f6r + f7r; f2i = f6i + f7i; f7r = f6r - f7r; f7i = f6i - f7i; *(p1r) = scale*t0r; *(p1r+1) = scale*t0i; *(p1r+2) = scale*f1r; *(p1r+(2+1)) = scale*f1i; *(p1r+posA) = scale*t1r; *(p1r+posAi) = scale*t1i; *(p1r+posB) = scale*f3r; *(p1r+posBi) = scale*f3i; *(p0r) = scale*f0r; *(p0r+1) = scale*f0i; *(p0r+2) = scale*f5r; *(p0r+(2+1)) = scale*f5i; *(p0r+posA) = scale*f2r; *(p0r+posAi) = scale*f2i; *(p0r+posB) = scale*f7r; *(p0r+posBi) = scale*f7i; p0r -= Nrems2; f0r = *(p0r); f0i = *(p0r+1); f1r = *(p0r+posA); f1i = *(p0r+posAi); iCol -= 1; p1r = IOP + BRLow[iCol]*4; }; f2r = *(p0r+2); f2i = *(p0r+(2+1)); f3r = *(p0r+posB); f3i = *(p0r+posBi); t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r + f3r; t1i = f2i + f3i; f3r = f2r - f3r; f3i = f2i - f3i; *(p0r) = scale*t0r; *(p0r+1) = scale*t0i; *(p0r+2) = scale*f1r; *(p0r+(2+1)) = scale*f1i; *(p0r+posA) = scale*t1r; *(p0r+posAi) = scale*t1i; *(p0r+posB) = scale*f3r; *(p0r+posBi) = scale*f3i; }; }; } inline void ifft2pt(float *ioptr, float scale); inline void ifft2pt(float *ioptr, float scale){ /*** RADIX 2 ifft ***/ float f0r, f0i, f1r, f1i; float t0r, t0i; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[2]; f1i = ioptr[3]; /* Butterflys */ /* f0 - - t0 f1 - 1 - f1 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; /* store result */ ioptr[0] = scale*t0r; ioptr[1] = scale*t0i; ioptr[2] = scale*f1r; ioptr[3] = scale*f1i; } inline void ifft4pt(float *ioptr, float scale); inline void ifft4pt(float *ioptr, float scale){ /*** RADIX 4 ifft ***/ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float t0r, t0i, t1r, t1i; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[4]; f1i = ioptr[5]; f2r = ioptr[2]; f2i = ioptr[3]; f3r = ioptr[6]; f3i = ioptr[7]; /* Butterflys */ /* f0 - - t0 - - f0 f1 - 1 - f1 - - f1 f2 - - f2 - 1 - f2 f3 - 1 - t1 - i - f3 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r - f3r; t1i = f2i - f3i; f2r = f2r + f3r; f2i = f2i + f3i; f0r = t0r + f2r; f0i = t0i + f2i; f2r = t0r - f2r; f2i = t0i - f2i; f3r = f1r + t1i; f3i = f1i - t1r; f1r = f1r - t1i; f1i = f1i + t1r; /* store result */ ioptr[0] = scale*f0r; ioptr[1] = scale*f0i; ioptr[2] = scale*f1r; ioptr[3] = scale*f1i; ioptr[4] = scale*f2r; ioptr[5] = scale*f2i; ioptr[6] = scale*f3r; ioptr[7] = scale*f3i; } inline void ifft8pt(float *ioptr, float scale); inline void ifft8pt(float *ioptr, float scale){ /*** RADIX 8 ifft ***/ float w0r = 1.0/MYROOT2; /* cos(pi/4) */ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; float t0r, t0i, t1r, t1i; const float Two = 2.0; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[8]; f1i = ioptr[9]; f2r = ioptr[4]; f2i = ioptr[5]; f3r = ioptr[12]; f3i = ioptr[13]; f4r = ioptr[2]; f4i = ioptr[3]; f5r = ioptr[10]; f5i = ioptr[11]; f6r = ioptr[6]; f6i = ioptr[7]; f7r = ioptr[14]; f7i = ioptr[15]; /* Butterflys */ /* f0 - - t0 - - f0 - - f0 f1 - 1 - f1 - - f1 - - f1 f2 - - f2 - 1 - f2 - - f2 f3 - 1 - t1 - i - f3 - - f3 f4 - - t0 - - f4 - 1 - t0 f5 - 1 - f5 - - f5 - w3 - f4 f6 - - f6 - 1 - f6 - i - t1 f7 - 1 - t1 - i - f7 - iw3- f6 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r - f3r; t1i = f2i - f3i; f2r = f2r + f3r; f2i = f2i + f3i; f0r = t0r + f2r; f0i = t0i + f2i; f2r = t0r - f2r; f2i = t0i - f2i; f3r = f1r + t1i; f3i = f1i - t1r; f1r = f1r - t1i; f1i = f1i + t1r; t0r = f4r + f5r; t0i = f4i + f5i; f5r = f4r - f5r; f5i = f4i - f5i; t1r = f6r - f7r; t1i = f6i - f7i; f6r = f6r + f7r; f6i = f6i + f7i; f4r = t0r + f6r; f4i = t0i + f6i; f6r = t0r - f6r; f6i = t0i - f6i; f7r = f5r + t1i; f7i = f5i - t1r; f5r = f5r - t1i; f5i = f5i + t1r; t0r = f0r - f4r; t0i = f0i - f4i; f0r = f0r + f4r; f0i = f0i + f4i; t1r = f2r + f6i; t1i = f2i - f6r; f2r = f2r - f6i; f2i = f2i + f6r; f4r = f1r - f5r * w0r + f5i * w0r; f4i = f1i - f5r * w0r - f5i * w0r; f1r = f1r * Two - f4r; f1i = f1i * Two - f4i; f6r = f3r + f7r * w0r + f7i * w0r; f6i = f3i - f7r * w0r + f7i * w0r; f3r = f3r * Two - f6r; f3i = f3i * Two - f6i; /* store result */ ioptr[0] = scale*f0r; ioptr[1] = scale*f0i; ioptr[2] = scale*f1r; ioptr[3] = scale*f1i; ioptr[4] = scale*f2r; ioptr[5] = scale*f2i; ioptr[6] = scale*f3r; ioptr[7] = scale*f3i; ioptr[8] = scale*t0r; ioptr[9] = scale*t0i; ioptr[10] = scale*f4r; ioptr[11] = scale*f4i; ioptr[12] = scale*t1r; ioptr[13] = scale*t1i; ioptr[14] = scale*f6r; ioptr[15] = scale*f6i; } inline void ibfR2(float *ioptr, long M, long NDiffU); inline void ibfR2(float *ioptr, long M, long NDiffU){ /*** 2nd radix 2 stage ***/ unsigned long pos; unsigned long posi; unsigned long pinc; unsigned long pnext; unsigned long NSameU; unsigned long SameUCnt; float *pstrt; float *p0r, *p1r, *p2r, *p3r; float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; const float Two = 2.0; pinc = NDiffU * 2; // 2 floats per complex pnext = pinc * 4; pos = 2; posi = pos+1; NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time pstrt = ioptr; p0r = pstrt; p1r = pstrt+pinc; p2r = p1r+pinc; p3r = p2r+pinc; /* Butterflys */ /* f0 - - f4 f1 - 1 - f5 f2 - - f6 f3 - 1 - f7 */ /* Butterflys */ /* f0 - - f4 f1 - 1 - f5 f2 - - f6 f3 - 1 - f7 */ for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){ f0r = *p0r; f1r = *p1r; f0i = *(p0r + 1); f1i = *(p1r + 1); f2r = *p2r; f3r = *p3r; f2i = *(p2r + 1); f3i = *(p3r + 1); f4r = f0r + f1r; f4i = f0i + f1i; f5r = f0r - f1r; f5i = f0i - f1i; f6r = f2r + f3r; f6i = f2i + f3i; f7r = f2r - f3r; f7i = f2i - f3i; *p0r = f4r; *(p0r + 1) = f4i; *p1r = f5r; *(p1r + 1) = f5i; *p2r = f6r; *(p2r + 1) = f6i; *p3r = f7r; *(p3r + 1) = f7i; f0r = *(p0r + pos); f1i = *(p1r + posi); f0i = *(p0r + posi); f1r = *(p1r + pos); f2r = *(p2r + pos); f3i = *(p3r + posi); f2i = *(p2r + posi); f3r = *(p3r + pos); f4r = f0r - f1i; f4i = f0i + f1r; f5r = f0r + f1i; f5i = f0i - f1r; f6r = f2r - f3i; f6i = f2i + f3r; f7r = f2r + f3i; f7i = f2i - f3r; *(p0r + pos) = f4r; *(p0r + posi) = f4i; *(p1r + pos) = f5r; *(p1r + posi) = f5i; *(p2r + pos) = f6r; *(p2r + posi) = f6i; *(p3r + pos) = f7r; *(p3r + posi) = f7i; p0r += pnext; p1r += pnext; p2r += pnext; p3r += pnext; } } inline void ibfR4(float *ioptr, long M, long NDiffU); inline void ibfR4(float *ioptr, long M, long NDiffU){ /*** 1 radix 4 stage ***/ unsigned long pos; unsigned long posi; unsigned long pinc; unsigned long pnext; unsigned long pnexti; unsigned long NSameU; unsigned long SameUCnt; float *pstrt; float *p0r, *p1r, *p2r, *p3r; float w1r = 1.0/MYROOT2; /* cos(pi/4) */ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; float t1r, t1i; const float Two = 2.0; pinc = NDiffU * 2; // 2 floats per complex pnext = pinc * 4; pnexti = pnext + 1; pos = 2; posi = pos+1; NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly pstrt = ioptr; p0r = pstrt; p1r = pstrt+pinc; p2r = p1r+pinc; p3r = p2r+pinc; /* Butterflys */ /* f0 - - f0 - - f4 f1 - 1 - f5 - - f5 f2 - - f6 - 1 - f6 f3 - 1 - f3 - -i - f7 */ /* Butterflys */ /* f0 - - f4 - - f4 f1 - -i - t1 - - f5 f2 - - f2 - w1 - f6 f3 - -i - f7 - iw1- f7 */ f0r = *p0r; f1r = *p1r; f2r = *p2r; f3r = *p3r; f0i = *(p0r + 1); f1i = *(p1r + 1); f2i = *(p2r + 1); f3i = *(p3r + 1); f5r = f0r - f1r; f5i = f0i - f1i; f0r = f0r + f1r; f0i = f0i + f1i; f6r = f2r + f3r; f6i = f2i + f3i; f3r = f2r - f3r; f3i = f2i - f3i; for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ f7r = f5r + f3i; f7i = f5i - f3r; f5r = f5r - f3i; f5i = f5i + f3r; f4r = f0r + f6r; f4i = f0i + f6i; f6r = f0r - f6r; f6i = f0i - f6i; f2r = *(p2r + pos); f2i = *(p2r + posi); f1r = *(p1r + pos); f1i = *(p1r + posi); f3i = *(p3r + posi); f0r = *(p0r + pos); f3r = *(p3r + pos); f0i = *(p0r + posi); *p3r = f7r; *p0r = f4r; *(p3r + 1) = f7i; *(p0r + 1) = f4i; *p1r = f5r; *p2r = f6r; *(p1r + 1) = f5i; *(p2r + 1) = f6i; f7r = f2r + f3i; f7i = f2i - f3r; f2r = f2r - f3i; f2i = f2i + f3r; f4r = f0r - f1i; f4i = f0i + f1r; t1r = f0r + f1i; t1i = f0i - f1r; f5r = t1r - f7r * w1r - f7i * w1r; f5i = t1i + f7r * w1r - f7i * w1r; f7r = t1r * Two - f5r; f7i = t1i * Two - f5i; f6r = f4r - f2r * w1r + f2i * w1r; f6i = f4i - f2r * w1r - f2i * w1r; f4r = f4r * Two - f6r; f4i = f4i * Two - f6i; f3r = *(p3r + pnext); f0r = *(p0r + pnext); f3i = *(p3r + pnexti); f0i = *(p0r + pnexti); f2r = *(p2r + pnext); f2i = *(p2r + pnexti); f1r = *(p1r + pnext); f1i = *(p1r + pnexti); *(p2r + pos) = f6r; *(p1r + pos) = f5r; *(p2r + posi) = f6i; *(p1r + posi) = f5i; *(p3r + pos) = f7r; *(p0r + pos) = f4r; *(p3r + posi) = f7i; *(p0r + posi) = f4i; f6r = f2r + f3r; f6i = f2i + f3i; f3r = f2r - f3r; f3i = f2i - f3i; f5r = f0r - f1r; f5i = f0i - f1i; f0r = f0r + f1r; f0i = f0i + f1i; p3r += pnext; p0r += pnext; p1r += pnext; p2r += pnext; } f7r = f5r + f3i; f7i = f5i - f3r; f5r = f5r - f3i; f5i = f5i + f3r; f4r = f0r + f6r; f4i = f0i + f6i; f6r = f0r - f6r; f6i = f0i - f6i; f2r = *(p2r + pos); f2i = *(p2r + posi); f1r = *(p1r + pos); f1i = *(p1r + posi); f3i = *(p3r + posi); f0r = *(p0r + pos); f3r = *(p3r + pos); f0i = *(p0r + posi); *p3r = f7r; *p0r = f4r; *(p3r + 1) = f7i; *(p0r + 1) = f4i; *p1r = f5r; *p2r = f6r; *(p1r + 1) = f5i; *(p2r + 1) = f6i; f7r = f2r + f3i; f7i = f2i - f3r; f2r = f2r - f3i; f2i = f2i + f3r; f4r = f0r - f1i; f4i = f0i + f1r; t1r = f0r + f1i; t1i = f0i - f1r; f5r = t1r - f7r * w1r - f7i * w1r; f5i = t1i + f7r * w1r - f7i * w1r; f7r = t1r * Two - f5r; f7i = t1i * Two - f5i; f6r = f4r - f2r * w1r + f2i * w1r; f6i = f4i - f2r * w1r - f2i * w1r; f4r = f4r * Two - f6r; f4i = f4i * Two - f6i; *(p2r + pos) = f6r; *(p1r + pos) = f5r; *(p2r + posi) = f6i; *(p1r + posi) = f5i; *(p3r + pos) = f7r; *(p0r + pos) = f4r; *(p3r + posi) = f7i; *(p0r + posi) = f4i; } inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ /*** RADIX 8 Stages ***/ unsigned long pos; unsigned long posi; unsigned long pinc; unsigned long pnext; unsigned long NSameU; unsigned long Uinc; unsigned long Uinc2; unsigned long Uinc4; unsigned long DiffUCnt; unsigned long SameUCnt; unsigned long U2toU3; float *pstrt; float *p0r, *p1r, *p2r, *p3r; float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i; float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; float t0r, t0i, t1r, t1i; const float Two = 2.0; pinc = NDiffU * 2; // 2 floats per complex pnext = pinc * 8; pos = pinc * 4; posi = pos + 1; NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly Uinc = NSameU * Ustride; Uinc2 = Uinc * 2; Uinc4 = Uinc * 4; U2toU3 = (POW2(M) / 8)*Ustride; for (; StageCnt > 0 ; StageCnt--){ u0r = &Utbl[0]; u0i = &Utbl[POW2(M-2)*Ustride]; u1r = u0r; u1i = u0i; u2r = u0r; u2i = u0i; w0r = *u0r; w0i = *u0i; w1r = *u1r; w1i = *u1i; w2r = *u2r; w2i = *u2i; w3r = *(u2r+U2toU3); w3i = *(u2i-U2toU3); pstrt = ioptr; p0r = pstrt; p1r = pstrt+pinc; p2r = p1r+pinc; p3r = p2r+pinc; /* Butterflys */ /* f0 - - t0 - - f0 - - f0 f1 - w0- f1 - - f1 - - f1 f2 - - f2 - w1- f2 - - f4 f3 - w0- t1 - iw1- f3 - - f5 f4 - - t0 - - f4 - w2- t0 f5 - w0- f5 - - f5 - w3- t1 f6 - - f6 - w1- f6 - iw2- f6 f7 - w0- t1 - iw1- f7 - iw3- f7 */ for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){ f0r = *p0r; f0i = *(p0r + 1); f1r = *p1r; f1i = *(p1r + 1); for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ f2r = *p2r; f2i = *(p2r + 1); f3r = *p3r; f3i = *(p3r + 1); t0r = f0r + f1r * w0r - f1i * w0i; t0i = f0i + f1r * w0i + f1i * w0r; f1r = f0r * Two - t0r; f1i = f0i * Two - t0i; f4r = *(p0r + pos); f4i = *(p0r + posi); f5r = *(p1r + pos); f5i = *(p1r + posi); f6r = *(p2r + pos); f6i = *(p2r + posi); f7r = *(p3r + pos); f7i = *(p3r + posi); t1r = f2r - f3r * w0r + f3i * w0i; t1i = f2i - f3r * w0i - f3i * w0r; f2r = f2r * Two - t1r; f2i = f2i * Two - t1i; f0r = t0r + f2r * w1r - f2i * w1i; f0i = t0i + f2r * w1i + f2i * w1r; f2r = t0r * Two - f0r; f2i = t0i * Two - f0i; f3r = f1r + t1r * w1i + t1i * w1r; f3i = f1i - t1r * w1r + t1i * w1i; f1r = f1r * Two - f3r; f1i = f1i * Two - f3i; t0r = f4r + f5r * w0r - f5i * w0i; t0i = f4i + f5r * w0i + f5i * w0r; f5r = f4r * Two - t0r; f5i = f4i * Two - t0i; t1r = f6r - f7r * w0r + f7i * w0i; t1i = f6i - f7r * w0i - f7i * w0r; f6r = f6r * Two - t1r; f6i = f6i * Two - t1i; f4r = t0r + f6r * w1r - f6i * w1i; f4i = t0i + f6r * w1i + f6i * w1r; f6r = t0r * Two - f4r; f6i = t0i * Two - f4i; f7r = f5r + t1r * w1i + t1i * w1r; f7i = f5i - t1r * w1r + t1i * w1i; f5r = f5r * Two - f7r; f5i = f5i * Two - f7i; t0r = f0r - f4r * w2r + f4i * w2i; t0i = f0i - f4r * w2i - f4i * w2r; f0r = f0r * Two - t0r; f0i = f0i * Two - t0i; t1r = f1r - f5r * w3r + f5i * w3i; t1i = f1i - f5r * w3i - f5i * w3r; f1r = f1r * Two - t1r; f1i = f1i * Two - t1i; *(p0r + pos) = t0r; *(p0r + posi) = t0i; *p0r = f0r; *(p0r + 1) = f0i; p0r += pnext; f0r = *p0r; f0i = *(p0r + 1); *(p1r + pos) = t1r; *(p1r + posi) = t1i; *p1r = f1r; *(p1r + 1) = f1i; p1r += pnext; f1r = *p1r; f1i = *(p1r + 1); f4r = f2r - f6r * w2i - f6i * w2r; f4i = f2i + f6r * w2r - f6i * w2i; f6r = f2r * Two - f4r; f6i = f2i * Two - f4i; f5r = f3r - f7r * w3i - f7i * w3r; f5i = f3i + f7r * w3r - f7i * w3i; f7r = f3r * Two - f5r; f7i = f3i * Two - f5i; *p2r = f4r; *(p2r + 1) = f4i; *(p2r + pos) = f6r; *(p2r + posi) = f6i; p2r += pnext; *p3r = f5r; *(p3r + 1) = f5i; *(p3r + pos) = f7r; *(p3r + posi) = f7i; p3r += pnext; } f2r = *p2r; f2i = *(p2r + 1); f3r = *p3r; f3i = *(p3r + 1); t0r = f0r + f1r * w0r - f1i * w0i; t0i = f0i + f1r * w0i + f1i * w0r; f1r = f0r * Two - t0r; f1i = f0i * Two - t0i; f4r = *(p0r + pos); f4i = *(p0r + posi); f5r = *(p1r + pos); f5i = *(p1r + posi); f6r = *(p2r + pos); f6i = *(p2r + posi); f7r = *(p3r + pos); f7i = *(p3r + posi); t1r = f2r - f3r * w0r + f3i * w0i; t1i = f2i - f3r * w0i - f3i * w0r; f2r = f2r * Two - t1r; f2i = f2i * Two - t1i; f0r = t0r + f2r * w1r - f2i * w1i; f0i = t0i + f2r * w1i + f2i * w1r; f2r = t0r * Two - f0r; f2i = t0i * Two - f0i; f3r = f1r + t1r * w1i + t1i * w1r; f3i = f1i - t1r * w1r + t1i * w1i; f1r = f1r * Two - f3r; f1i = f1i * Two - f3i; if (DiffUCnt == NDiffU/2) Uinc4 = -Uinc4; u0r += Uinc4; u0i -= Uinc4; u1r += Uinc2; u1i -= Uinc2; u2r += Uinc; u2i -= Uinc; pstrt += 2; t0r = f4r + f5r * w0r - f5i * w0i; t0i = f4i + f5r * w0i + f5i * w0r; f5r = f4r * Two - t0r; f5i = f4i * Two - t0i; t1r = f6r - f7r * w0r + f7i * w0i; t1i = f6i - f7r * w0i - f7i * w0r; f6r = f6r * Two - t1r; f6i = f6i * Two - t1i; f4r = t0r + f6r * w1r - f6i * w1i; f4i = t0i + f6r * w1i + f6i * w1r; f6r = t0r * Two - f4r; f6i = t0i * Two - f4i; f7r = f5r + t1r * w1i + t1i * w1r; f7i = f5i - t1r * w1r + t1i * w1i; f5r = f5r * Two - f7r; f5i = f5i * Two - f7i; w0r = *u0r; w0i = *u0i; w1r = *u1r; w1i = *u1i; if (DiffUCnt <= NDiffU/2) w0r = -w0r; t0r = f0r - f4r * w2r + f4i * w2i; t0i = f0i - f4r * w2i - f4i * w2r; f0r = f0r * Two - t0r; f0i = f0i * Two - t0i; f4r = f2r - f6r * w2i - f6i * w2r; f4i = f2i + f6r * w2r - f6i * w2i; f6r = f2r * Two - f4r; f6i = f2i * Two - f4i; *(p0r + pos) = t0r; *p2r = f4r; *(p0r + posi) = t0i; *(p2r + 1) = f4i; w2r = *u2r; w2i = *u2i; *p0r = f0r; *(p2r + pos) = f6r; *(p0r + 1) = f0i; *(p2r + posi) = f6i; p0r = pstrt; p2r = pstrt + pinc + pinc; t1r = f1r - f5r * w3r + f5i * w3i; t1i = f1i - f5r * w3i - f5i * w3r; f1r = f1r * Two - t1r; f1i = f1i * Two - t1i; f5r = f3r - f7r * w3i - f7i * w3r; f5i = f3i + f7r * w3r - f7i * w3i; f7r = f3r * Two - f5r; f7i = f3i * Two - f5i; *(p1r + pos) = t1r; *p3r = f5r; *(p1r + posi) = t1i; *(p3r + 1) = f5i; w3r = *(u2r+U2toU3); w3i = *(u2i-U2toU3); *p1r = f1r; *(p3r + pos) = f7r; *(p1r + 1) = f1i; *(p3r + posi) = f7i; p1r = pstrt + pinc; p3r = p2r + pinc; } NSameU /= 8; Uinc /= 8; Uinc2 /= 8; Uinc4 = Uinc * 4; NDiffU *= 8; pinc *= 8; pnext *= 8; pos *= 8; posi = pos + 1; } } void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ /* recursive bfstages calls to maximize on chip cache efficiency */ long i1; if (M <= MCACHE) ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */ else{ for (i1=0; i1<8; i1++){ ifftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */ } ibfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */ } } void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ /* Compute in-place inverse complex fft on the rows of the input array */ /* INPUTS */ /* *ioptr = input data array */ /* M = log2 of fft size */ /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ /* *Utbl = cosine table */ /* *BRLow = bit reversed counter table */ /* OUTPUTS */ /* *ioptr = output data array */ long StageCnt; long NDiffU; const float scale = 1.0/POW2(M); switch (M){ case 0: break; case 1: for (;Rows>0;Rows--){ ifft2pt(ioptr, scale); /* a 2 pt fft */ ioptr += 2*POW2(M); } break; case 2: for (;Rows>0;Rows--){ ifft4pt(ioptr, scale); /* a 4 pt fft */ ioptr += 2*POW2(M); } break; case 3: for (;Rows>0;Rows--){ ifft8pt(ioptr, scale); /* an 8 pt fft */ ioptr += 2*POW2(M); } break; default: for (;Rows>0;Rows--){ scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ StageCnt = (M-1) / 3; // number of radix 8 stages NDiffU = 2; // one radix 2 stage already complete if ( (M-1-(StageCnt * 3)) == 1 ){ ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ NDiffU *= 2; } if ( (M-1-(StageCnt * 3)) == 2 ){ ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ NDiffU *= 4; } if (M <= MCACHE) ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ else{ ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ } ioptr += 2*POW2(M); } } } /************************************************ parts of rffts1 *************************************************/ inline void rfft1pt(float *ioptr); inline void rfft1pt(float *ioptr){ /*** RADIX 2 rfft ***/ float f0r, f0i; float t0r, t0i; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; /* finish rfft */ t0r = f0r + f0i; t0i = f0r - f0i; /* store result */ ioptr[0] = t0r; ioptr[1] = t0i; } inline void rfft2pt(float *ioptr); inline void rfft2pt(float *ioptr){ /*** RADIX 4 rfft ***/ float f0r, f0i, f1r, f1i; float t0r, t0i; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[2]; f1i = ioptr[3]; /* Butterflys */ /* f0 - - t0 f1 - 1 - f1 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f1i - f0i; /* finish rfft */ f0r = t0r + t0i; f0i = t0r - t0i; /* store result */ ioptr[0] = f0r; ioptr[1] = f0i; ioptr[2] = f1r; ioptr[3] = f1i; } inline void rfft4pt(float *ioptr); inline void rfft4pt(float *ioptr){ /*** RADIX 8 rfft ***/ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float t0r, t0i, t1r, t1i; float w0r = 1.0/MYROOT2; /* cos(pi/4) */ const float Two = 2.0; const float scale = 0.5; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[4]; f1i = ioptr[5]; f2r = ioptr[2]; f2i = ioptr[3]; f3r = ioptr[6]; f3i = ioptr[7]; /* Butterflys */ /* f0 - - t0 - - f0 f1 - 1 - f1 - - f1 f2 - - f2 - 1 - f2 f3 - 1 - t1 - -i - f3 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r - f3r; t1i = f2i - f3i; f2r = f2r + f3r; f2i = f2i + f3i; f0r = t0r + f2r; f0i = t0i + f2i; f2r = t0r - f2r; f2i = f2i - t0i; // neg for rfft f3r = f1r - t1i; f3i = f1i + t1r; f1r = f1r + t1i; f1i = f1i - t1r; /* finish rfft */ t0r = f0r + f0i; /* compute Re(x[0]) */ t0i = f0r - f0i; /* compute Re(x[N/2]) */ t1r = f1r + f3r; t1i = f1i - f3i; f0r = f1i + f3i; f0i = f3r - f1r; f1r = t1r + w0r * f0r + w0r * f0i; f1i = t1i - w0r * f0r + w0r * f0i; f3r = Two * t1r - f1r; f3i = f1i - Two * t1i; /* store result */ ioptr[4] = f2r; ioptr[5] = f2i; ioptr[0] = t0r; ioptr[1] = t0i; ioptr[2] = scale*f1r; ioptr[3] = scale*f1i; ioptr[6] = scale*f3r; ioptr[7] = scale*f3i; } inline void rfft8pt(float *ioptr); inline void rfft8pt(float *ioptr){ /*** RADIX 16 rfft ***/ float w0r = 1.0/MYROOT2; /* cos(pi/4) */ float w1r = MYCOSPID8; /* cos(pi/8) */ float w1i = MYSINPID8; /* sin(pi/8) */ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; float t0r, t0i, t1r, t1i; const float Two = 2.0; const float scale = 0.5; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; f1r = ioptr[8]; f1i = ioptr[9]; f2r = ioptr[4]; f2i = ioptr[5]; f3r = ioptr[12]; f3i = ioptr[13]; f4r = ioptr[2]; f4i = ioptr[3]; f5r = ioptr[10]; f5i = ioptr[11]; f6r = ioptr[6]; f6i = ioptr[7]; f7r = ioptr[14]; f7i = ioptr[15]; /* Butterflys */ /* f0 - - t0 - - f0 - - f0 f1 - 1 - f1 - - f1 - - f1 f2 - - f2 - 1 - f2 - - f2 f3 - 1 - t1 - -i - f3 - - f3 f4 - - t0 - - f4 - 1 - t0 f5 - 1 - f5 - - f5 - w3 - f4 f6 - - f6 - 1 - f6 - -i - t1 f7 - 1 - t1 - -i - f7 - iw3- f6 */ t0r = f0r + f1r; t0i = f0i + f1i; f1r = f0r - f1r; f1i = f0i - f1i; t1r = f2r - f3r; t1i = f2i - f3i; f2r = f2r + f3r; f2i = f2i + f3i; f0r = t0r + f2r; f0i = t0i + f2i; f2r = t0r - f2r; f2i = t0i - f2i; f3r = f1r - t1i; f3i = f1i + t1r; f1r = f1r + t1i; f1i = f1i - t1r; t0r = f4r + f5r; t0i = f4i + f5i; f5r = f4r - f5r; f5i = f4i - f5i; t1r = f6r - f7r; t1i = f6i - f7i; f6r = f6r + f7r; f6i = f6i + f7i; f4r = t0r + f6r; f4i = t0i + f6i; f6r = t0r - f6r; f6i = t0i - f6i; f7r = f5r - t1i; f7i = f5i + t1r; f5r = f5r + t1i; f5i = f5i - t1r; t0r = f0r - f4r; t0i = f4i - f0i; // neg for rfft f0r = f0r + f4r; f0i = f0i + f4i; t1r = f2r - f6i; t1i = f2i + f6r; f2r = f2r + f6i; f2i = f2i - f6r; f4r = f1r - f5r * w0r - f5i * w0r; f4i = f1i + f5r * w0r - f5i * w0r; f1r = f1r * Two - f4r; f1i = f1i * Two - f4i; f6r = f3r + f7r * w0r - f7i * w0r; f6i = f3i + f7r * w0r + f7i * w0r; f3r = f3r * Two - f6r; f3i = f3i * Two - f6i; /* finish rfft */ f5r = f0r + f0i; /* compute Re(x[0]) */ f5i = f0r - f0i; /* compute Re(x[N/2]) */ f0r = f2r + t1r; f0i = f2i - t1i; f7r = f2i + t1i; f7i = t1r - f2r; f2r = f0r + w0r * f7r + w0r * f7i; f2i = f0i - w0r * f7r + w0r * f7i; t1r = Two * f0r - f2r; t1i = f2i - Two * f0i; f0r = f1r + f6r; f0i = f1i - f6i; f7r = f1i + f6i; f7i = f6r - f1r; f1r = f0r + w1r * f7r + w1i * f7i; f1i = f0i - w1i * f7r + w1r * f7i; f6r = Two * f0r - f1r; f6i = f1i - Two * f0i; f0r = f3r + f4r; f0i = f3i - f4i; f7r = f3i + f4i; f7i = f4r - f3r; f3r = f0r + w1i * f7r + w1r * f7i; f3i = f0i - w1r * f7r + w1i * f7i; f4r = Two * f0r - f3r; f4i = f3i - Two * f0i; /* store result */ ioptr[8] = t0r; ioptr[9] = t0i; ioptr[0] = f5r; ioptr[1] = f5i; ioptr[4] = scale*f2r; ioptr[5] = scale*f2i; ioptr[12] = scale*t1r; ioptr[13] = scale*t1i; ioptr[2] = scale*f1r; ioptr[3] = scale*f1i; ioptr[6] = scale*f3r; ioptr[7] = scale*f3i; ioptr[10] = scale*f4r; ioptr[11] = scale*f4i; ioptr[14] = scale*f6r; ioptr[15] = scale*f6i; } inline void frstage(float *ioptr, long M, float *Utbl); inline void frstage(float *ioptr, long M, float *Utbl){ /* Finish RFFT */ unsigned long pos; unsigned long posi; unsigned long diffUcnt; float *p0r, *p1r; float *u0r, *u0i; float w0r, w0i; float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i; float t0r, t0i, t1r, t1i; const float Two = 2.0; pos = POW2(M-1); posi = pos + 1; p0r = ioptr; p1r = ioptr + pos/2; u0r = Utbl + POW2(M-3); w0r = *u0r, f0r = *(p0r); f0i = *(p0r + 1); f4r = *(p0r + pos); f4i = *(p0r + posi); f1r = *(p1r); f1i = *(p1r + 1); f5r = *(p1r + pos); f5i = *(p1r + posi); t0r = Two * f0r + Two * f0i; /* compute Re(x[0]) */ t0i = Two * f0r - Two * f0i; /* compute Re(x[N/2]) */ t1r = f4r + f4r; t1i = -f4i - f4i; f0r = f1r + f5r; f0i = f1i - f5i; f4r = f1i + f5i; f4i = f5r - f1r; f1r = f0r + w0r * f4r + w0r * f4i; f1i = f0i - w0r * f4r + w0r * f4i; f5r = Two * f0r - f1r; f5i = f1i - Two * f0i; *(p0r) = t0r; *(p0r + 1) = t0i; *(p0r + pos) = t1r; *(p0r + posi) = t1i; *(p1r) = f1r; *(p1r + 1) = f1i; *(p1r + pos) = f5r; *(p1r + posi) = f5i; u0r = Utbl + 1; u0i = Utbl + (POW2(M-2)-1); w0r = *u0r, w0i = *u0i; p0r = (ioptr + 2); p1r = (ioptr + (POW2(M-2)-1)*2); /* Butterflys */ /* f0 - t0 - - f0 f5 - t1 - w0 - f5 f1 - t0 - - f1 f4 - t1 -iw0 - f4 */ for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){ f0r = *(p0r); f0i = *(p0r + 1); f5r = *(p1r + pos); f5i = *(p1r + posi); f1r = *(p1r); f1i = *(p1r + 1); f4r = *(p0r + pos); f4i = *(p0r + posi); t0r = f0r + f5r; t0i = f0i - f5i; t1r = f0i + f5i; t1i = f5r - f0r; f0r = t0r + w0r * t1r + w0i * t1i; f0i = t0i - w0i * t1r + w0r * t1i; f5r = Two * t0r - f0r; f5i = f0i - Two * t0i; t0r = f1r + f4r; t0i = f1i - f4i; t1r = f1i + f4i; t1i = f4r - f1r; f1r = t0r + w0i * t1r + w0r * t1i; f1i = t0i - w0r * t1r + w0i * t1i; f4r = Two * t0r - f1r; f4i = f1i - Two * t0i; *(p0r) = f0r; *(p0r + 1) = f0i; *(p1r + pos) = f5r; *(p1r + posi) = f5i; w0r = *++u0r; w0i = *--u0i; *(p1r) = f1r; *(p1r + 1) = f1i; *(p0r + pos) = f4r; *(p0r + posi) = f4i; p0r += 2; p1r -= 2; }; } void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ /* Compute in-place real fft on the rows of the input array */ /* The result is the complex spectra of the positive frequencies */ /* except the location for the first complex number contains the real */ /* values for DC and Nyquest */ /* INPUTS */ /* *ioptr = real input data array */ /* M = log2 of fft size */ /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ /* *Utbl = cosine table */ /* *BRLow = bit reversed counter table */ /* OUTPUTS */ /* *ioptr = output data array in the following order */ /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ float scale; long StageCnt; long NDiffU; M=M-1; switch (M){ case -1: break; case 0: for (;Rows>0;Rows--){ rfft1pt(ioptr); /* a 2 pt fft */ ioptr += 2*POW2(M); } case 1: for (;Rows>0;Rows--){ rfft2pt(ioptr); /* a 4 pt fft */ ioptr += 2*POW2(M); } break; case 2: for (;Rows>0;Rows--){ rfft4pt(ioptr); /* an 8 pt fft */ ioptr += 2*POW2(M); } break; case 3: for (;Rows>0;Rows--){ rfft8pt(ioptr); /* a 16 pt fft */ ioptr += 2*POW2(M); } break; default: scale = 0.5; for (;Rows>0;Rows--){ scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ StageCnt = (M-1) / 3; // number of radix 8 stages NDiffU = 2; // one radix 2 stage already complete if ( (M-1-(StageCnt * 3)) == 1 ){ bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ NDiffU *= 2; } if ( (M-1-(StageCnt * 3)) == 2 ){ bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ NDiffU *= 4; } if (M <= MCACHE){ bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ frstage(ioptr, M+1, Utbl); } else{ fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ frstage(ioptr, M+1, Utbl); } ioptr += 2*POW2(M); } } } /************************************************ parts of riffts1 *************************************************/ inline void rifft1pt(float *ioptr, float scale); inline void rifft1pt(float *ioptr, float scale){ /*** RADIX 2 rifft ***/ float f0r, f0i; float t0r, t0i; /* bit reversed load */ f0r = ioptr[0]; f0i = ioptr[1]; /* finish rfft */ t0r = f0r + f0i; t0i = f0r - f0i; /* store result */ ioptr[0] = scale*t0r; ioptr[1] = scale*t0i; } inline void rifft2pt(float *ioptr, float scale); inline void rifft2pt(float *ioptr, float scale){ /*** RADIX 4 rifft ***/ float f0r, f0i, f1r, f1i; float t0r, t0i; const float Two = 2.0; /* bit reversed load */ t0r = ioptr[0]; t0i = ioptr[1]; f1r = Two * ioptr[2]; f1i = Two * ioptr[3]; /* start rifft */ f0r = t0r + t0i; f0i = t0r - t0i; /* Butterflys */ /* f0 - - t0 f1 - 1 - f1 */ t0r = f0r + f1r; t0i = f0i - f1i; f1r = f0r - f1r; f1i = f0i + f1i; /* store result */ ioptr[0] = scale*t0r; ioptr[1] = scale*t0i; ioptr[2] = scale*f1r; ioptr[3] = scale*f1i; } inline void rifft4pt(float *ioptr, float scale); inline void rifft4pt(float *ioptr, float scale){ /*** RADIX 8 rifft ***/ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float t0r, t0i, t1r, t1i; float w0r = 1.0/MYROOT2; /* cos(pi/4) */ const float Two = 2.0; /* bit reversed load */ t0r = ioptr[0]; t0i = ioptr[1]; f2r = ioptr[2]; f2i = ioptr[3]; f1r = Two * ioptr[4]; f1i = Two * ioptr[5]; f3r = ioptr[6]; f3i = ioptr[7]; /* start rfft */ f0r = t0r + t0i; /* compute Re(x[0]) */ f0i = t0r - t0i; /* compute Re(x[N/2]) */ t1r = f2r + f3r; t1i = f2i - f3i; t0r = f2r - f3r; t0i = f2i + f3i; f2r = t1r - w0r * t0r - w0r * t0i; f2i = t1i + w0r * t0r - w0r * t0i; f3r = Two * t1r - f2r; f3i = f2i - Two * t1i; /* Butterflys */ /* f0 - - t0 - - f0 f1 - 1 - f1 - - f1 f2 - - f2 - 1 - f2 f3 - 1 - t1 - i - f3 */ t0r = f0r + f1r; t0i = f0i - f1i; f1r = f0r - f1r; f1i = f0i + f1i; t1r = f2r - f3r; t1i = f2i - f3i; f2r = f2r + f3r; f2i = f2i + f3i; f0r = t0r + f2r; f0i = t0i + f2i; f2r = t0r - f2r; f2i = t0i - f2i; f3r = f1r + t1i; f3i = f1i - t1r; f1r = f1r - t1i; f1i = f1i + t1r; /* store result */ ioptr[0] = scale*f0r; ioptr[1] = scale*f0i; ioptr[2] = scale*f1r; ioptr[3] = scale*f1i; ioptr[4] = scale*f2r; ioptr[5] = scale*f2i; ioptr[6] = scale*f3r; ioptr[7] = scale*f3i; } inline void rifft8pt(float *ioptr, float scale); inline void rifft8pt(float *ioptr, float scale){ /*** RADIX 16 rifft ***/ float w0r = 1.0/MYROOT2; /* cos(pi/4) */ float w1r = MYCOSPID8; /* cos(pi/8) */ float w1i = MYSINPID8; /* sin(pi/8) */ float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; float t0r, t0i, t1r, t1i; const float Two = 2.0; /* bit reversed load */ t0r = ioptr[0]; t0i = ioptr[1]; f4r = ioptr[2]; f4i = ioptr[3]; f2r = ioptr[4]; f2i = ioptr[5]; f6r = ioptr[6]; f6i = ioptr[7]; f1r = Two * ioptr[8]; f1i = Two * ioptr[9]; f5r = ioptr[10]; f5i = ioptr[11]; f3r = ioptr[12]; f3i = ioptr[13]; f7r = ioptr[14]; f7i = ioptr[15]; /* start rfft */ f0r = t0r + t0i; /* compute Re(x[0]) */ f0i = t0r - t0i; /* compute Re(x[N/2]) */ t0r = f2r + f3r; t0i = f2i - f3i; t1r = f2r - f3r; t1i = f2i + f3i; f2r = t0r - w0r * t1r - w0r * t1i; f2i = t0i + w0r * t1r - w0r * t1i; f3r = Two * t0r - f2r; f3i = f2i - Two * t0i; t0r = f4r + f7r; t0i = f4i - f7i; t1r = f4r - f7r; t1i = f4i + f7i; f4r = t0r - w1i * t1r - w1r * t1i; f4i = t0i + w1r * t1r - w1i * t1i; f7r = Two * t0r - f4r; f7i = f4i - Two * t0i; t0r = f6r + f5r; t0i = f6i - f5i; t1r = f6r - f5r; t1i = f6i + f5i; f6r = t0r - w1r * t1r - w1i * t1i; f6i = t0i + w1i * t1r - w1r * t1i; f5r = Two * t0r - f6r; f5i = f6i - Two * t0i; /* Butterflys */ /* f0 - - t0 - - f0 - - f0 f1* - 1 - f1 - - f1 - - f1 f2 - - f2 - 1 - f2 - - f2 f3 - 1 - t1 - i - f3 - - f3 f4 - - t0 - - f4 - 1 - t0 f5 - 1 - f5 - - f5 - w3 - f4 f6 - - f6 - 1 - f6 - i - t1 f7 - 1 - t1 - i - f7 - iw3- f6 */ t0r = f0r + f1r; t0i = f0i - f1i; f1r = f0r - f1r; f1i = f0i + f1i; t1r = f2r - f3r; t1i = f2i - f3i; f2r = f2r + f3r; f2i = f2i + f3i; f0r = t0r + f2r; f0i = t0i + f2i; f2r = t0r - f2r; f2i = t0i - f2i; f3r = f1r + t1i; f3i = f1i - t1r; f1r = f1r - t1i; f1i = f1i + t1r; t0r = f4r + f5r; t0i = f4i + f5i; f5r = f4r - f5r; f5i = f4i - f5i; t1r = f6r - f7r; t1i = f6i - f7i; f6r = f6r + f7r; f6i = f6i + f7i; f4r = t0r + f6r; f4i = t0i + f6i; f6r = t0r - f6r; f6i = t0i - f6i; f7r = f5r + t1i; f7i = f5i - t1r; f5r = f5r - t1i; f5i = f5i + t1r; t0r = f0r - f4r; t0i = f0i - f4i; f0r = f0r + f4r; f0i = f0i + f4i; t1r = f2r + f6i; t1i = f2i - f6r; f2r = f2r - f6i; f2i = f2i + f6r; f4r = f1r - f5r * w0r + f5i * w0r; f4i = f1i - f5r * w0r - f5i * w0r; f1r = f1r * Two - f4r; f1i = f1i * Two - f4i; f6r = f3r + f7r * w0r + f7i * w0r; f6i = f3i - f7r * w0r + f7i * w0r; f3r = f3r * Two - f6r; f3i = f3i * Two - f6i; /* store result */ ioptr[0] = scale*f0r; ioptr[1] = scale*f0i; ioptr[2] = scale*f1r; ioptr[3] = scale*f1i; ioptr[4] = scale*f2r; ioptr[5] = scale*f2i; ioptr[6] = scale*f3r; ioptr[7] = scale*f3i; ioptr[8] = scale*t0r; ioptr[9] = scale*t0i; ioptr[10] = scale*f4r; ioptr[11] = scale*f4i; ioptr[12] = scale*t1r; ioptr[13] = scale*t1i; ioptr[14] = scale*f6r; ioptr[15] = scale*f6i; } inline void ifrstage(float *ioptr, long M, float *Utbl); inline void ifrstage(float *ioptr, long M, float *Utbl){ /* Start RIFFT */ unsigned long pos; unsigned long posi; unsigned long diffUcnt; float *p0r, *p1r; float *u0r, *u0i; float w0r, w0i; float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i; float t0r, t0i, t1r, t1i; const float Two = 2.0; pos = POW2(M-1); posi = pos + 1; p0r = ioptr; p1r = ioptr + pos/2; u0r = Utbl + POW2(M-3); w0r = *u0r, f0r = *(p0r); f0i = *(p0r + 1); f4r = *(p0r + pos); f4i = *(p0r + posi); f1r = *(p1r); f1i = *(p1r + 1); f5r = *(p1r + pos); f5i = *(p1r + posi); t0r = f0r + f0i; t0i = f0r - f0i; t1r = f4r + f4r; t1i = -f4i - f4i; f0r = f1r + f5r; f0i = f1i - f5i; f4r = f1r - f5r; f4i = f1i + f5i; f1r = f0r - w0r * f4r - w0r * f4i; f1i = f0i + w0r * f4r - w0r * f4i; f5r = Two * f0r - f1r; f5i = f1i - Two * f0i; *(p0r) = t0r; *(p0r + 1) = t0i; *(p0r + pos) = t1r; *(p0r + posi) = t1i; *(p1r) = f1r; *(p1r + 1) = f1i; *(p1r + pos) = f5r; *(p1r + posi) = f5i; u0r = Utbl + 1; u0i = Utbl + (POW2(M-2)-1); w0r = *u0r, w0i = *u0i; p0r = (ioptr + 2); p1r = (ioptr + (POW2(M-2)-1)*2); /* Butterflys */ /* f0 - t0 - f0 f1 - t1 -w0- f1 f2 - t0 - f2 f3 - t1 -iw0- f3 */ for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){ f0r = *(p0r); f0i = *(p0r + 1); f5r = *(p1r + pos); f5i = *(p1r + posi); f1r = *(p1r); f1i = *(p1r + 1); f4r = *(p0r + pos); f4i = *(p0r + posi); t0r = f0r + f5r; t0i = f0i - f5i; t1r = f0r - f5r; t1i = f0i + f5i; f0r = t0r - w0i * t1r - w0r * t1i; f0i = t0i + w0r * t1r - w0i * t1i; f5r = Two * t0r - f0r; f5i = f0i - Two * t0i; t0r = f1r + f4r; t0i = f1i - f4i; t1r = f1r - f4r; t1i = f1i + f4i; f1r = t0r - w0r * t1r - w0i * t1i; f1i = t0i + w0i * t1r - w0r * t1i; f4r = Two * t0r - f1r; f4i = f1i - Two * t0i; *(p0r) = f0r; *(p0r + 1) = f0i; *(p1r + pos) = f5r; *(p1r + posi) = f5i; w0r = *++u0r; w0i = *--u0i; *(p1r) = f1r; *(p1r + 1) = f1i; *(p0r + pos) = f4r; *(p0r + posi) = f4i; p0r += 2; p1r -= 2; }; } void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ /* Compute in-place real ifft on the rows of the input array */ /* data order as from rffts1 */ /* INPUTS */ /* *ioptr = input data array in the following order */ /* M = log2 of fft size */ /* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ /* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ /* *Utbl = cosine table */ /* *BRLow = bit reversed counter table */ /* OUTPUTS */ /* *ioptr = real output data array */ float scale; long StageCnt; long NDiffU; scale = 1.0/POW2(M); M=M-1; switch (M){ case -1: break; case 0: for (;Rows>0;Rows--){ rifft1pt(ioptr, scale); /* a 2 pt fft */ ioptr += 2*POW2(M); } case 1: for (;Rows>0;Rows--){ rifft2pt(ioptr, scale); /* a 4 pt fft */ ioptr += 2*POW2(M); } break; case 2: for (;Rows>0;Rows--){ rifft4pt(ioptr, scale); /* an 8 pt fft */ ioptr += 2*POW2(M); } break; case 3: for (;Rows>0;Rows--){ rifft8pt(ioptr, scale); /* a 16 pt fft */ ioptr += 2*POW2(M); } break; default: for (;Rows>0;Rows--){ ifrstage(ioptr, M+1, Utbl); scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ StageCnt = (M-1) / 3; // number of radix 8 stages NDiffU = 2; // one radix 2 stage already complete if ( (M-1-(StageCnt * 3)) == 1 ){ ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ NDiffU *= 2; } if ( (M-1-(StageCnt * 3)) == 2 ){ ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ NDiffU *= 4; } if (M <= MCACHE){ ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ } else{ ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ } ioptr += 2*POW2(M); } } }