diff --git a/gst-libs/gst/fft/kiss_fft_f32.c b/gst-libs/gst/fft/kiss_fft_f32.c index f53d1d782a..693a899e00 100644 --- a/gst-libs/gst/fft/kiss_fft_f32.c +++ b/gst-libs/gst/fft/kiss_fft_f32.c @@ -265,6 +265,40 @@ kf_work (kiss_fft_f32_cpx * Fout, const int m = *factors++; /* stage's fft length/p */ const kiss_fft_f32_cpx *Fout_end = Fout + p * m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride == 1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k = 0; k < p; ++k) + kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, + in_stride, factors, st); + // all threads have joined by this point + + switch (p) { + case 2: + kf_bfly2 (Fout, fstride, st, m); + break; + case 3: + kf_bfly3 (Fout, fstride, st, m); + break; + case 4: + kf_bfly4 (Fout, fstride, st, m); + break; + case 5: + kf_bfly5 (Fout, fstride, st, m); + break; + default: + kf_bfly_generic (Fout, fstride, st, m, p); + break; + } + return; + } +#endif + if (m == 1) { do { *Fout = *f; @@ -272,6 +306,10 @@ kf_work (kiss_fft_f32_cpx * Fout, } while (++Fout != Fout_end); } else { do { + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); @@ -279,6 +317,7 @@ kf_work (kiss_fft_f32_cpx * Fout, Fout = Fout_beg; + // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); diff --git a/gst-libs/gst/fft/kiss_fft_f32.h b/gst-libs/gst/fft/kiss_fft_f32.h index 06a893d2a9..098b89da70 100644 --- a/gst-libs/gst/fft/kiss_fft_f32.h +++ b/gst-libs/gst/fft/kiss_fft_f32.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #ifdef __cplusplus @@ -92,6 +92,10 @@ void kiss_fft_f32_cleanup(void); */ int kiss_fft_f32_next_fast_size(int n); +/* for real ffts, we need an even size */ +#define kiss_fftr_next_fast_size_real(n) \ + (kiss_fft_next_fast_size( ((n)+1)>>1)<<1) + #ifdef __cplusplus } #endif diff --git a/gst-libs/gst/fft/kiss_fft_f64.c b/gst-libs/gst/fft/kiss_fft_f64.c index 4f0cbc5adb..b3117117b8 100644 --- a/gst-libs/gst/fft/kiss_fft_f64.c +++ b/gst-libs/gst/fft/kiss_fft_f64.c @@ -265,6 +265,40 @@ kf_work (kiss_fft_f64_cpx * Fout, const int m = *factors++; /* stage's fft length/p */ const kiss_fft_f64_cpx *Fout_end = Fout + p * m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride == 1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k = 0; k < p; ++k) + kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, + in_stride, factors, st); + // all threads have joined by this point + + switch (p) { + case 2: + kf_bfly2 (Fout, fstride, st, m); + break; + case 3: + kf_bfly3 (Fout, fstride, st, m); + break; + case 4: + kf_bfly4 (Fout, fstride, st, m); + break; + case 5: + kf_bfly5 (Fout, fstride, st, m); + break; + default: + kf_bfly_generic (Fout, fstride, st, m, p); + break; + } + return; + } +#endif + if (m == 1) { do { *Fout = *f; @@ -272,6 +306,10 @@ kf_work (kiss_fft_f64_cpx * Fout, } while (++Fout != Fout_end); } else { do { + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); @@ -279,6 +317,7 @@ kf_work (kiss_fft_f64_cpx * Fout, Fout = Fout_beg; + // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); diff --git a/gst-libs/gst/fft/kiss_fft_f64.h b/gst-libs/gst/fft/kiss_fft_f64.h index 572b33ab61..90792bbda3 100644 --- a/gst-libs/gst/fft/kiss_fft_f64.h +++ b/gst-libs/gst/fft/kiss_fft_f64.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #ifdef __cplusplus @@ -92,6 +92,10 @@ void kiss_fft_f64_cleanup(void); */ int kiss_fft_f64_next_fast_size(int n); +/* for real ffts, we need an even size */ +#define kiss_fftr_next_fast_size_real(n) \ + (kiss_fft_next_fast_size( ((n)+1)>>1)<<1) + #ifdef __cplusplus } #endif diff --git a/gst-libs/gst/fft/kiss_fft_s16.c b/gst-libs/gst/fft/kiss_fft_s16.c index ca20004b36..ee2de4eefc 100644 --- a/gst-libs/gst/fft/kiss_fft_s16.c +++ b/gst-libs/gst/fft/kiss_fft_s16.c @@ -265,6 +265,40 @@ kf_work (kiss_fft_s16_cpx * Fout, const int m = *factors++; /* stage's fft length/p */ const kiss_fft_s16_cpx *Fout_end = Fout + p * m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride == 1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k = 0; k < p; ++k) + kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, + in_stride, factors, st); + // all threads have joined by this point + + switch (p) { + case 2: + kf_bfly2 (Fout, fstride, st, m); + break; + case 3: + kf_bfly3 (Fout, fstride, st, m); + break; + case 4: + kf_bfly4 (Fout, fstride, st, m); + break; + case 5: + kf_bfly5 (Fout, fstride, st, m); + break; + default: + kf_bfly_generic (Fout, fstride, st, m, p); + break; + } + return; + } +#endif + if (m == 1) { do { *Fout = *f; @@ -272,6 +306,10 @@ kf_work (kiss_fft_s16_cpx * Fout, } while (++Fout != Fout_end); } else { do { + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); @@ -279,6 +317,7 @@ kf_work (kiss_fft_s16_cpx * Fout, Fout = Fout_beg; + // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); diff --git a/gst-libs/gst/fft/kiss_fft_s16.h b/gst-libs/gst/fft/kiss_fft_s16.h index 9e229055ff..4303127ac3 100644 --- a/gst-libs/gst/fft/kiss_fft_s16.h +++ b/gst-libs/gst/fft/kiss_fft_s16.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #ifdef __cplusplus @@ -95,6 +95,10 @@ void kiss_fft_s16_cleanup(void); */ int kiss_fft_s16_next_fast_size(int n); +/* for real ffts, we need an even size */ +#define kiss_fftr_next_fast_size_real(n) \ + (kiss_fft_next_fast_size( ((n)+1)>>1)<<1) + #ifdef __cplusplus } #endif diff --git a/gst-libs/gst/fft/kiss_fft_s32.c b/gst-libs/gst/fft/kiss_fft_s32.c index 760013c251..5acb2ed67e 100644 --- a/gst-libs/gst/fft/kiss_fft_s32.c +++ b/gst-libs/gst/fft/kiss_fft_s32.c @@ -265,6 +265,40 @@ kf_work (kiss_fft_s32_cpx * Fout, const int m = *factors++; /* stage's fft length/p */ const kiss_fft_s32_cpx *Fout_end = Fout + p * m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride == 1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k = 0; k < p; ++k) + kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, + in_stride, factors, st); + // all threads have joined by this point + + switch (p) { + case 2: + kf_bfly2 (Fout, fstride, st, m); + break; + case 3: + kf_bfly3 (Fout, fstride, st, m); + break; + case 4: + kf_bfly4 (Fout, fstride, st, m); + break; + case 5: + kf_bfly5 (Fout, fstride, st, m); + break; + default: + kf_bfly_generic (Fout, fstride, st, m, p); + break; + } + return; + } +#endif + if (m == 1) { do { *Fout = *f; @@ -272,6 +306,10 @@ kf_work (kiss_fft_s32_cpx * Fout, } while (++Fout != Fout_end); } else { do { + // recursive call: + // DFT of size m*p performed by doing + // p instances of smaller DFTs of size m, + // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); @@ -279,6 +317,7 @@ kf_work (kiss_fft_s32_cpx * Fout, Fout = Fout_beg; + // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); diff --git a/gst-libs/gst/fft/kiss_fft_s32.h b/gst-libs/gst/fft/kiss_fft_s32.h index b1c1d707a9..8edd66454a 100644 --- a/gst-libs/gst/fft/kiss_fft_s32.h +++ b/gst-libs/gst/fft/kiss_fft_s32.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #ifdef __cplusplus @@ -96,6 +96,10 @@ void kiss_fft_s32_cleanup(void); */ int kiss_fft_s32_next_fast_size(int n); +/* for real ffts, we need an even size */ +#define kiss_fftr_next_fast_size_real(n) \ + (kiss_fft_next_fast_size( ((n)+1)>>1)<<1) + #ifdef __cplusplus } #endif diff --git a/gst-libs/gst/fft/kiss_fftr_f32.c b/gst-libs/gst/fft/kiss_fftr_f32.c index ea45206eb0..ab11c04e30 100644 --- a/gst-libs/gst/fft/kiss_fftr_f32.c +++ b/gst-libs/gst/fft/kiss_fftr_f32.c @@ -40,7 +40,7 @@ kiss_fftr_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) kiss_fft_f32_alloc (nfft, inverse_fft, NULL, &subsize); memneeded = ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state)) - + ALIGN_STRUCT (subsize) + sizeof (kiss_fft_f32_cpx) * (nfft * 2); + + ALIGN_STRUCT (subsize) + sizeof (kiss_fft_f32_cpx) * (nfft * 3 / 2); if (lenmem == NULL) { st = (kiss_fftr_f32_cfg) KISS_FFT_F32_MALLOC (memneeded); @@ -58,8 +58,9 @@ kiss_fftr_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) st->super_twiddles = st->tmpbuf + nfft; kiss_fft_f32_alloc (nfft, inverse_fft, st->substate, &subsize); - for (i = 0; i < nfft; ++i) { - double phase = -3.14159265358979323846264338327 * ((double) i / nfft + .5); + for (i = 0; i < nfft / 2; ++i) { + double phase = + -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); if (inverse_fft) phase *= -1; @@ -117,7 +118,7 @@ kiss_fftr_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_scalar * timedata, C_ADD (f1k, fpk, fpnk); C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k]); + C_MUL (tw, f2k, st->super_twiddles[k - 1]); freqdata[k].r = HALF_OF (f1k.r + tw.r); freqdata[k].i = HALF_OF (f1k.i + tw.i); @@ -155,7 +156,7 @@ kiss_fftri_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_cpx * freqdata, C_ADD (fek, fk, fnkc); C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k]); + C_MUL (fok, tmp, st->super_twiddles[k - 1]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD diff --git a/gst-libs/gst/fft/kiss_fftr_f64.c b/gst-libs/gst/fft/kiss_fftr_f64.c index 59587147ba..740be2711d 100644 --- a/gst-libs/gst/fft/kiss_fftr_f64.c +++ b/gst-libs/gst/fft/kiss_fftr_f64.c @@ -41,7 +41,7 @@ kiss_fftr_f64_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) kiss_fft_f64_alloc (nfft, inverse_fft, NULL, &subsize); memneeded = ALIGN_STRUCT (sizeof (struct kiss_fftr_f64_state)) + ALIGN_STRUCT (subsize) - + sizeof (kiss_fft_f64_cpx) * (nfft * 2); + + sizeof (kiss_fft_f64_cpx) * (nfft * 3 / 2); if (lenmem == NULL) { st = (kiss_fftr_f64_cfg) KISS_FFT_F64_MALLOC (memneeded); @@ -59,8 +59,9 @@ kiss_fftr_f64_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) st->super_twiddles = st->tmpbuf + nfft; kiss_fft_f64_alloc (nfft, inverse_fft, st->substate, &subsize); - for (i = 0; i < nfft; ++i) { - double phase = -3.14159265358979323846264338327 * ((double) i / nfft + .5); + for (i = 0; i < nfft / 2; ++i) { + double phase = + -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); if (inverse_fft) phase *= -1; @@ -118,7 +119,7 @@ kiss_fftr_f64 (kiss_fftr_f64_cfg st, const kiss_fft_f64_scalar * timedata, C_ADD (f1k, fpk, fpnk); C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k]); + C_MUL (tw, f2k, st->super_twiddles[k - 1]); freqdata[k].r = HALF_OF (f1k.r + tw.r); freqdata[k].i = HALF_OF (f1k.i + tw.i); @@ -156,7 +157,7 @@ kiss_fftri_f64 (kiss_fftr_f64_cfg st, const kiss_fft_f64_cpx * freqdata, C_ADD (fek, fk, fnkc); C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k]); + C_MUL (fok, tmp, st->super_twiddles[k - 1]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD diff --git a/gst-libs/gst/fft/kiss_fftr_s16.c b/gst-libs/gst/fft/kiss_fftr_s16.c index 803fa63380..f1ec990cb3 100644 --- a/gst-libs/gst/fft/kiss_fftr_s16.c +++ b/gst-libs/gst/fft/kiss_fftr_s16.c @@ -41,7 +41,7 @@ kiss_fftr_s16_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) kiss_fft_s16_alloc (nfft, inverse_fft, NULL, &subsize); memneeded = ALIGN_STRUCT (sizeof (struct kiss_fftr_s16_state)) + ALIGN_STRUCT (subsize) - + sizeof (kiss_fft_s16_cpx) * (nfft * 2); + + sizeof (kiss_fft_s16_cpx) * (nfft * 3 / 2); if (lenmem == NULL) { st = (kiss_fftr_s16_cfg) KISS_FFT_S16_MALLOC (memneeded); @@ -59,8 +59,9 @@ kiss_fftr_s16_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) st->super_twiddles = st->tmpbuf + nfft; kiss_fft_s16_alloc (nfft, inverse_fft, st->substate, &subsize); - for (i = 0; i < nfft; ++i) { - double phase = -3.14159265358979323846264338327 * ((double) i / nfft + .5); + for (i = 0; i < nfft / 2; ++i) { + double phase = + -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); if (inverse_fft) phase *= -1; @@ -118,7 +119,7 @@ kiss_fftr_s16 (kiss_fftr_s16_cfg st, const kiss_fft_s16_scalar * timedata, C_ADD (f1k, fpk, fpnk); C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k]); + C_MUL (tw, f2k, st->super_twiddles[k - 1]); freqdata[k].r = HALF_OF (f1k.r + tw.r); freqdata[k].i = HALF_OF (f1k.i + tw.i); @@ -156,7 +157,7 @@ kiss_fftri_s16 (kiss_fftr_s16_cfg st, const kiss_fft_s16_cpx * freqdata, C_ADD (fek, fk, fnkc); C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k]); + C_MUL (fok, tmp, st->super_twiddles[k - 1]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD diff --git a/gst-libs/gst/fft/kiss_fftr_s32.c b/gst-libs/gst/fft/kiss_fftr_s32.c index e2a8541a5a..260da2962c 100644 --- a/gst-libs/gst/fft/kiss_fftr_s32.c +++ b/gst-libs/gst/fft/kiss_fftr_s32.c @@ -41,7 +41,7 @@ kiss_fftr_s32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) kiss_fft_s32_alloc (nfft, inverse_fft, NULL, &subsize); memneeded = ALIGN_STRUCT (sizeof (struct kiss_fftr_s32_state)) + ALIGN_STRUCT (subsize) - + sizeof (kiss_fft_s32_cpx) * (nfft * 2); + + sizeof (kiss_fft_s32_cpx) * (nfft * 3 / 2); if (lenmem == NULL) { st = (kiss_fftr_s32_cfg) KISS_FFT_S32_MALLOC (memneeded); @@ -59,8 +59,9 @@ kiss_fftr_s32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) st->super_twiddles = st->tmpbuf + nfft; kiss_fft_s32_alloc (nfft, inverse_fft, st->substate, &subsize); - for (i = 0; i < nfft; ++i) { - double phase = -3.14159265358979323846264338327 * ((double) i / nfft + .5); + for (i = 0; i < nfft / 2; ++i) { + double phase = + -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); if (inverse_fft) phase *= -1; @@ -118,7 +119,7 @@ kiss_fftr_s32 (kiss_fftr_s32_cfg st, const kiss_fft_s32_scalar * timedata, C_ADD (f1k, fpk, fpnk); C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k]); + C_MUL (tw, f2k, st->super_twiddles[k - 1]); freqdata[k].r = HALF_OF (f1k.r + tw.r); freqdata[k].i = HALF_OF (f1k.i + tw.i); @@ -156,7 +157,7 @@ kiss_fftri_s32 (kiss_fftr_s32_cfg st, const kiss_fft_s32_cpx * freqdata, C_ADD (fek, fk, fnkc); C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k]); + C_MUL (fok, tmp, st->super_twiddles[k - 1]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD