Skip to content

Commit 8e3318b

Browse files
committed
Math: FFT: Optimize fft_execute_32() HiFi code version
This patch optimizes the cycle count of the radix-2 Cooley-Tukey implementation with with three changes: - Dedicated depth-1 stage: all N/2 butterflies use a real twiddle factor W^0 = 1+0j, so the complex multiply is replaced by plain add or subtract. - Skip multiply for j=0 in stages >= 2: The first butterfly in every group also uses W^0, saving an additional ~N/2 complex multiplications across all remaining stages. - Pointer arithmetic: replace per-butterfly index arithmetic (outx[k+j], outx[k+j+n], twiddle[i*j]) with auto-incrementing pointers and strided twiddle access (tw_r += stride), eliminating integer multiplies for address computation. This change saves 11 MCPS (from 74 MCPS to 63 MCPS) in STFT Process module in MTL platform with 1024/256 size/hop FFT processing. It was tested with scripts: scripts/rebuild-testbench.sh -p mtl scripts/sof-testbench-helper.sh -x -m stft_process_1024_256_ \ -p profile-stft_process.txt Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent 678de87 commit 8e3318b

File tree

1 file changed

+67
-26
lines changed

1 file changed

+67
-26
lines changed

src/math/fft/fft_32_hifi3.c

Lines changed: 67 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,16 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
1919
ae_int32x2 sample;
2020
ae_int32x2 sample1;
2121
ae_int32x2 sample2;
22+
ae_int32x2 tw;
2223
ae_int32x2 *inx = (ae_int32x2 *)plan->inb32;
2324
ae_int32x2 *outx = (ae_int32x2 *)plan->outb32;
24-
ae_int32x2 *outtop;
25-
ae_int32x2 *outbottom;
25+
ae_int32x2 *top_ptr;
26+
ae_int32x2 *bot_ptr;
2627
uint16_t *idx = &plan->bit_reverse_idx[0];
27-
int depth, top, bottom, index;
28-
int i, j, k, m, n;
28+
const int32_t *tw_r;
29+
const int32_t *tw_i;
30+
int depth, i;
31+
int j, k, m, n;
2932
int size = plan->size;
3033
int len = plan->len;
3134

@@ -35,7 +38,6 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
3538
if (!plan->inb32 || !plan->outb32)
3639
return;
3740

38-
/* convert to complex conjugate for ifft */
3941
/* step 1: re-arrange input in bit reverse order, and shrink the level to avoid overflow */
4042
if (ifft) {
4143
/* convert to complex conjugate for ifft */
@@ -54,43 +56,82 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
5456
}
5557
}
5658

57-
/* step 2: loop to do FFT transform in smaller size */
58-
for (depth = 1; depth <= len; ++depth) {
59+
/*
60+
* Step 2a: First FFT stage (depth=1, m=2, n=1).
61+
* All butterflies use twiddle factor W^0 = 1+0j,
62+
* so the complex multiply is skipped entirely.
63+
*/
64+
top_ptr = outx;
65+
bot_ptr = outx + 1;
66+
for (k = 0; k < size; k += 2) {
67+
sample1 = AE_L32X2_I(top_ptr, 0);
68+
sample2 = AE_L32X2_I(bot_ptr, 0);
69+
sample = AE_ADD32S(sample1, sample2);
70+
AE_S32X2_I(sample, top_ptr, 0);
71+
sample = AE_SUB32S(sample1, sample2);
72+
AE_S32X2_I(sample, bot_ptr, 0);
73+
top_ptr += 2;
74+
bot_ptr += 2;
75+
}
76+
77+
/* Step 2b: Remaining FFT stages (depth >= 2) */
78+
for (depth = 2; depth <= len; ++depth) {
5979
m = 1 << depth;
6080
n = m >> 1;
6181
i = FFT_SIZE_MAX >> depth;
6282

83+
top_ptr = outx;
84+
bot_ptr = outx + n;
85+
6386
/* doing FFT transforms in size m */
6487
for (k = 0; k < size; k += m) {
65-
/* doing one FFT transform for size m */
66-
for (j = 0; j < n; ++j) {
67-
index = i * j;
68-
top = k + j;
69-
bottom = top + n;
88+
/*
89+
* j=0: twiddle factor W^0 = 1+0j,
90+
* butterfly without complex multiply.
91+
*/
92+
sample1 = AE_L32X2_I(top_ptr, 0);
93+
sample = AE_L32X2_I(bot_ptr, 0);
94+
sample2 = AE_ADD32S(sample1, sample);
95+
AE_S32X2_I(sample2, top_ptr, 0);
96+
sample2 = AE_SUB32S(sample1, sample);
97+
AE_S32X2_I(sample2, bot_ptr, 0);
98+
top_ptr++;
99+
bot_ptr++;
70100

71-
/* load twiddle factor to sample1 */
72-
sample1 = twiddle_real_32[index];
73-
sample2 = twiddle_imag_32[index];
74-
sample1 = AE_SEL32_LH(sample1, sample2);
101+
/* j=1..n-1: full butterfly with twiddle multiply */
102+
tw_r = &twiddle_real_32[i];
103+
tw_i = &twiddle_imag_32[i];
104+
for (j = 1; j < n; ++j) {
105+
/* load and combine twiddle factor {real, imag} */
106+
sample1 = tw_r[0];
107+
sample2 = tw_i[0];
108+
tw = AE_SEL32_LH(sample1, sample2);
75109

76110
/* calculate the accumulator: twiddle * bottom */
77-
sample2 = outx[bottom];
78-
res = AE_MULF32S_HH(sample1, sample2);
79-
AE_MULSF32S_LL(res, sample1, sample2);
80-
res1 = AE_MULF32S_HL(sample1, sample2);
81-
AE_MULAF32S_LH(res1, sample1, sample2);
111+
sample2 = AE_L32X2_I(bot_ptr, 0);
112+
res = AE_MULF32S_HH(tw, sample2);
113+
AE_MULSF32S_LL(res, tw, sample2);
114+
res1 = AE_MULF32S_HL(tw, sample2);
115+
AE_MULAF32S_LH(res1, tw, sample2);
82116
sample = AE_ROUND32X2F64SSYM(res, res1);
83-
sample1 = outx[top];
117+
sample1 = AE_L32X2_I(top_ptr, 0);
118+
84119
/* calculate the top output: top = top + accumulate */
85120
sample2 = AE_ADD32S(sample1, sample);
86-
outtop = outx + top;
87-
AE_S32X2_I(sample2, outtop, 0);
121+
AE_S32X2_I(sample2, top_ptr, 0);
88122

89123
/* calculate the bottom output: bottom = top - accumulate */
90124
sample2 = AE_SUB32S(sample1, sample);
91-
outbottom = outx + bottom;
92-
AE_S32X2_I(sample2, outbottom, 0);
125+
AE_S32X2_I(sample2, bot_ptr, 0);
126+
127+
top_ptr++;
128+
bot_ptr++;
129+
tw_r += i;
130+
tw_i += i;
93131
}
132+
/* advance pointers past current group's bottom half */
133+
top_ptr += n;
134+
bot_ptr += n;
94135
}
95136
}
96137

0 commit comments

Comments
 (0)