Math: FFT: Optimize fft_execute_32() HiFi code version

singalsu · singalsu · commit 8e3318bb7e6b · 2026-03-20T17:28:04.000+02:00
This patch optimizes the cycle count of the radix-2 Cooley-Tukey
implementation with three changes:

- Dedicated depth-1 stage: all N/2 butterflies use a real twiddle
  factor W^0 = 1+0j, so the complex multiply is replaced by plain
  add or subtract.

- Skip multiply for j=0 in stages >= 2: the first butterfly in every
  group also uses W^0, saving an additional ~N/2 complex multiplications
  across all remaining stages.

- Pointer arithmetic: replace per-butterfly index arithmetic
  (outx[k+j], outx[k+j+n], twiddle[i*j]) with auto-incrementing
  pointers and strided twiddle access (tw_r += stride), eliminating
  integer multiplies for address computation.

This change saves 11 MCPS (from 74 MCPS to 63 MCPS) in the STFT Process
module on the MTL platform with 1024/256 size/hop FFT processing. It was
tested with the scripts:

scripts/rebuild-testbench.sh -p mtl
scripts/sof-testbench-helper.sh -x -m stft_process_1024_256_ \
  -p profile-stft_process.txt

Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
diff --git a/src/math/fft/fft_32_hifi3.c b/src/math/fft/fft_32_hifi3.c
@@ -19,13 +19,16 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 	ae_int32x2 sample;
 	ae_int32x2 sample1;
 	ae_int32x2 sample2;
+	ae_int32x2 tw;
 	ae_int32x2 *inx = (ae_int32x2 *)plan->inb32;
 	ae_int32x2 *outx = (ae_int32x2 *)plan->outb32;
-	ae_int32x2 *outtop;
-	ae_int32x2 *outbottom;
+	ae_int32x2 *top_ptr;
+	ae_int32x2 *bot_ptr;
 	uint16_t *idx = &plan->bit_reverse_idx[0];
-	int depth, top, bottom, index;
-	int i, j, k, m, n;
+	const int32_t *tw_r;
+	const int32_t *tw_i;
+	int depth, i;
+	int j, k, m, n;
 	int size = plan->size;
 	int len = plan->len;
 
@@ -35,7 +38,6 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 	if (!plan->inb32 || !plan->outb32)
 		return;
 
-	/* convert to complex conjugate for ifft */
 	/* step 1: re-arrange input in bit reverse order, and shrink the level to avoid overflow */
 	if (ifft) {
 		/* convert to complex conjugate for ifft */
@@ -54,43 +56,82 @@ void fft_execute_32(struct fft_plan *plan, bool ifft)
 		}
 	}
 
-	/* step 2: loop to do FFT transform in smaller size */
-	for (depth = 1; depth <= len; ++depth) {
+	/*
+	 * Step 2a: First FFT stage (depth=1, m=2, n=1).
+	 * All butterflies use twiddle factor W^0 = 1+0j,
+	 * so the complex multiply is skipped entirely.
+	 */
+	top_ptr = outx;
+	bot_ptr = outx + 1;
+	for (k = 0; k < size; k += 2) {
+		sample1 = AE_L32X2_I(top_ptr, 0);
+		sample2 = AE_L32X2_I(bot_ptr, 0);
+		sample = AE_ADD32S(sample1, sample2);
+		AE_S32X2_I(sample, top_ptr, 0);
+		sample = AE_SUB32S(sample1, sample2);
+		AE_S32X2_I(sample, bot_ptr, 0);
+		top_ptr += 2;
+		bot_ptr += 2;
+	}
+
+	/* Step 2b: Remaining FFT stages (depth >= 2) */
+	for (depth = 2; depth <= len; ++depth) {
 		m = 1 << depth;
 		n = m >> 1;
 		i = FFT_SIZE_MAX >> depth;
 
+		top_ptr = outx;
+		bot_ptr = outx + n;
+
 		/* doing FFT transforms in size m */
 		for (k = 0; k < size; k += m) {
-			/* doing one FFT transform for size m */
-			for (j = 0; j < n; ++j) {
-				index = i * j;
-				top = k + j;
-				bottom = top + n;
+			/*
+			 * j=0: twiddle factor W^0 = 1+0j,
+			 * butterfly without complex multiply.
+			 */
+			sample1 = AE_L32X2_I(top_ptr, 0);
+			sample = AE_L32X2_I(bot_ptr, 0);
+			sample2 = AE_ADD32S(sample1, sample);
+			AE_S32X2_I(sample2, top_ptr, 0);
+			sample2 = AE_SUB32S(sample1, sample);
+			AE_S32X2_I(sample2, bot_ptr, 0);
+			top_ptr++;
+			bot_ptr++;
 
-				/* load twiddle factor to sample1 */
-				sample1 = twiddle_real_32[index];
-				sample2 = twiddle_imag_32[index];
-				sample1 = AE_SEL32_LH(sample1, sample2);
+			/* j=1..n-1: full butterfly with twiddle multiply */
+			tw_r = &twiddle_real_32[i];
+			tw_i = &twiddle_imag_32[i];
+			for (j = 1; j < n; ++j) {
+				/* load and combine twiddle factor {real, imag} */
+				sample1 = tw_r[0];
+				sample2 = tw_i[0];
+				tw = AE_SEL32_LH(sample1, sample2);
 
 				/* calculate the accumulator: twiddle * bottom */
-				sample2 = outx[bottom];
-				res = AE_MULF32S_HH(sample1, sample2);
-				AE_MULSF32S_LL(res, sample1, sample2);
-				res1 = AE_MULF32S_HL(sample1, sample2);
-				AE_MULAF32S_LH(res1, sample1, sample2);
+				sample2 = AE_L32X2_I(bot_ptr, 0);
+				res = AE_MULF32S_HH(tw, sample2);
+				AE_MULSF32S_LL(res, tw, sample2);
+				res1 = AE_MULF32S_HL(tw, sample2);
+				AE_MULAF32S_LH(res1, tw, sample2);
 				sample = AE_ROUND32X2F64SSYM(res, res1);
-				sample1 = outx[top];
+				sample1 = AE_L32X2_I(top_ptr, 0);
+
 				/* calculate the top output: top = top + accumulate */
 				sample2 = AE_ADD32S(sample1, sample);
-				outtop = outx + top;
-				AE_S32X2_I(sample2, outtop, 0);
+				AE_S32X2_I(sample2, top_ptr, 0);
 
 				/* calculate the bottom output: bottom = top - accumulate */
 				sample2 = AE_SUB32S(sample1, sample);
-				outbottom = outx + bottom;
-				AE_S32X2_I(sample2, outbottom, 0);
+				AE_S32X2_I(sample2, bot_ptr, 0);
+
+				top_ptr++;
+				bot_ptr++;
+				tw_r += i;
+				tw_i += i;
 			}
+			/* advance pointers past current group's bottom half */
+			top_ptr += n;
+			bot_ptr += n;
 		}
 	}