Skip to content

Commit 21d771f

Browse files
committed
Math: FFT-multi: Add Xtensa HiFi versions for hot code functions
This patch adds HiFi3 versions for functions dft3_32() and fft_multi_execute_32(). The functions are implemented to fft_multi_hifi3.c and the generic versions are moved to fft_multi_generic.c. in MTL platform the optimization saves 119 MCPS, from 237 MCPS to 118 MCPS. The test was done with script run: scripts/rebuild-testbench.sh -p mtl scripts/sof-testbench-helper.sh -x -m stft_process_1536_240_ \ -p profile-stft_process.txt The above STFT used FFT length of 1536 with hop 240. Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent 8e3318b commit 21d771f

File tree

6 files changed

+442
-157
lines changed

6 files changed

+442
-157
lines changed

src/include/sof/math/fft.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,8 @@ void mod_fft_multi_plan_free(struct processing_module *mod, struct fft_multi_pla
9898

9999
/**
100100
* dft3_32() - Discrete Fourier Transform (DFT) for size 3.
101-
* @param input: Pointer to complex values input array, Q1.31
102-
* @param output: Pointer to complex values output array, Q3.29
101+
* @param input: Pointer to complex values input array, Q1.31.
102+
* @param output: Pointer to complex values output array, Q1.31, scaled down by 1/3.
103103
*
104104
* This function is useful for calculating some non power of two FFTs. E.g. the
105105
* FFT for size 1536 is done with three 512 size FFTs and one 3 size DFT.

src/math/fft/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ if(CONFIG_MATH_32BIT_FFT)
1111
endif()
1212

1313
if(CONFIG_MATH_FFT_MULTI)
14-
list(APPEND base_files fft_multi.c)
14+
list(APPEND base_files fft_multi.c fft_multi_generic.c fft_multi_hifi3.c)
1515
endif()
1616

1717
is_zephyr(zephyr)

src/math/fft/fft_multi.c

Lines changed: 0 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,6 @@ LOG_MODULE_REGISTER(math_fft_multi, CONFIG_SOF_LOG_LEVEL);
1919
SOF_DEFINE_REG_UUID(math_fft_multi);
2020
DECLARE_TR_CTX(math_fft_multi_tr, SOF_UUID(math_fft_multi_uuid), LOG_LEVEL_INFO);
2121

22-
/* Constants for size 3 DFT */
23-
#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */
24-
#define DFT3_COEFI 1859775393 /* int32(sqrt(3) / 2 * 2^31) */
25-
#define DFT3_SCALE 715827883 /* int32(1/3*2^31) */
26-
2722
struct fft_multi_plan *mod_fft_multi_plan_new(struct processing_module *mod, void *inb,
2823
void *outb, uint32_t size, int bits)
2924
{
@@ -143,152 +138,3 @@ void mod_fft_multi_plan_free(struct processing_module *mod, struct fft_multi_pla
143138
mod_free(mod, plan->bit_reverse_idx);
144139
mod_free(mod, plan);
145140
}
146-
147-
void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
148-
{
149-
const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
150-
const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI};
151-
struct icomplex32 x[3];
152-
struct icomplex32 p1, p2, sum;
153-
int i;
154-
155-
for (i = 0; i < 3; i++) {
156-
x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
157-
x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
158-
}
159-
160-
/*
161-
* | 1 1 1 |
162-
* c = | 1 c0 c1 | , x = [ x0 x1 x2 ]
163-
* | 1 c1 c0 |
164-
*
165-
* y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(0)
166-
* y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(0)
167-
* y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(0)
168-
*/
169-
170-
/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
171-
icomplex32_adds(&x[0], &x[1], &sum);
172-
icomplex32_adds(&x[2], &sum, &y[0]);
173-
174-
/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
175-
icomplex32_mul(&c0, &x[1], &p1);
176-
icomplex32_mul(&c1, &x[2], &p2);
177-
icomplex32_adds(&p1, &p2, &sum);
178-
icomplex32_adds(&x[0], &sum, &y[1]);
179-
180-
/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
181-
icomplex32_mul(&c1, &x[1], &p1);
182-
icomplex32_mul(&c0, &x[2], &p2);
183-
icomplex32_adds(&p1, &p2, &sum);
184-
icomplex32_adds(&x[0], &sum, &y[2]);
185-
}
186-
187-
void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
188-
{
189-
struct icomplex32 x[FFT_MULTI_COUNT_MAX];
190-
struct icomplex32 y[FFT_MULTI_COUNT_MAX];
191-
struct icomplex32 t, c;
192-
int i, j, k, m;
193-
194-
/* Handle 2^N FFT */
195-
if (plan->num_ffts == 1) {
196-
memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
197-
fft_execute_32(plan->fft_plan[0], ifft);
198-
return;
199-
}
200-
201-
#ifdef DEBUG_DUMP_TO_FILE
202-
FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
203-
FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
204-
FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
205-
FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
206-
#endif
207-
208-
/* convert to complex conjugate for IFFT */
209-
if (ifft) {
210-
for (i = 0; i < plan->total_size; i++)
211-
icomplex32_conj(&plan->inb32[i]);
212-
}
213-
214-
/* Copy input buffers */
215-
k = 0;
216-
for (i = 0; i < plan->fft_size; i++)
217-
for (j = 0; j < plan->num_ffts; j++)
218-
plan->tmp_i32[j][i] = plan->inb32[k++];
219-
220-
/* Clear output buffers and call individual FFTs*/
221-
for (j = 0; j < plan->num_ffts; j++) {
222-
memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
223-
fft_execute_32(plan->fft_plan[j], 0);
224-
}
225-
226-
#ifdef DEBUG_DUMP_TO_FILE
227-
for (j = 0; j < plan->num_ffts; j++)
228-
for (i = 0; i < plan->fft_size; i++)
229-
fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
230-
#endif
231-
232-
/* Multiply with twiddle factors */
233-
m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
234-
for (j = 1; j < plan->num_ffts; j++) {
235-
for (i = 0; i < plan->fft_size; i++) {
236-
c = plan->tmp_o32[j][i];
237-
k = j * i * m;
238-
t.real = multi_twiddle_real_32[k];
239-
t.imag = multi_twiddle_imag_32[k];
240-
//fprintf(fh3, "%d %d\n", t.real, t.imag);
241-
icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
242-
}
243-
}
244-
245-
#ifdef DEBUG_DUMP_TO_FILE
246-
for (j = 0; j < plan->num_ffts; j++)
247-
for (i = 0; i < plan->fft_size; i++)
248-
fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
249-
#endif
250-
251-
/* DFT of size 3 */
252-
j = plan->fft_size;
253-
k = 2 * plan->fft_size;
254-
for (i = 0; i < plan->fft_size; i++) {
255-
x[0] = plan->tmp_o32[0][i];
256-
x[1] = plan->tmp_o32[1][i];
257-
x[2] = plan->tmp_o32[2][i];
258-
dft3_32(x, y);
259-
plan->outb32[i] = y[0];
260-
plan->outb32[i + j] = y[1];
261-
plan->outb32[i + k] = y[2];
262-
}
263-
264-
#ifdef DEBUG_DUMP_TO_FILE
265-
for (i = 0; i < plan->total_size; i++)
266-
fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag);
267-
#endif
268-
269-
/* shift back for IFFT */
270-
271-
/* TODO: Check if time shift method for IFFT is more efficient or more accurate
272-
* tmp = 1 / N * fft(X);
273-
* x = tmp([1 N:-1:2])
274-
*/
275-
if (ifft) {
276-
/*
277-
* no need to divide N as it is already done in the input side
278-
* for Q1.31 format. Instead, we need to multiply N to compensate
279-
* the shrink we did in the FFT transform.
280-
*/
281-
for (i = 0; i < plan->total_size; i++) {
282-
/* Need to negate imag part to match reference */
283-
plan->outb32[i].imag = -plan->outb32[i].imag;
284-
icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
285-
&plan->outb32[i]);
286-
plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
287-
plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
288-
}
289-
}
290-
291-
#ifdef DEBUG_DUMP_TO_FILE
292-
fclose(fh1); fclose(fh2); fclose(fh3); fclose(fh4);
293-
#endif
294-
}

src/math/fft/fft_multi_generic.c

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
// SPDX-License-Identifier: BSD-3-Clause
2+
//
3+
// Copyright(c) 2025-2026 Intel Corporation.
4+
//
5+
// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
6+
7+
#include <sof/audio/format.h>
8+
#include <sof/math/icomplex32.h>
9+
#include <sof/common.h>
10+
#include <sof/math/fft.h>
11+
12+
#ifdef DEBUG_DUMP_TO_FILE
13+
#include <stdio.h>
14+
#endif
15+
16+
/* Twiddle factor tables defined in fft_multi.c via twiddle_3072_32.h */
17+
#define FFT_MULTI_TWIDDLE_SIZE 2048
18+
extern const int32_t multi_twiddle_real_32[];
19+
extern const int32_t multi_twiddle_imag_32[];
20+
21+
/* Constants for size 3 DFT */
22+
#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */
23+
#define DFT3_COEFI 1859775393 /* int32(sqrt(3) / 2 * 2^31) */
24+
#define DFT3_SCALE 715827883 /* int32(1/3*2^31) */
25+
26+
#ifdef FFT_GENERIC
27+
28+
void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
29+
{
30+
const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
31+
const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI};
32+
struct icomplex32 x[3];
33+
struct icomplex32 p1, p2, sum;
34+
int i;
35+
36+
for (i = 0; i < 3; i++) {
37+
x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
38+
x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
39+
}
40+
41+
/*
42+
* | 1 1 1 |
43+
* c = | 1 c0 c1 | , x = [ x0 x1 x2 ]
44+
* | 1 c1 c0 |
45+
*
46+
* y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(2)
47+
* y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(2)
48+
* y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(2)
49+
*/
50+
51+
/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
52+
icomplex32_adds(&x[0], &x[1], &sum);
53+
icomplex32_adds(&x[2], &sum, &y[0]);
54+
55+
/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
56+
icomplex32_mul(&c0, &x[1], &p1);
57+
icomplex32_mul(&c1, &x[2], &p2);
58+
icomplex32_adds(&p1, &p2, &sum);
59+
icomplex32_adds(&x[0], &sum, &y[1]);
60+
61+
/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
62+
icomplex32_mul(&c1, &x[1], &p1);
63+
icomplex32_mul(&c0, &x[2], &p2);
64+
icomplex32_adds(&p1, &p2, &sum);
65+
icomplex32_adds(&x[0], &sum, &y[2]);
66+
}
67+
68+
void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
69+
{
70+
struct icomplex32 x[FFT_MULTI_COUNT_MAX];
71+
struct icomplex32 y[FFT_MULTI_COUNT_MAX];
72+
struct icomplex32 t, c;
73+
int i, j, k, m;
74+
75+
/* Handle 2^N FFT */
76+
if (plan->num_ffts == 1) {
77+
memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
78+
fft_execute_32(plan->fft_plan[0], ifft);
79+
return;
80+
}
81+
82+
#ifdef DEBUG_DUMP_TO_FILE
83+
FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
84+
FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
85+
FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
86+
FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
87+
#endif
88+
89+
/* convert to complex conjugate for IFFT */
90+
if (ifft) {
91+
for (i = 0; i < plan->total_size; i++)
92+
icomplex32_conj(&plan->inb32[i]);
93+
}
94+
95+
/* Copy input buffers */
96+
k = 0;
97+
for (i = 0; i < plan->fft_size; i++)
98+
for (j = 0; j < plan->num_ffts; j++)
99+
plan->tmp_i32[j][i] = plan->inb32[k++];
100+
101+
/* Clear output buffers and call individual FFTs*/
102+
for (j = 0; j < plan->num_ffts; j++) {
103+
memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
104+
fft_execute_32(plan->fft_plan[j], 0);
105+
}
106+
107+
#ifdef DEBUG_DUMP_TO_FILE
108+
for (j = 0; j < plan->num_ffts; j++)
109+
for (i = 0; i < plan->fft_size; i++)
110+
fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
111+
#endif
112+
113+
/* Multiply with twiddle factors */
114+
m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
115+
for (j = 1; j < plan->num_ffts; j++) {
116+
for (i = 0; i < plan->fft_size; i++) {
117+
c = plan->tmp_o32[j][i];
118+
k = j * i * m;
119+
t.real = multi_twiddle_real_32[k];
120+
t.imag = multi_twiddle_imag_32[k];
121+
// fprintf(fh3, "%d %d\n", t.real, t.imag);
122+
icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
123+
}
124+
}
125+
126+
#ifdef DEBUG_DUMP_TO_FILE
127+
for (j = 0; j < plan->num_ffts; j++)
128+
for (i = 0; i < plan->fft_size; i++)
129+
fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
130+
#endif
131+
132+
/* DFT of size 3 */
133+
j = plan->fft_size;
134+
k = 2 * plan->fft_size;
135+
for (i = 0; i < plan->fft_size; i++) {
136+
x[0] = plan->tmp_o32[0][i];
137+
x[1] = plan->tmp_o32[1][i];
138+
x[2] = plan->tmp_o32[2][i];
139+
dft3_32(x, y);
140+
plan->outb32[i] = y[0];
141+
plan->outb32[i + j] = y[1];
142+
plan->outb32[i + k] = y[2];
143+
}
144+
145+
#ifdef DEBUG_DUMP_TO_FILE
146+
for (i = 0; i < plan->total_size; i++)
147+
fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag);
148+
#endif
149+
150+
/* shift back for IFFT */
151+
152+
/* TODO: Check if time shift method for IFFT is more efficient or more accurate
153+
* tmp = 1 / N * fft(X);
154+
* x = tmp([1 N:-1:2])
155+
*/
156+
if (ifft) {
157+
/*
158+
* no need to divide N as it is already done in the input side
159+
* for Q1.31 format. Instead, we need to multiply N to compensate
160+
* the shrink we did in the FFT transform.
161+
*/
162+
for (i = 0; i < plan->total_size; i++) {
163+
/* Need to negate imag part to match reference */
164+
plan->outb32[i].imag = -plan->outb32[i].imag;
165+
icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
166+
&plan->outb32[i]);
167+
plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
168+
plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
169+
}
170+
}
171+
172+
#ifdef DEBUG_DUMP_TO_FILE
173+
fclose(fh1);
174+
fclose(fh2);
175+
fclose(fh3);
176+
fclose(fh4);
177+
#endif
178+
}
179+
180+
#endif /* FFT_GENERIC */

0 commit comments

Comments
 (0)