Skip to content

Commit 9e70d76

Browse files
committed
Math: FFT-multi: Add Xtensa HiFi versions for hot code functions
This patch adds HiFi3 versions for functions dft3_32() and fft_multi_execute_32(). The functions are implemented to fft_multi_hifi3.c and the generic versions are moved to fft_multi_generic.c. in MTL platform the optimization saves 119 MCPS, from 237 MCPS to 118 MCPS. The test was done with script run: scripts/rebuild-testbench.sh -p mtl scripts/sof-testbench-helper.sh -x -m stft_process_1536_240_ \ -p profile-stft_process.txt The above STFT used FFT length of 1536 with hop 240. Signed-off-by: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
1 parent 8e3318b commit 9e70d76

File tree

5 files changed

+436
-155
lines changed

5 files changed

+436
-155
lines changed

src/math/fft/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ if(CONFIG_MATH_32BIT_FFT)
1111
endif()
1212

1313
if(CONFIG_MATH_FFT_MULTI)
14-
list(APPEND base_files fft_multi.c)
14+
list(APPEND base_files fft_multi.c fft_multi_generic.c fft_multi_hifi3.c)
1515
endif()
1616

1717
is_zephyr(zephyr)

src/math/fft/fft_multi.c

Lines changed: 0 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,6 @@ LOG_MODULE_REGISTER(math_fft_multi, CONFIG_SOF_LOG_LEVEL);
1919
SOF_DEFINE_REG_UUID(math_fft_multi);
2020
DECLARE_TR_CTX(math_fft_multi_tr, SOF_UUID(math_fft_multi_uuid), LOG_LEVEL_INFO);
2121

22-
/* Constants for size 3 DFT */
23-
#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */
24-
#define DFT3_COEFI 1859775393 /* int32(sqrt(3) / 2 * 2^31) */
25-
#define DFT3_SCALE 715827883 /* int32(1/3*2^31) */
26-
2722
struct fft_multi_plan *mod_fft_multi_plan_new(struct processing_module *mod, void *inb,
2823
void *outb, uint32_t size, int bits)
2924
{
@@ -143,152 +138,3 @@ void mod_fft_multi_plan_free(struct processing_module *mod, struct fft_multi_pla
143138
mod_free(mod, plan->bit_reverse_idx);
144139
mod_free(mod, plan);
145140
}
146-
147-
void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
148-
{
149-
const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
150-
const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI};
151-
struct icomplex32 x[3];
152-
struct icomplex32 p1, p2, sum;
153-
int i;
154-
155-
for (i = 0; i < 3; i++) {
156-
x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
157-
x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
158-
}
159-
160-
/*
161-
* | 1 1 1 |
162-
* c = | 1 c0 c1 | , x = [ x0 x1 x2 ]
163-
* | 1 c1 c0 |
164-
*
165-
* y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(0)
166-
* y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(0)
167-
* y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(0)
168-
*/
169-
170-
/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
171-
icomplex32_adds(&x[0], &x[1], &sum);
172-
icomplex32_adds(&x[2], &sum, &y[0]);
173-
174-
/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
175-
icomplex32_mul(&c0, &x[1], &p1);
176-
icomplex32_mul(&c1, &x[2], &p2);
177-
icomplex32_adds(&p1, &p2, &sum);
178-
icomplex32_adds(&x[0], &sum, &y[1]);
179-
180-
/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
181-
icomplex32_mul(&c1, &x[1], &p1);
182-
icomplex32_mul(&c0, &x[2], &p2);
183-
icomplex32_adds(&p1, &p2, &sum);
184-
icomplex32_adds(&x[0], &sum, &y[2]);
185-
}
186-
187-
void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
188-
{
189-
struct icomplex32 x[FFT_MULTI_COUNT_MAX];
190-
struct icomplex32 y[FFT_MULTI_COUNT_MAX];
191-
struct icomplex32 t, c;
192-
int i, j, k, m;
193-
194-
/* Handle 2^N FFT */
195-
if (plan->num_ffts == 1) {
196-
memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
197-
fft_execute_32(plan->fft_plan[0], ifft);
198-
return;
199-
}
200-
201-
#ifdef DEBUG_DUMP_TO_FILE
202-
FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
203-
FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
204-
FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
205-
FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
206-
#endif
207-
208-
/* convert to complex conjugate for IFFT */
209-
if (ifft) {
210-
for (i = 0; i < plan->total_size; i++)
211-
icomplex32_conj(&plan->inb32[i]);
212-
}
213-
214-
/* Copy input buffers */
215-
k = 0;
216-
for (i = 0; i < plan->fft_size; i++)
217-
for (j = 0; j < plan->num_ffts; j++)
218-
plan->tmp_i32[j][i] = plan->inb32[k++];
219-
220-
/* Clear output buffers and call individual FFTs*/
221-
for (j = 0; j < plan->num_ffts; j++) {
222-
memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
223-
fft_execute_32(plan->fft_plan[j], 0);
224-
}
225-
226-
#ifdef DEBUG_DUMP_TO_FILE
227-
for (j = 0; j < plan->num_ffts; j++)
228-
for (i = 0; i < plan->fft_size; i++)
229-
fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
230-
#endif
231-
232-
/* Multiply with twiddle factors */
233-
m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
234-
for (j = 1; j < plan->num_ffts; j++) {
235-
for (i = 0; i < plan->fft_size; i++) {
236-
c = plan->tmp_o32[j][i];
237-
k = j * i * m;
238-
t.real = multi_twiddle_real_32[k];
239-
t.imag = multi_twiddle_imag_32[k];
240-
//fprintf(fh3, "%d %d\n", t.real, t.imag);
241-
icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
242-
}
243-
}
244-
245-
#ifdef DEBUG_DUMP_TO_FILE
246-
for (j = 0; j < plan->num_ffts; j++)
247-
for (i = 0; i < plan->fft_size; i++)
248-
fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
249-
#endif
250-
251-
/* DFT of size 3 */
252-
j = plan->fft_size;
253-
k = 2 * plan->fft_size;
254-
for (i = 0; i < plan->fft_size; i++) {
255-
x[0] = plan->tmp_o32[0][i];
256-
x[1] = plan->tmp_o32[1][i];
257-
x[2] = plan->tmp_o32[2][i];
258-
dft3_32(x, y);
259-
plan->outb32[i] = y[0];
260-
plan->outb32[i + j] = y[1];
261-
plan->outb32[i + k] = y[2];
262-
}
263-
264-
#ifdef DEBUG_DUMP_TO_FILE
265-
for (i = 0; i < plan->total_size; i++)
266-
fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag);
267-
#endif
268-
269-
/* shift back for IFFT */
270-
271-
/* TODO: Check if time shift method for IFFT is more efficient or more accurate
272-
* tmp = 1 / N * fft(X);
273-
* x = tmp([1 N:-1:2])
274-
*/
275-
if (ifft) {
276-
/*
277-
* no need to divide N as it is already done in the input side
278-
* for Q1.31 format. Instead, we need to multiply N to compensate
279-
* the shrink we did in the FFT transform.
280-
*/
281-
for (i = 0; i < plan->total_size; i++) {
282-
/* Need to negate imag part to match reference */
283-
plan->outb32[i].imag = -plan->outb32[i].imag;
284-
icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
285-
&plan->outb32[i]);
286-
plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
287-
plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
288-
}
289-
}
290-
291-
#ifdef DEBUG_DUMP_TO_FILE
292-
fclose(fh1); fclose(fh2); fclose(fh3); fclose(fh4);
293-
#endif
294-
}

src/math/fft/fft_multi_generic.c

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
// SPDX-License-Identifier: BSD-3-Clause
2+
//
3+
// Copyright(c) 2025-2026 Intel Corporation.
4+
//
5+
// Author: Seppo Ingalsuo <seppo.ingalsuo@linux.intel.com>
6+
7+
#include <sof/audio/format.h>
8+
#include <sof/math/icomplex32.h>
9+
#include <sof/common.h>
10+
#include <sof/math/fft.h>
11+
12+
/* Twiddle factor tables defined in fft_multi.c via twiddle_3072_32.h */
13+
#define FFT_MULTI_TWIDDLE_SIZE 2048
14+
extern const int32_t multi_twiddle_real_32[];
15+
extern const int32_t multi_twiddle_imag_32[];
16+
17+
/* Constants for size 3 DFT */
18+
#define DFT3_COEFR -1073741824 /* int32(-0.5 * 2^31) */
19+
#define DFT3_COEFI 1859775393 /* int32(sqrt(3) / 2 * 2^31) */
20+
#define DFT3_SCALE 715827883 /* int32(1/3*2^31) */
21+
22+
#ifdef FFT_GENERIC
23+
24+
void dft3_32(struct icomplex32 *x_in, struct icomplex32 *y)
25+
{
26+
const struct icomplex32 c0 = {DFT3_COEFR, -DFT3_COEFI};
27+
const struct icomplex32 c1 = {DFT3_COEFR, DFT3_COEFI};
28+
struct icomplex32 x[3];
29+
struct icomplex32 p1, p2, sum;
30+
int i;
31+
32+
for (i = 0; i < 3; i++) {
33+
x[i].real = Q_MULTSR_32X32((int64_t)x_in[i].real, DFT3_SCALE, 31, 31, 31);
34+
x[i].imag = Q_MULTSR_32X32((int64_t)x_in[i].imag, DFT3_SCALE, 31, 31, 31);
35+
}
36+
37+
/*
38+
* | 1 1 1 |
39+
* c = | 1 c0 c1 | , x = [ x0 x1 x2 ]
40+
* | 1 c1 c0 |
41+
*
42+
* y(0) = c(0,0) * x(0) + c(1,0) * x(1) + c(2,0) * x(2)
43+
* y(1) = c(0,1) * x(0) + c(1,1) * x(1) + c(2,1) * x(2)
44+
* y(2) = c(0,2) * x(0) + c(1,2) * x(1) + c(2,2) * x(2)
45+
*/
46+
47+
/* y(0) = 1 * x(0) + 1 * x(1) + 1 * x(2) */
48+
icomplex32_adds(&x[0], &x[1], &sum);
49+
icomplex32_adds(&x[2], &sum, &y[0]);
50+
51+
/* y(1) = 1 * x(0) + c0 * x(1) + c1 * x(2) */
52+
icomplex32_mul(&c0, &x[1], &p1);
53+
icomplex32_mul(&c1, &x[2], &p2);
54+
icomplex32_adds(&p1, &p2, &sum);
55+
icomplex32_adds(&x[0], &sum, &y[1]);
56+
57+
/* y(2) = 1 * x(0) + c1 * x(1) + c0 * x(2) */
58+
icomplex32_mul(&c1, &x[1], &p1);
59+
icomplex32_mul(&c0, &x[2], &p2);
60+
icomplex32_adds(&p1, &p2, &sum);
61+
icomplex32_adds(&x[0], &sum, &y[2]);
62+
}
63+
64+
void fft_multi_execute_32(struct fft_multi_plan *plan, bool ifft)
65+
{
66+
struct icomplex32 x[FFT_MULTI_COUNT_MAX];
67+
struct icomplex32 y[FFT_MULTI_COUNT_MAX];
68+
struct icomplex32 t, c;
69+
int i, j, k, m;
70+
71+
/* Handle 2^N FFT */
72+
if (plan->num_ffts == 1) {
73+
memset(plan->outb32, 0, plan->fft_size * sizeof(struct icomplex32));
74+
fft_execute_32(plan->fft_plan[0], ifft);
75+
return;
76+
}
77+
78+
#ifdef DEBUG_DUMP_TO_FILE
79+
FILE *fh1 = fopen("debug_fft_multi_int1.txt", "w");
80+
FILE *fh2 = fopen("debug_fft_multi_int2.txt", "w");
81+
FILE *fh3 = fopen("debug_fft_multi_twiddle.txt", "w");
82+
FILE *fh4 = fopen("debug_fft_multi_dft_out.txt", "w");
83+
#endif
84+
85+
/* convert to complex conjugate for IFFT */
86+
if (ifft) {
87+
for (i = 0; i < plan->total_size; i++)
88+
icomplex32_conj(&plan->inb32[i]);
89+
}
90+
91+
/* Copy input buffers */
92+
k = 0;
93+
for (i = 0; i < plan->fft_size; i++)
94+
for (j = 0; j < plan->num_ffts; j++)
95+
plan->tmp_i32[j][i] = plan->inb32[k++];
96+
97+
/* Clear output buffers and call individual FFTs*/
98+
for (j = 0; j < plan->num_ffts; j++) {
99+
memset(&plan->tmp_o32[j][0], 0, plan->fft_size * sizeof(struct icomplex32));
100+
fft_execute_32(plan->fft_plan[j], 0);
101+
}
102+
103+
#ifdef DEBUG_DUMP_TO_FILE
104+
for (j = 0; j < plan->num_ffts; j++)
105+
for (i = 0; i < plan->fft_size; i++)
106+
fprintf(fh1, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
107+
#endif
108+
109+
/* Multiply with twiddle factors */
110+
m = FFT_MULTI_TWIDDLE_SIZE / 2 / plan->fft_size;
111+
for (j = 1; j < plan->num_ffts; j++) {
112+
for (i = 0; i < plan->fft_size; i++) {
113+
c = plan->tmp_o32[j][i];
114+
k = j * i * m;
115+
t.real = multi_twiddle_real_32[k];
116+
t.imag = multi_twiddle_imag_32[k];
117+
// fprintf(fh3, "%d %d\n", t.real, t.imag);
118+
icomplex32_mul(&t, &c, &plan->tmp_o32[j][i]);
119+
}
120+
}
121+
122+
#ifdef DEBUG_DUMP_TO_FILE
123+
for (j = 0; j < plan->num_ffts; j++)
124+
for (i = 0; i < plan->fft_size; i++)
125+
fprintf(fh2, "%d %d\n", plan->tmp_o32[j][i].real, plan->tmp_o32[j][i].imag);
126+
#endif
127+
128+
/* DFT of size 3 */
129+
j = plan->fft_size;
130+
k = 2 * plan->fft_size;
131+
for (i = 0; i < plan->fft_size; i++) {
132+
x[0] = plan->tmp_o32[0][i];
133+
x[1] = plan->tmp_o32[1][i];
134+
x[2] = plan->tmp_o32[2][i];
135+
dft3_32(x, y);
136+
plan->outb32[i] = y[0];
137+
plan->outb32[i + j] = y[1];
138+
plan->outb32[i + k] = y[2];
139+
}
140+
141+
#ifdef DEBUG_DUMP_TO_FILE
142+
for (i = 0; i < plan->total_size; i++)
143+
fprintf(fh4, "%d %d\n", plan->outb32[i].real, plan->outb32[i].imag);
144+
#endif
145+
146+
/* shift back for IFFT */
147+
148+
/* TODO: Check if time shift method for IFFT is more efficient or more accurate
149+
* tmp = 1 / N * fft(X);
150+
* x = tmp([1 N:-1:2])
151+
*/
152+
if (ifft) {
153+
/*
154+
* no need to divide N as it is already done in the input side
155+
* for Q1.31 format. Instead, we need to multiply N to compensate
156+
* the shrink we did in the FFT transform.
157+
*/
158+
for (i = 0; i < plan->total_size; i++) {
159+
/* Need to negate imag part to match reference */
160+
plan->outb32[i].imag = -plan->outb32[i].imag;
161+
icomplex32_shift(&plan->outb32[i], plan->fft_plan[0]->len,
162+
&plan->outb32[i]);
163+
plan->outb32[i].real = sat_int32((int64_t)plan->outb32[i].real * 3);
164+
plan->outb32[i].imag = sat_int32((int64_t)plan->outb32[i].imag * 3);
165+
}
166+
}
167+
168+
#ifdef DEBUG_DUMP_TO_FILE
169+
fclose(fh1);
170+
fclose(fh2);
171+
fclose(fh3);
172+
fclose(fh4);
173+
#endif
174+
}
175+
176+
#endif /* FFT_GENERIC */

0 commit comments

Comments
 (0)