Skip to content

Commit cee9550

Browse files
pramods-cad and veblush authored
Optimization in 8x16 lstm layer FC for HiFi. (#3227)
* Optimization in 8x16 lstm layer for HiFi by using xa_nn_matXvec_v2_sym8sxsym16s_sym16s with appropriate conditions for FC.
* Updated HiFi4/HiFi5 NNLibs to 30 Sept 2025 release, made necessary changes in kernel integration code.
* Fixed formatting errors.
---------
Co-authored-by: Esun Kim <[email protected]>
1 parent 11a8f50 commit cee9550

File tree

5 files changed

+44
-279
lines changed

5 files changed

+44
-279
lines changed

tensorflow/lite/micro/kernels/xtensa/conv_hifi.cc

Lines changed: 11 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,7 @@ TfLiteStatus ConvPrepareHifi(TfLiteContext* context, TfLiteNode* node) {
8181
const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
8282
const int filter_height = filter_shape.Dims(1);
8383
const int filter_width = filter_shape.Dims(2);
84+
const int filter_depth = filter_shape.Dims(3);
8485
const int output_height = output_shape.Dims(1);
8586
const int output_width = output_shape.Dims(2);
8687
const int output_channels = output_shape.Dims(3);
@@ -94,43 +95,19 @@ TfLiteStatus ConvPrepareHifi(TfLiteContext* context, TfLiteNode* node) {
9495
if ((params->dilation_width_factor == 1) &&
9596
(params->dilation_height_factor == 1)) {
9697
if (input->type == kTfLiteInt8) {
97-
if (input_height == 1 && filter_height == 1 && output_height == 1) {
98-
int inp_h, filt_h, filt_w, str_h, pad_h, out_h;
99-
inp_h = input_width;
100-
filt_h = filter_width;
101-
filt_w = filter_height;
102-
str_h = stride_width;
103-
pad_h = pad_width;
104-
out_h = output_width;
105-
required_scratch = xa_nn_conv2d_std_getsize(
106-
inp_h, input_depth, filt_h, filt_w, str_h, pad_h, out_h,
107-
output_channels, PREC_ASYM8S);
108-
} else {
109-
required_scratch = xa_nn_conv2d_std_getsize(
110-
input_height, input_depth, filter_height, filter_width,
111-
stride_height, pad_height, output_height, output_channels,
112-
PREC_ASYM8S);
113-
}
98+
required_scratch = xa_nn_conv2d_std_getsize(
99+
input_height, input_width, input_depth, filter_height, filter_width,
100+
filter_depth, stride_height, pad_height, stride_width, pad_width,
101+
output_height, output_width, output_channels, PREC_ASYM8S, PREC_SYM8S,
102+
1, 1, 0);
114103
TF_LITE_ENSURE(context, required_scratch > 0);
115104
}
116105
if (input->type == kTfLiteInt16) {
117-
if (input_height == 1 && filter_height == 1 && output_height == 1) {
118-
int inp_h, filt_h, filt_w, str_h, pad_h, out_h;
119-
inp_h = input_width;
120-
filt_h = filter_width;
121-
filt_w = filter_height;
122-
str_h = stride_width;
123-
pad_h = pad_width;
124-
out_h = output_width;
125-
required_scratch = xa_nn_conv2d_std_getsize(
126-
inp_h, input_depth, filt_h, filt_w, str_h, pad_h, out_h,
127-
output_channels, PREC_SYM16S);
128-
} else {
129-
required_scratch = xa_nn_conv2d_std_getsize(
130-
input_height, input_depth, filter_height, filter_width,
131-
stride_height, pad_height, output_height, output_channels,
132-
PREC_SYM16S);
133-
}
106+
required_scratch = xa_nn_conv2d_std_getsize(
107+
input_height, input_width, input_depth, filter_height, filter_width,
108+
filter_depth, stride_height, pad_height, stride_width, pad_width,
109+
output_height, output_width, output_channels, PREC_SYM16S, PREC_SYM8S,
110+
1, 1, 0);
134111
TF_LITE_ENSURE(context, required_scratch > 0);
135112
}
136113
}

tensorflow/lite/micro/kernels/xtensa/lstm_eval.cc

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -305,16 +305,31 @@ void FullyConnected(const FullyConnectedParams& params,
305305
return;
306306
}
307307

308+
#define ARG_CHK_ALIGN(_ptr, _align) \
309+
(((unsigned int)(_ptr) & ((_align) - 1)) == 0)
310+
308311
void FullyConnected(const FullyConnectedParams& params,
309312
const int16_t* input_data, const int8_t* filter_data,
310313
const int64_t* bias_data, int16_t* output_data,
311314
const int num_batches, const int output_depth,
312315
const int accum_depth) {
313-
xa_nn_matmul_sym8sxsym16s_sym16s(
314-
output_data, filter_data, input_data, bias_data, output_depth,
315-
accum_depth, accum_depth, num_batches, accum_depth, output_depth, 1,
316-
params.input_offset, params.output_multiplier, params.output_shift,
317-
params.output_offset);
316+
WORD32 err;
317+
if (num_batches == 1 && ARG_CHK_ALIGN(output_data, sizeof(WORD16) * 8) &&
318+
ARG_CHK_ALIGN(filter_data, sizeof(WORD8) * 16) &&
319+
ARG_CHK_ALIGN(input_data, sizeof(WORD16) * 8) &&
320+
ARG_CHK_ALIGN(bias_data, sizeof(WORD64) * 2)) {
321+
err = xa_nn_matXvec_v2_sym8sxsym16s_sym16s(
322+
output_data, filter_data, input_data, bias_data, output_depth,
323+
accum_depth, accum_depth, params.output_multiplier, params.output_shift,
324+
-32768, 32767, NULL);
325+
} else {
326+
err = xa_nn_matmul_sym8sxsym16s_sym16s(
327+
output_data, filter_data, input_data, bias_data, output_depth,
328+
accum_depth, accum_depth, num_batches, accum_depth, output_depth, 1,
329+
params.input_offset, params.output_multiplier, params.output_shift,
330+
params.output_offset);
331+
}
332+
(void)err;
318333
return;
319334
}
320335

tensorflow/lite/micro/kernels/xtensa/transpose_conv.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
204204
scratch_buffer_size = xa_nn_transpose_conv_getsize(
205205
input_height, input_width, input_depth, filter_height, filter_width,
206206
stride_width, stride_height, output_height, output_width, num_channels,
207-
PREC_SYM8S, PREC_ASYM8S);
207+
1 /* num_groups */, PREC_SYM8S, PREC_ASYM8S);
208208
TFLITE_DCHECK(context->RequestScratchBufferInArena(
209209
context, scratch_buffer_size,
210210
&(data->scratch_buffer_index)) == kTfLiteOk);
@@ -232,7 +232,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
232232
scratch_buffer_size = xa_nn_transpose_conv_getsize(
233233
input_height, input_width, input_depth, filter_height, filter_width,
234234
stride_width, stride_height, output_height, output_width, num_channels,
235-
PREC_SYM8S, PREC_SYM16S);
235+
1 /* num_groups */, PREC_SYM8S, PREC_SYM16S);
236236
TFLITE_DCHECK(context->RequestScratchBufferInArena(
237237
context, scratch_buffer_size,
238238
&(data->scratch_buffer_index)) == kTfLiteOk);
@@ -404,9 +404,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
404404
stride_width, stride_height, pad_width, pad_height, input_depth,
405405
output_depth, input_height, input_width, filter_height,
406406
filter_width, output_height, output_width, num_elements / batches,
407-
data.params.input_offset, data.params.output_offset,
408-
data.per_channel_output_shift, data.per_channel_output_multiplier,
409-
scratch_buffer);
407+
1 /* num_groups */, data.params.input_offset,
408+
data.params.output_offset, data.per_channel_output_shift,
409+
data.per_channel_output_multiplier, scratch_buffer);
410410
}
411411
} else {
412412
reference_integer_ops::TransposeConv(
@@ -539,8 +539,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
539539
stride_width, stride_height, pad_width, pad_height, input_depth,
540540
output_depth, input_height, input_width, filter_height,
541541
filter_width, output_height, output_width, num_elements / batches,
542-
data.per_channel_output_shift, data.per_channel_output_multiplier,
543-
scratch_buffer);
542+
1 /* num_groups */, data.per_channel_output_shift,
543+
data.per_channel_output_multiplier, scratch_buffer);
544544
}
545545
#else // #if defined(HIFI3) || defined(HIFI4) || defined(HIFI5)
546546
reference_integer_ops::TransposeConv(

tensorflow/lite/micro/tools/make/ext_libs/xa_nnlib_hifi4.patch

Lines changed: 0 additions & 226 deletions
This file was deleted.

tensorflow/lite/micro/tools/make/ext_libs/xtensa_download.sh

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,18 +41,17 @@ DOWNLOADS_DIR=${1}
4141
PATCH=""
4242

4343
if [[ ${2} == "hifi3" ]]; then
44-
LIBRARY_URL="http://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_hifi4_09_05_2023.zip"
44+
LIBRARY_URL="http://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_hifi4_09_30_2025.zip"
4545
LIBRARY_DIRNAME="xa_nnlib_hifi4"
46-
LIBRARY_MD5="2a54e056aef73a4fcffde4643998501a"
46+
LIBRARY_MD5="a1cba57501a35d0bffdd39c536689366"
4747
elif [[ ${2} == "hifi4" ]]; then
48-
LIBRARY_URL="http://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_hifi4_09_05_2023.zip"
48+
LIBRARY_URL="http://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_hifi4_09_30_2025.zip"
4949
LIBRARY_DIRNAME="xa_nnlib_hifi4"
50-
LIBRARY_MD5="2a54e056aef73a4fcffde4643998501a"
51-
PATCH="../../ext_libs/xa_nnlib_hifi4.patch"
50+
LIBRARY_MD5="a1cba57501a35d0bffdd39c536689366"
5251
elif [[ ${2} == "hifi5" ]]; then
53-
LIBRARY_URL="http://github.com/foss-xtensa/nnlib-hifi5/raw/master/archive/xa_nnlib_hifi5_09_05_2023.zip"
52+
LIBRARY_URL="http://github.com/foss-xtensa/nnlib-hifi5/raw/master/archive/xa_nnlib_hifi5_09_30_2025.zip"
5453
LIBRARY_DIRNAME="xa_nnlib_hifi5"
55-
LIBRARY_MD5="1deb55ef200bf5dbedc70b99b02140c0"
54+
LIBRARY_MD5="f3ac445ae47143b1fddf26b85d763697"
5655
elif [[ ${2} == "vision_p6" ]]; then
5756
LIBRARY_URL="https://github.com/foss-xtensa/tflmlib_vision/raw/main/archive/xi_tflmlib_vision_p6_22_06_29.zip"
5857
LIBRARY_DIRNAME="xi_tflmlib_vision_p6"

0 commit comments

Comments
 (0)