Support for DECODE operator (#3188)

ddavis-2015 · web-flow · commit a94423c6b6ee · 2025-10-31T21:56:21.000Z
* Support for DECODE operator @tensorflow/micro Add initial support for DECODE operator. Add reference implementation. Add LUT decompression support. Update op resolvers. Update Makefiles and Bazel BUILD files. Add kernel unit test. bug=fixes #3131 * update copyright * Don't use constructors with global objects (bluepill will not call them). Cleanup unit test. * Support for DECODE operator @tensorflow/micro Additional support for DECODE operator. Add Xtensa optimizations for LUT decompression. Move all Xtensa kernel source references to the Xtensa target makefile. bug=fixes #3150 * Updates to Xtensa makefiles @tensorflow/micro Reorganize Xtensa makefiles such that all references to optimized kernel sources are moved to the Xtensa target makefile. Move hifimini kernel sources to the parent directory, and rename them so they do not interfere with the target overlay mechanism of the root makefile. bug=fixes #3153 * Fix incorrect include path. Fix code style errors. * fix copyright * update generic benchmark op resolver size * Support for DECODE operator @tensorflow/micro Add reference implementation of pruning to DECODE operator. Makefile and Bazel BUILD file changes. Additional unit tests. bug=fixes #3161 * xtensa int8 single channel working * xtensa per-channel int8 normal axis working * WIP * working xtensa optimizations * Add negative unit test * Support for DECODE operator @tensorflow/micro Add optimized xtensa implementation of pruning to DECODE operator. Makefile changes. Additional unit tests. bug=fixes #3171 * all tests pass * Support for DECODE operator @tensorflow/micro Add reference implementation of Huffman decompression to DECODE operator. Makefile and Bazel BUILD file changes. Additional unit tests. bug=fixes #3187 * Add ScopedMicroProfiler * unfinished merge changes * Split out huffman unit test. Remove xtensa optimizations. * cleanup * Post-review updates.
diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD
@@ -252,6 +252,7 @@ tflm_kernel_cc_library(
         "cumsum.cc",
         "decode.cc",
         "decode_state.cc",
+        "decode_state_huffman.cc",
         "decode_state_lut.cc",
         "decode_state_prune.cc",
         "depth_to_space.cc",
@@ -346,6 +347,7 @@ tflm_kernel_cc_library(
         "circular_buffer.h",
         "conv.h",
         "decode_state.h",
+        "decode_state_huffman.h",
         "decode_state_lut.h",
         "decode_state_prune.h",
         "depthwise_conv.h",
@@ -664,6 +666,22 @@ tflm_cc_test(
     ],
 )
 
+tflm_cc_test(
+    name = "decode_state_huffman_test",  # test target built from decode_state_huffman_test.cc
+    srcs = [
+        "decode_state_huffman_test.cc",
+    ],
+    deps = [
+        ":decode_test_helpers",
+        ":kernel_runner",
+        "//tensorflow/lite/c:common",
+        "//tensorflow/lite/micro:debug_log",
+        "//tensorflow/lite/micro:op_resolvers",
+        "//tensorflow/lite/micro:test_helpers",
+        "//tensorflow/lite/micro/testing:micro_test",
+    ],
+)
+
 tflm_cc_test(
     name = "decode_state_lut_test",
     srcs = [
diff --git a/tensorflow/lite/micro/kernels/Makefile.inc b/tensorflow/lite/micro/kernels/Makefile.inc
@@ -123,6 +123,7 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/ceil_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/comparisons_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/concatenation_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum_test.cc \
+$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state_huffman_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state_lut_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state_prune_test.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_test.cc \
diff --git a/tensorflow/lite/micro/kernels/decode.cc b/tensorflow/lite/micro/kernels/decode.cc
@@ -82,6 +82,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         dsp = DecodeState::CreateDecodeStatePrune(
             context, micro_context->GetAlternateProfiler());
         break;
+      case DecodeState::kDcmTypeHuffman:
+        dsp = DecodeState::CreateDecodeStateHuffman(
+            context, micro_context->GetAlternateProfiler());
+        break;
       case DecodeState::kDcmTypeCustom:
         MicroPrintf("Custom decode type not yet supported");
         break;
diff --git a/tensorflow/lite/micro/kernels/decode_state.cc b/tensorflow/lite/micro/kernels/decode_state.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/lite/micro/kernels/decode_state.h"
 
+#include "tensorflow/lite/micro/kernels/decode_state_huffman.h"
 #include "tensorflow/lite/micro/kernels/decode_state_lut.h"
 #include "tensorflow/lite/micro/kernels/decode_state_prune.h"
 #include "tensorflow/lite/micro/micro_context.h"
@@ -47,4 +48,17 @@ DecodeState* DecodeState::CreateDecodeStatePrune(
   return dsp;
 }
 
+DecodeState* DecodeState::CreateDecodeStateHuffman(
+    const TfLiteContext* context, MicroProfilerInterface* profiler) {  // Factory: returns a new DecodeStateHuffman, or nullptr if allocation fails.
+  MicroContext* const micro_context = GetMicroContext(context);
+  void* buffer =
+      micro_context->AllocatePersistentBuffer(sizeof(DecodeStateHuffman));  // raw storage sized for the object
+  if (buffer == nullptr) {
+    return nullptr;
+  }
+  DecodeState* dsp = new (buffer) DecodeStateHuffman(context, profiler);  // placement-new into the allocated buffer
+
+  return dsp;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/decode_state.h b/tensorflow/lite/micro/kernels/decode_state.h
@@ -45,6 +45,8 @@ class DecodeState {
                                            MicroProfilerInterface* profiler);
   static DecodeState* CreateDecodeStatePrune(const TfLiteContext* context,
                                              MicroProfilerInterface* profiler);
+  static DecodeState* CreateDecodeStateHuffman(
+      const TfLiteContext* context, MicroProfilerInterface* profiler);
 
   static uint8_t Type(const TfLiteTensor& ancillary) {
     return GetTensorData<uint8_t>(&ancillary)[kDcmDecodeTypeOffset];
@@ -68,6 +70,7 @@ class DecodeState {
   // Decode Common Metadata constants
  public:
   static constexpr uint8_t kDcmTypeLUT = 0;
+  static constexpr uint8_t kDcmTypeHuffman = 1;
   static constexpr uint8_t kDcmTypePrune = 2;
   static constexpr uint8_t kDcmTypeCustom = 127;
 
diff --git a/tensorflow/lite/micro/kernels/decode_state_huffman.cc b/tensorflow/lite/micro/kernels/decode_state_huffman.cc
@@ -0,0 +1,167 @@
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/micro/kernels/decode_state_huffman.h"
+
+#include "tensorflow/lite/kernels/internal/compatibility.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/micro/micro_log.h"
+#include "tensorflow/lite/micro/micro_profiler.h"
+
+namespace tflite {
+
+TfLiteStatus DecodeStateHuffman::Setup(const TfLiteTensor& input,
+                                       const TfLiteTensor& ancillary,
+                                       const TfLiteTensor& output) {  // Checks the ancillary metadata version and caches the decode parameters in members.
+  const uint8_t* const ancillary_data = GetTensorData<uint8_t>(&ancillary);
+  if (ancillary_data[kDcmVersionOffset] != 1) {  // only metadata version 1 is accepted
+    MicroPrintf("unsupported version %u", ancillary_data[kDcmVersionOffset]);
+    return kTfLiteError;
+  }
+
+  compressed_codewords_ = GetTensorData<uint32_t>(&input);  // input data is read as 32-bit words
+  count_codewords_ = NumElements(&output);  // one decoded value is written per output element
+  huffman_tables_ = &ancillary_data[kDcmSizeInBytes];  // table data begins kDcmSizeInBytes into the ancillary data
+  use_32bit_table_ =
+      (ancillary_data[kDcmTableSizeOffset] & kDcmTableSize32BitsMask) != 0;  // flag bit selects the 32-bit table decoder
+  initial_table_size_ =
+      (ancillary_data[kDcmTableSizeOffset] & kDcmTableSizeInitialMask) >>
+      kDcmTableSizeInitialShift;  // the decompress routines add 1 to this field before use
+
+  if (!use_32bit_table_) {  // a 16-bit table requires an int8 output tensor
+    TF_LITE_ENSURE_TYPES_EQ(const_cast<TfLiteContext*>(context_), output.type,
+                            kTfLiteInt8);
+  }
+
+  return kTfLiteOk;  // NOTE(review): with a 32-bit table the output type is not checked here; Decode() rejects unsupported types
+}
+
+TfLiteStatus DecodeStateHuffman::Decode(const TfLiteEvalTensor& input,
+                                        const TfLiteEvalTensor& ancillary,
+                                        const TfLiteEvalTensor& output) {  // Decompresses into the output tensor's data; input/ancillary are not referenced here, member state is used.
+  void* const buffer = const_cast<void*>(micro::GetTensorData<void>(&output));  // const is cast away because the output data is written
+  TFLITE_DCHECK(buffer != nullptr);
+
+  switch (output.type) {  // pick the decompress routine from output type and table width
+    case kTfLiteInt8:
+      if (use_32bit_table_) {
+        DecompressToBufferWith32BitTable(static_cast<int8_t*>(buffer));
+      } else {
+        DecompressToBufferWith16BitTable(static_cast<int8_t*>(buffer));
+      }
+      break;
+    case kTfLiteInt16:  // int16 output is always decoded with the 32-bit table routine
+      DecompressToBufferWith32BitTable(static_cast<int16_t*>(buffer));
+      break;
+    default:  // any other output type is an error
+      MicroPrintf("unsupported tensor type %s", TfLiteTypeGetName(output.type));
+      return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+void DecodeStateHuffman::DecompressToBufferWith16BitTable(int8_t* buffer) {  // Writes count_codewords_ decoded values to buffer using 16-bit table entries.
+  ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_);
+
+  size_t remaining = count_codewords_;
+  const size_t initial_table_size = initial_table_size_ + 1;  // stored field plus one; passed to the first GetNextBits call of each symbol
+  const uint16_t* huffman_tables =
+      static_cast<const uint16_t*>(huffman_tables_);  // table is indexed as an array of 16-bit entries
+  uint32_t head_offset = 0;             // codewords bitstring state
+  uint32_t head_hold = 0;               // codewords bitstring state
+  const uint32_t* head_next = nullptr;  // codewords bitstring state
+  uint16_t table_value = 0;
+
+  InitNextBits(head_offset, head_hold, head_next);
+
+  while (remaining--) {  // one iteration per output value
+    size_t last_used_bits = initial_table_size;
+    uint32_t current_index =
+        GetNextBits(last_used_bits, head_offset, head_hold, head_next);  // presumably reads last_used_bits bits from the stream — confirm in header
+    size_t table_offset = current_index;
+    table_value = huffman_tables[table_offset];
+
+    while (!(table_value & kTable16BitSymbolFoundMask)) {  // follow table entries until one has the symbol-found flag set
+      last_used_bits =
+          ((table_value & kTable16BitCountMask) >> kTable16BitCountShift) + 1;  // entry's count field plus one is the next GetNextBits argument
+      current_index =
+          GetNextBits(last_used_bits, head_offset, head_hold, head_next);
+      const size_t next_table_offset = table_value & kTable16BitValueMask;
+      table_offset += next_table_offset + current_index;  // offset is cumulative: entry's value field plus the index just read
+      table_value = huffman_tables[table_offset];
+    }
+
+    *buffer++ = table_value;  // implicit conversion of the 16-bit entry to int8_t
+
+    const size_t symbol_residual_bits =
+        (table_value & kTable16BitCountMask) >> kTable16BitCountShift;  // count field of the symbol entry (no +1 here)
+    if (last_used_bits > symbol_residual_bits) {
+      PutBackBits(last_used_bits - symbol_residual_bits, head_offset, head_hold,
+                  head_next);  // presumably returns the excess bits to the stream — confirm in header
+    }
+  }
+}
+
+template <typename T>
+void DecodeStateHuffman::DecompressToBufferWith32BitTable(T* buffer) {  // Writes count_codewords_ decoded values of type T to buffer using 32-bit table entries.
+  ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_);
+
+  size_t remaining = count_codewords_;
+  const size_t initial_table_size = initial_table_size_ + 1;  // stored field plus one; passed to the first GetNextBits call of each symbol
+  const uint32_t* huffman_tables =
+      static_cast<const uint32_t*>(huffman_tables_);  // table is indexed as an array of 32-bit entries
+  uint32_t head_offset = 0;             // codewords bitstring state
+  uint32_t head_hold = 0;               // codewords bitstring state
+  const uint32_t* head_next = nullptr;  // codewords bitstring state
+  uint32_t table_value = 0;
+
+  InitNextBits(head_offset, head_hold, head_next);
+
+  while (remaining--) {  // one iteration per output value
+    size_t last_used_bits = initial_table_size;
+    uint32_t current_index =
+        GetNextBits(last_used_bits, head_offset, head_hold, head_next);  // presumably reads last_used_bits bits from the stream — confirm in header
+    size_t table_offset = current_index;
+    table_value = huffman_tables[table_offset];
+
+    while (!(table_value & kTable32BitSymbolFoundMask)) {  // follow table entries until one has the symbol-found flag set
+      last_used_bits =
+          ((table_value & kTable32BitCountMask) >> kTable32BitCountShift) + 1;  // entry's count field plus one is the next GetNextBits argument
+      current_index =
+          GetNextBits(last_used_bits, head_offset, head_hold, head_next);
+      const size_t next_table_offset = table_value & kTable32BitValueMask;
+      table_offset += next_table_offset + current_index;  // offset is cumulative: entry's value field plus the index just read
+      table_value = huffman_tables[table_offset];
+    }
+
+    *buffer++ = table_value;  // implicit conversion of the 32-bit entry to T
+
+    const size_t symbol_residual_bits =
+        (table_value & kTable32BitCountMask) >> kTable32BitCountShift;  // count field of the symbol entry (no +1 here)
+    if (last_used_bits > symbol_residual_bits) {
+      PutBackBits(last_used_bits - symbol_residual_bits, head_offset, head_hold,
+                  head_next);  // presumably returns the excess bits to the stream — confirm in header
+    }
+  }
+}
+
+template void DecodeStateHuffman::DecompressToBufferWith32BitTable<int8_t>(
+    int8_t*);  // explicit instantiation: the template is defined in this .cc file; int8_t is used by Decode()
+template void DecodeStateHuffman::DecompressToBufferWith32BitTable<int16_t>(
+    int16_t*);  // explicit instantiation for the int16_t path used by Decode()
+
+}  // namespace tflite
diff --git a/tensorflow/lite/micro/kernels/decode_state_huffman.h b/tensorflow/lite/micro/kernels/decode_state_huffman.h
diff --git a/tensorflow/lite/micro/kernels/decode_state_huffman_test.cc b/tensorflow/lite/micro/kernels/decode_state_huffman_test.cc
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile