diff --git a/tensorflow/lite/core/c/common.cc b/tensorflow/lite/core/c/common.cc index 84883d2fd19..4f404c93a18 100644 --- a/tensorflow/lite/core/c/common.cc +++ b/tensorflow/lite/core/c/common.cc @@ -649,4 +649,8 @@ TfLiteRunStep TfLiteTensorGetShapeKnownStep(const TfLiteTensor* t) { return kTfLiteRunStepUnknown; } +// Returns a sentinel value to be used as the user_data field of a TfLiteNode +// when the kernel initialization fails. +void* TfLiteKernelInitFailed() { return reinterpret_cast<void*>(-1); } + } // extern "C" diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h index c3e00cc0972..765c2bc12f2 100644 --- a/tensorflow/lite/core/c/common.h +++ b/tensorflow/lite/core/c/common.h @@ -1161,6 +1161,11 @@ typedef struct TfLiteRegistration { /// NOTE: if the data is already in the desired format, simply implement this /// function to return `nullptr` and implement the free function to be a /// no-op. + /// + /// NOTE: For a Delegate kernel, returns `TfLiteKernelInitFailed()` if it + /// fails on the initialization. This eventually causes user's API call to + /// InterpreterBuilder::operator() or Interpreter::ModifyGraphWithDelegate() + /// to return an error. void* (*init)(TfLiteContext* context, const char* buffer, size_t length); /// The pointer `buffer` is the data previously returned by an init @@ -1499,6 +1504,10 @@ TfLiteRunStep TfLiteTensorGetDataKnownStep(const TfLiteTensor* t); /// operations. TfLiteRunStep TfLiteTensorGetShapeKnownStep(const TfLiteTensor* t); +/// Returns a sentinel value to be used as the user_data field of a TfLiteNode +/// when the kernel initialization fails. +void* TfLiteKernelInitFailed(); + /** @} */ // Ends `\addtogroup`, it's important for the doc generator that this doesn't // include the CC code below. 
diff --git a/tensorflow/lite/kernels/internal/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/portable_tensor_utils.h index fc45d1f9822..a361a2d0e5d 100644 --- a/tensorflow/lite/kernels/internal/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/portable_tensor_utils.h @@ -170,6 +170,7 @@ inline void BatchQuantizeFloats(const float* float_data_ptr, int n_batch, tensor_utils::SymmetricQuantizeFloats( float_data_ptr + offset, n_data, quantized_data_ptr + offset, &unused_min, &unused_max, &scaling_factors[b]); + if (zero_points) zero_points[b] = 0; } } } diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h index d83696219c2..54908bd24ee 100644 --- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h +++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" +#include "tensorflow/lite/kernels/internal/runtime_shape.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { diff --git a/tensorflow/lite/kernels/internal/reference/div.h b/tensorflow/lite/kernels/internal/reference/div.h index e70fb09767e..5f26d3b8e6d 100644 --- a/tensorflow/lite/kernels/internal/reference/div.h +++ b/tensorflow/lite/kernels/internal/reference/div.h @@ -99,6 +99,18 @@ inline void Div(const ArithmeticParams& params, DivElementwise(flat_size, params, input1_data, input2_data, output_data); } +inline void Div(const ArithmeticParams& params, + const RuntimeShape& input1_shape, const int16_t* input1_data, + const RuntimeShape& input2_shape, const int16_t* input2_data, + const RuntimeShape& output_shape, int16_t* output_data) { + TFLITE_DCHECK_LE(params.quantized_activation_min, + params.quantized_activation_max); + const int flat_size = + 
MatchingElementsSize(input1_shape, input2_shape, output_shape); + + DivElementwise(flat_size, params, input1_data, input2_data, output_data); +} + template <typename T, int N = 5> inline void BroadcastDivSlowQuantized( const ArithmeticParams& params, const RuntimeShape& unextended_input1_shape, @@ -177,6 +189,19 @@ inline void BroadcastDivSlow(const ArithmeticParams& params, input2_data, unextended_output_shape, output_data); } +template <int N = 5> +inline void BroadcastDivSlow(const ArithmeticParams& params, + const RuntimeShape& unextended_input1_shape, + const int16_t* input1_data, + const RuntimeShape& unextended_input2_shape, + const int16_t* input2_data, + const RuntimeShape& unextended_output_shape, + int16_t* output_data) { + BroadcastDivSlowQuantized<int16_t, N>( + params, unextended_input1_shape, input1_data, unextended_input2_shape, + input2_data, unextended_output_shape, output_data); +} + // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then diff --git a/tensorflow/lite/kernels/internal/reference/prelu.h b/tensorflow/lite/kernels/internal/reference/prelu.h index aa9901d605f..1a5ef0cb1f4 100644 --- a/tensorflow/lite/kernels/internal/reference/prelu.h +++ b/tensorflow/lite/kernels/internal/reference/prelu.h @@ -26,10 +26,10 @@ namespace tflite { namespace reference_ops { // Broadcast prelu to output_shape for quantized uint8_t/int8_t data. 
-template <typename T> +template <typename T, typename U> inline void BroadcastPrelu4DSlow( const PreluParams& params, const RuntimeShape& input_shape, - const T* input_data, const RuntimeShape& alpha_shape, const T* alpha_data, + const T* input_data, const RuntimeShape& alpha_shape, const U* alpha_data, const RuntimeShape& output_shape, T* output_data) { TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4); TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4); @@ -74,10 +74,10 @@ inline void BroadcastPrelu4DSlow( } } -template <typename T> +template <typename T, typename U> inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape, const T* input_data, const RuntimeShape& alpha_shape, - const T* alpha_data, const RuntimeShape& output_shape, + const U* alpha_data, const RuntimeShape& output_shape, T* output_data) { const int32_t quantized_min = std::numeric_limits<T>::min(); const int32_t quantized_max = std::numeric_limits<T>::max();